Commit c9b7e42e3c0247c882a47e890be76123ddefb8d4

Authored by totoro
1 parent d57304cc

[~] Enhancement of the read-CSV-file feature. It now returns a list of String (instead of ByteArray) together with the offset in the file of the last returned line. This allows keeping the offset so that the import can be restarted if the process was interrupted.
[!] Fix find_the_first by using ByteArray instead of String: the Boyer-Moore algorithm is now in the VM, and it was being used to search for a String inside binary data, so the search stopped as soon as the first NULL byte was encountered. Using String instead of ByteArray was therefore a bug.
[!] Line_reader now uses fast_lexer_4
anubis_dev/library/data_base/read_csv.anubis
... ... @@ -5,6 +5,7 @@
5 5  
6 6  
7 7 read tools/basis.anubis
  8 +read tools/time.anubis
8 9 read lexical_analysis/fast_lexer_4.anubis
9 10  
10 11  
... ... @@ -12,8 +13,8 @@ read lexical_analysis/fast_lexer_4.anubis
12 13  
13 14 public type ReadCsvResult:
14 15 end_of_input,
15   - error(String), // an error message
16   - ok(List(ByteArray)). // a single record
  16 + error(String), // an error message
  17 + ok((Int, List(String))). // a single record and the offset of last read token
17 18  
18 19 public define One -> ReadCsvResult
19 20 make_read_csv_line
... ... @@ -99,12 +100,15 @@ define List(LexerItem(One,One))
99 100 ].
100 101  
101 102 The lexer described below skips to end of line (and eats the end of line).
  103 +
  104 +type EOL_Token:
  105 + eol_offset(Int offset).
102 106  
103   -define List(LexerItem(One,One))
  107 +define List(LexerItem(EOL_Token,One))
104 108 to_eol_description
105 109 =
106 110 [
107   - lexer_item("([^#r#n]*)((#n)|(#r#n))", return(((Int,Int) -> ByteArray b, Int l, LexingTools t, One u) |-> token(unique)))
  111 + lexer_item("([^#r#n]*)((#n)|(#r#n))", return(((Int,Int) -> ByteArray b, Int l, LexingTools t, One u) |-> token(eol_offset(t.offset(unique)))))
108 112 ].
109 113  
110 114  
... ... @@ -133,7 +137,7 @@ read generated/csv_s_skip_cell.anubis
133 137 read generated/csv_to_eol.anubis
134 138  
135 139  
136   -define One
  140 + define One
137 141 repeat
138 142 (
139 143 Int n,
... ... @@ -142,7 +146,7 @@ define One
142 146 if n =< 0 then unique else f(unique); repeat(n-1,f).
143 147  
144 148  
145   -define Int -> Result(String,ByteArray)
  149 + define Int -> Result(String,ByteArray)
146 150 read_next_cell
147 151 (
148 152 One -> One skip_cell,
... ... @@ -155,7 +159,8 @@ type CB_Result:
155 159 eof,
156 160 error(String),
157 161 skip,
158   - cell(ByteArray).
  162 + cell(String),
  163 + eol(Int offset).
159 164  
160 165  
161 166 define One -> ReadCsvResult
... ... @@ -164,25 +169,29 @@ define One -&gt; ReadCsvResult
164 169 List(One -> CB_Result) cbs,
165 170 One -> One to_eol
166 171 ) =
167   - with f = (List(One -> CB_Result) l) |-f1->
168   - if l is
  172 + with f = (List(One -> CB_Result) l, List(String) so_far) |-f1->
  173 + if l is
169 174 {
170   - [ ] then ok([ ]),
  175 + [ ] then ok((0, reverse(so_far))), //ok([ ]),
171 176 [f . g] then
172 177 if f(unique) is
173 178 {
174 179 eof then end_of_input,
175 180 error(e) then to_eol(unique); error(e),
176   - skip then (ReadCsvResult)f1(g)
177   - cell(c) then if (ReadCsvResult)f1(g) is
178   - {
179   - end_of_input then end_of_input,
180   - error(e) then to_eol(unique); error(e),
181   - ok(k) then ok([c . k])
182   - }
  181 + skip then (ReadCsvResult)f1(g, so_far),
  182 + cell(c) then
  183 + //print("read cell ["+c+"] ");
  184 + (ReadCsvResult)f1(g, [c. so_far]),
  185 + eol(offset) then
  186 + if g is
  187 + {
  188 + [] then ok((offset, reverse(so_far))),
  189 + [_ . _] then error("End Of Line unexpected")
  190 + }
  191 +
183 192 }
184 193 },
185   - (One u) |-> f(cbs).
  194 + (One u) |-> f(cbs, []).
186 195  
187 196  
188 197  
... ... @@ -253,7 +262,7 @@ public define One -&gt; ReadCsvResult
253 262 error(b,line,col) then error("in "+to_string(b)),
254 263 token(t) then if t is
255 264 {
256   - double_quote then cell(concat(reverse(so_far))),
  265 + double_quote then cell(to_string(concat(reverse(so_far)))),
257 266 two_double_quotes then aux([{0x22} . so_far]),
258 267 part(p) then aux([p . so_far])
259 268 }
... ... @@ -266,14 +275,14 @@ public define One -&gt; ReadCsvResult
266 275 {
267 276 eof then eof,
268 277 double_quote then read_in(u),
269   - separator(c) then cell(c)
  278 + separator(c) then cell(to_string(c))
270 279 }
271 280 },
272 281 to_eol = (One u) |-> if lex_eol(u) is
273 282 {
274 283 end_of_input then eof,
275 284 error(b,line,col) then error("eol "+to_string(b)),
276   - token(t) then skip
  285 + token(t) then if t is eol_offset(offset) then eol(offset)
277 286 },
278 287 make_read_csv_line(make_cbs(skip_cell,read_cell,to_eol,cols_to_get),
279 288 (One u) |-> forget(to_eol(u))).
... ... @@ -303,12 +312,12 @@ define Maybe(List(Int))
303 312  
304 313  
305 314 define One
306   - print
  315 + print_csv_line
307 316 (
308   - List(ByteArray) l
  317 + List(String) l
309 318 ) =
310 319 print("| ");
311   - map_forget((ByteArray b) |-> print(to_string(b)+" | "),l);
  320 + map_forget((String b) |-> print(b+" | "),l);
312 321 print("\n").
313 322  
314 323  
... ... @@ -322,14 +331,47 @@ define One
322 331 (
323 332 One -> ReadCsvResult f
324 333 ) =
  334 + println("----");
325 335 if f(unique) is
326 336 {
327 337 end_of_input then print("----------------------\n"),
328   - error(e) then print(e+"\n"); print_to_error(f),
329   - ok(l) then print(l); print_to_error(f)
  338 + error(e) then print("error ["+e+"]\n"); print_to_error(f),
  339 + ok(l) then if l is (offset, n) then print("[eol offset = "+offset+"] ");print_csv_line(n); print_to_error(f)
330 340 }.
331 341  
332   -
  342 +define One
  343 + show_perf
  344 + (
  345 + One -> ReadCsvResult f,
  346 + Int left,
  347 + Int read_line,
  348 + Int block_size,
  349 + UTime start_time
  350 + ) =
  351 + if f(unique) is
  352 + {
  353 + end_of_input then show_duration("lines read "+read_line, start_time);
  354 + print("----------------------\n"),
  355 + error(e) then print("error ["+e+"]\n"); print_to_error(f),
  356 + ok(l) then
  357 + with left = if left = 1 then
  358 + show_duration("lines read "+read_line+1, start_time);
  359 + block_size
  360 + else
  361 + left -1,
  362 + show_perf(f, left, read_line+1, block_size, start_time)
  363 + }.
  364 +
  365 +
  366 +define One
  367 + show_perf
  368 + (
  369 + One -> ReadCsvResult f,
  370 + Int block_size
  371 + )=
  372 + show_perf(f, block_size, 0, block_size, unow)
  373 + .
  374 +
333 375 global define One
334 376 read_csv_file
335 377 (
... ... @@ -337,9 +379,12 @@ global define One
337 379 ) =
338 380 if args is
339 381 {
340   - [ ] then syntax,
341   - [path . t] then if t is
  382 + [ ] then syntax,
  383 + [path . t] then
  384 + println("file "+path);
  385 + if t is
342 386 {
  387 +
343 388 [ ] then syntax,
344 389 [sep . l] then if sep:[",",";"]
345 390 then
... ... @@ -357,6 +402,7 @@ global define One
357 402 success(ls) then
358 403 with cs = no_doubles(qsort(cols,(Int x, Int y) |-> x < y)),
359 404 read_line = make_read_csv_line(ls,sep,cs),
  405 +// show_perf(read_line, 10000)
360 406 print_to_error(read_line)
361 407 }
362 408 }
... ...
anubis_dev/library/system/files.anubis
... ... @@ -1020,14 +1020,14 @@ define Maybe(Int)
1020 1020 find_the_first
1021 1021 (
1022 1022 Data_IO io,
1023   - String looking_for, //String to search
  1023 + ByteArray looking_for, //String to search
1024 1024 Int size, //size of the string to search
1025   - String buffer,
  1025 + ByteArray buffer,
1026 1026 Int current_pos,
1027 1027 Int buf_size,
1028 1028 Int buf_pos
1029 1029 )=
1030   - //println("buf_size :"+buf_size+ " buf_pos :"+buf_pos + " size : "+size);
  1030 + //println("general current pos: "+current_pos+" | buffer size: "+buf_size+ " | buffer pos: "+buf_pos + " | search size: "+size);
1031 1031 if (buf_size - buf_pos) < size then
1032 1032 //println("New buffer request current pos "+current_pos+" buffer_pos "+buf_pos);
1033 1033 if read_bytes(io, 65536) is // <- block size is 64k
... ... @@ -1035,23 +1035,30 @@ define Maybe(Int)
1035 1035 failure then println("read_bytes failure");failure, //finish
1036 1036 time_out then println("read_bytes timeout");failure, //finish
1037 1037 success(ba) then
1038   - with new_buffer = to_string(extract(to_byte_array(buffer), buf_pos, buf_size) + ba),
  1038 + //println("length of ba "+length(ba));
  1039 + with ex_ba = extract(buffer, buf_pos, buf_size),
  1040 + //println("length of ex_ba "+length(ex_ba));
  1041 + with new_ba = ex_ba + ba,
  1042 + //println("length of new_ba "+length(new_ba));
  1043 + with new_buffer = ex_ba + ba,
1039 1044 //println("SUCCESS New buffer length "+length(new_buffer)+" new current_pos "+current_pos);
1040 1045 find_the_first(io, looking_for, size, new_buffer, current_pos + buf_pos, length(new_buffer), 0),
1041 1046 truncated(ba) then
1042 1047 if length(ba) = 0 then
  1048 + //println("last buffer current position ["+current_pos+"]");
1043 1049 failure //finish
1044 1050 else
1045   - with new_buffer = to_string(extract(to_byte_array(buffer), buf_pos, buf_size) + ba),
  1051 + with new_buffer = extract(buffer, buf_pos, buf_size) + ba,
1046 1052 // println("TRUNCATED New buffer length "+length(new_buffer)+" new current_pos "+current_pos);
1047 1053 find_the_first(io, looking_for, size, new_buffer, current_pos + buf_pos, length(new_buffer), 0)
1048 1054 }
1049 1055 else
1050   - if find_string(buffer, looking_for, buf_pos) is
  1056 + if find_byte_array(buffer, looking_for, buf_pos) is
1051 1057 {
1052   - failure then find_the_first(io, looking_for, size, buffer, current_pos, buf_size, buf_size - (size-1)),
  1058 + failure then
  1059 + find_the_first(io, looking_for, size, buffer, current_pos, buf_size, buf_size - (size-1)),
1053 1060 success(pos) then
1054   - println("pattern ["+looking_for+"] found at offset "+(current_pos+pos));
  1061 + //println("pattern ["+to_string(looking_for)+"] found at offset "+(current_pos+pos));
1055 1062 success(current_pos + pos)
1056 1063 }
1057 1064 .
... ... @@ -1126,7 +1133,7 @@ public define Maybe(Int)
1126 1133 Data_IO io,
1127 1134 String search_string
1128 1135 ) =
1129   - find_the_first(io, search_string, length(search_string), "", 0, 0, 0).
  1136 + find_the_first(io, to_byte_array(search_string), length(search_string), constant_byte_array(0,0), 0, 0, 0).
1130 1137  
1131 1138 public define Maybe(Int)
1132 1139 find_the_first
... ... @@ -1138,7 +1145,7 @@ public define Maybe(Int)
1138 1145 {
1139 1146 failure then failure,
1140 1147 success(f) then
1141   - find_the_first(make_data_io(f), search_string, length(search_string), "", 0, 0, 0)
  1148 + find_the_first(make_data_io(f), to_byte_array(search_string), length(search_string), constant_byte_array(0,0), 0, 0, 0)
1142 1149 }.
1143 1150  
1144 1151 public define Maybe(Int)
... ... @@ -1153,7 +1160,8 @@ public define Maybe(Int)
1153 1160 failure then failure,
1154 1161 success(f) then
1155 1162 with size = file_size(filename),
1156   - find_the_first(make_data_io(f, start_position, size - start_position), search_string, length(search_string), "", 0, 0, 0)
  1163 + //println("file size "+size);
  1164 + find_the_first(make_data_io(f, start_position, size - start_position), to_byte_array(search_string), length(search_string), constant_byte_array(0,0), 0, 0, 0)
1157 1165 }.
1158 1166  
1159 1167 public define Maybe(Int)
... ... @@ -1168,6 +1176,6 @@ public define Maybe(Int)
1168 1176 {
1169 1177 failure then failure,
1170 1178 success(f) then
1171   - find_the_first(make_data_io(f, start_position, end_position - start_position), search_string, length(search_string), "", 0, 0, 0)
  1179 + find_the_first(make_data_io(f, start_position, end_position - start_position), to_byte_array(search_string), length(search_string), constant_byte_array(0,0), 0, 0, 0)
1172 1180 }.
1173 1181  
... ...
anubis_dev/library/tools/line_reader.anubis
... ... @@ -38,7 +38,7 @@
38 38  
39 39 //---------------------------------------------------------------------------
40 40  
41   -read lexical_analysis/fast_lexer_3.anubis
  41 +read lexical_analysis/fast_lexer_4.anubis
42 42  
43 43  
44 44 type Token:
... ... @@ -46,13 +46,13 @@ type Token:
46 46 eol.
47 47  
48 48 public type LineReaderLexer:
49   - line_reader_lexer(LexingStream(One) -> One -> LexerOutput(Token) /*lexer_base*/).
  49 + line_reader_lexer((LexingStream, One) -> One -> LexerOutput(Token) /*lexer_base*/).
50 50  
51 51 public type LineReader:
52 52 line_reader(One -> LexerOutput(Token) /*lexer*/,
53 53 One -> Int /*offset*/,
54 54 LineReaderLexer /*lexer_base*/,
55   - LexingStream(One) lexing_stream).
  55 + LexingStream lexing_stream).
56 56  
57 57 public define Int
58 58 current_offset
... ... @@ -96,30 +96,30 @@ public define Maybe(String)
96 96 public define Maybe(LineReaderLexer)
97 97 make_line_reader_lexer
98 98 =
99   - if make_lexer_and_automaton([
100   - lexer_item("#r?#n", return((ByteArray b, LexingTools t, One aux) |-> token(eol))),
101   - lexer_item("#r", return((ByteArray b, LexingTools t, One aux) |-> token(eol))),
102   - lexer_item("[^\r\n]*", return((ByteArray b, LexingTools t, One aux) |-> token(line(to_string(b))))),
103   - ],
  99 + if make_lexer([
  100 + lexer_item("#r?#n", return((ByteArray b, LexingTools t, One aux) |-> token(eol))),
  101 + lexer_item("#r", return((ByteArray b, LexingTools t, One aux) |-> token(eol))),
  102 + lexer_item("[^\r\n]*", return((ByteArray b, LexingTools t, One aux) |-> token(line(to_string(b))))),
  103 + ],
104 104 '#') is
105 105 {
106 106 error(msg) then print("Syntax error in regular expression: "+to_English(msg)+"\n"); failure,
107   - ok(p) then if p is (lexer, automaton) then success(line_reader_lexer(lexer))
  107 + ok(lexer) then success(line_reader_lexer(lexer))
108 108 }.
109 109  
110 110 public define LineReader
111 111 make_line_reader
112 112 (
113   - LexingStream(One) ls,
  113 + LexingStream ls,
114 114 LineReaderLexer make_lexer
115 115 ) =
116 116 if make_lexer is line_reader_lexer(lexer) then
117   - line_reader(lexer(ls), (One u) |-> offset(ls), make_lexer, ls).
  117 + line_reader(lexer(ls, unique), (One u) |-> offset(ls), make_lexer, ls).
118 118  
119 119 public define Maybe(LineReader)
120 120 make_line_reader
121 121 (
122   - LexingStream(One) ls,
  122 + LexingStream ls,
123 123 ) =
124 124 if make_line_reader_lexer is
125 125 {
... ... @@ -135,7 +135,7 @@ public define Maybe(LineReader)
135 135 (
136 136 String s,
137 137 ) =
138   - make_line_reader(make_lexing_stream("", s, unique)).
  138 + make_line_reader(make_lexing_stream("", s)).
139 139  
140 140 public define Maybe(LineReader)
141 141 make_line_reader
... ... @@ -146,8 +146,7 @@ public define Maybe(LineReader)
146 146 if make_lexing_stream("", /* preambule */
147 147 f, /* the opened file */
148 148 65536, /* size of buffer for the lexing stream */
149   - timeout, /* timeout (seconds) */
150   - unique)
  149 + timeout) /* timeout (seconds) */
151 150 is
152 151 {
153 152 failure then print("cannot make lexing stream.\n"); failure,
... ... @@ -159,7 +158,7 @@ public define LineReader
159 158 reset_line_reader
160 159 (
161 160 LineReader lr,
162   - LexingStream(One) ls,
  161 + LexingStream ls,
163 162 ) =
164 163 if lr is line_reader(lexer, offset, make_lexer, _) then
165 164 make_line_reader(ls, make_lexer).
... ...