Commit c9b7e42e3c0247c882a47e890be76123ddefb8d4
1 parent
d57304cc
[~] Enhancement of the read-CSV file. It now returns a list of String instead of ByteArray, together with the offset in the file of the last returned line. This allows keeping the offset so that the importation can be restarted if the process was interrupted. [!] Fix find_the_first by using ByteArray instead of String: the Boyer-Moore algorithm is now in the VM, and it was being used to search for a String inside binary data, so the searching process stopped as soon as the first NULL byte was encountered. Hence it was a bug to use String instead of ByteArray. [!] line_reader now uses fast_lexer_4
Showing
3 changed files
with
110 additions
and
57 deletions
Show diff stats
anubis_dev/library/data_base/read_csv.anubis
| ... | ... | @@ -5,6 +5,7 @@ |
| 5 | 5 | |
| 6 | 6 | |
| 7 | 7 | read tools/basis.anubis |
| 8 | +read tools/time.anubis | |
| 8 | 9 | read lexical_analysis/fast_lexer_4.anubis |
| 9 | 10 | |
| 10 | 11 | |
| ... | ... | @@ -12,8 +13,8 @@ read lexical_analysis/fast_lexer_4.anubis |
| 12 | 13 | |
| 13 | 14 | public type ReadCsvResult: |
| 14 | 15 | end_of_input, |
| 15 | - error(String), // an error message | |
| 16 | - ok(List(ByteArray)). // a single record | |
| 16 | + error(String), // an error message | |
| 17 | + ok((Int, List(String))). // a single record and the offset of last read token | |
| 17 | 18 | |
| 18 | 19 | public define One -> ReadCsvResult |
| 19 | 20 | make_read_csv_line |
| ... | ... | @@ -99,12 +100,15 @@ define List(LexerItem(One,One)) |
| 99 | 100 | ]. |
| 100 | 101 | |
| 101 | 102 | The lexer described below skips to end of line (and eats the end of line). |
| 103 | + | |
| 104 | +type EOL_Token: | |
| 105 | + eol_offset(Int offset). | |
| 102 | 106 | |
| 103 | -define List(LexerItem(One,One)) | |
| 107 | +define List(LexerItem(EOL_Token,One)) | |
| 104 | 108 | to_eol_description |
| 105 | 109 | = |
| 106 | 110 | [ |
| 107 | - lexer_item("([^#r#n]*)((#n)|(#r#n))", return(((Int,Int) -> ByteArray b, Int l, LexingTools t, One u) |-> token(unique))) | |
| 111 | + lexer_item("([^#r#n]*)((#n)|(#r#n))", return(((Int,Int) -> ByteArray b, Int l, LexingTools t, One u) |-> token(eol_offset(t.offset(unique))))) | |
| 108 | 112 | ]. |
| 109 | 113 | |
| 110 | 114 | |
| ... | ... | @@ -133,7 +137,7 @@ read generated/csv_s_skip_cell.anubis |
| 133 | 137 | read generated/csv_to_eol.anubis |
| 134 | 138 | |
| 135 | 139 | |
| 136 | -define One | |
| 140 | + define One | |
| 137 | 141 | repeat |
| 138 | 142 | ( |
| 139 | 143 | Int n, |
| ... | ... | @@ -142,7 +146,7 @@ define One |
| 142 | 146 | if n =< 0 then unique else f(unique); repeat(n-1,f). |
| 143 | 147 | |
| 144 | 148 | |
| 145 | -define Int -> Result(String,ByteArray) | |
| 149 | + define Int -> Result(String,ByteArray) | |
| 146 | 150 | read_next_cell |
| 147 | 151 | ( |
| 148 | 152 | One -> One skip_cell, |
| ... | ... | @@ -155,7 +159,8 @@ type CB_Result: |
| 155 | 159 | eof, |
| 156 | 160 | error(String), |
| 157 | 161 | skip, |
| 158 | - cell(ByteArray). | |
| 162 | + cell(String), | |
| 163 | + eol(Int offset). | |
| 159 | 164 | |
| 160 | 165 | |
| 161 | 166 | define One -> ReadCsvResult |
| ... | ... | @@ -164,25 +169,29 @@ define One -> ReadCsvResult |
| 164 | 169 | List(One -> CB_Result) cbs, |
| 165 | 170 | One -> One to_eol |
| 166 | 171 | ) = |
| 167 | - with f = (List(One -> CB_Result) l) |-f1-> | |
| 168 | - if l is | |
| 172 | + with f = (List(One -> CB_Result) l, List(String) so_far) |-f1-> | |
| 173 | + if l is | |
| 169 | 174 | { |
| 170 | - [ ] then ok([ ]), | |
| 175 | + [ ] then ok((0, reverse(so_far))), //ok([ ]), | |
| 171 | 176 | [f . g] then |
| 172 | 177 | if f(unique) is |
| 173 | 178 | { |
| 174 | 179 | eof then end_of_input, |
| 175 | 180 | error(e) then to_eol(unique); error(e), |
| 176 | - skip then (ReadCsvResult)f1(g) | |
| 177 | - cell(c) then if (ReadCsvResult)f1(g) is | |
| 178 | - { | |
| 179 | - end_of_input then end_of_input, | |
| 180 | - error(e) then to_eol(unique); error(e), | |
| 181 | - ok(k) then ok([c . k]) | |
| 182 | - } | |
| 181 | + skip then (ReadCsvResult)f1(g, so_far), | |
| 182 | + cell(c) then | |
| 183 | + //print("read cell ["+c+"] "); | |
| 184 | + (ReadCsvResult)f1(g, [c. so_far]), | |
| 185 | + eol(offset) then | |
| 186 | + if g is | |
| 187 | + { | |
| 188 | + [] then ok((offset, reverse(so_far))), | |
| 189 | + [_ . _] then error("End Of Line unexpected") | |
| 190 | + } | |
| 191 | + | |
| 183 | 192 | } |
| 184 | 193 | }, |
| 185 | - (One u) |-> f(cbs). | |
| 194 | + (One u) |-> f(cbs, []). | |
| 186 | 195 | |
| 187 | 196 | |
| 188 | 197 | |
| ... | ... | @@ -253,7 +262,7 @@ public define One -> ReadCsvResult |
| 253 | 262 | error(b,line,col) then error("in "+to_string(b)), |
| 254 | 263 | token(t) then if t is |
| 255 | 264 | { |
| 256 | - double_quote then cell(concat(reverse(so_far))), | |
| 265 | + double_quote then cell(to_string(concat(reverse(so_far)))), | |
| 257 | 266 | two_double_quotes then aux([{0x22} . so_far]), |
| 258 | 267 | part(p) then aux([p . so_far]) |
| 259 | 268 | } |
| ... | ... | @@ -266,14 +275,14 @@ public define One -> ReadCsvResult |
| 266 | 275 | { |
| 267 | 276 | eof then eof, |
| 268 | 277 | double_quote then read_in(u), |
| 269 | - separator(c) then cell(c) | |
| 278 | + separator(c) then cell(to_string(c)) | |
| 270 | 279 | } |
| 271 | 280 | }, |
| 272 | 281 | to_eol = (One u) |-> if lex_eol(u) is |
| 273 | 282 | { |
| 274 | 283 | end_of_input then eof, |
| 275 | 284 | error(b,line,col) then error("eol "+to_string(b)), |
| 276 | - token(t) then skip | |
| 285 | + token(t) then if t is eol_offset(offset) then eol(offset) | |
| 277 | 286 | }, |
| 278 | 287 | make_read_csv_line(make_cbs(skip_cell,read_cell,to_eol,cols_to_get), |
| 279 | 288 | (One u) |-> forget(to_eol(u))). |
| ... | ... | @@ -303,12 +312,12 @@ define Maybe(List(Int)) |
| 303 | 312 | |
| 304 | 313 | |
| 305 | 314 | define One |
| 306 | ||
| 315 | + print_csv_line | |
| 307 | 316 | ( |
| 308 | - List(ByteArray) l | |
| 317 | + List(String) l | |
| 309 | 318 | ) = |
| 310 | 319 | print("| "); |
| 311 | - map_forget((ByteArray b) |-> print(to_string(b)+" | "),l); | |
| 320 | + map_forget((String b) |-> print(b+" | "),l); | |
| 312 | 321 | print("\n"). |
| 313 | 322 | |
| 314 | 323 | |
| ... | ... | @@ -322,14 +331,47 @@ define One |
| 322 | 331 | ( |
| 323 | 332 | One -> ReadCsvResult f |
| 324 | 333 | ) = |
| 334 | + println("----"); | |
| 325 | 335 | if f(unique) is |
| 326 | 336 | { |
| 327 | 337 | end_of_input then print("----------------------\n"), |
| 328 | - error(e) then print(e+"\n"); print_to_error(f), | |
| 329 | - ok(l) then print(l); print_to_error(f) | |
| 338 | + error(e) then print("error ["+e+"]\n"); print_to_error(f), | |
| 339 | + ok(l) then if l is (offset, n) then print("[eol offset = "+offset+"] ");print_csv_line(n); print_to_error(f) | |
| 330 | 340 | }. |
| 331 | 341 | |
| 332 | - | |
| 342 | +define One | |
| 343 | + show_perf | |
| 344 | + ( | |
| 345 | + One -> ReadCsvResult f, | |
| 346 | + Int left, | |
| 347 | + Int read_line, | |
| 348 | + Int block_size, | |
| 349 | + UTime start_time | |
| 350 | + ) = | |
| 351 | + if f(unique) is | |
| 352 | + { | |
| 353 | + end_of_input then show_duration("lines read "+read_line, start_time); | |
| 354 | + print("----------------------\n"), | |
| 355 | + error(e) then print("error ["+e+"]\n"); print_to_error(f), | |
| 356 | + ok(l) then | |
| 357 | + with left = if left = 1 then | |
| 358 | + show_duration("lines read "+read_line+1, start_time); | |
| 359 | + block_size | |
| 360 | + else | |
| 361 | + left -1, | |
| 362 | + show_perf(f, left, read_line+1, block_size, start_time) | |
| 363 | + }. | |
| 364 | + | |
| 365 | + | |
| 366 | +define One | |
| 367 | + show_perf | |
| 368 | + ( | |
| 369 | + One -> ReadCsvResult f, | |
| 370 | + Int block_size | |
| 371 | + )= | |
| 372 | + show_perf(f, block_size, 0, block_size, unow) | |
| 373 | + . | |
| 374 | + | |
| 333 | 375 | global define One |
| 334 | 376 | read_csv_file |
| 335 | 377 | ( |
| ... | ... | @@ -337,9 +379,12 @@ global define One |
| 337 | 379 | ) = |
| 338 | 380 | if args is |
| 339 | 381 | { |
| 340 | - [ ] then syntax, | |
| 341 | - [path . t] then if t is | |
| 382 | + [ ] then syntax, | |
| 383 | + [path . t] then | |
| 384 | + println("file "+path); | |
| 385 | + if t is | |
| 342 | 386 | { |
| 387 | + | |
| 343 | 388 | [ ] then syntax, |
| 344 | 389 | [sep . l] then if sep:[",",";"] |
| 345 | 390 | then |
| ... | ... | @@ -357,6 +402,7 @@ global define One |
| 357 | 402 | success(ls) then |
| 358 | 403 | with cs = no_doubles(qsort(cols,(Int x, Int y) |-> x < y)), |
| 359 | 404 | read_line = make_read_csv_line(ls,sep,cs), |
| 405 | +// show_perf(read_line, 10000) | |
| 360 | 406 | print_to_error(read_line) |
| 361 | 407 | } |
| 362 | 408 | } | ... | ... |
anubis_dev/library/system/files.anubis
| ... | ... | @@ -1020,14 +1020,14 @@ define Maybe(Int) |
| 1020 | 1020 | find_the_first |
| 1021 | 1021 | ( |
| 1022 | 1022 | Data_IO io, |
| 1023 | - String looking_for, //String to search | |
| 1023 | + ByteArray looking_for, //String to search | |
| 1024 | 1024 | Int size, //size of the string to search |
| 1025 | - String buffer, | |
| 1025 | + ByteArray buffer, | |
| 1026 | 1026 | Int current_pos, |
| 1027 | 1027 | Int buf_size, |
| 1028 | 1028 | Int buf_pos |
| 1029 | 1029 | )= |
| 1030 | - //println("buf_size :"+buf_size+ " buf_pos :"+buf_pos + " size : "+size); | |
| 1030 | + //println("general current pos: "+current_pos+" | buffer size: "+buf_size+ " | buffer pos: "+buf_pos + " | search size: "+size); | |
| 1031 | 1031 | if (buf_size - buf_pos) < size then |
| 1032 | 1032 | //println("New buffer request current pos "+current_pos+" buffer_pos "+buf_pos); |
| 1033 | 1033 | if read_bytes(io, 65536) is // <- block size is 64k |
| ... | ... | @@ -1035,23 +1035,30 @@ define Maybe(Int) |
| 1035 | 1035 | failure then println("read_bytes failure");failure, //finish |
| 1036 | 1036 | time_out then println("read_bytes timeout");failure, //finish |
| 1037 | 1037 | success(ba) then |
| 1038 | - with new_buffer = to_string(extract(to_byte_array(buffer), buf_pos, buf_size) + ba), | |
| 1038 | + //println("length of ba "+length(ba)); | |
| 1039 | + with ex_ba = extract(buffer, buf_pos, buf_size), | |
| 1040 | + //println("length of ex_ba "+length(ex_ba)); | |
| 1041 | + with new_ba = ex_ba + ba, | |
| 1042 | + //println("length of new_ba "+length(new_ba)); | |
| 1043 | + with new_buffer = ex_ba + ba, | |
| 1039 | 1044 | //println("SUCCESS New buffer length "+length(new_buffer)+" new current_pos "+current_pos); |
| 1040 | 1045 | find_the_first(io, looking_for, size, new_buffer, current_pos + buf_pos, length(new_buffer), 0), |
| 1041 | 1046 | truncated(ba) then |
| 1042 | 1047 | if length(ba) = 0 then |
| 1048 | + //println("last buffer current position ["+current_pos+"]"); | |
| 1043 | 1049 | failure //finish |
| 1044 | 1050 | else |
| 1045 | - with new_buffer = to_string(extract(to_byte_array(buffer), buf_pos, buf_size) + ba), | |
| 1051 | + with new_buffer = extract(buffer, buf_pos, buf_size) + ba, | |
| 1046 | 1052 | // println("TRUNCATED New buffer length "+length(new_buffer)+" new current_pos "+current_pos); |
| 1047 | 1053 | find_the_first(io, looking_for, size, new_buffer, current_pos + buf_pos, length(new_buffer), 0) |
| 1048 | 1054 | } |
| 1049 | 1055 | else |
| 1050 | - if find_string(buffer, looking_for, buf_pos) is | |
| 1056 | + if find_byte_array(buffer, looking_for, buf_pos) is | |
| 1051 | 1057 | { |
| 1052 | - failure then find_the_first(io, looking_for, size, buffer, current_pos, buf_size, buf_size - (size-1)), | |
| 1058 | + failure then | |
| 1059 | + find_the_first(io, looking_for, size, buffer, current_pos, buf_size, buf_size - (size-1)), | |
| 1053 | 1060 | success(pos) then |
| 1054 | - println("pattern ["+looking_for+"] found at offset "+(current_pos+pos)); | |
| 1061 | + //println("pattern ["+to_string(looking_for)+"] found at offset "+(current_pos+pos)); | |
| 1055 | 1062 | success(current_pos + pos) |
| 1056 | 1063 | } |
| 1057 | 1064 | . |
| ... | ... | @@ -1126,7 +1133,7 @@ public define Maybe(Int) |
| 1126 | 1133 | Data_IO io, |
| 1127 | 1134 | String search_string |
| 1128 | 1135 | ) = |
| 1129 | - find_the_first(io, search_string, length(search_string), "", 0, 0, 0). | |
| 1136 | + find_the_first(io, to_byte_array(search_string), length(search_string), constant_byte_array(0,0), 0, 0, 0). | |
| 1130 | 1137 | |
| 1131 | 1138 | public define Maybe(Int) |
| 1132 | 1139 | find_the_first |
| ... | ... | @@ -1138,7 +1145,7 @@ public define Maybe(Int) |
| 1138 | 1145 | { |
| 1139 | 1146 | failure then failure, |
| 1140 | 1147 | success(f) then |
| 1141 | - find_the_first(make_data_io(f), search_string, length(search_string), "", 0, 0, 0) | |
| 1148 | + find_the_first(make_data_io(f), to_byte_array(search_string), length(search_string), constant_byte_array(0,0), 0, 0, 0) | |
| 1142 | 1149 | }. |
| 1143 | 1150 | |
| 1144 | 1151 | public define Maybe(Int) |
| ... | ... | @@ -1153,7 +1160,8 @@ public define Maybe(Int) |
| 1153 | 1160 | failure then failure, |
| 1154 | 1161 | success(f) then |
| 1155 | 1162 | with size = file_size(filename), |
| 1156 | - find_the_first(make_data_io(f, start_position, size - start_position), search_string, length(search_string), "", 0, 0, 0) | |
| 1163 | + //println("file size "+size); | |
| 1164 | + find_the_first(make_data_io(f, start_position, size - start_position), to_byte_array(search_string), length(search_string), constant_byte_array(0,0), 0, 0, 0) | |
| 1157 | 1165 | }. |
| 1158 | 1166 | |
| 1159 | 1167 | public define Maybe(Int) |
| ... | ... | @@ -1168,6 +1176,6 @@ public define Maybe(Int) |
| 1168 | 1176 | { |
| 1169 | 1177 | failure then failure, |
| 1170 | 1178 | success(f) then |
| 1171 | - find_the_first(make_data_io(f, start_position, end_position - start_position), search_string, length(search_string), "", 0, 0, 0) | |
| 1179 | + find_the_first(make_data_io(f, start_position, end_position - start_position), to_byte_array(search_string), length(search_string), constant_byte_array(0,0), 0, 0, 0) | |
| 1172 | 1180 | }. |
| 1173 | 1181 | ... | ... |
anubis_dev/library/tools/line_reader.anubis
| ... | ... | @@ -38,7 +38,7 @@ |
| 38 | 38 | |
| 39 | 39 | //--------------------------------------------------------------------------- |
| 40 | 40 | |
| 41 | -read lexical_analysis/fast_lexer_3.anubis | |
| 41 | +read lexical_analysis/fast_lexer_4.anubis | |
| 42 | 42 | |
| 43 | 43 | |
| 44 | 44 | type Token: |
| ... | ... | @@ -46,13 +46,13 @@ type Token: |
| 46 | 46 | eol. |
| 47 | 47 | |
| 48 | 48 | public type LineReaderLexer: |
| 49 | - line_reader_lexer(LexingStream(One) -> One -> LexerOutput(Token) /*lexer_base*/). | |
| 49 | + line_reader_lexer((LexingStream, One) -> One -> LexerOutput(Token) /*lexer_base*/). | |
| 50 | 50 | |
| 51 | 51 | public type LineReader: |
| 52 | 52 | line_reader(One -> LexerOutput(Token) /*lexer*/, |
| 53 | 53 | One -> Int /*offset*/, |
| 54 | 54 | LineReaderLexer /*lexer_base*/, |
| 55 | - LexingStream(One) lexing_stream). | |
| 55 | + LexingStream lexing_stream). | |
| 56 | 56 | |
| 57 | 57 | public define Int |
| 58 | 58 | current_offset |
| ... | ... | @@ -96,30 +96,30 @@ public define Maybe(String) |
| 96 | 96 | public define Maybe(LineReaderLexer) |
| 97 | 97 | make_line_reader_lexer |
| 98 | 98 | = |
| 99 | - if make_lexer_and_automaton([ | |
| 100 | - lexer_item("#r?#n", return((ByteArray b, LexingTools t, One aux) |-> token(eol))), | |
| 101 | - lexer_item("#r", return((ByteArray b, LexingTools t, One aux) |-> token(eol))), | |
| 102 | - lexer_item("[^\r\n]*", return((ByteArray b, LexingTools t, One aux) |-> token(line(to_string(b))))), | |
| 103 | - ], | |
| 99 | + if make_lexer([ | |
| 100 | + lexer_item("#r?#n", return((ByteArray b, LexingTools t, One aux) |-> token(eol))), | |
| 101 | + lexer_item("#r", return((ByteArray b, LexingTools t, One aux) |-> token(eol))), | |
| 102 | + lexer_item("[^\r\n]*", return((ByteArray b, LexingTools t, One aux) |-> token(line(to_string(b))))), | |
| 103 | + ], | |
| 104 | 104 | '#') is |
| 105 | 105 | { |
| 106 | 106 | error(msg) then print("Syntax error in regular expression: "+to_English(msg)+"\n"); failure, |
| 107 | - ok(p) then if p is (lexer, automaton) then success(line_reader_lexer(lexer)) | |
| 107 | + ok(lexer) then success(line_reader_lexer(lexer)) | |
| 108 | 108 | }. |
| 109 | 109 | |
| 110 | 110 | public define LineReader |
| 111 | 111 | make_line_reader |
| 112 | 112 | ( |
| 113 | - LexingStream(One) ls, | |
| 113 | + LexingStream ls, | |
| 114 | 114 | LineReaderLexer make_lexer |
| 115 | 115 | ) = |
| 116 | 116 | if make_lexer is line_reader_lexer(lexer) then |
| 117 | - line_reader(lexer(ls), (One u) |-> offset(ls), make_lexer, ls). | |
| 117 | + line_reader(lexer(ls, unique), (One u) |-> offset(ls), make_lexer, ls). | |
| 118 | 118 | |
| 119 | 119 | public define Maybe(LineReader) |
| 120 | 120 | make_line_reader |
| 121 | 121 | ( |
| 122 | - LexingStream(One) ls, | |
| 122 | + LexingStream ls, | |
| 123 | 123 | ) = |
| 124 | 124 | if make_line_reader_lexer is |
| 125 | 125 | { |
| ... | ... | @@ -135,7 +135,7 @@ public define Maybe(LineReader) |
| 135 | 135 | ( |
| 136 | 136 | String s, |
| 137 | 137 | ) = |
| 138 | - make_line_reader(make_lexing_stream("", s, unique)). | |
| 138 | + make_line_reader(make_lexing_stream("", s)). | |
| 139 | 139 | |
| 140 | 140 | public define Maybe(LineReader) |
| 141 | 141 | make_line_reader |
| ... | ... | @@ -146,8 +146,7 @@ public define Maybe(LineReader) |
| 146 | 146 | if make_lexing_stream("", /* preambule */ |
| 147 | 147 | f, /* the opened file */ |
| 148 | 148 | 65536, /* size of buffer for the lexing stream */ |
| 149 | - timeout, /* timeout (seconds) */ | |
| 150 | - unique) | |
| 149 | + timeout) /* timeout (seconds) */ | |
| 151 | 150 | is |
| 152 | 151 | { |
| 153 | 152 | failure then print("cannot make lexing stream.\n"); failure, |
| ... | ... | @@ -159,7 +158,7 @@ public define LineReader |
| 159 | 158 | reset_line_reader |
| 160 | 159 | ( |
| 161 | 160 | LineReader lr, |
| 162 | - LexingStream(One) ls, | |
| 161 | + LexingStream ls, | |
| 163 | 162 | ) = |
| 164 | 163 | if lr is line_reader(lexer, offset, make_lexer, _) then |
| 165 | 164 | make_line_reader(ls, make_lexer). | ... | ... |