Commit d66e43f4abbd9d7538bf1e9af08e3b02a7ab89ef

Authored by Alain Prouté
1 parent 3a6efd40

Updated data_base/read_csv.anubis to use fast_lexer_5 instead of fast_lexer_4.

Showing 1 changed file with 48 additions and 31 deletions   Show diff stats
anubis_dev/library/data_base/read_csv.anubis
... ... @@ -6,7 +6,7 @@
6 6  
7 7 read tools/basis.anubis
8 8 read tools/time.anubis
9   -read lexical_analysis/fast_lexer_4.anubis
  9 +read lexical_analysis/fast_lexer_5.anubis
10 10  
11 11  
12 12 public type ReadCsvResult:
... ... @@ -38,7 +38,7 @@ type CellPrefixToken: // reading the beginning of a cell until the firs
38 38  
39 39 This lexer is for reading the beginning of a cell.
40 40  
41   -define List(LexerItem4(CellPrefixToken,One))
  41 +define List(LexerItem(CellPrefixToken,One))
42 42 begin_cell_description
43 43 (
44 44 String sep
... ... @@ -46,19 +46,19 @@ define List(LexerItem4(CellPrefixToken,One))
46 46 [
47 47 lexer_item("[# #t]*\"",
48 48 return(((Int,Int) -> ByteArray extract, Int l, LexingTools t, One u) |->
49   - token(double_quote))),
  49 + ok(double_quote))),
50 50  
51 51 lexer_item("[^#"+sep+"\"#r#n]*#"+sep,
52 52 return(((Int,Int) -> ByteArray extract, Int l, LexingTools t, One u) |->
53   - token(separator(extract(0,l-1))))),
  53 + ok(separator(extract(0,l-1))))),
54 54  
55 55 lexer_item("[^#"+sep+"\"#r#n]*#n" ,
56 56 return(((Int,Int) -> ByteArray extract, Int l, LexingTools t, One u) |->
57   - token(separator(extract(0,l-1))))),
  57 + ok(separator(extract(0,l-1))))),
58 58  
59 59 lexer_item("[^#"+sep+"\"#r#n]*(#r#n)" ,
60 60 return(((Int,Int) -> ByteArray extract, Int l, LexingTools t, One u) |->
61   - token(separator(extract(0,l-2)))))
  61 + ok(separator(extract(0,l-2)))))
62 62 ].
63 63  
64 64  
... ... @@ -66,52 +66,58 @@ define List(LexerItem4(CellPrefixToken,One))
66 66  
67 67  
68 68 type InToken:
  69 + eof,
69 70 double_quote, // can also be the end of file
70 71 two_double_quotes,
71 72 part(ByteArray). // part of cell
72 73  
73 74  
74   -define List(LexerItem4(InToken,One))
  75 +define List(LexerItem(InToken,One))
75 76 read_quoted_cell_description
76 77 (
77 78 String sep
78 79 ) =
79 80 [
80 81 lexer_item("[^\"]*" ,
81   - return((ByteArray b, LexingTools t, One u) |-> token(part(b)))),
  82 + return((ByteArray b, LexingTools t, One u) |-> ok(part(b)))),
82 83  
83 84 lexer_item("\"\"" ,
84   - return((ByteArray b, LexingTools t, One u) |-> token(two_double_quotes))),
  85 + return((ByteArray b, LexingTools t, One u) |-> ok(two_double_quotes))),
85 86  
86 87 lexer_item("\"[# #t]*(("+sep+")|(#n)|(#r#n))" ,
87   - return((ByteArray b, LexingTools t, One u) |-> token(double_quote)))
  88 + return((ByteArray b, LexingTools t, One u) |-> ok(double_quote)))
88 89 ].
89 90  
90 91  
91 92 The lexer described below skips a cell (and eats the trailing separator).
92 93  
93   -define List(LexerItem4(One,One))
  94 +type SkipToken:
  95 + eof,
  96 + skiped.
  97 +
  98 +define List(LexerItem(SkipToken,One))
94 99 skip_cell_description
95 100 (
96 101 String sep
97 102 ) =
98 103 [
99 104 lexer_item("(([^\"#n#r#"+sep+"]*)|([# #t]*\"([^\"]|(\"\"))*\"[# #t]*))#"+sep,
100   - return(((Int,Int) -> ByteArray b, Int l, LexingTools t, One u) |-> token(unique)))
  105 + return(((Int,Int) -> ByteArray b, Int l, LexingTools t, One u) |-> ok(skiped)))
101 106 ].
102 107  
103 108 The lexer described below skips to end of line (and eats the end of line).
104 109  
105 110 type EOL_Token:
  111 + eof,
106 112 eol_offset(Int offset).
107 113  
108   -define List(LexerItem4(EOL_Token,One))
  114 +define List(LexerItem(EOL_Token,One))
109 115 to_eol_description
110 116 =
111 117 [
112 118 lexer_item("([^#r#n]*)((#n)|(#r#n))",
113 119 return(((Int,Int) -> ByteArray b, Int l, LexingTools t, One u) |->
114   - token(eol_offset(t.offset(unique)))))
  120 + ok(eol_offset(t.offset(unique)))))
115 121 ].
116 122  
117 123  
... ... @@ -234,7 +240,17 @@ define List(One -> CB_Result)
234 240 }, r)).
235 241  
236 242  
237   -
  243 +define String
  244 + format
  245 + (
  246 + LexicalError(One) e
  247 + ) =
  248 + if e is
  249 + {
  250 + lex_error(b,t,a) then "error: '"+to_string(b)+"' at line "+line(t)(unique),
  251 + lex_error(message) then message,
  252 + other_error(a) then should_not_happen("unknown error")
  253 + }.
238 254  
239 255 public define One -> ReadCsvResult
240 256 make_read_csv_line
... ... @@ -243,28 +259,26 @@ public define One -> ReadCsvResult
243 259 String sep,
244 260 List(Int) cols_to_get
245 261 ) =
246   - with lex_skip = retrieve_lexer(skip_cell_description(sep), if sep = "," then csv_c_skip_cell else csv_s_skip_cell)(ls,unique),
247   - lex_begin = retrieve_lexer(begin_cell_description(sep), if sep = "," then csv_c_begin_cell else csv_s_begin_cell)(ls,unique),
248   - lex_in = retrieve_lexer(read_quoted_cell_description(sep), if sep = "," then csv_c_quoted_cell else csv_s_quoted_cell)(ls,unique),
249   - lex_eol = retrieve_lexer(to_eol_description, csv_to_eol)(ls,unique),
  262 + with lex_skip = retrieve_lexer(skip_cell_description(sep), if sep = "," then csv_c_skip_cell else csv_s_skip_cell, eof)(ls,unique),
  263 + lex_begin = retrieve_lexer(begin_cell_description(sep), if sep = "," then csv_c_begin_cell else csv_s_begin_cell, eof)(ls,unique),
  264 + lex_in = retrieve_lexer(read_quoted_cell_description(sep), if sep = "," then csv_c_quoted_cell else csv_s_quoted_cell,eof)(ls,unique),
  265 + lex_eol = retrieve_lexer(to_eol_description, csv_to_eol, eof)(ls,unique),
250 266 skip_cell = (One u) |-> (CB_Result)if lex_skip(u) is
251 267 {
252   - end_of_input then eof,
253   - error(b,line,col) then error("skip "+line+":"+col+" :"+to_string(b)),
254   - token(t) then skip
  268 + error(e) then error(format(e)),
  269 + ok(_) then skip
255 270 },
256 271 begin_cell = (One u) |-> (Result(String,CellPrefixToken))if lex_begin(u) is
257 272 {
258   - end_of_input then ok(eof),
259   - error(b,line,col) then error("begin "+to_string(b)),
260   - token(t) then ok(t)
  273 + error(e) then error(format(e)),
  274 + ok(t) then ok(t)
261 275 },
262 276 read_in_aux = (List(ByteArray) so_far) |-aux-> (CB_Result)if lex_in(unique) is
263 277 {
264   - end_of_input then eof,
265   - error(b,line,col) then error("in "+to_string(b)),
266   - token(t) then if t is
  278 + error(e) then error(format(e)),
  279 + ok(t) then if t is
267 280 {
  281 + eof then eof,
268 282 double_quote then cell(to_string(concat(reverse(so_far)))),
269 283 two_double_quotes then aux([{0x22} . so_far]),
270 284 part(p) then aux([p . so_far])
... ... @@ -283,9 +297,12 @@ public define One -> ReadCsvResult
283 297 },
284 298 to_eol = (One u) |-> if lex_eol(u) is
285 299 {
286   - end_of_input then eof,
287   - error(b,line,col) then error("eol "+to_string(b)),
288   - token(t) then if t is eol_offset(offset) then eol(offset)
  300 + error(e) then error(format(e)),
  301 + ok(t) then if t is
  302 + {
  303 + eof then eof,
  304 + eol_offset(offset) then eol(offset)
  305 + }
289 306 },
290 307 make_read_csv_line(make_cbs(skip_cell,read_cell,to_eol,cols_to_get),
291 308 (One u) |-> forget(to_eol(u))).
... ...