Commit d66e43f4abbd9d7538bf1e9af08e3b02a7ab89ef

Authored by Alain Prouté
1 parent 3a6efd40

Updated data_base/read_csv.anubis to use fast_lexer_5 instead of fast_lexer_4.
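The migration is mechanical but touches every lexer description and call site in the file. In outline (an editorial sketch inferred from the diff below, not code from this commit; MyToken, my_token, my_description and my_slot are made-up names, and the fast_lexer_4/fast_lexer_5 signatures are assumed from what the diff shows):

    // fast_lexer_4: lexing results had three cases, and descriptions
    // wrapped tokens with token(...):
    //     define List(LexerItem4(MyToken,One)) ... token(my_token(b)) ...
    //     if lex(u) is { end_of_input then ...,
    //                    error(b,line,col) then ...,
    //                    token(t) then ... }

    // fast_lexer_5: the token type itself carries an explicit eof
    // constructor, retrieve_lexer receives it as an extra argument,
    // and results reduce to ok/error:
    type MyToken:
        eof,
        my_token(ByteArray).

    define List(LexerItem(MyToken,One))
        my_description
        =
        [
            lexer_item("[a-z]+",
                return((ByteArray b, LexingTools t, One u) |-> ok(my_token(b))))
        ].

    // Call sites match two cases, handling eof inside the token match:
    //     with lex = retrieve_lexer(my_description, my_slot, eof)(ls,unique),
    //     if lex(u) is { error(e) then ...,
    //                    ok(t) then if t is { eof then ..., my_token(b) then ... } }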

Showing 1 changed file with 48 additions and 31 deletions
anubis_dev/library/data_base/read_csv.anubis
@@ -6,7 +6,7 @@
 
 read tools/basis.anubis
 read tools/time.anubis
-read lexical_analysis/fast_lexer_4.anubis
+read lexical_analysis/fast_lexer_5.anubis
 
 
 public type ReadCsvResult:
@@ -38,7 +38,7 @@ type CellPrefixToken: // reading the beginning of a cell until the firs
 
 This lexer if for reading the beginning of a cell.
 
-define List(LexerItem4(CellPrefixToken,One))
+define List(LexerItem(CellPrefixToken,One))
 begin_cell_description
 (
     String sep
@@ -46,19 +46,19 @@ define List(LexerItem4(CellPrefixToken,One))
 [
     lexer_item("[# #t]*\"",
         return(((Int,Int) -> ByteArray extract, Int l, LexingTools t, One u) |->
-            token(double_quote))),
+            ok(double_quote))),
 
     lexer_item("[^#"+sep+"\"#r#n]*#"+sep,
         return(((Int,Int) -> ByteArray extract, Int l, LexingTools t, One u) |->
-            token(separator(extract(0,l-1))))),
+            ok(separator(extract(0,l-1))))),
 
     lexer_item("[^#"+sep+"\"#r#n]*#n" ,
         return(((Int,Int) -> ByteArray extract, Int l, LexingTools t, One u) |->
-            token(separator(extract(0,l-1))))),
+            ok(separator(extract(0,l-1))))),
 
     lexer_item("[^#"+sep+"\"#r#n]*(#r#n)" ,
         return(((Int,Int) -> ByteArray extract, Int l, LexingTools t, One u) |->
-            token(separator(extract(0,l-2)))))
+            ok(separator(extract(0,l-2)))))
 ].
@@ -66,52 +66,58 @@ define List(LexerItem4(CellPrefixToken,One))
 
 
 type InToken:
+    eof,
     double_quote, // can also be the end of file
     two_double_quotes,
     part(ByteArray). // part of cell
 
 
-define List(LexerItem4(InToken,One))
+define List(LexerItem(InToken,One))
 read_quoted_cell_description
 (
     String sep
 ) =
 [
     lexer_item("[^\"]*" ,
-        return((ByteArray b, LexingTools t, One u) |-> token(part(b)))),
+        return((ByteArray b, LexingTools t, One u) |-> ok(part(b)))),
 
     lexer_item("\"\"" ,
-        return((ByteArray b, LexingTools t, One u) |-> token(two_double_quotes))),
+        return((ByteArray b, LexingTools t, One u) |-> ok(two_double_quotes))),
 
     lexer_item("\"[# #t]*(("+sep+")|(#n)|(#r#n))" ,
-        return((ByteArray b, LexingTools t, One u) |-> token(double_quote)))
+        return((ByteArray b, LexingTools t, One u) |-> ok(double_quote)))
 ].
 
 
 The lexer described below skips a cell (and eats the trailing separator).
 
-define List(LexerItem4(One,One))
+type SkipToken:
+    eof,
+    skiped.
+
+define List(LexerItem(SkipToken,One))
 skip_cell_description
 (
     String sep
 ) =
 [
     lexer_item("(([^\"#n#r#"+sep+"]*)|([# #t]*\"([^\"]|(\"\"))*\"[# #t]*))#"+sep,
-        return(((Int,Int) -> ByteArray b, Int l, LexingTools t, One u) |-> token(unique)))
+        return(((Int,Int) -> ByteArray b, Int l, LexingTools t, One u) |-> ok(skiped)))
 ].
 
 The lexer described below skips to end of line (and eats the end of line).
 
 type EOL_Token:
+    eof,
     eol_offset(Int offset).
 
-define List(LexerItem4(EOL_Token,One))
+define List(LexerItem(EOL_Token,One))
 to_eol_description
 =
 [
     lexer_item("([^#r#n]*)((#n)|(#r#n))",
         return(((Int,Int) -> ByteArray b, Int l, LexingTools t, One u) |->
-            token(eol_offset(t.offset(unique)))))
+            ok(eol_offset(t.offset(unique)))))
 ].
 
 
@@ -234,7 +240,17 @@ define List(One -> CB_Result)
 }, r)).
 
 
-
+define String
+format
+(
+    LexicalError(One) e
+) =
+if e is
+{
+    lex_error(b,t,a) then "error: '"+to_string(b)+"' at line "+line(t)(unique),
+    lex_error(message) then message,
+    other_error(a) then should_not_happen("unknown error")
+}.
 
 public define One -> ReadCsvResult
 make_read_csv_line
@@ -243,28 +259,26 @@ public define One -> ReadCsvResult
     String sep,
     List(Int) cols_to_get
 ) =
-with lex_skip = retrieve_lexer(skip_cell_description(sep), if sep = "," then csv_c_skip_cell else csv_s_skip_cell)(ls,unique),
-     lex_begin = retrieve_lexer(begin_cell_description(sep), if sep = "," then csv_c_begin_cell else csv_s_begin_cell)(ls,unique),
-     lex_in = retrieve_lexer(read_quoted_cell_description(sep), if sep = "," then csv_c_quoted_cell else csv_s_quoted_cell)(ls,unique),
-     lex_eol = retrieve_lexer(to_eol_description, csv_to_eol)(ls,unique),
+with lex_skip = retrieve_lexer(skip_cell_description(sep), if sep = "," then csv_c_skip_cell else csv_s_skip_cell, eof)(ls,unique),
+     lex_begin = retrieve_lexer(begin_cell_description(sep), if sep = "," then csv_c_begin_cell else csv_s_begin_cell, eof)(ls,unique),
+     lex_in = retrieve_lexer(read_quoted_cell_description(sep), if sep = "," then csv_c_quoted_cell else csv_s_quoted_cell,eof)(ls,unique),
+     lex_eol = retrieve_lexer(to_eol_description, csv_to_eol, eof)(ls,unique),
      skip_cell = (One u) |-> (CB_Result)if lex_skip(u) is
      {
-        end_of_input then eof,
-        error(b,line,col) then error("skip "+line+":"+col+" :"+to_string(b)),
-        token(t) then skip
+        error(e) then error(format(e)),
+        ok(_) then skip
      },
      begin_cell = (One u) |-> (Result(String,CellPrefixToken))if lex_begin(u) is
      {
-        end_of_input then ok(eof),
-        error(b,line,col) then error("begin "+to_string(b)),
-        token(t) then ok(t)
+        error(e) then error(format(e)),
+        ok(t) then ok(t)
      },
      read_in_aux = (List(ByteArray) so_far) |-aux-> (CB_Result)if lex_in(unique) is
      {
-        end_of_input then eof,
-        error(b,line,col) then error("in "+to_string(b)),
-        token(t) then if t is
+        error(e) then error(format(e)),
+        ok(t) then if t is
         {
+            eof then eof,
             double_quote then cell(to_string(concat(reverse(so_far)))),
             two_double_quotes then aux([{0x22} . so_far]),
             part(p) then aux([p . so_far])
@@ -283,9 +297,12 @@ public define One -> ReadCsvResult
      },
      to_eol = (One u) |-> if lex_eol(u) is
      {
-        end_of_input then eof,
-        error(b,line,col) then error("eol "+to_string(b)),
-        token(t) then if t is eol_offset(offset) then eol(offset)
+        error(e) then error(format(e)),
+        ok(t) then if t is
+        {
+            eof then eof,
+            eol_offset(offset) then eol(offset)
+        }
      },
      make_read_csv_line(make_cbs(skip_cell,read_cell,to_eol,cols_to_get),
          (One u) |-> forget(to_eol(u))).
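
A side effect of the new API, visible in the last three hunks: error reporting is centralized. fast_lexer_4 call sites each assembled their own message from the raw (b,line,col) triple; with fast_lexer_5 every branch delegates to the new format helper, so each call site reduces to the same two-case pattern (a sketch, not code from this commit; lex stands for any of the retrieved lexers):

    if lex(u) is
    {
        error(e) then error(format(e)), // e : LexicalError(One), rendered by format
        ok(t) then ...                  // t now includes the explicit eof constructor
    }

The trade-off of folding end of input into each token type is that every token match must handle eof explicitly, in exchange for a uniform ok/error result at every call site.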