import_csv.anubis 9.05 KB

Edit Raw Blame History



 *Project*                                  Anubis

 *Title*                              Reading CSV tables.

 *Copyright*                  Copyright (c) Alain Prouté 2003-2006.

 *Released*

 *Created*
 *Author* Alain Prouté

 *Overview*

   This file  defines a tool for  reading tables. A  'table' is a file  containing 'lines'
   separated by a  special character (or a combination of  special characters), most often
   '\n' (ASCII 10)  or '\r' (ASCII 13).   Each line is separated into  'cells'.  A special
   character is used  for separating the cells  within a line, most often  '\t' (ASCII 9),
   ';' or '&'. Of course, this implies  that these special characters are not used for the
   content of the cells themselves.

   Below is  an example of a  table whose line separator  is '\n' (hence  invisible in the
   example), and cell separator is '&':

   cell_1_1&cell_1_2&cell_1_3
   cell_2_1&cell_2_2

   This  table has two  lines. Notice  that the  lines need  not have  the same  number of
   cells. Also, no line separator is following the string "cell_2_2" (the last byte in the
   file is '2'), otherwise the table would have 3 lines (with the last line empty).

   The content  of a cell is  considered as a  character string. The contents  of selected
   cells  must eventually  be  transformed into  integers  or floating  point numbers  (or
   anything else), but this transformation is not handled here.  The function defined here
   returns (maybe) a datum of type:

             List(List(String))

   which is to  be interpreted as a list of  lines, each line being a  list of cells, each
   cell being a character string.

   The function requires that you declare  the possible separators. Each line separator is
   declared as  a list of 'Word8'. For  example, if the lines  of the table to  be read are
   separated  by carriage  return  '\r'  and line  feed  '\n' (in  this  order), the  line
   separator is represented by '[13,10]', or by '['\r','\n']'.

   On the contrary, we assume that only  one character is used as cell separator. Hence, a
   cell separator is represented by a single Word8. Below are realistic examples:

        ';'    '&'    '\t'    9

   The  function below  can handle  several line  separators and  several  cell separators
   simultaneously,  so  that  the same  call  to  the  function  can handle  tables  whose
   separators are different, or even handle  tables with mixed separators (not usual). The
   function can also ignore some characters (typically the space (ASCII 32)).


 *Public*

public define Maybe(List(List(String)))         // 'failure' when the file cannot be opened
   read_table
     (
       String                                 filename,          // name of the file to open for reading
       NonEmptyList(NonEmptyList(Word8))       line_separators,   // each line separator is a sequence of bytes, like [13,10]
       NonEmptyList(Word8)                     cell_separators    // each cell separator is a single byte, like ';'
     ).

   For example, the next call can read several sorts of tables:

      read_table(filename,
                 [[10],[13],[13,10]],
                 [';','\t'])

   It reads  tables whose lines  are separated  either by '\n',  '\r' or '\r'  followed by
   '\n', and whose cells are separated either by ';' or by a tabulator.

   The result is  'failure' when the file  cannot be opened. Otherwise, a  table is always
   read (which may be empty, or wrong if you do not choose the right separators).


   --- That's all for the public part. ---------------------------------------------------

 *Private*


   Transforming a non empty list into a list.

define List($T)
   to_list
     (
       NonEmptyList($T) l
     ) =
   // A non empty list has only one shape (a head followed by a tail), so a single
   // alternative is enough; the same head and tail are rebuilt as a 'List($T)'
   if l is
     {
       [h . t] then [h . t]
     }.


   Checking if a list of Word8 'candidate' is a prefix of a line separator.


define Bool
   begins_line_separator
     (
       List(Word8) candidate,
       List(Word8) line_sep
     ) =
   // Walks both lists in parallel: the answer is 'true' as soon as 'candidate' is
   // exhausted, and 'false' if 'line_sep' runs out first or if two bytes differ
   if candidate is
     {
       [ ] then true,                        // the empty list is a prefix of any list
       [c . c_rest] then
         if line_sep is
           {
             [ ] then false,                 // 'candidate' is longer than 'line_sep'
             [s . s_rest] then
               if c = s
               then begins_line_separator(c_rest,s_rest)
               else false
           }
     }.


   Here is the test against several line separators: 'candidate' must be a prefix of at
   least one of them.

define Bool
   begins_line_separator
     (
       List(Word8) candidate,
       List(List(Word8)) line_seps
     ) =
   // 'true' as soon as one separator of 'line_seps' begins with 'candidate',
   // 'false' when the list is exhausted without finding one
   if line_seps is
     {
       [ ] then false,
       [sep . remaining] then
         if begins_line_separator(candidate,sep)
         then true
         else begins_line_separator(candidate,remaining)
     }.


   We have two mutually recursive functions 'read_table' and 'read_more_lines'.

define List(List(String))
   read_table
     (
       RStream file,                       // already opened file, read one byte at a time
       List(List(Word8)) line_seps,        // line separators
       List(Word8) line_sep_firsts,        // first characters in line separators
       List(Word8) cell_seps,              // cell separators
       List(List(String)) so_far,          // lines read so far
       List(String) current_line,          // cells read so far in the current line
       List(Word8) current_cell            // characters read so far in the current cell
     ).


   Reading more lines from a table. We  have already read several lines (at least one) and
   we have read a character 'c' (or several) which is (are) the first in a line separator.
   We read a  new character 'd'. If a  line separator begins by 'c' 'd'  we continue until
   the characters  read do not  form a line  separator.  Of course  we must handle  end of
   file.  If  end of file  is read, the  last line of the  table is empty.   Otherwise, we
   return to 'read_table', with the correct 'current_line' and 'current_cell'.

define List(List(String))
   read_more_lines
     (
       RStream file,
       List(List(Word8)) line_seps,              // line separators, each one stored reversed by the caller
       List(Word8) line_sep_firsts,              // first character of each line separator
       List(Word8) cell_seps,                    // cell separators
       List(List(String)) so_far,               // lines read so far
       List(Word8) firsts_in_sep                 // first characters of line separator (just read), most recent first
     ) =
   if *file is
     {
       failure then reverse([[] . so_far]),     // last line empty
       success(d) then with firsts = (List(Word8))[d . firsts_in_sep],
         // 'firsts' is in reverse reading order and the separators in 'line_seps' are stored
         // reversed: both are put back in reading order so that the test below is a real
         // prefix test (tested reversed, a separator of 3 characters or more is never
         // recognized because its first two characters are not a prefix of its reversal)
         if begins_line_separator((List(Word8))reverse(firsts),
                                  (List(List(Word8)))map(reverse,line_seps))
         then read_more_lines(file,line_seps,line_sep_firsts,cell_seps,so_far,firsts)
         else
           // 'd' does not continue the separator: it is the first character of the next
           // line and is classified exactly as 'read_table' would classify it with an
           // empty current line and an empty current cell (it was previously stored
           // blindly in the cell, so that "a\n\nb" lost its empty line)
           if member(line_sep_firsts,d)
           then read_more_lines(file,line_seps,line_sep_firsts,cell_seps,
                                [[""] . so_far],      // the line just ended has one empty cell
                                [d])
           else if member(cell_seps,d)
                then read_table(file,line_seps,line_sep_firsts,cell_seps,so_far,
                                [""],[])               // the first cell of the new line is empty
                else read_table(file,line_seps,line_sep_firsts,cell_seps,so_far,[],[d])
     }.


   Reading a table from an already opened file.

define List(List(String))
   read_table
     (
       RStream file,
       List(List(Word8)) line_seps,
       List(Word8) line_sep_firsts,       // first characters in line separators
       List(Word8) cell_seps,
       List(List(String)) so_far,        // lines read so far
       List(String) current_line,        // cells read so far in the current line
       List(Word8) current_cell           // characters read so far in the current cell
     ) =
   // 'current_cell', 'current_line' and 'so_far' are accumulated most recent first,
   // hence the call to 'reverse' each time one of them is completed
   if *file is
     {
       // end of file: the pending cell closes the pending line, which closes the table
       failure then
         reverse([reverse([implode(reverse(current_cell)) . current_line]) . so_far]),

       success(ch) then
         if member(line_sep_firsts,ch)
         then
           // 'ch' begins a line separator: the current line is complete
           read_more_lines(file,
                           line_seps,
                           line_sep_firsts,
                           cell_seps,
                           [reverse([implode(reverse(current_cell)) . current_line]) . so_far],
                           [ch])
         else
           if member(cell_seps,ch)
           then
             // 'ch' is a cell separator: the current cell is complete, a new one begins
             read_table(file,
                        line_seps,
                        line_sep_firsts,
                        cell_seps,
                        so_far,
                        [implode(reverse(current_cell)) . current_line],
                        [])
           else
             // ordinary character: it is added to the current cell
             read_table(file,
                        line_seps,
                        line_sep_firsts,
                        cell_seps,
                        so_far,
                        current_line,
                        [ch . current_cell])
     }.


   Now, here is our tool.

public define Maybe(List(List(String)))
   read_table
     (
       String                                 filename,
       NonEmptyList(NonEmptyList(Word8))       line_separators,
       NonEmptyList(Word8)                     cell_separators
     ) =
   // Opens 'filename' for reading, and gives 'failure' if this is not possible
   // Otherwise the whole table is read, starting with no line, no cell and no character
   if (Maybe(RStream))file(filename,read) is
     {
       failure then failure,
       success(file) then
         with seps = to_list(line_separators),
         // each separator is stored reversed for the private functions
         with reversed_seps = (List(List(Word8)))map(reverse,map(to_list,seps)),
         // first byte of each separator (always exists since separators are non empty)
         with first_bytes = (List(Word8))map((NonEmptyList(Word8) sep) |-> if sep is [h . t] then h, seps),
           success(read_table(file,
                              reversed_seps,
                              first_bytes,
                              to_list(cell_separators),
                              [],[],[]))
     }.


   Try it !


define One
   table_print
     (
       List(String) l
     ) =
   // Prints the cells of one line with " | " between two consecutive cells,
   // and a newline after the last one (or alone if the line has no cell)
   if l is
     {
       [ ] then print("\n"),
       [cell . rest] then
         print(cell);
         (if rest is
            {
              [ ] then unique,                     // last cell: no separator after it
              [next . more] then print(" | ")
            });
         table_print(rest)
     }.

define One
   table_print
     (
       List(List(String)) t
     ) =
   // Prints the lines of a table one after the other, in order
   if t is
     {
       [ ] then unique,
       [row . rest] then
         table_print(row);
         table_print(rest)
     }.

define One
   table_print
     (
       Maybe(List(List(String))) t
     ) =
   // Prints the table if there is one, and an error message otherwise
   if t is
     {
       failure then print("File not found.\n"),
       success(table) then table_print(table)
     }.

global define One
   read_csv_table
     (
       List(String) args
     ) =
   // Command line entry point: reads the table named by the first argument and prints it
   // The other arguments, if any, are not used
   if args is
     {
       [ ] then print("Usage: read_csv_table <filename>\n"),
       [name . ignored] then
         table_print(read_table(name,
                                [[10],[13],[13,10]],      // '\n', '\r' or '\r' followed by '\n'
                                [';','\t','&']))
     }.