http_get.anubis 9.93 KB

 *Project*                             The Anubis Project
   
 *Title*                         Getting a document from the Web.
   
 *Copyright*                     Copyright (c) Alain Prouté 2001. 


 *Author*       Alain Prouté
   

 
 *Overview*
   This file  defines the function  'http_get' which retrieves  a document from  the world
   wide  web  (a  similar  function  'https_get'  for  secured  documents  is  defined  in
   'https_get.anubis').  The function  simulates the behavior of a  browser, at least just
   what is needed to retrieve the document.  It does not display the document, but returns
   it (if  found) in  the form of  a string. It  also returns  the response line  from the
   server, and the list of all HTTP headers.
   
   The function 'http_get' takes the following arguments:
   
     - the name of the server to which the request is to be sent,    
     - the name (including the path) of the document on this server, 
     - a list of headers to be added to mandatory standard headers, 
     - a list of 'arguments'  in the form of pairs of strings  '(name,value)' to be sent as
       the body of the request. 

   
   The result returned by 'http_get' has the type 'HTTP_GET_Result' defined below (after
   the 'read' lines), whose alternatives enumerate the problems which may happen:   


read tools/basis.anubis
read system/string.anubis
read web/common.anubis   
read web/http_get_common.anubis   


public type HTTP_GET_Result:
  // Outcome of an 'http_get' call: one alternative per failure cause, plus 'ok'.
  cannot_resolve_server_name(DNS_Result),          // carries the value returned by 'dns'
  cannot_connect_to_server(NetworkConnectError),   // carries the error returned by 'connect'
  transmission_problem,                            // returned when sending the request fails
  request_refused_by_server,                       // NOTE(review): no definition visible in this file constructs it
  ok(String response,                    // HTTP response line from the server
     List(HTTP_header) headers,          // HTTP headers received from the server
     String document).                   // The text received after the first empty line
   
   
public define HTTP_GET_Result
  http_get
    (                                    //-------- example: -----------------------
      // Signature only (no '= body' here); a definition with the same parameter
      // types and a body appears further down in this file.
      // The request method is "GET" when 'arguments' is empty and "POST" otherwise.
      String server_name,                //         "www.machin.com"
      String document_name,              //         "/truc/bidule.html"
      List(HTTP_header) headers,         //         [http_header("Cookie","..."),...]
      List(HTTP_argument) arguments      //         [http_argument("ga","bu"),...]
    ). 

   The same one without the 'headers' argument:

public define HTTP_GET_Result
  http_get
    (                                    //-------- example: -----------------------
      // Convenience variant: forwards to the four-argument 'http_get' with an
      // empty list in place of the extra headers.
      String server_name,                //         "www.machin.com"
      String document_name,              //         "/truc/bidule.html"
      List(HTTP_argument) arguments      //         [http_argument("ga","bu"),...]
    ) = http_get(server_name,document_name,[],arguments).
   
   
   
   This  file also defines  the command  'http_get' to  be used  directly from  the system
   prompt.  To learn about the syntax, just  type 'http_get' at the system prompt, or have
   a look at the end of this file.
   
   --- That's all for public definitions. ------------------------------------------------

   
      
   We need two functions for sending and receiving bytes. 
   
define Maybe(One)
  send
    (
      RWStream conn,          // connection the characters are written to
      String text,            // full text to transmit
      Word32 n                // index in 'text' of the next character to write
    ) =
  //
  // One character is written per call; the function then calls itself with 'n+1'.
  // 'nth' failing at index 'n' is taken as "nothing left to send" and is reported
  // as success. A failed write stops the recursion and is reported as failure.
  //
  with next_char = nth(to_Int(n),text),
  if next_char is
    { 
      failure then success(unique), 
      success(ch) then 
        if conn <- ch is 
          {
            failure then failure, 
            success(_) then send(conn,text,n+1)
          }
    }. 

define Maybe(String)
  receive_text_chunk
    (
      RWStream conn,           // connection to read from
      List(Word8) so_far,      // bytes read so far for this chunk, most recent first
      Word32 count             // number of bytes read so far for this chunk
    ) =
  //
  // Reads at most 100 bytes from 'conn' and returns them as one string.
  // Bytes are pushed in front of 'so_far', hence the 'reverse' before 'implode'.
  // Every branch visible here returns 'success(...)': a failed read ends the
  // chunk early, so a chunk may be shorter than 100 bytes, or empty.
  //
  if count = 100
  then success(implode(reverse(so_far)))
  else if *conn is         // *conn waits for data to be readable from connection 
    {
      failure then success(implode(reverse(so_far))),   // means 'connection closed by peer'
      success(c) then receive_text_chunk(conn,[c . so_far],count+1)
    }. 
   
   
define HTTP_GET_Result
  end_of_reception
    (
      String headers,         // header text isolated so far ("" if none was isolated)
      String document         // text to be returned as the document
    ) =
  //
  // Builds the final 'ok' result. The first item produced by 'separate_headers'
  // supplies the response line (its first component); the remaining items are
  // returned as the headers. With no item at all, the response line is "".
  //
  if separate_headers(headers) is
    {
      [ ] then ok("",[],document), 
      [first_item . other_items] then 
        if first_item is http_header(response_line,_) then ok(response_line,other_items,document)
    }.


   Reception loop. Chunks are appended to the text received so far. The first time
   'has_double_crlf' finds a position in that text, the part before it becomes the
   header text and the part starting 4 characters after it becomes the document.
   From then on ('double_crlf_seen' is true) chunks are appended to the document
   only. An empty chunk, or a failure of 'receive_text_chunk', ends the reception.

define HTTP_GET_Result
  receive
    (
      RWStream conn,
      String headers, 
      String text_so_far,
      Bool double_crlf_seen
    ) =
  if receive_text_chunk(conn,[],0) is 
    {
      failure then end_of_reception(headers,text_so_far),
   
      success(chunk) then 
        if chunk = ""
        then end_of_reception(headers,text_so_far)
        else with received = text_so_far+chunk, 
             if double_crlf_seen
             then receive(conn,headers,received,true)
             else if has_double_crlf(received) is 
               {
                 // no separator yet: keep accumulating
                 failure then receive(conn,headers,received,false),

                 // separator found at 'split_at': split once, then only append
                 success(split_at) then 
                   if sub_string(received,split_at+4,length(received)-split_at-4) is 
                     {
                       failure then alert, 
                       success(document_part) then
                         if sub_string(received,0,split_at) is 
                           {
                             failure then alert, 
                             success(header_part) then receive(conn,header_part,document_part,true)
                           }
                     }
               }
    }. 


   
   The next function  has a valid TCP/IP  connection to the server, and  tries to retrieve
   the document.
   

define HTTP_GET_Result
  http_get
    (
      Bool print_all, 
      RWStream conn, 
      String server_name, 
      String document_name,
      List(HTTP_header) headers, 
      List(HTTP_argument) arguments,
    ) = 
  //
  // Assemble the request from named parts, optionally print it, send it over
  // 'conn' and hand over to 'receive' for the answer.
  //
  // The method is "GET" when 'arguments' is empty and "POST" otherwise; the
  // two 'Content-...' headers are emitted only in the "POST" case.
  //
  with body          = format_http_args(arguments), 
  with verb          = (if arguments = [] then "GET " else "POST "),
  with request_line  = verb + document_name + " HTTP/1.0" + crlf,
  with fixed_headers = "Host: " + server_name + crlf +
                       "Accept-Charset: iso-8859-1,*,utf-8" + crlf,
  with body_headers  = (if arguments = [] then "" 
                          else "Content-type: application/x-www-form-urlencoded" + crlf +
                               "Content-length: " + to_decimal(length(body))+ crlf),
  with request       = request_line + 
                       fixed_headers + 
                       body_headers +
                       format_headers(headers) +
                       crlf +
                       body,
  (if print_all then 
   (
    print("----- request ----\n");
    print(request); 
    print("\n")
   ) else unique); 
  if send(conn,request,0) is
    {
      failure     then   transmission_problem,        // the request could not be written
      success(_)  then   receive(conn,"","",false)    // nothing received yet
    }. 
   
   
   The next function retrieves the document using the numerical (resolved) server address. 
   
define HTTP_GET_Result
  http_get
    (
      Bool print_all,
      Word32 server_addr,
      Word32 server_port, 
      String server_name, 
      String document_name,
      List(HTTP_header) headers, 
      List(HTTP_argument) arguments,
    ) =
  //
  // Open the connection first; the request is only built and sent (by the
  // variant of 'http_get' taking a stream) once 'connect' has succeeded.
  // 
  with attempt = (Result(NetworkConnectError,RWStream))connect(server_addr,server_port),
  if attempt is
    {
      error(reason)  then  cannot_connect_to_server(reason), 
      ok(stream)     then  http_get(print_all,stream,server_name,document_name,headers,arguments)
    }. 
   
   
define HTTP_GET_Result
  http_get
    (
      Bool print_all,                    // passed on unchanged
      String server_name,                // server name as given by the caller
      String document_name,
      List(HTTP_header) headers, 
      List(HTTP_argument) arguments,
    ) = 
  //
  // Split 'server_name' into a name and a port (80 is the default handed to
  // 'separate_name_port'), resolve the name, and call 'http_get' with the
  // numeric server address.
  //
  // The bare 'name' is used for the DNS lookup only. The original 'server_name'
  // is forwarded for the "Host:" header, so that a port given by the caller is
  // not dropped from that header.
  // NOTE(review): assumes 'separate_name_port' returns 'server_name' itself as
  // the name when no port is present -- confirm in web/http_get_common.anubis.
  //
  if separate_name_port(server_name,80) is (name,port) then 
  with a = dns(name), 
    if a is     ok(addr) 
    then        http_get(print_all,addr,port,server_name,document_name,headers,arguments)
    else        cannot_resolve_server_name(a). 
      
   
   Now, here is our public tool:
   
define HTTP_GET_Result
  http_get
    (
      // Body of the public four-argument 'http_get': forwards to the variant
      // taking a leading 'Bool print_all', with 'false' (request not printed).
      String server_name,
      String document_name,
      List(HTTP_header) headers, 
      List(HTTP_argument) arguments,
    ) = http_get(false,server_name,document_name,headers,arguments). 
   
   
   
   Finally, we construct the executable module 'http_get':
   
define One
  recall_syntax =
    //
    // Prints the usage reminder of the 'http_get' command as a single text.
    //
    print("\nUsage: http_get <server> <document> [options] =<header> <value> ... -<arg> <value> ...\n" +
          "   Options are:\n" +
          "     -print_all       print request, response line, headers and document\n" +
          "                      (default is to print only the document)\n").
   
  
  
   
global define One
  http_get
    (
      List(String) args
    ) =
  //
  // Command entry point. The first two words are the server and the document;
  // the remaining words are searched for "-print_all" and handed to
  // 'get_headers' and 'get_arguments'. With fewer than two words the usage
  // reminder is printed instead.
  //
  if args is 
    {
      [ ]                       then recall_syntax, 
      [server . after_server]   then if after_server is 
        {
          [ ]                    then recall_syntax, 
          [document . options]   then 
            with verbose      = member(options,"-print_all"), 
            with sent_headers = get_headers(options), 
            with sent_args    = get_arguments(options), 
            if http_get(verbose,server,document,sent_headers,sent_args) is
              {
                cannot_resolve_server_name(why) then 
                  print("Cannot resolve server name: " + format(why) + ".\n"), 
   
                cannot_connect_to_server(why) then 
                  print("Cannot connect to server: " + format(why) + ".\n"), 
   
                transmission_problem then 
                  print("Transmission problem.\n"), 
   
                request_refused_by_server then 
                  print("The request has been refused by server: " + server + ".\n"),
    
                ok(status_line,received_headers,page) then 
                  (
                  // response line and headers are shown only on request
                  if verbose
                  then (
                          print("\n----- response ----\n"); 
                          print(status_line); 
                          print("\n----- headers -----\n"); 
                          print_headers(received_headers); 
                          print("----- document ----\n")
                        ) else unique
                  ); 
                  print(page)     // to the screen; redirect the output to store it in a file
              }
        }
    }.