read_html.anubis
42.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
*Project* The Anubis Project
*Title* Extracting information from an HTML page.
*Copyright* Copyright (c) Alain Prouté 2002.
*Author* Alain Prouté
***** THIS FILE MUST BE UPDATED BECAUSE IT USES 'PER PROCESS GLOBAL VARIABLES' ****
read http_get.anubis
read https_get.anubis
read http_get_common.anubis
read tools/basis.anubis
read system/string.anubis
*Overview*
This file contains a tool which extracts information from an HTML page. The
page may be given as a string, as the content of a file, or as a URL on the Internet.
There are 109 HTML tags, including all those of HTML 4.0, those which are specific
either to Netscape or to Internet Explorer, and some deprecated tags. We add a tag
'unknown', to handle cases in which a tag cannot be recognized.
// Names of the HTML tags handled by the parser. A tag name which is not one of the
// listed alternatives is represented by 'unknown', which carries that name (in lower
// case, as produced by 'recognize_tag').
public type HTML_tag_name:
unknown(String name),
//unknown,
a, abbr, acronym, address, applet, area,
b, base, basefont, bdo, bgsound, big, blink, blockquote, body, br, button,
caption, center, cite, code, col, colgroup, comment,
dd, del, dfn, dir, div, dl, dt,
em, embed,
fieldset, font, form, frame, frameset,
h1, h2, h3, h4, h5, h6, head, hr, html,
i, iframe, ilayer, img, input, ins, isindex,
kbd, keygen,
label, layer, legend, li, link, listing,
map, marquee, menu, meta, multicol,
nextid, nobr, noembed, noframes, noscript,
object, ol, optgroup, option,
p, param, plaintext, pre,
q,
s, samp, script, select, server, small, spacer, span, strike, strong, style, sub, sup,
table, tbody, td, textarea, tfoot, th, thead, title, tr, tt,
u, ul,
var,
wbr,
xmp.
An HTML text is made of 'HTML items', which are either ordinary (uninterpreted) text or
tags. Tags have attributes, and may have a content. Attributes in tags are converted
into data of type 'HTML_attribute'.
// One attribute of a tag: its name and its value. 'read_attribute' gives the empty
// string as value to an attribute which is not followed by '=' and a value.
public type HTML_attribute:
attr(String name, String value).
// A node of the structured document produced by the 'read_html_...' functions.
public type HTML_item:
text (String), // a chunk of ordinary text
no_content_tag (HTML_tag_name name, // a tag for which 'delimitors' gives the empty list
List(HTML_attribute) attributes),
content_tag (HTML_tag_name name, // a tag together with the items read as its content
List(HTML_attribute) attributes,
List(HTML_item) content).
For example, the following HTML text:
<table align=center>
<tr> <td> aaaaaa </td> <td> bbbbbb </td></tr>
<tr> <td> cccccc </td> <td> dddddd </td></tr>
</table>
will produce the following datum:
   content_tag(table,
               [attr("align","center")],
               [
                 content_tag(tr,[],
                   [
                     content_tag(td,[],[text("aaaaaa ")]),
                     content_tag(td,[],[text("bbbbbb ")])
                   ]),
                 content_tag(tr,[],
                   [
                     content_tag(td,[],[text("cccccc ")]),
                     content_tag(td,[],[text("dddddd ")])
                   ])
               ])
(Leading blanks of a text chunk are dropped, but trailing blanks are kept.)
Finally, here is our HTML reading tool. The input may be read either from a string, a
file, or an URL on the Internet.
// Public entry points (declarations only; the definitions are given further down).
// Each one returns the list of top level items of the document.

// Parses the HTML text contained in 'html_document'.
public define List(HTML_item) read_html_from_string (String html_document).
// Parses the content of the file 'file_name'; the result is empty if the file
// cannot be opened.
public define List(HTML_item) read_html_from_file (String file_name).
// Parses the document obtained by 'http_get'; the result is empty if the
// request does not answer 'ok'.
public define List(HTML_item) xread_html_from_http_url (String server_name,
String document_name,
List(HTTP_argument) operands).
// Same as above through 'https_get'; 'accept_policy' is handed over to
// 'https_get' unchanged.
public define List(HTML_item) read_html_from_https_url (String server_name,
String document_name,
List(HTTP_argument) operands,
(Maybe(X509)) -> Bool accept_policy).
The following variants should disappear:
define List(HTTP_argument)
   map_to_arguments
     (
       List((String,String)) l
     ) =
   // Converts each (name,value) pair into an 'http_argument', keeping the order.
   if l is
     {
       [ ]                then [ ],
       [pair . remaining] then
         with converted_tail = map_to_arguments(remaining),
           if pair is (key,value) then
             [http_argument(key,value) . converted_tail]
     }.
public define List(HTML_item)
   read_html_from_http_url
     (
       String                 server_name,
       String                 document_name,
       List((String,String))  operands
     ) =
   // Variant taking the operands as plain pairs: they are converted, then
   // everything is delegated to 'xread_html_from_http_url'.
   with arguments = map_to_arguments(operands),
     xread_html_from_http_url(server_name,document_name,arguments).
public define List(HTML_item)
   read_html_from_https_url
     (
       String                 server_name,
       String                 document_name,
       List((String,String))  operands,
       (Maybe(X509)) -> Bool  accept_policy
     ) =
   // Variant taking the operands as plain pairs: they are converted, then
   // everything is delegated to the 'List(HTTP_argument)' version.
   with arguments = map_to_arguments(operands),
     read_html_from_https_url(server_name,document_name,arguments,accept_policy).
If you want to know which tags are considered as having a content, have a look at the
definition of the function 'delimitors' below. A tag 't' has no content if and only if
'delimitors(t)' is the empty list.
*** Tools.
Once you have structured your HTML input by 'read_html_?' you may want to extract the
list of all tags with a given name (for example all tables or all table rows). Use the
following.
// Declaration only (defined further down): returns every tag item of 'l' whose
// name is 'tag', searching also inside the content of tags.
public define List(HTML_item)
get_all_tags
(
List(HTML_item) l,
HTML_tag_name tag
).
For example, 'get_all_tags(document,tr)' will produce the list of all <tr> tags in
'document' (not only at the top level, but even those which may exist within tables
which are themselves within other table rows). You may of course chain several uses of
this function.
You may also want to print an html item or a list of html items. In fact, this is just
the converse operation of 'read_html_?'.
// Declaration only (defined further down): prints one item.
public define One
print
(
HTML_item item,
Word32 margin // with a left margin of this number of characters
).
// Declaration only (defined further down): prints each item of the list in turn.
public define One
print
(
List(HTML_item) items,
Word32 margin // with a left margin of this number of characters
).
--- That's all for the public part. ---------------------------------------------------
An HTML tag may have a corresponding 'end tag'. Hence the following definition.
// A possible right delimitor of a tag content, as listed by 'delimitors': either the
// opening of a tag or an end tag.
type HTML_tag:
tag(HTML_tag_name), // normal tag
end_tag(HTML_tag_name). // end tag
*** Classification of HTML tags.
The main problem with tags is that the content of the tag is right delimited in a
variety of ways. For example, the content of the <table> tag is right delimited by a
</table> tag. This is the simplest case. Unfortunately, we also have the following
situations:
- the content of <option> is right delimited either by <option> or by </select>,
- the content of <li> is right delimited either by </li>, <li>, </ol> or </ul>.
In other words, since we need to know how the content of a tag is right delimited, we
need a function, giving the possible delimitors for each tag.
If this list is empty, this means that the tag does not need to be delimited. In other
words, the tag has no content. Some end tags may also be omitted. Nevertheless, they
are in our lists. This function is a sensitive part of this program. It may eventually
be updated to reflect more closely the way contents are right delimited.
Because of this problem, we can see that despite its apparent simplicity, HTML is
difficult to read. This illustrates the very poor design of this language.
define List(HTML_tag)
   delimitors
     (
       HTML_tag_name name
     ) =
   // Gives, for each tag name, the list of (begin or end) tags which right delimit
   // the content of that tag. The empty list means that the tag has no content.
   // By convention the own end tag, when there is one, comes first.
   //
   // Changes with respect to the previous table:
   //   - 'noembed' is a container, so it is now delimited by </noembed>,
   //   - 'dt' is also closed by the next <dt> and by </dl>,
   //   - 'dd' is also closed by the next <dd>,
   //   - 'tbody' and 'tfoot' are also closed by the next <tbody>,
   //   - 'thead' is also closed by </table>.
   // Lines marked 'OD' are tolerances for badly nested documents.
   if name is
     {
       unknown(_)  then (List(HTML_tag)) [ ],
       a           then (List(HTML_tag)) [end_tag(a)],
       abbr        then (List(HTML_tag)) [end_tag(abbr)],
       acronym     then (List(HTML_tag)) [end_tag(acronym)],
       address     then (List(HTML_tag)) [end_tag(address)],
       applet      then (List(HTML_tag)) [end_tag(applet)],
       area        then (List(HTML_tag)) [ ],
       b           then (List(HTML_tag)) [end_tag(b),
                                          end_tag(font)  // OD
                                         ],
       base        then (List(HTML_tag)) [ ],
       basefont    then (List(HTML_tag)) [ ], // </basefont> exists but the tag has no content !
       bdo         then (List(HTML_tag)) [end_tag(bdo)],
       bgsound     then (List(HTML_tag)) [ ],
       big         then (List(HTML_tag)) [end_tag(big)],
       blink       then (List(HTML_tag)) [end_tag(blink)],
       blockquote  then (List(HTML_tag)) [end_tag(blockquote)],
       body        then (List(HTML_tag)) [end_tag(body),end_tag(html)],
       br          then (List(HTML_tag)) [ ],
       button      then (List(HTML_tag)) [end_tag(button)],
       caption     then (List(HTML_tag)) [end_tag(caption)],
       center      then (List(HTML_tag)) [end_tag(center)],
       cite        then (List(HTML_tag)) [end_tag(cite)],
       code        then (List(HTML_tag)) [end_tag(code)],
       col         then (List(HTML_tag)) [ ],
       colgroup    then (List(HTML_tag)) [end_tag(colgroup),tag(colgroup),
                                          tag(thead),tag(tbody),tag(tfoot),tag(tr)],
       comment     then (List(HTML_tag)) [end_tag(comment)],
       dd          then (List(HTML_tag)) [end_tag(dd),tag(dd),tag(dt),end_tag(dl)],
       del         then (List(HTML_tag)) [end_tag(del)],
       dfn         then (List(HTML_tag)) [end_tag(dfn)],
       dir         then (List(HTML_tag)) [end_tag(dir)],
       div         then (List(HTML_tag)) [end_tag(div),tag(div),end_tag(body),end_tag(html)],
       dl          then (List(HTML_tag)) [end_tag(dl)],
       dt          then (List(HTML_tag)) [end_tag(dt),tag(dt),tag(dd),end_tag(dl)],
       em          then (List(HTML_tag)) [end_tag(em)],
       embed       then (List(HTML_tag)) [ ],
       fieldset    then (List(HTML_tag)) [end_tag(fieldset)],
       font        then (List(HTML_tag)) [end_tag(font),
                                          end_tag(tr),   // OD
                                          tag(b)         // OD
                                         ],
       form        then (List(HTML_tag)) [end_tag(form)],
       frame       then (List(HTML_tag)) [ ],
       frameset    then (List(HTML_tag)) [end_tag(frameset)],
       h1          then (List(HTML_tag)) [end_tag(h1)],
       h2          then (List(HTML_tag)) [end_tag(h2)],
       h3          then (List(HTML_tag)) [end_tag(h3)],
       h4          then (List(HTML_tag)) [end_tag(h4)],
       h5          then (List(HTML_tag)) [end_tag(h5)],
       h6          then (List(HTML_tag)) [end_tag(h6)],
       head        then (List(HTML_tag)) [end_tag(head),tag(body)],
       hr          then (List(HTML_tag)) [ ],
       html        then (List(HTML_tag)) [end_tag(html)],
       i           then (List(HTML_tag)) [end_tag(i)],
       iframe      then (List(HTML_tag)) [end_tag(iframe)],
       ilayer      then (List(HTML_tag)) [end_tag(ilayer)],
       img         then (List(HTML_tag)) [ ],
       input       then (List(HTML_tag)) [ ],
       ins         then (List(HTML_tag)) [end_tag(ins)],
       isindex     then (List(HTML_tag)) [ ],
       kbd         then (List(HTML_tag)) [end_tag(kbd)],
       keygen      then (List(HTML_tag)) [ ],
       label       then (List(HTML_tag)) [end_tag(label)],
       layer       then (List(HTML_tag)) [end_tag(layer)],
       legend      then (List(HTML_tag)) [end_tag(legend),tag(label)],
       li          then (List(HTML_tag)) [end_tag(li),tag(li),end_tag(ol),end_tag(ul),
                                          end_tag(dir),end_tag(menu)],
       link        then (List(HTML_tag)) [ ],
       listing     then (List(HTML_tag)) [end_tag(listing)],
       map         then (List(HTML_tag)) [end_tag(map)],
       marquee     then (List(HTML_tag)) [end_tag(marquee)],
       menu        then (List(HTML_tag)) [end_tag(menu)],
       meta        then (List(HTML_tag)) [ ],
       multicol    then (List(HTML_tag)) [end_tag(multicol)],
       nextid      then (List(HTML_tag)) [ ],
       nobr        then (List(HTML_tag)) [end_tag(nobr)],
       noembed     then (List(HTML_tag)) [end_tag(noembed)],
       noframes    then (List(HTML_tag)) [end_tag(noframes)],
       noscript    then (List(HTML_tag)) [end_tag(noscript)],
       object      then (List(HTML_tag)) [end_tag(object)],
       ol          then (List(HTML_tag)) [end_tag(ol)],
       optgroup    then (List(HTML_tag)) [end_tag(optgroup),tag(optgroup),end_tag(select)],
       option      then (List(HTML_tag)) [end_tag(option),tag(option),end_tag(select)],
       p           then (List(HTML_tag)) [end_tag(p),tag(p),end_tag(body),end_tag(html),
                                          end_tag(font), // OD
                                          tag(font)      // OD
                                         ],
       param       then (List(HTML_tag)) [ ],
       plaintext   then (List(HTML_tag)) [end_tag(body),end_tag(html)],
       pre         then (List(HTML_tag)) [end_tag(pre)],
       q           then (List(HTML_tag)) [end_tag(q)],
       s           then (List(HTML_tag)) [end_tag(s)],
       samp        then (List(HTML_tag)) [end_tag(samp)],
       script      then (List(HTML_tag)) [end_tag(script)],
       select      then (List(HTML_tag)) [end_tag(select)],
       server      then (List(HTML_tag)) [end_tag(server)],
       small       then (List(HTML_tag)) [end_tag(small)],
       spacer      then (List(HTML_tag)) [ ],
       span        then (List(HTML_tag)) [end_tag(span)],
       strike      then (List(HTML_tag)) [end_tag(strike)],
       strong      then (List(HTML_tag)) [end_tag(strong)],
       style       then (List(HTML_tag)) [end_tag(style)],
       sub         then (List(HTML_tag)) [end_tag(sub)],
       sup         then (List(HTML_tag)) [end_tag(sup)],
       table       then (List(HTML_tag)) [end_tag(table)],
       tbody       then (List(HTML_tag)) [end_tag(tbody),tag(tbody),tag(tfoot),end_tag(table)],
       td          then (List(HTML_tag)) [end_tag(td),tag(td),tag(th),tag(tr),
                                          end_tag(tr),end_tag(table)],
       textarea    then (List(HTML_tag)) [end_tag(textarea)],
       tfoot       then (List(HTML_tag)) [end_tag(tfoot),tag(tbody),end_tag(table)],
       th          then (List(HTML_tag)) [end_tag(th),tag(th),tag(td),tag(tr),
                                          end_tag(tr),end_tag(table)],
       thead       then (List(HTML_tag)) [end_tag(thead),tag(tbody),tag(tfoot),end_tag(table)],
       title       then (List(HTML_tag)) [end_tag(title)],
       tr          then (List(HTML_tag)) [end_tag(tr),tag(tr),end_tag(table)],
       tt          then (List(HTML_tag)) [end_tag(tt)],
       u           then (List(HTML_tag)) [end_tag(u)],
       ul          then (List(HTML_tag)) [end_tag(ul)],
       var         then (List(HTML_tag)) [end_tag(var)],
       wbr         then (List(HTML_tag)) [ ],
       xmp         then (List(HTML_tag)) [end_tag(xmp)]
     }.
We need a function which recognizes a tag by its name. The name is first converted to
lower case, so that the recognition does not depend on the case used in the document.
// Converts a tag name, as found in the document, into an 'HTML_tag_name'.
// The name is lower-cased, then compared with the known names; the comparisons are
// grouped by first character so that only the names beginning with that character
// are tried. An empty name, or a name which matches nothing, gives 'unknown'
// carrying the lower-cased name.
define HTML_tag_name
recognize_tag
(
String name1
) =
with name = to_lower(name1),
if nth((Int)0,name) is
{
failure then //unknown,
unknown(name),
success(c) then
if c = 'a' then
(
if name = "a" then a else
if name = "abbr" then abbr else
if name = "acronym" then acronym else
if name = "address" then address else
if name = "applet" then applet else
if name = "area" then area else
//unknown
unknown(name)
) else
if c = 'b' then
(
if name = "b" then b else
if name = "base" then base else
if name = "basefont" then basefont else
if name = "bdo" then bdo else
if name = "bgsound" then bgsound else
if name = "big" then big else
if name = "blink" then blink else
if name = "blockquote" then blockquote else
if name = "body" then body else
if name = "br" then br else
if name = "button" then button else
//unknown
unknown(name)
) else
if c = 'c' then
(
if name = "caption" then caption else
if name = "center" then center else
if name = "cite" then cite else
if name = "code" then code else
if name = "col" then col else
if name = "colgroup" then colgroup else
if name = "comment" then comment else
//unknown
unknown(name)
) else
if c = 'd' then
(
if name = "dd" then dd else
if name = "del" then del else
if name = "dfn" then dfn else
if name = "dir" then dir else
if name = "div" then div else
if name = "dl" then dl else
if name = "dt" then dt else
//unknown
unknown(name)
) else
if c = 'e' then
(
if name = "em" then em else
if name = "embed" then embed else
//unknown
unknown(name)
) else
if c = 'f' then
(
if name = "fieldset" then fieldset else
if name = "font" then font else
if name = "form" then form else
if name = "frame" then frame else
if name = "frameset" then frameset else
//unknown
unknown(name)
) else
if c = 'h' then
(
if name = "h1" then h1 else
if name = "h2" then h2 else
if name = "h3" then h3 else
if name = "h4" then h4 else
if name = "h5" then h5 else
if name = "h6" then h6 else
if name = "head" then head else
if name = "hr" then hr else
if name = "html" then html else
//unknown
unknown(name)
) else
if c = 'i' then
(
if name = "i" then i else
if name = "iframe" then iframe else
if name = "ilayer" then ilayer else
if name = "img" then img else
if name = "input" then input else
if name = "ins" then ins else
if name = "isindex" then isindex else
//unknown
unknown(name)
) else
if c = 'k' then
(
if name = "kbd" then kbd else
if name = "keygen" then keygen else
//unknown
unknown(name)
) else
if c = 'l' then
(
if name = "label" then label else
if name = "layer" then layer else
if name = "legend" then legend else
if name = "li" then li else
if name = "link" then link else
if name = "listing" then listing else
//unknown
unknown(name)
) else
if c = 'm' then
(
if name = "map" then map else
if name = "marquee" then marquee else
if name = "menu" then menu else
if name = "meta" then meta else
if name = "multicol" then multicol else
//unknown
unknown(name)
) else
if c = 'n' then
(
if name = "nextid" then nextid else
if name = "nobr" then nobr else
if name = "noembed" then noembed else
if name = "noframes" then noframes else
if name = "noscript" then noscript else
//unknown
unknown(name)
) else
if c = 'o' then
(
if name = "object" then object else
if name = "ol" then ol else
if name = "optgroup" then optgroup else
if name = "option" then option else
//unknown
unknown(name)
) else
if c = 'p' then
(
if name = "p" then p else
if name = "param" then param else
if name = "plaintext" then plaintext else
if name = "pre" then pre else
//unknown
unknown(name)
) else
if c = 'q' then
(
if name = "q" then q else
//unknown
unknown(name)
) else
if c = 's' then
(
if name = "s" then s else
if name = "samp" then samp else
if name = "script" then script else
if name = "select" then select else
if name = "server" then server else
if name = "small" then small else
if name = "spacer" then spacer else
if name = "span" then span else
if name = "strike" then strike else
if name = "strong" then strong else
if name = "style" then style else
if name = "sub" then sub else
if name = "sup" then sup else
//unknown
unknown(name)
) else
if c = 't' then
(
if name = "table" then table else
if name = "tbody" then tbody else
if name = "td" then td else
if name = "textarea" then textarea else
if name = "tfoot" then tfoot else
if name = "th" then th else
if name = "thead" then thead else
if name = "title" then title else
if name = "tr" then tr else
if name = "tt" then tt else
//unknown
unknown(name)
) else
if c = 'u' then
(
if name = "u" then u else
if name = "ul" then ul else
//unknown
unknown(name)
) else
if c = 'v' then
(
if name = "var" then var else
//unknown
unknown(name)
) else
if c = 'w' then
(
if name = "wbr" then wbr else
//unknown
unknown(name)
) else
if c = 'x' then
(
if name = "xmp" then xmp else
//unknown
unknown(name)
) else
// no known tag name begins with this character
//unknown
unknown(name)
}.
*** Reading characters.
Our HTML parser works with the following levels of syntactical concepts:
- characters,
- tokens (like symbols, equal sign, angle brackets, ...)
- macro-tokens (tags with their attributes, end tags and chunks of text)
- HTML items.
The source document may be given in several different forms.
// Where the characters of the document come from: a string held in memory, or a
// stream ('connR' is what 'read_html_from_file' uses for a file).
public type Input:
string(String),
connR(RStream),
connRW(RWStream).
// State of the character reader. These variables are shared by every call of the
// 'read_html_...' functions (see the warning at the top of the file).

// Index, in the string, of the next character to be read.
variable Word32 position = 0. used only by string documents.
// The document currently being read.
variable Input document = string(""). source of original HTML document to be read
// Characters which have been unput and must be delivered again, most recent first.
variable List(Word8) unput_chars = []. used only by connection documents
Reading the next character.
define Maybe(Word8)
   next_char
     =
   // Delivers the next character of the current document, or 'failure' when
   // nothing more can be read.
   if *document is
     {
       // string document: read at the current index, then advance it
       string(source) then
         with index = *position,
           if nth(to_Int(index),source) is
             {
               failure     then failure,
               success(ch) then
                 position <- index + 1;
                 success(ch)
             },

       // stream documents: unput characters are delivered before reading the stream
       connR(stream) then
         if *unput_chars is
           {
             [ ]              then *stream,
             [first . others] then unput_chars <- others; success(first)
           },

       connRW(stream) then
         if *unput_chars is
           {
             [ ]              then *stream,
             [first . others] then unput_chars <- others; success(first)
           }
     }.
Unputting the most recently read character.
define One
   unput_char
     (
       Word8 c
     ) =
   // Gives back the character 'c', so that the next 'next_char' delivers it again.
   if *document is
     {
       // string document: stepping the index back is enough, 'c' itself is not used
       string(_) then
         position <- *position - 1,

       // stream documents: the character is pushed in front of the pending ones
       connR(_) then
         unput_chars <- [c . *unput_chars],

       connRW(_) then
         unput_chars <- [c . *unput_chars]
     }.
We need some classical tests on characters.
define Bool
   is_blank
     (
       Word8 c
     ) =
   // True for space, tabulation, line feed and carriage return.
   (c = ' ') | (c = '\t') | (c = '\n') | (c = '\r').
Names (of tags and attributes) may contain any character which has no special meaning
to HTML. Hence, we need a test for characters which may right delimit a name.
define Bool
   is_name_delimitor
     (
       Word8 c
     ) =
   // True for the characters which end a name or an unquoted value:
   // any blank, '<', '>' and '='.
   is_blank(c) | (c = '<') | (c = '>') | (c = '=').
define Bool
   is_letter_or_slash
     (
       Word8 c
     ) =
   // True for '/', 'a'..'z' and 'A'..'Z'. Used to decide whether a '<' found in a
   // text begins a tag.
   if c = '/'                     then true else
   if ('a' +=< c & c +=< 'z')     then true else
   ('A' +=< c & c +=< 'Z').
*** Reading tokens.
We read tokens only within a pair of angle brackets. In other words, chunks of text
flow are not made of tokens. Since HTML tags have the following form:
<tag_name name1=value1 ...>
</tag_name>
and since we have already read '<' when we begin to read tokens, our tokens are:
// What 'next_token' delivers while reading between '<' and '>'. 'next_token' also
// delivers 'tag_end' when the end of the document is reached.
public type Token:
name_or_value(String), // tag names, attribute names, attribute values
equals, // equal sign
tag_end. // '>'
Values may begin by a double quote, like in
src="images/gaga.gif"
In that case, we read until the next double quote. Otherwise, values are delimited like
symbols.
Reading a double quoted value. The character '"' has already been read (and not
kept). We must read up to the next one (which is also not kept).
define Token
   read_double_quoted
     (
       List(Word8) so_far
     ) =
   // Accumulates characters (most recent first in 'so_far') up to the closing
   // double quote, which is dropped. The end of the document also ends the value.
   if next_char is
     {
       failure     then name_or_value(implode(reverse(so_far))),
       success(ch) then
         if ch = '\"'
         then name_or_value(implode(reverse(so_far)))
         else read_double_quoted([ch . so_far])
     }.
Reading a name or a value (not beginning by a double quote).
define String
   read_name_or_value
     (
       List(Word8) so_far
     ) =
   // Accumulates characters (most recent first in 'so_far') until a name delimitor
   // or the end of the document. The delimitor is given back, not consumed.
   if next_char is
     {
       failure     then implode(reverse(so_far)),
       success(ch) then
         if is_name_delimitor(ch)
         then unput_char(ch); implode(reverse(so_far))
         else read_name_or_value([ch . so_far])
     }.
Now, we can read tokens. We have a mechanism for unputting tokens.
// Tokens which have been unput; 'next_token' delivers them before reading anything.
variable List(Token) unput_tokens = [].
define Token
   next_token
     (
       One u
     ) =
   // Delivers the next token: a pending unput token if there is one, otherwise a
   // token read from the document. The end of the document gives 'tag_end'.
   if *unput_tokens is
     {
       [ ] then
         if next_char is
           {
             failure     then tag_end,
             success(ch) then
               if is_blank(ch) then next_token(u)           else   // blanks are skipped
               if ch = '='     then equals                  else
               if ch = '\"'    then read_double_quoted([])  else   // quoted value
               if ch = '>'     then tag_end                 else
               name_or_value(read_name_or_value([ch]))             // name or unquoted value
           },

       [pending . others] then
         unput_tokens <- others; pending
     }.
define One
   unput_token
     (
       Token tok
     ) =
   // Gives back 'tok': it becomes the first token 'next_token' will deliver.
   with pending = *unput_tokens,
     unput_tokens <- [tok . pending].
Reading an attribute.
define Maybe(HTML_attribute)
   read_attribute
     =
   // Reads one attribute: 'name', 'name=' or 'name=value'. Gives 'failure' when the
   // next token cannot begin an attribute ('=' or the end of the tag).
   if next_token(unique) is
     {
       name_or_value(attr_name) then
         // result used whenever no value is found for this attribute
         with valueless = success(attr(attr_name,"")),
         if next_token(unique) is
           {
             // another name follows: it is given back for the next attribute
             name_or_value(following) then
               unput_token(name_or_value(following)); valueless,

             equals then
               if next_token(unique) is
                 {
                   name_or_value(attr_value) then success(attr(attr_name,attr_value)),
                   equals                    then failure,
                   tag_end                   then unput_token(tag_end); valueless
                 },

             // the end of the tag is given back so that the caller sees it
             tag_end then
               unput_token(tag_end); valueless
           },

       equals  then failure,
       tag_end then failure
     }.
Reading a sequence of attribute definitions.
define List(HTML_attribute)
   read_attributes
     (
       List(HTML_attribute) so_far
     ) =
   // Reads attributes until 'read_attribute' fails; 'so_far' holds those already
   // read, most recent first, and the result is in document order.
   if read_attribute is
     {
       failure          then reverse(so_far),
       success(latest)  then read_attributes([latest . so_far])
     }.
*** Reading macro-tokens.
HTML documents are sequences of text chunks and tags. The characters '<' and '>' are
used to switch from text to tags and from tags to text. Hence, it is quite easy to cut
down the HTML document into a sequence of 'macro-tokens'.
Important remark. It seems that an occurrence of '<' within a text, which is not
immediately followed by the name of a tag (without any blank in between), should be
considered as part of the text, i.e. not as the beginning of a tag. Actually, we
consider it as part of the text if the next character is neither a letter nor '/'.
// What 'next_macro_token' delivers: a chunk of text, an opening tag with its
// attributes, or an end tag.
public type MacroToken:
text(String),
tag(HTML_tag_name,List(HTML_attribute)),
end_tag(HTML_tag_name).
// Declaration only: the definition is given after 'read_text_chunk'.
define Maybe(MacroToken) next_macro_token.
'read_end_tag' reads an end tag, assuming that '</' has already been read.
define Maybe(MacroToken)
   read_end_tag
     =
   // Reads the name of an end tag, then requires the very next token to be '>'.
   // Anything else between the name and '>' gives 'failure'.
   with raw_name = read_name_or_value([]),
        tag_name = recognize_tag(raw_name),
     if next_token(unique) is
       {
         name_or_value(_) then failure,
         equals           then failure,
         tag_end          then success(end_tag(tag_name))
       }.
// Declarations only: these functions call each other and are defined further down.
define Maybe(MacroToken)
skip_html_comment.
define Maybe(MacroToken)
read_tag.
define Maybe(MacroToken)
next_macro_token.
define Maybe(MacroToken)
   skip_html_comment_3
     =
   // State reached when the last two characters read are '--'.
   //   '>'  ends the comment: the next macro-token is delivered,
   //   '-'  keeps this state, because the last two characters are still '--'
   //        (previously the state was lost, so a comment ending with '--->'
   //        was not seen as closed),
   //   anything else goes back to looking for a '-'.
   // The end of the document gives 'failure'.
   if next_char is
     {
       failure    then failure,
       success(c) then
         if c = '>' then next_macro_token    else
         if c = '-' then skip_html_comment_3 else
         skip_html_comment
     }.
define Maybe(MacroToken)
   skip_html_comment_2
     =
   // State reached when the last character read is a '-': a second '-' leads to
   // 'skip_html_comment_3', anything else goes back to 'skip_html_comment'.
   // The end of the document gives 'failure'.
   if next_char is
     {
       failure     then failure,
       success(ch) then
         if ch = '-'
         then skip_html_comment_3
         else skip_html_comment
     }.
define Maybe(MacroToken)
   skip_html_comment
     =
   // Skips characters until a '-' is found, then lets 'skip_html_comment_2' decide.
   // The end of the document gives 'failure'.
   if next_char is
     {
       failure     then failure,
       success(ch) then
         if ch = '-'
         then skip_html_comment_2
         else skip_html_comment
     }.
// Declaration only: 'read_tag' needs it, the definition follows 'read_tag'.
define Maybe(MacroToken)
read_text_chunk
(
List(Word8) so_far
).
'read_tag' reads a tag (begin or end) assuming that '<' has already been read.
define Maybe(MacroToken)
   read_tag
     =
   // Reads what follows a '<' which has just been consumed:
   //   '/'      an end tag,
   //   a blank  not a tag at all: '<' and the blank begin a text chunk. Every blank
   //            is handled that way (previously only the space was, so that a
   //            tabulation or a line break gave a tag whose name began with it),
   //   other    a tag name followed by its attributes.
   // The end of the document gives 'failure'.
   if next_char is
     {
       failure    then failure,
       success(c) then
         if c = '/'     then read_end_tag                else
         if is_blank(c) then read_text_chunk([c,'<'])    else   // most recent character first
         // if c = '!' then skip_html_comment else
         with name       = read_name_or_value([c]),
              attributes = read_attributes([]),
           success(tag(recognize_tag(name),attributes))
     }.
Reading a text chunk.
// Reads a chunk of text. 'so_far' holds the characters already read, most recent
// first. The chunk ends at the end of the document, or before a '<' which is followed
// by a letter or by '/'.
define Maybe(MacroToken)
read_text_chunk
(
List(Word8) so_far
) =
if next_char is
{
failure then
// end of the document: an empty chunk is not delivered
if so_far is
{
[ ] then failure,
[_ . _] then success(text(implode(reverse(so_far))))
},
success(c) then
if is_blank(c) then
(
// blanks are dropped at the beginning of the chunk and kept afterwards
if so_far is
{
[ ] then read_text_chunk(so_far),
[_ . _] then read_text_chunk([c . so_far])
}
) else
if c = '<'
then if next_char is
{
failure then success(text(implode(reverse([c . so_far])))),
success(d) then if is_letter_or_slash(d)
// a tag begins here: both characters are given back ('d' first, so that '<'
// is delivered first) and the chunk read so far, if any, is delivered
then (unput_char(d); unput_char(c);
if so_far is
{
[ ] then next_macro_token,
[_ . _] then success(text(implode(reverse(so_far))))
})
// otherwise '<' and the following character belong to the text
else read_text_chunk([d , c . so_far])
}
else read_text_chunk([c . so_far])
}.
Getting the next macro-token. Again, we need a mechanism for unputting macro-tokens.
// Macro-tokens which have been unput; 'next_macro_token' delivers them first.
variable List(MacroToken) unput_macro_tokens = [].
define Maybe(MacroToken)
   next_macro_token
     =
   // Delivers the next macro-token: a pending unput one if there is one, otherwise
   // one read from the document. 'failure' means the end of the document.
   if *unput_macro_tokens is
     {
       [ ] then
         if next_char is
           {
             failure     then failure,
             success(ch) then
               if ch = '<'     then read_tag          else
               if is_blank(ch) then next_macro_token  else   // blanks between items are skipped
               read_text_chunk([ch])
           },

       [pending . others] then
         unput_macro_tokens <- others; success(pending)
     }.
define One
   unput_macro_token
     (
       MacroToken mtok
     ) =
   // Gives back 'mtok': it becomes the first one 'next_macro_token' will deliver.
   with pending = *unput_macro_tokens,
     unput_macro_tokens <- [mtok . pending].
*** Reading the whole HTML document.
Now, reading a sequence of HTML items works as follows. If the next macro-token is a
chunk of text, we store this chunk of text. If the next macro-token is a tag with no
content, we store this tag. If the next macro-token is a tag with content, we call a
function which reads sequences of HTML items until an appropriate delimitor is
found. This makes an HTML item which is stored. Hence, obviously, what we need is just
this function.
// Reads HTML items until one of the delimitors in 'delim1' is found, or until the end
// of the document.
//   owner    name of the tag whose content is being read ('failure' if none),
//   delim1   the tags which end the sequence,
//   so_far   the items already read, most recent first.
//
// A delimiting opening tag is always given back, since it begins the next item.
// A delimiting end tag is consumed only if it is the end tag of 'owner'. Otherwise
// it belongs to an enclosing tag (for example </ul> ending the content of <li>) and
// it is given back so that the enclosing level sees it. Previously it was consumed,
// and the enclosing tag went on swallowing the rest of the document.
// Without an owner, a delimiting end tag is consumed, as before.
define List(HTML_item)
   read_html_items_within
     (
       Maybe(HTML_tag_name)  owner,
       List(HTML_tag)        delim1,
       List(HTML_item)       so_far
     ) =
   with mbmtok = next_macro_token,
   if mbmtok is
     {
       failure then reverse(so_far),
       success(mtok) then if mtok is
         {
           text(s) then
             read_html_items_within(owner,delim1,[text(s) . so_far]),

           tag(name,attributes) then
             if member(delim1,tag(name))
             then unput_macro_token(mtok); reverse(so_far)
             else with delim2 = delimitors(name),
               if delim2 is
                 {
                   [ ] then
                     read_html_items_within(owner,delim1,
                                            [no_content_tag(name,attributes) . so_far]),
                   [_ . _] then
                     with content = read_html_items_within(success(name),delim2,[]),
                       read_html_items_within(owner,delim1,
                                              [content_tag(name,attributes,content) . so_far])
                 },

           end_tag(name) then
             if member(delim1,end_tag(name))
             then (if owner is
                     {
                       failure    then unique,
                       success(o) then if o = name
                                       then unique                    // own end tag
                                       else unput_macro_token(mtok)   // for an enclosing tag
                     });
                  reverse(so_far)
             else read_html_items_within(owner,delim1,so_far)   // stray end tag: ignored
         }
     }.

// Entry point with the original interface: reads items with no owning tag.
define List(HTML_item)
   read_html_items
     (
       List(HTML_tag)   delim1,
       List(HTML_item)  so_far
     ) =
   read_html_items_within(failure,delim1,so_far).
*** The public tools.
public define List(HTML_item)
   read_html_from_string
     (
       String html_document
     ) =
   // Installs the string as current document, read from its first character, and
   // parses it up to its end.
   position <- 0;
   document <- string(html_document);
   read_html_items([],[]).
public define List(HTML_item)
   read_html_from_file
     (
       String file_name
     ) =
   // Opens the file for reading and parses its content. The result is the empty
   // list if the file cannot be opened.
   if (Maybe(RStream))connect to file file_name is
     {
       failure then [],
       success(stream) then
         unput_chars <- [];            // no character pending from a previous document
         document <- connR(stream);
         read_html_items([],[])
     }.
public define List(HTML_item)
   xread_html_from_http_url
     (
       String               server_name,
       String               document_name,
       List(HTTP_argument)  operands
     ) =
   // Gets the document with 'http_get' and parses the third component of an 'ok'
   // answer. Any other answer gives the empty list.
   if http_get(server_name,document_name,operands) is
     ok(_,_,page) then (
                         position <- 0;
                         document <- string(page);
                         read_html_items([],[])
                       )
     else [].
public define List(HTML_item)
   read_html_from_https_url
     (
       String                 server_name,
       String                 document_name,
       List(HTTP_argument)    operands,
       (Maybe(X509)) -> Bool  accept_policy
     ) =
   // Gets the document with 'https_get' and parses the third component of an 'ok'
   // answer. Any other answer gives the empty list.
   if https_get(server_name,document_name,operands,accept_policy) is
     ok(_,_,page) then (
                         position <- 0;
                         document <- string(page);
                         read_html_items([],[])
                       )
     else [].
*** Tools.
public define List(HTML_item)
   get_all_tags
     (
       List(HTML_item) l,
       HTML_tag_name   tag
     ) =
   // Collects, in document order, every tag item named 'tag' found in 'l' or inside
   // the content of the tags of 'l'. A matching tag is listed before the matching
   // tags found in its own content.
   if l is
     {
       [ ] then [ ],
       [first . others] then
         with in_rest = get_all_tags(others,tag),
         if first is
           {
             text(_) then in_rest,

             no_content_tag(found,_) then
               if found = tag
               then [first . in_rest]
               else in_rest,

             content_tag(found,_,inside) then
               with nested = append(get_all_tags(inside,tag),in_rest),
                 if found = tag
                 then [first . nested]
                 else nested
           }
     }.
*** Printing HTML items.
// Gives the name of a tag as it is written in a document. For 'unknown', this is the
// name carried by the datum. Used for printing.
define String
name_of
(
HTML_tag_name tag
) =
if tag is
{
unknown(String name) then name,
//unknown then "???",
a then "a",
abbr then "abbr",
acronym then "acronym",
address then "address",
applet then "applet",
area then "area",
b then "b",
base then "base",
basefont then "basefont",
bdo then "bdo",
bgsound then "bgsound",
big then "big",
blink then "blink",
blockquote then "blockquote",
body then "body",
br then "br",
button then "button",
caption then "caption",
center then "center",
cite then "cite",
code then "code",
col then "col",
colgroup then "colgroup",
comment then "comment",
dd then "dd",
del then "del",
dfn then "dfn",
dir then "dir",
div then "div",
dl then "dl",
dt then "dt",
em then "em",
embed then "embed",
fieldset then "fieldset",
font then "font",
form then "form",
frame then "frame",
frameset then "frameset",
h1 then "h1",
h2 then "h2",
h3 then "h3",
h4 then "h4",
h5 then "h5",
h6 then "h6",
head then "head",
hr then "hr",
html then "html",
i then "i",
iframe then "iframe",
ilayer then "ilayer",
img then "img",
input then "input",
ins then "ins",
isindex then "isindex",
kbd then "kbd",
keygen then "keygen",
label then "label",
layer then "layer",
legend then "legend",
li then "li",
link then "link",
listing then "listing",
map then "map",
marquee then "marquee",
menu then "menu",
meta then "meta",
multicol then "multicol",
nextid then "nextid",
nobr then "nobr",
noembed then "noembed",
noframes then "noframes",
noscript then "noscript",
object then "object",
ol then "ol",
optgroup then "optgroup",
option then "option",
p then "p",
param then "param",
plaintext then "plaintext",
pre then "pre",
q then "q",
s then "s",
samp then "samp",
script then "script",
select then "select",
server then "server",
small then "small",
spacer then "spacer",
span then "span",
strike then "strike",
strong then "strong",
style then "style",
sub then "sub",
sup then "sup",
table then "table",
tbody then "tbody",
td then "td",
textarea then "textarea",
tfoot then "tfoot",
th then "th",
thead then "thead",
title then "title",
tr then "tr",
tt then "tt",
u then "u",
ul then "ul",
var then "var",
wbr then "wbr",
xmp then "xmp",
}.
define One
   print_margin_aux
     (
       Word32 width
     ) =
   // Prints 'width' spaces, one at a time; prints nothing when 'width -< 1' holds.
   if width -< 1
   then unique
   else (print(" "); print_margin_aux(width-1)).
define One
   print_margin
     (
       Word32 width
     ) =
   // Begins a new line, then prints a left margin of 'width' spaces.
   print("\n");
   print_margin_aux(width).
define One
   print
     (
       List(HTML_attribute) l
     ) =
   // Prints the attributes of a tag, each one preceded by a space, as 'name' when
   // the value is empty and as 'name="value"' otherwise.
   // The printing of the value is parenthesised so that the remaining attributes are
   // printed in both cases, whatever the way 'else ... ; ...' is grouped.
   if l is
     {
       [ ] then unique,
       [h . t] then if h is attr(name,val) then
         print(" ");
         print(name);
         (if val = ""
          then unique
          else (print("=\""); print(val); print("\"")));
         print(t)
     }.
public define One
   print
     (
       HTML_item item,
       Word32    indent
     ) =
   // Prints one item on a new line, after a margin of 'indent' spaces. The content
   // of a tag is printed with a margin two characters wider, and is followed by the
   // end tag, printed with the margin of the tag itself.
   if item is
     {
       text(chunk) then
         print_margin(indent);
         print(chunk),

       no_content_tag(tag_name,attributes) then
         print_margin(indent);
         print("<");
         print(name_of(tag_name));
         print(attributes);
         print(">"),

       content_tag(tag_name,attributes,inside) then
         print_margin(indent);
         print("<");
         print(name_of(tag_name));
         print(attributes);
         print(">\n");
         print(inside,indent+2);
         print_margin(indent);
         print("</");
         print(name_of(tag_name));
         print(">")
     }.
public define One
   print
     (
       List(HTML_item) items,
       Word32          margin
     ) =
   // Prints the items one after the other, all with the same margin.
   if items is
     {
       [ ] then unique,
       [first . others] then
         print(first,margin);
         print(others,margin)
     }.