Mercurial Hosting > luan
comparison core/src/luan/modules/parsers/Html.java @ 625:a3c1e11fb6aa
rewrite much of Html to be more understandable;
add Lucene html_highlighter();
author | Franklin Schmidt <fschmidt@gmail.com> |
---|---|
date | Tue, 12 Jan 2016 23:52:56 -0700 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
624:8281a248c47e | 625:a3c1e11fb6aa |
---|---|
1 package luan.modules.parsers; | |
2 | |
3 import java.util.List; | |
4 import java.util.ArrayList; | |
5 import java.util.Set; | |
6 import java.util.HashSet; | |
7 import luan.LuanTable; | |
8 | |
9 | |
10 public final class Html { | |
11 | |
12 public static LuanTable toList(String text,LuanTable containerTagsTbl) throws ParseException { | |
13 return new Html(text,containerTagsTbl).parse(); | |
14 } | |
15 | |
16 private final Parser parser; | |
17 private final Set<String> containerTags = new HashSet<String>(); | |
18 | |
19 private Html(String text,LuanTable containerTagsTbl) { | |
20 this.parser = new Parser(text); | |
21 for( Object v : containerTagsTbl.asList() ) { | |
22 containerTags.add((String)v); | |
23 } | |
24 } | |
25 | |
26 private LuanTable parse() throws ParseException { | |
27 List list = new ArrayList(); | |
28 StringBuilder sb = new StringBuilder(); | |
29 while( !parser.endOfInput() ) { | |
30 if( parser.test('<') ) { | |
31 LuanTable tbl = parseTag(); | |
32 if( tbl != null ) { | |
33 String tagName = (String)tbl.rawGet("name"); | |
34 if( containerTags.contains(tagName) ) { | |
35 LuanTable container = parseContainer(tbl); | |
36 if( container != null ) | |
37 tbl = container; | |
38 } | |
39 if( tbl != null | |
40 || (tbl = parseComment()) != null | |
41 || (tbl = parseCdata()) != null | |
42 ) { | |
43 if( sb.length() > 0 ) { | |
44 list.add(sb.toString()); | |
45 sb.setLength(0); | |
46 } | |
47 list.add(tbl); | |
48 continue; | |
49 } | |
50 } | |
51 } | |
52 sb.append( parser.currentChar() ); | |
53 parser.anyChar(); | |
54 } | |
55 if( sb.length() > 0 ) | |
56 list.add(sb.toString()); | |
57 return new LuanTable(list); | |
58 } | |
59 | |
60 private LuanTable parseComment() { | |
61 parser.begin(); | |
62 if( !parser.match("<!--") ) | |
63 return parser.failure(null); | |
64 int start = parser.currentIndex(); | |
65 while( !parser.test("-->") ) { | |
66 if( !parser.anyChar() ) | |
67 return parser.failure(null); | |
68 } | |
69 String text = parser.textFrom(start); | |
70 LuanTable tbl = new LuanTable(); | |
71 tbl.rawPut("type","comment"); | |
72 tbl.rawPut("text",text); | |
73 return parser.success(tbl); | |
74 } | |
75 | |
76 private LuanTable parseCdata() { | |
77 parser.begin(); | |
78 if( !parser.match("<![CDATA[") ) | |
79 return parser.failure(null); | |
80 int start = parser.currentIndex(); | |
81 while( !parser.test("]]>") ) { | |
82 if( !parser.anyChar() ) | |
83 return parser.failure(null); | |
84 } | |
85 String text = parser.textFrom(start); | |
86 LuanTable tbl = new LuanTable(); | |
87 tbl.rawPut("type","cdata"); | |
88 tbl.rawPut("text",text); | |
89 return parser.success(tbl); | |
90 } | |
91 | |
92 private LuanTable parseContainer(LuanTable tag) { | |
93 String endTagName = '/' + (String)tag.rawGet("name"); | |
94 int start = parser.begin(); | |
95 int end; | |
96 while(true) { | |
97 if( parser.test('<') ) { | |
98 end = parser.currentIndex(); | |
99 LuanTable tag2 = parseTag(); | |
100 String s = (String)tag2.rawGet("name"); | |
101 if( s.equals(endTagName) ) | |
102 break; | |
103 } | |
104 if( !parser.anyChar() ) | |
105 return parser.failure(null); | |
106 } | |
107 String text = parser.text.substring(start,end); | |
108 LuanTable tbl = new LuanTable(); | |
109 tbl.rawPut("type","container"); | |
110 tbl.rawPut("tag",tag); | |
111 tbl.rawPut("text",text); | |
112 return parser.success(tbl); | |
113 } | |
114 | |
115 private LuanTable parseTag() { | |
116 parser.begin(); | |
117 if( !parser.match('<') ) | |
118 return parser.failure(null); | |
119 int start = parser.currentIndex(); | |
120 parser.match('/'); | |
121 if( !matchNameChar() ) | |
122 return parser.failure(null); | |
123 while( matchNameChar() ); | |
124 String name = parser.textFrom(start).toLowerCase(); | |
125 LuanTable attributes = new LuanTable(); | |
126 String attrName; | |
127 while( (attrName = parseAttrName()) != null ) { | |
128 String attrValue = parseAttrValue(); | |
129 attributes.rawPut( attrName, attrValue!=null ? attrValue : true ); | |
130 } | |
131 while( matchSpace() ); | |
132 boolean isEmpty = parser.match('/'); | |
133 if( !parser.match('>') ) | |
134 return parser.failure(null); | |
135 LuanTable tbl = new LuanTable(); | |
136 tbl.rawPut("type","tag"); | |
137 tbl.rawPut("name",name); | |
138 tbl.rawPut("attributes",attributes); | |
139 tbl.rawPut("is_empty",isEmpty); | |
140 return parser.success(tbl); | |
141 } | |
142 | |
143 private String parseAttrName() { | |
144 parser.begin(); | |
145 if( !matchSpace() ) | |
146 return parser.failure(null); | |
147 while( matchSpace() ); | |
148 int start = parser.currentIndex(); | |
149 if( !matchNameChar() ) | |
150 return parser.failure(null); | |
151 while( matchNameChar() ); | |
152 String name = parser.textFrom(start); | |
153 return parser.success(name); | |
154 } | |
155 | |
156 private String parseAttrValue() { | |
157 parser.begin(); | |
158 while( matchSpace() ); | |
159 if( !parser.match('=') ) | |
160 return parser.failure(null); | |
161 while( matchSpace() ); | |
162 if( parser.anyOf("\"'") ) { | |
163 char quote = parser.lastChar(); | |
164 int start = parser.currentIndex(); | |
165 while( !parser.test(quote) ) { | |
166 if( !parser.anyChar() ) | |
167 return parser.failure(null); | |
168 } | |
169 String value = parser.textFrom(start); | |
170 parser.match(quote); | |
171 return parser.success(value); | |
172 } | |
173 int start = parser.currentIndex(); | |
174 if( !matchValueChar() ) | |
175 return parser.failure(null); | |
176 while( matchValueChar() ); | |
177 String value = parser.textFrom(start); | |
178 return parser.success(value); | |
179 } | |
180 | |
181 private boolean matchNameChar() { | |
182 return parser.inCharRange('a','z') | |
183 || parser.inCharRange('A','Z') | |
184 || parser.inCharRange('0','9') | |
185 || parser.anyOf("_.-:") | |
186 ; | |
187 } | |
188 | |
189 private boolean matchValueChar() { | |
190 return parser.noneOf(" \t\r\n\"'>/="); | |
191 } | |
192 | |
193 private boolean matchSpace() { | |
194 return parser.anyOf(" \t\r\n"); | |
195 } | |
196 | |
197 } |