Mercurial Hosting > luan
diff core/src/luan/modules/parsers/Html.java @ 625:a3c1e11fb6aa
rewrite much of Html to be more understandable;
add Lucene html_highlighter();
author | Franklin Schmidt <fschmidt@gmail.com> |
---|---|
date | Tue, 12 Jan 2016 23:52:56 -0700 |
parents | |
children |
line wrap: on
line diff
// --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/core/src/luan/modules/parsers/Html.java Tue Jan 12 23:52:56 2016 -0700 @@ -0,0 +1,197 @@
package luan.modules.parsers;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import luan.LuanTable;


/**
 * Splits HTML text into a flat list of items.
 *
 * <p>Each item of the resulting list is either a {@code String} of plain text
 * or a {@link LuanTable} whose {@code "type"} entry is one of {@code "tag"},
 * {@code "comment"}, {@code "cdata"} or {@code "container"}.
 *
 * <p>NOTE(review): the sub-parsers below bracket every attempt with
 * {@code parser.begin()} and {@code parser.success(..)} / {@code parser.failure(..)}.
 * This presumably means "commit the consumed input" / "rewind to where begin()
 * was called"; confirm against {@code Parser}.
 */
public final class Html {

	/**
	 * Parses {@code text} into a list of text strings and tag/comment/cdata/container tables.
	 *
	 * @param text the HTML source to parse
	 * @param containerTagsTbl list of tag names (each element is cast to {@code String})
	 *        whose content up to the matching end tag is captured verbatim as a
	 *        {@code "container"} item instead of being parsed
	 * @return a {@code LuanTable} built from the list of parsed items
	 * @throws ParseException declared for callers; nothing in this class throws it directly
	 */
	public static LuanTable toList(String text,LuanTable containerTagsTbl) throws ParseException {
		return new Html(text,containerTagsTbl).parse();
	}

	private final Parser parser;
	private final Set<String> containerTags = new HashSet<String>();

	private Html(String text,LuanTable containerTagsTbl) {
		this.parser = new Parser(text);
		for( Object v : containerTagsTbl.asList() ) {
			containerTags.add((String)v);
		}
	}

	/**
	 * Main loop: at every {@code '<'} tries a tag (possibly upgraded to a container),
	 * then a comment, then a CDATA section.  Anything that is not recognised is
	 * accumulated character by character as plain text.
	 */
	private LuanTable parse() throws ParseException {
		List<Object> list = new ArrayList<Object>();
		StringBuilder sb = new StringBuilder();
		while( !parser.endOfInput() ) {
			if( parser.test('<') ) {
				LuanTable tbl = parseTag();
				if( tbl != null ) {
					String tagName = (String)tbl.rawGet("name");
					if( containerTags.contains(tagName) ) {
						// If no matching end tag is found, keep the plain tag.
						LuanTable container = parseContainer(tbl);
						if( container != null )
							tbl = container;
					}
				} else {
					// Not a tag: fall back to comment, then CDATA.  These fallbacks
					// must run when parseTag() fails, not when it succeeds.
					tbl = parseComment();
					if( tbl == null )
						tbl = parseCdata();
				}
				if( tbl != null ) {
					// Flush pending plain text before the structured item.
					if( sb.length() > 0 ) {
						list.add(sb.toString());
						sb.setLength(0);
					}
					list.add(tbl);
					continue;
				}
			}
			sb.append( parser.currentChar() );
			parser.anyChar();
		}
		if( sb.length() > 0 )
			list.add(sb.toString());
		return new LuanTable(list);
	}

	/**
	 * Parses {@code <!-- ... -->}.
	 *
	 * @return a table with {@code type="comment"} and {@code text} set to the content
	 *         between the delimiters, or {@code null} if no complete comment starts here
	 */
	private LuanTable parseComment() {
		return parseDelimited("<!--","-->","comment");
	}

	/**
	 * Parses {@code <![CDATA[ ... ]]>}.
	 *
	 * @return a table with {@code type="cdata"} and {@code text} set to the content
	 *         between the delimiters, or {@code null} if no complete section starts here
	 */
	private LuanTable parseCdata() {
		return parseDelimited("<![CDATA[","]]>","cdata");
	}

	/**
	 * Shared implementation for comment and CDATA parsing: matches {@code open},
	 * collects everything up to {@code close}, and consumes {@code close} as well
	 * so the terminator is not emitted as trailing text.
	 */
	private LuanTable parseDelimited(String open,String close,String type) {
		parser.begin();
		if( !parser.match(open) )
			return parser.failure(null);
		int start = parser.currentIndex();
		while( !parser.test(close) ) {
			if( !parser.anyChar() )
				return parser.failure(null);  // unterminated
		}
		String text = parser.textFrom(start);
		parser.match(close);
		LuanTable tbl = new LuanTable();
		tbl.rawPut("type",type);
		tbl.rawPut("text",text);
		return parser.success(tbl);
	}

	/**
	 * Captures the raw text following the already-parsed start tag {@code tag}
	 * up to (not including) its matching end tag.
	 *
	 * @param tag the start tag table produced by {@link #parseTag()}
	 * @return a table with {@code type="container"}, {@code tag} and {@code text},
	 *         or {@code null} if the input ends before the end tag is found
	 */
	private LuanTable parseContainer(LuanTable tag) {
		String endTagName = '/' + (String)tag.rawGet("name");
		int start = parser.begin();
		while(true) {
			if( parser.test('<') ) {
				int end = parser.currentIndex();
				LuanTable tag2 = parseTag();
				if( tag2 != null ) {
					if( endTagName.equals(tag2.rawGet("name")) ) {
						String text = parser.text.substring(start,end);
						LuanTable tbl = new LuanTable();
						tbl.rawPut("type","container");
						tbl.rawPut("tag",tag);
						tbl.rawPut("text",text);
						return parser.success(tbl);
					}
					// An inner tag was consumed whole; resume scanning right after it
					// so an end tag that follows immediately is not skipped.
					continue;
				}
				// A '<' that does not start a tag (e.g. "a < b"): treat as content.
			}
			if( !parser.anyChar() )
				return parser.failure(null);
		}
	}

	/**
	 * Parses a start, end or empty tag.
	 *
	 * @return a table with {@code type="tag"}, {@code name} (lower-cased; includes the
	 *         leading {@code '/'} for end tags), {@code attributes} (attribute name to
	 *         string value, or {@code true} when the attribute has no value) and
	 *         {@code is_empty} (whether {@code '/'} precedes the closing {@code '>'}),
	 *         or {@code null} if no well-formed tag starts here
	 */
	private LuanTable parseTag() {
		parser.begin();
		if( !parser.match('<') )
			return parser.failure(null);
		int start = parser.currentIndex();
		parser.match('/');  // optional; stays part of the name
		if( !matchNameChar() )
			return parser.failure(null);
		while( matchNameChar() );
		// Locale.ROOT keeps the result independent of the default locale.
		String name = parser.textFrom(start).toLowerCase(Locale.ROOT);
		LuanTable attributes = new LuanTable();
		String attrName;
		while( (attrName = parseAttrName()) != null ) {
			String attrValue = parseAttrValue();
			attributes.rawPut( attrName, attrValue!=null ? attrValue : true );
		}
		while( matchSpace() );
		boolean isEmpty = parser.match('/');
		if( !parser.match('>') )
			return parser.failure(null);
		LuanTable tbl = new LuanTable();
		tbl.rawPut("type","tag");
		tbl.rawPut("name",name);
		tbl.rawPut("attributes",attributes);
		tbl.rawPut("is_empty",isEmpty);
		return parser.success(tbl);
	}

	/**
	 * Parses mandatory whitespace followed by an attribute name.
	 *
	 * @return the attribute name as written, or {@code null} if none is present
	 */
	private String parseAttrName() {
		parser.begin();
		if( !matchSpace() )
			return parser.failure(null);
		while( matchSpace() );
		int start = parser.currentIndex();
		if( !matchNameChar() )
			return parser.failure(null);
		while( matchNameChar() );
		String name = parser.textFrom(start);
		return parser.success(name);
	}

	/**
	 * Parses {@code = value}, where the value is either quoted with {@code "} or
	 * {@code '} or is an unquoted run of value characters.
	 *
	 * @return the value without surrounding quotes, or {@code null} if there is no
	 *         {@code '='}, the quoted value is unterminated, or the unquoted value is empty
	 */
	private String parseAttrValue() {
		parser.begin();
		while( matchSpace() );
		if( !parser.match('=') )
			return parser.failure(null);
		while( matchSpace() );
		if( parser.anyOf("\"'") ) {
			// presumably lastChar() is the quote character just matched by anyOf()
			char quote = parser.lastChar();
			int start = parser.currentIndex();
			while( !parser.test(quote) ) {
				if( !parser.anyChar() )
					return parser.failure(null);
			}
			String value = parser.textFrom(start);
			parser.match(quote);
			return parser.success(value);
		}
		int start = parser.currentIndex();
		if( !matchValueChar() )
			return parser.failure(null);
		while( matchValueChar() );
		String value = parser.textFrom(start);
		return parser.success(value);
	}

	/** Matches one character allowed in tag and attribute names: letters, digits and {@code _ . - :}. */
	private boolean matchNameChar() {
		return parser.inCharRange('a','z')
			|| parser.inCharRange('A','Z')
			|| parser.inCharRange('0','9')
			|| parser.anyOf("_.-:")
		;
	}

	/** Matches one character allowed in an unquoted attribute value. */
	private boolean matchValueChar() {
		return parser.noneOf(" \t\r\n\"'>/=");
	}

	/** Matches one whitespace character: space, tab, CR or LF. */
	private boolean matchSpace() {
		return parser.anyOf(" \t\r\n");
	}

}