diff options
-rw-r--r-- | readme.md | 6 | ||||
-rw-r--r-- | src/lib.rs | 102 |
2 files changed, 85 insertions, 23 deletions
diff --git a/readme.md b/readme.md index 89585a8..725a71a 100644 --- a/readme.md +++ b/readme.md @@ -1,4 +1,6 @@ opinionated, standards non-compliant HTML parser meant to consume handwritten HTML. -- all tags must close -- no > in tags except at the end (not even in attributes) \ No newline at end of file +- all tags must close *(even `<br/>`, `<meta/>`, `<link/>`)* +- no > in tags except at the end (not even in attributes) +- inline `<script>` and `<style>` must have their closing-tag be + first-of-line *(excluding whitespace)* \ No newline at end of file diff --git a/src/lib.rs b/src/lib.rs index 31e8a17..36a9075 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -60,16 +60,36 @@ impl Html { let mut children = vec![]; loop { - match Self::is_tag(rest.unwrap()) { - Some(( - ParsedTag { - closing: true, - name, - self_closing: false, - .. - }, - remaining, - )) if name == root_tag.name => { + // Special case <script> and <style> + if root_tag.name == "script" || root_tag.name == "style" { + let special = Self::special_parse(rest.unwrap(), root_tag.name); + + match special { + None => { + panic!("found tag '{}' with no end", root_tag.name); + } + Some((text, remaining)) => { + let remaining = if remaining.is_empty() { + None + } else { + Some(remaining) + }; + + return Consumed { + node: Node::Tag { + self_closing: false, + name: root_tag.name.into(), + children: vec![text!(text)], + }, + remaining, + }; + } + } + } + + // Find the closing end of out root_tag + if let Some((parsed, remaining)) = Self::is_tag(rest.unwrap()) { + if parsed.closing && parsed.name == root_tag.name { break Consumed { node: Node::Tag { self_closing: false, @@ -79,13 +99,38 @@ impl Html { remaining, }; } - _ => { - let cons = Self::parse_node(rest.unwrap()); - rest = cons.remaining; - children.push(cons.node); + } + + // Not our closing root? 
/// Splits the raw body of a `<script>`/`<style>` element at its closing tag.
///
/// The body is not parsed as HTML. Instead the closing tag
/// `</{looking_for_name}>` is only recognised when it is the first
/// non-whitespace text of a line, so that occurrences inside the script or
/// stylesheet itself (e.g. in a string literal) are ignored.
///
/// The very start of `raw` counts as a line start as well: an element whose
/// content is empty or whitespace-only (`<script src="a.js"></script>`) is
/// closed correctly instead of being reported as unterminated.
///
/// # Parameters
/// - `raw`: the input immediately following the opening tag.
/// - `looking_for_name`: the tag name whose closing tag terminates the body.
///
/// # Returns
/// `Some((text, remaining))` where `text` is everything before the closing
/// tag (including the whitespace that indents it) and `remaining` is
/// everything after it. `None` if no closing tag sits at the start of a line.
fn special_parse<'a>(raw: &'a str, looking_for_name: &str) -> Option<(&'a str, &'a str)> {
    let close = format!("</{looking_for_name}>");

    // Candidate positions: the start of the input plus the byte following
    // every newline.
    let line_starts =
        std::iter::once(0).chain(raw.match_indices('\n').map(|(newline_idx, _)| newline_idx + 1));

    for start in line_starts {
        // Skip leading whitespace. This also skips blank lines, since '\n'
        // is ASCII whitespace. If only whitespace remains there can be no
        // closing tag any more, so `?` bails out with `None`.
        let indent = raw[start..].find(|c: char| !c.is_ascii_whitespace())?;
        let tag_at = start + indent;

        if let Some(remaining) = raw[tag_at..].strip_prefix(close.as_str()) {
            return Some((&raw[..tag_at], remaining));
        }
    }

    None
}