diff options
author | gennyble <gen@nyble.dev> | 2023-12-27 17:18:12 -0600 |
---|---|---|
committer | gennyble <gen@nyble.dev> | 2023-12-27 17:18:12 -0600 |
commit | 4b4689ca0d086e11de52cbbf848fa0873acd7274 (patch) | |
tree | 9da09a0b93cf50b47dc771e4e6ff1a9cdaae14c1 | |
parent | 54dec82cb8620937306e9a16b4b532cb52c7ee55 (diff) | |
download | cutie-4b4689ca0d086e11de52cbbf848fa0873acd7274.tar.gz cutie-4b4689ca0d086e11de52cbbf848fa0873acd7274.zip |
roundtrips!
-rw-r--r-- | readme.md | 2 | ||||
-rw-r--r-- | src/lib.rs | 188 | ||||
-rw-r--r-- | tests/nyble_pages.rs | 22 |
3 files changed, 169 insertions, 43 deletions
diff --git a/readme.md b/readme.md index 725a71a..d0fc3b9 100644 --- a/readme.md +++ b/readme.md @@ -1,6 +1,8 @@ opinionated, standards non-compliant HTML parser meant to consume handwritten HTML. - all tags must close *(even `<br/>`, `<meta/>`, `<link/>`)* +- tag names must be separated from the tag-body *(where the attributes go)* by a space character *(` `, 0x20)* +- self-closing tags must have the closing `/` at the very end of the body *(directly before the `>`)* - no > in tags except at the end (not even in attributes) - inline `<script>` and `<style>` must have their closing-tag be first-of-line *(excluding whitespace)* \ No newline at end of file diff --git a/src/lib.rs b/src/lib.rs index 26ec5dd..f1010ae 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -43,12 +43,14 @@ impl Html { Tag { name: root_tag.name.to_owned(), body: None, + self_closing: root_tag.self_closing, children: vec![], } } else { Tag { name: root_tag.name.into(), body: Some(root_tag.body.to_owned()), + self_closing: root_tag.self_closing, children: vec![], } }; @@ -65,11 +67,14 @@ impl Html { }; } + println!("Looking for {}", root_tag.name); + loop { // Special case <script> and <style> if root_tag.name == "script" && tag.get_attribute("src").is_none() || root_tag.name == "style" { + println!("\tparse special"); let special = Self::special_parse(rest.unwrap(), root_tag.name); match special { @@ -95,6 +100,7 @@ impl Html { // Find the closing end of out root_tag if let Some((parsed, remaining)) = Self::is_tag(rest.unwrap()) { if parsed.closing && parsed.name == root_tag.name { + println!("\tclosed {}", parsed.name); break Consumed { node: Node::Tag(tag), remaining, @@ -165,43 +171,64 @@ impl Html { }; let tag_innards = &raw[1..idx]; - let close = tag_innards.starts_with('/'); - let self_close_idx = { - let possible = &tag_innards[1..]; - let last_whole_char = possible.rfind(|c: char| !c.is_ascii_whitespace()).map(|n| n + 1); - let last_slash = possible.rfind('/').map(|n| n + 1); - match 
(last_slash, last_whole_char) { - (Some(slash), Some(whole)) if slash == whole => { - Some(slash) - }, - _ => None + // Tag is closing. Don't check for self-close + // as they cannot be on the same tag + if let Some(closing) = tag_innards.strip_prefix('/') { + match closing.find(' ') { + None => return Some((ParsedTag { + closing: true, + self_closing: false, + name: closing, + body: "" + }, rest)), + Some(idx) => { + let name = &closing[..idx]; + let body = &closing[idx..]; + + return Some((ParsedTag{ + closing: true, + self_closing: false, + name, + body + }, rest)) + } } - }; - let self_close = self_close_idx.is_some(); - // can't close and self_close - if close && self_close { return None; } - - // clean the close from the raw string - let name_raw = if let Some(close_idx) = self_close_idx { - &tag_innards[..close_idx] - } else if close { - &tag_innards[1..] - } else { - tag_innards - }; + } - let (name, body) = match name_raw.find(|c: char| c.is_ascii_whitespace()){ + if let Some(closing) = tag_innards.strip_suffix('/') { + match closing.find(' ') { + None => return Some((ParsedTag { + closing: false, + self_closing: true, + name: closing, + body: "" + }, rest)), + Some(idx) => { + let name = &closing[..idx]; + let body = &closing[idx+1..]; + + return Some((ParsedTag{ + closing: false, + self_closing: true, + name, + body + }, rest)) + } + } + } + + let (name, body) = match tag_innards.find(' '){ None => { - (name_raw, "") + (tag_innards, "") }, Some(idx) => { - (&name_raw[..idx], &name_raw[idx+1..]) + (&tag_innards[..idx], &tag_innards[idx+1..]) } }; Some((ParsedTag{ - closing: close, - self_closing: self_close, + closing: false, + self_closing: false, name, body }, rest)) @@ -242,6 +269,15 @@ impl Html { } } +impl fmt::Display for Html { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + for node in &self.nodes { + write!(f, "{node}")?; + } + Ok(()) + } +} + struct Consumed<'a> { node: Node, remaining: Option<&'a str>, @@ -263,21 +299,29 @@ pub 
enum Node { Comment(String), } +impl fmt::Display for Node { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::Text(txt) => write!(f, "{txt}"), + Self::Tag(tag) => write!(f, "{tag}"), + Self::Comment(cmt) => write!(f, "<!--{cmt}-->"), + } + } +} + #[derive(Clone, Debug, PartialEq)] pub struct Tag { pub name: String, /// Everything inside the tag that's not it's name. Includes a /// self-close if there is one. pub body: Option<String>, + pub self_closing: bool, pub children: Vec<Node>, } impl Tag { pub fn self_closing(&self) -> bool { - self.body - .as_deref() - .map(|s| s.trim_end().ends_with('/')) - .unwrap_or(false) + self.self_closing } pub fn get_attribute<'a>(&'a self, key: &str) -> Option<&'a str> { @@ -340,12 +384,41 @@ impl Tag { } } +impl fmt::Display for Tag { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let Tag { + name, + body, + self_closing, + children, + } = self; + + let formatted_body = if let Some(body) = body { + format!(" {body}") + } else { + String::from("") + }; + + if *self_closing { + // we ignore our children if we're self-closing. + write!(f, "<{name} {}/>", body.as_deref().unwrap_or_default()) + } else { + write!(f, "<{name}{formatted_body}>")?; + for child in children { + write!(f, "{}", child)?; + } + write!(f, "</{name}>") + } + } +} + #[macro_export] macro_rules! tag { ($name:expr) => { $crate::Node::Tag($crate::Tag { name: String::from($name), body: None, + self_closing: false, children: vec![], }) }; @@ -354,6 +427,7 @@ macro_rules! tag { $crate::Node::Tag($crate::Tag { name: String::from($name), body: None, + self_closing: false, children: vec![$($children),+], }) }; @@ -362,6 +436,7 @@ macro_rules! tag { $crate::Node::Tag($crate::Tag { name: String::from($name), body: Some(String::from($body)), + self_closing: false, children: vec![], }) }; @@ -370,6 +445,7 @@ macro_rules! 
tag { $crate::Node::Tag($crate::Tag { name: String::from($name), body: Some(String::from($body)), + self_closing: false, children: vec![$($children),+], }) }; @@ -529,6 +605,39 @@ mod test { )] ) } + + fn test_roundtrip(raw: &str) { + let html = Html::parse(raw); + let string = html.to_string(); + + for (raw, html) in raw.lines().zip(string.lines()) { + assert_eq!(raw, html) + } + } + + #[test] + fn round_trip_simple() { + test_roundtrip("<p>Hello!</p>") + } + + #[test] + fn round_trip_complex() { + test_roundtrip( + r#" +<html> + <head> + <link rel="style.css"/> + <title>Title!</title> + <script> + alert("hello!"); + </script> + </head> + <body> + <p>Hello, <i>World!</i></p> + </body> +</html>"#, + ) + } } #[cfg(test)] @@ -540,6 +649,7 @@ mod tag_test { let tag = Tag { name: "div".into(), body: Some("contenteditable".into()), + self_closing: false, children: vec![], }; assert!(tag.get_attribute("contenteditable").is_some()) @@ -550,26 +660,18 @@ mod tag_test { let tag = Tag { name: "script".into(), body: Some("src=\"script.js\"".into()), + self_closing: false, children: vec![], }; assert_eq!(tag.get_attribute("src"), Some("script.js")) } #[test] - fn tag_finds_attribute_with_self_close() { - let tag = Tag { - name: "link".into(), - body: Some("href=\"style.css\" /".into()), - children: vec![], - }; - assert_eq!(tag.get_attribute("href"), Some("style.css")) - } - - #[test] fn tag_finds_boolean_in_centre() { let tag = Tag { name: "div".into(), - body: Some("id=\"divy\" contenteditable style=\"display: none;\" /".into()), + body: Some("id=\"divy\" contenteditable style=\"display: none;\"".into()), + self_closing: false, children: vec![], }; assert!(tag.get_attribute("contenteditable").is_some()); diff --git a/tests/nyble_pages.rs b/tests/nyble_pages.rs index 14ca1cc..2c99bc2 100644 --- a/tests/nyble_pages.rs +++ b/tests/nyble_pages.rs @@ -6,8 +6,30 @@ fn pares_homepage() { let html = Html::parse(&str); } +fn test_roundtrip(raw: &str) { + let html = Html::parse(raw); 
+ let string = html.to_string(); + for (ln, (raw, round)) in raw.lines().zip(string.lines()).enumerate() { + if raw != round { + panic!("line {ln} differs!\n\traw: {raw}\n\ttrip: {round}") + } + } +} + +#[test] +fn homepage_roundtrip() { + let str = std::fs::read_to_string("tests/nyble.html").unwrap(); + test_roundtrip(&str); +} + #[test] fn parses_grass() { let str = std::fs::read_to_string("tests/touching_grass.html").unwrap(); let html = Html::parse(&str); } + +#[test] +fn grass_roundtrip() { + let str = std::fs::read_to_string("tests/touching_grass.html").unwrap(); + test_roundtrip(&str); +} |