diff options
Diffstat (limited to 'src/lib.rs')
-rw-r--r-- | src/lib.rs | 361 |
1 files changed, 361 insertions, 0 deletions
/// A parsed HTML document: the top-level nodes in source order.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Html {
    /// Top-level nodes, in the order they appear in the input.
    pub nodes: Vec<Node>,
}

impl Html {
    /// Parses `raw` into a list of top-level [`Node`]s.
    ///
    /// Empty input produces a document with no nodes. A tag that is still
    /// open when the input ends is closed implicitly.
    ///
    /// # Panics
    ///
    /// Panics if a closing tag appears where none is expected: a stray
    /// closing tag at the top level, or one whose name does not match the
    /// innermost open tag.
    pub fn parse<S: AsRef<str>>(raw: S) -> Self {
        let mut nodes = vec![];
        let mut pending = Self::non_empty(raw.as_ref());

        while let Some(input) = pending {
            let Consumed { node, remaining } = Self::parse_node(input);
            nodes.push(node);
            pending = remaining;
        }

        Self { nodes }
    }

    /// Maps the empty string to `None` so that "nothing left to parse" has a
    /// single representation throughout the parser.
    fn non_empty(text: &str) -> Option<&str> {
        if text.is_empty() {
            None
        } else {
            Some(text)
        }
    }

    /// Parses one node from the start of `raw`: a comment, a tag (with its
    /// children), or a run of text.
    fn parse_node(raw: &str) -> Consumed<'_> {
        if Self::is_tag(raw).is_none() {
            return Self::parse_text(raw);
        }

        // Comments look like tags to `is_tag`, so they are tried first.
        Self::parse_comment(raw).unwrap_or_else(|| Self::parse_tag(raw))
    }

    /// Parses the tag at the start of `raw` together with all of its children.
    ///
    /// Callers must have checked that `raw` starts with a tag (see `is_tag`).
    ///
    /// # Panics
    ///
    /// Panics if the tag at the start of `raw` is a closing tag.
    fn parse_tag(raw: &str) -> Consumed<'_> {
        let (root_tag, mut rest) = Self::is_tag(raw)
            .expect("parse_tag must only be called on input that starts with a tag");

        if root_tag.closing {
            panic!(
                "found closing tag when not expected! {:?}\n{raw}",
                root_tag.name
            );
        }

        if root_tag.self_closing {
            return Consumed {
                node: Node::Tag {
                    self_closing: true,
                    name: root_tag.name.to_owned(),
                    children: vec![],
                },
                remaining: rest,
            };
        }

        let mut children = vec![];

        let remaining = loop {
            let input = match rest {
                Some(input) => input,
                // The input ended before the closing tag: close implicitly
                // instead of panicking on malformed documents.
                None => break None,
            };

            match Self::is_tag(input) {
                // `is_tag` never reports a tag as both closing and
                // self-closing, so checking `closing` is sufficient.
                Some((tag, after)) if tag.closing && tag.name == root_tag.name => break after,
                _ => {
                    let child = Self::parse_node(input);
                    children.push(child.node);
                    rest = child.remaining;
                }
            }
        };

        Consumed {
            node: Node::Tag {
                self_closing: false,
                name: root_tag.name.to_owned(),
                children,
            },
            remaining,
        }
    }

    /// Parses a `<!-- ... -->` comment at the start of `raw`.
    ///
    /// Returns `None` if `raw` does not start with `<!--` or the comment is
    /// never terminated by `-->`.
    fn parse_comment(raw: &str) -> Option<Consumed<'_>> {
        let after_start = raw.strip_prefix("<!--")?;
        let (body, rest) = after_start.split_once("-->")?;

        Some(Consumed {
            node: Node::Comment {
                body: body.to_owned(),
            },
            // A comment that ends the input must not leave an empty remainder
            // behind, or the caller would emit a spurious empty text node.
            remaining: Self::non_empty(rest),
        })
    }

    /// Checks whether the start of the string is a valid tag.
    ///
    /// A tag starts with `<` immediately followed by a non-whitespace
    /// character, contains a `>`, and has a non-empty name. On success this
    /// returns the parsed tag and whatever follows its `>` (`None` when
    /// nothing follows).
    fn is_tag(raw: &str) -> Option<(ParsedTag<'_>, Option<&str>)> {
        let after_open = raw.strip_prefix('<')?;
        if after_open.chars().next()?.is_ascii_whitespace() {
            return None;
        }

        // Not a tag if there is no close.
        let (innards, rest) = after_open.split_once('>')?;

        let closing = innards.starts_with('/');

        // A tag is self-closing when its last non-whitespace character is a
        // `/` that is not also its first character.
        let trimmed = innards.trim_end_matches(|c: char| c.is_ascii_whitespace());
        let before_slash = trimmed.strip_suffix('/').filter(|head| !head.is_empty());
        let self_closing = before_slash.is_some();

        // Can't both close and self-close.
        if closing && self_closing {
            return None;
        }

        // Strip the closing / self-closing slash from the tag contents.
        let name_raw = match (before_slash, innards.strip_prefix('/')) {
            (Some(head), _) => head,
            (None, Some(after_slash)) => after_slash,
            (None, None) => innards,
        };

        let (name, body) = name_raw
            .split_once(|c: char| c.is_ascii_whitespace())
            .unwrap_or((name_raw, ""));

        // `<>`, `</>` and `</ p>` have no name and are therefore not tags.
        if name.is_empty() {
            return None;
        }

        Some((
            ParsedTag {
                closing,
                self_closing,
                name,
                body,
            },
            Self::non_empty(rest),
        ))
    }

    /// Consumes text up to (but not including) the first valid tag.
    ///
    /// A `<` that does not start a valid tag is kept as part of the text. If
    /// no tag is found, the whole input is text and nothing remains.
    fn parse_text(raw: &str) -> Consumed<'_> {
        let tag_start = raw
            .match_indices('<')
            .map(|(idx, _)| idx)
            .find(|&idx| Self::is_tag(&raw[idx..]).is_some());

        match tag_start {
            // We've found a new tag, this text node is done.
            Some(idx) => Consumed {
                node: Node::Text(raw[..idx].to_owned()),
                remaining: Some(&raw[idx..]),
            },
            // We ran out of `<`s: the entire string was text.
            None => Consumed {
                node: Node::Text(raw.to_owned()),
                remaining: None,
            },
        }
    }
}

/// The result of parsing one node: the node itself plus the unparsed input
/// that follows it (`None` when the input is exhausted).
#[derive(Debug)]
struct Consumed<'a> {
    node: Node,
    remaining: Option<&'a str>,
}

/// A single tag (`<name ...>`, `</name>` or `<name ... />`) borrowed from the
/// input.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct ParsedTag<'a> {
    closing: bool,
    name: &'a str,
    // A tag's body is what follows the first whitespace after the name, up to
    // the end of the tag. The self-closing slash is not part of it.
    // Not read by the parser yet.
    #[allow(dead_code)]
    body: &'a str,
    self_closing: bool,
}

impl<'a> ParsedTag<'a> {
    /// Whether or not this tag closes or self-closes.
    // Not called by the parser yet.
    #[allow(dead_code)]
    pub fn closes(&self) -> bool {
        self.closing || self.self_closing
    }
}

/// A node in the parsed document tree.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Node {
    /// A run of text between tags.
    Text(String),
    /// An element with its child nodes.
    Tag {
        // for roundtripping
        self_closing: bool,
        name: String,
        children: Vec<Node>,
    },
    /// A `<!-- ... -->` comment; `body` is the text between the delimiters.
    Comment {
        body: String,
    },
}

/// Builds a [`Node::Text`] from anything `String::from` accepts.
///
/// The expansion uses `$crate`, so callers do not need `Node` in scope.
#[macro_export]
macro_rules! text {
    ($text:expr) => {
        $crate::Node::Text(::std::string::String::from($text))
    };
}

#[cfg(test)]
mod test {
    use crate::{Html, Node};

    #[test]
    fn parse_text_finds_start_of_tag() {
        let no_tag = "Hello, World!";
        let starts_tag = "<p>Hello, World!";
        let ends_tag = "Hello, World!</p>";

        let no_tag_res = Html::parse_text(no_tag);
        assert_eq!(no_tag_res.node, text!("Hello, World!"));
        assert!(no_tag_res.remaining.is_none());

        let starts_tag_res = Html::parse_text(starts_tag);
        assert_eq!(starts_tag_res.node, text!(""));
        assert_eq!(starts_tag_res.remaining, Some(starts_tag));

        let ends_tag_res = Html::parse_text(ends_tag);
        assert_eq!(ends_tag_res.node, text!("Hello, World!"));
        assert_eq!(ends_tag_res.remaining, Some("</p>"));
    }

    #[test]
    fn parse_text_correctly_ignores_nontags() {
        let sentence = "The condition 2 < 1 should be 1 > 2";
        let weird = "Hello, < p>";
        let no_close = "Hello <p my name is ";

        let sentence_res = Html::parse_text(sentence);
        assert_eq!(sentence_res.node, text!(sentence));
        assert!(sentence_res.remaining.is_none());

        let weird_res = Html::parse_text(weird);
        assert_eq!(weird_res.node, text!(weird));
        assert!(weird_res.remaining.is_none());

        let no_close_res = Html::parse_text(no_close);
        assert_eq!(no_close_res.node, text!(no_close));
        assert!(no_close_res.remaining.is_none());
    }

    #[test]
    fn parse_node_parses_tag() {
        let basic = "<p>Hello!</p>";

        let hh = Html::parse_node(basic);
        assert_eq!(
            hh.node,
            Node::Tag {
                self_closing: false,
                name: "p".into(),
                children: vec![text!("Hello!")]
            }
        )
    }

    #[test]
    fn parse_node_parses_nested_tags() {
        let nested = "<p><p>Hello!</p></p>";

        let hh = Html::parse_node(nested);
        assert_eq!(
            hh.node,
            Node::Tag {
                self_closing: false,
                name: "p".into(),
                children: vec![Node::Tag {
                    self_closing: false,
                    name: "p".into(),
                    children: vec![text!("Hello!")]
                }]
            }
        )
    }

    #[test]
    fn parse_multiple_toplevel() {
        let nested = "<p>Hello </p><p>World!</p>";

        let hh = Html::parse(nested);
        assert_eq!(
            hh.nodes,
            vec![
                Node::Tag {
                    self_closing: false,
                    name: "p".into(),
                    children: vec![text!("Hello ")]
                },
                Node::Tag {
                    self_closing: false,
                    name: "p".into(),
                    children: vec![text!("World!")]
                }
            ]
        )
    }

    #[test]
    fn parse_handles_empty_and_malformed_input() {
        assert!(Html::parse("").nodes.is_empty());
        assert_eq!(Html::parse("a <> b").nodes, vec![text!("a <> b")]);
        assert_eq!(
            Html::parse("<p>Hello").nodes,
            vec![Node::Tag {
                self_closing: false,
                name: "p".into(),
                children: vec![text!("Hello")]
            }]
        );
    }

    #[test]
    fn parse_trailing_comment_adds_no_empty_text() {
        assert_eq!(
            Html::parse("<!-- c -->").nodes,
            vec![Node::Comment { body: " c ".into() }]
        );
    }
}