/// A parsed HTML-like document: the sequence of top-level nodes found in the input.
#[derive(Debug, PartialEq)]
pub struct Html {
    pub nodes: Vec<Node>,
}

impl Html {
    /// Parses `raw` into a tree of [`Node`]s.
    ///
    /// Parsing is lenient about truncated input: a tag that is still open when
    /// the input ends is closed implicitly. Input that contains no nodes at all
    /// (the empty string) yields a single empty [`Node::Text`].
    ///
    /// # Panics
    ///
    /// Panics when a closing tag is found that does not close the innermost
    /// open tag (for example `</p>` at the top level, or `<a></b>`).
    pub fn parse<S: AsRef<str>>(raw: S) -> Self {
        let mut nodes = Vec::new();
        let mut remaining = Some(raw.as_ref());

        while let Some(input) = remaining {
            let consumed = Self::parse_node(input);
            nodes.push(consumed.node);
            remaining = consumed.remaining;
        }

        Self { nodes }
    }

    /// Parses the single node at the start of `raw`: a comment, a tag (with
    /// its children), or a run of text.
    fn parse_node(raw: &str) -> Consumed<'_> {
        if Self::is_tag(raw).is_none() {
            return Self::parse_text(raw);
        }
        // A comment also looks like a tag to `is_tag`, so it is tried first.
        Self::parse_comment(raw).unwrap_or_else(|| Self::parse_tag(raw))
    }

    /// Parses the tag at the start of `raw` together with everything up to its
    /// matching closing tag.
    ///
    /// If the input ends before the closing tag is found, the tag is closed
    /// implicitly and keeps the children collected so far.
    ///
    /// # Panics
    ///
    /// Panics if `raw` does not start with a tag, or if it starts with a
    /// closing tag.
    fn parse_tag(raw: &str) -> Consumed<'_> {
        let (root_tag, mut rest) =
            Self::is_tag(raw).expect("parse_tag must be called on input that starts with a tag");

        if root_tag.closing {
            panic!(
                "found closing tag when not expected! {:?}\n{}",
                root_tag.name, raw
            );
        }

        let mut children = Vec::new();
        if !root_tag.self_closing {
            // Running out of input ends the loop as well: the tag is then
            // treated as closed at end of input instead of panicking.
            while let Some(input) = rest {
                match Self::is_tag(input) {
                    Some((
                        ParsedTag {
                            closing: true,
                            self_closing: false,
                            name,
                            ..
                        },
                        after_close,
                    )) if name == root_tag.name => {
                        rest = after_close;
                        break;
                    }
                    _ => {
                        let child = Self::parse_node(input);
                        rest = child.remaining;
                        children.push(child.node);
                    }
                }
            }
        }

        Consumed {
            node: Node::Tag {
                self_closing: root_tag.self_closing,
                name: root_tag.name.to_owned(),
                children,
            },
            remaining: rest,
        }
    }

    /// Parses a `<!-- ... -->` comment at the start of `raw`.
    ///
    /// Returns `None` when `raw` does not start with the comment opener or the
    /// comment is never terminated.
    fn parse_comment(raw: &str) -> Option<Consumed<'_>> {
        const OPEN: &str = "<!--";
        const CLOSE: &str = "-->";

        let after_start = raw.strip_prefix(OPEN)?;
        let end = after_start.find(CLOSE)?;
        // An empty remainder is reported as `None`, exactly like `is_tag`
        // does, so a trailing comment is not followed by an empty text node.
        let rest = after_start
            .get(end + CLOSE.len()..)
            .filter(|rest| !rest.is_empty());

        Some(Consumed {
            node: Node::Comment {
                body: after_start[..end].into(),
            },
            remaining: rest,
        })
    }

    /// Checks whether the start of the string is a valid tag.
    ///
    /// A tag starts with `<` immediately followed by a non-whitespace
    /// character and runs up to the first `>` (even one inside a quoted
    /// attribute value). On success, returns the parsed tag and whatever
    /// follows it (`None` if nothing follows).
    ///
    /// Returns `None` for an empty tag (`<>`) and for a tag that is both
    /// closing and self-closing (`</p/>`).
    fn is_tag(raw: &str) -> Option<(ParsedTag<'_>, Option<&str>)> {
        let after_open = raw.strip_prefix('<')?;
        let starts_right = after_open
            .chars()
            .next()
            .map_or(false, |c| !c.is_ascii_whitespace());
        if !starts_right {
            return None;
        }

        // Not a tag if there's no close.
        let end = after_open.find('>')?;
        let tag_innards = &after_open[..end];
        if tag_innards.is_empty() {
            return None;
        }
        let rest = after_open.get(end + 1..).filter(|rest| !rest.is_empty());

        let closing_inner = tag_innards.strip_prefix('/');

        // Self-closing means the last non-whitespace character is a '/' that
        // is not also the very first character (that one marks a closing
        // tag). Working on char boundaries keeps non-ASCII names from
        // panicking.
        let self_close_idx = tag_innards
            .char_indices()
            .rev()
            .find(|(_, c)| !c.is_ascii_whitespace())
            .filter(|&(idx, c)| c == '/' && idx > 0)
            .map(|(idx, _)| idx);

        // Strip the closing / self-closing slash from the raw name.
        let name_raw = match (self_close_idx, closing_inner) {
            // can't close and self-close
            (Some(_), Some(_)) => return None,
            (Some(idx), None) => &tag_innards[..idx],
            (None, Some(inner)) => inner,
            (None, None) => tag_innards,
        };

        let (name, body) = match name_raw.find(|c: char| c.is_ascii_whitespace()) {
            // ASCII whitespace is one byte wide, so `idx + 1` is a boundary.
            Some(idx) => (&name_raw[..idx], &name_raw[idx + 1..]),
            None => (name_raw, ""),
        };

        Some((
            ParsedTag {
                closing: closing_inner.is_some(),
                self_closing: self_close_idx.is_some(),
                name,
                body,
            },
            rest,
        ))
    }

    /// Consumes text up to (not including) the first `<` that starts a tag.
    ///
    /// A `<` that does not start a tag (as in `2 < 1`) is part of the text.
    /// If no tag follows, the whole input is text and nothing remains.
    fn parse_text(raw: &str) -> Consumed<'_> {
        let text_end = raw
            .match_indices('<')
            .map(|(idx, _)| idx)
            .find(|&idx| Self::is_tag(&raw[idx..]).is_some());

        match text_end {
            Some(end) => Consumed {
                node: Node::Text(raw[..end].to_owned()),
                remaining: Some(&raw[end..]),
            },
            None => Consumed {
                node: Node::Text(raw.to_owned()),
                remaining: None,
            },
        }
    }
}

/// The result of parsing one node: the node itself and the unparsed input
/// that follows it (`None` once the input is exhausted).
struct Consumed<'a> {
    node: Node,
    remaining: Option<&'a str>,
}

/// The pieces of a single `<...>` tag, borrowed from the input.
struct ParsedTag<'a> {
    closing: bool,
    name: &'a str,
    // a tag's body is what exists between the whitespace after the name and
    // the end of the tag, excluding the slash of a self-close (whitespace in
    // front of that slash is kept)
    body: &'a str,
    self_closing: bool,
}

impl<'a> ParsedTag<'a> {
    /// Whether or not this tag closes or self-closes
    pub fn closes(&self) -> bool {
        self.closing || self.self_closing
    }
}

/// A node of the parsed document tree.
#[derive(Debug, PartialEq)]
pub enum Node {
    /// A run of plain text.
    Text(String),
    /// An element and the nodes nested inside it.
    Tag {
        // for roundtripping
        self_closing: bool,
        name: String,
        children: Vec<Node>,
    },
    /// A comment; `body` is the text between the comment delimiters.
    Comment {
        body: String,
    },
}

/// Builds a [`Node::Text`] from anything `String::from` accepts.
#[macro_export]
macro_rules! text {
    ($text:expr) => {
        $crate::Node::Text(String::from($text))
    };
}

Hello, World!"; let ends_tag = "Hello, World!

"; let no_tag_res = Html::parse_text(no_tag); assert_eq!(no_tag_res.node, text!("Hello, World!")); assert!(no_tag_res.remaining.is_none()); let starts_tag_res = Html::parse_text(starts_tag); assert_eq!(starts_tag_res.node, text!("")); assert_eq!(starts_tag_res.remaining, Some(starts_tag)); let ends_tag_res = Html::parse_text(ends_tag); assert_eq!(ends_tag_res.node, text!("Hello, World!")); assert_eq!(ends_tag_res.remaining, Some("

")); } #[test] fn parse_text_correctly_ignores_nontags() { let sentence = "The condition 2 < 1 should be 1 > 2"; let weird = "Hello, < p>"; let no_close = "Hello