pub struct Html {
pub nodes: Vec,
}
impl Html {
pub fn parse>(raw: S) -> Self {
let mut raw = raw.as_ref();
let mut nodes = vec![];
loop {
let Consumed { node, remaining } = Self::parse_node(raw);
nodes.push(node);
match remaining {
None => break Self { nodes },
Some(rem) => raw = rem,
}
}
}
fn parse_node(raw: &str) -> Consumed {
match Self::is_tag(raw) {
Some(_) => match Self::parse_comment(raw) {
None => {
print!("Node ");
Self::parse_tag(raw)
}
Some(cmt) => cmt,
},
None => {
print!("Text ");
let cons = Self::parse_text(raw);
println!("## {:?}", cons.node);
cons
}
}
}
fn parse_tag(raw: &str) -> Consumed {
let (root_tag, mut rest) = Self::is_tag(raw).unwrap();
//println!("- {raw}");
if root_tag.closing {
panic!(
"found closing tag when not expected! {:?}\n{raw}",
root_tag.name
)
} else if root_tag.self_closing {
println!("self close return early");
return Consumed {
node: Node::Tag {
self_closing: true,
name: root_tag.name.into(),
children: vec![],
},
remaining: rest,
};
}
let mut children = vec![];
loop {
match Self::is_tag(rest.unwrap()) {
Some((
ParsedTag {
closing: true,
name,
self_closing: false,
..
},
remaining,
)) if name == root_tag.name => {
println!("ret closed - {name}");
break Consumed {
node: Node::Tag {
self_closing: false,
name: root_tag.name.to_owned(),
children,
},
remaining,
};
}
_ => {
println!("recur. ends on {}", root_tag.name,);
let cons = Self::parse_node(rest.unwrap());
rest = cons.remaining;
children.push(cons.node);
}
}
}
}
fn parse_comment(raw: &str) -> Option {
if raw.starts_with("") {
None => None,
Some(end) => {
let comment = &after_start[..end];
let rest = after_start.get(end + 3..);
Some(Consumed {
node: Node::Comment {
body: comment.into(),
},
remaining: rest,
})
}
}
} else {
None
}
}
/// check if the start of the string is a valid tag
#[rustfmt::skip]
fn is_tag(raw: &str) -> Option<(ParsedTag, Option<&str>)> {
// Starts '<' and any non-whitespace character
let starts_right = raw.starts_with('<')
&& raw.chars().nth(1).map(|c| !c.is_ascii_whitespace()).unwrap_or(false);
if !starts_right {
return None;
}
match raw.find('>') {
// not a tag if there's no close
None => None,
Some(idx) => {
let rest = match raw.get(idx+1..) {
None => None,
Some("") => None,
Some(txt) => Some(txt)
};
let tag_innards = &raw[1..idx];
let close = tag_innards.starts_with('/');
let self_close_idx = {
let possible = &tag_innards[1..];
let last_whole_char = possible.rfind(|c: char| !c.is_ascii_whitespace()).map(|n| n + 1);
let last_slash = possible.rfind('/').map(|n| n + 1);
match (last_slash, last_whole_char) {
(Some(slash), Some(whole)) if slash == whole => {
Some(slash)
},
_ => None
}
};
let self_close = self_close_idx.is_some();
// can't close and self_close
if close && self_close { return None; }
// clean the close from the raw string
let name_raw = if let Some(close_idx) = self_close_idx {
&tag_innards[..close_idx]
} else if close {
&tag_innards[1..]
} else {
tag_innards
};
let (name, body) = match name_raw.find(|c: char| c.is_ascii_whitespace()){
None => {
(name_raw, "")
},
Some(idx) => {
(&name_raw[..idx], &name_raw[idx+1..])
}
};
Some((ParsedTag{
closing: close,
self_closing: self_close,
name,
body
}, rest))
}
}
}
fn parse_text(raw: &str) -> Consumed {
let mut end_idx = 0;
let mut search_from = raw;
loop {
match search_from.find('<') {
// if we ever run out of <'s, the entire string was text
None => {
break Consumed {
node: Node::Text(raw.to_owned()),
remaining: None,
}
}
Some(idx) => {
end_idx += idx;
if Self::is_tag(&search_from[idx..]).is_some() {
// we've found a new tag, this text node is done
break Consumed {
node: Node::Text(raw[..end_idx].to_owned()),
remaining: Some(&raw[end_idx..]),
};
} else {
// step over the <
end_idx += 1;
search_from = &raw[end_idx..];
}
}
}
}
}
}
/// Result of parsing one node off the front of an input string.
struct Consumed<'a> {
// The node that was parsed.
node: Node,
// Input left after the parsed node; `None` tells `Html::parse` to stop.
remaining: Option<&'a str>,
}
/// A single tag (`<name ...>`, `</name>` or `<name ... />`) borrowed from the
/// input it was found in.
struct ParsedTag<'a> {
    /// `true` for a closing tag such as `</p>`.
    closing: bool,
    /// `true` for a self-closing tag such as `<br/>`.
    self_closing: bool,
    /// The tag name: everything up to the first whitespace inside the tag.
    name: &'a str,
    /// Whatever follows the first whitespace after the name, up to the end of
    /// the tag. Further whitespace is kept; `is_tag` strips the self-closing
    /// `/` before splitting, so the slash itself is not part of the body.
    body: &'a str,
}

impl ParsedTag<'_> {
    /// Returns `true` if this tag either closes or self-closes.
    pub fn closes(&self) -> bool {
        !matches!((self.closing, self.self_closing), (false, false))
    }
}
/// One node of a parsed HTML document.
#[derive(Debug, PartialEq)]
pub enum Node {
    /// A run of plain text.
    Text(String),
    /// An element together with the nodes nested inside it.
    Tag {
        /// Whether the source used the self-closing form (`<br/>`); kept for
        /// roundtripping.
        self_closing: bool,
        /// The tag name.
        name: String,
        /// Child nodes in source order; empty for self-closing tags.
        children: Vec<Node>,
    },
    /// A `<!-- ... -->` comment.
    Comment {
        /// The text between `<!--` and `-->`.
        body: String,
    },
}
/// Builds a `Node::Text` from anything `String::from` accepts.
///
/// `Node` is named through `$crate` so the macro expands correctly even when
/// the caller has not imported `Node` itself.
#[macro_export]
macro_rules! text {
    ($text:expr) => {
        $crate::Node::Text(String::from($text))
    };
}
#[cfg(test)]
mod test {
use crate::{Html, Node};
macro_rules! text {
($text:expr) => {
Node::Text(String::from($text))
};
}
#[test]
fn parse_text_finds_start_of_tag() {
let no_tag = "Hello, World!";
let starts_tag = "Hello, World!";
let ends_tag = "Hello, World!
";
let no_tag_res = Html::parse_text(no_tag);
assert_eq!(no_tag_res.node, text!("Hello, World!"));
assert!(no_tag_res.remaining.is_none());
let starts_tag_res = Html::parse_text(starts_tag);
assert_eq!(starts_tag_res.node, text!(""));
assert_eq!(starts_tag_res.remaining, Some(starts_tag));
let ends_tag_res = Html::parse_text(ends_tag);
assert_eq!(ends_tag_res.node, text!("Hello, World!"));
assert_eq!(ends_tag_res.remaining, Some("
"));
}
#[test]
fn parse_text_correctly_ignores_nontags() {
let sentence = "The condition 2 < 1 should be 1 > 2";
let weird = "Hello, < p>";
let no_close = "Hello