From 0eefb6b385e78b142f691833c856c1cda01f20a0 Mon Sep 17 00:00:00 2001
From: gennyble
Date: Wed, 27 Dec 2023 05:34:40 -0600
Subject: parses
---
src/lib.rs | 361 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 361 insertions(+)
create mode 100644 src/lib.rs
(limited to 'src')
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 0000000..e0c8c5b
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,361 @@
+/// A parsed HTML document.
+pub struct Html {
+    /// The top-level nodes of the document, in the order they were parsed.
+    pub nodes: Vec<Node>,
+}
+
+impl Html {
+ /// Parses `raw` into a list of top-level [`Node`]s.
+ ///
+ /// Nodes are read off the front of the input one at a time until a parse
+ /// step reports that nothing remains.
+ ///
+ /// # Panics
+ ///
+ /// Panics on some malformed markup, for example a closing tag that has no
+ /// matching opening tag.
+ pub fn parse<S: AsRef<str>>(raw: S) -> Self {
+     let mut raw = raw.as_ref();
+     let mut nodes = vec![];
+
+     loop {
+         let Consumed { node, remaining } = Self::parse_node(raw);
+         nodes.push(node);
+
+         match remaining {
+             // nothing left to parse: the document is complete
+             None => break Self { nodes },
+             Some(rem) => raw = rem,
+         }
+     }
+ }
+
+ /// Parses exactly one node from the front of `raw`.
+ ///
+ /// Input that does not begin with a tag is read as text. Input that does is
+ /// tried as a comment first and only then as a regular tag.
+ fn parse_node(raw: &str) -> Consumed {
+     // Anything that does not open with a tag is plain text.
+     if Self::is_tag(raw).is_none() {
+         print!("Text ");
+         let text = Self::parse_text(raw);
+         println!("## {:?}", text.node);
+         return text;
+     }
+
+     // Comments take priority over ordinary tags.
+     if let Some(comment) = Self::parse_comment(raw) {
+         return comment;
+     }
+
+     print!("Node ");
+     Self::parse_tag(raw)
+ }
+
+ /// Parses the element that starts at the beginning of `raw`, including its
+ /// children and its closing tag.
+ ///
+ /// Children are parsed one node at a time until a closing tag with the same
+ /// name as the opening tag is found. If the input runs out before that
+ /// happens, the element is treated as closed at the end of the input and
+ /// keeps the children collected so far.
+ ///
+ /// # Panics
+ ///
+ /// Panics if `raw` does not start with a tag, or if the tag it starts with
+ /// is a closing tag.
+ fn parse_tag(raw: &str) -> Consumed {
+     let (root_tag, mut rest) =
+         Self::is_tag(raw).expect("parse_tag requires input that starts with a tag");
+
+     if root_tag.closing {
+         panic!(
+             "found closing tag when not expected! {:?}\n{raw}",
+             root_tag.name
+         )
+     } else if root_tag.self_closing {
+         println!("self close return early");
+         return Consumed {
+             node: Node::Tag {
+                 self_closing: true,
+                 name: root_tag.name.into(),
+                 children: vec![],
+             },
+             remaining: rest,
+         };
+     }
+
+     let mut children = vec![];
+
+     loop {
+         let unparsed = match rest {
+             Some(unparsed) => unparsed,
+             // The input ended before the closing tag showed up. Close the
+             // element here instead of panicking on the missing remainder.
+             None => {
+                 break Consumed {
+                     node: Node::Tag {
+                         self_closing: false,
+                         name: root_tag.name.to_owned(),
+                         children,
+                     },
+                     remaining: None,
+                 };
+             }
+         };
+
+         match Self::is_tag(unparsed) {
+             // the closing tag that matches the one we opened with
+             Some((
+                 ParsedTag {
+                     closing: true,
+                     name,
+                     self_closing: false,
+                     ..
+                 },
+                 remaining,
+             )) if name == root_tag.name => {
+                 println!("ret closed - {name}");
+                 break Consumed {
+                     node: Node::Tag {
+                         self_closing: false,
+                         name: root_tag.name.to_owned(),
+                         children,
+                     },
+                     remaining,
+                 };
+             }
+             // anything else is a child of this element
+             _ => {
+                 println!("recur. ends on {}", root_tag.name,);
+                 let cons = Self::parse_node(unparsed);
+                 rest = cons.remaining;
+                 children.push(cons.node);
+             }
+         }
+     }
+ }
+
+ /// Parses an HTML comment from the front of `raw`.
+ ///
+ /// Returns `None` when `raw` does not start with `<!--` or when no closing
+ /// `-->` follows. The comment body is everything between the two markers,
+ /// surrounding whitespace included.
+ fn parse_comment(raw: &str) -> Option<Consumed> {
+     let after_start = raw.strip_prefix("<!--")?;
+     let end = after_start.find("-->")?;
+
+     // Skip the three bytes of "-->". An empty remainder is reported as
+     // `None`, the same way `is_tag` reports it, so that callers do not go on
+     // to parse an empty string.
+     let rest = after_start.get(end + 3..).filter(|rest| !rest.is_empty());
+
+     Some(Consumed {
+         node: Node::Comment {
+             body: after_start[..end].into(),
+         },
+         remaining: rest,
+     })
+ }
+
+ /// Checks whether `raw` starts with a tag and, if so, splits it apart.
+ ///
+ /// A tag is a `<` immediately followed by a non-whitespace character, with
+ /// a `>` somewhere after it. On success returns the parsed tag together with
+ /// the input that follows the `>`, or `None` in that position when nothing
+ /// follows.
+ ///
+ /// Returns `None` when `raw` does not start with a tag, when the tag is
+ /// empty (`<>`), or when the tag is both closing and self-closing.
+ fn is_tag(raw: &str) -> Option<(ParsedTag, Option<&str>)> {
+     // Starts '<' and any non-whitespace character
+     let first = raw.strip_prefix('<')?.chars().next()?;
+     if first.is_ascii_whitespace() {
+         return None;
+     }
+
+     // not a tag if there's no close
+     let idx = raw.find('>')?;
+
+     let tag_innards = &raw[1..idx];
+     if tag_innards.is_empty() {
+         // "<>" has neither a name nor a slash, so it is not a tag
+         return None;
+     }
+
+     let rest = raw.get(idx + 1..).filter(|rest| !rest.is_empty());
+
+     let close = tag_innards.starts_with('/');
+
+     // A '/' in the very first position marks a closing tag, so only look for
+     // the self-closing slash after the first character. The first character
+     // is skipped by its UTF-8 width to keep the slice on a char boundary.
+     let first_len = first.len_utf8();
+     let possible = &tag_innards[first_len..];
+     let trimmed = possible.trim_end_matches(|c: char| c.is_ascii_whitespace());
+     let self_close_idx = if trimmed.ends_with('/') {
+         // byte offset of that slash within tag_innards
+         Some(first_len + trimmed.len() - 1)
+     } else {
+         None
+     };
+
+     // can't close and self_close
+     if close && self_close_idx.is_some() {
+         return None;
+     }
+
+     // clean the close from the raw string
+     let name_raw = match self_close_idx {
+         Some(slash_idx) => &tag_innards[..slash_idx],
+         None if close => &tag_innards[1..],
+         None => tag_innards,
+     };
+
+     // the name runs up to the first whitespace; the body is whatever follows it
+     let (name, body) = name_raw
+         .split_once(|c: char| c.is_ascii_whitespace())
+         .unwrap_or((name_raw, ""));
+
+     Some((
+         ParsedTag {
+             closing: close,
+             self_closing: self_close_idx.is_some(),
+             name,
+             body,
+         },
+         rest,
+     ))
+ }
+
+ /// Reads text from the front of `raw` up to the first `<` that begins a tag.
+ ///
+ /// A `<` that does not begin a tag is kept as part of the text. When no tag
+ /// is found the whole input becomes the text node and nothing remains.
+ fn parse_text(raw: &str) -> Consumed {
+     // Byte offset of the first '<' that `is_tag` accepts, if there is one.
+     let tag_start = raw
+         .match_indices('<')
+         .map(|(offset, _)| offset)
+         .find(|&offset| Self::is_tag(&raw[offset..]).is_some());
+
+     match tag_start {
+         // we've found a new tag, the text node stops right in front of it
+         Some(split) => Consumed {
+             node: Node::Text(raw[..split].to_owned()),
+             remaining: Some(&raw[split..]),
+         },
+         // no tag anywhere, the entire string was text
+         None => Consumed {
+             node: Node::Text(raw.to_owned()),
+             remaining: None,
+         },
+     }
+ }
+}
+
+/// The outcome of parsing one node off the front of an input string.
+struct Consumed<'a> {
+ /// The node that was parsed.
+ node: Node,
+ /// The input that follows the parsed node. `None` ends the loop in
+ /// `Html::parse`.
+ remaining: Option<&'a str>,
+}
+
+/// The pieces of a single tag as split apart by `Html::is_tag`. The string
+/// fields borrow from the input being parsed.
+struct ParsedTag<'a> {
+ // true when the tag starts with '/', as in `</p>`
+ closing: bool,
+ // the tag's name, without a leading '/', up to the first ASCII whitespace
+ name: &'a str,
+ // a tag's body is what follows the single whitespace character after the
+ // name, up to the end of the tag. The '/' of a self-closing tag is stripped
+ // before the split and is not part of the body; whitespace in front of
+ // that '/' is kept.
+ body: &'a str,
+ // true when the tag ends in '/', ignoring trailing whitespace, as in
+ // `<br />` (a '/' in the very first position counts as closing instead)
+ self_closing: bool,
+}
+
+impl<'a> ParsedTag<'a> {
+    /// Returns `true` for a closing tag as well as for a self-closing tag.
+    pub fn closes(&self) -> bool {
+        matches!((self.closing, self.self_closing), (true, _) | (_, true))
+    }
+}
+
+/// One node of a parsed HTML document.
+#[derive(Debug, PartialEq)]
+pub enum Node {
+    /// A run of text.
+    Text(String),
+    /// An element and the nodes nested inside it.
+    Tag {
+        // for roundtripping
+        self_closing: bool,
+        /// The tag's name.
+        name: String,
+        /// The nodes between the opening and the closing tag.
+        children: Vec<Node>,
+    },
+    /// A comment.
+    Comment {
+        /// The text between the comment's opening and closing markers.
+        body: String,
+    },
+}
+
+/// Builds a [`Node::Text`] from anything `String::from` accepts.
+///
+/// The expansion uses `$crate` and an absolute `::std` path so the macro
+/// works from other crates and from modules that have not imported `Node`.
+#[macro_export]
+macro_rules! text {
+    ($text:expr) => {
+        $crate::Node::Text(::std::string::String::from($text))
+    };
+}
+
+#[cfg(test)]
+mod test {
+ use crate::{Html, Node};
+
+ // Shorthand for building the `Node::Text` values the assertions compare against.
+ macro_rules! text {
+ ($text:expr) => {
+ Node::Text(String::from($text))
+ };
+ }
+
+ #[test]
+ fn parse_text_finds_start_of_tag() {
+     // The exact tag names do not matter to this test; it only needs one
+     // input with no tag, one that starts with a tag and one that ends with a tag.
+     let no_tag = "Hello, World!";
+     let starts_tag = "<p>Hello, World!";
+     let ends_tag = "Hello, World!<br>";
+
+     // no tag at all: everything is text and nothing remains
+     let no_tag_res = Html::parse_text(no_tag);
+     assert_eq!(no_tag_res.node, text!("Hello, World!"));
+     assert!(no_tag_res.remaining.is_none());
+
+     // tag right at the start: the text is empty and the whole input remains
+     let starts_tag_res = Html::parse_text(starts_tag);
+     assert_eq!(starts_tag_res.node, text!(""));
+     assert_eq!(starts_tag_res.remaining, Some(starts_tag));
+
+     // tag at the end: the text stops in front of it and the tag remains
+     let ends_tag_res = Html::parse_text(ends_tag);
+     assert_eq!(ends_tag_res.node, text!("Hello, World!"));
+     assert_eq!(ends_tag_res.remaining, Some("<br>"));
+ }
+
+ #[test]
+ fn parse_text_correctly_ignores_nontags() {
+ let sentence = "The condition 2 < 1 should be 1 > 2";
+ let weird = "Hello, < p>";
+ let no_close = "Hello