about summary refs log tree commit diff
path: root/src/lib.rs
diff options
context:
space:
mode:
author gennyble <gen@nyble.dev> 2023-12-27 05:34:40 -0600
committer gennyble <gen@nyble.dev> 2023-12-27 05:34:40 -0600
commit 0eefb6b385e78b142f691833c856c1cda01f20a0 (patch)
tree 672411b8c5bdb54cab45141c7b927edb06688b7b /src/lib.rs
download cutie-0eefb6b385e78b142f691833c856c1cda01f20a0.tar.gz
cutie-0eefb6b385e78b142f691833c856c1cda01f20a0.zip
parses
Diffstat (limited to 'src/lib.rs')
-rw-r--r-- src/lib.rs 361
1 files changed, 361 insertions, 0 deletions
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 0000000..e0c8c5b
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,361 @@
+/// A parsed HTML document.
+///
+/// Built with `Html::parse`; `nodes` holds the top-level nodes in the order
+/// they were read from the input.
+#[derive(Debug, PartialEq)]
+pub struct Html {
+	/// Top-level nodes, in source order.
+	pub nodes: Vec<Node>,
+}
+
+impl Html {
+	/// Parses `raw` into the list of top-level nodes it contains.
+	///
+	/// Anything that is not recognised as a tag or a comment is kept as
+	/// text. An element that is still open when the input ends is closed
+	/// implicitly.
+	///
+	/// # Panics
+	///
+	/// Panics when a closing tag shows up where none is expected, such as a
+	/// stray `</p>` or a close that does not match the innermost open tag.
+	pub fn parse<S: AsRef<str>>(raw: S) -> Self {
+		let mut raw = raw.as_ref();
+		let mut nodes = vec![];
+
+		loop {
+			let Consumed { node, remaining } = Self::parse_node(raw);
+			nodes.push(node);
+
+			match remaining {
+				None => break Self { nodes },
+				Some(rem) => raw = rem,
+			}
+		}
+	}
+
+	/// Parses exactly one node (comment, tag or text) from the front of `raw`.
+	fn parse_node(raw: &str) -> Consumed {
+		if Self::is_tag(raw).is_none() {
+			return Self::parse_text(raw);
+		}
+
+		// comments look like tags to `is_tag`, so they are tried first
+		Self::parse_comment(raw).unwrap_or_else(|| Self::parse_tag(raw))
+	}
+
+	/// Parses the element at the front of `raw`, children included.
+	///
+	/// Children are read until the matching closing tag. When the input runs
+	/// out first, the element is returned with the children found so far.
+	///
+	/// # Panics
+	///
+	/// Panics if `raw` does not start with a tag, or starts with a closing
+	/// tag.
+	fn parse_tag(raw: &str) -> Consumed {
+		let (root_tag, mut rest) =
+			Self::is_tag(raw).expect("parse_tag must be given input that starts with a tag");
+
+		if root_tag.closing {
+			panic!(
+				"found closing tag when not expected! {:?}\n{raw}",
+				root_tag.name
+			)
+		} else if root_tag.self_closing {
+			return Consumed {
+				node: Node::Tag {
+					self_closing: true,
+					name: root_tag.name.into(),
+					children: vec![],
+				},
+				remaining: rest,
+			};
+		}
+
+		let mut children = vec![];
+
+		// `rest` turns into `None` once the input is used up
+		while let Some(current) = rest {
+			match Self::is_tag(current) {
+				Some((
+					ParsedTag {
+						closing: true,
+						name,
+						self_closing: false,
+						..
+					},
+					remaining,
+				)) if name == root_tag.name => {
+					return Consumed {
+						node: Node::Tag {
+							self_closing: false,
+							name: root_tag.name.to_owned(),
+							children,
+						},
+						remaining,
+					};
+				}
+				_ => {
+					let cons = Self::parse_node(current);
+					rest = cons.remaining;
+					children.push(cons.node);
+				}
+			}
+		}
+
+		// the input ended before the closing tag: close the element here
+		// instead of panicking on the missing remainder
+		Consumed {
+			node: Node::Tag {
+				self_closing: false,
+				name: root_tag.name.to_owned(),
+				children,
+			},
+			remaining: None,
+		}
+	}
+
+	/// Parses a `<!-- ... -->` comment from the front of `raw`.
+	///
+	/// Returns `None` when `raw` does not start with `<!--` or the comment is
+	/// never terminated by `-->`.
+	fn parse_comment(raw: &str) -> Option<Consumed> {
+		let after_start = raw.strip_prefix("<!--")?;
+		let end = after_start.find("-->")?;
+		let rest = &after_start[end + 3..];
+
+		Some(Consumed {
+			node: Node::Comment {
+				body: after_start[..end].into(),
+			},
+			// same convention as `is_tag`: an empty remainder is `None`, so
+			// callers never go on to parse an empty string
+			remaining: if rest.is_empty() { None } else { Some(rest) },
+		})
+	}
+
+	/// check if the start of the string is a valid tag
+	///
+	/// A tag is a `<` immediately followed by a non-whitespace character,
+	/// running up to the first `>`, and it must have a non-empty name.
+	/// Returns the parsed tag together with the input after the `>` (`None`
+	/// when nothing follows).
+	fn is_tag(raw: &str) -> Option<(ParsedTag, Option<&str>)> {
+		// Starts '<' and any non-whitespace character
+		let after_open = raw.strip_prefix('<')?;
+		let first = after_open.chars().next()?;
+		if first.is_ascii_whitespace() {
+			return None;
+		}
+
+		// not a tag if there's no close
+		let end = after_open.find('>')?;
+		let tag_innards = &after_open[..end];
+		let rest = &after_open[end + 1..];
+		let rest = if rest.is_empty() { None } else { Some(rest) };
+
+		let closing = tag_innards.starts_with('/');
+
+		// A `/` as the last non-whitespace character marks a self-closing
+		// tag, unless it is the very first character (that one belongs to a
+		// closing tag). Going through `strip_suffix` keeps every slice on a
+		// char boundary, even for `<>` or a multi-byte tag name.
+		let self_close_idx = tag_innards
+			.trim_end_matches(|c: char| c.is_ascii_whitespace())
+			.strip_suffix('/')
+			.filter(|before| !before.is_empty())
+			.map(str::len);
+		let self_closing = self_close_idx.is_some();
+
+		// can't close and self_close
+		if closing && self_closing {
+			return None;
+		}
+
+		// clean the close from the raw string
+		let name_raw = match self_close_idx {
+			Some(slash) => &tag_innards[..slash],
+			None if closing => &tag_innards[1..],
+			None => tag_innards,
+		};
+
+		let (name, body) = name_raw
+			.split_once(|c: char| c.is_ascii_whitespace())
+			.unwrap_or((name_raw, ""));
+
+		// `<>`, `</>` and the like carry no name; leave them to be read as text
+		if name.is_empty() {
+			return None;
+		}
+
+		Some((
+			ParsedTag {
+				closing,
+				self_closing,
+				name,
+				body,
+			},
+			rest,
+		))
+	}
+
+	/// Reads text from the front of `raw` up to the next thing `is_tag`
+	/// accepts.
+	///
+	/// A `<` that does not start a tag is part of the text. The returned text
+	/// is empty when `raw` itself starts with a tag.
+	fn parse_text(raw: &str) -> Consumed {
+		let mut searched = 0;
+
+		while let Some(offset) = raw[searched..].find('<') {
+			let candidate = searched + offset;
+
+			if Self::is_tag(&raw[candidate..]).is_some() {
+				// we've found a new tag, this text node is done
+				return Consumed {
+					node: Node::Text(raw[..candidate].to_owned()),
+					remaining: Some(&raw[candidate..]),
+				};
+			}
+
+			// step over the <
+			searched = candidate + 1;
+		}
+
+		// if we ever run out of <'s, the entire string was text
+		Consumed {
+			node: Node::Text(raw.to_owned()),
+			remaining: None,
+		}
+	}
+}
+
+/// What a parse step hands back: the node it produced plus the input it did
+/// not consume.
+struct Consumed<'a> {
+	// the node that was parsed
+	node: Node,
+	// input following the node; `Html::parse` stops once this is `None`
+	remaining: Option<&'a str>,
+}
+
+/// The pieces of a single tag as split apart by `Html::is_tag`. All string
+/// fields borrow from the input being parsed.
+struct ParsedTag<'a> {
+	// the tag started with `</`
+	closing: bool,
+	// everything up to the first whitespace (or the whole tag if none)
+	name: &'a str,
+	// a tag's body is what exists between the whitespace after the name and
+	// the end of the tag. `is_tag` strips a trailing self-close `/` before
+	// splitting, so the slash is NOT part of the body; whitespace in front of
+	// it is kept
+	body: &'a str,
+	// the last non-whitespace character of the tag was `/`
+	self_closing: bool,
+}
+
+impl<'a> ParsedTag<'a> {
+	/// Whether or not this tag closes or self-closes
+	///
+	/// `is_tag` rejects input that is both closing and self-closing, so at
+	/// most one of the two flags is set on tags it returns.
+	pub fn closes(&self) -> bool {
+		self.closing || self.self_closing
+	}
+}
+
+/// One node of a parsed document.
+#[derive(Debug, PartialEq)]
+pub enum Node {
+	/// A run of text.
+	Text(String),
+	/// An element together with the nodes nested inside it.
+	Tag {
+		// for roundtripping: whether the tag was written self-closed
+		self_closing: bool,
+		// the tag name only; anything after the first whitespace is dropped
+		name: String,
+		// nested nodes; always empty for a self-closing tag
+		children: Vec<Node>,
+	},
+	/// A `<!-- ... -->` comment; `body` is the text between the delimiters.
+	Comment {
+		body: String,
+	},
+}
+
+/// Builds a [`Node::Text`] from anything `String::from` accepts.
+///
+/// The expansion names the type through `$crate`, so the macro also works
+/// in scopes (and other crates) that have not imported `Node`.
+#[macro_export]
+macro_rules! text {
+	($text:expr) => {
+		$crate::Node::Text(String::from($text))
+	};
+}
+
+// Unit tests for the text and tag parsing helpers and for `Html::parse`.
+#[cfg(test)]
+mod test {
+	use crate::{Html, Node};
+
+	// local shorthand for building a `Node::Text` in assertions
+	macro_rules! text {
+		($text:expr) => {
+			Node::Text(String::from($text))
+		};
+	}
+
+	// `parse_text` must stop right before the first real tag and hand the tag
+	// back as the remainder
+	#[test]
+	fn parse_text_finds_start_of_tag() {
+		let no_tag = "Hello, World!";
+		let starts_tag = "<p>Hello, World!";
+		let ends_tag = "Hello, World!</p>";
+
+		let no_tag_res = Html::parse_text(no_tag);
+		assert_eq!(no_tag_res.node, text!("Hello, World!"));
+		assert!(no_tag_res.remaining.is_none());
+
+		// input that begins with a tag yields empty text and is left untouched
+		let starts_tag_res = Html::parse_text(starts_tag);
+		assert_eq!(starts_tag_res.node, text!(""));
+		assert_eq!(starts_tag_res.remaining, Some(starts_tag));
+
+		let ends_tag_res = Html::parse_text(ends_tag);
+		assert_eq!(ends_tag_res.node, text!("Hello, World!"));
+		assert_eq!(ends_tag_res.remaining, Some("</p>"));
+	}
+
+	// a `<` that does not open a tag (followed by whitespace, or never closed
+	// by `>`) stays part of the text
+	#[test]
+	fn parse_text_correctly_ignores_nontags() {
+		let sentence = "The condition 2 < 1 should be 1 > 2";
+		let weird = "Hello, < p>";
+		let no_close = "Hello <p my name is ";
+
+		let sentence_res = Html::parse_text(sentence);
+		assert_eq!(sentence_res.node, text!(sentence));
+		assert!(sentence_res.remaining.is_none());
+
+		let weird_res = Html::parse_text(weird);
+		assert_eq!(weird_res.node, text!(weird));
+		assert!(weird_res.remaining.is_none());
+
+		let no_close_res = Html::parse_text(no_close);
+		assert_eq!(no_close_res.node, text!(no_close));
+		assert!(no_close_res.remaining.is_none());
+	}
+
+	// a single element with one text child
+	#[test]
+	fn parse_node_parses_tag() {
+		let basic = "<p>Hello!</p>";
+
+		let hh = Html::parse_node(basic);
+		assert_eq!(
+			hh.node,
+			Node::Tag {
+				self_closing: false,
+				name: "p".into(),
+				children: vec![text!("Hello!")]
+			}
+		)
+	}
+
+	// an element nested inside another element of the same name
+	#[test]
+	fn parse_node_parses_nested_tags() {
+		let nested = "<p><p>Hello!</p></p>";
+
+		let hh = Html::parse_node(nested);
+		assert_eq!(
+			hh.node,
+			Node::Tag {
+				self_closing: false,
+				name: "p".into(),
+				children: vec![Node::Tag {
+					self_closing: false,
+					name: "p".into(),
+					children: vec![text!("Hello!")]
+				}]
+			}
+		)
+	}
+
+	// two sibling elements at the top level come back as two nodes, in order
+	#[test]
+	fn parse_multiple_toplevel() {
+		let nested = "<p>Hello </p><p>World!</p>";
+
+		let hh = Html::parse(nested);
+		assert_eq!(
+			hh.nodes,
+			vec![
+				Node::Tag {
+					self_closing: false,
+					name: "p".into(),
+					children: vec![text!("Hello ")]
+				},
+				Node::Tag {
+					self_closing: false,
+					name: "p".into(),
+					children: vec![text!("World!")]
+				}
+			]
+		)
+	}
+}