about summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- .gitignore 1
-rw-r--r-- .rustfmt.toml 1
-rw-r--r-- Cargo.lock 7
-rw-r--r-- Cargo.toml 9
-rw-r--r-- LICENSE 15
-rw-r--r-- readme.md 4
-rw-r--r-- src/lib.rs 361
-rw-r--r-- tests/homepage.rs 7
-rw-r--r-- tests/nyble.html 99
9 files changed, 504 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..ea8c4bf
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+/target
diff --git a/.rustfmt.toml b/.rustfmt.toml
new file mode 100644
index 0000000..218e203
--- /dev/null
+++ b/.rustfmt.toml
@@ -0,0 +1 @@
+hard_tabs = true
diff --git a/Cargo.lock b/Cargo.lock
new file mode 100644
index 0000000..0006c5b
--- /dev/null
+++ b/Cargo.lock
@@ -0,0 +1,7 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 3
+
+[[package]]
+name = "cutie"
+version = "0.1.0"
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000..f8822b4
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,9 @@
+[package]
+name = "cutie"
+version = "0.1.0"
+edition = "2021"
+license = "ISC"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..8a55383
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,15 @@
+ISC License
+
+Copyright 2023 gennyble <gen@nyble.dev>
+
+Permission to use, copy, modify, and/or distribute this software for
+any purpose with or without fee is hereby granted, provided that the
+above copyright notice and this permission notice appear in all copies.
+
+THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
+WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES
+OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE
+FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY
+DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
+IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
\ No newline at end of file
diff --git a/readme.md b/readme.md
new file mode 100644
index 0000000..89585a8
--- /dev/null
+++ b/readme.md
@@ -0,0 +1,4 @@
+opinionated, standards non-compliant HTML parser meant to consume handwritten HTML.
+
+- all tags must close
+- no > in tags except at the end (not even in attributes)
\ No newline at end of file
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 0000000..e0c8c5b
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,361 @@
+pub struct Html {
+	pub nodes: Vec<Node>,
+}
+
+impl Html {
+	pub fn parse<S: AsRef<str>>(raw: S) -> Self {
+		let mut raw = raw.as_ref();
+
+		let mut nodes = vec![];
+
+		loop {
+			let Consumed { node, remaining } = Self::parse_node(raw);
+
+			nodes.push(node);
+
+			match remaining {
+				None => break Self { nodes },
+				Some(rem) => raw = rem,
+			}
+		}
+	}
+
+	fn parse_node(raw: &str) -> Consumed {
+		match Self::is_tag(raw) {
+			Some(_) => match Self::parse_comment(raw) {
+				None => {
+					print!("Node ");
+					Self::parse_tag(raw)
+				}
+				Some(cmt) => cmt,
+			},
+			None => {
+				print!("Text ");
+				let cons = Self::parse_text(raw);
+				println!("## {:?}", cons.node);
+				cons
+			}
+		}
+	}
+
+	fn parse_tag(raw: &str) -> Consumed {
+		let (root_tag, mut rest) = Self::is_tag(raw).unwrap();
+
+		//println!("- {raw}");
+		if root_tag.closing {
+			panic!(
+				"found closing tag when not expected! {:?}\n{raw}",
+				root_tag.name
+			)
+		} else if root_tag.self_closing {
+			println!("self close return early");
+			return Consumed {
+				node: Node::Tag {
+					self_closing: true,
+					name: root_tag.name.into(),
+					children: vec![],
+				},
+				remaining: rest,
+			};
+		}
+
+		let mut children = vec![];
+
+		loop {
+			match Self::is_tag(rest.unwrap()) {
+				Some((
+					ParsedTag {
+						closing: true,
+						name,
+						self_closing: false,
+						..
+					},
+					remaining,
+				)) if name == root_tag.name => {
+					println!("ret closed - {name}");
+					break Consumed {
+						node: Node::Tag {
+							self_closing: false,
+							name: root_tag.name.to_owned(),
+							children,
+						},
+						remaining,
+					};
+				}
+				_ => {
+					println!("recur. ends on {}", root_tag.name,);
+					let cons = Self::parse_node(rest.unwrap());
+					rest = cons.remaining;
+					children.push(cons.node);
+				}
+			}
+		}
+	}
+
+	fn parse_comment(raw: &str) -> Option<Consumed> {
+		if raw.starts_with("<!--") {
+			let after_start = &raw[4..];
+			match after_start.find("-->") {
+				None => None,
+				Some(end) => {
+					let comment = &after_start[..end];
+					let rest = after_start.get(end + 3..);
+
+					Some(Consumed {
+						node: Node::Comment {
+							body: comment.into(),
+						},
+						remaining: rest,
+					})
+				}
+			}
+		} else {
+			None
+		}
+	}
+
+	/// check if the start of the string is a valid tag
+	#[rustfmt::skip]
+	fn is_tag(raw: &str) -> Option<(ParsedTag,  Option<&str>)> {
+		// Starts with '<' immediately followed by a non-whitespace character
+		let starts_right = raw.starts_with('<')
+			&& raw.chars().nth(1).map(|c| !c.is_ascii_whitespace()).unwrap_or(false);
+
+		if !starts_right {
+			return None;
+		}
+
+		match raw.find('>') {
+			// not a tag if there's no close
+			None => None,
+			Some(idx) => {
+				let rest = match raw.get(idx+1..) {
+					None => None,
+					Some("") => None,
+					Some(txt) => Some(txt)
+				};
+				let tag_innards = &raw[1..idx];
+
+				let close = tag_innards.starts_with('/');
+				let self_close_idx = {
+					let possible = &tag_innards[1..];
+					let last_whole_char = possible.rfind(|c: char| !c.is_ascii_whitespace()).map(|n| n + 1);
+					let last_slash = possible.rfind('/').map(|n| n + 1);
+					match (last_slash, last_whole_char) {
+						(Some(slash), Some(whole)) if slash == whole => {
+							Some(slash)
+						},
+						_ => None
+					}
+				};
+				let self_close = self_close_idx.is_some();
+				// can't close and self_close
+				if close && self_close { return None; }
+
+				// clean the close from the raw string
+				let name_raw = if let Some(close_idx) = self_close_idx {
+					&tag_innards[..close_idx]
+				} else if close {
+					&tag_innards[1..]
+				} else {
+					tag_innards
+				};
+
+				let (name, body) = match name_raw.find(|c: char| c.is_ascii_whitespace()){
+					None => {
+						(name_raw, "")
+					},
+					Some(idx) => {
+						(&name_raw[..idx], &name_raw[idx+1..])
+					}
+				};
+
+				Some((ParsedTag{
+					closing: close,
+					self_closing: self_close,
+					name,
+					body
+				}, rest))
+			}
+		}
+	}
+
+	fn parse_text(raw: &str) -> Consumed {
+		let mut end_idx = 0;
+		let mut search_from = raw;
+
+		loop {
+			match search_from.find('<') {
+				// if we ever run out of <'s, the entire string was text
+				None => {
+					break Consumed {
+						node: Node::Text(raw.to_owned()),
+						remaining: None,
+					}
+				}
+				Some(idx) => {
+					end_idx += idx;
+
+					if Self::is_tag(&search_from[idx..]).is_some() {
+						// we've found a new tag, this text node is done
+						break Consumed {
+							node: Node::Text(raw[..end_idx].to_owned()),
+							remaining: Some(&raw[end_idx..]),
+						};
+					} else {
+						// step over the <
+						end_idx += 1;
+						search_from = &raw[end_idx..];
+					}
+				}
+			}
+		}
+	}
+}
+
+struct Consumed<'a> {
+	node: Node,
+	remaining: Option<&'a str>,
+}
+
+struct ParsedTag<'a> {
+	closing: bool,
+	name: &'a str,
+	// a tag's body is what exists between the name and the end of the tag,
+	// minus the one whitespace char after the name and any self-closing slash
+	body: &'a str,
+	self_closing: bool,
+}
+
+impl<'a> ParsedTag<'a> {
+	/// Whether or not this tag closes or self-closes
+	pub fn closes(&self) -> bool {
+		self.closing || self.self_closing
+	}
+}
+
+#[derive(Debug, PartialEq)]
+pub enum Node {
+	Text(String),
+	Tag {
+		// for roundtripping
+		self_closing: bool,
+		name: String,
+		children: Vec<Node>,
+	},
+	Comment {
+		body: String,
+	},
+}
+
+#[macro_export]
+macro_rules! text {
+	($text:expr) => {
+		Node::Text(String::from($text))
+	};
+}
+
+#[cfg(test)]
+mod test {
+	use crate::{Html, Node};
+
+	macro_rules! text {
+		($text:expr) => {
+			Node::Text(String::from($text))
+		};
+	}
+
+	#[test]
+	fn parse_text_finds_start_of_tag() {
+		let no_tag = "Hello, World!";
+		let starts_tag = "<p>Hello, World!";
+		let ends_tag = "Hello, World!</p>";
+
+		let no_tag_res = Html::parse_text(no_tag);
+		assert_eq!(no_tag_res.node, text!("Hello, World!"));
+		assert!(no_tag_res.remaining.is_none());
+
+		let starts_tag_res = Html::parse_text(starts_tag);
+		assert_eq!(starts_tag_res.node, text!(""));
+		assert_eq!(starts_tag_res.remaining, Some(starts_tag));
+
+		let ends_tag_res = Html::parse_text(ends_tag);
+		assert_eq!(ends_tag_res.node, text!("Hello, World!"));
+		assert_eq!(ends_tag_res.remaining, Some("</p>"));
+	}
+
+	#[test]
+	fn parse_text_correctly_ignores_nontags() {
+		let sentence = "The condition 2 < 1 should be 1 > 2";
+		let weird = "Hello, < p>";
+		let no_close = "Hello <p my name is ";
+
+		let sentence_res = Html::parse_text(sentence);
+		assert_eq!(sentence_res.node, text!(sentence));
+		assert!(sentence_res.remaining.is_none());
+
+		let weird_res = Html::parse_text(weird);
+		assert_eq!(weird_res.node, text!(weird));
+		assert!(weird_res.remaining.is_none());
+
+		let no_close_res = Html::parse_text(no_close);
+		assert_eq!(no_close_res.node, text!(no_close));
+		assert!(no_close_res.remaining.is_none());
+	}
+
+	#[test]
+	fn parse_node_parses_tag() {
+		let basic = "<p>Hello!</p>";
+
+		let hh = Html::parse_node(basic);
+		assert_eq!(
+			hh.node,
+			Node::Tag {
+				self_closing: false,
+				name: "p".into(),
+				children: vec![text!("Hello!")]
+			}
+		)
+	}
+
+	#[test]
+	fn parse_node_parses_nested_tags() {
+		let nested = "<p><p>Hello!</p></p>";
+
+		let hh = Html::parse_node(nested);
+		assert_eq!(
+			hh.node,
+			Node::Tag {
+				self_closing: false,
+				name: "p".into(),
+				children: vec![Node::Tag {
+					self_closing: false,
+					name: "p".into(),
+					children: vec![text!("Hello!")]
+				}]
+			}
+		)
+	}
+
+	#[test]
+	fn parse_multiple_toplevel() {
+		let nested = "<p>Hello </p><p>World!</p>";
+
+		let hh = Html::parse(nested);
+		assert_eq!(
+			hh.nodes,
+			vec![
+				Node::Tag {
+					self_closing: false,
+					name: "p".into(),
+					children: vec![text!("Hello ")]
+				},
+				Node::Tag {
+					self_closing: false,
+					name: "p".into(),
+					children: vec![text!("World!")]
+				}
+			]
+		)
+	}
+}
diff --git a/tests/homepage.rs b/tests/homepage.rs
new file mode 100644
index 0000000..da36b1c
--- /dev/null
+++ b/tests/homepage.rs
@@ -0,0 +1,7 @@
+use cutie::Html;
+
+#[test]
+fn pares_homepage() {
+	let str = std::fs::read_to_string("tests/nyble.html").unwrap();
+	let html = Html::parse(&str);
+}
diff --git a/tests/nyble.html b/tests/nyble.html
new file mode 100644
index 0000000..41614ab
--- /dev/null
+++ b/tests/nyble.html
@@ -0,0 +1,99 @@
+<html>
+
+<head>
+	<meta name="viewport" content="width=device-width, initial-scale=1.0" />
+	<meta charset="utf-8" />
+	<link rel="icon" type="image/png" href="/3directions.png" />
+	<link rel="icon" type="image/svg+xml" href="/3directions.svg" />
+	<title>nyble.dev</title>
+	<link rel="stylesheet" href="/styles/common.css" />
+	<link rel="stylesheet" href="/styles/index.css" />
+</head>
+
+<body>
+	<ul id="nav-access">
+		<li><a href="#content">Skip to main content</a></li>
+	</ul>
+
+	<main id="content">
+		<picture id="me">
+			<source srcset="/index/overflow.webp" type="image/webp" />
+			<img src="/index/overflow.png"
+				alt="myself with my head tilted back, arm outstretched holding a hat. There's a large amount of colorful noise covering my body." />
+		</picture>
+
+		<section id="contact">
+			<h2>Contact~,.</h2>
+			<ul>
+				<li>Email! <a href="mailto:gen@nyble.dev">gen@nyble.dev</a></li>
+				<li>Github? <a href="https://github.com/gennyble">@gennyble</a></li>
+				<!-- well, i guess that's it. -->
+				<!-- <li>Twitter.. <a href="https://twitter.com/gennyble">@gennyble</a></li> -->
+				<li>Cohost: <a href="https://cohost.org/gen">cohost/gen</a></li>
+				<li>Fedi&mdash;<br /><a href="https://amble.quest/mel" style="margin-left: 1rem">@mel@amble.quest</a>
+				</li>
+			</ul>
+		</section>
+
+		<section id="greeting" class="writing">
+			<h1>Hello,</h1>
+			<p>
+				welcome to this small place. in the great vastness of the internet you've found your way here, thanks.
+			</p>
+			<p>
+				i'm some kind of software person trying to make things. written things, visual things,
+				<i>interesting</i> things. i enjoy image processing and small, intentional tools.
+			</p>
+			<p>
+				while i work out structuring here better, these are some pages you might like to visit
+			</p>
+			<ol>
+				<li><a href="sillygifs/">sillygifs.html</a> &ndash; some strange GIF files</li>
+				<li><a href="touching-grass/">touching-grass.html</a> &ndash; a short video and a long over-analysis
+				</li>
+			</ol>
+
+			<p>
+				You can subscribe to the <a href="atom.xml">Atom feed</a> if you'd like. It gets updated if I
+				<a href="writing/">write</a> anything and also if I make something here that I want
+				to show :)
+			</p>
+
+			<section id="scrap">
+				<p><a href="scrap/">Scrap</a>; dropped things collected</p>
+				<ul>
+					<li><span>aug 7, 2023</span> wrench!</li>
+				</ul>
+			</section>
+
+			<!-- maybe later -->
+			<!--<ol>
+			<li><a href="whenwasit/">whenwasit</a> &ndash; cli tool for preserving file times</li>
+		</ol>-->
+
+			<!-- hi. are you reading this? -->
+			<!-- this is what we want. this is what we're working towards. a gentle graph of interconnected -->
+			<!-- things all reaching out and touching one another. -->
+			<!--
+		<p>
+			this place has a loose organization. pages are scattered around and heavily interlinked forming
+			a sort of gentle graph.
+		</p>
+		<p>
+			at the bottom of each page are outgoing links, those that reach out
+			to other pages, and incoming links, pages that reference the current one.
+		</p>
+			-->
+		</section>
+	</main>
+
+	<footer class="sized">
+		<ul id="dates">
+			<li id="dates-title">&nbsp;times</li>
+			<li><abbr title="creation time">c</abbr> Aug. 16, 2022 03:38</li>
+			<li><abbr title="modification time">m</abbr> Oct. 19, 2023 04:01</li>
+		</ul>
+	</footer>
+</body>
+
+</html>
\ No newline at end of file