From 0eefb6b385e78b142f691833c856c1cda01f20a0 Mon Sep 17 00:00:00 2001
From: gennyble
Date: Wed, 27 Dec 2023 05:34:40 -0600
Subject: parses
---
.gitignore | 1 +
.rustfmt.toml | 1 +
Cargo.lock | 7 ++
Cargo.toml | 9 ++
LICENSE | 15 +++
readme.md | 4 +
src/lib.rs | 361 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
tests/homepage.rs | 7 ++
tests/nyble.html | 99 +++++++++++++++
9 files changed, 504 insertions(+)
create mode 100644 .gitignore
create mode 100644 .rustfmt.toml
create mode 100644 Cargo.lock
create mode 100644 Cargo.toml
create mode 100644 LICENSE
create mode 100644 readme.md
create mode 100644 src/lib.rs
create mode 100644 tests/homepage.rs
create mode 100644 tests/nyble.html
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..ea8c4bf
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+/target
diff --git a/.rustfmt.toml b/.rustfmt.toml
new file mode 100644
index 0000000..218e203
--- /dev/null
+++ b/.rustfmt.toml
@@ -0,0 +1 @@
+hard_tabs = true
diff --git a/Cargo.lock b/Cargo.lock
new file mode 100644
index 0000000..0006c5b
--- /dev/null
+++ b/Cargo.lock
@@ -0,0 +1,7 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 3
+
+[[package]]
+name = "cutie"
+version = "0.1.0"
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000..f8822b4
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,9 @@
+[package]
+name = "cutie"
+version = "0.1.0"
+edition = "2021"
+license = "ISC"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..8a55383
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,15 @@
+ISC License
+
+Copyright 2023 gennyble
+
+Permission to use, copy, modify, and/or distribute this software for
+any purpose with or without fee is hereby granted, provided that the
+above copyright notice and this permission notice appear in all copies.
+
+THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
+WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES
+OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE
+FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY
+DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
+IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
\ No newline at end of file
diff --git a/readme.md b/readme.md
new file mode 100644
index 0000000..89585a8
--- /dev/null
+++ b/readme.md
@@ -0,0 +1,4 @@
+opinionated, standards non-compliant HTML parser meant to consume handwritten HTML.
+
+- all tags must close
+- no > in tags except at the end (not even in attributes)
\ No newline at end of file
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 0000000..e0c8c5b
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,361 @@
+/// A parsed HTML document: the top-level nodes found in the input.
+pub struct Html {
+    /// Top-level nodes in document order. Nested content is stored in the
+    /// `children` of each [`Node::Tag`].
+    pub nodes: Vec<Node>,
+}
+
+impl Html {
+ /// Parses `raw` into a list of top-level [`Node`]s.
+ ///
+ /// Nodes are read one after another with `parse_node` until no input is
+ /// left. Empty input produces no nodes.
+ ///
+ /// # Panics
+ ///
+ /// Panics if a closing tag shows up where none is expected, or if the
+ /// input ends before an open tag is closed (both raised by `parse_tag`).
+ pub fn parse<S: AsRef<str>>(raw: S) -> Self {
+     let mut nodes = vec![];
+     let mut remaining = Some(raw.as_ref());
+
+     // Stop on `None` and on an empty tail alike, so neither empty input
+     // nor an empty leftover turns into a phantom empty text node.
+     while let Some(text) = remaining.filter(|text| !text.is_empty()) {
+         let consumed = Self::parse_node(text);
+         nodes.push(consumed.node);
+         remaining = consumed.remaining;
+     }
+
+     Self { nodes }
+ }
+
+ /// Parses the single node at the start of `raw`: a comment, a tag, or text.
+ ///
+ /// Input that `is_tag` accepts is first tried as a comment and otherwise
+ /// handed to `parse_tag`; everything else is read as text.
+ fn parse_node(raw: &str) -> Consumed {
+     if Self::is_tag(raw).is_none() {
+         print!("Text ");
+         let text = Self::parse_text(raw);
+         println!("## {:?}", text.node);
+         return text;
+     }
+
+     // comments pass the tag check too, so rule them out before treating
+     // the input as a regular tag
+     if let Some(comment) = Self::parse_comment(raw) {
+         return comment;
+     }
+
+     print!("Node ");
+     Self::parse_tag(raw)
+ }
+
+ /// Parses the tag at the start of `raw` together with everything up to its
+ /// matching closing tag.
+ ///
+ /// A self-closing tag is returned straight away with no children. Otherwise
+ /// child nodes are parsed until a closing tag with the same name is found.
+ ///
+ /// # Panics
+ ///
+ /// Panics if `raw` does not start with a tag, if it starts with a closing
+ /// tag, or if the input ends before the tag is closed.
+ fn parse_tag(raw: &str) -> Consumed {
+     let (root_tag, mut rest) =
+         Self::is_tag(raw).expect("parse_tag must be called on input that starts with a tag");
+
+     if root_tag.closing {
+         panic!(
+             "found closing tag when not expected! {:?}\n{raw}",
+             root_tag.name
+         )
+     } else if root_tag.self_closing {
+         println!("self close return early");
+         return Consumed {
+             node: Node::Tag {
+                 self_closing: true,
+                 name: root_tag.name.into(),
+                 children: vec![],
+             },
+             remaining: rest,
+         };
+     }
+
+     let mut children = vec![];
+
+     loop {
+         // running out of input here means the tag was never closed; name
+         // the tag instead of failing on a bare `unwrap`
+         let text = rest
+             .unwrap_or_else(|| panic!("input ended before <{}> was closed", root_tag.name));
+
+         match Self::is_tag(text) {
+             Some((
+                 ParsedTag {
+                     closing: true,
+                     name,
+                     self_closing: false,
+                     ..
+                 },
+                 remaining,
+             )) if name == root_tag.name => {
+                 println!("ret closed - {name}");
+                 break Consumed {
+                     node: Node::Tag {
+                         self_closing: false,
+                         name: root_tag.name.to_owned(),
+                         children,
+                     },
+                     remaining,
+                 };
+             }
+             // anything that is not our own closing tag is a child node
+             _ => {
+                 println!("recur. ends on {}", root_tag.name,);
+                 let cons = Self::parse_node(text);
+                 rest = cons.remaining;
+                 children.push(cons.node);
+             }
+         }
+     }
+ }
+
+ fn parse_comment(raw: &str) -> Option {
+ if raw.starts_with("") {
+ None => None,
+ Some(end) => {
+ let comment = &after_start[..end];
+ let rest = after_start.get(end + 3..);
+
+ Some(Consumed {
+ node: Node::Comment {
+ body: comment.into(),
+ },
+ remaining: rest,
+ })
+ }
+ }
+ } else {
+ None
+ }
+ }
+
+ /// Checks whether `raw` starts with a tag and, if so, splits that tag off.
+ ///
+ /// A tag starts with `<` followed directly by a non-whitespace character
+ /// and runs up to the first `>`. On success the parsed tag is returned
+ /// together with the input after the `>`, or `None` in its place when
+ /// nothing follows.
+ ///
+ /// Returns `None` when `raw` does not start a tag, when there is no `>`,
+ /// when the tag is both closing and self-closing, or when the tag has no
+ /// name (`<>`, `</>`).
+ #[rustfmt::skip]
+ fn is_tag(raw: &str) -> Option<(ParsedTag, Option<&str>)> {
+     // Starts '<' and any non-whitespace character
+     let starts_right = raw.starts_with('<')
+         && raw.chars().nth(1).map(|c| !c.is_ascii_whitespace()).unwrap_or(false);
+
+     if !starts_right {
+         return None;
+     }
+
+     // not a tag if there's no close
+     let idx = raw.find('>')?;
+     let rest = raw.get(idx + 1..).filter(|txt| !txt.is_empty());
+     let tag_innards = &raw[1..idx];
+
+     let close = tag_innards.starts_with('/');
+
+     // A trailing '/' (ignoring whitespace after it) marks a self-closing
+     // tag. The first character is left out of the search so the leading
+     // '/' of a closing tag is not mistaken for it. Skipping by that
+     // character's byte length keeps the slice on a char boundary, and an
+     // empty tag skips nothing instead of slicing out of bounds.
+     let first_len = tag_innards.chars().next().map_or(0, char::len_utf8);
+     let trimmed = tag_innards[first_len..].trim_end_matches(|c: char| c.is_ascii_whitespace());
+     // byte index of the self-closing '/' inside `tag_innards`
+     let self_close_idx = trimmed.ends_with('/').then(|| first_len + trimmed.len() - 1);
+     let self_close = self_close_idx.is_some();
+     // can't close and self_close
+     if close && self_close { return None; }
+
+     // clean the close from the raw string
+     let name_raw = if let Some(close_idx) = self_close_idx {
+         &tag_innards[..close_idx]
+     } else if close {
+         &tag_innards[1..]
+     } else {
+         tag_innards
+     };
+
+     // the name ends at the first whitespace; whatever follows is the body
+     let (name, body) = match name_raw.find(|c: char| c.is_ascii_whitespace()) {
+         None => (name_raw, ""),
+         Some(idx) => (&name_raw[..idx], &name_raw[idx + 1..]),
+     };
+
+     // a tag needs a name; nameless ones are left to be read as text
+     if name.is_empty() { return None; }
+
+     Some((ParsedTag {
+         closing: close,
+         self_closing: self_close,
+         name,
+         body
+     }, rest))
+ }
+
+ /// Reads text from the start of `raw` up to the first position that
+ /// `is_tag` accepts as a tag.
+ ///
+ /// A '<' that does not open a tag stays part of the text. If no tag is
+ /// found, the whole input becomes the text node and nothing remains.
+ fn parse_text(raw: &str) -> Consumed {
+     // candidate tag starts are the '<' characters, tried left to right
+     let tag_start = raw
+         .match_indices('<')
+         .map(|(idx, _)| idx)
+         .find(|&idx| Self::is_tag(&raw[idx..]).is_some());
+
+     match tag_start {
+         // we've found a new tag, this text node is done
+         Some(idx) => Consumed {
+             node: Node::Text(raw[..idx].to_owned()),
+             remaining: Some(&raw[idx..]),
+         },
+         // no tag anywhere, the entire string was text
+         None => Consumed {
+             node: Node::Text(raw.to_owned()),
+             remaining: None,
+         },
+     }
+ }
+}
+
+struct Consumed<'a> { // result of one parse step: the node produced plus the input left over
+ node: Node, // the node parsed from the front of the input
+ remaining: Option<&'a str>, // input after the node; `Html::parse` stops once this is `None`
+}
+
+struct ParsedTag<'a> { // one tag as split apart by `Html::is_tag`; borrows from the input
+ closing: bool, // set when the tag text starts with '/'
+ name: &'a str, // tag text up to the first ASCII whitespace, leading '/' removed
+ // a tag's body is what follows the whitespace character that ends the name, up to the end
+ // of the tag; `is_tag` cuts a trailing self-close '/' off first, so the slash is not in it
+ body: &'a str,
+ self_closing: bool, // set when the last non-whitespace character of the tag is '/'
+}
+
+impl ParsedTag<'_> {
+    /// Returns `true` for a closing tag as well as for a self-closing one.
+    pub fn closes(&self) -> bool {
+        self.self_closing || self.closing
+    }
+}
+
+/// One node of a parsed document.
+#[derive(Debug, PartialEq)]
+pub enum Node {
+    /// A run of text, stored as it appeared in the input.
+    Text(String),
+    /// An element and the nodes nested inside it.
+    Tag {
+        // for roundtripping
+        /// Whether the tag was written in self-closing form.
+        self_closing: bool,
+        /// The tag name, without any attributes.
+        name: String,
+        /// Nodes between the opening and the closing tag; empty for a
+        /// self-closing tag.
+        children: Vec<Node>,
+    },
+    /// A comment; `body` is the text between the comment markers.
+    Comment {
+        body: String,
+    },
+}
+
+/// Builds a [`Node::Text`] from anything `String::from` accepts.
+///
+/// The expansion names its items through `$crate` and an absolute path, so
+/// the macro also works in code that has not imported `Node` or that shadows
+/// `String`.
+#[macro_export]
+macro_rules! text {
+    ($text:expr) => {
+        $crate::Node::Text(::std::string::String::from($text))
+    };
+}
+
+#[cfg(test)]
+mod test {
+ use crate::{Html, Node};
+
+ macro_rules! text { // test-local shorthand for building a `Node::Text`
+ ($text:expr) => {
+ Node::Text(String::from($text)) // relies on `Node` being imported by this module
+ };
+ }
+
+ /// `parse_text` must stop right where a tag begins: it returns everything
+ /// before the tag as text and the tag onward as the remainder.
+ #[test]
+ fn parse_text_finds_start_of_tag() {
+     let no_tag = "Hello, World!";
+     let starts_tag = "<p>Hello, World!";
+     let ends_tag = "Hello, World!<br>";
+
+     // no tag at all: the whole input is text and nothing remains
+     let no_tag_res = Html::parse_text(no_tag);
+     assert_eq!(no_tag_res.node, text!("Hello, World!"));
+     assert!(no_tag_res.remaining.is_none());
+
+     // tag right at the start: empty text, the whole input remains
+     let starts_tag_res = Html::parse_text(starts_tag);
+     assert_eq!(starts_tag_res.node, text!(""));
+     assert_eq!(starts_tag_res.remaining, Some(starts_tag));
+
+     // tag at the end: the text before it, and the tag as the remainder
+     let ends_tag_res = Html::parse_text(ends_tag);
+     assert_eq!(ends_tag_res.node, text!("Hello, World!"));
+     assert_eq!(ends_tag_res.remaining, Some("<br>"));
+ }
+
+ #[test]
+ fn parse_text_correctly_ignores_nontags() {
+ let sentence = "The condition 2 < 1 should be 1 > 2";
+ let weird = "Hello, < p>";
+ let no_close = "Hello
+
+
+
+
+
+
+ nyble.dev
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Hello,
+
+ welcome to this small place. in the great vastness of the internet you've found your way here, thanks.
+
+
+ i'm some kind of software person trying to make things. written things, visual things,
+ interesting things. i enjoy image processing and small, intentional tools.
+
+
+ while i work out structuring here better, these are some pages you might like to visit
+
+
+ sillygifs.html – some strange GIF files
+ touching-grass.html – a short video and a long over-analysis
+
+
+
+
+ You can subscribe to the Atom feed if you'd like. It gets updated if I
+ write anything and also if I make something here that I want
+ to show :)
+
+
+
+ Scrap ; dropped things collected
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ times
+ c Aug. 16, 2022 03:38
+ m Oct. 19, 2023 04:01
+
+
+
+
+