From a14a861d824230458333defd5cf43bafd2ca596d Mon Sep 17 00:00:00 2001 From: gennyble Date: Thu, 28 Dec 2023 01:07:53 -0600 Subject: query parsing, but this is not what i wanted actually --- Cargo.lock | 58 +++++++++++++++++++ Cargo.toml | 1 + src/lib.rs | 149 +++-------------------------------------------- src/query.rs | 185 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ src/tag.rs | 177 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 429 insertions(+), 141 deletions(-) create mode 100644 src/query.rs create mode 100644 src/tag.rs diff --git a/Cargo.lock b/Cargo.lock index 0006c5b..5aa59d6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5,3 +5,61 @@ version = 3 [[package]] name = "cutie" version = "0.1.0" +dependencies = [ + "thiserror", +] + +[[package]] +name = "proc-macro2" +version = "1.0.71" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75cb1540fadbd5b8fbccc4dddad2734eba435053f725621c070711a14bb5f4b8" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5267fca4496028628a95160fc423a33e8b2e6af8a5302579e322e4b520293cae" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "syn" +version = "2.0.43" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee659fb5f3d355364e1f3e5bc10fb82068efbf824a1e9d1c9504244a6469ad53" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "thiserror" +version = "1.0.52" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "83a48fd946b02c0a526b2e9481c8e2a17755e47039164a86c4070446e3a4614d" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.52" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7fbe9b594d6568a6a1443250a7e67d80b74e1e96f6d1715e1e21cc1888291d3" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "unicode-ident" +version = "1.0.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" diff --git a/Cargo.toml b/Cargo.toml index f8822b4..36541e3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,3 +7,4 @@ license = "ISC" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] +thiserror = "1.0.52" diff --git a/src/lib.rs b/src/lib.rs index f1010ae..df7eead 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,4 +1,8 @@ use core::fmt; +use tag::Tag; + +mod query; +mod tag; pub struct Html { pub nodes: Vec, @@ -21,6 +25,10 @@ impl Html { } } + pub fn query>(&self, query: S) -> Option<&Node> { + todo!() + } + fn parse_node(raw: &str) -> Consumed { match Self::is_tag(raw) { Some(_) => { @@ -309,109 +317,6 @@ impl fmt::Display for Node { } } -#[derive(Clone, Debug, PartialEq)] -pub struct Tag { - pub name: String, - /// Everything inside the tag that's not it's name. Includes a - /// self-close if there is one. - pub body: Option, - pub self_closing: bool, - pub children: Vec, -} - -impl Tag { - pub fn self_closing(&self) -> bool { - self.self_closing - } - - pub fn get_attribute<'a>(&'a self, key: &str) -> Option<&'a str> { - let body = match self.body.as_deref() { - None => return None, - Some(body) => body, - }; - - // get rid of potential self-close - let trimmed = if let Some(suffix) = body.trim().strip_suffix('/') { - suffix - } else { - body.trim() - }; - - let mut wrk = trimmed; - loop { - let key_end_idx = wrk.find(|c: char| c == ' ' || c == '='); - - match key_end_idx { - None => { - // boolean ends body - if wrk == key { - return Some(""); - } else { - break; - } - } - Some(idx) => match &wrk[idx..idx + 1] { - " " => { - // boolean - if &wrk[..idx] == key { - return Some(""); - } else { - wrk = &wrk[idx + 1..]; - } - } - "=" => { - // key-value - let found_name = &wrk[..idx]; - - // we're just assuming the attributes are properly - // formed right now. Skips the `=` and the `"` that - // should be there but we don't check for - wrk = &wrk[idx + 2..]; - let end = wrk.find('"').unwrap(); - let value = &wrk[..end]; - wrk = &wrk[end + 1..].trim_start(); - - if found_name == key { - return Some(value); - } - } - _ => unreachable!(), - }, - } - } - - None - } -} - -impl fmt::Display for Tag { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - let Tag { - name, - body, - self_closing, - children, - } = self; - - let formatted_body = if let Some(body) = body { - format!(" {body}") - } else { - String::from("") - }; - - if *self_closing { - // we ignore our children if we're self-closing. - write!(f, "<{name} {}/>", body.as_deref().unwrap_or_default()) - } else { - write!(f, "<{name}{formatted_body}>")?; - for child in children { - write!(f, "{}", child)?; - } - write!(f, "") - } - } -} - #[macro_export] macro_rules! tag { ($name:expr) => { @@ -639,41 +544,3 @@ mod test { ) } } - -#[cfg(test)] -mod tag_test { - use crate::Tag; - - #[test] - fn tag_finds_boolen_attribute() { - let tag = Tag { - name: "div".into(), - body: Some("contenteditable".into()), - self_closing: false, - children: vec![], - }; - assert!(tag.get_attribute("contenteditable").is_some()) - } - - #[test] - fn tag_finds_kv_attribute() { - let tag = Tag { - name: "script".into(), - body: Some("src=\"script.js\"".into()), - self_closing: false, - children: vec![], - }; - assert_eq!(tag.get_attribute("src"), Some("script.js")) - } - - #[test] - fn tag_finds_boolean_in_centre() { - let tag = Tag { - name: "div".into(), - body: Some("id=\"divy\" contenteditable style=\"display: none;\"".into()), - self_closing: false, - children: vec![], - }; - assert!(tag.get_attribute("contenteditable").is_some()); - } -} diff --git a/src/query.rs b/src/query.rs new file mode 100644 index 0000000..796e372 --- /dev/null +++ b/src/query.rs @@ -0,0 +1,185 @@ +#[derive(Clone, Debug, PartialEq)] +pub enum QueryComponent { + /// Every child element of the tag + TagName(String), + /// Only direct children with the tag + DirectTagName(String), + /// The child element with the ID + Id(String), + /// Only direct children with the tag + DirectId(String), + /// Every child that has the class + Class(String), + /// Only direct children with the class + DirectClass(String), +} + +pub fn parse_query(mut raw: &str) -> Result, QueryParseError> { + let mut components = vec![]; + + let mut next_direct = false; + loop { + if raw.is_empty() { + break Ok(components); + } + + let part = match raw.find(['>', ' ']) { + None => { + let part = raw; + raw = &raw[raw.len()..raw.len()]; + part + } + Some(idx) => { + let part = &raw[..idx]; + + if &raw[idx..idx + 1] == ">" { + if next_direct { + return Err(QueryParseError::DoubleDirect); + } else { + next_direct = true; + } + } + + raw = &raw[idx + 1..]; + part + } + }; + + if part.is_empty() { + continue; + } + + if let Some(id) = part.strip_prefix('#') { + if id.contains(['#', '.']) { + return Err(QueryParseError::UnknownComponent { + malformed: id.into(), + }); + } + + if next_direct { + components.push(QueryComponent::DirectId(id.into())); + next_direct = false; + } else { + components.push(QueryComponent::Id(id.into())); + } + } else if let Some(class) = part.strip_prefix('.') { + if class.contains(['#', '.']) { + return Err(QueryParseError::UnknownComponent { + malformed: class.into(), + }); + } + + if next_direct { + components.push(QueryComponent::DirectClass(class.into())); + next_direct = false; + } else { + components.push(QueryComponent::Class(class.into())); + } + } else { + if part.contains(['#', '.']) { + return Err(QueryParseError::UnknownComponent { + malformed: part.into(), + }); + } + + if next_direct { + components.push(QueryComponent::DirectTagName(part.into())); + next_direct = false; + } else { + components.push(QueryComponent::TagName(part.into())); + } + } + } +} + +#[derive(Debug, thiserror::Error)] +pub enum QueryParseError { + #[error("Query ends with '>' which does not make sense. Are you missing a selector?")] + EndsInDirect, + #[error("Two direct descendent selectors (>) appeard together")] + DoubleDirect, + #[error( + "The component {malformed} does not make sense. Valid selectors are #id, .class, and tag" + )] + UnknownComponent { malformed: String }, +} + +#[cfg(test)] +mod test { + use super::parse_query; + + macro_rules! qc { + ($tag:expr) => { + $crate::query::QueryComponent::TagName(String::from($tag)) + }; + + (>$tag:expr) => { + $crate::query::QueryComponent::DirectTagName(String::from($tag)) + }; + + (ID $tag:expr) => { + $crate::query::QueryComponent::Id(String::from($tag)) + }; + + (>ID $tag:expr) => { + $crate::query::QueryComponent::DirectId(String::from($tag)) + }; + + (. $tag:expr) => { + $crate::query::QueryComponent::Class(String::from($tag)) + }; + + (>. $tag:expr) => { + $crate::query::QueryComponent::DirectClass(String::from($tag)) + }; + } + + #[test] + fn parses_tags() { + let raw = "main section p"; + let parse = parse_query(raw).unwrap(); + assert_eq!(parse, vec![qc!("main"), qc!("section"), qc!("p")]) + } + + #[test] + fn parses_direct_tags() { + let raw = "main > section > p"; + let parse = parse_query(raw).unwrap(); + assert_eq!(parse, vec![qc!("main"), qc!(> "section"), qc!(> "p")]) + } + + #[test] + fn parses_id() { + let raw = "main #job"; + let parse = parse_query(raw).unwrap(); + assert_eq!(parse, vec![qc!("main"), qc!(ID "job")]) + } + + #[test] + fn parses_direct_id() { + let raw = "main > #job"; + let parse = parse_query(raw).unwrap(); + assert_eq!(parse, vec![qc!("main"), qc!(>ID "job")]) + } + + #[test] + fn parses_class() { + let raw = "main .post"; + let parse = parse_query(raw).unwrap(); + assert_eq!(parse, vec![qc!("main"), qc!(."post")]) + } + + #[test] + fn parses_direct_class() { + let raw = "main > .post"; + let parse = parse_query(raw).unwrap(); + assert_eq!(parse, vec![qc!("main"), qc!(>."post")]) + } + + #[test] + fn parses_complex() { + let raw = "main > article"; + let parse = parse_query(raw).unwrap(); + assert_eq!(parse, vec![qc!("main"), qc!(>."post")]) + } +} diff --git a/src/tag.rs b/src/tag.rs new file mode 100644 index 0000000..e325b1f --- /dev/null +++ b/src/tag.rs @@ -0,0 +1,177 @@ +use core::fmt; + +use crate::Node; + +#[derive(Clone, Debug, PartialEq)] +pub struct Tag { + pub name: String, + /// Everything inside the tag that's not it's name. Includes a + /// self-close if there is one. + pub body: Option, + pub self_closing: bool, + pub children: Vec, +} + +impl Tag { + pub fn self_closing(&self) -> bool { + self.self_closing + } + + pub fn get_attribute<'a>(&'a self, key: &str) -> Option<&'a str> { + let body = match self.body.as_deref() { + None => return None, + Some(body) => body, + }; + + // get rid of potential self-close + let trimmed = if let Some(suffix) = body.trim().strip_suffix('/') { + suffix + } else { + body.trim() + }; + + let mut wrk = trimmed; + loop { + let key_end_idx = wrk.find(|c: char| c == ' ' || c == '='); + + match key_end_idx { + None => { + // boolean ends body + if wrk == key { + return Some(""); + } else { + break; + } + } + Some(idx) => match &wrk[idx..idx + 1] { + " " => { + // boolean + if &wrk[..idx] == key { + return Some(""); + } else { + wrk = &wrk[idx + 1..]; + } + } + "=" => { + // key-value + let found_name = &wrk[..idx]; + + // we're just assuming the attributes are properly + // formed right now. Skips the `=` and the `"` that + // should be there but we don't check for + wrk = &wrk[idx + 2..]; + let end = wrk.find('"').unwrap(); + let value = &wrk[..end]; + wrk = &wrk[end + 1..].trim_start(); + + if found_name == key { + return Some(value); + } + } + _ => unreachable!(), + }, + } + } + + None + } + + pub fn id(&self) -> Option<&str> { + match self.get_attribute("id") { + None => None, + Some("") => None, + Some(id) => Some(id), + } + } + + pub fn has_class(&self, name: &str) -> bool { + match self.get_attribute("class") { + None => false, + Some(classes) => { + for class in classes.split(' ') { + if class == name { + return true; + } + } + + false + } + } + } + + pub fn append_child>(&mut self, node: N) { + let node = node.into(); + self.children.push(node); + } + + /// Replace all children with one [Node::Text] + pub fn set_inner_text>(&mut self, txt: S) { + self.children = vec![Node::Text(txt.into())]; + } +} + +impl fmt::Display for Tag { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let Tag { + name, + body, + self_closing, + children, + } = self; + + let formatted_body = if let Some(body) = body { + format!(" {body}") + } else { + String::from("") + }; + + if *self_closing { + // we ignore our children if we're self-closing. + write!(f, "<{name} {}/>", body.as_deref().unwrap_or_default()) + } else { + write!(f, "<{name}{formatted_body}>")?; + for child in children { + write!(f, "{}", child)?; + } + write!(f, "") + } + } +} + +#[cfg(test)] +mod test { + use crate::Tag; + + #[test] + fn tag_finds_boolen_attribute() { + let tag = Tag { + name: "div".into(), + body: Some("contenteditable".into()), + self_closing: false, + children: vec![], + }; + assert!(tag.get_attribute("contenteditable").is_some()) + } + + #[test] + fn tag_finds_kv_attribute() { + let tag = Tag { + name: "script".into(), + body: Some("src=\"script.js\"".into()), + self_closing: false, + children: vec![], + }; + assert_eq!(tag.get_attribute("src"), Some("script.js")) + } + + #[test] + fn tag_finds_boolean_in_centre() { + let tag = Tag { + name: "div".into(), + body: Some("id=\"divy\" contenteditable style=\"display: none;\"".into()), + self_closing: false, + children: vec![], + }; + assert!(tag.get_attribute("contenteditable").is_some()); + } +} -- cgit 1.4.1-3-g733a5