Commit

feat(all): minor fixes
martsokha committed Jun 26, 2024
1 parent 22d2f43 · commit d5640b6
Showing 12 changed files with 89 additions and 62 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
@@ -14,7 +14,7 @@ authors = ["Oleh Martsokha <[email protected]>"]
license = "MIT"

[workspace.dependencies]
tokio = { version = "1.38", default-features = false }
tokio = { version = "1", default-features = false }
futures-io = { version = "0.3", default-features = false }
futures-util = { version = "0.3", default-features = false }
futures-test = { version = "0.3", default-features = false }
3 changes: 1 addition & 2 deletions README.md
@@ -17,5 +17,4 @@
protocol with the support of `crawl-delay`, `sitemap` and universal `*` match
extensions.
- [sitemapo](./sitemapo/): The implementation of the Sitemap (or URL inclusion)
protocol with the support of txt, xml formats and video, image, and news
extensions.
protocol with the support of txt and xml formats.
10 changes: 5 additions & 5 deletions countio/Cargo.toml
@@ -35,10 +35,10 @@ tokio = ["dep:tokio"]
futures = ["dep:futures-io"]

[dependencies]
tokio = { workspace = true, optional = true }
futures-io = { workspace = true, optional = true, features = ["std"] }
tokio = { version = "1", default-features = false, optional = true }
futures-io = { version = "0.3", default-features = false, optional = true, features = ["std"] }

[dev-dependencies]
tokio = { workspace = true, features = ["rt", "macros", "io-util"] }
futures-util = { workspace = true }
futures-test = { workspace = true, features = ["std"] }
tokio = { version = "1", features = ["rt", "macros", "io-util"] }
futures-util = { version = "0.3", default-features = false }
futures-test = { version = "0.3", default-features = false, features = ["std"] }
8 changes: 4 additions & 4 deletions robotxt/Cargo.toml
@@ -45,14 +45,14 @@ optimal = []
serde = ["dep:serde", "url/serde", "serde/derive", "serde/rc"]

[dependencies]
url = { workspace = true }
thiserror = { workspace = true }
url = { version = "2.5" }
thiserror = { version = "1.0" }
percent-encoding = { version = "2.3" }

nom = { version = "7.1", optional = true }
bstr = { version = "1.9", optional = true }
regex = { version = "1.10", optional = true }
serde = { workspace = true, optional = true }
serde = { version = "1.0", optional = true }

[dev-dependencies]
serde_json = { workspace = true }
serde_json = { version = "1.0" }
65 changes: 30 additions & 35 deletions robotxt/README.md
@@ -37,48 +37,43 @@ programming language with the support of `crawl-delay`, `sitemap` and universal
```rust
use robotxt::Robots;

fn main() {
let txt = r#"
User-Agent: foobot
Disallow: *
Allow: /example/
Disallow: /example/nope.txt
"#;

let r = Robots::from_bytes(txt.as_bytes(), "foobot");
assert!(r.is_relative_allowed("/example/yeah.txt"));
assert!(!r.is_relative_allowed("/example/nope.txt"));
assert!(!r.is_relative_allowed("/invalid/path.txt"));
}
let txt = r#"
User-Agent: foobot
Disallow: *
Allow: /example/
Disallow: /example/nope.txt
"#;

let r = Robots::from_bytes(txt.as_bytes(), "foobot");
assert!(r.is_relative_allowed("/example/yeah.txt"));
assert!(!r.is_relative_allowed("/example/nope.txt"));
assert!(!r.is_relative_allowed("/invalid/path.txt"));
```

- build the new `robots.txt` file in a declarative manner:

```rust
use robotxt::{RobotsBuilder, Result};

fn main() -> Result<()> {
let txt = RobotsBuilder::default()
.header("Robots.txt: Start")
.group(["foobot"], |u| {
u.crawl_delay(5)
.header("Rules for Foobot: Start")
.allow("/example/yeah.txt")
.disallow("/example/nope.txt")
.footer("Rules for Foobot: End")
})
.group(["barbot", "nombot"], |u| {
u.crawl_delay(2)
.disallow("/example/yeah.txt")
.disallow("/example/nope.txt")
})
.sitemap("https://example.com/sitemap_1.xml".try_into()?)
.sitemap("https://example.com/sitemap_1.xml".try_into()?)
.footer("Robots.txt: End");

println!("{}", txt.to_string());
Ok(())
}
let txt = RobotsBuilder::default()
.header("Robots.txt: Start")
.group(["foobot"], |u| {
u.crawl_delay(5)
.header("Rules for Foobot: Start")
.allow("/example/yeah.txt")
.disallow("/example/nope.txt")
.footer("Rules for Foobot: End")
})
.group(["barbot", "nombot"], |u| {
u.crawl_delay(2)
.disallow("/example/yeah.txt")
.disallow("/example/nope.txt")
})
.sitemap("https://example.com/sitemap_1.xml".try_into()?)
.sitemap("https://example.com/sitemap_1.xml".try_into()?)
.footer("Robots.txt: End");

println!("{}", txt.to_string());
```

### Links
7 changes: 6 additions & 1 deletion robotxt/lib.rs
@@ -23,9 +23,14 @@ pub enum Error {
#[error("cannot be a base url")]
CannotBeBase,

/// Unable to create the expected path to the `robots.txt` file:
/// does not have a host.
#[error("does not have a host")]
NoHost,

/// Unable to create the expected path to the `robots.txt` file:
/// unexpected address scheme, expected `http` or `https`.
#[error("addr scheme: `{scheme}`, expected `http` or `https`")]
#[error("scheme: `{scheme}`, expected `http` or `https`")]
WrongScheme { scheme: String },

/// Unable to create the expected path to the `robots.txt` file:
16 changes: 15 additions & 1 deletion robotxt/parse/access.rs
@@ -1,18 +1,20 @@
use std::fmt;
use std::ops::Deref;

/// The result of the `robots.txt` retrieval attempt.
///
/// See [`Robots::from_access`].
/// Also see 2.3.1. Access Results in the specification.
///
/// [`Robots::from_access`]: crate::Robots::from_access
#[derive(Debug)]
#[derive(Debug, Copy, Clone)]
pub enum AccessResult<'a> {
/// 2.3.1.1. Successful Access
///
/// If the crawler successfully downloads the robots.txt file, the
/// crawler MUST follow the parseable rules.
Successful(&'a [u8]),

/// 2.3.1.2. Redirects
///
/// It's possible that a server responds to a robots.txt fetch request
@@ -27,6 +29,7 @@ pub enum AccessResult<'a> {
/// If there are more than five consecutive redirects, crawlers MAY
/// assume that the robots.txt file is unavailable.
Redirect,

/// 2.3.1.3. "Unavailable" Status
///
/// "Unavailable" means the crawler tries to fetch the robots.txt file
Expand All @@ -38,6 +41,7 @@ pub enum AccessResult<'a> {
/// unavailable to the crawler, then the crawler MAY access any resources
/// on the server.
Unavailable,

/// 2.3.1.4. "Unreachable" Status
///
/// If the robots.txt file is unreachable due to server or network
@@ -65,7 +69,17 @@
}
}

impl Deref for AccessResult<'_> {
type Target = str;

#[inline]
fn deref(&self) -> &Self::Target {
self.as_str()
}
}

impl fmt::Display for AccessResult<'_> {
#[inline]
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "{}", self.as_str())
}
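As a quick illustration of the newly derived `Copy`/`Clone` and the added `Deref`/`Display` impls, here is a minimal sketch (not part of this commit); it assumes `AccessResult` is re-exported at the crate root:

```rust
use robotxt::AccessResult;

fn main() {
    // A fetch that returned a parseable body, and one that did not.
    let ok = AccessResult::Successful(b"User-Agent: *\nDisallow: /private/");
    let gone = AccessResult::Unavailable;

    // `Copy` lets the value be reused after it has been passed by value.
    let status = ok;
    let _still_usable = ok;

    // `Display` prints the textual status label, and `Deref<Target = str>`
    // exposes `str` methods directly on the value.
    println!("fetch: {status}, fallback: {gone}");
    let _label_len: usize = gone.len(); // str::len via the new Deref impl
}
```
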
26 changes: 20 additions & 6 deletions robotxt/paths/create.rs
@@ -1,7 +1,16 @@
use url::Url;

use crate::{Error, Result};

/// Returns the expected path to the `robots.txt` file
/// as the [`url::Url`].
/// as the `url::`[`Url`].
///
/// # Errors
///
/// Returns the error if the provided [`Url`] cannot be a base,
/// does not have a host or the schema is not `http` or `https`.
///
/// # Examples
///
/// ```rust
/// use url::Url;
@@ -12,38 +21,43 @@ use crate::{Error, Result};
/// let robots = create_url(&path).unwrap().to_string();
/// assert_eq!(robots, "https://example.com/robots.txt")
/// ```
pub fn create_url(path: &url::Url) -> Result<url::Url> {
pub fn create_url(path: &Url) -> Result<Url> {
let mut path = path.clone();

if path.cannot_be_a_base() {
return Err(Error::CannotBeBase);
}

if path.host().is_none() {
return Err(Error::NoHost);
}

if path.scheme() != "http" && path.scheme() != "https" {
return Err(Error::WrongScheme {
scheme: path.scheme().to_string(),
});
}

if !path.username().is_empty() {
path.set_username("").unwrap();
path.set_username("").expect("should pass base/host tests");
}

if path.password().is_some() {
path.set_password(None).unwrap();
path.set_password(None)
.expect("should pass base/host tests");
}

path.join("/robots.txt").map_err(Into::into)
}

#[cfg(test)]
mod test {
use super::*;
use crate::{create_url, url::Url, Result};

#[test]
fn from_url() -> Result<()> {
let path = "https://user:[email protected]/foo/sample.txt";
let path = url::Url::parse(path).unwrap();
let path = Url::parse(path).unwrap();

let robots = create_url(&path)?.to_string();
assert_eq!(robots, "https://example.com/robots.txt");
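The error branches documented above can be exercised directly; a hedged sketch (not part of this commit), assuming `create_url` and `Error` are exported at the crate root as the test module suggests:

```rust
use robotxt::{create_url, Error};
use url::Url;

fn main() {
    // Non-http(s) schemes are rejected with `Error::WrongScheme`.
    let ftp = Url::parse("ftp://example.com/dir/file.txt").unwrap();
    match create_url(&ftp) {
        Err(Error::WrongScheme { scheme }) => assert_eq!(scheme, "ftp"),
        other => panic!("unexpected result: {other:?}"),
    }

    // `mailto:` URLs cannot be a base, so the first check fails with `CannotBeBase`.
    let mail = Url::parse("mailto:someone@example.com").unwrap();
    assert!(matches!(create_url(&mail), Err(Error::CannotBeBase)));
}
```
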
6 changes: 3 additions & 3 deletions sitemapo/Cargo.toml
@@ -15,8 +15,8 @@ documentation = "https://docs.rs/sitemapo"
categories = ["parser-implementations", "web-programming"]
keywords = ["sitemaps", "sitemap", "inclusion", "crawler", "scraper"]
description = """
The implementation of the Sitemap.xml (or URL inclusion) protocol with
the support of txt & xml formats, and video, image, news extensions.
The implementation of the Sitemap.xml (or URL inclusion) protocol
with the support of txt & xml formats.
"""

[package.metadata.docs.rs]
@@ -44,7 +44,7 @@ url = { workspace = true }
thiserror = { workspace = true }
countio = { version = "0.2" }

quick-xml = { version = "0.31" }
quick-xml = { version = "0.32" }
bytes = { version = "1.6", features = [] }
time = { workspace = true, features = ["parsing", "formatting"] }

4 changes: 2 additions & 2 deletions sitemapo/README.md
@@ -17,8 +17,8 @@
[coverage-url]: https://app.codecov.io/gh/spire-rs/kit

The implementation of the Sitemap (or URL inclusion) protocol in the Rust
programming language with the support of `txt` & `xml` formats, and `video`,
`image`, `news` extensions (according to the Google's spec).
programming language with the support of `txt` & `xml` formats (according to the
Google's spec).

### Features

2 changes: 1 addition & 1 deletion sitemapo/parse/entry.rs
@@ -185,7 +185,7 @@ mod async_parser {
loop {
self.inner.try_if_readable()?;
let event = self.inner.reader.read_event_into_async(&mut buf).await?;
match self.write_event(event)? {
match self.write_event(event) {
Output::Some(record) => return Ok(Some(record)),
Output::None => {}
Output::End => return Ok(None),
2 changes: 1 addition & 1 deletion sitemapo/parse/index.rs
@@ -156,7 +156,7 @@ mod tokio {
loop {
self.inner.try_if_readable()?;
let event = self.inner.reader.read_event_into_async(&mut buf).await?;
match self.write_event(event)? {
match self.write_event(event) {
Output::Some(record) => return Ok(Some(record)),
Output::None => {}
Output::End => return Ok(None),