Cargo.toml (23 changes: 12 additions & 11 deletions)
@@ -1,34 +1,35 @@
 [package]
 name = "cc-downloader"
-version = "0.5.2"
-edition = "2021"
+version = "0.6.0"
+edition = "2024"
 authors = ["Pedro Ortiz Suarez <pedro@commoncrawl.org>"]
 description = "A polite and user-friendly downloader for Common Crawl data."
 license = "MIT OR Apache-2.0"
-rust-version = "1.83"
+rust-version = "1.85"
 readme = "README.md"
 homepage = "https://commoncrawl.org"
 repository = "https://github.com/commoncrawl/cc-downloader"
 documentation = "https://docs.rs/cc-downloader"

 [dependencies]
-clap = { version = "4.5.29", features = ["derive"] }
-flate2 = "1.0.35"
+clap = { version = "4.5.32", features = ["derive"] }
+flate2 = "1.1.0"
 futures = "0.3.31"
 indicatif = "0.17.11"
-reqwest = { version = "0.12.12", default-features = false, features = [
+regex = "1.11.1"
+reqwest = { version = "0.12.14", default-features = false, features = [
     "stream",
     "rustls-tls",
 ] }
-reqwest-middleware = "0.4.0"
+reqwest-middleware = "0.4.1"
 reqwest-retry = "0.7.0"
-tokio = { version = "1.43.0", features = ["full"] }
-tokio-util = { version = "0.7.13", features = ["compat"] }
+tokio = { version = "1.44.1", features = ["full"] }
+tokio-util = { version = "0.7.14", features = ["compat"] }
 url = "2.5.4"

 [dev-dependencies]
-serde = { version = "1.0.217", features = ["derive"] }
-reqwest = { version = "0.12.12", default-features = false, features = [
+serde = { version = "1.0.219", features = ["derive"] }
+reqwest = { version = "0.12.14", default-features = false, features = [
     "stream",
     "rustls-tls",
     "json",
README.md (2 changes: 1 addition & 1 deletion)
@@ -43,7 +43,7 @@ Download paths for a given crawl
 Usage: cc-downloader download-paths <CRAWL> <SUBSET> <DESTINATION>

 Arguments:
-  <CRAWL>        Crawl reference, e.g. CC-MAIN-2021-04
+  <CRAWL>        Crawl reference, e.g. CC-MAIN-2021-04 or CC-NEWS-2025-01
   <SUBSET>       Data type [possible values: segment, warc, wat, wet, robotstxt, non200responses, cc-index, cc-index-table]
   <DESTINATION>  Destination folder

SECURITY.md (4 changes: 2 additions & 2 deletions)
@@ -6,8 +6,8 @@ Only the latest minor version is being supported

 | Version | Supported          |
 | ------- | ------------------ |
-| 0.5.x   | :white_check_mark: |
-| < 0.5.0 | :x:                |
+| 0.6.x   | :white_check_mark: |
+| < 0.6.0 | :x:                |

## Reporting a Vulnerability

src/cli.rs (18 changes: 16 additions & 2 deletions)
@@ -1,6 +1,7 @@
 use std::path::PathBuf;

 use clap::{Parser, Subcommand, ValueEnum};
+use regex::Regex;

 #[derive(Parser)]
 #[command(version, about, long_about = None)]
@@ -13,8 +14,8 @@ pub struct Cli {
 pub enum Commands {
     /// Download paths for a given crawl
     DownloadPaths {
-        /// Crawl reference, e.g. CC-MAIN-2021-04
-        #[arg(value_name = "CRAWL")]
+        /// Crawl reference, e.g. CC-MAIN-2021-04 or CC-NEWS-2025-01
+        #[arg(value_name = "CRAWL", value_parser = crawl_name_format)]
         snapshot: String,

         /// Data type
@@ -89,3 +90,16 @@ impl DataType {
         }
     }
 }
+
+fn crawl_name_format(crawl: &str) -> Result<String, String> {
+    let main_re = Regex::new(r"^(CC\-MAIN)\-([0-9]{4})\-([0-9]{2})$").unwrap();
+    let news_re = Regex::new(r"^(CC\-NEWS)\-([0-9]{4})\-([0-9]{2})$").unwrap();
+
+    let crawl_ref = crawl.to_uppercase();
+
+    if !(main_re.is_match(&crawl_ref) || news_re.is_match(&crawl_ref)) {
+        Err("Please use the CC-MAIN-YYYY-WW or the CC-NEWS-YYYY-MM format.".to_string())
+    } else {
+        Ok(crawl_ref)
+    }
+}
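Not part of the diff: a minimal test sketch for the new crawl_name_format validator, assuming it stays a private function in src/cli.rs. The test names and inputs are illustrative only.

```rust
#[cfg(test)]
mod tests {
    use super::crawl_name_format;

    #[test]
    fn accepts_and_normalizes_valid_crawl_refs() {
        // Lower-case input is upper-cased before the regex check.
        assert_eq!(
            crawl_name_format("cc-main-2021-04"),
            Ok("CC-MAIN-2021-04".to_string())
        );
        assert_eq!(
            crawl_name_format("CC-NEWS-2025-01"),
            Ok("CC-NEWS-2025-01".to_string())
        );
    }

    #[test]
    fn rejects_malformed_crawl_refs() {
        // The year must be four digits and the week/month two digits.
        assert!(crawl_name_format("CC-MAIN-21-04").is_err());
        assert!(crawl_name_format("CC-2025-01").is_err());
    }
}
```

Because the validator runs as a clap value_parser, invalid crawl references are rejected at argument-parsing time rather than after the first request fails.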
src/download.rs (41 changes: 37 additions & 4 deletions)
@@ -1,8 +1,9 @@
 use flate2::read::GzDecoder;
 use indicatif::{MultiProgress, ProgressBar, ProgressStyle};
-use reqwest::{header, Client, Url};
+use regex::Regex;
+use reqwest::{Client, Url, header};
 use reqwest_middleware::{ClientBuilder, ClientWithMiddleware};
-use reqwest_retry::{policies::ExponentialBackoff, Jitter, RetryTransientMiddleware};
+use reqwest_retry::{Jitter, RetryTransientMiddleware, policies::ExponentialBackoff};
 use std::{
     fs::File,
     io::{BufRead, BufReader},
@@ -74,7 +75,18 @@ fn new_client(max_retries: usize) -> Result<ClientWithMiddleware, DownloadError>
         .build())
 }

-pub async fn download_paths(options: DownloadOptions<'_>) -> Result<(), DownloadError> {
+pub async fn download_paths(mut options: DownloadOptions<'_>) -> Result<(), DownloadError> {
+    let news_re = Regex::new(r"^(CC\-NEWS)\-([0-9]{4})\-([0-9]{2})$").unwrap();
+
+    // Check if the snapshot is a news snapshot and reformat it
+    // The format of the main crawl urls is different from the news crawl urls
+    // https://data.commoncrawl.org/crawl-data/CC-NEWS/2025/01/warc.paths.gz
+    // https://data.commoncrawl.org/crawl-data/CC-MAIN-2025-08/warc.paths.gz
+    let snapshot_original_ref = options.snapshot.clone();
+    if news_re.is_match(&options.snapshot) {
+        let caps = news_re.captures(&options.snapshot).unwrap();
+        options.snapshot = format!("{}/{}/{}", &caps[1], &caps[2], &caps[3]);
+    }
     let paths = format!(
         "{}crawl-data/{}/{}.paths.gz",
         BASE_URL, options.snapshot, options.data_type
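As an aside (not part of the PR), a standalone sketch of the rewrite the hunk above performs, assuming BASE_URL expands to https://data.commoncrawl.org/ as the comment suggests, and using warc as the subset:

```rust
use regex::Regex;

fn main() {
    // Same pattern as in download_paths above.
    let news_re = Regex::new(r"^(CC\-NEWS)\-([0-9]{4})\-([0-9]{2})$").unwrap();

    // "CC-NEWS-2025-01" captures as ("CC-NEWS", "2025", "01") and is rejoined
    // with slashes so the path segment matches the news crawl layout.
    let caps = news_re.captures("CC-NEWS-2025-01").unwrap();
    let snapshot = format!("{}/{}/{}", &caps[1], &caps[2], &caps[3]);
    assert_eq!(snapshot, "CC-NEWS/2025/01");

    // With the assumed base URL, the resulting paths file is:
    let paths = format!(
        "https://data.commoncrawl.org/crawl-data/{}/warc.paths.gz",
        snapshot
    );
    assert_eq!(
        paths,
        "https://data.commoncrawl.org/crawl-data/CC-NEWS/2025/01/warc.paths.gz"
    );
}
```

Main crawl references such as CC-MAIN-2025-08 do not match the pattern and pass through unchanged, which matches the flat CC-MAIN URL layout shown in the comment.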
@@ -89,6 +101,27 @@ pub async fn download_paths(options: DownloadOptions<'_>) -> Result<(), Download
         .and_then(|segments| segments.last()) // Retrieves the last segment
         .unwrap_or("file.download"); // Fallback to generic filename

+    let resp = client.head(url.as_str()).send().await?;
+    match resp.status() {
+        status if status.is_success() => (),
+        status if status.as_u16() == 404 => {
+            return Err(format!(
+                "\n\nThe reference combination you requested:\n\tCRAWL: {}\n\tSUBSET: {}\n\tURL: {}\n\nDoesn't seem to exist or it is currently not accessible.\n\tError code: {} {}",
+                snapshot_original_ref, options.data_type, url, status.as_str(), status.canonical_reason().unwrap_or("")
+            )
+            .into());
+        }
+        status => {
+            return Err(format!(
+                "Couldn't download URL: {}. Error code: {} {}",
+                url,
+                status.as_str(),
+                status.canonical_reason().unwrap_or("")
+            )
+            .into());
+        }
+    }
+
     let request = client.get(url.as_str());

     let mut dst = options.dst.to_path_buf();
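Also not part of the diff: a minimal sketch of the same pre-flight HEAD pattern against a plain reqwest::Client (the PR uses the middleware-wrapped client); check_paths_exist is a hypothetical helper name.

```rust
use reqwest::Client;

// Hypothetical helper: send a HEAD request and map non-success statuses to a
// readable error before any GET is attempted.
async fn check_paths_exist(client: &Client, url: &str) -> Result<(), String> {
    let resp = client.head(url).send().await.map_err(|e| e.to_string())?;
    let status = resp.status();
    if status.is_success() {
        Ok(())
    } else if status.as_u16() == 404 {
        Err(format!("{url} does not seem to exist or is not accessible ({status})"))
    } else {
        Err(format!("HEAD {url} failed: {status}"))
    }
}
```

In download_paths the check runs right before the GET request is built, so a non-existent CRAWL/SUBSET combination fails fast with a readable message instead of surfacing a lower-level download error.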
@@ -134,7 +167,7 @@ async fn download_task(
         } else {
             // We return an Error if something goes wrong here
             return Err(
-                format!("Couldn't download URL: {}. Error: {:?}", url, resp.status(),).into(),
+                format!("Couldn't download URL: {}. Error: {:?}", url, resp.status()).into(),
             );
         }
     };