Skip to content

Commit

Permalink
Add rust tasks
Browse files Browse the repository at this point in the history
  • Loading branch information
Folyd committed Jun 28, 2024
1 parent 757a364 commit 1e9753b
Show file tree
Hide file tree
Showing 14 changed files with 1,183 additions and 0 deletions.
23 changes: 23 additions & 0 deletions rust/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
[package]
name = "rust-search-extension"
version = "0.1.0"
authors = ["Folyd <[email protected]>"]
edition = "2021"
publish = false

[dependencies]
serde = { version = "1", features = ["derive"] }
serde_json = "1"
minifier = "0.3"
unicode-segmentation = "1"
futures = "0.3"
tokio = { version = "1", features = ["macros", "time", "rt-multi-thread"] }
reqwest = { version = "0.11", features = ["json"] }
select = "0.6"
semver = { version = "1", features = ["serde"] }
rayon = "1"
regex = "1"
argh = "0.1"
rustsec = "0"
html-escape = "0"
db-dump = "0.7.1"
15 changes: 15 additions & 0 deletions rust/src/frequency.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
/// A word paired with its `frequency` count, used to rank candidates for minification.
#[derive(Debug)]
pub struct FrequencyWord {
    /// The word itself.
    pub word: String,
    /// The count associated with `word` (presumably its number of occurrences).
    pub frequency: usize,
}

impl FrequencyWord {
    /// Returns the ranking weight of this word: the bytes saved per replacement times
    /// the frequency.
    ///
    /// Words of two bytes or fewer score `0`, since replacing them with a two-character
    /// key saves nothing. (Previously such words underflowed the subtraction.)
    #[inline]
    pub fn score(&self) -> usize {
        // Due to the prefix + suffix occupying two letters,
        // we subtract that length when calculating the score.
        // This leads to a 0.4% reduction in file size.
        self.word.len().saturating_sub(2) * self.frequency
    }
}
50 changes: 50 additions & 0 deletions rust/src/main.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
use std::error::Error;

use argh::FromArgs;

use crate::tasks::*;

mod frequency;
mod minify;
mod tasks;

// Top-level command-line arguments: exactly one subcommand, which selects the task to run.
// The `///` line below is left verbatim because argh turns doc comments into help text.
/// Options
#[derive(FromArgs)]
struct Options {
// The task chosen on the command line, together with its own parsed options.
#[argh(subcommand)]
subcommand: Subcommand,
}

// One variant per task type brought in by `use crate::tasks::*`; `main` matches on it
// and calls `execute` on the wrapped task.
// The `///` line below is left verbatim because argh turns doc comments into help text.
// NOTE(review): `#[non_exhaustive]` only affects code in other crates and this enum is
// private, so it presumably has no effect here — confirm before removing.
/// Subcommands
#[derive(FromArgs)]
#[argh(subcommand)]
#[non_exhaustive]
enum Subcommand {
Advisory(AdvisoryTask),
Crates(CratesTask),
Books(BooksTask),
Caniuse(CaniuseTask),
Lints(LintsTask),
Labels(LabelsTask),
Rfcs(RfcsTask),
Rustc(RustcTask),
Targets(TargetsTask),
}

/// Crate-wide result alias: the error side is any boxed [`Error`], so `?` can propagate
/// errors of different concrete types through the same signature.
pub type Result<T> = std::result::Result<T, Box<dyn Error>>;

fn main() -> Result<()> {
let options: Options = argh::from_env();
match options.subcommand {
Subcommand::Advisory(cmd) => cmd.execute()?,
Subcommand::Crates(cmd) => cmd.execute()?,
Subcommand::Books(cmd) => cmd.execute()?,
Subcommand::Caniuse(cmd) => cmd.execute()?,
Subcommand::Lints(cmd) => cmd.execute()?,
Subcommand::Labels(cmd) => cmd.execute()?,
Subcommand::Rfcs(cmd) => cmd.execute()?,
Subcommand::Rustc(cmd) => cmd.execute()?,
Subcommand::Targets(cmd) => cmd.execute()?,
}
Ok(())
}
87 changes: 87 additions & 0 deletions rust/src/minify.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
use minifier::js::{
aggregate_strings_into_array_filter, simple_minify, Keyword, ReservedChar, Token, Tokens,
};
use rayon::prelude::*;
use std::collections::HashMap;
use std::ops::Deref;
use unicode_segmentation::UnicodeSegmentation;

use crate::frequency::FrequencyWord;

/// Lookup table that replaces known words with short keys.
#[derive(Debug)]
pub struct Minifier<'a> {
// Word-to-key mapping, e.g. <"cargo", "$0">.
// Words are borrowed for `'a`; each key is an owned string.
mapping: HashMap<&'a str, String>,
}

impl<'a> Minifier<'a> {
    /// Characters allowed as the first character of a replacement key.
    const PREFIX: &'static str = "@$^&";
    /// Characters allowed as the second character of a replacement key.
    const SUFFIX: &'static str = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";

    /// Builds a minifier that maps the leading entries of `frequency_words` to two-character
    /// keys.
    ///
    /// Keys are every `PREFIX` character followed by every `SUFFIX` character, generated
    /// prefix-major (`"@0"`, `"@1"`, …). The i-th word receives the i-th key; words beyond
    /// the number of available keys (4 × 62 = 248) get no key and are left untouched when
    /// minifying.
    pub fn new(frequency_words: &'a [FrequencyWord]) -> Minifier<'a> {
        let keys: Vec<String> = Self::PREFIX
            .chars()
            .flat_map(|prefix| {
                Self::SUFFIX
                    .chars()
                    .map(move |suffix| format!("{prefix}{suffix}"))
            })
            .collect();

        Minifier {
            // `zip` stops at the shorter side, which caps the number of mapped words at
            // `keys.len()` and hands each word its key by value (no lookup, no clone).
            mapping: frequency_words
                .into_par_iter()
                .zip(keys)
                .map(|(fw, key)| (fw.word.as_str(), key))
                .collect(),
        }
    }

    /// Returns the inverse (key → word) mapping, which the JavaScript side needs in order
    /// to decode minified strings.
    pub fn get_key_to_word_mapping(&self) -> HashMap<String, String> {
        self.mapping
            .iter()
            .map(|(word, key)| (key.to_owned(), (*word).to_owned()))
            .collect()
    }

    /// Minifies a crate name segment by segment.
    ///
    /// The name is split on `_` and `-`, each segment with a known key is replaced by that
    /// key, and the segments are re-joined with `_` (so `-` is normalised to `_`).
    #[inline]
    pub fn minify_crate_name(&self, name: &str) -> String {
        name.split(['_', '-'])
            .map(|segment| {
                self.mapping
                    .get(segment)
                    .map(Deref::deref)
                    .unwrap_or(segment)
            })
            .collect::<Vec<&str>>()
            .join("_")
    }

    /// Minifies a free-text description.
    ///
    /// The text is split at Unicode word boundaries; every piece with a known key is
    /// replaced by that key and all other pieces (including whitespace and punctuation)
    /// are copied through unchanged.
    #[inline]
    pub fn minify_description(&self, description: &str) -> String {
        description
            .split_word_bounds()
            .map(|piece| self.mapping.get(piece).map(Deref::deref).unwrap_or(piece))
            .collect()
    }

    /// Minifies a JavaScript/JSON source string.
    ///
    /// Every `null` keyword is rewritten to the identifier `N`, and string literals that
    /// pass the filter below are aggregated into an array named `C` by
    /// `aggregate_strings_into_array_filter`.
    #[inline]
    pub fn minify_js(json: &str) -> String {
        let tokens: Tokens = simple_minify(json)
            .into_iter()
            .map(|token| match token {
                Token::Keyword(Keyword::Null) => Token::Other("N"),
                other => other,
            })
            .collect::<Vec<_>>()
            .into();
        aggregate_strings_into_array_filter(tokens, "C", |tokens, position| {
            // Ignore the key of json (AKA, the crate id): a string directly followed by
            // `:` is an object key. A string with no following token cannot be a key;
            // `get` avoids the out-of-bounds panic the plain index had in that case.
            position > 5
                && tokens
                    .get(position + 1)
                    .map_or(true, |next| !next.eq_char(ReservedChar::Colon))
        })
        .to_string()
    }
}
46 changes: 46 additions & 0 deletions rust/src/tasks/advisory.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
use argh::FromArgs;
use rayon::prelude::*;
use rustsec::{database::Query, Collection, Database};
use std::{collections::HashMap, fs, io::Write, path::Path};

use super::Task;

/// Default output directory of the advisory task, used when no destination is given.
const ADVISORY_INDEX_PATH: &str = "../docs/static/advisory";

// Command-line arguments of the `advisory` subcommand.
// The `///` lines are left verbatim because argh turns doc comments into help text.
/// Advisory task
#[derive(FromArgs)]
#[argh(subcommand, name = "advisory")]
pub struct AdvisoryTask {
// Directory the per-package JSON files are written to; falls back to
// `ADVISORY_INDEX_PATH` when the option is omitted.
/// destination path
#[argh(option, short = 'd', default = "ADVISORY_INDEX_PATH.to_string()")]
dest_path: String,
}

impl Task for AdvisoryTask {
    /// Writes one pretty-printed JSON file per affected package into `dest_path`.
    ///
    /// The advisory database is obtained with `Database::fetch`, restricted to
    /// non-withdrawn advisories of the crates collection, and grouped by package. Each
    /// group is sorted newest-first and stored as `<package>.json`, with `-` in the
    /// package name replaced by `_`.
    ///
    /// # Errors
    ///
    /// Returns an error if the database cannot be fetched, the destination directory
    /// cannot be created, or any file cannot be serialised or written. These file and
    /// serialisation failures used to panic.
    fn execute(&self) -> crate::Result<()> {
        let db = Database::fetch()?;

        // Group the advisories by the package they affect.
        let mut map = HashMap::new();
        for advisory in db.query(&Query::new().collection(Collection::Crates).withdrawn(false)) {
            map.entry(&advisory.metadata.package)
                .or_insert_with(Vec::new)
                .push(advisory);
        }

        // `create_dir_all` also creates missing parents and succeeds if the directory
        // already exists, so no separate existence check is needed.
        let path = Path::new(&self.dest_path);
        fs::create_dir_all(path)?;

        // The closure's error type is `io::Error` because it must be `Send` to cross
        // rayon's worker threads; `serde_json::Error` converts into it.
        map.par_iter_mut()
            .try_for_each(|(package, advisories)| -> std::io::Result<()> {
                // sort advisories by date, newest first
                advisories.sort_by(|a, b| b.metadata.date.cmp(&a.metadata.date));
                let package = package.as_str().replace('-', "_");
                let json = serde_json::to_string_pretty(&advisories)?;
                let mut file = fs::File::create(path.join(format!("{package}.json")))?;
                file.write_all(json.as_bytes())
            })?;
        Ok(())
    }
}
171 changes: 171 additions & 0 deletions rust/src/tasks/books.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
use std::fs;
use std::path::Path;

use argh::FromArgs;
use futures::future::try_join_all;
use regex::Regex;
use select::document::Document;
use select::node::Node;
use select::predicate::{Class, Name};
use serde::ser::SerializeTuple;
use serde::{Serialize, Serializer};
use tokio::runtime::Runtime;

use crate::minify::Minifier;
use crate::tasks::Task;

/// Default output file of the books task, used when no destination is given.
const BOOKS_INDEX_PATH: &str = "../lib/index/books.js";
/// Contents of `lib/index/commands.js`, embedded at compile time; the books task scans
/// it line by line for the list of books.
const COMMANDS: &str = include_str!("../../../lib/index/commands.js");

// Command-line arguments of the `books` subcommand.
// The `///` lines are left verbatim because argh turns doc comments into help text.
/// Books task
#[derive(FromArgs)]
#[argh(subcommand, name = "books")]
pub struct BooksTask {
// File the generated index is written to; falls back to `BOOKS_INDEX_PATH` when the
// option is omitted.
/// destination path
#[argh(option, short = 'd', default = "BOOKS_INDEX_PATH.to_string()")]
dest_path: String,
}

/// One linked entry found while walking a book's chapter list.
#[derive(Debug)]
struct Page {
/// Text of the entry's `<a>` element.
title: String,
/// The link's `href` with trailing `.html` removed.
path: String,
/// Titles of the enclosing entries, outermost first; `None` for entries found at the
/// top level.
parent_titles: Option<Vec<String>>,
}

/// A book to index. The default (all-empty) value doubles as the "skipped" marker that
/// `Book::is_empty` detects.
#[derive(Serialize, Debug, Default)]
struct Book<'a> {
/// Name of the book, captured from `COMMANDS`.
name: &'a str,
/// Address the book's HTML is downloaded from.
url: &'a str,
// Pages collected by `parse_node`; empty until `fetch_book` fills it in.
// NOTE(review): only `Serialize` is derived, so `skip_deserializing` presumably has no
// effect here — confirm whether the attribute can be dropped.
#[serde(skip_deserializing)]
pages: Vec<Page>,
}

impl Book<'_> {
    /// Reports whether this book lacks a name or a URL, which is how the placeholder
    /// returned for an unparsable book is recognised.
    fn is_empty(&self) -> bool {
        [self.name, self.url].iter().any(|field| field.is_empty())
    }
}

impl Page {
    /// Builds a [`Page`] from a node whose first child is an `<a>` element.
    ///
    /// The page title is the anchor's text and the path is its `href` with trailing
    /// `.html` removed; `parent_titles` is left as `None` for the caller to fill in.
    ///
    /// Returns `None` when the node has no first child, when that child is not an `<a>`,
    /// or when the anchor has no `href` attribute. A missing `href` used to panic.
    #[inline]
    fn parse(node: &Node) -> Option<Page> {
        let anchor = node.first_child().filter(|n| n.is(Name("a")))?;
        let path = anchor.attr("href")?.trim_end_matches(".html").to_string();

        Some(Page {
            title: anchor.text(),
            path,
            parent_titles: None,
        })
    }
}

impl Serialize for Page {
#[inline]
fn serialize<S>(&self, serializer: S) -> Result<<S as Serializer>::Ok, <S as Serializer>::Error>
where
S: Serializer,
{
let mut ser = serializer.serialize_tuple(3)?;
ser.serialize_element(&self.title)?;
ser.serialize_element(&self.path)?;
ser.serialize_element(&self.parent_titles)?;
ser.end()
}
}

#[inline]
fn parse_node(node: &Node, parent_titles: Option<Vec<String>>) -> Vec<Page> {
let mut pages = vec![];
for child in node.children() {
if child.is(Class("expanded")) || child.first_child().filter(|n| n.is(Name("a"))).is_some()
{
if let Some(mut page) = Page::parse(&child) {
page.parent_titles = parent_titles.clone();
pages.push(page);
}
} else {
let mut new_parent_titles = parent_titles.clone().unwrap_or_default();
if let Some(page) = child.prev().and_then(|n| Page::parse(&n)) {
new_parent_titles.push(page.title);
if let Some(section) = child.find(Class("section")).next() {
pages.extend(parse_node(&section, Some(new_parent_titles)))
}
}
}
}
pages
}

/// Downloads `book.url` and fills `book.pages` from the first element with class
/// `chapter`.
///
/// When the page contains no such element, a notice is printed and an empty default
/// [`Book`] is returned in its place. Request failures are returned as errors.
async fn fetch_book(mut book: Book<'_>) -> crate::Result<Book<'_>> {
    let response = reqwest::get(book.url).await?;
    let body = response.text().await?;
    let document = Document::from(body.as_str());

    let chapter = match document.find(Class("chapter")).next() {
        Some(node) => node,
        None => {
            println!("Parse failed, book `{}` is ignored.", book.name);
            return Ok(Book::default());
        }
    };
    book.pages = parse_node(&chapter, None);
    Ok(book)
}

impl Task for BooksTask {
    /// Runs the asynchronous [`BooksTask::run`] to completion on a freshly created Tokio
    /// runtime and returns its result.
    fn execute(&self) -> crate::Result<()> {
        Runtime::new()?.block_on(self.run())
    }
}

impl BooksTask {
    /// Collects the book list from `COMMANDS`, downloads every book concurrently, and
    /// writes the minified index to `dest_path`.
    ///
    /// Book entries are the `["name", "url"]` lines between the line starting with
    /// `"book"` and the line starting with `"book/zh"`. Books whose page could not be
    /// parsed are dropped from the output.
    ///
    /// # Errors
    ///
    /// Returns an error if the index cannot be serialised or written; the write failure
    /// used to panic. A failed download is only printed, and the task still returns
    /// `Ok(())` without writing anything.
    async fn run(&self) -> crate::Result<()> {
        let re = Regex::new(r#"^\["(.*)",\s?"(.*)"\]"#)
            .expect("the book entry pattern is a valid regex");
        let mut books = vec![];
        let mut started = false;
        for line in COMMANDS.lines() {
            let line = line.trim();
            if line.starts_with("\"book\"") {
                started = true;
            } else if line.starts_with("\"book/zh\"") {
                break;
            }

            if started {
                if let Some(capture) = re.captures(line) {
                    // Both groups are non-optional, so they exist whenever the regex matches.
                    let book = Book {
                        name: capture.get(1).unwrap().as_str(),
                        url: capture.get(2).unwrap().as_str(),
                        pages: Vec::default(),
                    };
                    books.push(book);
                }
            }
        }
        println!("{:?}", books);
        let futures: Vec<_> = books.into_iter().map(fetch_book).collect();
        match try_join_all(futures).await {
            Ok(result) => {
                let books: Vec<_> = result.into_iter().filter(|book| !book.is_empty()).collect();
                let contents = format!(
                    "var N=null;const booksIndex={};export default booksIndex;",
                    serde_json::to_string(&books)?
                );
                let path = Path::new(&self.dest_path);
                fs::write(path, Minifier::minify_js(&contents))?;
            }
            Err(error) => {
                println!("{:?}", error);
            }
        }

        Ok(())
    }
}
Loading

0 comments on commit 1e9753b

Please sign in to comment.