diff --git a/.gitignore b/.gitignore
index b6e4761..9f4d854 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,6 @@
 # Byte-compiled / optimized / DLL files
+dataset/*
+.idea/*
 __pycache__/
 *.py[cod]
 *$py.class
diff --git a/README.md b/README.md
index 67c158a..16f3d8e 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,95 @@
 # Plagarism Detection
+Code for detecting extrinsic and intrinsic plagiarism.
+### Requirements
+```bash
+pip install -r requirements.txt
+```
+If you encounter the error `ImportError: cannot import name 'complexity' from 'cophi'`,
+run `pip install cophi==1.2.3`.
+
+### Config
+```yaml
+extrinsic:
+  source:
+    # directories containing the source .txt files
+    dir:
+      - dataset/subset/subset_1/sou_1
+      - dataset/subset/subset_2/sou_2
+      - dataset/subset/subset_3/sou_3
+
+    # If pth does not exist, pre-processing is run and the result is saved to this CSV,
+    # so subsequent runs skip the processing and read the data from the CSV
+    pth: dataset/source_sent_all_three_subset.csv
+
+  suspicious:
+    # directories containing the suspicious .txt files
+    dir:
+      - dataset/subset/subset_1/sus_1
+
+    # If pth does not exist, pre-processing is run and the result is saved to this CSV,
+    # so subsequent runs skip the processing and read the data from the CSV
+    pth: dataset/suspicious_sent.csv
+
+  index: dataset/output/se_index_subset_1_2_3.index
+  save: dataset/output/set1/SE/se_output_subset_1_with_all_three_source.csv
+
+intrinsic:
+  suspicious:
+    # directories containing the suspicious .txt files
+    dir:
+      - dataset/pan-plagiarism-corpus-2009.part3/pan-plagiarism-corpus-2009/intrinsic-analysis-corpus/suspicious-documents
+      - dataset/pan-plagiarism-corpus-2009.part2/pan-plagiarism-corpus-2009/intrinsic-analysis-corpus/suspicious-documents
+      - dataset/pan-plagiarism-corpus-2009/intrinsic-analysis-corpus/suspicious-documents
+
+    # If pth does not exist, pre-processing is run and the result is saved to this CSV,
+    # so subsequent runs skip the processing and read the data from the CSV
+    pth: path/to/suspicious_sent_intrinsic.csv
+
+  save: path/to/save/intrinsic_output.csv
+
+  features:
+    - automated_readability_index
+    - average_sentence_length_chars
+    - average_sentence_length_words
+    - average_syllables_per_word
+    - average_word_frequency_class
+    - average_word_length
+    - coleman_liau_index
+    - flesch_reading_ease
+    - functionword_frequency
+    - linsear_write_formula
+    - most_common_words_without_stopwords
+    - number_frequency
+    - punctuation_frequency
+    - sentence_length_distribution
+    - special_character_frequency
+    - stopword_ratio
+    - top_3_gram_frequency
+    - top_bigram_frequency
+    - top_word_bigram_frequency
+    - uppercase_frequency
+    - word_length_distribution
+    - yule_k_metric
+
+evaluation:
+  results: path/where/results.csv
+  ground_truth: path/where/ground_truth.csv
+```
+### Run Extrinsic
+```bash
+# Use TF-IDF hashing features
+python run.py extrinsic_tfidf --config path/to/config.yaml
+
+# Use sentence-transformer (all-MiniLM-L6-v2) embeddings as features
+python run.py extrinsic_se --config path/to/config.yaml
+```
+
+### Run Intrinsic
+```bash
+python run.py intrinsic --config path/to/config.yaml
+```
+
+### Evaluate
+```bash
+python run.py evaluation --config path/to/config.yaml
+```
\ No newline at end of file
diff --git a/config.yaml b/config.yaml
new file mode 100644
index 0000000..758826b
--- /dev/null
+++ b/config.yaml
@@ -0,0 +1,53 @@
+extrinsic:
+  source:
+    dir:
+      - dataset/subset/subset_1/sou_1
+      - dataset/subset/subset_2/sou_2
+      - dataset/subset/subset_3/sou_3
+    pth: dataset/source_sent_all_three_subset.csv
+
+  suspicious:
+    dir:
+      - dataset/subset/subset_1/sus_1
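+    # pth acts as a cache: created on the first run, reused on later runs so the
+    # .txt parsing and normalisation are skipped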
pth: dataset/suspicious_sent.csv + + index: dataset/output/se_index_subset_1_2_3.index + save: dataset/output/set1/SE/se_output_subset_1_with_all_three_source.csv + +intrinsic: + suspicious: + dir: + - dataset/pan-plagiarism-corpus-2009.part3/pan-plagiarism-corpus-2009/intrinsic-analysis-corpus/suspicious-documents + - dataset/pan-plagiarism-corpus-2009.part2/pan-plagiarism-corpus-2009/intrinsic-analysis-corpus/suspicious-documents + - dataset/pan-plagiarism-corpus-2009/intrinsic-analysis-corpus/suspicious-documents + pth: dataset/suspicious_sent_intrinsic.csv + + save: dataset/output/intrinsic_output.csv + + features: + - automated_readability_index + - average_sentence_length_chars + - average_sentence_length_words + - average_syllables_per_word + - average_word_frequency_class + - average_word_length + - coleman_liau_index + - flesch_reading_ease + - functionword_frequency + - linsear_write_formula + - most_common_words_without_stopwords + - number_frequency + - punctuation_frequency + - sentence_length_distribution + - special_character_frequency + - stopword_ratio + - top_3_gram_frequency + - top_bigram_frequency + - top_word_bigram_frequency + - uppercase_frequency + - word_length_distribution + - yule_k_metric + +evaluation: + results: dataset/results.csv + ground_truth: dataset/ground_truth.csv \ No newline at end of file diff --git a/plagiarism/__init__.py b/plagiarism/__init__.py new file mode 100644 index 0000000..8648fba --- /dev/null +++ b/plagiarism/__init__.py @@ -0,0 +1,8 @@ +import logging + +logging.basicConfig( + format="[%(levelname)s][%(asctime)s]:%(message)s", + level=logging.INFO, + datefmt="%Y-%m-%d %H:%M:%S", +) +logger = logging.getLogger() diff --git a/plagiarism/detector.py b/plagiarism/detector.py new file mode 100644 index 0000000..b63d9d7 --- /dev/null +++ b/plagiarism/detector.py @@ -0,0 +1,227 @@ +import logging +import os +from dataclasses import dataclass +from typing import Optional, Any, List + +import hnswlib +import numpy as np +import pandas as pd +from sentence_transformers.util import cos_sim + +from plagiarism.doc import SourceDocumentCollection, SuspiciousDocumentCollection +from plagiarism.vectorizer import StyleEmbedding + +logger = logging.getLogger() + + +@dataclass +class ExtrinsicOutput: + nn: Any + score: Any + + +@dataclass +class IntrinsicOutput: + file_names: List + sentences: List + + +class Plagiarism: + def __init__( + self, + source_doc: Optional[SourceDocumentCollection] = None, + suspicious_doc: Optional[SuspiciousDocumentCollection] = None, + approach=None, + ): + + self.source_doc = source_doc + self.suspicious_doc = suspicious_doc + self.approach = approach + + self.index = None + + def query(self, **kwargs): + raise NotImplementedError + + def save(self, **kwargs): + raise NotImplementedError + + +class Extrinsic(Plagiarism): + def __init__( + self, + source_doc: Optional[SourceDocumentCollection] = None, + suspicious_doc: Optional[SuspiciousDocumentCollection] = None, + vector_model=None, + ): + super().__init__(source_doc, suspicious_doc, vector_model) + self._header = [ + "suspicious_filename", + "plagarised_filename", + "suspicious", + "plagarised", + "score", + ] + + def index_embedding(self, embeddings, pth, ef_construction=400, m=64, ef=50): + n, dim = embeddings.shape + self.index = hnswlib.Index(space="cosine", dim=dim) + self.index.init_index(max_elements=n, ef_construction=ef_construction, M=m) + self.index.add_items(embeddings, list(range(n))) + logger.info(f"SAVING GENERATED INDEX AT {pth}") + 
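+        # ef_construction and M trade index build time/memory for recall; ef bounds the
+        # candidate list searched at query time. The index is persisted here so nn_index()
+        # can reload it with load_index() on later runs instead of re-embedding the sources.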
self.index.save_index(pth) + + self.index.set_ef(ef) + + def _load_saved_index(self, pth, dim, ef): + self.index = hnswlib.Index(space="cosine", dim=dim) + self.index.load_index(pth) + self.index.set_ef(ef) + + def nn_index( + self, index_pth, dim: int = None, ef_construction=400, m=64, ef=50, **kwargs + ): + if os.path.exists(index_pth): + logger.info(f"LOADING INDEX FROM {index_pth}") + self._load_saved_index(index_pth, dim, ef) + else: + logger.info("GENERATING INDEX") + embeddings = self.approach.run(self.source_doc.get_normalised_sentences()) + self.index_embedding( + embeddings, index_pth, ef_construction=ef_construction, m=m, ef=ef + ) + return self + + def query(self, nn=5): + logger.info("VECTORIZATION IN PROGRESS") + embeddings = self.approach.run(self.suspicious_doc.get_normalised_sentences()) + + logger.info("QUERYING DATA") + nn, distances = self.index.knn_query(embeddings, nn) + + return ExtrinsicOutput(nn, 1 - distances) + + def save(self, pth, extrinsic_output: ExtrinsicOutput, distance_threshold=0.20): + logger.info(f"SAVING IN PROGRESS AT {pth}") + + filtered_output_idx = np.where(extrinsic_output.score >= distance_threshold) + + suspicious_sentences_idx = filtered_output_idx[0] + source_sentences_idx = extrinsic_output.nn[filtered_output_idx] + + suspicious_sentences_filtered = self.suspicious_doc.get_sentences()[ + suspicious_sentences_idx + ] + source_sentences_filtered = self.source_doc.get_sentences()[ + source_sentences_idx + ] + + suspicious_file_filtered = self.suspicious_doc.get_file_names()[ + suspicious_sentences_idx + ] + source_file_filtered = self.source_doc.get_file_names()[source_sentences_idx] + + pd.DataFrame( + np.column_stack( + [ + suspicious_file_filtered, + source_file_filtered, + suspicious_sentences_filtered, + source_sentences_filtered, + np.round(extrinsic_output.score[filtered_output_idx], 2), + ] + ), + columns=self._header, + ).to_csv(pth) + + +class Intrinsic(Plagiarism): + def __init__( + self, + suspicious_doc: Optional[SuspiciousDocumentCollection] = None, + vector_model=None, + min_threshold: float = 0.60, + ignore_sentence_with_len: int = 500, + ): + super().__init__(None, suspicious_doc, vector_model) + self._header = [ + "suspicious_filename", + "plagarised", + ] + self._min_threshold = min_threshold + self._ignore_sentence_with_len = ignore_sentence_with_len + + def query(self, **kwargs): + plagiarised_sent = [] + file_names = [] + logger.info("QUERYING DATA") + for file_name, sentences in self.suspicious_doc.sentence_per_file_gen(): + if len(sentences) < self._ignore_sentence_with_len: + + embeddings = self.approach.run(sentences) + mean_embeddings = embeddings.mean(axis=0).reshape(1, -1) + cosine_scores = cos_sim(mean_embeddings, embeddings).numpy().flatten() + + plagiarised = list( + sentences[np.where(cosine_scores <= self._min_threshold)] + ) + + if len(plagiarised) > 0: + file_names.extend([file_name] * len(plagiarised)) + plagiarised_sent.extend(plagiarised) + else: + file_names.extend([file_name]) + plagiarised_sent.extend(["NONE"]) + + return IntrinsicOutput(file_names, plagiarised_sent) + + def save(self, pth, intrinsic_output: IntrinsicOutput, **kwargs): + pd.DataFrame( + np.column_stack( + [ + intrinsic_output.file_names, + intrinsic_output.sentences, + ] + ), + columns=self._header, + ).to_csv(pth) + + +def extrinsic_plg( + source_doc_pth, + suspicious_doc_pth, + source_doc_dir: list, + suspicious_doc_dir: list, + index_pth: str, + save_pth: str, + vector_model, + distance_threshold: float = 0.90, +): + source_doc = 
SourceDocumentCollection( + pth=source_doc_pth, + dir_iter=source_doc_dir, + ).extract_sentences() + + suspicious_doc = SuspiciousDocumentCollection( + pth=suspicious_doc_pth, dir_iter=suspicious_doc_dir + ).extract_sentences() + + ex = Extrinsic(source_doc, suspicious_doc, vector_model=vector_model) + ex.nn_index(index_pth) + ex_op = ex.query() + ex.save( + save_pth, + ex_op, + distance_threshold=distance_threshold, + ) + + +def intrinsic_plg(suspicious_pth: str, suspicious_dir: list, features: list): + suspicious_doc = SuspiciousDocumentCollection( + pth=suspicious_pth, + dir_iter=suspicious_dir, + ).extract_sentences() + + ii = Intrinsic(suspicious_doc=suspicious_doc, vector_model=StyleEmbedding(features)) + op = ii.query() + ii.save("intrinsic_output.csv", op) diff --git a/plagiarism/doc.py b/plagiarism/doc.py new file mode 100644 index 0000000..92be20d --- /dev/null +++ b/plagiarism/doc.py @@ -0,0 +1,109 @@ +import logging +import os +from typing import List + +import numpy as np +import pandas as pd +from tqdm import tqdm + +from plagiarism.util import get_sentences_from_df, generate_para_df, normalize_data + +logger = logging.getLogger() + + +class DocumentCollection: + def __init__(self, pth, dir_iter: List): + self._df = None + self.pth = pth + self.dir_iter = dir_iter + + @staticmethod + def _sentences(files): + logger.info("SENTENCE GENERATION") + df = pd.DataFrame() + for i, file in enumerate(tqdm(files)): + df1 = pd.DataFrame() + + sentences = get_sentences_from_df(generate_para_df(file)) + tokenized_sentences = dict() + + for j, sent in enumerate(sentences): + tokenized_text = normalize_data(sent) + if len(tokenized_text) >= 5: + tokenized_sentences[sent] = " ".join(tokenized_text) + + df1["filename"] = [str(os.path.basename(file))] * len(tokenized_sentences) + df1["sentences"] = list(tokenized_sentences.keys()) + df1["normalised"] = list(tokenized_sentences.values()) + + df = pd.concat([df, df1], ignore_index=True, sort=False) + df["idx"] = range(0, len(df)) + return df + + def get_sentences(self) -> np.ndarray: + return self._df["sentences"].to_numpy() + + def get_normalised_sentences(self) -> np.ndarray: + return self._df["normalised"].to_numpy() + + def get_filename(self, idx): + return self._df.loc[self._df["idx"] == idx]["filename"].values[0] + + def get_file_names(self) -> np.ndarray: + return self._df["filename"].to_numpy() + + def extract_sentences(self): + raise NotImplementedError + + def sentence_per_file_gen(self) -> np.ndarray: + _file_name = list(pd.unique(self.get_file_names())) + for file in tqdm(_file_name): + yield _file_name, self._df.loc[self._df["filename"] == file][ + "normalised" + ].values + + +class SourceDocumentCollection(DocumentCollection): + def __init__(self, pth, dir_iter: List): + super().__init__(pth, dir_iter) + + def extract_sentences(self): + if os.path.exists(self.pth): + logger.info(f"READING FROM {self.pth}") + self._df = pd.read_csv(self.pth) + else: + files_collection = list() + for sub_dir in self.dir_iter: + for root, dirs, files in os.walk(sub_dir): + for file in files: + if file.endswith(".txt"): + if "source-document" in file: + files_collection.append(os.path.join(root, file)) + + self._df = self._sentences(files_collection) + logger.info(f"Saving Generated sentences at {self.pth}") + self._df.to_csv(self.pth) + return self + + +class SuspiciousDocumentCollection(DocumentCollection): + def __init__(self, pth, dir_iter: List): + super().__init__(pth, dir_iter) + + def extract_sentences(self): + if os.path.exists(self.pth): + 
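+            # The CSV at self.pth is the sentence cache written by a previous run;
+            # reuse it instead of re-walking the corpus and re-normalising every document.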
logger.info(f"READING FROM {self.pth}") + self._df = pd.read_csv(self.pth) + else: + files_collection = list() + for sub_dir in self.dir_iter: + for root, dirs, files in os.walk(sub_dir): + for file in files: + if file.endswith(".txt"): + if "suspicious-document" in file: + files_collection.append(os.path.join(root, file)) + + self._df = self._sentences(files_collection) + logger.info(f"Saving Generated sentences at {self.pth}") + self._df.to_csv(self.pth) + return self diff --git a/plagiarism/util.py b/plagiarism/util.py new file mode 100644 index 0000000..9d69ca9 --- /dev/null +++ b/plagiarism/util.py @@ -0,0 +1,213 @@ +import string +import re +import nltk +from typing import List + +import pandas as pd +from nltk import word_tokenize +from nltk.corpus import stopwords +from tqdm import tqdm + +INPUT_COL = "para" +PARA_COL = "para" + +pattern_digits = r"\d+(nd|th|st)*" +pattern_space = r"\s{2,}" +pattern_special_chars = r"[^\w\s]|(_)+" +pattern_url = r"(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b" + +alphabets = "([A-Za-z])" +prefixes = "(Mr|St|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|Mt)[.]" +suffixes = "(Inc|Ltd|Jr|Sr|Co)" +starters = "(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)" +acronyms = "([A-Z][.][A-Z][.](?:[A-Z][.])?)" +websites = "[.](com|net|org|io|gov|me|edu)" +digits = "([0-9])" + +nltk.download("stopwords") +nltk.download("punkt") +nltk.download("wordnet") +nltk.download("omw-1.4") + + +def split_into_sentences(text): + text = " " + text + " " + text = text.replace("\n", " ") + text = re.sub(prefixes, "\\1", text) + text = re.sub(websites, "\\1", text) + if "Ph.D" in text: + text = text.replace("Ph.D.", "PhD") + text = re.sub("\s" + alphabets + "[.] ", " \\1 ", text) + text = re.sub(acronyms + " " + starters, "\\1 \\2", text) + text = re.sub( + alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]", + "\\1\\2\\3", + text, + ) + text = re.sub(alphabets + "[.]" + alphabets + "[.]", "\\1\\2", text) + text = re.sub(" " + suffixes + "[.] " + starters, " \\1 \\2", text) + text = re.sub(" " + suffixes + "[.]", " \\1", text) + text = re.sub(" " + alphabets + "[.]", " \\1", text) + text = re.sub(digits + "[.]" + digits, "\\1\\2", text) + if "e.g." in text: + text = text.replace("e.g.", "eg") + if "..." in text: + text = text.replace("...", "") + if "i.e." in text: + text = text.replace("i.e.", "ie") + if "”" in text: + text = text.replace(".”", "”.") + if '"' in text: + text = text.replace('."', '".') + if "!" in text: + text = text.replace('!"', '"!') + if "?" 
in text: + text = text.replace('?"', '"?') + text = text.replace(".", ".") + text = text.replace("?", "?") + text = text.replace("!", "!") + text = text.replace("", ".") + sentences = text.split("") + sentences = sentences[:-1] + sentences = [s.strip() for s in sentences] + return sentences + + +def generate_para_df(filepath): + para_content = list() + with open(filepath, "r", encoding="utf-8") as rf: + _content = [] + for line in rf: + if line == "\n": + para_content.append(" ".join(_content)) + _content = [] + else: + _content.append(line.strip()) + return pd.DataFrame(para_content, columns=[PARA_COL]) + + +def remove_symbols_numbers_letters_consonants(word_token: List): + clean_token = [] + for token in word_token: + token = token.lower() + new_token = re.sub(r"[^a-zA-Z]+", "", token) + if new_token != "" and len(new_token) >= 2: + vowels = len([v for v in new_token if v in "aeiou"]) + if vowels != 0: + clean_token.append(new_token) + return clean_token + + +def case_conversion(text: string): + return text.lower() + + +def apply_regex(text: string): + text = re.sub(pattern_url, "", text) + text = re.sub(pattern_digits, "", text) + text = re.sub(pattern_special_chars, " ", text) + text = re.sub(pattern_space, " ", text) + return text + + +def remove_stop_words(word_token: List): + stop_words = set(stopwords.words("english")) + words_filtered = [] + + for w in word_token: + if w not in stop_words: + words_filtered.append(w) + return words_filtered + + +def sentences_from_para(para): + return split_into_sentences(para) + + +def normalize_data(data: str): + text = case_conversion(data) + text = apply_regex(text) + + tokenized_text = word_tokenize(text) + tokenized_text = remove_symbols_numbers_letters_consonants(tokenized_text) + tokenized_text = remove_stop_words(tokenized_text) + return tokenized_text + + +def get_sentences_from_df(data): + _ip_sent = [] + for idx, row in data.iterrows(): + for sent in sentences_from_para(row[INPUT_COL]): + _ip_sent.append(sent) + return _ip_sent + + +def evaluation_iterator(results: str, ground_truth: str): + gt_df = pd.read_csv(ground_truth) + output_df = pd.read_csv(results) + suspicious_reference = gt_df.loc[gt_df.loc[:, "name"] == "artificial-plagiarism"][ + "suspicious_reference" + ].unique() + for i, sus in enumerate(tqdm(suspicious_reference)): + print(f"Loading: {i + 1}/{len(suspicious_reference)}") + temp_df = output_df.loc[ + output_df.loc[:, "suspicious_filename"] == sus.replace(".xml", ".txt") + ] + temp_gt_df = gt_df.loc[gt_df.loc[:, "suspicious_reference"] == sus] + temp_gt_df = temp_gt_df.loc[ + temp_gt_df.loc[:, "name"] == "artificial-plagiarism" + ] + suspicious_text = temp_df["suspicious"].unique() + suspicious_gt_text = "".join(temp_gt_df["suspicious_text"].to_list()) + + yield i, suspicious_text, suspicious_gt_text + + +def jaccard_similarity(text1: str, text2: str): + set1 = set(text1) + set2 = set(text2) + return float(len(set1.intersection(set2)) / len(set1.union(set2))) + + +def precision(results: str, ground_truth: str) -> List: + scores = [] + for i, suspicious_text, suspicious_gt_text in evaluation_iterator( + results, ground_truth + ): + match_len = 0 + for j, suspicious_sentence in enumerate(suspicious_text): + if suspicious_gt_text.find(suspicious_sentence.strip()) != -1: + match_len += len(suspicious_sentence.strip()) + + scores.append(match_len / len(suspicious_gt_text)) + + return scores + + +def recall(results: str, ground_truth: str) -> List: + scores = [] + for i, suspicious_text, suspicious_gt_text in 
evaluation_iterator( + results, ground_truth + ): + tp = 0 + fn = 0 + for j, suspicious_sentence in enumerate(suspicious_text): + if suspicious_gt_text.find(suspicious_sentence.strip()) != -1: + tp += 1 + else: + fn += 1 + if tp + fn == 0: + if len(suspicious_gt_text) == 0: + scores.append(1) + else: + scores.append(0) + continue + scores.append(tp / (tp + fn)) + return scores + + +def metric(results: str, ground_truth: str): + return { + "recall": recall(results, ground_truth), + "precision": precision(results, ground_truth), + } diff --git a/plagiarism/vectorizer.py b/plagiarism/vectorizer.py new file mode 100644 index 0000000..a358f9c --- /dev/null +++ b/plagiarism/vectorizer.py @@ -0,0 +1,48 @@ +from typing import Optional, List + +import numpy as np +from authorstyle import get_feature_vector, all_feature_functions, Text +from sentence_transformers import SentenceTransformer +from sklearn.feature_extraction.text import HashingVectorizer +from sklearn.preprocessing import MinMaxScaler + + +class Vectorizer: + def run(self, **kwargs): + raise NotImplementedError + + +class SE(Vectorizer): + def __init__(self, model_id: Optional[str] = "all-MiniLM-L6-v2"): + self._model = SentenceTransformer(model_id) + + def run(self, sentences: List, **kwargs): + _embeddings = list() + return self._model.encode(sentences) + + +class TFIDFHashing(Vectorizer): + def __init__(self, n_features=20): + self._model = HashingVectorizer(n_features=n_features) + + def run(self, sentences: List, **kwargs): + return self._model.fit_transform(sentences).toarray() + + +class StyleEmbedding(Vectorizer): + def __init__(self, features_to_use: List, **kwargs): + self._features_to_use = [] + for feat in all_feature_functions(): + if feat.__name__ in features_to_use: + self._features_to_use.append(feat) + + def run(self, sentences: List, **kwargs): + tex = Text(" ".join(sentences)) + tex.set_sliding_window(window_size=5, step_size=1, unit=sentences) + feat_vec = np.array( + get_feature_vector(tex, self._features_to_use, fragments=True) + ) + # normalised_feat_vec = self._norm_func.fit_transform( + # feat_vec.astype(np.float32).T + # ).T + return feat_vec / np.linalg.norm(feat_vec, axis=-1).reshape(-1, 1) diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..1431262 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,9 @@ +hnswlib == 0.6.2 +nltk == 3.7 +pandas == 1.3.5 +scikit_learn == 1.0.2 +sentence_transformers == 2.2.0 +tqdm == 4.64.0 +authorstyle==0.2 +cophi==1.2.3 +numpy diff --git a/run.py b/run.py new file mode 100644 index 0000000..ac322a6 --- /dev/null +++ b/run.py @@ -0,0 +1,52 @@ +import fire +from omegaconf import OmegaConf + +from plagiarism.detector import extrinsic_plg, intrinsic_plg +from plagiarism.util import metric +from plagiarism.vectorizer import TFIDFHashing, SE + + +def extrinsic_se(config): + conf = OmegaConf.load(config) + + extrinsic_plg( + conf.extrinsic.source.pth, + conf.extrinsic.suspicious.pth, + conf.extrinsic.source.dir, + conf.extrinsic.suspicious.dir, + conf.extrinsic.index, + conf.extrinsic.save, + SE(), + ) + + +def extrinsic_tfidf(config): + conf = OmegaConf.load(config) + + extrinsic_plg( + conf.extrinsic.source.pth, + conf.extrinsic.suspicious.pth, + conf.extrinsic.source.dir, + conf.extrinsic.suspicious.dir, + conf.extrinsic.index, + conf.extrinsic.save, + TFIDFHashing(), + ) + + +def intrinsic(config): + conf = OmegaConf.load(config) + intrinsic_plg( + conf.intrinsic.suspicious.pth, + conf.intrinsic.suspicious.dir, + conf.intrinsic.features, + ) + + 
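+# Each top-level function in this module is exposed as a sub-command by fire.Fire()
+# at the bottom of the file, e.g.:
+#   python run.py extrinsic_se --config config.yaml
+#   python run.py intrinsic --config config.yaml
+#   python run.py evaluation --config config.yaml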
+def evaluation(config): + conf = OmegaConf.load(config) + print(metric(**conf.evaluation)) + + +if __name__ == "__main__": + fire.Fire()
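For reference, a minimal sketch of driving the extrinsic pipeline programmatically rather than through `run.py`. The dataset paths are placeholders assuming the PAN-style layout referenced in `config.yaml`; only functions and classes introduced above (`extrinsic_plg`, `SE`) are used.
```python
from plagiarism.detector import extrinsic_plg
from plagiarism.vectorizer import SE

extrinsic_plg(
    source_doc_pth="dataset/source_sent.csv",        # sentence cache, created on first run
    suspicious_doc_pth="dataset/suspicious_sent.csv",
    source_doc_dir=["dataset/subset/subset_1/sou_1"],
    suspicious_doc_dir=["dataset/subset/subset_1/sus_1"],
    index_pth="dataset/output/se_index.index",       # hnswlib index, reused if present
    save_pth="dataset/output/se_output.csv",
    vector_model=SE(),            # SentenceTransformer("all-MiniLM-L6-v2") embeddings
    distance_threshold=0.90,      # keep pairs with cosine similarity >= 0.90
)
```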