Merge pull request #1 from cypherics/submission
Submission
Showing 10 changed files with 815 additions and 0 deletions.
.gitignore
@@ -1,4 +1,6 @@
# Byte-compiled / optimized / DLL files
dataset/*
.idea/*
__pycache__/
*.py[cod]
*$py.class
README.md
@@ -1 +1,95 @@
# Plagiarism Detection
Code for detecting extrinsic and intrinsic plagiarism.
### Requirements
```bash
pip install -r requirements.txt
```
If you encounter the error `ImportError: cannot import name 'complexity' from 'cophi'`,
run `pip install cophi==1.2.3`.
### Config
```yaml
extrinsic:
  source:
    # directories containing the source .txt files
    dir:
      - dataset/subset/subset_1/sou_1
      - dataset/subset/subset_2/sou_2
      - dataset/subset/subset_3/sou_3

    # If pth does not exist, pre-processing runs and the result is saved
    # to this CSV; subsequent runs skip processing and load the CSV instead.
    pth: dataset/source_sent_all_three_subset.csv

  suspicious:
    # directories containing the suspicious .txt files
    dir:
      - dataset/subset/subset_1/sus_1

    # If pth does not exist, pre-processing runs and the result is saved
    # to this CSV; subsequent runs skip processing and load the CSV instead.
    pth: dataset/suspicious_sent.csv

  index: dataset/output/se_index_subset_1_2_3.index
  save: dataset/output/set1/SE/se_output_subset_1_with_all_three_source.csv

intrinsic:
  suspicious:
    # directories containing the suspicious .txt files
    dir:
      - dataset/pan-plagiarism-corpus-2009.part3/pan-plagiarism-corpus-2009/intrinsic-analysis-corpus/suspicious-documents
      - dataset/pan-plagiarism-corpus-2009.part2/pan-plagiarism-corpus-2009/intrinsic-analysis-corpus/suspicious-documents
      - dataset/pan-plagiarism-corpus-2009/intrinsic-analysis-corpus/suspicious-documents

    # If pth does not exist, pre-processing runs and the result is saved
    # to this CSV; subsequent runs skip processing and load the CSV instead.
    pth: path/to/suspicious_sent_intrinsic.csv

  save: path/to/save/intrinsic_output.csv

  features:
    - automated_readability_index
    - average_sentence_length_chars
    - average_sentence_length_words
    - average_syllables_per_word
    - average_word_frequency_class
    - average_word_length
    - coleman_liau_index
    - flesch_reading_ease
    - functionword_frequency
    - linsear_write_formula
    - most_common_words_without_stopwords
    - number_frequency
    - punctuation_frequency
    - sentence_length_distribution
    - special_character_frequency
    - stopword_ratio
    - top_3_gram_frequency
    - top_bigram_frequency
    - top_word_bigram_frequency
    - uppercase_frequency
    - word_length_distribution
    - yule_k_metric

evaluation:
  results: path/where/results.csv
  ground_truth: path/where/ground_truth.csv
```
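The entry points below presumably parse this file with a standard YAML loader; as a minimal sketch of reading the same structure (assuming PyYAML, and the block above saved as `config.yaml`):

```python
import yaml  # PyYAML; an assumption, this diff does not pin the loader

# Parse the config into nested dicts/lists mirroring the YAML above.
with open("config.yaml") as f:
    cfg = yaml.safe_load(f)

print(cfg["extrinsic"]["source"]["dir"])   # the three source directories
print(len(cfg["intrinsic"]["features"]))   # 22 style features
```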
### Run Extrinsic
```bash
# using TF-IDF features
python extrinsic_tfidf --config path/to/config.yaml

# using DistilBERT features
python extrinsic_se --config path/to/config.yaml
```
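Judging from `Extrinsic.save` further down in this commit, both commands write a results CSV with the columns `suspicious_filename`, `plagarised_filename`, `suspicious`, `plagarised`, and `score`, one row per sentence pair above the similarity threshold.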
### Run Intrinsic
```bash
python intrinsic --config path/to/config.yaml
```
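Likewise, `Intrinsic.save` writes a CSV with columns `suspicious_filename` and `plagarised`; files where no sentence falls below the in-document similarity threshold get a single `NONE` row.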
### Evaluate
```bash
python evaluation --config path/to/config.yaml
```
config.yaml
@@ -0,0 +1,53 @@
extrinsic:
  source:
    dir:
      - dataset/subset/subset_1/sou_1
      - dataset/subset/subset_2/sou_2
      - dataset/subset/subset_3/sou_3
    pth: dataset/source_sent_all_three_subset.csv

  suspicious:
    dir:
      - dataset/subset/subset_1/sus_1
    pth: dataset/suspicious_sent.csv

  index: dataset/output/se_index_subset_1_2_3.index
  save: dataset/output/set1/SE/se_output_subset_1_with_all_three_source.csv

intrinsic:
  suspicious:
    dir:
      - dataset/pan-plagiarism-corpus-2009.part3/pan-plagiarism-corpus-2009/intrinsic-analysis-corpus/suspicious-documents
      - dataset/pan-plagiarism-corpus-2009.part2/pan-plagiarism-corpus-2009/intrinsic-analysis-corpus/suspicious-documents
      - dataset/pan-plagiarism-corpus-2009/intrinsic-analysis-corpus/suspicious-documents
    pth: dataset/suspicious_sent_intrinsic.csv

  save: dataset/output/intrinsic_output.csv

  features:
    - automated_readability_index
    - average_sentence_length_chars
    - average_sentence_length_words
    - average_syllables_per_word
    - average_word_frequency_class
    - average_word_length
    - coleman_liau_index
    - flesch_reading_ease
    - functionword_frequency
    - linsear_write_formula
    - most_common_words_without_stopwords
    - number_frequency
    - punctuation_frequency
    - sentence_length_distribution
    - special_character_frequency
    - stopword_ratio
    - top_3_gram_frequency
    - top_bigram_frequency
    - top_word_bigram_frequency
    - uppercase_frequency
    - word_length_distribution
    - yule_k_metric

evaluation:
  results: dataset/results.csv
  ground_truth: dataset/ground_truth.csv
@@ -0,0 +1,8 @@
import logging

logging.basicConfig(
    format="[%(levelname)s][%(asctime)s]:%(message)s",
    level=logging.INFO,
    datefmt="%Y-%m-%d %H:%M:%S",
)
logger = logging.getLogger()
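Since this configures the root logger, a bare `logging.getLogger()` call elsewhere (as in the detection module below) inherits the same format. A minimal sketch of the resulting output, with the setup inlined here for self-containment:

```python
import logging

# Mirrors the module above; in the repo this would come from importing it.
logging.basicConfig(
    format="[%(levelname)s][%(asctime)s]:%(message)s",
    level=logging.INFO,
    datefmt="%Y-%m-%d %H:%M:%S",
)

logger = logging.getLogger()
logger.info("GENERATING INDEX")
# -> [INFO][2025-01-01 12:00:00]:GENERATING INDEX
```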
@@ -0,0 +1,227 @@
import logging
import os
from dataclasses import dataclass
from typing import Optional, Any, List

import hnswlib
import numpy as np
import pandas as pd
from sentence_transformers.util import cos_sim

from plagiarism.doc import SourceDocumentCollection, SuspiciousDocumentCollection
from plagiarism.vectorizer import StyleEmbedding

logger = logging.getLogger()


@dataclass
class ExtrinsicOutput:
    nn: Any
    score: Any


@dataclass
class IntrinsicOutput:
    file_names: List
    sentences: List


class Plagiarism:
    def __init__(
        self,
        source_doc: Optional[SourceDocumentCollection] = None,
        suspicious_doc: Optional[SuspiciousDocumentCollection] = None,
        approach=None,
    ):
        self.source_doc = source_doc
        self.suspicious_doc = suspicious_doc
        self.approach = approach

        self.index = None

    def query(self, **kwargs):
        raise NotImplementedError

    def save(self, **kwargs):
        raise NotImplementedError


class Extrinsic(Plagiarism):
    def __init__(
        self,
        source_doc: Optional[SourceDocumentCollection] = None,
        suspicious_doc: Optional[SuspiciousDocumentCollection] = None,
        vector_model=None,
    ):
        super().__init__(source_doc, suspicious_doc, vector_model)
        self._header = [
            "suspicious_filename",
            "plagarised_filename",
            "suspicious",
            "plagarised",
            "score",
        ]

    def index_embedding(self, embeddings, pth, ef_construction=400, m=64, ef=50):
        # Build an HNSW index over the source-sentence embeddings and persist it.
        n, dim = embeddings.shape
        self.index = hnswlib.Index(space="cosine", dim=dim)
        self.index.init_index(max_elements=n, ef_construction=ef_construction, M=m)
        self.index.add_items(embeddings, list(range(n)))
        logger.info(f"SAVING GENERATED INDEX AT {pth}")
        self.index.save_index(pth)

        self.index.set_ef(ef)

    def _load_saved_index(self, pth, dim, ef):
        self.index = hnswlib.Index(space="cosine", dim=dim)
        self.index.load_index(pth)
        self.index.set_ef(ef)

    def nn_index(
        self, index_pth, dim: int = None, ef_construction=400, m=64, ef=50, **kwargs
    ):
        # NOTE: when loading a previously saved index, dim must be supplied;
        # hnswlib needs the embedding dimension to reconstruct the index.
        if os.path.exists(index_pth):
            logger.info(f"LOADING INDEX FROM {index_pth}")
            self._load_saved_index(index_pth, dim, ef)
        else:
            logger.info("GENERATING INDEX")
            embeddings = self.approach.run(self.source_doc.get_normalised_sentences())
            self.index_embedding(
                embeddings, index_pth, ef_construction=ef_construction, m=m, ef=ef
            )
        return self

    def query(self, nn=5):
        logger.info("VECTORIZATION IN PROGRESS")
        embeddings = self.approach.run(self.suspicious_doc.get_normalised_sentences())

        logger.info("QUERYING DATA")
        labels, distances = self.index.knn_query(embeddings, k=nn)

        # hnswlib returns cosine distances; 1 - distance converts them to similarities.
        return ExtrinsicOutput(labels, 1 - distances)

    def save(self, pth, extrinsic_output: ExtrinsicOutput, distance_threshold=0.20):
        logger.info(f"SAVING IN PROGRESS AT {pth}")

        # Despite the name, this filters on similarity: keep (suspicious, source)
        # pairs whose score (1 - distance) is at or above the threshold.
        filtered_output_idx = np.where(extrinsic_output.score >= distance_threshold)

        suspicious_sentences_idx = filtered_output_idx[0]
        source_sentences_idx = extrinsic_output.nn[filtered_output_idx]

        suspicious_sentences_filtered = self.suspicious_doc.get_sentences()[
            suspicious_sentences_idx
        ]
        source_sentences_filtered = self.source_doc.get_sentences()[
            source_sentences_idx
        ]

        suspicious_file_filtered = self.suspicious_doc.get_file_names()[
            suspicious_sentences_idx
        ]
        source_file_filtered = self.source_doc.get_file_names()[source_sentences_idx]

        pd.DataFrame(
            np.column_stack(
                [
                    suspicious_file_filtered,
                    source_file_filtered,
                    suspicious_sentences_filtered,
                    source_sentences_filtered,
                    np.round(extrinsic_output.score[filtered_output_idx], 2),
                ]
            ),
            columns=self._header,
        ).to_csv(pth)


class Intrinsic(Plagiarism):
    def __init__(
        self,
        suspicious_doc: Optional[SuspiciousDocumentCollection] = None,
        vector_model=None,
        min_threshold: float = 0.60,
        ignore_sentence_with_len: int = 500,
    ):
        super().__init__(None, suspicious_doc, vector_model)
        self._header = [
            "suspicious_filename",
            "plagarised",
        ]
        self._min_threshold = min_threshold
        self._ignore_sentence_with_len = ignore_sentence_with_len

    def query(self, **kwargs):
        plagiarised_sent = []
        file_names = []
        logger.info("QUERYING DATA")
        for file_name, sentences in self.suspicious_doc.sentence_per_file_gen():
            # Documents with an unusually large number of sentences are skipped.
            if len(sentences) < self._ignore_sentence_with_len:
                embeddings = self.approach.run(sentences)

                # Compare each sentence's style vector against the document mean;
                # sentences that deviate (low cosine similarity) are flagged.
                mean_embeddings = embeddings.mean(axis=0).reshape(1, -1)
                cosine_scores = cos_sim(mean_embeddings, embeddings).numpy().flatten()

                plagiarised = list(
                    sentences[np.where(cosine_scores <= self._min_threshold)]
                )

                if len(plagiarised) > 0:
                    file_names.extend([file_name] * len(plagiarised))
                    plagiarised_sent.extend(plagiarised)
                else:
                    file_names.extend([file_name])
                    plagiarised_sent.extend(["NONE"])

        return IntrinsicOutput(file_names, plagiarised_sent)

    def save(self, pth, intrinsic_output: IntrinsicOutput, **kwargs):
        pd.DataFrame(
            np.column_stack(
                [
                    intrinsic_output.file_names,
                    intrinsic_output.sentences,
                ]
            ),
            columns=self._header,
        ).to_csv(pth)


def extrinsic_plg(
    source_doc_pth,
    suspicious_doc_pth,
    source_doc_dir: list,
    suspicious_doc_dir: list,
    index_pth: str,
    save_pth: str,
    vector_model,
    distance_threshold: float = 0.90,
):
    source_doc = SourceDocumentCollection(
        pth=source_doc_pth,
        dir_iter=source_doc_dir,
    ).extract_sentences()

    suspicious_doc = SuspiciousDocumentCollection(
        pth=suspicious_doc_pth, dir_iter=suspicious_doc_dir
    ).extract_sentences()

    ex = Extrinsic(source_doc, suspicious_doc, vector_model=vector_model)
    ex.nn_index(index_pth)
    ex_op = ex.query()
    ex.save(
        save_pth,
        ex_op,
        distance_threshold=distance_threshold,
    )


def intrinsic_plg(suspicious_pth: str, suspicious_dir: list, features: list):
    suspicious_doc = SuspiciousDocumentCollection(
        pth=suspicious_pth,
        dir_iter=suspicious_dir,
    ).extract_sentences()

    ii = Intrinsic(suspicious_doc=suspicious_doc, vector_model=StyleEmbedding(features))
    op = ii.query()
    # NOTE: the output path is hardcoded here even though the config's
    # intrinsic.save key provides one.
    ii.save("intrinsic_output.csv", op)
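To tie the pieces together, here is a hedged driver sketch showing how these entry points line up with the config above. The import path `plagiarism.detection` and the `TfIdfModel` wrapper are illustrative assumptions (the actual file names and TF-IDF vectorizer are not shown in this diff); `StyleEmbedding` and the function signatures are taken from the code above.

```python
import yaml
from sklearn.feature_extraction.text import TfidfVectorizer

# Hypothetical import path; the real module name is not visible in this diff.
from plagiarism.detection import extrinsic_plg, intrinsic_plg


class TfIdfModel:
    """Illustrative stand-in exposing the .run(sentences) -> ndarray interface
    that Extrinsic expects; fits on the first call and transforms afterwards,
    so source and suspicious embeddings share one vocabulary."""

    def __init__(self):
        self._vec = TfidfVectorizer()

    def run(self, sentences):
        if not hasattr(self._vec, "vocabulary_"):
            return self._vec.fit_transform(sentences).toarray().astype("float32")
        return self._vec.transform(sentences).toarray().astype("float32")


with open("config.yaml") as f:
    cfg = yaml.safe_load(f)

ex, intr = cfg["extrinsic"], cfg["intrinsic"]

extrinsic_plg(
    source_doc_pth=ex["source"]["pth"],
    suspicious_doc_pth=ex["suspicious"]["pth"],
    source_doc_dir=ex["source"]["dir"],
    suspicious_doc_dir=ex["suspicious"]["dir"],
    index_pth=ex["index"],
    save_pth=ex["save"],
    vector_model=TfIdfModel(),
)

intrinsic_plg(
    suspicious_pth=intr["suspicious"]["pth"],
    suspicious_dir=intr["suspicious"]["dir"],
    features=intr["features"],
)
```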