Merge pull request #1 from cypherics/submission
Submission
fuzailpalnak authored Jun 30, 2022
2 parents a29f1d7 + 84d548e commit c259109
Showing 10 changed files with 815 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -1,4 +1,6 @@
# Byte-compiled / optimized / DLL files
dataset/*
.idea/*
__pycache__/
*.py[cod]
*$py.class
94 changes: 94 additions & 0 deletions README.md
@@ -1 +1,95 @@
# Plagiarism Detection
Code for detecting extrinsic and intrinsic plagiarism.
### Requirements
```bash
pip install -r requirements.txt
```
If you encounter the error `ImportError: cannot import name 'complexity' from 'cophi'`,
run `pip install cophi==1.2.3`.

### Config
```yaml
extrinsic:
source:
    # directories containing the source .txt files
dir:
- dataset/subset/subset_1/sou_1
- dataset/subset/subset_2/sou_2
- dataset/subset/subset_3/sou_3

    # If pth does not exist, pre-processing is run and the results are saved,
    # so subsequent runs skip the processing and load the data from the CSV
pth: dataset/source_sent_all_three_subset.csv

suspicious:
    # directories containing the suspicious .txt files
dir:
- dataset/subset/subset_1/sus_1

    # If pth does not exist, pre-processing is run and the results are saved,
    # so subsequent runs skip the processing and load the data from the CSV
pth: dataset/suspicious_sent.csv

index: dataset/output/se_index_subset_1_2_3.index
save: dataset/output/set1/SE/se_output_subset_1_with_all_three_source.csv

intrinsic:
suspicious:
    # directories containing the suspicious .txt files
dir:
- dataset/pan-plagiarism-corpus-2009.part3/pan-plagiarism-corpus-2009/intrinsic-analysis-corpus/suspicious-documents
- dataset/pan-plagiarism-corpus-2009.part2/pan-plagiarism-corpus-2009/intrinsic-analysis-corpus/suspicious-documents
- dataset/pan-plagiarism-corpus-2009/intrinsic-analysis-corpus/suspicious-documents

    # If pth does not exist, pre-processing is run and the results are saved,
    # so subsequent runs skip the processing and load the data from the CSV
pth: path/to/suspicious_sent_intrinsic.csv

save: path/to/save/intrinsic_output.csv

features:
- automated_readability_index
- average_sentence_length_chars
- average_sentence_length_words
- average_syllables_per_word
- average_word_frequency_class
- average_word_length
- coleman_liau_index
- flesch_reading_ease
- functionword_frequency
- linsear_write_formula
- most_common_words_without_stopwords
- number_frequency
- punctuation_frequency
- sentence_length_distribution
- special_character_frequency
- stopword_ratio
- top_3_gram_frequency
- top_bigram_frequency
- top_word_bigram_frequency
- uppercase_frequency
- word_length_distribution
- yule_k_metric

evaluation:
results: path/where/results.csv
ground_truth: path/where/ground_truth.csv
```
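
A minimal sketch of reading this config from Python (assuming PyYAML; the project's own scripts may load it differently):
```python
import yaml

# parse config.yaml into nested dictionaries
with open("config.yaml") as f:
    cfg = yaml.safe_load(f)

print(cfg["extrinsic"]["source"]["dir"])  # list of source .txt directories
print(cfg["intrinsic"]["features"])       # stylometric features to compute
```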
### Run Extrinsic
```bash
# USING TF-IDF FOR FEATURES
python extrinsic_tfidf --config path/to/config.yaml

# USING DISTILBERT FOR FEATURES
python extrinsic_se --config path/to/config.yaml
```
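
The scripts above wrap `extrinsic_plg` from `plagiarism/detector.py`. A minimal programmatic sketch; `DummyVectorizer` is a hypothetical stand-in for the TF-IDF or DistilBERT model the scripts construct (anything exposing a `.run(sentences)` method that returns a 2-D embedding array):
```python
import numpy as np

from plagiarism.detector import extrinsic_plg


class DummyVectorizer:
    """Hypothetical stand-in; replace with a real embedding model."""

    def run(self, sentences):
        # random vectors, purely to illustrate the expected interface
        rng = np.random.default_rng(0)
        return rng.random((len(sentences), 384)).astype(np.float32)


extrinsic_plg(
    source_doc_pth="dataset/source_sent_all_three_subset.csv",
    suspicious_doc_pth="dataset/suspicious_sent.csv",
    source_doc_dir=["dataset/subset/subset_1/sou_1"],
    suspicious_doc_dir=["dataset/subset/subset_1/sus_1"],
    index_pth="dataset/output/se_index.index",
    save_pth="dataset/output/extrinsic_output.csv",
    vector_model=DummyVectorizer(),
    distance_threshold=0.90,
)
```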

### Run Intrinsic
```bash
python intrinsic --config path/to/config.yaml
```
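
This wraps `intrinsic_plg` from `plagiarism/detector.py`. A minimal programmatic sketch using feature names from the config above (note that this helper currently writes its output to `intrinsic_output.csv` in the working directory):
```python
from plagiarism.detector import intrinsic_plg

intrinsic_plg(
    suspicious_pth="dataset/suspicious_sent_intrinsic.csv",
    suspicious_dir=[
        "dataset/pan-plagiarism-corpus-2009/intrinsic-analysis-corpus/suspicious-documents"
    ],
    features=["average_word_length", "stopword_ratio", "yule_k_metric"],
)
```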

### Evaluate
```bash
python evaluation --config path/to/config.yaml
```
53 changes: 53 additions & 0 deletions config.yaml
@@ -0,0 +1,53 @@
extrinsic:
source:
dir:
- dataset/subset/subset_1/sou_1
- dataset/subset/subset_2/sou_2
- dataset/subset/subset_3/sou_3
pth: dataset/source_sent_all_three_subset.csv

suspicious:
dir:
- dataset/subset/subset_1/sus_1
pth: dataset/suspicious_sent.csv

index: dataset/output/se_index_subset_1_2_3.index
save: dataset/output/set1/SE/se_output_subset_1_with_all_three_source.csv

intrinsic:
suspicious:
dir:
- dataset/pan-plagiarism-corpus-2009.part3/pan-plagiarism-corpus-2009/intrinsic-analysis-corpus/suspicious-documents
- dataset/pan-plagiarism-corpus-2009.part2/pan-plagiarism-corpus-2009/intrinsic-analysis-corpus/suspicious-documents
- dataset/pan-plagiarism-corpus-2009/intrinsic-analysis-corpus/suspicious-documents
pth: dataset/suspicious_sent_intrinsic.csv

save: dataset/output/intrinsic_output.csv

features:
- automated_readability_index
- average_sentence_length_chars
- average_sentence_length_words
- average_syllables_per_word
- average_word_frequency_class
- average_word_length
- coleman_liau_index
- flesch_reading_ease
- functionword_frequency
- linsear_write_formula
- most_common_words_without_stopwords
- number_frequency
- punctuation_frequency
- sentence_length_distribution
- special_character_frequency
- stopword_ratio
- top_3_gram_frequency
- top_bigram_frequency
- top_word_bigram_frequency
- uppercase_frequency
- word_length_distribution
- yule_k_metric

evaluation:
results: dataset/results.csv
ground_truth: dataset/ground_truth.csv
8 changes: 8 additions & 0 deletions plagiarism/__init__.py
@@ -0,0 +1,8 @@
import logging

logging.basicConfig(
format="[%(levelname)s][%(asctime)s]:%(message)s",
level=logging.INFO,
datefmt="%Y-%m-%d %H:%M:%S",
)
logger = logging.getLogger()
227 changes: 227 additions & 0 deletions plagiarism/detector.py
@@ -0,0 +1,227 @@
import logging
import os
from dataclasses import dataclass
from typing import Optional, Any, List

import hnswlib
import numpy as np
import pandas as pd
from sentence_transformers.util import cos_sim

from plagiarism.doc import SourceDocumentCollection, SuspiciousDocumentCollection
from plagiarism.vectorizer import StyleEmbedding

logger = logging.getLogger()


@dataclass
class ExtrinsicOutput:
nn: Any
score: Any


@dataclass
class IntrinsicOutput:
file_names: List
sentences: List


class Plagiarism:
def __init__(
self,
source_doc: Optional[SourceDocumentCollection] = None,
suspicious_doc: Optional[SuspiciousDocumentCollection] = None,
approach=None,
):

self.source_doc = source_doc
self.suspicious_doc = suspicious_doc
self.approach = approach

self.index = None

def query(self, **kwargs):
raise NotImplementedError

def save(self, **kwargs):
raise NotImplementedError


class Extrinsic(Plagiarism):
def __init__(
self,
source_doc: Optional[SourceDocumentCollection] = None,
suspicious_doc: Optional[SuspiciousDocumentCollection] = None,
vector_model=None,
):
super().__init__(source_doc, suspicious_doc, vector_model)
self._header = [
"suspicious_filename",
"plagarised_filename",
"suspicious",
"plagarised",
"score",
]

def index_embedding(self, embeddings, pth, ef_construction=400, m=64, ef=50):
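        # build an HNSW approximate nearest-neighbour index over the source
        # embeddings in cosine space; ef_construction and M trade build time
        # for recall, while ef sets the query-time accuracy/speed trade-off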
n, dim = embeddings.shape
self.index = hnswlib.Index(space="cosine", dim=dim)
self.index.init_index(max_elements=n, ef_construction=ef_construction, M=m)
self.index.add_items(embeddings, list(range(n)))
logger.info(f"SAVING GENERATED INDEX AT {pth}")
self.index.save_index(pth)

self.index.set_ef(ef)

def _load_saved_index(self, pth, dim, ef):
self.index = hnswlib.Index(space="cosine", dim=dim)
self.index.load_index(pth)
self.index.set_ef(ef)

def nn_index(
self, index_pth, dim: int = None, ef_construction=400, m=64, ef=50, **kwargs
):
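        # NOTE: `dim` must be supplied when loading a previously saved index,
        # since hnswlib needs the embedding dimensionality to reconstruct it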
if os.path.exists(index_pth):
logger.info(f"LOADING INDEX FROM {index_pth}")
self._load_saved_index(index_pth, dim, ef)
else:
logger.info("GENERATING INDEX")
embeddings = self.approach.run(self.source_doc.get_normalised_sentences())
self.index_embedding(
embeddings, index_pth, ef_construction=ef_construction, m=m, ef=ef
)
return self

def query(self, nn=5):
logger.info("VECTORIZATION IN PROGRESS")
embeddings = self.approach.run(self.suspicious_doc.get_normalised_sentences())

logger.info("QUERYING DATA")
        labels, distances = self.index.knn_query(embeddings, nn)

        # hnswlib returns cosine distances; convert to similarity scores
        return ExtrinsicOutput(labels, 1 - distances)

def save(self, pth, extrinsic_output: ExtrinsicOutput, distance_threshold=0.20):
logger.info(f"SAVING IN PROGRESS AT {pth}")

        # despite its name, `distance_threshold` is compared against
        # similarity scores (score = 1 - cosine distance)
        filtered_output_idx = np.where(extrinsic_output.score >= distance_threshold)

suspicious_sentences_idx = filtered_output_idx[0]
source_sentences_idx = extrinsic_output.nn[filtered_output_idx]

suspicious_sentences_filtered = self.suspicious_doc.get_sentences()[
suspicious_sentences_idx
]
source_sentences_filtered = self.source_doc.get_sentences()[
source_sentences_idx
]

suspicious_file_filtered = self.suspicious_doc.get_file_names()[
suspicious_sentences_idx
]
source_file_filtered = self.source_doc.get_file_names()[source_sentences_idx]

pd.DataFrame(
np.column_stack(
[
suspicious_file_filtered,
source_file_filtered,
suspicious_sentences_filtered,
source_sentences_filtered,
np.round(extrinsic_output.score[filtered_output_idx], 2),
]
),
columns=self._header,
).to_csv(pth)


class Intrinsic(Plagiarism):
def __init__(
self,
suspicious_doc: Optional[SuspiciousDocumentCollection] = None,
vector_model=None,
min_threshold: float = 0.60,
ignore_sentence_with_len: int = 500,
):
super().__init__(None, suspicious_doc, vector_model)
self._header = [
"suspicious_filename",
"plagarised",
]
self._min_threshold = min_threshold
self._ignore_sentence_with_len = ignore_sentence_with_len

def query(self, **kwargs):
plagiarised_sent = []
file_names = []
logger.info("QUERYING DATA")
for file_name, sentences in self.suspicious_doc.sentence_per_file_gen():
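            # only analyse files with fewer sentences than the cut-off;
            # longer documents are skipped entirely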
if len(sentences) < self._ignore_sentence_with_len:
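                # compare each sentence to the file's mean style embedding;
                # sentences far from the mean (low cosine similarity) are
                # flagged as stylistic outliers, i.e. potential plagiarism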

embeddings = self.approach.run(sentences)
mean_embeddings = embeddings.mean(axis=0).reshape(1, -1)
cosine_scores = cos_sim(mean_embeddings, embeddings).numpy().flatten()

plagiarised = list(
sentences[np.where(cosine_scores <= self._min_threshold)]
)

if len(plagiarised) > 0:
file_names.extend([file_name] * len(plagiarised))
plagiarised_sent.extend(plagiarised)
else:
file_names.extend([file_name])
plagiarised_sent.extend(["NONE"])

return IntrinsicOutput(file_names, plagiarised_sent)

def save(self, pth, intrinsic_output: IntrinsicOutput, **kwargs):
pd.DataFrame(
np.column_stack(
[
intrinsic_output.file_names,
intrinsic_output.sentences,
]
),
columns=self._header,
).to_csv(pth)


def extrinsic_plg(
source_doc_pth,
suspicious_doc_pth,
source_doc_dir: list,
suspicious_doc_dir: list,
index_pth: str,
save_pth: str,
vector_model,
distance_threshold: float = 0.90,
):
source_doc = SourceDocumentCollection(
pth=source_doc_pth,
dir_iter=source_doc_dir,
).extract_sentences()

suspicious_doc = SuspiciousDocumentCollection(
pth=suspicious_doc_pth, dir_iter=suspicious_doc_dir
).extract_sentences()

ex = Extrinsic(source_doc, suspicious_doc, vector_model=vector_model)
ex.nn_index(index_pth)
ex_op = ex.query()
ex.save(
save_pth,
ex_op,
distance_threshold=distance_threshold,
)


def intrinsic_plg(suspicious_pth: str, suspicious_dir: list, features: list):
suspicious_doc = SuspiciousDocumentCollection(
pth=suspicious_pth,
dir_iter=suspicious_dir,
).extract_sentences()

ii = Intrinsic(suspicious_doc=suspicious_doc, vector_model=StyleEmbedding(features))
op = ii.query()
ii.save("intrinsic_output.csv", op)