Skip to content

Commit c259109

Browse files
authored
Merge pull request #1 from cypherics/submission
Submission
2 parents a29f1d7 + 84d548e commit c259109

File tree

10 files changed

+815
-0
lines changed

10 files changed

+815
-0
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
# Byte-compiled / optimized / DLL files
2+
dataset/*
3+
.idea/*
24
__pycache__/
35
*.py[cod]
46
*$py.class

README.md

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,95 @@
11
# Plagiarism Detection
2+
Code for detecting extrinsic and intrinsic plagiarism
3+
### Requirements
4+
```python
5+
pip install -r requirements.txt
6+
```
7+
If you encounter an error `ImportError: cannot import name 'complexity' from 'cophi'`
8+
then run ` pip install cophi==1.2.3`
9+
10+
### Config
11+
```yaml
12+
extrinsic:
13+
source:
14+
# dir where .txt files are stored for source
15+
dir:
16+
- dataset/subset/subset_1/sou_1
17+
- dataset/subset/subset_2/sou_2
18+
- dataset/subset/subset_3/sou_3
19+
20+
# If pth is not present it will do the pre-processing
21+
# and save the result, so the next run skips processing and uses the data from the csv
22+
pth: dataset/source_sent_all_three_subset.csv
23+
24+
suspicious:
25+
# dir where .txt files are stored for suspicious documents
26+
dir:
27+
- dataset/subset/subset_1/sus_1
28+
29+
# If pth is not present it will do the pre-processing
30+
# and save the result, so the next run skips processing and uses the data from the csv
31+
pth: dataset/suspicious_sent.csv
32+
33+
index: dataset/output/se_index_subset_1_2_3.index
34+
save: dataset/output/set1/SE/se_output_subset_1_with_all_three_source.csv
35+
36+
intrinsic:
37+
suspicious:
38+
# dir where .txt files are stored for suspicious documents
39+
dir:
40+
- dataset/pan-plagiarism-corpus-2009.part3/pan-plagiarism-corpus-2009/intrinsic-analysis-corpus/suspicious-documents
41+
- dataset/pan-plagiarism-corpus-2009.part2/pan-plagiarism-corpus-2009/intrinsic-analysis-corpus/suspicious-documents
42+
- dataset/pan-plagiarism-corpus-2009/intrinsic-analysis-corpus/suspicious-documents
43+
44+
# If pth is not present it will do the pre-processing
45+
# and save the result, so the next run skips processing and uses the data from the csv
46+
pth: path/to/suspicious_sent_intrinsic.csv
47+
48+
save: path/to/save/intrinsic_output.csv
49+
50+
features:
51+
- automated_readability_index
52+
- average_sentence_length_chars
53+
- average_sentence_length_words
54+
- average_syllables_per_word
55+
- average_word_frequency_class
56+
- average_word_length
57+
- coleman_liau_index
58+
- flesch_reading_ease
59+
- functionword_frequency
60+
- linsear_write_formula
61+
- most_common_words_without_stopwords
62+
- number_frequency
63+
- punctuation_frequency
64+
- sentence_length_distribution
65+
- special_character_frequency
66+
- stopword_ratio
67+
- top_3_gram_frequency
68+
- top_bigram_frequency
69+
- top_word_bigram_frequency
70+
- uppercase_frequency
71+
- word_length_distribution
72+
- yule_k_metric
73+
74+
evaluation:
75+
results: path/where/results.csv
76+
ground_truth: path/where/ground_truth.csv
77+
```
78+
### Run Extrinsic
79+
```python
80+
# USING TFIDF FOR FEATURES
81+
python extrinsic_tfidf --config path/to/config.yaml
82+
83+
# USING DISTILL_BERT FOR FEATURES
84+
python extrinsic_se --config path/to/config.yaml
85+
```
86+
87+
### Run Intrinsic
88+
```python
89+
python intrinsic --config path/to/config.yaml
90+
```
91+
92+
### Evaluate
93+
```python
94+
python evaluation --config path/to/config.yaml
95+
```

config.yaml

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
extrinsic:
2+
source:
3+
dir:
4+
- dataset/subset/subset_1/sou_1
5+
- dataset/subset/subset_2/sou_2
6+
- dataset/subset/subset_3/sou_3
7+
pth: dataset/source_sent_all_three_subset.csv
8+
9+
suspicious:
10+
dir:
11+
- dataset/subset/subset_1/sus_1
12+
pth: dataset/suspicious_sent.csv
13+
14+
index: dataset/output/se_index_subset_1_2_3.index
15+
save: dataset/output/set1/SE/se_output_subset_1_with_all_three_source.csv
16+
17+
intrinsic:
18+
suspicious:
19+
dir:
20+
- dataset/pan-plagiarism-corpus-2009.part3/pan-plagiarism-corpus-2009/intrinsic-analysis-corpus/suspicious-documents
21+
- dataset/pan-plagiarism-corpus-2009.part2/pan-plagiarism-corpus-2009/intrinsic-analysis-corpus/suspicious-documents
22+
- dataset/pan-plagiarism-corpus-2009/intrinsic-analysis-corpus/suspicious-documents
23+
pth: dataset/suspicious_sent_intrinsic.csv
24+
25+
save: dataset/output/intrinsic_output.csv
26+
27+
features:
28+
- automated_readability_index
29+
- average_sentence_length_chars
30+
- average_sentence_length_words
31+
- average_syllables_per_word
32+
- average_word_frequency_class
33+
- average_word_length
34+
- coleman_liau_index
35+
- flesch_reading_ease
36+
- functionword_frequency
37+
- linsear_write_formula
38+
- most_common_words_without_stopwords
39+
- number_frequency
40+
- punctuation_frequency
41+
- sentence_length_distribution
42+
- special_character_frequency
43+
- stopword_ratio
44+
- top_3_gram_frequency
45+
- top_bigram_frequency
46+
- top_word_bigram_frequency
47+
- uppercase_frequency
48+
- word_length_distribution
49+
- yule_k_metric
50+
51+
evaluation:
52+
results: dataset/results.csv
53+
ground_truth: dataset/ground_truth.csv

plagiarism/__init__.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
import logging
2+
3+
logging.basicConfig(
4+
format="[%(levelname)s][%(asctime)s]:%(message)s",
5+
level=logging.INFO,
6+
datefmt="%Y-%m-%d %H:%M:%S",
7+
)
8+
logger = logging.getLogger()

plagiarism/detector.py

Lines changed: 227 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,227 @@
1+
import logging
2+
import os
3+
from dataclasses import dataclass
4+
from typing import Optional, Any, List
5+
6+
import hnswlib
7+
import numpy as np
8+
import pandas as pd
9+
from sentence_transformers.util import cos_sim
10+
11+
from plagiarism.doc import SourceDocumentCollection, SuspiciousDocumentCollection
12+
from plagiarism.vectorizer import StyleEmbedding
13+
14+
logger = logging.getLogger()
15+
16+
17+
@dataclass
class ExtrinsicOutput:
    """Result of an extrinsic nearest-neighbour query.

    Attributes:
        nn: neighbour index array returned by the ANN index
            (one row of source-sentence indices per suspicious sentence).
        score: similarity scores (1 - cosine distance), aligned with ``nn``.
    """

    nn: Any
    score: Any
21+
22+
23+
@dataclass
class IntrinsicOutput:
    """Result of an intrinsic plagiarism scan.

    Attributes:
        file_names: source file name for each entry in ``sentences``.
        sentences: sentences flagged as stylistic outliers, or the literal
            string "NONE" for a file with no flagged sentences.
    """

    file_names: List
    sentences: List
27+
28+
29+
class Plagiarism:
    """Common base for plagiarism detectors.

    Holds the document collections and the vectorisation approach; concrete
    subclasses implement :meth:`query` and :meth:`save`.
    """

    def __init__(
        self,
        source_doc: Optional[SourceDocumentCollection] = None,
        suspicious_doc: Optional[SuspiciousDocumentCollection] = None,
        approach=None,
    ):
        # Either collection may be None: intrinsic detection, for instance,
        # works on suspicious documents only.
        self.source_doc = source_doc
        self.suspicious_doc = suspicious_doc
        self.approach = approach

        # ANN index; populated by subclasses that build or load one.
        self.index = None

    def query(self, **kwargs):
        """Run the detection query. Subclasses must override."""
        raise NotImplementedError

    def save(self, **kwargs):
        """Persist query output. Subclasses must override."""
        raise NotImplementedError
48+
49+
50+
class Extrinsic(Plagiarism):
    """Extrinsic detector.

    Matches suspicious sentences against source sentences using an
    approximate-nearest-neighbour (hnswlib, cosine space) index built over
    sentence embeddings produced by ``vector_model``.
    """

    def __init__(
        self,
        source_doc: Optional[SourceDocumentCollection] = None,
        suspicious_doc: Optional[SuspiciousDocumentCollection] = None,
        vector_model=None,
    ):
        super().__init__(source_doc, suspicious_doc, vector_model)
        # Column order of the CSV written by save().
        self._header = [
            "suspicious_filename",
            "plagarised_filename",
            "suspicious",
            "plagarised",
            "score",
        ]

    def index_embedding(self, embeddings, pth, ef_construction=400, m=64, ef=50):
        """Build a cosine hnswlib index over ``embeddings`` and save it to ``pth``.

        ``ef_construction``/``m`` tune index build quality, ``ef`` tunes
        query-time accuracy.
        """
        count, dim = embeddings.shape
        self.index = hnswlib.Index(space="cosine", dim=dim)
        self.index.init_index(max_elements=count, ef_construction=ef_construction, M=m)
        # Item labels are simply row positions, so knn_query results index
        # straight back into the sentence arrays.
        self.index.add_items(embeddings, list(range(count)))
        logger.info(f"SAVING GENERATED INDEX AT {pth}")
        self.index.save_index(pth)

        self.index.set_ef(ef)

    def _load_saved_index(self, pth, dim, ef):
        """Load a previously saved index of dimensionality ``dim`` from ``pth``."""
        self.index = hnswlib.Index(space="cosine", dim=dim)
        self.index.load_index(pth)
        self.index.set_ef(ef)

    def nn_index(
        self, index_pth, dim: int = None, ef_construction=400, m=64, ef=50, **kwargs
    ):
        """Load the ANN index from ``index_pth`` if present, else build and save it.

        Returns ``self`` so calls can be chained.
        NOTE(review): when loading a saved index, ``dim`` must be supplied by
        the caller — it is not inferred from the file.
        """
        if os.path.exists(index_pth):
            logger.info(f"LOADING INDEX FROM {index_pth}")
            self._load_saved_index(index_pth, dim, ef)
        else:
            logger.info("GENERATING INDEX")
            source_embeddings = self.approach.run(
                self.source_doc.get_normalised_sentences()
            )
            self.index_embedding(
                source_embeddings, index_pth, ef_construction=ef_construction, m=m, ef=ef
            )
        return self

    def query(self, nn=5):
        """Embed suspicious sentences and fetch their ``nn`` nearest source sentences.

        Returns an :class:`ExtrinsicOutput` with neighbour indices and
        similarity scores (1 - cosine distance).
        """
        logger.info("VECTORIZATION IN PROGRESS")
        suspicious_embeddings = self.approach.run(
            self.suspicious_doc.get_normalised_sentences()
        )

        logger.info("QUERYING DATA")
        neighbours, distances = self.index.knn_query(suspicious_embeddings, nn)

        # hnswlib returns cosine distances; convert to similarities.
        return ExtrinsicOutput(neighbours, 1 - distances)

    def save(self, pth, extrinsic_output: ExtrinsicOutput, distance_threshold=0.20):
        """Write matches with similarity >= ``distance_threshold`` to ``pth`` as CSV.

        NOTE(review): despite the name, the threshold is applied to the
        similarity score, not the distance.
        """
        logger.info(f"SAVING IN PROGRESS AT {pth}")

        keep = np.where(extrinsic_output.score >= distance_threshold)
        # Row index of `keep` is the suspicious sentence; the stored value is
        # the matching source sentence index.
        sus_idx = keep[0]
        src_idx = extrinsic_output.nn[keep]

        rows = np.column_stack(
            [
                self.suspicious_doc.get_file_names()[sus_idx],
                self.source_doc.get_file_names()[src_idx],
                self.suspicious_doc.get_sentences()[sus_idx],
                self.source_doc.get_sentences()[src_idx],
                np.round(extrinsic_output.score[keep], 2),
            ]
        )
        pd.DataFrame(rows, columns=self._header).to_csv(pth)
136+
137+
138+
class Intrinsic(Plagiarism):
    """Intrinsic detector.

    Flags sentences whose style embedding is dissimilar (cosine similarity at
    or below ``min_threshold``) from the document's mean style vector.
    """

    def __init__(
        self,
        suspicious_doc: Optional[SuspiciousDocumentCollection] = None,
        vector_model=None,
        min_threshold: float = 0.60,
        ignore_sentence_with_len: int = 500,
    ):
        super().__init__(None, suspicious_doc, vector_model)
        # Column order of the CSV written by save().
        self._header = [
            "suspicious_filename",
            "plagarised",
        ]
        self._min_threshold = min_threshold
        self._ignore_sentence_with_len = ignore_sentence_with_len

    def query(self, **kwargs):
        """Scan every suspicious file and collect stylistic-outlier sentences.

        Files whose sentence count reaches ``ignore_sentence_with_len`` are
        skipped entirely. A file with no outliers contributes a single
        ("NONE") row so it still appears in the output.
        """
        file_names = []
        flagged = []
        logger.info("QUERYING DATA")
        for file_name, sentences in self.suspicious_doc.sentence_per_file_gen():
            # NOTE(review): the name suggests sentence *length*, but the check
            # is on the number of sentences in the file — confirm intent.
            if len(sentences) >= self._ignore_sentence_with_len:
                continue

            embeddings = self.approach.run(sentences)
            document_style = embeddings.mean(axis=0).reshape(1, -1)
            similarity = cos_sim(document_style, embeddings).numpy().flatten()

            outliers = list(
                sentences[np.where(similarity <= self._min_threshold)]
            )

            if outliers:
                file_names.extend([file_name] * len(outliers))
                flagged.extend(outliers)
            else:
                file_names.append(file_name)
                flagged.append("NONE")

        return IntrinsicOutput(file_names, flagged)

    def save(self, pth, intrinsic_output: IntrinsicOutput, **kwargs):
        """Write (file name, flagged sentence) pairs to ``pth`` as CSV."""
        pd.DataFrame(
            np.column_stack(
                [
                    intrinsic_output.file_names,
                    intrinsic_output.sentences,
                ]
            ),
            columns=self._header,
        ).to_csv(pth)
188+
189+
190+
def extrinsic_plg(
    source_doc_pth,
    suspicious_doc_pth,
    source_doc_dir: list,
    suspicious_doc_dir: list,
    index_pth: str,
    save_pth: str,
    vector_model,
    distance_threshold: float = 0.90,
):
    """End-to-end extrinsic pipeline.

    Loads/extracts sentences for both collections, builds or loads the ANN
    index at ``index_pth``, queries it, and writes matches scoring at least
    ``distance_threshold`` to ``save_pth``.
    """
    source_doc = SourceDocumentCollection(
        pth=source_doc_pth,
        dir_iter=source_doc_dir,
    ).extract_sentences()

    suspicious_doc = SuspiciousDocumentCollection(
        pth=suspicious_doc_pth,
        dir_iter=suspicious_doc_dir,
    ).extract_sentences()

    detector = Extrinsic(source_doc, suspicious_doc, vector_model=vector_model)
    detector.nn_index(index_pth)
    output = detector.query()
    detector.save(save_pth, output, distance_threshold=distance_threshold)
217+
218+
219+
def intrinsic_plg(
    suspicious_pth: str,
    suspicious_dir: list,
    features: list,
    save_pth: str = "intrinsic_output.csv",
):
    """End-to-end intrinsic pipeline.

    Extracts sentences from the suspicious documents, runs the style-based
    intrinsic detector with the given stylometric ``features``, and writes the
    flagged sentences to ``save_pth``.

    Fix: the output path was hard-coded to ``"intrinsic_output.csv"``,
    ignoring the config's ``intrinsic.save`` entry; ``save_pth`` makes it
    configurable (the default preserves the old behaviour) and mirrors
    ``extrinsic_plg``'s ``save_pth`` parameter.
    """
    suspicious_doc = SuspiciousDocumentCollection(
        pth=suspicious_pth,
        dir_iter=suspicious_dir,
    ).extract_sentences()

    ii = Intrinsic(suspicious_doc=suspicious_doc, vector_model=StyleEmbedding(features))
    op = ii.query()
    ii.save(save_pth, op)

0 commit comments

Comments
 (0)