diff --git a/.gitignore b/.gitignore
index b6e4761..9f4d854 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,6 @@
 # Byte-compiled / optimized / DLL files
+dataset/*
+.idea/*
 __pycache__/
 *.py[cod]
 *$py.class
diff --git a/README.md b/README.md
index 67c158a..16f3d8e 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,95 @@
 # Plagarism Detection
+Code for detecting extrinsic and intrinsic plagiarism.
+### Requirements
+```bash
+pip install -r requirements.txt
+```
+If you encounter the error `ImportError: cannot import name 'complexity' from 'cophi'`,
+run `pip install cophi==1.2.3`.
+
+### Config
+```yaml
+extrinsic:
+  source:
+    # directories containing the source .txt files
+    dir:
+      - dataset/subset/subset_1/sou_1
+      - dataset/subset/subset_2/sou_2
+      - dataset/subset/subset_3/sou_3
+
+    # If pth does not exist, pre-processing is run and the result is saved to this CSV,
+    # so subsequent runs skip the processing and read the data from the CSV
+    pth: dataset/source_sent_all_three_subset.csv
+
+  suspicious:
+    # directories containing the suspicious .txt files
+    dir:
+      - dataset/subset/subset_1/sus_1
+
+    # If pth does not exist, pre-processing is run and the result is saved to this CSV,
+    # so subsequent runs skip the processing and read the data from the CSV
+    pth: dataset/suspicious_sent.csv
+
+  index: dataset/output/se_index_subset_1_2_3.index
+  save: dataset/output/set1/SE/se_output_subset_1_with_all_three_source.csv
+
+intrinsic:
+  suspicious:
+    # directories containing the suspicious .txt files
+    dir:
+      - dataset/pan-plagiarism-corpus-2009.part3/pan-plagiarism-corpus-2009/intrinsic-analysis-corpus/suspicious-documents
+      - dataset/pan-plagiarism-corpus-2009.part2/pan-plagiarism-corpus-2009/intrinsic-analysis-corpus/suspicious-documents
+      - dataset/pan-plagiarism-corpus-2009/intrinsic-analysis-corpus/suspicious-documents
+
+    # If pth does not exist, pre-processing is run and the result is saved to this CSV,
+    # so subsequent runs skip the processing and read the data from the CSV
+    pth: path/to/suspicious_sent_intrinsic.csv
+
+  save: path/to/save/intrinsic_output.csv
+
+  features:
+    - automated_readability_index
+    - average_sentence_length_chars
+    - average_sentence_length_words
+    - average_syllables_per_word
+    - average_word_frequency_class
+    - average_word_length
+    - coleman_liau_index
+    - flesch_reading_ease
+    - functionword_frequency
+    - linsear_write_formula
+    - most_common_words_without_stopwords
+    - number_frequency
+    - punctuation_frequency
+    - sentence_length_distribution
+    - special_character_frequency
+    - stopword_ratio
+    - top_3_gram_frequency
+    - top_bigram_frequency
+    - top_word_bigram_frequency
+    - uppercase_frequency
+    - word_length_distribution
+    - yule_k_metric
+
+evaluation:
+  results: path/where/results.csv
+  ground_truth: path/where/ground_truth.csv
+```
+### Run Extrinsic
+```bash
+# Use TF-IDF hashing features
+python run.py extrinsic_tfidf --config path/to/config.yaml
+
+# Use sentence-transformer (all-MiniLM-L6-v2) embeddings as features
+python run.py extrinsic_se --config path/to/config.yaml
+```
+
+### Run Intrinsic
+```bash
+python run.py intrinsic --config path/to/config.yaml
+```
+
+### Evaluate
+```bash
+python run.py evaluation --config path/to/config.yaml
+```
\ No newline at end of file
diff --git a/config.yaml b/config.yaml
new file mode 100644
index 0000000..758826b
--- /dev/null
+++ b/config.yaml
@@ -0,0 +1,53 @@
+extrinsic:
+  source:
+    dir:
+      - dataset/subset/subset_1/sou_1
+      - dataset/subset/subset_2/sou_2
+      - dataset/subset/subset_3/sou_3
+    pth: dataset/source_sent_all_three_subset.csv
+
+  suspicious:
+    dir:
+      - dataset/subset/subset_1/sus_1
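+    # pth acts as a cache: created on the first run, reused on later runs so the
+    # .txt parsing and normalisation are skipped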
pth: dataset/suspicious_sent.csv + + index: dataset/output/se_index_subset_1_2_3.index + save: dataset/output/set1/SE/se_output_subset_1_with_all_three_source.csv + +intrinsic: + suspicious: + dir: + - dataset/pan-plagiarism-corpus-2009.part3/pan-plagiarism-corpus-2009/intrinsic-analysis-corpus/suspicious-documents + - dataset/pan-plagiarism-corpus-2009.part2/pan-plagiarism-corpus-2009/intrinsic-analysis-corpus/suspicious-documents + - dataset/pan-plagiarism-corpus-2009/intrinsic-analysis-corpus/suspicious-documents + pth: dataset/suspicious_sent_intrinsic.csv + + save: dataset/output/intrinsic_output.csv + + features: + - automated_readability_index + - average_sentence_length_chars + - average_sentence_length_words + - average_syllables_per_word + - average_word_frequency_class + - average_word_length + - coleman_liau_index + - flesch_reading_ease + - functionword_frequency + - linsear_write_formula + - most_common_words_without_stopwords + - number_frequency + - punctuation_frequency + - sentence_length_distribution + - special_character_frequency + - stopword_ratio + - top_3_gram_frequency + - top_bigram_frequency + - top_word_bigram_frequency + - uppercase_frequency + - word_length_distribution + - yule_k_metric + +evaluation: + results: dataset/results.csv + ground_truth: dataset/ground_truth.csv \ No newline at end of file diff --git a/plagiarism/__init__.py b/plagiarism/__init__.py new file mode 100644 index 0000000..8648fba --- /dev/null +++ b/plagiarism/__init__.py @@ -0,0 +1,8 @@ +import logging + +logging.basicConfig( + format="[%(levelname)s][%(asctime)s]:%(message)s", + level=logging.INFO, + datefmt="%Y-%m-%d %H:%M:%S", +) +logger = logging.getLogger() diff --git a/plagiarism/detector.py b/plagiarism/detector.py new file mode 100644 index 0000000..b63d9d7 --- /dev/null +++ b/plagiarism/detector.py @@ -0,0 +1,227 @@ +import logging +import os +from dataclasses import dataclass +from typing import Optional, Any, List + +import hnswlib +import numpy as np +import pandas as pd +from sentence_transformers.util import cos_sim + +from plagiarism.doc import SourceDocumentCollection, SuspiciousDocumentCollection +from plagiarism.vectorizer import StyleEmbedding + +logger = logging.getLogger() + + +@dataclass +class ExtrinsicOutput: + nn: Any + score: Any + + +@dataclass +class IntrinsicOutput: + file_names: List + sentences: List + + +class Plagiarism: + def __init__( + self, + source_doc: Optional[SourceDocumentCollection] = None, + suspicious_doc: Optional[SuspiciousDocumentCollection] = None, + approach=None, + ): + + self.source_doc = source_doc + self.suspicious_doc = suspicious_doc + self.approach = approach + + self.index = None + + def query(self, **kwargs): + raise NotImplementedError + + def save(self, **kwargs): + raise NotImplementedError + + +class Extrinsic(Plagiarism): + def __init__( + self, + source_doc: Optional[SourceDocumentCollection] = None, + suspicious_doc: Optional[SuspiciousDocumentCollection] = None, + vector_model=None, + ): + super().__init__(source_doc, suspicious_doc, vector_model) + self._header = [ + "suspicious_filename", + "plagarised_filename", + "suspicious", + "plagarised", + "score", + ] + + def index_embedding(self, embeddings, pth, ef_construction=400, m=64, ef=50): + n, dim = embeddings.shape + self.index = hnswlib.Index(space="cosine", dim=dim) + self.index.init_index(max_elements=n, ef_construction=ef_construction, M=m) + self.index.add_items(embeddings, list(range(n))) + logger.info(f"SAVING GENERATED INDEX AT {pth}") + 
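+        # ef_construction and M trade index build time/memory for recall; ef bounds the
+        # candidate list searched at query time. The index is persisted here so nn_index()
+        # can reload it with load_index() on later runs instead of re-embedding the sources.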
self.index.save_index(pth) + + self.index.set_ef(ef) + + def _load_saved_index(self, pth, dim, ef): + self.index = hnswlib.Index(space="cosine", dim=dim) + self.index.load_index(pth) + self.index.set_ef(ef) + + def nn_index( + self, index_pth, dim: int = None, ef_construction=400, m=64, ef=50, **kwargs + ): + if os.path.exists(index_pth): + logger.info(f"LOADING INDEX FROM {index_pth}") + self._load_saved_index(index_pth, dim, ef) + else: + logger.info("GENERATING INDEX") + embeddings = self.approach.run(self.source_doc.get_normalised_sentences()) + self.index_embedding( + embeddings, index_pth, ef_construction=ef_construction, m=m, ef=ef + ) + return self + + def query(self, nn=5): + logger.info("VECTORIZATION IN PROGRESS") + embeddings = self.approach.run(self.suspicious_doc.get_normalised_sentences()) + + logger.info("QUERYING DATA") + nn, distances = self.index.knn_query(embeddings, nn) + + return ExtrinsicOutput(nn, 1 - distances) + + def save(self, pth, extrinsic_output: ExtrinsicOutput, distance_threshold=0.20): + logger.info(f"SAVING IN PROGRESS AT {pth}") + + filtered_output_idx = np.where(extrinsic_output.score >= distance_threshold) + + suspicious_sentences_idx = filtered_output_idx[0] + source_sentences_idx = extrinsic_output.nn[filtered_output_idx] + + suspicious_sentences_filtered = self.suspicious_doc.get_sentences()[ + suspicious_sentences_idx + ] + source_sentences_filtered = self.source_doc.get_sentences()[ + source_sentences_idx + ] + + suspicious_file_filtered = self.suspicious_doc.get_file_names()[ + suspicious_sentences_idx + ] + source_file_filtered = self.source_doc.get_file_names()[source_sentences_idx] + + pd.DataFrame( + np.column_stack( + [ + suspicious_file_filtered, + source_file_filtered, + suspicious_sentences_filtered, + source_sentences_filtered, + np.round(extrinsic_output.score[filtered_output_idx], 2), + ] + ), + columns=self._header, + ).to_csv(pth) + + +class Intrinsic(Plagiarism): + def __init__( + self, + suspicious_doc: Optional[SuspiciousDocumentCollection] = None, + vector_model=None, + min_threshold: float = 0.60, + ignore_sentence_with_len: int = 500, + ): + super().__init__(None, suspicious_doc, vector_model) + self._header = [ + "suspicious_filename", + "plagarised", + ] + self._min_threshold = min_threshold + self._ignore_sentence_with_len = ignore_sentence_with_len + + def query(self, **kwargs): + plagiarised_sent = [] + file_names = [] + logger.info("QUERYING DATA") + for file_name, sentences in self.suspicious_doc.sentence_per_file_gen(): + if len(sentences) < self._ignore_sentence_with_len: + + embeddings = self.approach.run(sentences) + mean_embeddings = embeddings.mean(axis=0).reshape(1, -1) + cosine_scores = cos_sim(mean_embeddings, embeddings).numpy().flatten() + + plagiarised = list( + sentences[np.where(cosine_scores <= self._min_threshold)] + ) + + if len(plagiarised) > 0: + file_names.extend([file_name] * len(plagiarised)) + plagiarised_sent.extend(plagiarised) + else: + file_names.extend([file_name]) + plagiarised_sent.extend(["NONE"]) + + return IntrinsicOutput(file_names, plagiarised_sent) + + def save(self, pth, intrinsic_output: IntrinsicOutput, **kwargs): + pd.DataFrame( + np.column_stack( + [ + intrinsic_output.file_names, + intrinsic_output.sentences, + ] + ), + columns=self._header, + ).to_csv(pth) + + +def extrinsic_plg( + source_doc_pth, + suspicious_doc_pth, + source_doc_dir: list, + suspicious_doc_dir: list, + index_pth: str, + save_pth: str, + vector_model, + distance_threshold: float = 0.90, +): + source_doc = 
SourceDocumentCollection( + pth=source_doc_pth, + dir_iter=source_doc_dir, + ).extract_sentences() + + suspicious_doc = SuspiciousDocumentCollection( + pth=suspicious_doc_pth, dir_iter=suspicious_doc_dir + ).extract_sentences() + + ex = Extrinsic(source_doc, suspicious_doc, vector_model=vector_model) + ex.nn_index(index_pth) + ex_op = ex.query() + ex.save( + save_pth, + ex_op, + distance_threshold=distance_threshold, + ) + + +def intrinsic_plg(suspicious_pth: str, suspicious_dir: list, features: list): + suspicious_doc = SuspiciousDocumentCollection( + pth=suspicious_pth, + dir_iter=suspicious_dir, + ).extract_sentences() + + ii = Intrinsic(suspicious_doc=suspicious_doc, vector_model=StyleEmbedding(features)) + op = ii.query() + ii.save("intrinsic_output.csv", op) diff --git a/plagiarism/doc.py b/plagiarism/doc.py new file mode 100644 index 0000000..92be20d --- /dev/null +++ b/plagiarism/doc.py @@ -0,0 +1,109 @@ +import logging +import os +from typing import List + +import numpy as np +import pandas as pd +from tqdm import tqdm + +from plagiarism.util import get_sentences_from_df, generate_para_df, normalize_data + +logger = logging.getLogger() + + +class DocumentCollection: + def __init__(self, pth, dir_iter: List): + self._df = None + self.pth = pth + self.dir_iter = dir_iter + + @staticmethod + def _sentences(files): + logger.info("SENTENCE GENERATION") + df = pd.DataFrame() + for i, file in enumerate(tqdm(files)): + df1 = pd.DataFrame() + + sentences = get_sentences_from_df(generate_para_df(file)) + tokenized_sentences = dict() + + for j, sent in enumerate(sentences): + tokenized_text = normalize_data(sent) + if len(tokenized_text) >= 5: + tokenized_sentences[sent] = " ".join(tokenized_text) + + df1["filename"] = [str(os.path.basename(file))] * len(tokenized_sentences) + df1["sentences"] = list(tokenized_sentences.keys()) + df1["normalised"] = list(tokenized_sentences.values()) + + df = pd.concat([df, df1], ignore_index=True, sort=False) + df["idx"] = range(0, len(df)) + return df + + def get_sentences(self) -> np.ndarray: + return self._df["sentences"].to_numpy() + + def get_normalised_sentences(self) -> np.ndarray: + return self._df["normalised"].to_numpy() + + def get_filename(self, idx): + return self._df.loc[self._df["idx"] == idx]["filename"].values[0] + + def get_file_names(self) -> np.ndarray: + return self._df["filename"].to_numpy() + + def extract_sentences(self): + raise NotImplementedError + + def sentence_per_file_gen(self) -> np.ndarray: + _file_name = list(pd.unique(self.get_file_names())) + for file in tqdm(_file_name): + yield _file_name, self._df.loc[self._df["filename"] == file][ + "normalised" + ].values + + +class SourceDocumentCollection(DocumentCollection): + def __init__(self, pth, dir_iter: List): + super().__init__(pth, dir_iter) + + def extract_sentences(self): + if os.path.exists(self.pth): + logger.info(f"READING FROM {self.pth}") + self._df = pd.read_csv(self.pth) + else: + files_collection = list() + for sub_dir in self.dir_iter: + for root, dirs, files in os.walk(sub_dir): + for file in files: + if file.endswith(".txt"): + if "source-document" in file: + files_collection.append(os.path.join(root, file)) + + self._df = self._sentences(files_collection) + logger.info(f"Saving Generated sentences at {self.pth}") + self._df.to_csv(self.pth) + return self + + +class SuspiciousDocumentCollection(DocumentCollection): + def __init__(self, pth, dir_iter: List): + super().__init__(pth, dir_iter) + + def extract_sentences(self): + if os.path.exists(self.pth): + 
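+            # The CSV at self.pth is the sentence cache written by a previous run;
+            # reuse it instead of re-walking the corpus and re-normalising every document.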
logger.info(f"READING FROM {self.pth}") + self._df = pd.read_csv(self.pth) + else: + files_collection = list() + for sub_dir in self.dir_iter: + for root, dirs, files in os.walk(sub_dir): + for file in files: + if file.endswith(".txt"): + if "suspicious-document" in file: + files_collection.append(os.path.join(root, file)) + + self._df = self._sentences(files_collection) + logger.info(f"Saving Generated sentences at {self.pth}") + self._df.to_csv(self.pth) + return self diff --git a/plagiarism/util.py b/plagiarism/util.py new file mode 100644 index 0000000..9d69ca9 --- /dev/null +++ b/plagiarism/util.py @@ -0,0 +1,213 @@ +import string +import re +import nltk +from typing import List + +import pandas as pd +from nltk import word_tokenize +from nltk.corpus import stopwords +from tqdm import tqdm + +INPUT_COL = "para" +PARA_COL = "para" + +pattern_digits = r"\d+(nd|th|st)*" +pattern_space = r"\s{2,}" +pattern_special_chars = r"[^\w\s]|(_)+" +pattern_url = r"(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b" + +alphabets = "([A-Za-z])" +prefixes = "(Mr|St|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|Mt)[.]" +suffixes = "(Inc|Ltd|Jr|Sr|Co)" +starters = "(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)" +acronyms = "([A-Z][.][A-Z][.](?:[A-Z][.])?)" +websites = "[.](com|net|org|io|gov|me|edu)" +digits = "([0-9])" + +nltk.download("stopwords") +nltk.download("punkt") +nltk.download("wordnet") +nltk.download("omw-1.4") + + +def split_into_sentences(text): + text = " " + text + " " + text = text.replace("\n", " ") + text = re.sub(prefixes, "\\1", text) + text = re.sub(websites, "\\1", text) + if "Ph.D" in text: + text = text.replace("Ph.D.", "PhD") + text = re.sub("\s" + alphabets + "[.] ", " \\1 ", text) + text = re.sub(acronyms + " " + starters, "\\1 \\2", text) + text = re.sub( + alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]", + "\\1\\2\\3", + text, + ) + text = re.sub(alphabets + "[.]" + alphabets + "[.]", "\\1\\2", text) + text = re.sub(" " + suffixes + "[.] " + starters, " \\1 \\2", text) + text = re.sub(" " + suffixes + "[.]", " \\1", text) + text = re.sub(" " + alphabets + "[.]", " \\1", text) + text = re.sub(digits + "[.]" + digits, "\\1\\2", text) + if "e.g." in text: + text = text.replace("e.g.", "eg") + if "..." in text: + text = text.replace("...", "") + if "i.e." in text: + text = text.replace("i.e.", "ie") + if "”" in text: + text = text.replace(".”", "”.") + if '"' in text: + text = text.replace('."', '".') + if "!" in text: + text = text.replace('!"', '"!') + if "?" 
in text: + text = text.replace('?"', '"?') + text = text.replace(".", ".") + text = text.replace("?", "?") + text = text.replace("!", "!") + text = text.replace("", ".") + sentences = text.split("") + sentences = sentences[:-1] + sentences = [s.strip() for s in sentences] + return sentences + + +def generate_para_df(filepath): + para_content = list() + with open(filepath, "r", encoding="utf-8") as rf: + _content = [] + for line in rf: + if line == "\n": + para_content.append(" ".join(_content)) + _content = [] + else: + _content.append(line.strip()) + return pd.DataFrame(para_content, columns=[PARA_COL]) + + +def remove_symbols_numbers_letters_consonants(word_token: List): + clean_token = [] + for token in word_token: + token = token.lower() + new_token = re.sub(r"[^a-zA-Z]+", "", token) + if new_token != "" and len(new_token) >= 2: + vowels = len([v for v in new_token if v in "aeiou"]) + if vowels != 0: + clean_token.append(new_token) + return clean_token + + +def case_conversion(text: string): + return text.lower() + + +def apply_regex(text: string): + text = re.sub(pattern_url, "", text) + text = re.sub(pattern_digits, "", text) + text = re.sub(pattern_special_chars, " ", text) + text = re.sub(pattern_space, " ", text) + return text + + +def remove_stop_words(word_token: List): + stop_words = set(stopwords.words("english")) + words_filtered = [] + + for w in word_token: + if w not in stop_words: + words_filtered.append(w) + return words_filtered + + +def sentences_from_para(para): + return split_into_sentences(para) + + +def normalize_data(data: str): + text = case_conversion(data) + text = apply_regex(text) + + tokenized_text = word_tokenize(text) + tokenized_text = remove_symbols_numbers_letters_consonants(tokenized_text) + tokenized_text = remove_stop_words(tokenized_text) + return tokenized_text + + +def get_sentences_from_df(data): + _ip_sent = [] + for idx, row in data.iterrows(): + for sent in sentences_from_para(row[INPUT_COL]): + _ip_sent.append(sent) + return _ip_sent + + +def evaluation_iterator(results: str, ground_truth: str): + gt_df = pd.read_csv(ground_truth) + output_df = pd.read_csv(results) + suspicious_reference = gt_df.loc[gt_df.loc[:, "name"] == "artificial-plagiarism"][ + "suspicious_reference" + ].unique() + for i, sus in enumerate(tqdm(suspicious_reference)): + print(f"Loading: {i + 1}/{len(suspicious_reference)}") + temp_df = output_df.loc[ + output_df.loc[:, "suspicious_filename"] == sus.replace(".xml", ".txt") + ] + temp_gt_df = gt_df.loc[gt_df.loc[:, "suspicious_reference"] == sus] + temp_gt_df = temp_gt_df.loc[ + temp_gt_df.loc[:, "name"] == "artificial-plagiarism" + ] + suspicious_text = temp_df["suspicious"].unique() + suspicious_gt_text = "".join(temp_gt_df["suspicious_text"].to_list()) + + yield i, suspicious_text, suspicious_gt_text + + +def jaccard_similarity(text1: str, text2: str): + set1 = set(text1) + set2 = set(text2) + return float(len(set1.intersection(set2)) / len(set1.union(set2))) + + +def precision(results: str, ground_truth: str) -> List: + scores = [] + for i, suspicious_text, suspicious_gt_text in evaluation_iterator( + results, ground_truth + ): + match_len = 0 + for j, suspicious_sentence in enumerate(suspicious_text): + if suspicious_gt_text.find(suspicious_sentence.strip()) != -1: + match_len += len(suspicious_sentence.strip()) + + scores.append(match_len / len(suspicious_gt_text)) + + return scores + + +def recall(results: str, ground_truth: str) -> List: + scores = [] + for i, suspicious_text, suspicious_gt_text in 
evaluation_iterator( + results, ground_truth + ): + tp = 0 + fn = 0 + for j, suspicious_sentence in enumerate(suspicious_text): + if suspicious_gt_text.find(suspicious_sentence.strip()) != -1: + tp += 1 + else: + fn += 1 + if tp + fn == 0: + if len(suspicious_gt_text) == 0: + scores.append(1) + else: + scores.append(0) + continue + scores.append(tp / (tp + fn)) + return scores + + +def metric(results: str, ground_truth: str): + return { + "recall": recall(results, ground_truth), + "precision": precision(results, ground_truth), + } diff --git a/plagiarism/vectorizer.py b/plagiarism/vectorizer.py new file mode 100644 index 0000000..a358f9c --- /dev/null +++ b/plagiarism/vectorizer.py @@ -0,0 +1,48 @@ +from typing import Optional, List + +import numpy as np +from authorstyle import get_feature_vector, all_feature_functions, Text +from sentence_transformers import SentenceTransformer +from sklearn.feature_extraction.text import HashingVectorizer +from sklearn.preprocessing import MinMaxScaler + + +class Vectorizer: + def run(self, **kwargs): + raise NotImplementedError + + +class SE(Vectorizer): + def __init__(self, model_id: Optional[str] = "all-MiniLM-L6-v2"): + self._model = SentenceTransformer(model_id) + + def run(self, sentences: List, **kwargs): + _embeddings = list() + return self._model.encode(sentences) + + +class TFIDFHashing(Vectorizer): + def __init__(self, n_features=20): + self._model = HashingVectorizer(n_features=n_features) + + def run(self, sentences: List, **kwargs): + return self._model.fit_transform(sentences).toarray() + + +class StyleEmbedding(Vectorizer): + def __init__(self, features_to_use: List, **kwargs): + self._features_to_use = [] + for feat in all_feature_functions(): + if feat.__name__ in features_to_use: + self._features_to_use.append(feat) + + def run(self, sentences: List, **kwargs): + tex = Text(" ".join(sentences)) + tex.set_sliding_window(window_size=5, step_size=1, unit=sentences) + feat_vec = np.array( + get_feature_vector(tex, self._features_to_use, fragments=True) + ) + # normalised_feat_vec = self._norm_func.fit_transform( + # feat_vec.astype(np.float32).T + # ).T + return feat_vec / np.linalg.norm(feat_vec, axis=-1).reshape(-1, 1) diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..1431262 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,9 @@ +hnswlib == 0.6.2 +nltk == 3.7 +pandas == 1.3.5 +scikit_learn == 1.0.2 +sentence_transformers == 2.2.0 +tqdm == 4.64.0 +authorstyle==0.2 +cophi==1.2.3 +numpy diff --git a/run.py b/run.py new file mode 100644 index 0000000..ac322a6 --- /dev/null +++ b/run.py @@ -0,0 +1,52 @@ +import fire +from omegaconf import OmegaConf + +from plagiarism.detector import extrinsic_plg, intrinsic_plg +from plagiarism.util import metric +from plagiarism.vectorizer import TFIDFHashing, SE + + +def extrinsic_se(config): + conf = OmegaConf.load(config) + + extrinsic_plg( + conf.extrinsic.source.pth, + conf.extrinsic.suspicious.pth, + conf.extrinsic.source.dir, + conf.extrinsic.suspicious.dir, + conf.extrinsic.index, + conf.extrinsic.save, + SE(), + ) + + +def extrinsic_tfidf(config): + conf = OmegaConf.load(config) + + extrinsic_plg( + conf.extrinsic.source.pth, + conf.extrinsic.suspicious.pth, + conf.extrinsic.source.dir, + conf.extrinsic.suspicious.dir, + conf.extrinsic.index, + conf.extrinsic.save, + TFIDFHashing(), + ) + + +def intrinsic(config): + conf = OmegaConf.load(config) + intrinsic_plg( + conf.intrinsic.suspicious.pth, + conf.intrinsic.suspicious.dir, + conf.intrinsic.features, + ) + + 
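+# Each top-level function in this module is exposed as a sub-command by fire.Fire()
+# at the bottom of the file, e.g.:
+#   python run.py extrinsic_se --config config.yaml
+#   python run.py intrinsic --config config.yaml
+#   python run.py evaluation --config config.yaml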
+def evaluation(config): + conf = OmegaConf.load(config) + print(metric(**conf.evaluation)) + + +if __name__ == "__main__": + fire.Fire()
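For reference, a minimal sketch of driving the extrinsic pipeline programmatically rather than through `run.py`. The dataset paths are placeholders assuming the PAN-style layout referenced in `config.yaml`; only functions and classes introduced above (`extrinsic_plg`, `SE`) are used.
```python
from plagiarism.detector import extrinsic_plg
from plagiarism.vectorizer import SE

extrinsic_plg(
    source_doc_pth="dataset/source_sent.csv",        # sentence cache, created on first run
    suspicious_doc_pth="dataset/suspicious_sent.csv",
    source_doc_dir=["dataset/subset/subset_1/sou_1"],
    suspicious_doc_dir=["dataset/subset/subset_1/sus_1"],
    index_pth="dataset/output/se_index.index",       # hnswlib index, reused if present
    save_pth="dataset/output/se_output.csv",
    vector_model=SE(),            # SentenceTransformer("all-MiniLM-L6-v2") embeddings
    distance_threshold=0.90,      # keep pairs with cosine similarity >= 0.90
)
```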