
Commit 672fb71

Merge pull request #30 from nbcstevenchen/main
Create Semantic Ranking for Title, Summary, Keywords, and Conference Name Searching
2 parents 04ce6ad + 31f92d3 commit 672fb71

File tree

6 files changed: +42893 -0 lines changed


cncf-youtube-channel-summarizer/data/cncf_video_summary_combine.csv

Lines changed: 42769 additions & 0 deletions
Large diffs are not rendered by default.
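
The combined CSV itself is not rendered, but its schema can be inferred from the scripts in this commit. A minimal sketch for inspecting it locally (the column list is inferred, not shown in the diff):

import pandas as pd

# Schema inferred from the scripts below; the CSV is too large to render here
df = pd.read_csv('cncf-youtube-channel-summarizer/data/cncf_video_summary_combine.csv')
print(df.columns.tolist())  # expected to include: video_id, video_title, conference_name, summary, keywords, merge
print(len(df))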
Binary file not shown.
Lines changed: 18 additions & 0 deletions

from sentence_transformers import SentenceTransformer
import pandas as pd
import pickle


def embedding_generator(model, data):
    # Load the bi-encoder and truncate long passages to 256 tokens
    bi_encoder = SentenceTransformer(model)
    bi_encoder.max_seq_length = 256

    ##### Semantic Search #####
    # Encode the passages with the bi-encoder so they can be searched later
    embeddings = bi_encoder.encode(data, convert_to_tensor=True, show_progress_bar=True)
    return embeddings


if __name__ == "__main__":
    dataset = pd.read_csv('data/cncf_video_summary_combine.csv')
    embeddings = embedding_generator('multi-qa-MiniLM-L6-cos-v1', dataset['merge'])
    with open('data/embedding.pkl', 'wb') as f:
        # Move to CPU before converting to NumPy in case encoding ran on GPU
        pickle.dump(embeddings.cpu().numpy(), f)
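
Since the embeddings are pickled as a NumPy array, a quick sanity check after running the script is to reload the file and inspect its shape (a hedged sketch; the 384 dimension is the output size of multi-qa-MiniLM-L6-cos-v1):

import pickle

# Assumes data/embedding.pkl was produced by the script above
with open('data/embedding.pkl', 'rb') as f:
    embeddings = pickle.load(f)
print(embeddings.shape)  # expected: (num_videos, 384) for multi-qa-MiniLM-L6-cos-v1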
Lines changed: 21 additions & 0 deletions

import os

import pandas as pd


def merge_csv():
    # Collect every CSV under data/, skipping the combined output file so a
    # re-run does not merge the previous result into itself
    csv_list = []
    for filename in os.listdir('data/'):
        if filename.endswith('.csv') and filename != 'cncf_video_summary_combine.csv':
            csv_list.append('data/' + filename)

    dataframes = [pd.read_csv(each_file) for each_file in csv_list]
    merged_df = pd.concat(dataframes, ignore_index=True)
    # Concatenate the searchable fields into a single 'merge' column
    merged_df['merge'] = (merged_df['video_title'] + ' ' + merged_df['conference_name']
                          + ' ' + merged_df['summary'] + ' ' + merged_df['keywords'])
    merged_df.to_csv('data/cncf_video_summary_combine.csv', index=False)


if __name__ == "__main__":
    merge_csv()
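
One caveat worth noting: if any of the four source columns contains NaN, the string concatenation above propagates NaN into 'merge', which would later break tokenization in the search script. A minimal hedged guard (not part of the commit) is to blank out missing fields before building the column:

# Hedged addition, not in the commit: fill missing fields before concatenation
for col in ['video_title', 'conference_name', 'summary', 'keywords']:
    merged_df[col] = merged_df[col].fillna('')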

cncf-youtube-channel-summarizer/requirements.txt

Lines changed: 3 additions & 0 deletions

@@ -6,3 +6,6 @@ tenacity==8.2.3
 tqdm==4.64.0
 youtube_transcript_api==0.6.2
 transformers==4.41.1
+nltk==3.8.1
+rank_bm25==0.2.2
+sentence_transformers==3.0.1
Lines changed: 82 additions & 0 deletions

import pickle

import nltk
import pandas as pd
from nltk.tokenize import word_tokenize
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer, util

# Download required NLTK tokenizer data
nltk.download('punkt')


class BM25:
    def __init__(self, dataset, top_k=5):
        self.dataset = dataset
        self.top_k = top_k
        self.tokenized_corpus = [self.preprocess_text(doc) for doc in dataset['merge']]

    def preprocess_text(self, text):
        # Lowercase and tokenize a document or query
        return word_tokenize(text.lower())

    def search(self, query, bm25):
        # Score every document against the query and keep the top-k indices
        tokenized_query = self.preprocess_text(query)
        scores = bm25.get_scores(tokenized_query)
        top_n_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:self.top_k]

        results = []
        video_ids = []
        for i in top_n_indices:
            results.append((self.dataset['merge'][i], scores[i]))
            video_ids.append(self.dataset.loc[i]['video_id'])
        print(results)
        return video_ids

    def run(self, query):
        # Build the BM25 index over the tokenized corpus, then search it
        bm25 = BM25Okapi(self.tokenized_corpus)
        return self.search(query, bm25)


class BIENCODER:
    def __init__(self, dataset, embeddings, top_k=5):
        self.dataset = dataset
        self.embeddings = embeddings
        self.top_k = top_k
        self.bi_encoder = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
        self.bi_encoder.max_seq_length = 256

    def search(self, query):
        print("Input question:", query)
        question_embedding = self.bi_encoder.encode(query, convert_to_tensor=True)
        # question_embedding = question_embedding.cuda()  # uncomment to score on GPU
        # Compare the query embedding against the precomputed corpus embeddings
        hits = util.semantic_search(question_embedding, self.embeddings, top_k=self.top_k)
        hits = hits[0]  # hits for the first (and only) query

        print("\n-------------------------\n")
        print(f"Top-{self.top_k} bi-encoder retrieval hits")
        hits = sorted(hits, key=lambda x: x['score'], reverse=True)
        video_ids = []
        for hit in hits:
            print("\t{:.3f}\t{}".format(hit['score'], self.dataset['merge'][hit['corpus_id']]))
            video_ids.append(self.dataset.loc[hit['corpus_id']]['video_id'])
        return video_ids


if __name__ == "__main__":
    query = 'CNCF Webinars'  # input query
    dataset = pd.read_csv('data/cncf_video_summary_combine.csv')

    print('Method 1: BM25 keyword ranking:')
    bm25_search = BM25(dataset, top_k=5)
    video_ids = bm25_search.run(query)
    print(video_ids)

    print('Method 2: bi-encoder semantic search:')
    with open('data/embedding.pkl', 'rb') as f:
        embeddings = pickle.load(f)
    video_ids = BIENCODER(dataset, embeddings).search(query)
    print(video_ids)
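
The script prints two independent top-k lists, one lexical and one semantic. A common way to combine such lists is reciprocal rank fusion; the sketch below is hypothetical and not part of this commit:

# Hypothetical helper, not in the commit: fuse two ranked video_id lists
def reciprocal_rank_fusion(rankings, k=60):
    scores = {}
    for ranking in rankings:
        for rank, video_id in enumerate(ranking):
            scores[video_id] = scores.get(video_id, 0.0) + 1.0 / (k + rank + 1)
    return sorted(scores, key=scores.get, reverse=True)

# Example: fused_ids = reciprocal_rank_fusion([bm25_ids, bi_encoder_ids])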

0 commit comments
