Skip to content

Commit 997acea

Browse files
authored
Merge pull request #21 from nbcstevenchen/main
Make Summarization in Parallel
2 parents 531fb32 + 6d9c44d commit 997acea

File tree

5 files changed

+206
-2
lines changed

5 files changed

+206
-2
lines changed
Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
# Manually triggered workflow that summarizes CNCF video transcripts in
# parallel. Every matrix job runs the summarizer on one 10-video chunk,
# uploads the per-chunk outputs as artifacts and commits them to the repo.
name: Run summmarization-parallel
env:
  WATSONX_KEY: ${{secrets.WATSONX_KEY}}
  WATSONX_PROJECT_ID: ${{secrets.WATSONX_PROJECT_ID}}
  WATSONX_URL: ${{secrets.WATSONX_URL}}
on:
  workflow_dispatch:

permissions:
  pull-requests: write
  contents: write
  repository-projects: write
  packages: write

jobs:
  run-script:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        # Start index of each chunk. The script slices [start, end] with an
        # inclusive end and end = start + 9, so starts must be 10 apart;
        # otherwise neighbouring chunks overlap and the last video is skipped.
        index: [0, 10, 20, 30, 40]

    steps:
      - name: Checkout code
        uses: actions/checkout@v3

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.10'  # Specify the Python version you need

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r cncf-youtube-channel-summarizer/requirements.txt

      - name: Run script for iteration
        run: |
          start=${{ matrix.index }}
          end=$((start + 9))
          # Export the end index so later steps can use it in artifact names
          # (the matrix itself only defines `index`).
          echo "END_INDEX=$end" >> "$GITHUB_ENV"
          echo "Running script with arguments: $start, $end"
          python cncf-youtube-channel-summarizer/transcript_summarizer_parallel.py $start $end

      - name: Upload Summaries
        uses: actions/upload-artifact@v4
        with:
          name: cncf_video_summary_${{ matrix.index }}_${{ env.END_INDEX }}.csv
          path: cncf-youtube-channel-summarizer/data/cncf_video_summary_${{ matrix.index }}.csv

      - name: Upload missed_video_id
        uses: actions/upload-artifact@v4
        with:
          name: missed_video_id_${{ matrix.index }}_${{ env.END_INDEX }}.txt
          path: cncf-youtube-channel-summarizer/data/missed_video_id_${{ matrix.index }}.txt

      - name: Commit Change for "cncf_video_summary.csv"
        run: |
          git config --global user.name 'Yuhao Chen'
          git config --global user.email '[email protected]'
          git add cncf-youtube-channel-summarizer/data/cncf_video_summary_${{matrix.index}}.csv || exit 0
          git commit -m "Add Summaries and Keywords" --signoff || exit 0
          # Sibling matrix jobs push to the same branch concurrently, so a
          # plain pull-then-push can be rejected; rebase and retry a few times.
          for attempt in 1 2 3 4 5; do
            if git pull --rebase --autostash origin main && git push; then
              exit 0
            fi
            sleep $((attempt * 5))
          done
          echo "Failed to push after 5 attempts" >&2
          exit 1

      - name: Commit Change for "missed_video_id.txt"
        run: |
          git config --global user.name 'Yuhao Chen'
          git config --global user.email '[email protected]'
          # The file only exists when at least one video failed; nothing to do otherwise.
          git add cncf-youtube-channel-summarizer/data/missed_video_id_${{matrix.index}}.txt || exit 0
          git commit -m "Add missed_video_id" --signoff || exit 0
          # Same concurrent-push handling as the summary commit step.
          for attempt in 1 2 3 4 5; do
            if git pull --rebase --autostash origin main && git push; then
              exit 0
            fi
            sleep $((attempt * 5))
          done
          echo "Failed to push after 5 attempts" >&2
          exit 1

cncf-youtube-channel-summarizer/data/cncf_video_summary.csv

Lines changed: 0 additions & 1 deletion
This file was deleted.

cncf-youtube-channel-summarizer/data/missed_video_id.txt

Lines changed: 0 additions & 1 deletion
This file was deleted.

cncf-youtube-channel-summarizer/requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,3 +5,4 @@ pandas==1.5.3
55
tenacity==8.2.3
66
tqdm==4.64.0
77
youtube_transcript_api==0.6.2
8+
transformers==4.41.1
Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
2+
from langchain.text_splitter import RecursiveCharacterTextSplitter
3+
from ibm_watson_machine_learning.foundation_models import Model
4+
from langchain.chains.summarize import load_summarize_chain
5+
import json
6+
from langchain import PromptTemplate
7+
import pandas as pd
8+
import sys
9+
import os
10+
from tenacity import retry, stop_after_attempt, wait_fixed
11+
from logger import setup_logger
12+
13+
logger = setup_logger("TranscriptSummarizer")
14+
class TranscriptSummarizer():
    """Summarize video transcripts and extract keywords with watsonx models.

    The transcripts are loaded from a JSON file and converted to a list of
    single-entry dicts (``{video_id: info}``) so that a contiguous range of
    videos can be selected by index in :meth:`run`.
    """

    def __init__(self, model_id, summary_param, keywords_param, transcript_path):
        """Load credentials, transcripts and build both models.

        Args:
            model_id: Identifier of the foundation model used for both tasks.
            summary_param: Generation parameters for the summary model.
            keywords_param: Generation parameters for the keywords model.
            transcript_path: Path to the JSON file holding the video data.

        Raises:
            KeyError: If ``WATSONX_KEY``, ``WATSONX_PROJECT_ID`` or
                ``WATSONX_URL`` is missing from the environment.
        """
        self.APIKEY = os.environ['WATSONX_KEY']
        self.project_id = os.environ['WATSONX_PROJECT_ID']
        self.url = os.environ['WATSONX_URL']
        self.model_id = model_id
        self.summary_param = summary_param
        self.keywords_param = keywords_param
        # Context manager so the transcript file handle is always closed.
        with open(transcript_path) as f:
            videos = json.load(f)
        self.videos_dict = dict_to_list_of_dicts(videos)
        self.llm_summary, self.llm_keywords = self.create_models()

    def _build_model(self, params):
        """Create one ``Model`` with the shared credentials and *params*."""
        return Model(
            model_id=self.model_id,
            credentials={
                "apikey": self.APIKEY,
                "url": self.url
            },
            project_id=self.project_id,
            params=params
        )

    def create_models(self):
        """Return ``(llm_summary, llm_keywords)`` built from the stored params."""
        llm_summary = self._build_model(self.summary_param)
        llm_keywords = self._build_model(self.keywords_param)
        return llm_summary, llm_keywords

    @staticmethod
    def _read_prompt(path):
        """Return the full text of the prompt template stored at *path*."""
        with open(path) as f:
            return f.read()

    def LLM_summarizer(self, llm_summary, llm_keywords, transcript, chunk_size, chunk_overlap, key):
        """Summarize one transcript and derive keywords from the summary.

        Args:
            llm_summary: LangChain-compatible LLM used for the map-reduce summary.
            llm_keywords: LangChain-compatible LLM used for keyword extraction.
            transcript: Full transcript text of the video.
            chunk_size: Maximum chunk size passed to the text splitter.
            chunk_overlap: Overlap between consecutive chunks.
            key: Video id. Not used in the body; kept so existing callers
                keep working.

        Returns:
            A ``(summary, keywords)`` tuple.
        """
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        texts = text_splitter.create_documents([transcript])

        map_summary_template = self._read_prompt('cncf-youtube-channel-summarizer/prompt/chunks_summary_prompt.txt')
        combine_summary_template = self._read_prompt('cncf-youtube-channel-summarizer/prompt/combine_summary_prompt.txt')
        keyword_template = self._read_prompt('cncf-youtube-channel-summarizer/prompt/keyword_template.txt')

        map_prompt = PromptTemplate(template=map_summary_template, input_variables=["text"])
        combine_prompt = PromptTemplate(template=combine_summary_template, input_variables=["text"])
        prompt_keywords = PromptTemplate(template=keyword_template, input_variables=["text"])

        # Map-reduce: summarize every chunk, then combine the partial summaries.
        chain_summary = load_summarize_chain(llm=llm_summary, chain_type="map_reduce", map_prompt=map_prompt,
                                             combine_prompt=combine_prompt, verbose=False)
        summary = chain_summary.run(texts)

        # Keywords are extracted from the summary, not from the raw transcript.
        summary_doc = text_splitter.create_documents([summary])
        chain_keywords = load_summarize_chain(llm=llm_keywords, chain_type="stuff", prompt=prompt_keywords,
                                              verbose=False)
        keywords = chain_keywords.run(summary_doc)
        return summary, keywords

    @retry(stop=stop_after_attempt(3), wait=wait_fixed(2))
    def run(self, start_index, end_index):
        """Summarize the videos in ``[start_index, end_index]`` (both inclusive).

        Writes one CSV row per successful video to
        ``data/cncf_video_summary_<start_index>.csv`` and appends the id of
        every failed video to ``data/missed_video_id_<start_index>.txt``.
        The whole method is retried up to three times (2 s apart) if it
        raises; the output CSV is re-created on every attempt.

        Args:
            start_index: Index of the first video to process.
            end_index: Index of the last video to process (inclusive).
        """
        max_sequence_length = self.llm_summary.get_details()['model_limits']['max_sequence_length']
        # NOTE(review): presumably the 1000 leaves headroom for the prompt
        # and the generated tokens - confirm.
        chunk_size = max_sequence_length - 1000
        chunk_overlap = 50

        output_path = 'cncf-youtube-channel-summarizer/data/cncf_video_summary_' + str(start_index) + '.csv'
        missed_video_id_path = 'cncf-youtube-channel-summarizer/data/missed_video_id_' + str(start_index) + '.txt'

        # Start the output file with just the header row.
        cncf_video_summary = pd.DataFrame(columns=['video_id', 'video_title','conference_name', 'summary', 'keywords'])
        cncf_video_summary.to_csv(output_path, index=False)

        # Slice into a local: overwriting self.videos_dict would make a retry
        # of this method slice the already-sliced list.
        videos = self.videos_dict[start_index:end_index+1]
        for i, video in enumerate(videos):
            key = next(iter(video))
            info = video[key]
            transcript = info['transcript']
            try:
                summary, keywords = self.LLM_summarizer(self.llm_summary.to_langchain(), self.llm_keywords.to_langchain(), transcript,
                                                        chunk_size, chunk_overlap, key)
                logger.info(f"Finish Video {key}, index {i}")
            except Exception as e:
                # Best effort: record the failed video and move on to the next.
                logger.error(f"Failed to generate the summary and keywords for video: {key}")
                logger.error(f"{e}")
                # Per-chunk file, so parallel runs do not share one output.
                with open(missed_video_id_path, 'a') as missed_video_id:
                    missed_video_id.write(key+',')
                continue

            data = {'video_id': [key], 'video_title': [info['video_title']],
                    'conference_name': [info['play_list']['title']], 'summary': [summary],
                    'keywords': [keywords]}
            df = pd.DataFrame(data)
            df.to_csv(output_path, mode='a', index=False, header=False)
102+
103+
def dict_to_list_of_dicts(dictionary):
    """Split *dictionary* into a list of single-entry dicts.

    Each key/value pair becomes its own ``{key: value}`` dict, in the
    iteration order of *dictionary*.
    """
    return [{name: entry} for name, entry in dictionary.items()]
115+
116+
if __name__ == "__main__":
    # Usage: transcript_summarizer_parallel.py <start_index> <end_index>
    # Both indices are inclusive positions in the transcript list.
    if len(sys.argv) < 3:
        sys.exit(f"Usage: {sys.argv[0]} <start_index> <end_index>")
    start_index = int(sys.argv[1])
    end_index = int(sys.argv[2])

    model_id = "ibm-mistralai/mixtral-8x7b-instruct-v01-q"
    # NOTE(review): the watsonx generation-parameter names are normally
    # lower-case ('temperature', 'max_new_tokens', 'top_k'); confirm that
    # these upper-case keys are actually honoured by the service.
    summary_param = {
        'TEMPERATURE':0.7,
        'MAX_NEW_TOKENS':512,
        'TOP_K': 10,
    }
    keywords_param = {
        'TEMPERATURE': 0.1,
        'MAX_NEW_TOKENS': 128,
        'TOP_K': 10,
    }
    transcript_path = 'cncf-youtube-channel-summarizer/data/CNCF_video_information.json'

    # run() returns nothing; outputs are written to the data directory.
    TranscriptSummarizer(model_id, summary_param, keywords_param, transcript_path).run(start_index, end_index)

0 commit comments

Comments
 (0)