|
| 1 | + |
| 2 | +from langchain.text_splitter import RecursiveCharacterTextSplitter |
| 3 | +from ibm_watson_machine_learning.foundation_models import Model |
| 4 | +from langchain.chains.summarize import load_summarize_chain |
| 5 | +import json |
| 6 | +from langchain import PromptTemplate |
| 7 | +import pandas as pd |
| 8 | +import sys |
| 9 | +import os |
| 10 | +from tenacity import retry, stop_after_attempt, wait_fixed |
| 11 | +from logger import setup_logger |
| 12 | + |
logger = setup_logger("TranscriptSummarizer")


class TranscriptSummarizer():
    """Summarize CNCF YouTube video transcripts with watsonx.ai foundation models.

    Loads a JSON file of per-video transcripts, builds two model handles
    (one tuned for summaries, one for keywords), and for each video writes
    a summary + keyword row to a per-shard CSV file.

    Required environment variables: WATSONX_KEY, WATSONX_PROJECT_ID,
    WATSONX_URL (a KeyError at construction means configuration is missing).
    """

    def __init__(self, model_id, summary_param, keywords_param, transcript_path):
        """Load credentials, the transcript JSON, and build both LLM handles.

        :param model_id: watsonx foundation-model identifier.
        :param summary_param: generation parameters for the summary model.
        :param keywords_param: generation parameters for the keyword model.
        :param transcript_path: path to the CNCF video-information JSON file.
        """
        self.APIKEY = os.environ['WATSONX_KEY']
        self.project_id = os.environ['WATSONX_PROJECT_ID']
        self.url = os.environ['WATSONX_URL']
        self.model_id = model_id
        self.summary_param = summary_param
        self.keywords_param = keywords_param
        # Context manager guarantees the file is closed (the original
        # leaked this handle).
        with open(transcript_path) as f:
            self.videos_dict = json.load(f)
        # Normalize {id: info, ...} into [{id: info}, ...] so runs can be
        # sharded by list index.
        self.videos_dict = dict_to_list_of_dicts(self.videos_dict)
        self.llm_summary, self.llm_keywords = self.create_models()

    def _build_model(self, params):
        # Both models share identity/credentials and differ only in
        # generation parameters.
        return Model(
            model_id=self.model_id,
            credentials={
                "apikey": self.APIKEY,
                "url": self.url
            },
            project_id=self.project_id,
            params=params
        )

    def create_models(self):
        """Return (summary_model, keywords_model) watsonx Model handles."""
        return self._build_model(self.summary_param), self._build_model(self.keywords_param)

    @staticmethod
    def _read_prompt(path):
        # Read a whole prompt template; `with` closes the handle (the
        # original leaked one handle per prompt file per call).
        with open(path) as f:
            return f.read()

    def LLM_summarizer(self, llm_summary, llm_keywords, transcript, chunk_size, chunk_overlap, key):
        """Produce (summary, keywords) for one transcript.

        The transcript is chunked to fit the model context, summarized with a
        map-reduce chain, and the resulting summary is fed to a stuff chain
        that extracts keywords.

        :param llm_summary: LangChain-compatible LLM for summarization.
        :param llm_keywords: LangChain-compatible LLM for keyword extraction.
        :param transcript: full transcript text of one video.
        :param chunk_size: max characters per chunk.
        :param chunk_overlap: characters of overlap between adjacent chunks.
        :param key: video id (accepted for interface compatibility; unused here).
        :returns: tuple ``(summary, keywords)`` of strings.
        """
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        texts = text_splitter.create_documents([transcript])

        map_summary_template = self._read_prompt('cncf-youtube-channel-summarizer/prompt/chunks_summary_prompt.txt')
        combine_summary_template = self._read_prompt('cncf-youtube-channel-summarizer/prompt/combine_summary_prompt.txt')
        keyword_template = self._read_prompt('cncf-youtube-channel-summarizer/prompt/keyword_template.txt')

        map_prompt = PromptTemplate(template=map_summary_template, input_variables=["text"])
        combine_prompt = PromptTemplate(template=combine_summary_template, input_variables=["text"])
        prompt_keywords = PromptTemplate(template=keyword_template, input_variables=["text"])

        chain_summary = load_summarize_chain(llm=llm_summary, chain_type="map_reduce", map_prompt=map_prompt,
                                             combine_prompt=combine_prompt, verbose=False)
        summary = chain_summary.run(texts)

        # Keywords are extracted from the summary, not the raw transcript.
        summary_doc = text_splitter.create_documents([summary])
        chain_keywords = load_summarize_chain(llm=llm_keywords, chain_type="stuff", prompt=prompt_keywords,
                                              verbose=False)
        keywords = chain_keywords.run(summary_doc)
        return summary, keywords

    @retry(stop=stop_after_attempt(3), wait=wait_fixed(2))
    def run(self, start_index, end_index):
        """Summarize videos [start_index, end_index] and append rows to CSV.

        Output CSV and missed-id file are suffixed with ``start_index`` so
        parallel shards do not clobber each other. Per-video failures are
        logged and recorded, then processing continues.
        """
        max_sequence_length = self.llm_summary.get_details()['model_limits']['max_sequence_length']
        chunk_size = max_sequence_length - 1000  # headroom for the prompt text itself
        chunk_overlap = 50
        cncf_video_summary = pd.DataFrame(columns=['video_id', 'video_title', 'conference_name', 'summary', 'keywords'])
        output_path = 'cncf-youtube-channel-summarizer/data/cncf_video_summary_' + str(start_index) + '.csv'
        # Write the header row once; per-video rows are appended below.
        cncf_video_summary.to_csv(output_path, index=False)

        # Work on a local slice instead of mutating self.videos_dict:
        # @retry re-invokes run(), and re-slicing an already-sliced list
        # would process the wrong range on retry (bug in the original).
        videos = self.videos_dict[start_index:end_index + 1]
        for i, video in enumerate(videos):
            key = list(video.keys())[0]
            transcript = video[key]['transcript']
            try:
                summary, keywords = self.LLM_summarizer(self.llm_summary.to_langchain(),
                                                        self.llm_keywords.to_langchain(),
                                                        transcript, chunk_size, chunk_overlap, key)
                logger.info(f"Finish Video {key}, index {i}")
            except Exception as e:
                logger.error(f"Failed to generate the summary and keywords for video: {key}")
                logger.error(f"{e}")
                # Original bug: the per-shard path was constructed but the
                # shared missed_video_id.txt was opened instead, and the
                # handle was never closed.
                missed_video_id_path = 'cncf-youtube-channel-summarizer/data/missed_video_id_' + str(start_index) + '.txt'
                with open(missed_video_id_path, 'a') as missed_video_id:
                    missed_video_id.write(key + ',')
                continue

            data = {'video_id': [key], 'video_title': [video[key]['video_title']],
                    'conference_name': [video[key]['play_list']['title']], 'summary': [summary],
                    'keywords': [keywords]}
            df = pd.DataFrame(data)
            df.to_csv(output_path, mode='a', index=False, header=False)
| 102 | + |
def dict_to_list_of_dicts(dictionary):
    """Split a dict into a list of single-pair dicts.

    ``{'a': 1, 'b': 2}`` becomes ``[{'a': 1}, {'b': 2}]``, preserving
    insertion order, so the caller can shard work by list index.

    :param dictionary: mapping to split.
    :returns: list with one ``{key: value}`` dict per original pair.
    """
    # Comprehension replaces the original manual append loop (same result).
    return [{key: value} for key, value in dictionary.items()]
| 115 | + |
if __name__ == "__main__":
    # Usage: python <script> <start_index> <end_index>
    # start/end are inclusive indices into the transcript list, letting
    # several shards run in parallel over disjoint ranges.
    args = sys.argv
    start_index = int(args[1])
    end_index = int(args[2])
    model_id = "ibm-mistralai/mixtral-8x7b-instruct-v01-q"
    # Higher temperature for free-form summaries; low temperature for
    # deterministic keyword extraction.
    summary_param = {
        'TEMPERATURE': 0.7,
        'MAX_NEW_TOKENS': 512,
        'TOP_K': 10,
    }
    keywords_param = {
        'TEMPERATURE': 0.1,
        'MAX_NEW_TOKENS': 128,
        'TOP_K': 10,
    }
    transcript_path = 'cncf-youtube-channel-summarizer/data/CNCF_video_information.json'
    # Construct and run separately: the original bound run()'s None return
    # value to a name that suggested it was the summarizer instance.
    summarizer = TranscriptSummarizer(model_id, summary_param, keywords_param, transcript_path)
    summarizer.run(start_index, end_index)
0 commit comments