├── README.md ├── Snakefile ├── download-videos.py ├── split-transcripts.py └── transcript-to-note.py /README.md: -------------------------------------------------------------------------------- 1 | # auto-lecture-note 2 | -------------------------------------------------------------------------------- /Snakefile: -------------------------------------------------------------------------------- 1 | VIDEONAMES = glob_wildcards('video/{name}.mp4').name 2 | SCENEDETECTION_THRESHOLD = 10 3 | 4 | rule all: 5 | input: 6 | expand('scenes/{name}-Scenes.csv', name=VIDEONAMES), 7 | expand('scenes/{name}-Stats.csv', name=VIDEONAMES), 8 | expand('scene-transcripts/{name}.tsv', name=VIDEONAMES), 9 | expand('scene-notes/{name}/done', name=VIDEONAMES), 10 | expand('notes/{name}.{dtype}', name=VIDEONAMES, dtype=['md', 'html']) 11 | 12 | rule detect_scene: 13 | input: 'video/{name}.mp4' 14 | output: 15 | scenes='scenes/{name}-Scenes.csv', 16 | stats='scenes/{name}-Stats.csv' 17 | params: 18 | stats_filename='{name}-Stats.csv' 19 | shell: 20 | 'scenedetect -o scenes --input "{input}" --stats "{params.stats_filename}" \ 21 | detect-content -t {SCENEDETECTION_THRESHOLD} \ 22 | list-scenes' 23 | 24 | rule split_transcripts: 25 | input: 26 | vtt='video/{name}.en.vtt', 27 | scenes='scenes/{name}-Scenes.csv' 28 | output: 'scene-transcripts/{name}.tsv' 29 | shell: 'python split-transcripts.py "{input.vtt}" "{input.scenes}" "{output}"' 30 | 31 | rule convert_transcript_to_notes: 32 | input: 'scene-transcripts/{name}.tsv' 33 | output: 'scene-notes/{name}/done' 34 | params: output_dir='scene-notes/{name}' 35 | resources: openai=1 36 | shell: 'python transcript-to-note.py "{input}" "{params.output_dir}"' 37 | 38 | rule merge_notes: 39 | input: 40 | transcripts='scene-transcripts/{name}.tsv', 41 | scene_notes_done_mark='scene-notes/{name}/done' 42 | output: 'notes/{name}.md' 43 | params: notedir='scene-notes/{name}' 44 | run: 45 | import pandas as pd 46 | 47 | notename = wildcards.name 48 | transcripts = pd.read_csv(input.transcripts, sep='\t') 49 | 50 | with open(output[0], 'w') as output: 51 | for scene_no, row in transcripts.iterrows(): 52 | scene_no += 1 # make scene numbers 1-indexed 53 | 54 | notefile = f'{params.notedir}/{scene_no:04d}-answer.md' 55 | note = open(notefile).read().splitlines() 56 | note.insert(1, f'Time: {row["start_time"]} – {row["end_time"]}') 57 | 58 | print(*note, sep='\n', file=output, end='\n\n') 59 | 60 | rule format_html_notes: 61 | input: 'notes/{name}.md' 62 | output: 'notes/{name}.html' 63 | shell: 'pandoc -f markdown -t html "{input}" -o "{output}"' 64 | -------------------------------------------------------------------------------- /download-videos.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import subprocess as sp 3 | 4 | for url in open('youtube-urls.txt').read().split(): 5 | sp.check_call(['yt-dlp', '--concurrent-fragments', '4', '--write-auto-subs', url]) 6 | 7 | -------------------------------------------------------------------------------- /split-transcripts.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import webvtt 3 | import re 4 | import pandas as pd 5 | 6 | pat_vtttime = re.compile(r'<\d\d:\d\d:\d\d\.\d\d\d>') 7 | 8 | def strip_c(txt): 9 | if txt.startswith(''): 10 | txt = txt[3:] 11 | if txt.endswith(''): 12 | txt = txt[:-4] 13 | return txt 14 | 15 | def parse_vtt_line(row): 16 | linestart_in_seconds = row['start'] 17 | txt = row['line2'] 18 | 19 | matches = [] 20 | for match in pat_vtttime.finditer(txt): 21 | tm = match.group()[1:-1] 22 | toks = list(map(float, tm.split(':'))) 23 | assert len(toks) == 3 24 | timemark = toks[0] * 3600 + toks[1] * 60 + toks[2] 25 | matches.append((match.start(), match.end(), timemark)) 26 | 27 | matches = pd.DataFrame(matches, columns=['start', 'end', 'timemark']) 28 | 29 | txtparts = pd.DataFrame({ 30 | 'txtstart': [0] + list(matches['end']), 31 | 'txtend': list(matches['start']) + [len(txt)], 32 | 'timestart': [linestart_in_seconds] + list(matches['timemark']), 33 | }) 34 | txtparts['txt'] = txtparts.apply(lambda row: strip_c(txt[int(row['txtstart']):int(row['txtend'])]), axis=1) 35 | 36 | return txtparts 37 | 38 | def load_vtt(vttfile): 39 | rows = [] 40 | for caption in webvtt.read(vttfile): 41 | assert len(caption.lines) == 2 42 | rows.append([ 43 | caption.start_in_seconds, 44 | caption.end_in_seconds, 45 | caption.lines[0], 46 | caption.lines[1], 47 | ]) 48 | vtt = pd.DataFrame(rows, columns=['start', 'end', 'line1', 'line2']) 49 | 50 | tokentimings = [] 51 | for _, row in vtt.iterrows(): 52 | tokentimings.append(parse_vtt_line(row)) 53 | tokentimings = pd.concat(tokentimings) 54 | 55 | tokentimings['timeend'] = tokentimings['timestart'].shift(-1) 56 | validtokens = tokentimings[tokentimings['txt'].apply(lambda x: len(x.strip())) > 0] 57 | validtokens = validtokens['timestart timeend txt'.split()].copy() 58 | validtokens.columns = ['start', 'end', 'token'] 59 | validtokens['token'] = validtokens['token'].apply(lambda x: x.strip()) 60 | 61 | return validtokens.reset_index(drop=True) 62 | 63 | def load_scenic_transcripts(vttfile, scenefile, min_scene_length=5): 64 | transcript_tokens = load_vtt(vttfile) 65 | sceneinfo = pd.read_csv(scenefile, skiprows=1) 66 | 67 | scene_transcripts = [] 68 | for _, scene in sceneinfo.iterrows(): 69 | scenestart, sceneend = scene['Start Time (seconds)'], scene['End Time (seconds)'] 70 | scene_transcript = transcript_tokens[transcript_tokens['start'].between(scenestart, sceneend)] 71 | scene_transcripts.append([ 72 | scenestart, sceneend, 73 | scene['Start Timecode'], scene['End Timecode'], 74 | len(scene_transcript), ' '.join(scene_transcript['token'])]) 75 | scene_transcripts = pd.DataFrame(scene_transcripts, 76 | columns='start end start_time end_time num_tokens transcript'.split()) 77 | 78 | return scene_transcripts[scene_transcripts['num_tokens'] >= min_scene_length].reset_index(drop=True) 79 | 80 | def diffuse_boundary(transcripts, max_diffusion=5): 81 | transcripts = transcripts.copy() 82 | 83 | tails = transcripts['transcript'].apply(lambda x: ' '.join(x.split()[-max_diffusion:])) 84 | heads = transcripts['transcript'].apply(lambda x: ' '.join(x.split()[:max_diffusion])) 85 | 86 | prefixes = pd.Series([''] + list(tails[:-1])) 87 | suffixes = pd.Series(list(heads[1:]) + ['']) 88 | 89 | transcripts['transcript'] = prefixes + ' ' + transcripts['transcript'] + ' ' + suffixes 90 | return transcripts 91 | 92 | 93 | if __name__ == '__main__': 94 | import sys 95 | subtitle_file = sys.argv[1] 96 | scene_file = sys.argv[2] 97 | output_file = sys.argv[3] 98 | 99 | transcripts = load_scenic_transcripts(subtitle_file, scene_file) 100 | diffuse_boundary(transcripts).to_csv(output_file, index=False, sep='\t') 101 | -------------------------------------------------------------------------------- /transcript-to-note.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import openai, os 3 | import pandas as pd 4 | from collections import deque 5 | import json 6 | import sys 7 | 8 | openai.api_key_path = os.environ['HOME'] + '/.openai-api' 9 | 10 | system_instruction = """\ 11 | You are a university lecturer teaching virology for biology majors. Based on \ 12 | the transcript provided by user, you will prepare lecture notes for the part. \ 13 | The lecture note should start with a brief title in a single sentence. \ 14 | The body must be prepared in multiple bullet points. The answer should be \ 15 | in Korean. In the body, the scientific terms should be used in Korean with \ 16 | the English term in parenthesis. The body should not miss any important points \ 17 | from the transcript. The body should be concise and easy to understand. The \ 18 | answer must be formatted in Markdown. Use the first level heading for the \ 19 | title and the body should be in bullet points under the title. \ 20 | """ 21 | 22 | transcript_file = sys.argv[1] 23 | output_dir = sys.argv[2] 24 | context_size = 4 25 | 26 | if not os.path.isdir(output_dir): 27 | os.makedirs(output_dir) 28 | 29 | transcripts = pd.read_csv(transcript_file, sep='\t') 30 | system_context = [{'role': 'system', 'content': system_instruction}] 31 | context = deque() 32 | for scene_no, row in transcripts.iterrows(): 33 | scene_no += 1 # make it 1-based 34 | output_prefix = f'{output_dir}/{scene_no:04d}' 35 | context_file = output_prefix + '-context.json' 36 | 37 | if os.path.isfile(context_file): 38 | context = deque(json.load(open(context_file))) 39 | print(f'==> Skipping scene {scene_no}') 40 | continue 41 | 42 | transcript = row['transcript'] 43 | context.append({ 44 | 'role': 'user', 45 | 'content': transcript}) 46 | while len(context) > context_size: 47 | context.popleft() 48 | 49 | print(f'==> Requesting completion for scene {scene_no}') 50 | response = openai.ChatCompletion.create( 51 | model='gpt-4', 52 | messages=system_context + list(context), 53 | temperature=0.4, 54 | top_p=1, 55 | max_tokens=1000, 56 | frequency_penalty=0.0 57 | ) 58 | 59 | answer = response['choices'][0]['message'] 60 | context.append(answer) 61 | 62 | json.dump(list(context), open(context_file, 'w'), 63 | indent=2, ensure_ascii=False) 64 | open(output_prefix + '-answer.md', 'w').write(answer['content']) 65 | 66 | title = answer['content'].splitlines()[0] 67 | num_bullets = sum(bool(l.strip()) for l in answer['content'].splitlines()) - 1 68 | 69 | print(f' {title}') 70 | print(f' {num_bullets} Bullets.') 71 | print(f' - Used {response["usage"]["prompt_tokens"]} prompt and ' 72 | f'{response["usage"]["completion_tokens"]} completion tokens.') 73 | 74 | open(f'{output_dir}/done', 'w') 75 | --------------------------------------------------------------------------------