├── README.md
├── Snakefile
├── download-videos.py
├── split-transcripts.py
└── transcript-to-note.py


/README.md:
--------------------------------------------------------------------------------
1 | # auto-lecture-note
2 | 


--------------------------------------------------------------------------------
/Snakefile:
--------------------------------------------------------------------------------
 1 | VIDEONAMES = glob_wildcards('video/{name}.mp4').name
 2 | SCENEDETECTION_THRESHOLD = 10
 3 | 
 4 | rule all:
 5 |     input:
 6 |         expand('scenes/{name}-Scenes.csv', name=VIDEONAMES),
 7 |         expand('scenes/{name}-Stats.csv', name=VIDEONAMES),
 8 |         expand('scene-transcripts/{name}.tsv', name=VIDEONAMES),
 9 |         expand('scene-notes/{name}/done', name=VIDEONAMES),
10 |         expand('notes/{name}.{dtype}', name=VIDEONAMES, dtype=['md', 'html'])
11 | 
# Run the `scenedetect` CLI (content detector + list-scenes) on one video.
# The command writes into the scenes/ directory selected with -o.
rule detect_scene:
    input:
        'video/{name}.mp4'
    output:
        scenes='scenes/{name}-Scenes.csv',
        stats='scenes/{name}-Stats.csv'
    params:
        # Bare file name without the scenes/ prefix; presumably scenedetect
        # resolves it relative to the -o directory -- verify against its docs.
        stats_filename='{name}-Stats.csv'
    shell:
        'scenedetect -o scenes --input "{input}" --stats "{params.stats_filename}" \
            detect-content -t {SCENEDETECTION_THRESHOLD} \
            list-scenes'
23 | 
# Cut the English subtitle track of one video into per-scene transcripts,
# using the scene list produced by detect_scene.
rule split_transcripts:
    input:
        vtt='video/{name}.en.vtt',
        scenes='scenes/{name}-Scenes.csv'
    output:
        'scene-transcripts/{name}.tsv'
    shell:
        'python split-transcripts.py "{input.vtt}" "{input.scenes}" "{output}"'
30 | 
# Turn each scene transcript of one video into a lecture note by calling
# transcript-to-note.py. The script writes its files into a per-video
# directory and finally creates the `done` marker tracked as the rule output.
rule convert_transcript_to_notes:
    input:
        'scene-transcripts/{name}.tsv'
    output:
        'scene-notes/{name}/done'
    params:
        output_dir='scene-notes/{name}'
    resources:
        # Each job claims one unit of a custom `openai` resource; it only
        # throttles concurrency when a limit is given on the command line.
        openai=1
    shell:
        'python transcript-to-note.py "{input}" "{params.output_dir}"'
37 | 
# Concatenate the per-scene notes of one video into a single Markdown file.
# Each scene note gets a "Time: <start> – <end>" line inserted after its first
# line, taken from the start_time/end_time columns of the transcript table.
# The `done` marker is listed as an input only to order this rule after
# convert_transcript_to_notes; the note files themselves are located through
# params.notedir.
rule merge_notes:
    input:
        transcripts='scene-transcripts/{name}.tsv',
        scene_notes_done_mark='scene-notes/{name}/done'
    output: 'notes/{name}.md'
    params: notedir='scene-notes/{name}'
    run:
        import pandas as pd

        transcripts = pd.read_csv(input.transcripts, sep='\t')

        # Use a distinct handle name so Snakemake's `output` object is not
        # shadowed inside the run block.
        with open(output[0], 'w') as merged:
            # Scene files are numbered from 1 in transcript row order.
            for scene_no, (_, row) in enumerate(transcripts.iterrows(), start=1):
                notefile = f'{params.notedir}/{scene_no:04d}-answer.md'
                with open(notefile) as notefh:
                    note = notefh.read().splitlines()

                # Surround the time line with blank lines: without one after
                # it, a bullet list that directly follows would be read by
                # pandoc as a continuation of the "Time:" paragraph.
                note[1:1] = ['', f'Time: {row["start_time"]} – {row["end_time"]}', '']

                print(*note, sep='\n', file=merged, end='\n\n')
59 | 
# Render the merged Markdown notes of one video to HTML with pandoc.
rule format_html_notes:
    input:
        'notes/{name}.md'
    output:
        'notes/{name}.html'
    shell:
        'pandoc -f markdown -t html "{input}" -o "{output}"'
64 | 


--------------------------------------------------------------------------------
/download-videos.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import subprocess as sp
3 | 
# Read the whitespace-separated URL list up front and close the file before
# any download starts (the original left the handle open for the whole run).
with open('youtube-urls.txt') as url_list:
    urls = url_list.read().split()

# Download each video together with its auto-generated subtitles. check_call
# aborts the whole script on the first yt-dlp failure.
for url in urls:
    sp.check_call(['yt-dlp', '--concurrent-fragments', '4', '--write-auto-subs', url])
6 | 
7 | 


--------------------------------------------------------------------------------
/split-transcripts.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | import webvtt
  3 | import re
  4 | import pandas as pd
  5 | 
  6 | pat_vtttime = re.compile(r'<\d\d:\d\d:\d\d\.\d\d\d>')
  7 | 
  8 | def strip_c(txt):
  9 |     if txt.startswith('<c>'):
 10 |         txt = txt[3:]
 11 |     if txt.endswith('</c>'):
 12 |         txt = txt[:-4]
 13 |     return txt
 14 | 
 15 | def parse_vtt_line(row):
 16 |     linestart_in_seconds = row['start']
 17 |     txt = row['line2']
 18 | 
 19 |     matches = []
 20 |     for match in pat_vtttime.finditer(txt):
 21 |         tm = match.group()[1:-1]
 22 |         toks = list(map(float, tm.split(':')))
 23 |         assert len(toks) == 3
 24 |         timemark = toks[0] * 3600 + toks[1] * 60 + toks[2]
 25 |         matches.append((match.start(), match.end(), timemark))
 26 | 
 27 |     matches = pd.DataFrame(matches, columns=['start', 'end', 'timemark'])
 28 | 
 29 |     txtparts = pd.DataFrame({
 30 |         'txtstart': [0] + list(matches['end']),
 31 |         'txtend': list(matches['start']) + [len(txt)],
 32 |         'timestart': [linestart_in_seconds] + list(matches['timemark']),
 33 |     })
 34 |     txtparts['txt'] = txtparts.apply(lambda row: strip_c(txt[int(row['txtstart']):int(row['txtend'])]), axis=1)
 35 | 
 36 |     return txtparts
 37 | 
 38 | def load_vtt(vttfile):
 39 |     rows = []
 40 |     for caption in webvtt.read(vttfile):
 41 |         assert len(caption.lines) == 2
 42 |         rows.append([
 43 |             caption.start_in_seconds,
 44 |             caption.end_in_seconds,
 45 |             caption.lines[0],
 46 |             caption.lines[1],
 47 |         ])
 48 |     vtt = pd.DataFrame(rows, columns=['start', 'end', 'line1', 'line2'])
 49 | 
 50 |     tokentimings = []
 51 |     for _, row in vtt.iterrows():
 52 |         tokentimings.append(parse_vtt_line(row))
 53 |     tokentimings = pd.concat(tokentimings)
 54 | 
 55 |     tokentimings['timeend'] = tokentimings['timestart'].shift(-1)
 56 |     validtokens = tokentimings[tokentimings['txt'].apply(lambda x: len(x.strip())) > 0]
 57 |     validtokens = validtokens['timestart timeend txt'.split()].copy()
 58 |     validtokens.columns = ['start', 'end', 'token']
 59 |     validtokens['token'] = validtokens['token'].apply(lambda x: x.strip())
 60 | 
 61 |     return validtokens.reset_index(drop=True)
 62 | 
 63 | def load_scenic_transcripts(vttfile, scenefile, min_scene_length=5):
 64 |     transcript_tokens = load_vtt(vttfile)
 65 |     sceneinfo = pd.read_csv(scenefile, skiprows=1)
 66 | 
 67 |     scene_transcripts = []
 68 |     for _, scene in sceneinfo.iterrows():
 69 |         scenestart, sceneend = scene['Start Time (seconds)'], scene['End Time (seconds)']
 70 |         scene_transcript = transcript_tokens[transcript_tokens['start'].between(scenestart, sceneend)]
 71 |         scene_transcripts.append([
 72 |             scenestart, sceneend,
 73 |             scene['Start Timecode'], scene['End Timecode'],
 74 |             len(scene_transcript), ' '.join(scene_transcript['token'])])
 75 |     scene_transcripts = pd.DataFrame(scene_transcripts,
 76 |                                      columns='start end start_time end_time num_tokens transcript'.split())
 77 | 
 78 |     return scene_transcripts[scene_transcripts['num_tokens'] >= min_scene_length].reset_index(drop=True)
 79 | 
 80 | def diffuse_boundary(transcripts, max_diffusion=5):
 81 |     transcripts = transcripts.copy()
 82 | 
 83 |     tails = transcripts['transcript'].apply(lambda x: ' '.join(x.split()[-max_diffusion:]))
 84 |     heads = transcripts['transcript'].apply(lambda x: ' '.join(x.split()[:max_diffusion]))
 85 | 
 86 |     prefixes = pd.Series([''] + list(tails[:-1]))
 87 |     suffixes = pd.Series(list(heads[1:]) + [''])
 88 | 
 89 |     transcripts['transcript'] = prefixes + ' ' + transcripts['transcript'] + ' ' + suffixes
 90 |     return transcripts
 91 | 
 92 | 
 93 | if __name__ == '__main__':
 94 |     import sys
 95 |     subtitle_file = sys.argv[1]
 96 |     scene_file = sys.argv[2]
 97 |     output_file = sys.argv[3]
 98 | 
 99 |     transcripts = load_scenic_transcripts(subtitle_file, scene_file)
100 |     diffuse_boundary(transcripts).to_csv(output_file, index=False, sep='\t')
101 | 


--------------------------------------------------------------------------------
/transcript-to-note.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | import openai, os
 3 | import pandas as pd
 4 | from collections import deque
 5 | import json
 6 | import sys
 7 | 
 8 | openai.api_key_path = os.environ['HOME'] + '/.openai-api'
 9 | 
10 | system_instruction = """\
11 | You are a university lecturer teaching virology for biology majors. Based on \
12 | the transcript provided by user, you will prepare lecture notes for the part. \
13 | The lecture note should start with a brief title in a single sentence. \
14 | The body must be prepared in multiple bullet points. The answer should be \
15 | in Korean. In the body, the scientific terms should be used in Korean with \
16 | the English term in parenthesis. The body should not miss any important points \
17 | from the transcript. The body should be concise and easy to understand. The \
18 | answer must be formatted in Markdown. Use the first level heading for the \
19 | title and the body should be in bullet points under the title. \
20 | """
21 | 
22 | transcript_file = sys.argv[1]
23 | output_dir = sys.argv[2]
24 | context_size = 4
25 | 
26 | if not os.path.isdir(output_dir):
27 |     os.makedirs(output_dir)
28 | 
29 | transcripts = pd.read_csv(transcript_file, sep='\t')
30 | system_context = [{'role': 'system', 'content': system_instruction}]
31 | context = deque()
32 | for scene_no, row in transcripts.iterrows():
33 |     scene_no += 1 # make it 1-based
34 |     output_prefix = f'{output_dir}/{scene_no:04d}'
35 |     context_file = output_prefix + '-context.json'
36 | 
37 |     if os.path.isfile(context_file):
38 |         context = deque(json.load(open(context_file)))
39 |         print(f'==> Skipping scene {scene_no}')
40 |         continue
41 | 
42 |     transcript = row['transcript']
43 |     context.append({
44 |         'role': 'user',
45 |         'content': transcript})
46 |     while len(context) > context_size:
47 |         context.popleft()
48 | 
49 |     print(f'==> Requesting completion for scene {scene_no}')
50 |     response = openai.ChatCompletion.create(
51 |         model='gpt-4',
52 |         messages=system_context + list(context),
53 |         temperature=0.4,
54 |         top_p=1,
55 |         max_tokens=1000,
56 |         frequency_penalty=0.0
57 |     )
58 | 
59 |     answer = response['choices'][0]['message']
60 |     context.append(answer)
61 | 
62 |     json.dump(list(context), open(context_file, 'w'),
63 |               indent=2, ensure_ascii=False)
64 |     open(output_prefix + '-answer.md', 'w').write(answer['content'])
65 | 
66 |     title = answer['content'].splitlines()[0]
67 |     num_bullets = sum(bool(l.strip()) for l in answer['content'].splitlines()) - 1
68 | 
69 |     print(f'   {title}')
70 |     print(f'   {num_bullets} Bullets.')
71 |     print(f'   - Used {response["usage"]["prompt_tokens"]} prompt and '
72 |           f'{response["usage"]["completion_tokens"]} completion tokens.')
73 | 
74 | open(f'{output_dir}/done', 'w')
75 | 


--------------------------------------------------------------------------------