pdf2video - an example

├── pdf2video ├── __main__.py ├── __init__.py ├── parser.py └── pdf2video.py ├── sample.pdf ├── setup.cfg ├── sample.css ├── sample.html ├── setup.py ├── LICENSE ├── sample.txt └── README.md /pdf2video/__main__.py: -------------------------------------------------------------------------------- 1 | from .pdf2video import main 2 | main() 3 | -------------------------------------------------------------------------------- /sample.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tjunttila/pdf2video/HEAD/sample.pdf -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [options.entry_points] 2 | console_scripts = 3 | pdf2video = pdf2video.pdf2video:main 4 | [build-system] 5 | requires = ["setuptools", "wheel"] 6 | [metadata] 7 | license_files = LICENSE 8 | -------------------------------------------------------------------------------- /pdf2video/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | A tool for converting PDF presentations into narrated videos. 3 | Should only be called from the command line. 4 | Please see https://github.com/tjunttila/pdf2video/ for more details. 5 | """ 6 | -------------------------------------------------------------------------------- /sample.css: -------------------------------------------------------------------------------- 1 | video { 2 | border-style: solid; 3 | border-width: 2pt; 4 | border-color: #002F6C; 5 | border-radius: 5pt; 6 | } 7 | video::cue { 8 | /*font-size: 20pt;*/ 9 | color: white; 10 | background-color: #002F6C; 11 | opacity: 0.8; 12 | } 13 | -------------------------------------------------------------------------------- /sample.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | pdf2video - an example 6 | 7 | 8 | 9 |

10 | This HTML page shows how one can embed videos 11 | made with the pdf2video tool in web pages. 12 | WebVTT subtitles are supported by the tool. 13 |

14 |

19 | 20 | 21 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | with open("README.md", "r", encoding="utf-8") as f: 4 | long_description = f.read() 5 | 6 | setuptools.setup( 7 | name="pdf2video", # Replace with your own username 8 | version="0.2.1", 9 | author="T. Junttila", 10 | author_email="Tommi.Junttila@aalto.fi", 11 | description="A tool for making narrated videos from PDF presentations.", 12 | long_description=long_description, 13 | long_description_content_type="text/markdown", 14 | url="https://github.com/tjunttila/pdf2video", 15 | packages=setuptools.find_packages(), 16 | license = "MIT", 17 | classifiers=[ 18 | "Programming Language :: Python :: 3", 19 | "License :: OSI Approved :: MIT License", 20 | "Operating System :: OS Independent", 21 | ], 22 | python_requires='>=3.6', 23 | setup_requires=['wheel'] 24 | ) 25 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020-2021 T. Junttila 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /sample.txt: -------------------------------------------------------------------------------- 1 | #page 2 | Welcome to a short sample presentation about the #sub#pdf-to-video#pdf2video# tool. 3 | This video is produced automatically with the tool. 4 | You can find more details in the GitHub page of the project. 5 | #20 6 | 7 | #page motivation 8 | Need to make videos of your PDF presentations? 9 | Tired in spending *hours* in recording and editing the audio tracks? 10 | #10 11 | The #sub#pdf-to-video#pdf2video# tool can help you! 12 | It is a small tool, written in the Python programming language, 13 | for making videos from PDF presentations. 14 | #8 15 | As input, it takes a PDF presentation and a textual script file. 16 | The presentation is then turned into a video, 17 | narrated by the Amazon Polly text-to-speech engine. 18 | #10 19 | In order to use the tool, 20 | you should have some common PDF and video tools installed in your computer. 21 | In addition, you should have access to Amazon Web Services. 22 | #20 23 | 24 | #page usage 25 | The use of the tool should be rather simple. 26 | One just provides the names of the inputs, 27 | the PDF file and the script file, 28 | as well as the name of the output video file. 29 | #10 30 | The tool also provides a number of options. 31 | For instance, one can select only some of the PDF pages 32 | to be included in the video. 
33 | This makes it easier to split a long presentation into a set of shorter videos. 34 | #10 35 | In addition, the narration voice can be changed. 36 | For instance, this sample video is produced 37 | with the command shown in the slide. 38 | #10 39 | One can find the sample PDF and script files 40 | in the GitHub repository of the project. 41 | #30 42 | 43 | #page scripts_1 44 | The script files are rather simple text files. 45 | They should be easy to produce with *any* text editor. 46 | #10 47 | For each PDF page to be included in the video, 48 | the file contains a special header line, 49 | followed by the actual script text. 50 | #10 51 | In the text, some simple formatting commands can be used. 52 | For instance, 53 | one can make some text to be *#ph#read#red# in an emphasized style*. 54 | #10 55 | Similarly, one can make breaks of arbitrary lengths. 56 | #40 57 | 58 | #page scripts_2 59 | Subtitles are automatically generated from the script file. 60 | #10 61 | They can be customized with the #sub!hash-sub!#sub! modifier, 62 | which can be combined with the reading style modifiers. 63 | #20 64 | For instance, 65 | consider the example shown here. 66 | #30 67 | It is read as "#slow!big-#ph#Theta#Ti:t@# of n squared!" 68 | #8 69 | but the subtitles show #sub#the same in a mathematical form#Θ(n^2)#. 70 | #40 71 | That's all for this sample presentation! 72 | Please find more details in the GitHub page of the tool. 73 | #20 74 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Description 2 | 3 | `pdf2video` is a Python script that combines 4 | 5 | * (selected pages of) a [PDF](https://en.wikipedia.org/wiki/PDF) presentation, and 6 | * a text script 7 | 8 | into a video narrated by the [Amazon Polly](https://aws.amazon.com/polly/) text-to-speech engine. 9 | It can be used to generate, for instance, educational videos. 
10 | 11 | Please see this [sample video](https://users.aalto.fi/tjunttil/pdf2video.mp4), 12 | produced with the tool, for a short introduction. 13 | Observe that some browsers don't show the subtitles embedded in MP4 videos; 14 | please see this [sample video with WebVTT subtitles](https://users.aalto.fi/tjunttil/pdf2video.html) in such a case. 15 | 16 | # Requirements 17 | 18 | Using `pdf2video` requires the following external tools and services: 19 | 20 | * [Python](https://www.python.org/) version 3.6 or later. 21 | * The `pdfinfo` and `pdftoppm` command line tools provided in the [poppler PDF rendering library](https://poppler.freedesktop.org/). 22 | 23 | In Ubuntu Linux, you can install these with `sudo apt install poppler-utils`. 24 | 25 | For macOS, they are available at least from [Homebrew](https://brew.sh/) with `brew install poppler`. 26 | * The `ffmpeg` command line tool from the [`FFmpeg`](https://ffmpeg.org/) framework. 27 | 28 | In Ubuntu Linux, you can install it with `sudo apt install ffmpeg`. 29 | 30 | For macOS, it is available at least from [Homebrew](https://brew.sh/) with `brew install ffmpeg`. 31 | * Access to [Amazon Web Services](https://aws.amazon.com/). 32 | * The [AWS Command Line Interface](https://aws.amazon.com/cli/) configured with a [profile](https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-profiles.html) that can access the Polly service. To use the [neural voices](https://docs.aws.amazon.com/polly/latest/dg/ntts-voices-main.html) (recommended for the best quality), remember to select [a region in which they are supported](https://docs.aws.amazon.com/polly/latest/dg/NTTS-main.html). 
33 | 34 | # Installation 35 | 36 | One can use `pip` to install `pdf2video` directly from GitHub: 37 | ``` 38 | python3 -m pip install git+https://github.com/tjunttila/pdf2video.git 39 | ``` 40 | See the [PyPA Installing Packages tutorial](https://packaging.python.org/tutorials/installing-packages/) for information on installing Python packages and on Python virtual environments. 41 | 42 | # Usage 43 | 44 | In the simplest case, 45 | ``` 46 | pdf2video presentation.pdf script.txt video.mp4 47 | ``` 48 | converts the PDF file `presentation.pdf` and 49 | the UTF-8 encoded script file `script.txt` 50 | into the video `video.mp4` narrated by the default voice (Amazon Polly standard voice Joanna in the current version). 51 | The video includes SRT subtitles that can be displayed by most video players. 52 | In addition, for HTML use, [WebVTT subtitles](https://www.w3schools.com/tags/tag_track.asp) are produced in a separate file as well. 53 | 54 | The selected PDF pages as well as the narration voice can be changed easily. 55 | For instance, the [sample video](https://users.aalto.fi/tjunttil/pdf2video.mp4) was produced with the command 56 | ``` 57 | pdf2video sample.pdf sample.txt --pages "1,2,4-6" --voice Matthew --neural --conversational sample.mp4 58 | ``` 59 | All the options can be printed with `pdf2video --help`. 60 | 61 | The script file is formatted as follows. 62 | The script for each presentation page starts with a line `#page [name]` and 63 | the following text then contains the script. The optional `[name]` parameter, that can be used in the `--only` option of the tool, is a string of ascii letters and underscores, possibly followed by a non-negative number. For instance `defs` and `example_3` are valid names. 64 | 65 | A line starting with `%` is a comment and thus ignored. 
66 | 67 | In the script text, one can use the following modifiers: 68 | 69 | * `*text*` to read `text` in an emphasized style, 70 | * `@xyz@` to spell `xyz` as characters, 71 | * `#slow/text/` to read `text` at a slower rate, 72 | * `#high/text/` to use higher pitch for `text`, 73 | * `#low/text/` to use lower pitch for `text`, 74 | * `#n`, where `n` is a positive integer, to have a pause of length `n`*100ms, 75 | * `#ph/word/pronunciation/` to pronounce the `word` with the [X-SAMPA](https://en.wikipedia.org/wiki/X-SAMPA) `pronunciation`, and 76 | * `#sub/text/subtitle/` to use `subtitle` as the subtitle instead of the spoken `text`. 77 | 78 | Above, the `/` delimiter can be any other symbol not occurring in the "arguments" of the modifier. 79 | This allows one to nest modifiers. 80 | For instance, 81 | `#sub/big-#ph!Theta!Ti:.t@! of n/Θ(n)/` 82 | reads as "big-theta of n" but shows as `Θ(n)` in the subtitles. 83 | 84 | Please see the file [sample.txt](sample.txt) for examples. 85 | 86 | 87 | # Some good practices and hints 88 | 89 | * Converting a script with many pages to video can take some time. For developing and debugging the script text, it is recommended to name the script pages with `#page pagename`, and then use the `--only` option of the tool to convert only the page under development. 90 | * For pronunciations, one can find [IPA](https://en.wikipedia.org/wiki/International_Phonetic_Alphabet) pronunciations in many online dictionaries, and then convert them to X-SAMPA by using the table in the [X-SAMPA Wikipedia page](https://en.wikipedia.org/wiki/X-SAMPA). 91 | * Whenever possible, avoid using the `@xyz@` construct as it seems to change the pitch of the whole sentence. 92 | 93 | 94 | # License 95 | 96 | The `pdf2video` tool is released under the [MIT License](https://opensource.org/licenses/MIT). 
# --------------------------------------------------------------------------------
# /pdf2video/parser.py:
# --------------------------------------------------------------------------------
"""
Parser for pdf2video script file syntax.
Author: T. Junttila
License: The MIT License
"""

from abc import ABC, abstractmethod
import re
import sys

# NOTE(review): throughout this module the string literals that wrap SSML
# output appear EMPTY in this copy of the file (e.g. ''+children_ssml+'').
# The SSML markup was presumably lost when the file was extracted; the
# literals are kept exactly as they appear here. Confirm against the
# upstream repository before relying on the to_ssml() output.


class AST(ABC):
    """Base class for abstract syntax tree nodes."""

    @abstractmethod
    def to_ssml(self, neural):
        """Get the SSML representation of the sub-tree."""

    @abstractmethod
    def to_words(self):
        """Get the plain words representation of the sub-tree."""

    @abstractmethod
    def to_sub(self):
        """Get the sub-titles representation of the sub-tree."""


class _ASTContainer(AST):
    """Shared plumbing for nodes that wrap a list of child nodes."""

    def __init__(self, children):
        self.children = children

    def _children_ssml(self, neural):
        """Concatenate the SSML of all the children."""
        return "".join([child.to_ssml(neural) for child in self.children])

    def to_words(self):
        result = []
        for child in self.children:
            result += child.to_words()
        return result

    def to_sub(self):
        return "".join([child.to_sub() for child in self.children])


class ASTWord(AST):
    """An AST node for a word."""

    def __init__(self, text):
        super().__init__()
        self.text = text

    def to_ssml(self, neural):
        return self.text

    def to_words(self):
        return [self.text]

    def to_sub(self):
        return self.text


class ASTBreak(AST):
    """An AST node for a break; `time` is the integer given after '#'."""

    def __init__(self, time):
        self.time = time

    def to_ssml(self, neural):
        # NOTE(review): empty in this copy, `self.time` is not used here;
        # the break markup is presumably missing -- confirm upstream.
        return ''

    def to_words(self):
        return []

    def to_sub(self):
        return ''


class ASTDelim(AST):
    """An AST node for a delimiter."""

    def __init__(self, text):
        self.text = text

    def to_ssml(self, neural):
        return self.text

    def to_words(self):
        return []

    def to_sub(self):
        return self.text


class ASTSpace(AST):
    """An AST node for a white space."""

    def __init__(self):
        pass

    def to_ssml(self, neural):
        return ' '

    def to_words(self):
        return []

    def to_sub(self):
        return ' '


class ASTEmph(_ASTContainer):
    """An AST node for emphasized text."""

    def to_ssml(self, neural):
        children_ssml = self._children_ssml(neural)
        # NOTE(review): both wrappers are empty in this copy -- confirm upstream.
        if neural:
            return ''+children_ssml+''
        return ''+children_ssml+''


class ASTPhoneme(AST):
    """An AST node for text read with phonemes."""

    def __init__(self, text, xsampa):
        self.text = text
        self.xsampa = xsampa

    def to_ssml(self, neural):
        # NOTE(review): `self.xsampa` is not used in this copy; the phoneme
        # markup is presumably missing -- confirm upstream.
        return f'{self.text}'

    def to_words(self):
        # str.split() splits on white space runs like the former
        # re.split(r'\s+', text.strip()), but gives [] instead of a
        # phantom [''] word when the text is only white space.
        return self.text.split()

    def to_sub(self):
        return self.text


class ASTSub(_ASTContainer):
    """An AST node for text with different sub-title representation."""

    def __init__(self, children, subtitles):
        super().__init__(children)
        self.subtitles = subtitles

    def to_ssml(self, neural):
        return self._children_ssml(neural)

    def to_sub(self):
        return self.subtitles


class ASTSlow(_ASTContainer):
    """An AST node for text read slowly."""

    def to_ssml(self, neural):
        children_ssml = self._children_ssml(neural)
        # NOTE(review): wrappers are empty in this copy -- confirm upstream.
        return ''+children_ssml+''


class ASTLow(_ASTContainer):
    """An AST node for text read in low pitch."""

    def to_ssml(self, neural):
        children_ssml = self._children_ssml(neural)
        # NOTE(review): wrappers are empty in this copy -- confirm upstream.
        if neural:
            # prosody pitch not yet in neural TTS, make it slightly slower
            return ''+children_ssml+''
        return ''+children_ssml+''


class ASTHigh(_ASTContainer):
    """An AST node for text read in high pitch."""

    def to_ssml(self, neural):
        children_ssml = self._children_ssml(neural)
        # NOTE(review): wrappers are empty in this copy -- confirm upstream.
        if neural:
            # prosody pitch not yet in neural TTS, make it slightly faster
            return ''+children_ssml+''
        return ''+children_ssml+''


class ASTSayAs(AST):
    """An AST node for text read as letters."""

    def __init__(self, letters):
        self.letters = letters

    def to_ssml(self, neural):
        # NOTE(review): wrappers are empty in this copy -- confirm upstream.
        return ''+self.letters+''

    def to_words(self):
        # See ASTPhoneme.to_words for why str.split() is used.
        return self.letters.split()

    def to_sub(self):
        return self.letters


# Token patterns. They are raw strings so that `\1` really is a
# back-reference to the delimiter captured by `(.)`; in a non-raw string
# '\1' is the control character chr(1). The patterns are applied with
# Pattern.match(string, pos), which anchors at `pos`, so no '^' is needed.
_SUB_RE = re.compile(r'#sub(.)(?P<text>((?!\1).)*?)\1(?P<sub>((?!\1).)+?)\1')
_SLOW_RE = re.compile(r'#slow(.)(?P<text>((?!\1).)+?)\1')
_LOW_RE = re.compile(r'#low(.)(?P<text>((?!\1).)+?)\1')
_HIGH_RE = re.compile(r'#high(.)(?P<text>((?!\1).)+?)\1')
_PH_RE = re.compile(r'#ph(.)(?P<text>((?!\1).)+?)\1(?P<ph>((?!\1).)+?)\1')
_BREAK_RE = re.compile(r'#(?P<time>\d+)')
_EMPH_RE = re.compile(r'\*(?P<text>[^\*]+)\*')
_SAY_AS_RE = re.compile(r'@(?P<text>[^@]+)@')
_SPACE_RE = re.compile(r'\s+')
_NEGATIVE_RE = re.compile(r'-\d+')
_DELIM_RE = re.compile(r'[-.,:;!?"]')
# A word runs until the next white space, modifier start or delimiter.
_WORD_RE = re.compile(r'[^ \t#*@".,:;!?]+')

# Modifiers of the form "#keyword/text/" that wrap recursively parsed text.
_WRAPPERS = (
    ('#slow', _SLOW_RE, ASTSlow),
    ('#low', _LOW_RE, ASTLow),
    ('#high', _HIGH_RE, ASTHigh),
)


def parse_to_ast(string, err_linenum = None):
    """Parse the script text string into a sequence of AST nodes.

    Args:
        string: one line of script text.
        err_linenum: line number to mention in error messages, or None
            to omit it. It is also used for errors found inside nested
            modifiers.

    Returns:
        A list of AST nodes in the order they occur in the text.

    On malformed input an error message is printed and the process is
    terminated with sys.exit(1).
    """
    def err(msg):
        linenum_text = '' if err_linenum is None else f'On line {err_linenum}: '
        print(linenum_text+msg)
        sys.exit(1)

    def expect(pattern, what, pos):
        # Match `pattern` at `pos` or terminate with a "Malformed" error.
        match = pattern.match(string, pos)
        if match is None:
            err(f'Malformed {what} "{string[pos:]}"')
        return match

    def parse_nested(text):
        # Keep the line number so that nested errors are located as well.
        return parse_to_ast(text, err_linenum)

    def parse_command(pos):
        # Parse a '#...' modifier at `pos`; return (node, next position).
        if string.startswith('#sub', pos):
            match = expect(_SUB_RE, '#sub', pos)
            return (ASTSub(parse_nested(match['text']), match['sub']),
                    match.end())
        for (keyword, pattern, node_class) in _WRAPPERS:
            if string.startswith(keyword, pos):
                match = expect(pattern, keyword, pos)
                return (node_class(parse_nested(match['text'])), match.end())
        if string.startswith('#ph', pos):
            match = expect(_PH_RE, '#ph', pos)
            return (ASTPhoneme(match['text'], match['ph']), match.end())
        # Break #10
        match = _BREAK_RE.match(string, pos)
        if match is None:
            err(f'Unrecognized script command "{string[pos:]}"')
        return (ASTBreak(int(match['time'])), match.end())

    def parse_plain(pos):
        # Parse white space, a number, a delimiter or a word at `pos`.
        match = _SPACE_RE.match(string, pos)
        if match:
            return (ASTSpace(), match.end())
        # Negative numbers are words
        match = _NEGATIVE_RE.match(string, pos)
        if match:
            return (ASTWord(match.group(0)), match.end())
        # Delimiters
        match = _DELIM_RE.match(string, pos)
        if match:
            return (ASTDelim(match.group(0)), match.end())
        # Every character excluded by _WORD_RE is consumed by an earlier
        # branch, so at least one character always matches here.
        match = _WORD_RE.match(string, pos)
        return (ASTWord(match.group(0)), match.end())

    result = []
    i = 0
    string_length = len(string)
    while i < string_length:
        char = string[i]
        if char == '#':
            (node, i) = parse_command(i)
        elif char == '*':
            match = expect(_EMPH_RE, 'emphasis', i)
            (node, i) = (ASTEmph(parse_nested(match['text'])), match.end())
        elif char == '@':
            match = expect(_SAY_AS_RE, 'say-as', i)
            (node, i) = (ASTSayAs(match['text']), match.end())
        else:
            (node, i) = parse_plain(i)
        result.append(node)
    return result
276 |
def parse(string, neural):
    """Parse a script text line.

    Returns a triple (ssml, words, sub): the SSML text, the list of plain
    words, and the sub-title text of the line, each collected from the
    AST nodes produced by parse_to_ast in their order of occurrence.
    """
    nodes = parse_to_ast(string)
    ssml_text = "".join(node.to_ssml(neural) for node in nodes)
    all_words = [word for node in nodes for word in node.to_words()]
    sub_text = "".join(node.to_sub() for node in nodes)
    return (ssml_text, all_words, sub_text)
286 |

--------------------------------------------------------------------------------
/pdf2video/pdf2video.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python3
2 |
3 | """
4 | A small Python script for making videos by
5 | combining PDF and Amazon Polly narration.
6 | Author: T. Junttila
7 | License: The MIT License
8 | Requires:
9 | - pdfinfo
10 | - pdftoppm
11 | - ffmpeg
12 | - access to Amazon Web Services with a Polly-enabled profile
13 | """
14 |
15 | import argparse
16 | import hashlib
17 | import json
18 | import os
19 | import re
20 | import subprocess
21 | from subprocess import PIPE
22 | import sys
23 |
24 | from .parser import parse_to_ast, parse
25 |
# Voice names accepted by the tool. Several names are listed both with and
# without diacritics (e.g. 'Céline' / 'Celine').
# NOTE(review): 'Aditi' occurs twice in this list; kept as is.
voices = [
    'Zeina', 'Zhiyu', 'Naja', 'Mads', 'Lotte', 'Ruben',
    'Nicole', 'Russell', 'Amy', 'Emma', 'Brian', 'Aditi',
    'Raveena', 'Ivy', 'Joanna', 'Kendra', 'Kimberly', 'Salli',
    'Joey', 'Justin', 'Matthew', 'Geraint', 'Céline', 'Celine',
    'Léa', 'Mathieu', 'Chantal', 'Marlene', 'Vicki', 'Hans',
    'Aditi', 'Dóra', 'Dora', 'Karl', 'Carla', 'Bianca',
    'Giorgio', 'Mizuki', 'Takumi', 'Seoyeon', 'Liv', 'Ewa',
    'Maja', 'Jacek', 'Jan', 'Camila', 'Vitória', 'Vitoria',
    'Ricardo', 'Inês', 'Ines', 'Cristiano', 'Carmen', 'Tatyana',
    'Maxim', 'Conchita', 'Lucia', 'Enrique', 'Mia', 'Lupe',
    'Penélope', 'Penelope', 'Miguel', 'Astrid', 'Filiz', 'Gwyneth',
]

# Voice names listed for the neural engine.
# NOTE(review): 'Kevin' is listed here but not in `voices` above.
voices_neural = [
    'Amy', 'Emma', 'Brian', 'Ivy', 'Joanna', 'Kendra', 'Kimberly',
    'Salli', 'Joey', 'Justin', 'Kevin', 'Matthew', 'Camila', 'Lupe',
]

# Voice names listed for the conversational style.
voices_conversational = [
    'Joanna',
    'Matthew',
    'Lupe',
]
42 |
def millis_to_srt(millis):
    """Convert milliseconds time to the SRT subtitles format time string.

    The result has the form 'HH:MM:SS,mmm'; the hours field grows beyond
    two digits when needed. `divmod` keeps integer input in exact integer
    arithmetic instead of going through floating point division.
    """
    (total_seconds, milliseconds) = divmod(millis, 1000)
    (total_minutes, seconds) = divmod(total_seconds, 60)
    (hours, minutes) = divmod(total_minutes, 60)
    return '%02d:%02d:%02d,%03d' % (hours, minutes, seconds, milliseconds)
62 |
63 |
def parse_page_range(args, execute, error):
    """
    Parse the page range.

    Reads `args.pages`. When it is 'all' (the --pages parameter was not
    given), the command '<args.pdfinfo> <args.pdf_file>' is passed to
    `execute` and the page count is taken from the first 'Pages:' line
    of its decoded stdout; all the pages 1..count are selected.
    Otherwise `args.pages` is a comma-separated list of page numbers
    and 'start-end' ranges. `error` is called with a message when the
    input cannot be handled. Returns the list of selected page numbers.
    """
    if args.pages == 'all':
        # Use pdfinfo to find out the number of pages, select all
        cmd = f'{args.pdfinfo} {args.pdf_file}'
        output = execute(cmd).stdout.decode('utf-8')
        hits = (re.match(r'^Pages:\s*(\d+)\s*$', line)
                for line in output.split('\n'))
        nof_pages = next((int(hit.group(1)) for hit in hits if hit), None)
        if nof_pages is None:
            error(f'Could not read the number of pages with "{cmd}"')
        return list(range(1, nof_pages+1))

    # --pages parameter was given, parse it
    selected = []
    for piece in args.pages.split(","):
        comp = piece.strip()
        single = re.match(r'^(\d+)$', comp)
        if single:
            selected.append(int(single.group(1)))
            continue
        span = re.match(r'^(\d+)\s*-\s*(\d+)$', comp)
        if span:
            first = int(span.group(1))
            last = int(span.group(2))
            # An empty or overly long range falls through to the error
            if 0 < last - first + 1 < 10000:
                selected.extend(range(first, last+1))
                continue
        error('Invalid page range component: '+comp)
    return selected
100 |
101 |
def parse_only(args, scripts, scripts_names, error):
    """
    Parse the 'only' range.

    Reads `args.only`. When it is 'the full set' (the --only parameter
    was not given), every #page index is selected. Otherwise it is a
    comma-separated list whose components are a 1-based #page number,
    a numeric range 'a-b', a #page name, or a name range 'base<a>-<b>'
    standing for the names base<a> ... base<b>. `scripts_names` maps
    #page names to 0-based indices. `error` is called with a message for
    unknown pages and invalid components. Returns the set of selected
    0-based #page indices.
    """
    if args.only == 'the full set':
        # Include all the #pages
        return set(range(0, len(scripts)))

    chosen = set()

    def choose_number(num):
        # Select the #page with the 1-based number `num`.
        if num > len(scripts):
            error(f'#page {num} was selected in --only, '
                  f'but only {len(scripts)} #pages exists')
        chosen.add(num-1)

    def choose_name(name):
        # Select the #page called `name`.
        if name not in scripts_names:
            error(f'#page named "{name}" was selected in --only, '
                  f'but there is no #page with that name. '
                  f'Available #page names are: '
                  f'{",".join(sorted(scripts_names.keys()))}')
        chosen.add(scripts_names[name])

    for piece in args.only.split(","):
        comp = piece.strip()
        # Single number
        if re.match(r'^[1-9]\d*$', comp):
            choose_number(int(comp))
            continue
        # Numeric range
        found = re.match(r'^([1-9]\d*)\s*-\s*([1-9]\d*)$', comp)
        if found:
            first = int(found.group(1))
            last = int(found.group(2))
            if 0 < last - first + 1 < 10000:
                for num in range(first, last+1):
                    choose_number(num)
                continue
        # Single name
        if re.match(r'^[a-zA-Z_]+([1-9]\d*)?$', comp):
            choose_name(comp)
            continue
        # Name range
        found = re.match(r'^([a-zA-Z_]+)([1-9]\d*)-([1-9]\d*)$', comp)
        if found:
            base = found.group(1)
            first = int(found.group(2))
            last = int(found.group(3))
            if 0 < last - first + 1 < 10000:
                for suffix in range(first, last+1):
                    choose_name(base+str(suffix))
                continue
        error('Invalid "only" range component: '+comp)
    return chosen
164 |
165 |
def read_scripts(script_file, error):
    """
    Read all the scripts from a file.

    The file is read as UTF-8. Empty lines and lines starting with '%'
    are skipped. A line '#page' or '#page <name>' starts a new page; all
    the other lines are added to the current page.

    Args:
        script_file: the path of the script file.
        error: a callback taking an error message. It is called when the
            file cannot be read, when a #page name is defined twice (the
            message points to the line of the second definition), when a
            line starts with '#page' but is not a valid header, and when
            text occurs before the first '#page' line.

    Returns:
        A pair (scripts, scripts_names): `scripts` is a list with one
        entry per page, each a list of (line text, line number) pairs,
        and `scripts_names` maps the page names to indices in `scripts`.
    """
    # Only the file access is guarded, so that exceptions raised by the
    # `error` callback during parsing are never mistaken for read errors.
    try:
        with open(script_file, 'r', encoding='utf-8') as file_object:
            lines = file_object.readlines()
    except (IOError, UnicodeDecodeError):
        error(f'Could not read the script file "{script_file}"')
        return ([], {})

    def err(linenum, msg):
        error(f'on line {linenum}: {msg}')

    page_header = re.compile(
        r'^#page\s*(?P<name>\s+[a-zA-Z_]+([1-9]\d*)?)?\s*$')
    scripts = []
    scripts_names = {}
    script = None  # the lines of the page being read, None before any #page
    for (linenum, line) in enumerate(lines, start=1):
        line = line.rstrip()
        if line == '' or line.startswith('%'):
            # Ignore empty lines and comments
            continue
        # A "#page" line starting a new page?
        match = page_header.match(line)
        if match:
            name = match['name']
            if name is not None:
                name = name.strip()
                if name in scripts_names:
                    err(linenum, f'#page named "{name}" defined twice')
                # The new page gets the next free index
                scripts_names[name] = len(scripts)
            script = []
            scripts.append(script)
            continue
        if line.startswith("#page"):
            err(linenum, "Malformed #page line: "+line)
        if script is None:
            err(linenum,
                'In the script file, all text should be after a "#page" block')
            continue
        # Add the line to the current page
        script.append((line, linenum))
    return (scripts, scripts_names)
222 |
223 |
def script_to_ssml_and_hash(script, args):
    """
    Transform a script to SSML.

    `script` is a sequence of (line text, line number) pairs and `args`
    provides the `voice`, `neural` and `conversational` settings.
    Returns a pair (ssml, digest) where `digest` is the SHA-256 hex
    digest of the voice, the two style flags and the SSML of every line;
    it is meant for caching the audio files produced by the TTS system.

    NOTE(review): the string literals that frame the document and each
    line appear EMPTY in this copy of the file, and `page_linenum` is
    not referenced by them. The SSML markup (including the marks used
    for subtitle synchronization) was presumably lost in extraction;
    the literals are reproduced exactly as they appear here. Confirm
    against the upstream repository.
    """
    digest = hashlib.sha256()
    for ingredient in (args.voice, str(args.neural), str(args.conversational)):
        digest.update(ingredient.encode('utf-8'))

    # Document opening
    pieces = ['', '']
    if args.conversational:
        pieces.append('')
    pieces.append('\n')

    for (page_linenum, (line, linenum)) in enumerate(script):
        nodes = parse_to_ast(line, linenum)
        line_ssml = ''.join([
            # Start-of-the-line marks for subtitle synchronization
            f'',
            # Line contents in SSML
            ''.join([node.to_ssml(args.neural) for node in nodes]),
            '\n',
            # End-of-the-line marks for subtitle synchronization
            f'',
        ])
        pieces.append(line_ssml)
        digest.update(line_ssml.encode('utf-8'))

    # Document closing
    if args.conversational:
        pieces.append('')
    pieces.append('')
    pieces.append('\n')
    return (''.join(pieces), digest.hexdigest())
256 |
257 |
def main():
    """
    The main routine.

    Parses the command line, converts the selected PDF pages to images,
    synthesizes (or fetches from the cache) the narration and the speech
    marks with AWS Polly, writes the SRT subtitles, combines each page
    image with its audio track, and finally concatenates the per-page
    videos to the output file. Unless subtitles are ignored, a WebVTT file
    is also written next to the output file.

    Always terminates the process: exit status 0 on success, and status 1
    with a message on stderr (via argparse) on errors. The temporary files
    created so far are removed in both cases.
    """
    description = ('A tool for converting PDF presentations into '
                   'narrated videos. Please see '
                   'https://github.com/tjunttila/pdf2video/ for more details.')
    argp = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=description)
    argp.add_argument('--voice', metavar='V', default='Joanna',
                      help='the applied TTS voice')
    argp.add_argument('--neural', action='store_true',
                      help='use neural TTS')
    argp.add_argument('--conversational', action='store_true',
                      help='use conversational style')
    argp.add_argument('--aws_profile', metavar='A', default='default',
                      help='a Polly-enabled AWS profile')
    argp.add_argument('--audio_cache', metavar='C', default='pdf2video-cache',
                      help='the directory for caching TTS audio files')
    argp.add_argument('--temp_prefix', metavar='T', default='pdf2video-temp',
                      help='the prefix for the created temporary files')
    argp.add_argument('--ignore_subtitles', action='store_true',
                      help='do not include or produce subtitles')
    argp.add_argument('--quiet', action='store_true',
                      help='do not print progress information')
    argp.add_argument('--pages', metavar='P', default='all',
                      help='The PDF page range of the form "1,3,4-7,1". '
                           'Defines the mapping from the #page texts '
                           'in the script file to selected PDF pages.')
    argp.add_argument('--only', metavar='O', default='the full set',
                      help='Only compile the selected #page texts. '
                           'Used mainly during the development to select '
                           'some of the #pages. A comma-separated set of '
                           '#page identifiers, which can be (i) numbers, '
                           '(ii) #page names, or (iii) ranges of those. '
                           'Example: "1,usage,scripts_1-2" compiles the '
                           'first #page, the ones named usage, scripts_1, '
                           'and scripts_2.')
    argp.add_argument('--ffmpeg', default='ffmpeg',
                      help='the FFmpeg command line tool executable')
    argp.add_argument('--pdfinfo', default='pdfinfo',
                      help='the "pdfinfo" executable from Poppler utils')
    argp.add_argument('--pdftoppm', default='pdftoppm',
                      help='the "pdftoppm" executable from Poppler utils')
    argp.add_argument('pdf_file', help="the input PDF file")
    argp.add_argument('script_file', help="the input script file")
    argp.add_argument('output_file', help="the output mp4 video file")
    args = argp.parse_args()

    def verbose(msg):
        """Print a progress message unless --quiet was given."""
        if not args.quiet:
            print(msg)

    # The first three lists are indexed by the script/page position and
    # hold None for the positions that are not compiled.
    temp_image_files = []
    temp_ssml_files = []
    temp_ts_files = []
    # Other temporary files: the intermediate videos and the concat list.
    temp_misc_files = []

    def unlink(file_name):
        """Remove a file; None and already missing files are ignored."""
        if file_name is None:
            return
        try:
            os.unlink(file_name)
        except FileNotFoundError:
            pass

    def clean_temps():
        """Remove the created temporary files."""
        for file_names in (temp_image_files, temp_ssml_files,
                           temp_ts_files, temp_misc_files):
            for file_name in file_names:
                unlink(file_name)

    def error(msg):
        """Clean up, print the message, and exit with status 1."""
        clean_temps()
        argp.exit(1, msg+'\n')

    def execute(cmd):
        """
        Run an external command and return the CompletedProcess.

        The command is either an argument list, which is passed on as is so
        that arguments containing whitespace stay intact, or a string,
        which is split at whitespace. A command that cannot be started or
        that returns a non-zero status ends the program through error().
        """
        if isinstance(cmd, str):
            argv = re.split(r'\s+', cmd.strip())
            shown = cmd
        else:
            argv = [str(arg) for arg in cmd]
            shown = ' '.join(argv)
        try:
            exec_result = subprocess.run(argv, stdout=subprocess.PIPE,
                                         stderr=subprocess.PIPE, check=False)
        except (OSError, ValueError) as err:
            # error() does not return
            error(f'Error when executing "{shown}".\n'+str(err))
        if exec_result.returncode != 0:
            stderr_text = exec_result.stderr.decode('utf-8', errors='replace')
            error(f'Error when executing "{shown}". The last 10 lines of '
                  f'the stderr output is as follows:\n' +
                  '\n'.join(stderr_text.splitlines()[-10:]))
        return exec_result

    def make_dir(dir_name):
        """Make sure that the directory exists, creating it if needed."""
        if os.path.exists(dir_name) and not os.path.isdir(dir_name):
            error("Not a directory: "+dir_name)
        try:
            os.makedirs(dir_name, exist_ok=True)
        except OSError as err:
            error(f'Cannot create the directory {dir_name}: {err}')

    def srt_file_for(cached_file):
        """The subtitle file stored in the cache next to a cached file."""
        return os.path.splitext(cached_file)[0] + '.srt'

    if not args.output_file.endswith(".mp4"):
        error("The output file name must end with .mp4")

    pages = parse_page_range(args, execute, error)

    # Check voice arguments consistency
    if args.voice not in voices:
        error(f'Unsupported voice {args.voice}. '
              f'The available voices are {", ".join(voices)}.')
    if args.neural and args.voice not in voices_neural:
        error(f'The voice {args.voice} is not available in neural TTS. '
              f'The available neural voices are {", ".join(voices_neural)}.')
    if args.conversational:
        # The conversational style always uses the neural engine
        args.neural = True
        if args.voice not in voices_conversational:
            error(f'The voice {args.voice} is not available in '
                  f'conversational style. The available conversational '
                  f'voices are {", ".join(voices_conversational)}.')

    (scripts, scripts_names) = read_scripts(args.script_file, error)

    make_dir(args.audio_cache)

    if len(scripts) != len(pages):
        error(f'{len(pages)} PDF pages selected but the script file '
              f'contains {len(scripts)} scripts')

    # The 0-based positions of the scripts/pages that are compiled
    only = parse_only(args, scripts, scripts_names, error)

    # Select and convert selected pages to images
    for (index, page_num) in enumerate(pages):
        if index not in only:
            temp_image_files.append(None)
            continue
        verbose(f'Extracting and converting PDF page {page_num}')
        image_file = f'{args.temp_prefix}-{index+1}'
        temp_image_files.append(image_file+".ppm")
        execute([args.pdftoppm, '-scale-to-y', '1080', '-scale-to-x', '-1',
                 '-f', str(page_num), '-singlefile',
                 args.pdf_file, image_file])

    # Make audio files with AWS Polly (cache the results)
    audio_files = []
    marks_files = []
    aws_cmd = ['aws']
    if args.aws_profile != 'default':
        aws_cmd += ['--profile', args.aws_profile]
    engine_args = ['--engine', 'neural'] if args.neural else []
    for (index, script) in enumerate(scripts):
        if index not in only:
            temp_ssml_files.append(None)
            audio_files.append(None)
            marks_files.append(None)
            continue
        #
        # Audio track
        #
        verbose(f'Making the audio track {index+1}')
        (ssml, hash_hex) = script_to_ssml_and_hash(script, args)
        ssml_file = f'{args.temp_prefix}-{index+1}.ssml'
        temp_ssml_files.append(ssml_file)
        with open(ssml_file, "w", encoding='utf-8') as file_handle:
            file_handle.write(ssml)
        audio_file = os.path.join(args.audio_cache, hash_hex+".mp3")
        marks_file = os.path.join(args.audio_cache, hash_hex+".mrk")
        polly_cmd = aws_cmd + ['polly', 'synthesize-speech',
                               '--text-type', 'ssml',
                               '--text', f'file://{ssml_file}']
        # Use Polly to generate the MP3 file if not in cache
        if os.path.isfile(audio_file):
            verbose('  Audio file found in cache')
        else:
            verbose('  Calling Polly for the audio file')
            execute(polly_cmd +
                    ['--output-format', 'mp3', '--voice-id', args.voice] +
                    engine_args + [audio_file])
        audio_files.append(audio_file)
        #
        # Speech marks for subtitles
        #
        if not args.ignore_subtitles:
            # Use Polly to generate the speech marks JSON file if not in cache
            if os.path.isfile(marks_file):
                verbose('  Speech marks found in cache')
            else:
                verbose('  Calling Polly for speech marks')
                execute(polly_cmd +
                        ['--output-format', 'json', '--speech-mark-types',
                         'sentence', 'word', 'viseme', 'ssml',
                         '--voice-id', args.voice] +
                        engine_args + [marks_file])
        marks_files.append(marks_file)

    if not args.ignore_subtitles:
        #
        # Make srt subtitles
        #
        # The names of the start ("s<N>") and end ("e<N>") marks of line N
        mark_name = re.compile(r'^(?P<kind>[se])(?P<num>\d+)$')
        for (index, script) in enumerate(scripts):
            if index not in only:
                continue
            # Read the speech marks (one JSON object per line),
            # keep only the start and end-of-the-line marks
            marks_file = marks_files[index]
            starts = {}
            ends = {}
            with open(marks_file, 'r', encoding='utf-8') as f:
                for marks_line in f:
                    if not marks_line.strip():
                        continue
                    mark = json.loads(marks_line)
                    if mark['type'] != 'ssml':
                        continue
                    match = mark_name.match(mark['value'])
                    if match is None:
                        continue
                    times = starts if match['kind'] == 's' else ends
                    times[int(match['num'])] = mark['time']
            srts = []
            for (page_linenum, (line, _linenum)) in enumerate(script):
                if line.strip() == '':
                    continue
                (_ssml, words, sub) = parse(line, args.neural)
                if len(words) == 0:
                    continue
                if page_linenum not in starts or page_linenum not in ends:
                    error(f'The speech marks file "{marks_file}" has no '
                          f'start or end mark for the line {page_linenum+1} '
                          f'of the script {index+1}')
                srts.append({'start': starts[page_linenum],
                             'end': ends[page_linenum],
                             'text': sub})

            with open(srt_file_for(marks_file), 'w', encoding='utf-8') as f:
                for (srt_index, srt) in enumerate(srts):
                    f.write(f'{srt_index+1}\n')
                    f.write(millis_to_srt(srt['start'])+' --> ' +
                            millis_to_srt(srt['end'])+'\n')
                    f.write(srt['text']+'\n')
                    f.write('\n')

    # Combine images and audios to transport streams
    for index in range(len(pages)):
        if index not in only:
            continue
        verbose(f'Combining PDF page and audio: {index+1}')
        ts_file = f'{args.temp_prefix}-{index+1}.mp4'
        # The video without subtitles. The name is derived by appending to
        # the prefix so that a prefix containing a directory also works.
        plain_file = f'{args.temp_prefix}-{index+1}-nosubs.mp4'
        temp_ts_files.append(ts_file)
        temp_misc_files.append(plain_file)
        audio_file = audio_files[index]
        execute([args.ffmpeg, '-y', '-loop', '1',
                 '-i', temp_image_files[index], '-i', audio_file,
                 '-shortest', '-c:v', 'libx264',
                 '-vf', 'scale=-2:1080,format=yuv420p', '-c:a', 'copy',
                 '-tune', 'stillimage', plain_file])
        if args.ignore_subtitles:
            os.replace(plain_file, ts_file)
            continue
        verbose('  Adding subtitles')
        srt_file = srt_file_for(audio_file)
        if os.stat(srt_file).st_size == 0:
            # Nothing to add: an empty subtitle file is not given to FFmpeg
            os.replace(plain_file, ts_file)
        else:
            execute([args.ffmpeg, '-y', '-i', plain_file, '-i', srt_file,
                     '-c', 'copy', '-c:s', 'mov_text',
                     '-metadata:s:s:0', 'language=eng', ts_file])
            unlink(plain_file)

    # Combine the transport streams
    verbose(f'Combining the transport streams to "{args.output_file}"')
    lst_file = f'{args.temp_prefix}.lst'
    temp_misc_files.append(lst_file)
    with open(lst_file, 'w', encoding='utf-8') as f:
        for ts_file in temp_ts_files:
            # The concat demuxer resolves relative paths against the
            # directory of the list file, so absolute paths are written.
            # They are single-quoted (with embedded quotes escaped) and
            # the demuxer is run with "-safe 0" to accept them.
            quoted = os.path.abspath(ts_file).replace("'", "'\\''")
            f.write(f"file '{quoted}'\n")
    execute([args.ffmpeg, '-y', '-f', 'concat', '-safe', '0', '-i', lst_file,
             '-c:v', 'copy', '-c:a', 'aac', '-c:s', 'copy',
             '-strict', '-2', args.output_file])

    if not args.ignore_subtitles:
        # Produce the WebVTT subtitles (for HTML)
        vtt_file = args.output_file[:-4]+'.vtt'
        verbose(f'Producing WebVTT subtitles at "{vtt_file}"')
        execute([args.ffmpeg, '-y', '-i', args.output_file, vtt_file])

    clean_temps()
    sys.exit(0)
539 |
540 |
# Run the converter when this file is executed directly as a script.
if __name__ == '__main__':
    main()
543 |

--------------------------------------------------------------------------------}