├── .gitignore
├── README.md
└── convert.py


/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | pip-wheel-metadata/
 24 | share/python-wheels/
 25 | *.egg-info/
 26 | .installed.cfg
 27 | *.egg
 28 | MANIFEST
 29 | 
 30 | # PyInstaller
 31 | #  Usually these files are written by a python script from a template
 32 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 33 | *.manifest
 34 | *.spec
 35 | 
 36 | # Installer logs
 37 | pip-log.txt
 38 | pip-delete-this-directory.txt
 39 | 
 40 | # Unit test / coverage reports
 41 | htmlcov/
 42 | .tox/
 43 | .nox/
 44 | .coverage
 45 | .coverage.*
 46 | .cache
 47 | nosetests.xml
 48 | coverage.xml
 49 | *.cover
 50 | *.py,cover
 51 | .hypothesis/
 52 | .pytest_cache/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | target/
 76 | 
 77 | # Jupyter Notebook
 78 | .ipynb_checkpoints
 79 | 
 80 | # IPython
 81 | profile_default/
 82 | ipython_config.py
 83 | 
 84 | # pyenv
 85 | .python-version
 86 | 
 87 | # pipenv
 88 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 89 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 90 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 91 | #   install all needed dependencies.
 92 | #Pipfile.lock
 93 | 
 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
 95 | __pypackages__/
 96 | 
 97 | # Celery stuff
 98 | celerybeat-schedule
 99 | celerybeat.pid
100 | 
101 | # SageMath parsed files
102 | *.sage.py
103 | 
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 | 
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 | 
117 | # Rope project settings
118 | .ropeproject
119 | 
120 | # mkdocs documentation
121 | /site
122 | 
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 | 
128 | # Pyre type checker
129 | .pyre/
130 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # arxiv-reader - Converting arXiv papers to audio.
 2 | ## Dr. Tristan Behrens, [LinkedIn](https://www.linkedin.com/in/dr-tristan-behrens-734967a2/).
 3 | 
 4 | Reads arXiv papers using Text-to-Speech. Uses the Facebook model `facebook/fastspeech2-en-ljspeech`.
 5 | 
 6 | ## Installation.
 7 | 
 8 | Install these requirements:
 9 | 
10 | ```
11 | pip install transformers
12 | pip install g2p_en
13 | pip install beautifulsoup4
14 | pip install arxiv-downloader
15 | pip install fairseq
16 | ```
17 | 
18 | The script also requires pandoc. If you are running on a Mac with Homebrew installed, you can install pandoc with:
19 | 
20 | ```
21 | brew install pandoc
22 | ```
23 | 
24 | If you do not run a Mac with Homebrew - which is a reason for regret - follow the installation instructions for your OS.
25 | 
26 | ## How to run.
27 | 
28 | If you want to convert "Attention is all you need" to speech, you can run the following command:
29 | 
30 | ```
31 | python convert.py "1706.03762"
32 | ```
33 | 
34 | It is best to put quotes around the arXiv ID.
35 | 
36 | ## Issues and features.
37 | 
38 | Feel free to add more features and improve things. Pull requests are more than welcome. And do not hesitate to open GitHub issues.
39 | 
40 | Have fun!
41 | 


--------------------------------------------------------------------------------
/convert.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2022 Tristan Behrens.
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | 
 15 | # Lint as: python3
 16 | 
 17 | import os
 18 | import glob
 19 | import bs4
 20 | from bs4 import BeautifulSoup
 21 | from fairseq.checkpoint_utils import load_model_ensemble_and_task_from_hf_hub
 22 | from fairseq.models.text_to_speech.hub_interface import TTSHubInterface
 23 | import scipy
 24 | import numpy as np
 25 | import sys
 26 | 
 27 | 
 28 | # Get the paper ID from the command line. If none is provided work with "Attention is all you need."
 29 | if len(sys.argv) == 1:
 30 |     paper_id = "1706.03762"
 31 | else:
 32 |     paper_id = sys.argv[1]
 33 | 
 34 | 
 35 | def main():
 36 | 
 37 |     # Make sure there is a temp directory. Delete it if it exists. Switch into it.
 38 |     if os.path.exists("temp"):
 39 |         os.system("rm -rf temp")
 40 |     os.mkdir("temp")
 41 |     os.chdir("temp")
 42 | 
 43 |     # Download the paper as .tar.gz
 44 |     print(f"Downloading paper {paper_id}...")
 45 |     os.system(f"arxiv-downloader --id {paper_id} --source")
 46 | 
 47 |     # Find the .tar.gz file.
 48 |     try:
 49 |         tar_gz_file = glob.glob(f"{paper_id}*.tar.gz")[0]
 50 |     except:
 51 |         print(f"Could not find the .tar.gz file for {paper_id}. Maybe the download did not work?")
 52 |         exit
 53 | 
 54 |     # Extract the .tar.gz file to a temp folder.
 55 |     os.system(f"tar -xzf {tar_gz_file}")
 56 | 
 57 |     # Convert to HTML.
 58 |     get_sentences_from_tex()
 59 | 
 60 |     # Go back. Up one level.
 61 |     os.chdir("..")
 62 | 
 63 |     # Convert to wav.
 64 |     convert_sentences_to_wav()
 65 | 
 66 |     # Remove temp folder.
 67 |     os.system(f"rm -rf temp")
 68 | 
 69 | 
 70 | def get_sentences_from_tex():
 71 | 
 72 |     # Find all the .tex files in the temp folder.
 73 |     tex_files = glob.glob(f"*.tex")
 74 | 
 75 |     # Find all the tex files whose content start with the string \documentclass.
 76 |     documentclass_files = []
 77 |     for tex_file in tex_files:
 78 |         with open(tex_file, "r") as f:
 79 |             if f.readline().startswith("\documentclass"):
 80 |                 documentclass_files.append(tex_file)
 81 |     assert len(documentclass_files) == 1, "There should be only one documentclass file."
 82 |     documentclass_file = documentclass_files[0]
 83 | 
 84 |     # Convert the .tex file to .md file.
 85 |     os.system(f"pandoc {documentclass_file} -o {paper_id}.html -t html5")
 86 | 
 87 |     # Load the .html file with BeautifulSoup4.
 88 |     with open(f"{paper_id}.html", "r") as f:
 89 |         html = f.read()
 90 |     soup = BeautifulSoup(html, "html.parser")
 91 | 
 92 |     # Cleanup. Rigorouslly, we want to remove a lot of tags.
 93 |     for element in soup.find_all("span", class_="citation"):
 94 |         element.decompose()
 95 |     for element in soup.find_all("span", class_="math inline"):
 96 |         element.decompose()
 97 |     for element in soup.find_all("span", class_="math display"):
 98 |         element.decompose()
 99 |     for element in soup.find_all("div", class_="figure"):
100 |         element.decompose()
101 |     for element in soup.find_all("div", class_="figure*"):
102 |         element.decompose()
103 |     for element in soup.find_all("div", class_="thebibliography"):
104 |         element.decompose()
105 |     for element in soup.find_all("div", class_="center"):
106 |         element.decompose()
107 |     for element in soup.find_all("section", class_="footnotes"):
108 |         element.decompose()
109 |     for element in soup.find_all("a"):
110 |         element.decompose()
111 |     for element in soup.find_all("figure"):
112 |         element.decompose()
113 | 
114 |     # Write the .html file back.
115 |     with open(f"{paper_id}_cleaned.html", "w") as f:
116 |         f.write(soup.prettify())
117 | 
118 |     # Read that .html file back line by line.
119 |     with open(f"{paper_id}_cleaned.html", "r") as f:
120 |         lines = f.readlines()
121 | 
122 |     # Convert to sentences. Go through the lines
123 |     sentences = []
124 |     accumumlated_sentence = ""
125 |     for line in lines:
126 |         if line.startswith("<"):
127 | 
128 |             # Opening tags that we expect.
129 |             if line.startswith("<p") or line.startswith("<h1") or line.startswith("<h2") or line.startswith("<h3") or line.startswith("<h4"):
130 |                 pass
131 |             
132 |             # Closing tags that we expect. 
133 |             elif line.startswith("</p>") or line.startswith("</h1>") or line.startswith("</h2>") or line.startswith("</h3>") or line.startswith("</h4>"):
134 |                 accumumlated_sentence = accumumlated_sentence.replace("\n", " ")
135 |                 
136 |                 # Split by period so that we can insert a pause.
137 |                 for x in accumumlated_sentence.split("."):
138 |                     sentences.append(x.strip())
139 |                     sentences.append("<PAUSE>")
140 |                 
141 |                 # Start over and add pause.
142 |                 accumumlated_sentence = ""
143 |                 sentences.append("<PAUSE>")
144 |             
145 |             else:
146 |                 print(f"Unexpected HTML tag: {line}")
147 |         
148 |         # Accumulate texts.
149 |         else:
150 |             accumumlated_sentence += line
151 | 
152 |     # Write to a file.
153 |     with open(f"{paper_id}_sentences.txt", "w") as f:
154 |         for sentence in sentences:
155 |             if sentence.strip() != "":
156 |                 f.write(sentence + "\n")
157 | 
158 |     # Done.
159 |     return sentences
160 | 
161 | 
def convert_sentences_to_wav():
    """Synthesize ``temp/{paper_id}_sentences.txt`` into ``{paper_id}.wav``.

    Every non-empty line of the sentences file is either the marker
    ``<PAUSE>``, which becomes one second of silence, or a sentence that is
    spoken with the ``facebook/fastspeech2-en-ljspeech`` model. The pieces
    are concatenated in order and written as a float32 WAV file to the
    current directory. Uses the module-level ``paper_id``.
    """
    # `import scipy` alone does not guarantee that the `scipy.io.wavfile`
    # submodule is loaded, so import it explicitly.
    from scipy.io import wavfile

    # Load lines from the .txt file.
    with open(f"temp/{paper_id}_sentences.txt", "r") as f:
        sentences = [line.strip() for line in f]

    # Load the model.
    print("Loading TTS model...")
    models, cfg, task = load_model_ensemble_and_task_from_hf_hub(
        "facebook/fastspeech2-en-ljspeech",
        arg_overrides={"vocoder": "hifigan", "fp16": False}
    )
    TTSHubInterface.update_cfg_with_data_cfg(cfg, task.data_cfg)
    generator = task.build_generator(models, cfg)

    # Generate line by line. Audio is collected as whole arrays; a pause is
    # recorded as None and only turned into silence once the sample rate
    # reported by the model is known.
    print("Generating...")
    chunks = []
    rate = 44100  # Fallback, used only if no sentence is synthesized.
    for text in sentences:

        print(f"Text: \"{text}\"")
        if text == "":
            continue

        # Insert a pause.
        if text == "<PAUSE>":
            chunks.append(None)
            continue

        # Create the sample.
        sample = TTSHubInterface.get_model_input(task, text)
        wav, rate = TTSHubInterface.get_prediction(task, models[0], generator, sample)

        # Map wav from torch tensor to numpy array. detach()/cpu() make this
        # work regardless of the tensor's device or autograd state.
        chunks.append(wav.detach().cpu().numpy())

    # One second of silence at the output sample rate.
    silence = np.zeros(int(rate), dtype=np.float32)
    pieces = [silence if chunk is None else chunk for chunk in chunks]

    # Join everything into a single float32 array.
    if pieces:
        full_wave_file = np.concatenate(pieces).astype(np.float32, copy=False)
    else:
        full_wave_file = np.zeros(0, dtype=np.float32)

    # Save the generated audio to a file.
    wav_path = f"{paper_id}.wav"
    print(f"Saving {wav_path}")
    wavfile.write(wav_path, rate, full_wave_file)
    print("Done.")
213 | 
214 | 
# Script entry point: run the conversion only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()


--------------------------------------------------------------------------------