├── .gitignore ├── README.md └── convert.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # arxiv-reader - Converting arXiv papers to audio. 2 | ## Dr. Tristan Behrens, [LinkedIn](https://www.linkedin.com/in/dr-tristan-behrens-734967a2/). 3 | 4 | Reads arXiv papers using Text-to-Speech. Uses the Facebook model `facebook/fastspeech2-en-ljspeech`. 5 | 6 | ## Installation. 7 | 8 | Install these requirements: 9 | 10 | ``` 11 | pip install transformers 12 | pip install g2p_en 13 | pip install beautifulsoup4 14 | pip install arxiv-downloader 15 | pip install fairseq 16 | ``` 17 | 18 | The script also requires pandoc. If you are running on a Mac with Homebrew installed, you can install pandoc with: 19 | 20 | ``` 21 | brew install pandoc 22 | ``` 23 | 24 | If you do not run a Mac with Homebrew - which is a reason for regret - follow the installation instructions for your OS. 25 | 26 | ## How to run. 27 | 28 | If you want to convert "Attention is all you need" to speech, you can run the following command: 29 | 30 | ``` 31 | python convert.py "1706.03762" 32 | ``` 33 | 34 | Better use the quotes around the arXiv ID. 35 | 36 | ## Issues and features. 37 | 38 | Feel free to add more features and improve things. Pull requests are more than welcome. And do not hesitate GitHub issues. 39 | 40 | Have fun! 41 | -------------------------------------------------------------------------------- /convert.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 Tristan Behrens. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # Lint as: python3 16 | 17 | import os 18 | import glob 19 | import bs4 20 | from bs4 import BeautifulSoup 21 | from fairseq.checkpoint_utils import load_model_ensemble_and_task_from_hf_hub 22 | from fairseq.models.text_to_speech.hub_interface import TTSHubInterface 23 | import scipy 24 | import numpy as np 25 | import sys 26 | 27 | 28 | # Get the paper ID from the command line. If none is provided work with "Attention is all you need." 29 | if len(sys.argv) == 1: 30 | paper_id = "1706.03762" 31 | else: 32 | paper_id = sys.argv[1] 33 | 34 | 35 | def main(): 36 | 37 | # Make sure there is a temp directory. Delete it if it exists. Switch into it. 38 | if os.path.exists("temp"): 39 | os.system("rm -rf temp") 40 | os.mkdir("temp") 41 | os.chdir("temp") 42 | 43 | # Download the paper as .tar.gz 44 | print(f"Downloading paper {paper_id}...") 45 | os.system(f"arxiv-downloader --id {paper_id} --source") 46 | 47 | # Find the .tar.gz file. 48 | try: 49 | tar_gz_file = glob.glob(f"{paper_id}*.tar.gz")[0] 50 | except: 51 | print(f"Could not find the .tar.gz file for {paper_id}. Maybe the download did not work?") 52 | exit 53 | 54 | # Extract the .tar.gz file to a temp folder. 55 | os.system(f"tar -xzf {tar_gz_file}") 56 | 57 | # Convert to HTML. 58 | get_sentences_from_tex() 59 | 60 | # Go back. Up one level. 61 | os.chdir("..") 62 | 63 | # Convert to wav. 64 | convert_sentences_to_wav() 65 | 66 | # Remove temp folder. 67 | os.system(f"rm -rf temp") 68 | 69 | 70 | def get_sentences_from_tex(): 71 | 72 | # Find all the .tex files in the temp folder. 73 | tex_files = glob.glob(f"*.tex") 74 | 75 | # Find all the tex files whose content start with the string \documentclass. 76 | documentclass_files = [] 77 | for tex_file in tex_files: 78 | with open(tex_file, "r") as f: 79 | if f.readline().startswith("\documentclass"): 80 | documentclass_files.append(tex_file) 81 | assert len(documentclass_files) == 1, "There should be only one documentclass file." 82 | documentclass_file = documentclass_files[0] 83 | 84 | # Convert the .tex file to .md file. 85 | os.system(f"pandoc {documentclass_file} -o {paper_id}.html -t html5") 86 | 87 | # Load the .html file with BeautifulSoup4. 88 | with open(f"{paper_id}.html", "r") as f: 89 | html = f.read() 90 | soup = BeautifulSoup(html, "html.parser") 91 | 92 | # Cleanup. Rigorouslly, we want to remove a lot of tags. 93 | for element in soup.find_all("span", class_="citation"): 94 | element.decompose() 95 | for element in soup.find_all("span", class_="math inline"): 96 | element.decompose() 97 | for element in soup.find_all("span", class_="math display"): 98 | element.decompose() 99 | for element in soup.find_all("div", class_="figure"): 100 | element.decompose() 101 | for element in soup.find_all("div", class_="figure*"): 102 | element.decompose() 103 | for element in soup.find_all("div", class_="thebibliography"): 104 | element.decompose() 105 | for element in soup.find_all("div", class_="center"): 106 | element.decompose() 107 | for element in soup.find_all("section", class_="footnotes"): 108 | element.decompose() 109 | for element in soup.find_all("a"): 110 | element.decompose() 111 | for element in soup.find_all("figure"): 112 | element.decompose() 113 | 114 | # Write the .html file back. 115 | with open(f"{paper_id}_cleaned.html", "w") as f: 116 | f.write(soup.prettify()) 117 | 118 | # Read that .html file back line by line. 119 | with open(f"{paper_id}_cleaned.html", "r") as f: 120 | lines = f.readlines() 121 | 122 | # Convert to sentences. Go through the lines 123 | sentences = [] 124 | accumumlated_sentence = "" 125 | for line in lines: 126 | if line.startswith("<"): 127 | 128 | # Opening tags that we expect. 129 | if line.startswith("
") or line.startswith("") or line.startswith("") or line.startswith("") or line.startswith(""):
134 | accumumlated_sentence = accumumlated_sentence.replace("\n", " ")
135 |
136 | # Split by period so that we can insert a pause.
137 | for x in accumumlated_sentence.split("."):
138 | sentences.append(x.strip())
139 | sentences.append("