├── pyproject.toml ├── LICENSE ├── README.md └── arxiv2kindle.py /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "arxiv2kindle" 3 | version = "0.1.3" 4 | description = "A simple tool to recompile arxiv papers to kindle-like format." 5 | authors = ["Dmitriy Serdyuk "] 6 | readme = "README.md" 7 | license = "MIT" 8 | homepage = "https://github.com/dmitriy-serdyuk/arxiv2kindle" 9 | repository = "https://github.com/dmitriy-serdyuk/arxiv2kindle" 10 | keywords = ["arxiv", "latex", "kindle"] 11 | packages = [ 12 | { include = "arxiv2kindle.py" }, 13 | ] 14 | classifiers = [ 15 | "Environment :: Console", 16 | "Programming Language :: Python :: 3", 17 | "License :: OSI Approved :: MIT License", 18 | "Operating System :: OS Independent", 19 | ] 20 | include = [ 21 | "LICENSE", 22 | "README.md" 23 | ] 24 | 25 | [tool.poetry.scripts] 26 | arxiv2kindle = 'arxiv2kindle:run' 27 | 28 | [tool.poetry.dependencies] 29 | python = "^3.7" 30 | arxiv = "^0.5.3" 31 | requests = "*" 32 | 33 | [build-system] 34 | requires = ["poetry>=0.12"] 35 | build-backend = "poetry.masonry.api" 36 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Dmitriy Serdyuk 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # arxiv2kindle 2 | 3 | A simple script to recompile arxiv papers to kindle-like format. 4 | 5 | ## How does it work? 6 | 7 | This script downloads the LaTeX source from arxiv 8 | and re-compiles it trying to fit a smaller size. 9 | We also apply some simple transforms such as: 10 | - downsize images; 11 | - add automatic line breaks in formulas; 12 | - allow formulas to be placed on the next line; 13 | - try to convert two-column documents to single-column format. 14 | 15 | All these transformations are automatic, so success is not guaranteed. 16 | This approach will also not work for papers without the source. 17 | Nevertheless, in most cases the result is readable 18 | (tested on an old 6.5in x 4.5in Kindle). 19 | 20 | 21 | ## Usage 22 | 23 | With your paper of choice run: 24 | ``` 25 | arxiv2kindle --width 4 --height 6 --margin 0.2 1802.08395 - > out.pdf 26 | ``` 27 | or 28 | ``` 29 | arxiv2kindle --width 6 --height 4 --margin 0.2 --landscape "Towards end-to-end spoken language understanding" ./ 30 | ``` 31 | 32 | ## Installation 33 | 34 | `arxiv2kindle` requires `pip` version 10.0 or greater. 
#!/usr/bin/env python3
"""arxiv2kindle: recompile arXiv papers to a Kindle-like page size.

The tool downloads the LaTeX source of a paper, rewrites the main ``.tex``
file (page geometry, figure sizes, inline-formula line breaking), removes
``\\twocolumn`` switches from bundled style files and runs ``pdflatex``.

Installation::

    pip install arxiv2kindle

Acknowledgements: this script is based on the notebook at
https://gist.github.com/bshillingford/6259986edca707ca58dd

Related projects:

- https://github.com/cerisara/arxiv2kindle
- https://knanagnostopoulos.blogspot.com/2013/03/from-arxiv-directly-to-my-kindle_15.html
- https://dlmf.nist.gov/LaTeXML/
"""

import argparse
import logging
import os
import re
import shutil
import subprocess
import sys
import tarfile
import tempfile
from pathlib import Path

try:
    import arxiv
except ImportError:
    # Imported defensively so the pure text-transformation helpers
    # (``change_size``, ``make_single_column``, ``parse_args``) stay
    # importable without the network client; ``download`` reports the
    # missing dependency explicitly.
    arxiv = None


logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

HELP_EPILOG = """\
Example usage:

  %(prog)s --width 4 --height 6 --margin 0.2 1802.08395 - > out.pdf
  %(prog)s --width 6 --height 4 --margin 0.2 --landscape 1802.08395 ./
"""

# TeX sources are not guaranteed to be UTF-8.  ``surrogateescape`` lets any
# byte sequence survive a read/modify/write round trip unchanged.
_TEX_IO = dict(encoding='utf-8', errors='surrogateescape')

# Number of pdflatex passes run by ``compile_tex``.
_LATEX_PASSES = 3

# Patterns applied (in this order) to the ``\documentclass`` line: font size,
# ``*column`` options, ``*paper`` options, then the commas left behind.
_DOCUMENTCLASS_CLEANUPS = (
    r'\b\d+pt\b',
    r'\b\w+column\b',
    r'\b\w+paper\b',
    r'(?<=\[),',
    r',(?=[\],])',
)

_WIDTH_FIGURE_RE = re.compile(
    r'\\includegraphics\[width=([.\d]*)\\(?:line|text)width\]')
_SCALED_FIGURE_RE = re.compile(
    r'\\includegraphics\[scale=(\d+(?:\.\d*)?|\.\d+)\]')
_INLINE_MATH_RE = re.compile(r'\$.+\$')


def parse_args(argv=None):
    """Parse the command line.

    Args:
        argv: optional list of argument strings; when ``None`` argparse reads
            ``sys.argv[1:]`` as before.

    Returns:
        The ``argparse.Namespace`` with ``query``, ``dest``, ``width``,
        ``height``, ``margin`` and ``landscape`` attributes.
    """
    parser = argparse.ArgumentParser(
        description="Convert arxiv paper to kindle-like size",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=HELP_EPILOG
    )
    parser.add_argument("query", help="arxiv paper url")
    parser.add_argument(
        "dest", type=Path,
        help="destination dir, if `-` provided, the file is streamed to stdout")
    group = parser.add_argument_group("Geometry")
    group.add_argument(
        '-W', "--width", default=4, type=float,
        help="width of the output pdf (inches)")
    group.add_argument(
        '-H', "--height", default=6, type=float,
        help="height of the output pdf (inches)")
    group.add_argument(
        '-m', "--margin", default=0.2, type=float,
        help="margin for the output pdf (inches)")
    group.add_argument(
        "--landscape", action='store_true',
        help="produce a landscape file")
    group.add_argument(
        "--portrait", dest='landscape', action='store_false',
        help="produce a portrait file (default option)")
    return parser.parse_args(argv)


def _safe_extract(archive, dest):
    """Extract *archive* into *dest*, refusing entries that escape it.

    The archive comes from the network, so member names and link targets are
    checked before anything is written.

    Raises:
        SystemError: if a member path or link target resolves outside *dest*.
    """
    root = dest.resolve()

    def inside(path):
        path = path.resolve()
        return path == root or root in path.parents

    for member in archive.getmembers():
        target = root / member.name
        if not inside(target):
            raise SystemError(
                f'Unsafe path in source archive: {member.name}')
        if member.issym() or member.islnk():
            # Symlink targets are relative to the link's directory,
            # hard-link targets to the archive root.
            base = target.parent if member.issym() else root
            if not inside(base / member.linkname):
                raise SystemError(
                    f'Unsafe link in source archive: {member.name}')
    archive.extractall(dest)


def _is_main_file(file_name):
    """Return True if the file at *file_name* contains ``\\documentclass``."""
    with open(file_name, 'rt', **_TEX_IO) as f:
        return '\\documentclass' in f.read()


def download(query):
    """Fetch and unpack the LaTeX source of the first paper matching *query*.

    Returns:
        A tuple ``(temp_dir, main_file, title)``: the freshly created working
        directory, the path of the single top-level ``.tex`` file containing
        ``\\documentclass``, and the paper title reported by the arxiv client.

    Raises:
        SystemError: if the arxiv client is missing, no paper matches, no
            source tarball was downloaded, the tarball is invalid or unsafe,
            or the number of main tex files is not exactly one.
    """
    if arxiv is None:
        raise SystemError('The `arxiv` package is required to download papers')
    try:
        paper, = arxiv.query(query, max_results=1)
    except ValueError:
        # Unpacking fails when the result list does not hold one entry.
        raise SystemError('Paper not found') from None
    arxiv_url = paper['arxiv_url']
    arxiv_title = paper['title']

    logger.info("Converting paper: [%s] %s", arxiv_url, arxiv_title)

    temp_dir = Path(tempfile.mkdtemp(prefix='arxiv2kindle_'))

    logger.info("Downloading the source...")
    arxiv.arxiv.download(
        paper, slugify=lambda _: 'src', dirpath=str(temp_dir),
        prefer_source_tarfile=True)

    logger.info('Extracting the source...')
    tar_file = temp_dir / 'src.tar.gz'
    if not tar_file.exists():
        raise SystemError('Paper sources are not available')

    try:
        with tarfile.open(tar_file) as archive:
            _safe_extract(archive, temp_dir)
    except tarfile.TarError as err:
        raise SystemError('Paper sources are not a valid tar archive') from err

    main_files = [tex_file for tex_file in temp_dir.glob('*.tex')
                  if _is_main_file(tex_file)]
    if len(main_files) != 1:
        raise SystemError(
            f'Expected exactly one main tex file, found {len(main_files)}')
    main_file, = main_files
    logger.info('Found the main tex file: %s', main_file.name)
    return temp_dir, main_file, arxiv_title


def _fit_to_page(match):
    """Rewrite a ``width=<f>\\linewidth`` figure so it is bounded by the page."""
    factor = match.group(1)
    return (f'\\includegraphics[width={factor}\\textwidth,'
            f'height={factor}\\textheight,keepaspectratio]')


def _halve_scale(match):
    """Rewrite a ``scale=<f>`` figure with half the original scale."""
    return f'\\includegraphics[scale={float(match.group(1)) / 2}]'


def _shrink_figures(line):
    """Apply all ``\\includegraphics`` rewrites to one source line.

    Explicitly scaled figures are handled before unsized ones so that the
    ``scale=0.5`` added to unsized figures is not halved again.
    """
    line = _WIDTH_FIGURE_RE.sub(_fit_to_page, line)
    line = _SCALED_FIGURE_RE.sub(_halve_scale, line)
    return line.replace('\\includegraphics{', '\\includegraphics[scale=0.5]{')


def change_size(main_file, geom_settings, landscape):
    """Rewrite *main_file* in place for the requested page geometry.

    The original file is kept next to it with a ``.tex.bak`` suffix.

    Args:
        main_file: path of the main ``.tex`` file.
        geom_settings: mapping of ``geometry`` option names to string values.
        landscape: when true, add the ``landscape`` geometry option and load
            ``pdflscape``.

    Raises:
        SystemError: if no ``\\documentclass`` line is found, or the file
            does not contain exactly one ``\\begin{document}`` line.
    """
    main_file = Path(main_file)
    with open(main_file, 'rt', **_TEX_IO) as f:
        # Drop full-line comments.  Blank lines are kept on purpose: in
        # LaTeX they are paragraph breaks.
        src = [line for line in f if not line.startswith('%')]

    # Locate \documentclass *after* filtering so the index refers to the
    # list that is actually edited below.
    dclass_idx = next(
        (idx for idx, line in enumerate(src) if '\\documentclass' in line),
        None)
    if dclass_idx is None:
        raise SystemError('No \\documentclass found in the main tex file')
    dclass_line = src[dclass_idx]
    for pattern in _DOCUMENTCLASS_CLEANUPS:
        dclass_line = re.sub(pattern, '', dclass_line)
    src[dclass_idx] = dclass_line

    begindocs = [idx for idx, line in enumerate(src)
                 if line.lstrip().startswith(r'\begin{document}')]
    if len(begindocs) != 1:
        raise SystemError(
            'Expected exactly one \\begin{document}, '
            f'found {len(begindocs)}')

    geom_settings_str = ",".join(k + "=" + v for k, v in geom_settings.items())
    if landscape:
        geom_settings_str += ",landscape"

    # Lines placed right before \begin{document}, in final order.
    preamble = ['\\usepackage{pdflscape}\n'] if landscape else []
    preamble += [
        '\\usepackage{breqn}\n',
        '\\pagestyle{empty}\n',
        '\\usepackage{times}\n',
        f'\\usepackage[{geom_settings_str}]{{geometry}}\n',
    ]
    src[begindocs[0]:begindocs[0]] = preamble

    src = [_shrink_figures(line) for line in src]

    # \sloppy relaxes line breaking so inline formulas may move to a new line.
    src = ["\\sloppy " + line if _INLINE_MATH_RE.search(line) else line
           for line in src]

    os.rename(main_file, main_file.with_suffix('.tex.bak'))
    with open(main_file, 'wt', **_TEX_IO) as f:
        f.writelines(src)


def compile_tex(file_name):
    """Run ``pdflatex`` on *file_name* several times in its own directory.

    The compiler's stdout is redirected to this process's stderr so that
    stdout stays free for the streamed PDF.
    """
    for _ in range(_LATEX_PASSES):
        subprocess.run(['pdflatex', file_name],
                       stdout=sys.stderr,
                       cwd=file_name.parent)


def rotate_pdf(pdf_file):
    """Rotate every page of *pdf_file* with ``pdftk`` (``1-endeast``).

    The unrotated file is kept with a ``.pdf.bak`` suffix.
    """
    backup = pdf_file.with_suffix('.pdf.bak')
    os.rename(pdf_file, backup)
    subprocess.run(
        ['pdftk', backup, 'rotate', '1-endeast', 'output', pdf_file],
        stdout=sys.stderr,
        cwd=pdf_file.parent)


def make_single_column(work_dir):
    """Remove stand-alone ``\\twocolumn`` lines from ``*.sty`` in *work_dir*."""
    for filename in work_dir.glob('*.sty'):
        with open(filename, 'rt', **_TEX_IO) as f:
            kept = [line for line in f if line.strip() != '\\twocolumn']
        with open(filename, 'wt', **_TEX_IO) as f:
            f.writelines(kept)


def _tool_available(command):
    """Return True if ``<command> --version`` can be run and exits with 0.

    Output is discarded: it must never reach stdout, which may carry the PDF.
    """
    try:
        result = subprocess.run(
            [command, '--version'],
            stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    except OSError:
        # Raised (as FileNotFoundError) when the executable does not exist.
        return False
    return result.returncode == 0


def check_prerec(landscape):
    """Verify that the required external tools are installed.

    Raises:
        SystemError: if ``pdflatex`` is unavailable, or if *landscape* is
            true and ``pdftk`` is unavailable.
    """
    if not _tool_available('pdflatex'):
        raise SystemError("no pdflatex found")
    if landscape and not _tool_available('pdftk'):
        raise SystemError("no pdftk found (required for landscape mode)")


def _pdf_name(title):
    """Turn a paper title into a single-component ``.pdf`` file name."""
    name = ' '.join(title.split())  # titles may contain newlines
    for sep in {'/', os.sep, os.altsep} - {None}:
        name = name.replace(sep, '_')
    return (name or 'paper') + '.pdf'


def main(query, dest, width, height, margin, landscape):
    """Download, resize, compile and deliver the paper matching *query*.

    Args:
        query: search string passed to the arxiv client.
        dest: ``Path`` of a directory (file named after the title), ``-``
            (PDF written to stdout) or a target file path.
        width, height, margin: page geometry in inches.
        landscape: produce a landscape layout rotated with ``pdftk``.

    Raises:
        SystemError: on missing tools, download problems, or when no PDF
            was produced.
    """
    check_prerec(landscape)

    tmp_dir, main_file, title = download(query)
    if landscape:
        width, height = height, width
    geom_settings = dict(
        paperwidth=f'{width}in',
        paperheight=f'{height}in',
        margin=f'{margin}in')

    change_size(main_file, geom_settings, landscape)
    make_single_column(tmp_dir)
    compile_tex(main_file)
    pdf_file = main_file.with_suffix('.pdf')
    if not pdf_file.exists():
        # The working directory is left in place so the log can be read.
        raise SystemError(
            f'pdflatex did not produce {pdf_file.name}; see the log in {tmp_dir}')
    if landscape:
        rotate_pdf(pdf_file)

    # shutil.move (unlike os.rename) also works when the temporary directory
    # and the destination live on different filesystems.
    if dest.is_dir():
        shutil.move(str(pdf_file), str(dest / _pdf_name(title)))
    elif str(dest) == '-':
        sys.stdout.buffer.write(pdf_file.read_bytes())
    else:
        shutil.move(str(pdf_file), str(dest))


def run():
    """Console-script entry point: parse arguments and call ``main``."""
    # Without a handler the INFO progress messages are silently dropped;
    # basicConfig logs to stderr, leaving stdout free for the PDF stream.
    logging.basicConfig()
    main(**vars(parse_args()))


if __name__ == "__main__":
    run()