├── Dockerfile ├── LICENSE ├── README.md ├── examples ├── cox.pdf └── london.pdf └── paperify.sh /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM debian:latest 2 | 3 | # Build with: 4 | # docker build --tag jstrieb/paperify:latest . 5 | 6 | RUN apt-get update && \ 7 | apt-get install --no-install-recommends --yes \ 8 | pandoc \ 9 | curl ca-certificates \ 10 | jq \ 11 | python3 \ 12 | imagemagick \ 13 | texlive texlive-publishers texlive-science lmodern texlive-latex-extra 14 | 15 | COPY paperify.sh /usr/local/bin/paperify 16 | RUN chmod +x /usr/local/bin/paperify 17 | 18 | WORKDIR /root/ 19 | ENTRYPOINT ["paperify"] 20 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Jacob Strieb 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Paperify 2 | 3 | Paperify transforms any document, web page, or ebook into a research paper. 4 | 5 | The text of the generated paper is the same as the text of the original 6 | document, but figures and equations from real papers are interspersed 7 | throughout. 8 | 9 | A paper title and abstract are added (optionally generated by ChatGPT, if you 10 | provide an API key), and the entire paper is compiled with the IEEE $\LaTeX$ 11 | template for added realism. 12 | 13 |
14 | 15 | ![example](https://github.com/jstrieb/paperify/assets/7355528/6233c47e-fbff-4a71-8991-09ba3112f241) 16 | 17 |
18 | 19 | 20 | # Install 21 | 22 | First, install the dependencies (or [use Docker](#docker)): 23 | 24 | - curl 25 | - Python 3 26 | - Pandoc 27 | - jq 28 | - LaTeX (via TeXLive) 29 | - ImageMagick (optional) 30 | 31 | For example, on Debian-based systems (_e.g._, Debian, Ubuntu, Kali, WSL): 32 | 33 | ``` bash 34 | sudo apt update 35 | sudo apt install --no-install-recommends \ 36 | pandoc \ 37 | curl ca-certificates \ 38 | jq \ 39 | python3 \ 40 | imagemagick \ 41 | texlive texlive-publishers texlive-science lmodern texlive-latex-extra 42 | ``` 43 | 44 | Then, clone the repo (or directly pull the script), and execute it. 45 | 46 | ``` bash 47 | curl -L https://github.com/jstrieb/paperify/raw/master/paperify.sh \ 48 | | sudo tee /usr/local/bin/paperify 49 | sudo chmod +x /usr/local/bin/paperify 50 | 51 | paperify -h 52 | ``` 53 | 54 | 55 | # Examples 56 | 57 | - [`examples/cox.pdf`](examples/cox.pdf) 58 | 59 | Convert [Russ Cox's transcript of Doug McIlroy's talk on the history of Bell 60 | Labs](https://research.swtch.com/bell-labs) into a paper saved to the `/tmp/` 61 | directory as `article.pdf`. 62 | 63 | ``` 64 | paperify \ 65 | --from-format html \ 66 | "https://research.swtch.com/bell-labs" \ 67 | /tmp/article.pdf 68 | ``` 69 | 70 | - [`examples/london.pdf`](examples/london.pdf) 71 | 72 | Download figures and equations from the 1000 latest computer science papers 73 | on `arXiv.org`. Intersperse the figures and equations into Jack London's 74 | _Call of the Wild_ with a higher-than-default equation frequency. Use ChatGPT 75 | to generate a paper title, author, abstract, and metadata for an imaginary 76 | paper on soft body robotics. Save the file in the current directory as 77 | `london.pdf`. 
78 | 79 | ``` 80 | paperify \ 81 | --arxiv-category cs \ 82 | --num-papers 1000 \ 83 | --equation-frequency 18 \ 84 | --chatgpt-token "sk-[REDACTED]" \ 85 | --chatgpt-topic "soft body robotics" \ 86 | "https://standardebooks.org/ebooks/jack-london/the-call-of-the-wild/downloads/jack-london_the-call-of-the-wild.epub" \ 87 | london.pdf 88 | ``` 89 | 90 | ## Docker 91 | 92 | Alternatively, run Paperify from within a Docker container. To run the first 93 | example from within Docker and build to `./build/cox.pdf`: 94 | 95 | ``` bash 96 | docker run \ 97 | --rm \ 98 | -it \ 99 | --volume "$(pwd)/build":/root/build \ 100 | jstrieb/paperify \ 101 | --from-format html \ 102 | "https://research.swtch.com/bell-labs" \ 103 | build/cox.pdf 104 | ``` 105 | 106 | 107 | # Usage 108 | 109 | ``` 110 | usage: paperify [OPTIONS] <URL or file path> <output file> 111 | 112 | OPTIONS: 113 | --temp-dir Directory for assets (default: /tmp/paperify) 114 | --from-format Format of input file (default: input suffix) 115 | --arxiv-category arXiv.org paper category (default: math) 116 | --num-papers Number of papers to download (default: 100) 117 | --max-concurrency Maximum simultaneous processes (default: 32) 118 | --figure-frequency Chance of a figure is 1/N per paragraph (default: 25) 119 | --equation-frequency Chance of an equation is 1/N per paragraph (default: 25) 120 | --max-size Max allowed image size in bytes (default 2500000) 121 | --min-equation-length Minimum equation length in characters (default 5) 122 | --max-equation-length Maximum equation length in characters (default 120) 123 | --min-caption-length Minimum figure caption length in characters (default 20) 124 | --chatgpt-token ChatGPT token to generate paper title, abstract, etc. 
125 | --chatgpt-topic Paper topic ChatGPT will generate metadata for 126 | --quiet Don't log statuses 127 | --skip-downloading Don't download papers from arXiv.org 128 | --skip-extracting Don't extract equations and captions 129 | --skip-metadata Don't regenerate metadata 130 | --skip-filtering Don't filter out large files or non-diagram images 131 | ``` 132 | 133 | Note that the `--skip-*` flags are useful when you have already run the script 134 | once and do not want to repeat the process of downloading and extracting data. 135 | 136 | 137 | # Known Issues 138 | 139 | - Images with query parameters in the `src` URL of some web pages are extracted 140 | by Pandoc with the query parameters in the filename, and LaTeX gives errors 141 | about "unknown file extension" when compiling. 142 | - Papers may contain images that are not diagrams, such as portraits of the 143 | authors or institution logos. Paperify uses a highly imperfect heuristic to 144 | remove these if the `convert` command line tool is present: only images with 145 | white, nearly-white, or transparent pixels in the top left and bottom right 146 | corners are kept. This works surprisingly well, but there are always some 147 | false positives and false negatives. 148 | - Non-ASCII Unicode characters cannot be processed by `pdflatex`, and will be 149 | stripped before the PDF is compiled. 150 | - Paperify uses Markdown as a (purposefully) lossy [intermediate 151 | representation](https://en.wikipedia.org/wiki/Intermediate_representation) 152 | for documents before they are converted to LaTeX. As a result, information 153 | and styling from the original may be stripped. 154 | - A handful of papers contain huge numbers of images. The ones that do this 155 | also tend to have some of the worst images. Images can be manually pruned 156 | from the `/tmp/paperify/images` directory, and the same command can be re-run 157 | with the `--skip-*` flags to rebuild the paper using new figures and 158 | equations. 
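  For instance, a re-run that rebuilds the paper from the already-downloaded
  assets might look like the following (a hypothetical invocation; substitute
  your own input URL and output path):

  ``` bash
  paperify \
    --skip-downloading \
    --skip-extracting \
    --skip-metadata \
    --skip-filtering \
    --from-format html \
    "https://research.swtch.com/bell-labs" \
    /tmp/article.pdf
  ```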
159 | - Different systems install different LaTeX packages. If you're missing 160 | packages, you may want to bite the bullet and `apt install texlive-full`. 161 | It's very big, but it's got everything you'll ever need in there. 162 | - Figure captions usually have nothing to do with figures themselves. 163 | - No matter how convincing a paper may appear, anyone looking over your 164 | shoulder who actually reads the words will know very quickly that something 165 | is off. 166 | - Side effects of reading the code include nausea, dizziness, confusion, 167 | bleeding from the eyes, and deep love/hatred for the creators of Unix 168 | pipelines. 169 | 170 | 171 | # How to Read the Code 172 | 173 | In general, I'm a proponent of reading (or at least skimming) code before you 174 | run it, when possible. Usually, my code is written to be read. In this case, 175 | not so much. 176 | 177 | Apologies in advance to anyone who tries to read the code. It started as four 178 | very cursed lines of Bash (without line wrapping) that I attempted to clean up 179 | a little. It is now many more than four lines of Bash, most of which remain 180 | very cursed. The small Python portion is particularly hard on the eyes, though 181 | it may possess a grotesque beauty for true functional programmers. 182 | 183 | Everything is in `paperify.sh`. It can be read top-to-bottom or bottom-to-top, 184 | and there is a fat LaTeX template as a heredoc smack in the middle. 185 | 186 | 187 | # Project Status 188 | 189 | Strange as it may sound, this project is complete. I want to live in a world 190 | where working software doesn't always grow until it becomes a Lovecraftian 191 | spaghetti monster. 192 | 193 | I have added every feature that I wanted to add. It does what I wanted it to 194 | do, as well as I wanted it to do it. No further development required. 195 | 196 | As such, I will try to address issues opened on GitHub, but I do not expect to 197 | address feature requests. 
I may merge pull requests. 198 | 199 | Even if there are no recent commits, I'm hopeful that this script will continue 200 | to work many years from now. 201 | 202 | 203 | # Greetz & Acknowledgments 204 | 205 | Greetz to several unnamed friends who offered helpful commentary prior to 206 | release. 207 | 208 | Special shout out to the friends who suggested, as a follow-up project, making 209 | a browser extension to transform the current web page into a scientific paper. 210 | Sort of like Firefox reader mode, but for viewing Twitter when someone looking 211 | over your shoulder expects you to be doing something else. 212 | 213 | Thanks to [arXiv.org](https://arxiv.org) for hosting tons of papers with LaTeX 214 | source to mine. 215 | 216 | Greetz to Project Gutenberg, Standard Ebooks, and Alexandra Elbakyan. 217 | 218 | Lovingly released on Labor Day 2023; dedicated to procrastinating laborers of 219 | knowledge. 220 | -------------------------------------------------------------------------------- /examples/cox.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jstrieb/paperify/a286b4a71b184b92a940bb7b98fd53df54105527/examples/cox.pdf -------------------------------------------------------------------------------- /examples/london.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jstrieb/paperify/a286b4a71b184b92a940bb7b98fd53df54105527/examples/london.pdf -------------------------------------------------------------------------------- /paperify.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | ############################################################################### 4 | # Created by Jacob Strieb # 5 | # September 2023 # 6 | # # 7 | # https://github.com/jstrieb/paperify # 8 | ############################################################################### 9 | 10 | 
if (set -o pipefail) > /dev/null 2>&1; then 11 | set -o pipefail 12 | fi 13 | 14 | 15 | ############################################################################### 16 | # Variables and constants # 17 | ############################################################################### 18 | 19 | ORIGINAL_FILE_URL= 20 | OUTPUT_FILE= 21 | FROM_FORMAT= 22 | CHATGPT_TOKEN= 23 | TEMP_DIR="/tmp/paperify" 24 | ARXIV_CAT="math" 25 | NUM_PAPERS="100" 26 | MAX_CONCURRENCY="32" 27 | FIGURE_PROB="25" 28 | EQUATION_PROB="25" 29 | MAX_SIZE="2500000" 30 | MIN_EQUATION_LENGTH="5" 31 | MAX_EQUATION_LENGTH="120" 32 | MIN_CAPTION_LENGTH="20" 33 | CHATGPT_TOPIC="cybersecurity" 34 | QUIET="false" 35 | SKIP_DOWNLOADING="false" 36 | SKIP_REGENERATING_METADATA="false" 37 | SKIP_EXTRACTING="false" 38 | SKIP_FILTERING="false" 39 | 40 | CLEAR="$(printf "\033[m")" 41 | RED="$(printf "\033[1m\033[31m")" 42 | ICONV_PARAM="$( 43 | if printf "test\n" | iconv --unicode-subst . >/dev/null 2>&1; then 44 | printf "%s\n" "--unicode-subst ." 45 | else 46 | printf "%s\n" "-c" 47 | fi 48 | )" 49 | 50 | 51 | ############################################################################### 52 | # Utility functions # 53 | ############################################################################### 54 | 55 | echo() { 56 | if [ "${#}" -gt 0 ]; then 57 | printf "%s\n" "${@}" 58 | else 59 | printf "\n" 60 | fi 61 | } 62 | 63 | error() { 64 | printf "${RED}Error:${CLEAR} %s\n" "${@}" >&2 65 | } 66 | 67 | error_exit() { 68 | error "${@}" 69 | exit 1 70 | } 71 | 72 | log() { 73 | if ! 
"${QUIET}"; then 74 | echo "${@}" >&2 75 | fi 76 | } 77 | 78 | worker_wait() { 79 | while [ "$(jobs -p | wc -l)" -gt "${MAX_CONCURRENCY}" ]; do 80 | sleep 0.1 81 | done 82 | } 83 | 84 | rand_int() { 85 | if [ "${#}" -lt 1 ]; then 86 | < /dev/urandom \ 87 | head -c 4 \ 88 | | od -t uI \ 89 | | head -n 1 \ 90 | | sed 's/  */ /g' \ 91 | | cut -d ' ' -f 2 92 | else 93 | # I know that modding random numbers can skew the distribution, but this 94 | # use case isn't serious enough for me to care. 95 | echo "$(( $( 96 | < /dev/urandom \ 97 | head -c 4 \ 98 | | od -t uI \ 99 | | head -n 1 \ 100 | | sed 's/  */ /g' \ 101 | | cut -d ' ' -f 2 102 | ) % ${1} ))" 103 | fi 104 | } 105 | 106 | check_latex() { 107 | DIR="$(mktemp -d)" 108 | echo "${1}" \ 109 | | pandoc \ 110 | --from "markdown" \ 111 | --to latex \ 112 | --template template.tex \ 113 | --output "${DIR}/out.tex" \ 114 | - 115 | ( 116 | cd "${DIR}" || exit 1 117 | if ! pdflatex out.tex >/dev/null; then 118 | exit 1 119 | fi 120 | ) 121 | RESULT="${?}" 122 | rm -rf "${DIR}" 123 | return "${RESULT}" 124 | } 125 | 126 | usage() { 127 | cat <<EOF 128 | usage: ${0} [OPTIONS] <URL or file path> <output file> 129 | 130 | OPTIONS: 131 | --temp-dir Directory for assets (default: ${TEMP_DIR}) 132 | --from-format Format of input file (default: input suffix) 133 | --arxiv-category arXiv.org paper category (default: ${ARXIV_CAT}) 134 | --num-papers Number of papers to download (default: ${NUM_PAPERS}) 135 | --max-concurrency Maximum simultaneous processes (default: ${MAX_CONCURRENCY}) 136 | --figure-frequency Chance of a figure is 1/N per paragraph (default: ${FIGURE_PROB}) 137 | --equation-frequency Chance of an equation is 1/N per paragraph (default: ${EQUATION_PROB}) 138 | --max-size Max allowed image size in bytes (default ${MAX_SIZE}) 139 | --min-equation-length Minimum equation length in characters (default ${MIN_EQUATION_LENGTH}) 140 | --max-equation-length Maximum equation length in characters (default ${MAX_EQUATION_LENGTH}) 141 | --min-caption-length Minimum figure caption length in 
characters (default ${MIN_CAPTION_LENGTH}) 142 | --chatgpt-token ChatGPT token to generate paper title, abstract, etc. 143 | --chatgpt-topic Paper topic ChatGPT will generate metadata for 144 | --quiet Don't log statuses 145 | --skip-downloading Don't download papers from arXiv.org 146 | --skip-extracting Don't extract equations and captions 147 | --skip-metadata Don't regenerate metadata 148 | --skip-filtering Don't filter out large files or non-diagram images 149 | EOF 150 | } 151 | 152 | args_required() { 153 | EXPECTING="${1}" 154 | shift 1 155 | if [ "${#}" -le "${EXPECTING}" ]; then 156 | error "${1} requires ${EXPECTING} argument$( 157 | [ "${EXPECTING}" -ge 2 ] && echo 's' 158 | )" 159 | echo 160 | usage 161 | exit 1 162 | fi 163 | } 164 | 165 | 166 | ############################################################################### 167 | # Main procedures # 168 | ############################################################################### 169 | 170 | check_requirements() { 171 | for COMMAND in pandoc curl python3 pdflatex jq iconv; do 172 | if ! command -v "${COMMAND}" >/dev/null 2>&1; then 173 | error_exit "${COMMAND} must be installed and on the PATH for ${0} to run." 
174 | fi 175 | done 176 | } 177 | 178 | parse_args() { 179 | while [ "${#}" -gt 0 ]; do 180 | case "${1}" in 181 | --temp-dir) 182 | args_required 1 "${@}" 183 | TEMP_DIR="${2}" 184 | shift 1 185 | ;; 186 | --from-format) 187 | args_required 1 "${@}" 188 | FROM_FORMAT="${2}" 189 | shift 1 190 | ;; 191 | --arxiv-category) 192 | args_required 1 "${@}" 193 | ARXIV_CAT="${2}" 194 | shift 1 195 | ;; 196 | --num-papers) 197 | args_required 1 "${@}" 198 | NUM_PAPERS="${2}" 199 | shift 1 200 | ;; 201 | --max-concurrency) 202 | args_required 1 "${@}" 203 | MAX_CONCURRENCY="${2}" 204 | shift 1 205 | ;; 206 | --figure-frequency) 207 | args_required 1 "${@}" 208 | FIGURE_PROB="${2}" 209 | shift 1 210 | ;; 211 | --equation-frequency) 212 | args_required 1 "${@}" 213 | EQUATION_PROB="${2}" 214 | shift 1 215 | ;; 216 | --max-size) 217 | args_required 1 "${@}" 218 | MAX_SIZE="${2}" 219 | shift 1 220 | ;; 221 | --min-equation-length) 222 | args_required 1 "${@}" 223 | MIN_EQUATION_LENGTH="${2}" 224 | shift 1 225 | ;; 226 | --max-equation-length) 227 | args_required 1 "${@}" 228 | MAX_EQUATION_LENGTH="${2}" 229 | shift 1 230 | ;; 231 | --min-caption-length) 232 | args_required 1 "${@}" 233 | MIN_CAPTION_LENGTH="${2}" 234 | shift 1 235 | ;; 236 | --chatgpt-token) 237 | args_required 1 "${@}" 238 | CHATGPT_TOKEN="${2}" 239 | shift 1 240 | ;; 241 | --chatgpt-topic) 242 | args_required 1 "${@}" 243 | CHATGPT_TOPIC="${2}" 244 | shift 1 245 | ;; 246 | --quiet) 247 | QUIET="true" 248 | ;; 249 | --skip-downloading) 250 | SKIP_DOWNLOADING="true" 251 | ;; 252 | --skip-extracting) 253 | SKIP_EXTRACTING="true" 254 | ;; 255 | --skip-metadata) 256 | SKIP_REGENERATING_METADATA="true" 257 | ;; 258 | --skip-filtering) 259 | SKIP_FILTERING="true" 260 | ;; 261 | -h|--help) 262 | usage 263 | exit 0 264 | ;; 265 | -*) 266 | error "Unrecognized argument '${1}'" 267 | echo 268 | usage 269 | exit 1 270 | ;; 271 | *) 272 | if [ -z "${ORIGINAL_FILE_URL}" ]; then 273 | ORIGINAL_FILE_URL="${1}" 274 | if [ -z 
"${FROM_FORMAT}" ]; then 275 | FROM_FORMAT="$( 276 | echo "${ORIGINAL_FILE_URL}" \ 277 | | sed 's/.*\.\(.*\)$/\1/' 278 | )" 279 | fi 280 | elif [ -z "${OUTPUT_FILE}" ]; then 281 | OUTPUT_FILE="${1}" 282 | fi 283 | ;; 284 | esac 285 | shift 1 286 | done 287 | 288 | if [ -z "${ORIGINAL_FILE_URL}" ]; then 289 | error "URL or file path argument expected." 290 | echo 291 | usage 292 | exit 1 293 | elif [ -z "${OUTPUT_FILE}" ]; then 294 | error "Output file argument expected." 295 | echo 296 | usage 297 | exit 1 298 | fi 299 | } 300 | 301 | open_temp_dir() { 302 | log "Creating directory ${TEMP_DIR} for intermediate work..." 303 | mkdir -p "${TEMP_DIR}" 304 | cd "${TEMP_DIR}" || error_exit "could not cd to temp directory ${TEMP_DIR}" 305 | } 306 | 307 | dump_latex_template() { 308 | cat <<"EOF" > template.tex 309 | \PassOptionsToPackage{unicode$for(hyperrefoptions)$,$hyperrefoptions$$endfor$}{hyperref} 310 | \PassOptionsToPackage{hyphens}{url} 311 | $if(colorlinks)$ 312 | \PassOptionsToPackage{dvipsnames,svgnames,x11names}{xcolor} 313 | $endif$ 314 | $if(dir)$ 315 | $if(latex-dir-rtl)$ 316 | \PassOptionsToPackage{RTLdocument}{bidi} 317 | $endif$ 318 | $endif$ 319 | $if(CJKmainfont)$ 320 | \PassOptionsToPackage{space}{xeCJK} 321 | $endif$ 322 | % 323 | \documentclass[ 324 | $if(fontsize)$ 325 | $fontsize$, 326 | $endif$ 327 | $if(lang)$ 328 | $babel-lang$, 329 | $endif$ 330 | $if(papersize)$ 331 | $papersize$paper, 332 | $endif$ 333 | $for(classoption)$ 334 | $classoption$$sep$, 335 | $endfor$ 336 | ]{$documentclass$} 337 | \usepackage{cite} 338 | \usepackage{amsmath,amssymb,amsfonts} 339 | \usepackage{algorithmic} 340 | $if(fontfamily)$ 341 | \usepackage[$for(fontfamilyoptions)$$fontfamilyoptions$$sep$,$endfor$]{$fontfamily$} 342 | $else$ 343 | \usepackage{lmodern} 344 | $endif$ 345 | $if(linestretch)$ 346 | \usepackage{setspace} 347 | $endif$ 348 | \usepackage{iftex} 349 | \ifPDFTeX 350 | \usepackage[$if(fontenc)$$fontenc$$else$T1$endif$]{fontenc} 351 | 
\usepackage[utf8]{inputenc} 352 | \usepackage{textcomp} % provide euro and other symbols 353 | \else % if luatex or xetex 354 | $if(mathspec)$ 355 | \ifXeTeX 356 | \usepackage{mathspec} 357 | \else 358 | \usepackage{unicode-math} 359 | \fi 360 | $else$ 361 | \usepackage{unicode-math} 362 | $endif$ 363 | \defaultfontfeatures{Scale=MatchLowercase} 364 | \defaultfontfeatures[\rmfamily]{Ligatures=TeX,Scale=1} 365 | $if(mainfont)$ 366 | \setmainfont[$for(mainfontoptions)$$mainfontoptions$$sep$,$endfor$]{$mainfont$} 367 | $endif$ 368 | $if(sansfont)$ 369 | \setsansfont[$for(sansfontoptions)$$sansfontoptions$$sep$,$endfor$]{$sansfont$} 370 | $endif$ 371 | $if(monofont)$ 372 | \setmonofont[$for(monofontoptions)$$monofontoptions$$sep$,$endfor$]{$monofont$} 373 | $endif$ 374 | $for(fontfamilies)$ 375 | \newfontfamily{$fontfamilies.name$}[$for(fontfamilies.options)$$fontfamilies.options$$sep$,$endfor$]{$fontfamilies.font$} 376 | $endfor$ 377 | $if(mathfont)$ 378 | $if(mathspec)$ 379 | \ifXeTeX 380 | \setmathfont(Digits,Latin,Greek)[$for(mathfontoptions)$$mathfontoptions$$sep$,$endfor$]{$mathfont$} 381 | \else 382 | \setmathfont[$for(mathfontoptions)$$mathfontoptions$$sep$,$endfor$]{$mathfont$} 383 | \fi 384 | $else$ 385 | \setmathfont[$for(mathfontoptions)$$mathfontoptions$$sep$,$endfor$]{$mathfont$} 386 | $endif$ 387 | $endif$ 388 | $if(CJKmainfont)$ 389 | \ifXeTeX 390 | \usepackage{xeCJK} 391 | \setCJKmainfont[$for(CJKoptions)$$CJKoptions$$sep$,$endfor$]{$CJKmainfont$} 392 | \fi 393 | $endif$ 394 | $if(luatexjapresetoptions)$ 395 | \ifLuaTeX 396 | \usepackage[$for(luatexjapresetoptions)$$luatexjapresetoptions$$sep$,$endfor$]{luatexja-preset} 397 | \fi 398 | $endif$ 399 | $if(CJKmainfont)$ 400 | \ifLuaTeX 401 | \usepackage[$for(luatexjafontspecoptions)$$luatexjafontspecoptions$$sep$,$endfor$]{luatexja-fontspec} 402 | \setmainjfont[$for(CJKoptions)$$CJKoptions$$sep$,$endfor$]{$CJKmainfont$} 403 | \fi 404 | $endif$ 405 | \fi 406 | $if(zero-width-non-joiner)$ 407 | %% Support 
for zero-width non-joiner characters. 408 | \makeatletter 409 | \def\zerowidthnonjoiner{% 410 | % Prevent ligatures and adjust kerning, but still support hyphenating. 411 | \texorpdfstring{% 412 | \textormath{\nobreak\discretionary{-}{}{\kern.03em}% 413 | \ifvmode\else\nobreak\hskip\z@skip\fi}{}% 414 | }{}% 415 | } 416 | \makeatother 417 | \ifPDFTeX 418 | \DeclareUnicodeCharacter{200C}{\zerowidthnonjoiner} 419 | \else 420 | \catcode`^^^^200c=\active 421 | \protected\def ^^^^200c{\zerowidthnonjoiner} 422 | \fi 423 | %% End of ZWNJ support 424 | $endif$ 425 | % Use upquote if available, for straight quotes in verbatim environments 426 | \IfFileExists{upquote.sty}{\usepackage{upquote}}{} 427 | \IfFileExists{microtype.sty}{% use microtype if available 428 | \usepackage[$for(microtypeoptions)$$microtypeoptions$$sep$,$endfor$]{microtype} 429 | \UseMicrotypeSet[protrusion]{basicmath} % disable protrusion for tt fonts 430 | }{} 431 | $if(indent)$ 432 | $else$ 433 | \makeatletter 434 | \@ifundefined{KOMAClassName}{% if non-KOMA class 435 | \IfFileExists{parskip.sty}{% 436 | \usepackage{parskip} 437 | }{% else 438 | \setlength{\parindent}{0pt} 439 | \setlength{\parskip}{6pt plus 2pt minus 1pt}} 440 | }{% if KOMA class 441 | \KOMAoptions{parskip=half}} 442 | \makeatother 443 | $endif$ 444 | $if(verbatim-in-note)$ 445 | \usepackage{fancyvrb} 446 | $endif$ 447 | \usepackage{xcolor} 448 | \IfFileExists{xurl.sty}{\usepackage{xurl}}{} % add URL line breaks if available 449 | \IfFileExists{bookmark.sty}{\usepackage{bookmark}}{\usepackage{hyperref}} 450 | \hypersetup{ 451 | $if(title-meta)$ 452 | pdftitle={$title-meta$}, 453 | $endif$ 454 | $if(author-meta)$ 455 | pdfauthor={$author-meta$}, 456 | $endif$ 457 | $if(lang)$ 458 | pdflang={$lang$}, 459 | $endif$ 460 | $if(subject)$ 461 | pdfsubject={$subject$}, 462 | $endif$ 463 | $if(keywords)$ 464 | pdfkeywords={$for(keywords)$$keywords$$sep$, $endfor$}, 465 | $endif$ 466 | $if(colorlinks)$ 467 | colorlinks=true, 468 | 
linkcolor={$if(linkcolor)$$linkcolor$$else$Maroon$endif$}, 469 | filecolor={$if(filecolor)$$filecolor$$else$Maroon$endif$}, 470 | citecolor={$if(citecolor)$$citecolor$$else$Blue$endif$}, 471 | urlcolor={$if(urlcolor)$$urlcolor$$else$Blue$endif$}, 472 | $else$ 473 | hidelinks, 474 | $endif$ 475 | pdfcreator={LaTeX via pandoc}} 476 | \urlstyle{same} % disable monospaced font for URLs 477 | $if(verbatim-in-note)$ 478 | \VerbatimFootnotes % allow verbatim text in footnotes 479 | $endif$ 480 | $if(geometry)$ 481 | \usepackage[$for(geometry)$$geometry$$sep$,$endfor$]{geometry} 482 | $endif$ 483 | $if(listings)$ 484 | \usepackage{listings} 485 | \newcommand{\passthrough}[1]{#1} 486 | \lstset{defaultdialect=[5.3]Lua} 487 | \lstset{defaultdialect=[x86masm]Assembler} 488 | $endif$ 489 | $if(lhs)$ 490 | \lstnewenvironment{code}{\lstset{language=Haskell,basicstyle=\small\ttfamily}}{} 491 | $endif$ 492 | $if(highlighting-macros)$ 493 | $highlighting-macros$ 494 | $endif$ 495 | $if(tables)$ 496 | \usepackage{longtable,booktabs,array} 497 | 498 | % https://tex.stackexchange.com/a/224096 499 | \makeatletter 500 | \let\oldlt\longtable 501 | \let\endoldlt\endlongtable 502 | \def\longtable{\@ifnextchar[\longtable@i \longtable@ii} 503 | \def\longtable@i[#1]{\begin{figure}[t] 504 | \onecolumn 505 | \begin{minipage}{0.5\textwidth} 506 | \oldlt[#1] 507 | } 508 | \def\longtable@ii{\begin{figure}[t] 509 | \onecolumn 510 | \begin{minipage}{0.5\textwidth} 511 | \oldlt 512 | } 513 | \def\endlongtable{\endoldlt 514 | \end{minipage} 515 | \twocolumn 516 | \end{figure}} 517 | \makeatother 518 | 519 | 520 | $if(multirow)$ 521 | \usepackage{multirow} 522 | $endif$ 523 | \usepackage{calc} % for calculating minipage widths 524 | % Correct order of tables after \paragraph or \subparagraph 525 | \usepackage{etoolbox} 526 | \makeatletter 527 | \patchcmd\longtable{\par}{\if@noskipsec\mbox{}\fi\par}{}{} 528 | \makeatother 529 | % Allow footnotes in longtable head/foot 530 | 
\IfFileExists{footnotehyper.sty}{\usepackage{footnotehyper}}{\usepackage{footnote}} 531 | \makesavenoteenv{longtable} 532 | $endif$ 533 | $if(graphics)$ 534 | \usepackage{graphicx} 535 | \makeatletter 536 | \def\maxwidth{\ifdim\Gin@nat@width>\linewidth\linewidth\else\Gin@nat@width\fi} 537 | \def\maxheight{\ifdim\Gin@nat@height>\textheight\textheight\else\Gin@nat@height\fi} 538 | \makeatother 539 | % Scale images if necessary, so that they will not overflow the page 540 | % margins by default, and it is still possible to overwrite the defaults 541 | % using explicit options in \includegraphics[width, height, ...]{} 542 | \setkeys{Gin}{width=\maxwidth,height=\maxheight,keepaspectratio} 543 | % Set default figure placement to htbp 544 | \makeatletter 545 | \def\fps@figure{htbp} 546 | \makeatother 547 | $endif$ 548 | $if(links-as-notes)$ 549 | % Make links footnotes instead of hotlinks: 550 | \DeclareRobustCommand{\href}[2]{#2\footnote{\url{#1}}} 551 | $endif$ 552 | $if(strikeout)$ 553 | $-- also used for underline 554 | \usepackage[normalem]{ulem} 555 | % Avoid problems with \sout in headers with hyperref 556 | \pdfstringdefDisableCommands{\renewcommand{\sout}{}} 557 | $endif$ 558 | \setlength{\emergencystretch}{3em} % prevent overfull lines 559 | \providecommand{\tightlist}{% 560 | \setlength{\itemsep}{0pt}\setlength{\parskip}{0pt}} 561 | $if(numbersections)$ 562 | \setcounter{secnumdepth}{$if(secnumdepth)$$secnumdepth$$else$5$endif$} 563 | $else$ 564 | \setcounter{secnumdepth}{-\maxdimen} % remove section numbering 565 | $endif$ 566 | $if(block-headings)$ 567 | % Make \paragraph and \subparagraph free-standing 568 | \ifx\paragraph\undefined\else 569 | \let\oldparagraph\paragraph 570 | \renewcommand{\paragraph}[1]{\oldparagraph{#1}\mbox{}} 571 | \fi 572 | \ifx\subparagraph\undefined\else 573 | \let\oldsubparagraph\subparagraph 574 | \renewcommand{\subparagraph}[1]{\oldsubparagraph{#1}\mbox{}} 575 | \fi 576 | $endif$ 577 | $if(pagestyle)$ 578 | \pagestyle{$pagestyle$} 
579 | $endif$
580 | $if(csl-refs)$
581 | \newlength{\cslhangindent}
582 | \setlength{\cslhangindent}{1.5em}
583 | \newlength{\csllabelwidth}
584 | \setlength{\csllabelwidth}{3em}
585 | \newlength{\cslentryspacingunit} % times entry-spacing
586 | \setlength{\cslentryspacingunit}{\parskip}
587 | \newenvironment{CSLReferences}[2] % #1 hanging-indent, #2 entry spacing
588 |  {% don't indent paragraphs
589 |   \setlength{\parindent}{0pt}
590 |   % turn on hanging indent if param 1 is 1
591 |   \ifodd #1
592 |   \let\oldpar\par
593 |   \def\par{\hangindent=\cslhangindent\oldpar}
594 |   \fi
595 |   % set entry spacing
596 |   \setlength{\parskip}{#2\cslentryspacingunit}
597 |  }%
598 |  {}
599 | \usepackage{calc}
600 | \newcommand{\CSLBlock}[1]{#1\hfill\break}
601 | \newcommand{\CSLLeftMargin}[1]{\parbox[t]{\csllabelwidth}{#1}}
602 | \newcommand{\CSLRightInline}[1]{\parbox[t]{\linewidth - \csllabelwidth}{#1}\break}
603 | \newcommand{\CSLIndent}[1]{\hspace{\cslhangindent}#1}
604 | $endif$
605 | $for(header-includes)$
606 | $header-includes$
607 | $endfor$
608 | $if(lang)$
609 | \ifXeTeX
610 | % Load polyglossia as late as possible: uses bidi with RTL languages (e.g. Hebrew, Arabic)
611 | \usepackage{polyglossia}
612 | \setmainlanguage[$for(polyglossia-lang.options)$$polyglossia-lang.options$$sep$,$endfor$]{$polyglossia-lang.name$}
613 | $for(polyglossia-otherlangs)$
614 | \setotherlanguage[$for(polyglossia-otherlangs.options)$$polyglossia-otherlangs.options$$sep$,$endfor$]{$polyglossia-otherlangs.name$}
615 | $endfor$
616 | \else
617 | \usepackage[$for(babel-otherlangs)$$babel-otherlangs$,$endfor$main=$babel-lang$]{babel}
618 | % get rid of language-specific shorthands (see #6817):
619 | \let\LanguageShortHands\languageshorthands
620 | \def\languageshorthands#1{}
621 | $if(babel-newcommands)$
622 | $babel-newcommands$
623 | $endif$
624 | \fi
625 | $endif$
626 | \ifLuaTeX
627 | \usepackage{selnolig} % disable illegal ligatures
628 | \fi
629 | $if(dir)$
630 | \ifXeTeX
631 | % Load bidi as late as possible as it modifies e.g. graphicx
632 | \usepackage{bidi}
633 | \fi
634 | \ifPDFTeX
635 | \TeXXeTstate=1
636 | \newcommand{\RL}[1]{\beginR #1\endR}
637 | \newcommand{\LR}[1]{\beginL #1\endL}
638 | \newenvironment{RTL}{\beginR}{\endR}
639 | \newenvironment{LTR}{\beginL}{\endL}
640 | \fi
641 | $endif$
642 | $if(natbib)$
643 | \usepackage[$natbiboptions$]{natbib}
644 | \bibliographystyle{$if(biblio-style)$$biblio-style$$else$plainnat$endif$}
645 | $endif$
646 | $if(biblatex)$
647 | \usepackage[$if(biblio-style)$style=$biblio-style$,$endif$$for(biblatexoptions)$$biblatexoptions$$sep$,$endfor$]{biblatex}
648 | $for(bibliography)$
649 | \addbibresource{$bibliography$}
650 | $endfor$
651 | $endif$
652 | $if(nocite-ids)$
653 | \nocite{$for(nocite-ids)$$it$$sep$, $endfor$}
654 | $endif$
655 | \usepackage{csquotes}
656 |
657 | $if(title)$
658 | \title{$title$$if(thanks)$\thanks{$thanks$}$endif$}
659 | $endif$
660 |
661 | \markboth{$if(journal)$$journal$$endif$}{$if(title)$$title$$endif$}
662 |
663 | $if(subtitle)$
664 | \usepackage{etoolbox}
665 | \makeatletter
666 | \providecommand{\subtitle}[1]{% add subtitle to \maketitle
667 |   \apptocmd{\@title}{\par {\large #1 \par}}{}{}
668 | }
669 | \makeatother
670 | \subtitle{$subtitle$}
671 | $endif$
672 | \author{$for(author)$$author$$sep$ \and $endfor$}
673 | \date{$date$}
674 |
675 | \begin{document}
676 | $if(has-frontmatter)$
677 | \frontmatter
678 | $endif$
679 | $if(title)$
680 | \maketitle
681 | $if(abstract)$
682 | \begin{abstract}
683 | $abstract$
684 | \end{abstract}
685 | $endif$
686 | $endif$
687 |
688 | $for(include-before)$
689 | $include-before$
690 |
691 | $endfor$
692 | $if(toc)$
693 | $if(toc-title)$
694 | \renewcommand*\contentsname{$toc-title$}
695 | $endif$
696 | {
697 | $if(colorlinks)$
698 | \hypersetup{linkcolor=$if(toccolor)$$toccolor$$else$$endif$}
699 | $endif$
700 | \setcounter{tocdepth}{$toc-depth$}
701 | \tableofcontents
702 | }
703 | $endif$
704 | $if(lof)$
705 | \listoffigures
706 | $endif$
707 | $if(lot)$
708 | \listoftables
709 | $endif$
710 | $if(linestretch)$
711 | \setstretch{$linestretch$}
712 | $endif$
713 | $if(has-frontmatter)$
714 | \mainmatter
715 | $endif$
716 | $body$
717 |
718 | $if(has-frontmatter)$
719 | \backmatter
720 | $endif$
721 | $if(natbib)$
722 | $if(bibliography)$
723 | $if(biblio-title)$
724 | $if(has-chapters)$
725 | \renewcommand\bibname{$biblio-title$}
726 | $else$
727 | \renewcommand\refname{$biblio-title$}
728 | $endif$
729 | $endif$
730 | \bibliography{$for(bibliography)$$bibliography$$sep$,$endfor$}
731 |
732 | $endif$
733 | $endif$
734 | $if(biblatex)$
735 | \printbibliography$if(biblio-title)$[title=$biblio-title$]$endif$
736 |
737 | $endif$
738 | $for(include-after)$
739 | $include-after$
740 |
741 | $endfor$
742 | \end{document}
743 | EOF
744 | }
745 |
746 | dump_yaml_template() {
747 |   if "${SKIP_REGENERATING_METADATA}" && [ -f metadata.md ]; then
748 |     return 0
749 |   elif [ -n "${CHATGPT_TOKEN}" ] && ! "${SKIP_REGENERATING_METADATA}"; then
750 |     log "Generating paper metadata with ChatGPT..."
751 |     curl https://api.openai.com/v1/chat/completions \
752 |       --silent \
753 |       --show-error \
754 |       --location \
755 |       --header "Content-Type: application/json" \
756 |       --header "Authorization: Bearer ${CHATGPT_TOKEN}" \
757 |       --data '{
758 |         "model": "gpt-3.5-turbo",
759 |         "messages": [
760 |           {
761 |             "role": "system",
762 |             "content": "You are a JSON generator. You only return valid JSON. You generate JSON with information about realistic scientific research papers for a given topic. A user should be convinced that the paper, its author, and all parts of it are real. The fields in the returned JSON object are: journal_name, thanks, author_name, author_organization, author_email, paper_title, paper_abstract"
763 |           },
764 |           {
765 |             "role": "user",
766 |             "content": "Generate a valid JSON object with metadata about an award-winning research paper related to '"${CHATGPT_TOPIC}"'. Include the journal name, author thanks, author name, author organization, author email, paper title, and paper abstract."
767 |           }
768 |         ]
769 |       }' \
770 |       > chatgpt_response.json
771 |     jq --raw-output '.choices[0].message.content' chatgpt_response.json \
772 |       > metadata.json
773 |     log "Generated metadata for '$(
774 |       jq --raw-output '.paper_title' metadata.json
775 |     )'"
776 |     cat <<EOF > metadata.md
777 | ---
778 | documentclass: IEEEtran
779 | classoption:
780 | - journal
781 | - letterpaper
782 | journal: |
783 |   $(jq --raw-output '.journal_name' metadata.json)
784 | title: |
785 |   $(jq --raw-output '.paper_title' metadata.json)
786 | thanks: |
787 |   $(jq --raw-output '.thanks' metadata.json)
788 | author:
789 | - |
790 |   $(jq --raw-output '.author_name' metadata.json)
791 |
792 |   $(jq --raw-output '.author_organization' metadata.json)
793 |
794 |   [$(jq --raw-output '.author_email' metadata.json)](mailto:$(jq --raw-output '.author_email' metadata.json))
795 | abstract: |
796 |   $(jq --raw-output '.paper_abstract' metadata.json)
797 | ...
798 |
799 |
800 | EOF
801 |   else
802 |     cat <<"EOF" > metadata.md
803 | ---
804 | documentclass: IEEEtran
805 | classoption:
806 | # - conference
807 | - journal
808 | # - compsoc # Changes a lot of the typefaces
809 | - letterpaper
810 | journal: |
811 |   International Journal of Cybersecurity Research (IJCR)
812 | title: |
813 |   Adaptive Threat Intelligence Framework for Proactive Cyber Defense
814 | # subtitle: (With a subtitle)
815 | thanks: |
816 |   The authors would like to express their gratitude to the Cybersecurity
817 |   Research Institute (CRI) for providing valuable resources and support
818 |   during the research process
819 | author:
820 | - |
821 |   Emily Collins, PhD
822 |
823 |   Cybersecurity Institute for Advanced Research (CIAR)
824 |
825 |   [`ecollins@ciar.org`](mailto:ecollins@ciar.org)
826 | abstract: |
827 |   In this paper, we present a novel approach for detecting advanced persistent
828 |   threats (APTs) using deep learning techniques. APTs pose significant
829 |   challenges to traditional security systems due to their stealthy and
830 |   persistent nature. Our proposed method leverages a combination of
831 |   convolutional neural networks and recurrent neural networks to analyze
832 |   large-scale network traffic data. We introduce a novel attention mechanism
833 |   that identifies subtle patterns in the data, enabling the detection of APTs
834 |   with high accuracy. Experimental results on real-world datasets demonstrate
835 |   the effectiveness of our approach in identifying previously unknown APTs
836 |   while minimizing false positives. The framework offers a promising solution
837 |   for enhancing the security posture of modern network infrastructures against
838 |   sophisticated cyber threats.
839 | ...
840 |
841 |
842 | EOF
843 |   fi
844 | }
845 |
846 | download_papers() {
847 |   if "${SKIP_DOWNLOADING}"; then
848 |     return 0
849 |   fi
850 |   log "Downloading papers..."
851 |   mkdir -p images
852 |   mkdir -p tex
853 |   mkdir -p unknown_files
854 |   (
855 |     cd images || error_exit "could not cd to $(pwd)/images"
856 |     # This little pipeline is some of the most egregious code I've ever
857 |     # written. But sometimes you have to shit the bed to remind yourself how
858 |     # clean sheets feel. Greetz to Haskell Curry.
859 |     curl \
860 |       --silent \
861 |       --show-error \
862 |       --location \
863 |       "https://arxiv.org/list/${ARXIV_CAT}/current?skip=0&show=${NUM_PAPERS}" \
864 |       | grep --only-matching 'href="/format/[^"]*"' \
865 |       | sed 's,href="/format/\(.*\)",https://arxiv.org/e-print/\1,' \
866 |       | (
867 |         while read -r URL; do \
868 |           worker_wait
869 |           curl \
870 |             --silent \
871 |             --show-error \
872 |             --location \
873 |             "${URL}" \
874 |             | python3 -c 'import base64, gzip, io, os, sys, tarfile; exec(
875 | "def _try(attempt, _except):\n"
876 | "    try:\n"
877 | "        return attempt()\n"
878 | "    except Exception as e:\n"
879 | "        return _except(e)"
880 | ), (
881 | lambda ext: lambda rand: lambda data: (
882 |   lambda _filter: lambda randname:
883 |     _try(
884 |       lambda: (
885 |         lambda f: list(
886 |           open(
887 |             randname(member.name),
888 |             "wb",
889 |           ).write(
890 |             f.extractfile(member).read()
891 |           )
892 |           for member in f.getmembers()
893 |           if _filter(member)
894 |         )
895 |       )( # f
896 |         tarfile.open(mode="r", fileobj=data)
897 |       ),
898 |       lambda e: (
899 |         _try(
900 |           lambda: (
901 |             data.seek(0),
902 |             open(
903 |               randname("gzipped.tex"),
904 |               "wb",
905 |             ).write(
906 |               gzip.decompress(data.read())
907 |             )
908 |           ),
909 |           lambda _e: (
910 |             print("Exception", e, _e),
911 |             open(
912 |               os.path.join("..", "unknown_files", rand(24)), "wb"
913 |             ).write(
914 |               (data.seek(0), data.read(), data.seek(0))[1]
915 |             )
916 |           )
917 |         )
918 |       ),
919 |     )
920 | )( # _filter
921 |   lambda m:
922 |     m if (
923 |       not (m.name.startswith("..") or m.name.startswith("/"))
924 |       and m.isfile()
925 |       and ext(m.name) in {
926 |         "jpg",
927 |         "jpeg",
928 |         "png",
929 |         "tex",
930 |       }
931 |     ) else None
932 | )( # randname
933 |   lambda f:
934 |     f"./{rand(24)}.{ext(f)}"
935 | )
936 | )( # ext
937 |   lambda s:
938 |     os.path.splitext(s)[1][1:].lower()
939 | )( # rand
940 |   lambda n:
941 |     base64.b64encode(
942 |       open("/dev/urandom", "rb").read(n),
943 |       altchars=b"__",
944 |     ).decode("ascii")
945 | )( # data
946 |   io.BytesIO(sys.stdin.buffer.read())
947 | )' &
948 |         done
949 |         wait
950 |       )
951 |     mv ./*.tex ../tex/
952 |   )
953 | }
954 |
955 | deduplicate() {
956 |   if "${SKIP_DOWNLOADING}"; then
957 |     return 0
958 |   fi
959 |   # *nix users born after 1993 don't know how to use awk. All they know is
960 |   # charge they phone, lay massive amounts of pipe, eat hot chip, and lie.
961 |   for d in ./*; do
962 |     if [ -d "${d}" ]; then
963 |       (
964 |         cd "${d}" || error_exit "could not cd to $(pwd)/${d}"
965 |         log "Deduplicating $(pwd)..."
966 |         sha256sum ./* > hashes.txt 2>/dev/null
967 |         < hashes.txt \
968 |           cut -d ' ' -f 1 \
969 |           | sort \
970 |           | uniq -c \
971 |           | sort -n \
972 |           | sed 's/^ *//g' \
973 |           | grep '^\([2-9]\|[0-9][0-9]\)' \
974 |           | cut -d ' ' -f 2 \
975 |           | (
976 |             while read -r HASH; do
977 |               < hashes.txt \
978 |                 grep --fixed-strings "${HASH}" \
979 |                 | head -n -1 \
980 |                 | sed 's/  */ /g' \
981 |                 | cut -d ' ' -f 2 \
982 |                 | xargs rm -vf
983 |             done
984 |           )
985 |         rm -f hashes.txt
986 |       )
987 |     fi
988 |   done
989 | }
990 |
991 | filter_large_files() {
992 |   if "${SKIP_FILTERING}"; then
993 |     return 0
994 |   fi
995 |   log "Removing images greater than ${MAX_SIZE} bytes..."
996 |   mkdir -p big_images
997 |   (
998 |     cd images || error_exit "could not cd to $(pwd)/images"
999 |     du --bytes ./* \
1000 |       | awk '$1 > '"${MAX_SIZE}"' { print $2 }' \
1001 |       | xargs -I {} mv {} ../big_images/
1002 |   )
1003 | }
1004 |
1005 | filter_diagrams() {
1006 |   if ! command -v convert >/dev/null 2>&1 || "${SKIP_FILTERING}"; then
1007 |     return 0
1008 |   fi
1009 |   # Use a rough heuristic to pick out diagram-ish images: the top left and
1010 |   # bottom right corners must be approximately white (or transparent)
1011 |   log "Removing non-diagram images..."
1012 |   mkdir -p non_diagram_images
1013 |   (
1014 |     cd images || error_exit "could not cd to $(pwd)/images"
1015 |     TOTAL="$(find . -type f | wc -l)"
1016 |     NUM="0"
1017 |     for IMAGE in ./*; do
1018 |       worker_wait
1019 |       (
1020 |         convert \
1021 |           "${IMAGE}" \
1022 |           -format \
1023 |           "%[fx:u.p{0,0}.a == 0 ? 999 : u.p{0,0}.r * 255 + u.p{0,0}.g * 255 + u.p{0,0}.b * 255]\n%[fx:u.p{w,h}.a == 0 ? 999 : u.p{w,h}.r * 255 + u.p{w,h}.g * 255 + u.p{w,h}.b * 255]\n" \
1024 |           "info:" \
1025 |           | (
1026 |             while read -r PIXEL_SUM; do
1027 |               if [ -z "${PIXEL_SUM}" ] || [ "${PIXEL_SUM}" -lt 750 ]; then
1028 |                 mv "${IMAGE}" ../non_diagram_images/
1029 |                 break
1030 |               fi
1031 |             done
1032 |           )
1033 |       ) &
1034 |       NUM="$(( NUM + 1 ))"
1035 |       if ! "${QUIET}"; then
1036 |         printf "\r%s%% complete..." "$(( 100 * NUM / TOTAL ))" >&2
1037 |       fi
1038 |     done
1039 |     wait
1040 |     echo
1041 |   )
1042 | }
1043 |
1044 | extract_captions() {
1045 |   if "${SKIP_EXTRACTING}"; then
1046 |     return 0
1047 |   fi
1048 |   log "Generating and testing figure captions..."
1049 |   cat tex/* \
1050 |     | grep --only-matching '\\caption{[^\{]\+}' \
1051 |     | sed 's,\\caption{,,g' \
1052 |     | sed 's,}$,,g' \
1053 |     > unchecked_captions.txt
1054 |   < unchecked_captions.txt \
1055 |     sort \
1056 |     | uniq \
1057 |     | grep '.\{'"${MIN_CAPTION_LENGTH}"',\}' \
1058 |     | shuf \
1059 |     | (
1060 |       TOTAL="$(wc -l < unchecked_captions.txt)"
1061 |       NUM="0"
1062 |       while read -r CAPTION; do
1063 |         worker_wait
1064 |         (check_latex "${CAPTION}" && echo "${CAPTION}") &
1065 |         NUM="$(( NUM + 1 ))"
1066 |         if ! "${QUIET}"; then
1067 |           printf "\r%s%% complete..." "$(( 100 * NUM / TOTAL ))" >&2
1068 |         fi
1069 |       done
1070 |       wait
1071 |       echo
1072 |     ) \
1073 |     > captions.txt
1074 | }
1075 |
1076 | extract_equations() {
1077 |   if "${SKIP_EXTRACTING}"; then
1078 |     return 0
1079 |   fi
1080 |   log "Generating and testing equations..."
1081 |   cat tex/* \
1082 |     | grep --only-matching '^\$\$.*\$\$$' \
1083 |     | sort \
1084 |     | uniq \
1085 |     | grep '^\$\$ *.\{'"${MIN_EQUATION_LENGTH},${MAX_EQUATION_LENGTH}"'\} *\$\$$' \
1086 |     | shuf \
1087 |     | (
1088 |       while read -r EQUATION; do
1089 |         worker_wait
1090 |         (check_latex "${EQUATION}" && echo "${EQUATION}") &
1091 |       done
1092 |       wait
1093 |     ) \
1094 |     > equations.txt
1095 | }
1096 |
1097 | build_paper() {
1098 |   if ! echo "${ORIGINAL_FILE_URL}" | grep '\.\(html\|php\)$' >/dev/null \
1099 |     && ! echo "${ORIGINAL_FILE_URL}" | grep 'http.*\/[^.]*$' >/dev/null; then
1100 |     # Pandoc cannot download e.g. epub files, but it must download web-based
1101 |     # files itself to correctly pull media and rewrite URLs for images. In this
1102 |     # case, we download non-HTML files on its behalf.
1103 |     log "Downloading input file..."
1104 |     curl \
1105 |       --silent \
1106 |       --show-error \
1107 |       --location \
1108 |       --output "input.${FROM_FORMAT}" \
1109 |       "${ORIGINAL_FILE_URL}"
1110 |     ORIGINAL_FILE_URL="input.${FROM_FORMAT}"
1111 |   fi
1112 |
1113 |   log "Building paper..."
1114 |   pandoc \
1115 |     --from "${FROM_FORMAT}" \
1116 |     --to "gfm" \
1117 |     --wrap none \
1118 |     --extract-media media \
1119 |     --output - \
1120 |     "${ORIGINAL_FILE_URL}" \
1121 |     | grep -v 'cover\.\(jpe\?g\|png\)' \
1122 |     | grep -v '!\[.*\](.*\.\(svg\|gif\))' \
1123 |     | (
1124 |       while read -r LINE; do
1125 |         echo "${LINE}"
1126 |         if [ "$(rand_int "${FIGURE_PROB}")" = 1 ]; then
1127 |           printf "\n\n![%s](%s)\n\n" "$(
1128 |             < captions.txt \
1129 |               shuf \
1130 |               | head -n 1
1131 |           )" "$(
1132 |             find images -type f \
1133 |               | shuf \
1134 |               | head -n 1
1135 |           )"
1136 |         elif [ "$(rand_int "${EQUATION_PROB}")" = 1 ]; then
1137 |           printf "\n\n%s\n\n" "$(
1138 |             < equations.txt \
1139 |               shuf \
1140 |               | head -n 1
1141 |           )"
1142 |         fi
1143 |       done
1144 |     ) \
1145 |     | cat metadata.md - \
1146 |     | iconv \
1147 |       --from-code utf-8 \
1148 |       --to-code ascii//translit \
1149 |       ${ICONV_PARAM} \
1150 |     | pandoc \
1151 |       --from "markdown" \
1152 |       --to latex \
1153 |       --template template.tex \
1154 |       --output output.tex \
1155 |       -
1156 |   pdflatex output.tex
1157 | }
1158 |
1159 | check_requirements
1160 | parse_args "${@}"
1161 | (
1162 |   open_temp_dir # Changes the working directory to $TEMP_DIR
1163 |   dump_latex_template
1164 |   dump_yaml_template
1165 |   download_papers
1166 |   deduplicate
1167 |   filter_large_files
1168 |   filter_diagrams
1169 |   deduplicate
1170 |   extract_captions
1171 |   extract_equations
1172 |   build_paper
1173 | )
1174 | cp "${TEMP_DIR}/output.pdf" "${OUTPUT_FILE}"
1175 |
1176 |
--------------------------------------------------------------------------------
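The hash-based pruning in `deduplicate` above is easier to follow in isolation. Below is a minimal, self-contained sketch of the same idea, with two deliberate simplifications that are not in the script: `uniq -d` stands in for the `uniq -c | sort -n | grep` counting pipeline, and the sample files (`a.txt`, `b.txt`, `c.txt`) are made up for illustration. It assumes GNU coreutils (two-space `sha256sum` output, `head -n -1`) and file names without spaces.

```shell
# Sketch: hash every file, then delete all but one copy per duplicated hash.
set -eu
dir="$(mktemp -d)"
cd "${dir}"
printf 'same' > a.txt        # duplicate content
printf 'same' > b.txt        # duplicate content
printf 'different' > c.txt   # unique content
sha256sum ./* > hashes.txt
cut -d ' ' -f 1 hashes.txt \
  | sort \
  | uniq -d \
  | while read -r HASH; do
      # Every file with this hash, except the last one listed, is removed
      grep --fixed-strings "${HASH}" hashes.txt \
        | sed 's/  */ /g' \
        | cut -d ' ' -f 2 \
        | head -n -1 \
        | xargs rm -f
    done
rm -f hashes.txt
```

Afterwards exactly one of the two identical files remains (the last one in glob order), alongside the unique `c.txt`, mirroring what `deduplicate` does inside each directory of downloaded paper assets.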