├── .github └── workflows │ ├── cron_runs.yaml │ └── publish_md_test.yml ├── .gitignore ├── .pre-commit-config.yaml ├── LICENSE ├── README.md ├── arxiv_scraper.py ├── configs ├── authors.template.txt ├── authors.txt ├── base_prompt.txt ├── config.ini ├── paper_topics.template.txt ├── paper_topics.txt └── postfix_prompt.txt ├── filter_papers.py ├── in └── debug_papers.json ├── main.py ├── out └── .gitignore ├── parse_json_to_md.py ├── push_to_slack.py └── requirements.txt /.github/workflows/cron_runs.yaml: -------------------------------------------------------------------------------- 1 | name: Run daily arxiv 2 | 3 | on: 4 | schedule: 5 | # * is a special character in YAML so you have to quote this string 6 | - cron: '0 13 * * *' 7 | workflow_dispatch: 8 | 9 | 10 | permissions: 11 | contents: read 12 | 13 | jobs: 14 | build: 15 | runs-on: ubuntu-latest 16 | 17 | steps: 18 | - uses: actions/checkout@v3 19 | - name: Set up Python 3.10 20 | uses: actions/setup-python@v3 21 | with: 22 | python-version: "3.10" 23 | - name: Install dependencies 24 | run: | 25 | python -m pip install --upgrade pip 26 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 27 | - name: Run main 28 | env: 29 | OAI_KEY: ${{ secrets.OAI_KEY }} 30 | SLACK_KEY: ${{ secrets.SLACK_KEY }} 31 | SLACK_CHANNEL_ID: ${{ secrets.SLACK_CHANNEL_ID }} 32 | run: | 33 | python main.py 34 | - name: Upload results 35 | uses: actions/upload-artifact@v3 36 | with: 37 | name: arxiv-scanner-outputs 38 | path: out/ 39 | retention-days: 5 40 | -------------------------------------------------------------------------------- /.github/workflows/publish_md_test.yml: -------------------------------------------------------------------------------- 1 | name: Publish output MD to github pages 2 | 3 | on: 4 | workflow_dispatch: 5 | workflow_run: 6 | workflows: ["Run daily arxiv"] 7 | types: 8 | - completed 9 | 10 | 11 | permissions: write-all 12 | 13 | jobs: 14 | build: 15 | environment: 16 | name: github-pages 17 | url: ${{ steps.deployment.outputs.page_url }} 18 | runs-on: ubuntu-latest 19 | outputs: 20 | file-exists: ${{steps.file-check.outputs.file_exists}} 21 | steps: 22 | - uses: actions/checkout@v3 23 | - name: Download artifact 24 | id: download-artifact 25 | uses: dawidd6/action-download-artifact@v2 26 | with: 27 | workflow: cron_runs.yaml 28 | workflow_conclusion: success 29 | name: arxiv-scanner-outputs 30 | - name: Display structure of downloaded files 31 | run: ls -R 32 | - name: Check for output.md 33 | id: check_files 34 | uses: andstor/file-existence-action@v2 35 | with: 36 | files: output.md 37 | - name: Convert output.md to pages 38 | uses: wranders/markdown-to-pages-action@v0.1 39 | if: steps.check_files.outputs.files_exists == 'true' 40 | with: 41 | token: ${{ secrets.GITHUB_TOKEN }} 42 | file: output.md 43 | - uses: actions/upload-pages-artifact@v2 44 | if: steps.check_files.outputs.files_exists == 'true' 45 | with: 46 | path: dist 47 | - uses: actions/deploy-pages@v1 48 | if: steps.check_files.outputs.files_exists == 'true' 49 | id: deployment -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # dont commit the actual api keys or other personal info to the public github 2 | #configs/authors.txt 3 | configs/keys.ini 4 | #configs/paper_topics.txt 5 | # dont commit venv and ide stuff 6 | venv 7 | .idea 8 | -------------------------------------------------------------------------------- 
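Note that `configs/keys.ini` (ignored above) is where `filter_papers.py` looks for API keys when run as a standalone script. Below is a minimal sketch of creating and reading it, assuming the `[KEYS]` section and key names that `filter_papers.py` expects; the key values are placeholders.

```python
# Create a local configs/keys.ini (never commit it) and read it back the same
# way filter_papers.py does. All key values below are placeholders.
import configparser

keys = configparser.ConfigParser()
keys["KEYS"] = {
    "openai": "sk-...",                # OpenAI API key
    "semanticscholar": "your-s2-key",  # Semantic Scholar API key
}
with open("configs/keys.ini", "w") as f:
    keys.write(f)

keyconfig = configparser.ConfigParser()
keyconfig.read("configs/keys.ini")
print(keyconfig["KEYS"]["openai"][:3])  # sanity check, prints "sk-"
```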
/.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/astral-sh/ruff-pre-commit 3 | # Ruff version. 4 | rev: v0.1.5 5 | hooks: 6 | # Run the linter. 7 | - id: ruff 8 | # Run the formatter. 9 | - id: ruff-format -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. 
For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # GPT4 paper assistant: A daily ArXiv scanner 2 | 3 | This repo implements a very simple daily scanner for Arxiv that uses GPT4 and author matches to find papers you might find interesting. 4 | It will run daily via github actions and can post this information to slack via a bot or just render it in a static github-pages website. 
5 | 6 | A simple demo of the daily papers can be seen [here](https://tatsu-lab.github.io/gpt_paper_assistant/) running on `cs.CL`. 7 | 8 | As a cost estimate, running this on all of `cs.CL` cost $0.07 on 2/7/2024. 9 | 10 | ## Changelog 11 | - **2/15/2024**: fixed a bug with author parsing in the RSS format + cost estimates for title filtering being off + crash when 0 papers are on the feed. 12 | - **2/7/2024**: fixed a critical issue from ArXiv changing their RSS format. Added and enabled title filtering to reduce costs. 13 | 14 | 15 | ## Quickstart 16 | These are the minimal steps needed to get the scanner running. It is highly recommended to read the whole thing to decide what you want to run. 17 | 18 | ### Running on github actions 19 | 20 | 1. Copy/fork this repo to a new github repo and [enable scheduled workflows](https://docs.github.com/en/actions/using-workflows/disabling-and-enabling-a-workflow) if you fork it. 21 | 2. Copy `configs/paper_topics.template.txt` to `configs/paper_topics.txt` and fill it out with the types of papers you want to follow. 22 | 3. Copy `configs/authors.template.txt` to `configs/authors.txt` and list the authors you actually want to follow. The numbers behind the author are important. They are semantic scholar author IDs, which you can find by looking up the authors on semantic scholar and taking the numbers at the end of the URL. 23 | 4. Set your desired ArXiv categories in `configs/config.ini`. 24 | 5. Set your OpenAI API key (`OAI_KEY`) as a [github secret](https://docs.github.com/en/actions/security-guides/using-secrets-in-github-actions#creating-secrets-for-a-repository). 25 | 6. In your repo settings, set the github pages build source to [github actions](https://docs.github.com/en/pages/getting-started-with-github-pages/configuring-a-publishing-source-for-your-github-pages-site#publishing-with-a-custom-github-actions-workflow). 26 | 27 | At this point your bot should run daily and publish a static website. You can test this by running the github action workflow manually. 28 | 29 | **Optional but highly recommended**: 30 | 31 | 7. Get and set up a semantic scholar API key (`S2_KEY`) as a github secret. Otherwise the author search step will be very slow. 32 | 8. [Set up a slack bot](https://api.slack.com/start/quickstart), get the OAuth key, and set it as `SLACK_KEY` in a github secret. 33 | 9. Make a channel for the bot (and invite it to the channel), get its [Slack channel id](https://stackoverflow.com/questions/40940327/what-is-the-simplest-way-to-find-a-slack-team-id-and-a-channel-id), and set it as `SLACK_CHANNEL_ID` in a github secret. 34 | 10. Take a look at `configs/config.ini` to tweak how things are filtered. 35 | 11. Set the github repo to private to avoid github actions being [set to inactive after 60 days](https://docs.github.com/en/actions/using-workflows/disabling-and-enabling-a-workflow). 36 | 37 | Each day at 1pm UTC, the bot will run, post to slack, and publish a github pages website (see the publish_md and cron_runs actions for details). 38 | 39 | ### Running locally 40 | 41 | The steps are generally the same as above, but you have to set up the environment via `requirements.txt`. 42 | 43 | Instead of passing credentials via github secrets, you have to set the environment variables `OAI_KEY`, `SLACK_KEY`, and `SLACK_CHANNEL_ID`.
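A minimal local-run sketch is shown below; the key values are placeholders, these are the same variable names the GitHub Action passes as secrets, and `S2_KEY` is optional.

```python
# Set the credentials the scanner expects in the environment, then run the
# daily scan. All key values below are placeholders.
import os
import subprocess

os.environ["OAI_KEY"] = "sk-..."                # OpenAI API key (required)
os.environ["S2_KEY"] = "your-s2-key"            # optional: speeds up author lookups
os.environ["SLACK_KEY"] = "xoxb-..."            # optional: Slack bot OAuth token
os.environ["SLACK_CHANNEL_ID"] = "C0123456789"  # optional: channel to post to

# the child process inherits the environment set above
subprocess.run(["python", "main.py"], check=True)
```

Outputs are written under `out/` (see `output_path` in `configs/config.ini`).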
44 | 45 | Once the environment variables are set, just call `main.py` to run everything. 46 | 47 | **Other notes:** 48 | You may not want to push to slack, in which case set your desired output endpoints (json, markdown, slack) in the `dump_json`, `dump_md`, and `push_to_slack` fields of `configs/config.ini`. 49 | 50 | If the semantic scholar API times out or is slow, you should get an [S2 api key](https://www.semanticscholar.org/product/api#api-key-form) and set it as `S2_KEY` in your environment variables. 51 | (Due to the limitations of github actions, this will only help if the code is run locally.) 52 | 53 | **Making it run on its own:** 54 | This whole thing takes almost no compute, so you can rent the cheapest VM from AWS, put this repo in it, install the `requirements.txt`, 55 | set up the environment variables appropriately, and add the following crontab: 56 | ``` 57 | 0 13 * * * python ~/arxiv_scanner/main.py 58 | ``` 59 | This crontab will run the script every day at 1pm UTC (5am or 6am Pacific, depending on daylight saving). 60 | 61 | ## Making the `paper_topics.txt` prompt 62 | The `paper_topics.txt` file is used to generate the prompt for GPT. It is a list of topics that you want to follow. 63 | One set of examples might be something like 64 | ```text 65 | 1. New methodological improvements to RLHF or instruction-following which are specific fine-tuning steps that are taken to make language models better at following user instructions across a range of tasks. 66 | - Relevant: papers that discuss specific methods like RLHF, or instruction-tuning datasets, improving these methods, or analyzing them. 67 | - Not relevant: papers about adaptation to some task. Simply following instructions or inputs are not sufficient. 68 | 2. Shows new powerful test set contamination or membership inference methods for language models. Test set contamination is the phenomenon where a language model observes a benchmark dataset during pretraining. 69 | - Relevant: test statistics that can detect contamination of benchmarks in language models. statistics that can provide guarantees are more interesting. membership inference methods that are general enough to apply to language models are also relevant. 70 | - Not relevant: any papers that do not consider language models, or that do not consider test set contamination. 71 | 3. Shows a significant advance in the performance of diffusion language models. 72 | - Relevant: papers that study language models that are also diffusion models. Continuous diffusions are even more relevant, while discrete diffusions are less so. 73 | - Not relevant: papers about image diffusions like DALL-E or Stable Diffusion, or papers that do not explicitly mention language models or applications to text. 74 | ``` 75 | This is just a standard prompt, but being very specific can help, especially for things like 'diffusion language models' or 'instruction-following', where the LM can get confused about whether image diffusions are relevant, or if doing some task better is sufficient to improve instruction following. 76 | 77 | You may also want to follow this with some general interest areas like 78 | ```text 79 | In suggesting papers to your friend, remember that he enjoys papers on statistical machine learning, and generative modeling in natural language processing. 80 | Your friend also likes learning about surprising empirical results in language models, as well as clever statistical tricks. 81 | He does not want to read papers that are about primarily applications of methods to specific domains.
82 | ``` 83 | 84 | ## Details of how it works 85 | 86 | The script grabs a candidate set of ArXiv papers for a specific day via the RSS feeds. To avoid double-announcing papers, it will only grab an RSS feed that has been updated within the last day. To avoid missing papers, you'd want to run this every day. 87 | It filters out any `UPDATED` papers and announces only new ones. 88 | 89 | The filtering logic is pretty simple. We first check for author matches. 90 | 1. Do a lookup of the authors on semantic scholar, getting a list of candidate matches. 91 | 2. Check the authors of the paper. If an author's semantic scholar id matches someone in `authors.txt`, the paper goes into the candidate set with a default score of `author_match_score`. 92 | 93 | We then check for GPT-evaluated relevance. We do this in two steps. 94 | 1. Filter out any papers that have no authors with h-index above `hcutoff` in `config.ini`. This is to reduce costs. 95 | 2. All remaining examples get batched, and are evaluated by a GPT model specified by `model` in `config.ini`. **You should only use GPT3.5 for debugging. It does not work well for this purpose!** 96 | This step uses the following prompt setup defined in `configs/`: 97 | 98 | >You are a helpful paper reading assistant whose job is to read daily posts from ArXiv and identify a few papers that might be relevant for your friend. There will be up to 5 papers below. Your job is to find papers that: 99 | > 1. Criterion 1 100 | > 2. Criterion 2 101 | > 102 | > [PAPERS] 103 | > 104 | > Write the response in JSONL format with {ARXIVID, COMMENT, RELEVANCE, NOVELTY} on each line, one for each paper. 105 | The ARXIVID should be the ArXiv ID. 106 | The COMMENT should identify whether there is a criteria that match the paper very closely. If so, it should mention it by number (no need to mention the non-matching criteria). 107 | These matches should not be based on general terms like "language modeling" or "advancements" and should specifically refer to a criterion. 108 | The RELEVANCE should be a relevance score from 1-10 where 10 must be directly related to the exact, specific criterion with near-synonym keyword matches and authors who are known for working on the topic, 1 is irrelevant to any criterion, and unrelated to your friend's general interest area, 2-3 is papers that are relevant to the general interest area, but not specific criteria, and 5 is a direct match to a specific criterion. 109 | The NOVELTY should be a score from 1 to 10, where 10 is a groundbreaking, general-purpose discovery that would transform the entire field and 1 is work that improves one aspect of a problem or is an application to a very specific field. Read the abstract carefully to determine this and assume that authors cannot be trusted in their claims of novelty. 110 | 111 | 3. GPT scores the papers for relevance (to the topics in `configs/paper_topics.txt`) and novelty (scale 1-10) 112 | 4. Papers are filtered out if they have scores below either the relevance or novelty cutoff in `config.ini` 113 | 5. Papers are given an overall score based on equal weighting of relevance and novelty 114 | 115 | Finally, all papers are sorted by the max of their `author_match_score` and the sum of the GPT-rated relevance and novelty scores (the relevance and novelty scores will only show up in the final output if they are above the cutoff thresholds you set in the config file). Then the papers are rendered and pushed to their endpoints (text files or Slack).
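As a rough illustration (not the repo's exact code), here is how a response in that JSONL format turns into scores and a ranking, using the default cutoffs from `configs/config.ini`:

```python
# Hedged sketch of the scoring described above: parse each JSONL line, drop
# papers below the relevance/novelty cutoffs, and rank by RELEVANCE + NOVELTY.
import json

sample_response = """\
{"ARXIVID": "2311.05584", "COMMENT": "Matches criterion 1.", "RELEVANCE": 9, "NOVELTY": 7}
{"ARXIVID": "2311.05573", "COMMENT": "No criterion matches.", "RELEVANCE": 1, "NOVELTY": 3}"""

relevance_cutoff, novelty_cutoff = 3, 3  # mirrors [FILTERING] in configs/config.ini
scores = {}
for line in sample_response.splitlines():
    paper = json.loads(line)
    if paper["RELEVANCE"] >= relevance_cutoff and paper["NOVELTY"] >= novelty_cutoff:
        scores[paper["ARXIVID"]] = paper["RELEVANCE"] + paper["NOVELTY"]

# author-matched papers would instead enter with author_match_score (15.0 by default)
ranked = sorted(scores, key=scores.get, reverse=True)
print(ranked)  # ['2311.05584']
```

In the actual pipeline this logic lives in `filter_papers.py`, which also merges in author-matched papers and dumps the sorted results to the configured endpoints.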
116 | 117 | ## Contributing 118 | This repo uses ruff for linting and formatting - `ruff check .` and `ruff format .`. 119 | Please install the pre-commit hook by running `pre-commit install`. 120 | 121 | ### Testing and improving the GPT filter 122 | The `filter_papers.py` code can also be run as a standalone script. 123 | This will take a batch of papers in `in/debug_papers.json`, run whatever config and prompts you have, 124 | and return an output to `out/filter_paper_test.debug.json`. If you find the bot makes mistakes, you can find the associated batch in `out/gpt_paper_batches.debug.json` and copy that into the relevant `debug_papers` file. 125 | 126 | This lets you build a benchmark for the filter and see what comes out on the other side. 127 | 128 | ## Other stuff 129 | This repo and code were originally built by Tatsunori Hashimoto and are licensed under the Apache 2.0 license. 130 | Thanks to Chenglei Si for testing and benchmarking the GPT filter. 131 | -------------------------------------------------------------------------------- /arxiv_scraper.py: -------------------------------------------------------------------------------- 1 | import configparser 2 | import dataclasses 3 | import json 4 | from datetime import datetime, timedelta 5 | from html import unescape 6 | from typing import List, Optional 7 | import re 8 | import arxiv 9 | 10 | import feedparser 11 | from dataclasses import dataclass 12 | 13 | 14 | class EnhancedJSONEncoder(json.JSONEncoder): 15 | def default(self, o): 16 | if dataclasses.is_dataclass(o): 17 | return dataclasses.asdict(o) 18 | return super().default(o) 19 | 20 | 21 | @dataclass 22 | class Paper: 23 | # paper class should track the list of authors, paper title, abstract, arxiv id 24 | authors: List[str] 25 | title: str 26 | abstract: str 27 | arxiv_id: str 28 | 29 | # add a hash function using arxiv_id 30 | def __hash__(self): 31 | return hash(self.arxiv_id) 32 | 33 | 34 | def is_earlier(ts1, ts2): 35 | # compares two arxiv ids, returns true if ts1 is older than ts2 36 | return int(ts1.replace(".", "")) < int(ts2.replace(".", "")) 37 | 38 | 39 | def get_papers_from_arxiv_api(area: str, timestamp, last_id) -> List[Paper]: 40 | # look for papers that are newer than the newest papers in RSS. 41 | # we do this by looking at last_id and grabbing everything newer.
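    # e.g. for area "cs.CL" the query built below looks like
    #   (cs.CL) AND submittedDate:[20240211* TO 20240215*]
    # i.e. a roughly four-day window ending at the RSS feed's "updated" timestamp;
    # anything at or before last_id is then dropped via the is_earlier check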
42 | end_date = timestamp 43 | start_date = timestamp - timedelta(days=4) 44 | search = arxiv.Search( 45 | query="(" 46 | + area 47 | + ") AND submittedDate:[" 48 | + start_date.strftime("%Y%m%d") 49 | + "* TO " 50 | + end_date.strftime("%Y%m%d") 51 | + "*]", 52 | max_results=None, 53 | sort_by=arxiv.SortCriterion.SubmittedDate, 54 | ) 55 | results = list(arxiv.Client().results(search)) 56 | api_papers = [] 57 | for result in results: 58 | new_id = result.get_short_id()[:10] 59 | if is_earlier(last_id, new_id): 60 | authors = [author.name for author in result.authors] 61 | summary = result.summary 62 | summary = unescape(re.sub("\n", " ", summary)) 63 | paper = Paper( 64 | authors=authors, 65 | title=result.title, 66 | abstract=summary, 67 | arxiv_id=result.get_short_id()[:10], 68 | ) 69 | api_papers.append(paper) 70 | return api_papers 71 | 72 | 73 | def get_papers_from_arxiv_rss(area: str, config: Optional[dict]) -> List[Paper]: 74 | # get the feed from http://export.arxiv.org/rss/ and use the updated timestamp to avoid duplicates 75 | updated = datetime.utcnow() - timedelta(days=1) 76 | # format this into the string format 'Fri, 03 Nov 2023 00:30:00 GMT' 77 | updated_string = updated.strftime("%a, %d %b %Y %H:%M:%S GMT") 78 | feed = feedparser.parse( 79 | f"http://export.arxiv.org/rss/{area}", modified=updated_string 80 | ) 81 | if feed.status == 304: 82 | if (config is not None) and config["OUTPUT"]["debug_messages"]: 83 | print("No new papers since " + updated_string + " for " + area) 84 | # if there are no new papers return an empty list 85 | return [], None, None 86 | # get the list of entries 87 | entries = feed.entries 88 | if len(feed.entries) == 0: 89 | print("No entries found for " + area) 90 | return [], None, None 91 | last_id = feed.entries[0].link.split("/")[-1] 92 | # parse last modified date 93 | timestamp = datetime.strptime(feed.feed["updated"], "%a, %d %b %Y %H:%M:%S +0000") 94 | paper_list = [] 95 | for paper in entries: 96 | # ignore updated papers 97 | if paper["arxiv_announce_type"] != "new": 98 | continue 99 | # extract area 100 | paper_area = paper.tags[0]["term"] 101 | # ignore papers not in primary area 102 | if (area != paper_area) and (config["FILTERING"].getboolean("force_primary")): 103 | print(f"ignoring {paper.title}") 104 | continue 105 | # otherwise make a new paper, for the author field make sure to strip the HTML tags 106 | authors = [ 107 | unescape(re.sub("<[^<]+?>", "", author)).strip() 108 | for author in paper.author.replace("\n", ", ").split(",") 109 | ] 110 | # strip html tags from summary 111 | summary = re.sub("<[^<]+?>", "", paper.summary) 112 | summary = unescape(re.sub("\n", " ", summary)) 113 | # strip the last pair of parentehses containing (arXiv:xxxx.xxxxx [area.XX]) 114 | title = re.sub("\(arXiv:[0-9]+\.[0-9]+v[0-9]+ \[.*\]\)$", "", paper.title) 115 | # remove the link part of the id 116 | id = paper.link.split("/")[-1] 117 | # make a new paper 118 | new_paper = Paper(authors=authors, title=title, abstract=summary, arxiv_id=id) 119 | paper_list.append(new_paper) 120 | 121 | return paper_list, timestamp, last_id 122 | 123 | 124 | def merge_paper_list(paper_list, api_paper_list): 125 | api_set = set([paper.arxiv_id for paper in api_paper_list]) 126 | merged_paper_list = api_paper_list 127 | for paper in paper_list: 128 | if paper.arxiv_id not in api_set: 129 | merged_paper_list.append(paper) 130 | return merged_paper_list 131 | 132 | 133 | def get_papers_from_arxiv_rss_api(area: str, config: Optional[dict]) -> List[Paper]: 134 | 
paper_list, timestamp, last_id = get_papers_from_arxiv_rss(area, config) 135 | # if timestamp is None: 136 | # return [] 137 | # api_paper_list = get_papers_from_arxiv_api(area, timestamp, last_id) 138 | # merged_paper_list = merge_paper_list(paper_list, api_paper_list) 139 | # return merged_paper_list 140 | return paper_list 141 | 142 | 143 | if __name__ == "__main__": 144 | config = configparser.ConfigParser() 145 | config.read("configs/config.ini") 146 | paper_list, timestamp, last_id = get_papers_from_arxiv_rss("cs.CL", config) 147 | print(timestamp) 148 | api_paper_list = get_papers_from_arxiv_api("cs.CL", timestamp, last_id) 149 | merged_paper_list = merge_paper_list(paper_list, api_paper_list) 150 | print([paper.arxiv_id for paper in merged_paper_list]) 151 | print([paper.arxiv_id for paper in paper_list]) 152 | print([paper.arxiv_id for paper in api_paper_list]) 153 | print("success") 154 | -------------------------------------------------------------------------------- /configs/authors.template.txt: -------------------------------------------------------------------------------- 1 | # Copy me to authors.txt and replace the authors below with people you actually want to follow 2 | # the numbers behind the author are important - these are semantic scholar author IDs which you can find by looking up the authors on semantic scholar and taking the numbers at the end of the URL 3 | # Example line: 4 | Tatsunori B. Hashimoto, 3056528 5 | -------------------------------------------------------------------------------- /configs/authors.txt: -------------------------------------------------------------------------------- 1 | # Stanford NLP faculty. 2 | Tatsunori B. Hashimoto, 3056528 3 | Tatsunori Hashimoto, 2117567142 4 | Percy Liang, 145419642 5 | Dan Jurafsky, 1746807 6 | Christopher D. Manning, 144783904 7 | Chris Potts, 1741530 8 | Diyi Yang, 2022168 9 | Monica Lam, 39682108 10 | -------------------------------------------------------------------------------- /configs/base_prompt.txt: -------------------------------------------------------------------------------- 1 | You are a helpful paper reading assistant whose job is to read daily posts from ArXiv and identify a few papers that your friend will enjoy reading. 2 | Your job is to carefully read the paper titles and abstracts below and find the ones that match the criteria below. -------------------------------------------------------------------------------- /configs/config.ini: -------------------------------------------------------------------------------- 1 | [SELECTION] 2 | author_match_score = 15.0 3 | run_openai = true 4 | # DO NOT USE GPT 3.5 TURBO EXCEPT FOR DEBUGGING 5 | #model = gpt-3.5-turbo 6 | #model = gpt-3.5-turbo-1106 7 | #model = gpt-4 8 | model = gpt-4-1106-preview 9 | # cost quality tradeoff - larger batches are cheaper but less accurate. 
10 | batch_size = 5 11 | 12 | [FILTERING] 13 | #arxiv_category = cs.CL,cs.LG,cs.AI 14 | arxiv_category = cs.CL 15 | # force_primary ignores papers that are only cross-listed into the arxiv_category 16 | force_primary = true 17 | # draws num_samples samples from the LM and averages scores 18 | num_samples = 1 19 | hcutoff = 15 20 | relevance_cutoff = 3 21 | novelty_cutoff = 3 22 | # whether to do author matching 23 | author_match = true 24 | 25 | [OUTPUT] 26 | debug_messages = true 27 | dump_debug_file = true 28 | output_path = out/ 29 | # options: json, md, slack 30 | dump_json = true 31 | dump_md = true 32 | push_to_slack = true -------------------------------------------------------------------------------- /configs/paper_topics.template.txt: -------------------------------------------------------------------------------- 1 | # this file goes into the prompt to tell the bot your interest areas. you can write down your interests in whatever format you want 2 | # they can be simple, like a one line description like 3 | 1. Shows new powerful test set contamination or membership inference methods for language models 4 | # or in the case of more general areas like `instruction-following`, you might want to give examples 5 | 2. New methodological improvements to RLHF or instruction-following which are specific fine-tuning steps that are taken to make language models better at following user instructions across a range of tasks. 6 | - Relevant: papers that discuss specific methods like RLHF, or instruction-tuning datasets, improving these methods, or analyzing them. Usually these papers will explicitly mention RLHF, instruction-following or instruction-tuning. 7 | - Not relevant: papers about adaptation to some task. Simply following instructions or inputs are not sufficient. -------------------------------------------------------------------------------- /configs/paper_topics.txt: -------------------------------------------------------------------------------- 1 | 1. New methodological improvements to RLHF or instruction-following which are specific fine-tuning steps that are taken to make language models better at following user instructions across a range of tasks. 2 | - Relevant: papers that discuss specific methods like RLHF, or instruction-tuning datasets, improving these methods, or analyzing them. Usually these papers will explicitly mention RLHF, instruction-following or instruction-tuning. 3 | - Not relevant: papers about adaptation to some task. Simply following instructions or inputs are not sufficient. 4 | 2. Shows new powerful test set contamination or membership inference methods for language models. Test set contamination is the phenomenon where a language model observes a benchmark dataset during pretraining. 5 | - Relevant: test statistics that can detect contamination of benchmarks in language models. statistics that can provide guarantees are more interesting. membership inference methods that are general enough to apply to language models are also relevant. 6 | - Not relevant: any papers that do not consider language models, or that do not consider test set contamination. 7 | 3. Shows a significant advance in the performance of diffusion language models. 8 | - Relevant: papers that study language models that are also diffusion models. Continuous diffusions are even more relevant, while discrete diffusions are less so. 
9 | - Not relevant: papers about image diffusions like DALL-E or Stable Diffusion, or papers that do not explicitly mention language models or applications to text. 10 | 4. Describes new paradigms to evaluating open-ended text generation. Evaluating the outputs of language models is hard, especially in open-ended settings like for chatbots. 11 | - Relevant: papers that fundamentally rethink language model evaluation -- especially by accounting for subjectivity or using adversaries. 12 | - Not relevant: specific evaluations for specific tasks, identifying new properties or flaws of language models, or simply collecting new data. 13 | 5. Conducts surveys or provides data into real-world usage and safety properties of language models. 14 | - Relevant: papers that create new datasets or surveys on real-world usage of language models. 15 | - Not relevant: papers that apply language models to new real-world tasks. 16 | 6. Studies 'scaling laws' in the context of neural networks. Scaling laws refer to the very clear power-law relationship between the size or computational power used to train a model and the performance of that model. 17 | - Relevant: theoretical or conceptual explanation behind scaling laws for language models. 18 | - Not relevant: papers that have experiments at different model scales (but do not explicitly fit a scaling law) or papers that mention scaling laws, but the scaling laws are not the central subject of the paper 19 | 20 | In suggesting papers to your friend, remember that he enjoys papers on statistical machine learning, and generative modeling in natural language processing. 21 | Your friend also likes learning about surprising empirical results in language models, as well as clever statistical tricks. 22 | He does not want to read papers that are about primarily applications of methods to specific domains. -------------------------------------------------------------------------------- /configs/postfix_prompt.txt: -------------------------------------------------------------------------------- 1 | Write the response in JSONL format with {ARXIVID, COMMENT, RELEVANCE, NOVELTY} on each line, one for each paper. 2 | The ARXIVID should be the ArXiv ID. 3 | The COMMENT should identify whether there is a criteria that match the paper very closely. If so, it should mention it by number (no need to mention the non-matching criteria). 4 | These matches should not be based on general terms like "language modeling" or "advancements" and should specifically refer to a criterion. 5 | The RELEVANCE should be a relevance score from 1-10 where 10 must be directly related to the exact, specific criterion with near-synonym keyword matches and authors who are known for working on the topic, 1 is irrelevant to any criterion, and unrelated to your friend's general interest area, 3 is papers that are relevant to the general interest area, but not specific criteria, and 5 is a direct match to a specific criterion. 6 | The NOVELTY should be a score from 1 to 10, where 10 is a groundbreaking, general-purpose discovery that would transform the entire field and 1 is work that improves one aspect of a problem or is an application to a very specific field. Read the abstract carefully to determine this and assume that authors cannot be trusted in their claims of novelty. 
7 | -------------------------------------------------------------------------------- /filter_papers.py: -------------------------------------------------------------------------------- 1 | import configparser 2 | import dataclasses 3 | import json 4 | import re 5 | from typing import List 6 | 7 | import retry 8 | from openai import OpenAI 9 | from tqdm import tqdm 10 | 11 | from arxiv_scraper import Paper 12 | from arxiv_scraper import EnhancedJSONEncoder 13 | 14 | 15 | def filter_by_author(all_authors, papers, author_targets, config): 16 | # filter and parse the papers 17 | selected_papers = {} # pass to output 18 | all_papers = {} # dict for later filtering 19 | sort_dict = {} # dict storing key and score 20 | 21 | # author based selection 22 | for paper in papers: 23 | all_papers[paper.arxiv_id] = paper 24 | for author in paper.authors: 25 | if author in all_authors: 26 | for alias in all_authors[author]: 27 | if alias["authorId"] in author_targets: 28 | selected_papers[paper.arxiv_id] = { 29 | **dataclasses.asdict(paper), 30 | **{"COMMENT": "Author match"}, 31 | } 32 | sort_dict[paper.arxiv_id] = float( 33 | config["SELECTION"]["author_match_score"] 34 | ) 35 | break 36 | return selected_papers, all_papers, sort_dict 37 | 38 | 39 | def filter_papers_by_hindex(all_authors, papers, config): 40 | # filters papers by checking to see if there's at least one author with > hcutoff hindex 41 | paper_list = [] 42 | for paper in papers: 43 | max_h = 0 44 | for author in paper.authors: 45 | if author in all_authors: 46 | max_h = max( 47 | max_h, max([alias["hIndex"] for alias in all_authors[author]]) 48 | ) 49 | if max_h >= float(config["FILTERING"]["hcutoff"]): 50 | paper_list.append(paper) 51 | return paper_list 52 | 53 | 54 | def calc_price(model, usage): 55 | if model == "gpt-4-1106-preview": 56 | return (0.01 * usage.prompt_tokens + 0.03 * usage.completion_tokens) / 1000.0 57 | if model == "gpt-4": 58 | return (0.03 * usage.prompt_tokens + 0.06 * usage.completion_tokens) / 1000.0 59 | if (model == "gpt-3.5-turbo") or (model == "gpt-3.5-turbo-1106"): 60 | return (0.0015 * usage.prompt_tokens + 0.002 * usage.completion_tokens) / 1000.0 61 | 62 | 63 | @retry.retry(tries=3, delay=2) 64 | def call_chatgpt(full_prompt, openai_client, model): 65 | return openai_client.chat.completions.create( 66 | model=model, 67 | messages=[{"role": "user", "content": full_prompt}], 68 | temperature=0.0, 69 | seed=0, 70 | ) 71 | 72 | 73 | def run_and_parse_chatgpt(full_prompt, openai_client, config): 74 | # just runs the chatgpt prompt, tries to parse the resulting JSON 75 | completion = call_chatgpt(full_prompt, openai_client, config["SELECTION"]["model"]) 76 | out_text = completion.choices[0].message.content 77 | out_text = re.sub("```jsonl\n", "", out_text) 78 | out_text = re.sub("```", "", out_text) 79 | out_text = re.sub(r"\n+", "\n", out_text) 80 | out_text = re.sub("},", "}", out_text).strip() 81 | # split out_text line by line and parse each as a json. 
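    # each successfully parsed line should be a dict like
    #   {"ARXIVID": "2311.05584", "COMMENT": "...", "RELEVANCE": 8, "NOVELTY": 7}
    # (the format requested in configs/postfix_prompt.txt); lines that fail to
    # parse as JSON are skipped below (and printed when debug_messages is on)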
82 | json_dicts = [] 83 | for line in out_text.split("\n"): 84 | # try catch block to attempt to parse json 85 | try: 86 | json_dicts.append(json.loads(line)) 87 | except Exception as ex: 88 | if config["OUTPUT"].getboolean("debug_messages"): 89 | print("Exception happened " + str(ex)) 90 | print("Failed to parse LM output as json") 91 | print(out_text) 92 | print("RAW output") 93 | print(completion.choices[0].message.content) 94 | continue 95 | return json_dicts, calc_price(config["SELECTION"]["model"], completion.usage) 96 | 97 | 98 | def paper_to_string(paper_entry: Paper) -> str: 99 | # renders each paper into a string to be processed by GPT 100 | new_str = ( 101 | "ArXiv ID: " 102 | + paper_entry.arxiv_id 103 | + "\n" 104 | + "Title: " 105 | + paper_entry.title 106 | + "\n" 107 | + "Authors: " 108 | + " and ".join(paper_entry.authors) 109 | + "\n" 110 | + "Abstract: " 111 | + paper_entry.abstract[:4000] 112 | ) 113 | return new_str 114 | 115 | 116 | def batched(items, batch_size): 117 | # takes a list and returns a list of list with batch_size 118 | return [items[i : i + batch_size] for i in range(0, len(items), batch_size)] 119 | 120 | 121 | def filter_papers_by_title( 122 | papers, config, openai_client, base_prompt, criterion 123 | ) -> List[Paper]: 124 | filter_postfix = 'Identify any papers that are absolutely and completely irrelavent to the criteria, and you are absolutely sure your friend will not enjoy, formatted as a list of arxiv ids like ["ID1", "ID2", "ID3"..]. Be extremely cautious, and if you are unsure at all, do not add a paper in this list. You will check it in detail later.\n Directly respond with the list, do not add ANY extra text before or after the list. Even if every paper seems irrelevant, please keep at least TWO papers' 125 | batches_of_papers = batched(papers, 20) 126 | final_list = [] 127 | cost = 0 128 | for batch in batches_of_papers: 129 | papers_string = "".join([paper_to_titles(paper) for paper in batch]) 130 | full_prompt = ( 131 | base_prompt + "\n " + criterion + "\n" + papers_string + filter_postfix 132 | ) 133 | model = config["SELECTION"]["model"] 134 | completion = call_chatgpt(full_prompt, openai_client, model) 135 | cost += calc_price(model, completion.usage) 136 | out_text = completion.choices[0].message.content 137 | try: 138 | filtered_set = set(json.loads(out_text)) 139 | for paper in batch: 140 | if paper.arxiv_id not in filtered_set: 141 | final_list.append(paper) 142 | else: 143 | print("Filtered out paper " + paper.arxiv_id) 144 | except Exception as ex: 145 | print("Exception happened " + str(ex)) 146 | print("Failed to parse LM output as list " + out_text) 147 | print(completion) 148 | continue 149 | return final_list, cost 150 | 151 | 152 | def paper_to_titles(paper_entry: Paper) -> str: 153 | return "ArXiv ID: " + paper_entry.arxiv_id + " Title: " + paper_entry.title + "\n" 154 | 155 | 156 | def run_on_batch( 157 | paper_batch, base_prompt, criterion, postfix_prompt, openai_client, config 158 | ): 159 | batch_str = [paper_to_string(paper) for paper in paper_batch] 160 | full_prompt = "\n".join( 161 | [ 162 | base_prompt, 163 | criterion + "\n", 164 | "\n\n".join(batch_str) + "\n", 165 | postfix_prompt, 166 | ] 167 | ) 168 | json_dicts, cost = run_and_parse_chatgpt(full_prompt, openai_client, config) 169 | return json_dicts, cost 170 | 171 | 172 | def filter_by_gpt( 173 | all_authors, papers, config, openai_client, all_papers, selected_papers, sort_dict 174 | ): 175 | # deal with config parsing 176 | with 
open("configs/base_prompt.txt", "r") as f: 177 | base_prompt = f.read() 178 | with open("configs/paper_topics.txt", "r") as f: 179 | criterion = f.read() 180 | with open("configs/postfix_prompt.txt", "r") as f: 181 | postfix_prompt = f.read() 182 | all_cost = 0 183 | if config["SELECTION"].getboolean("run_openai"): 184 | # filter first by hindex of authors to reduce costs. 185 | paper_list = filter_papers_by_hindex(all_authors, papers, config) 186 | if config["OUTPUT"].getboolean("debug_messages"): 187 | print(str(len(paper_list)) + " papers after hindex filtering") 188 | cost = 0 189 | paper_list, cost = filter_papers_by_title( 190 | paper_list, config, openai_client, base_prompt, criterion 191 | ) 192 | if config["OUTPUT"].getboolean("debug_messages"): 193 | print( 194 | str(len(paper_list)) 195 | + " papers after title filtering with cost of $" 196 | + str(cost) 197 | ) 198 | all_cost += cost 199 | 200 | # batch the remaining papers and invoke GPT 201 | batch_of_papers = batched(paper_list, int(config["SELECTION"]["batch_size"])) 202 | scored_batches = [] 203 | for batch in tqdm(batch_of_papers): 204 | scored_in_batch = [] 205 | json_dicts, cost = run_on_batch( 206 | batch, base_prompt, criterion, postfix_prompt, openai_client, config 207 | ) 208 | all_cost += cost 209 | for jdict in json_dicts: 210 | if ( 211 | int(jdict["RELEVANCE"]) 212 | >= int(config["FILTERING"]["relevance_cutoff"]) 213 | and jdict["NOVELTY"] >= int(config["FILTERING"]["novelty_cutoff"]) 214 | and jdict["ARXIVID"] in all_papers 215 | ): 216 | selected_papers[jdict["ARXIVID"]] = { 217 | **dataclasses.asdict(all_papers[jdict["ARXIVID"]]), 218 | **jdict, 219 | } 220 | sort_dict[jdict["ARXIVID"]] = jdict["RELEVANCE"] + jdict["NOVELTY"] 221 | scored_in_batch.append( 222 | { 223 | **dataclasses.asdict(all_papers[jdict["ARXIVID"]]), 224 | **jdict, 225 | } 226 | ) 227 | scored_batches.append(scored_in_batch) 228 | if config["OUTPUT"].getboolean("dump_debug_file"): 229 | with open( 230 | config["OUTPUT"]["output_path"] + "gpt_paper_batches.debug.json", "w" 231 | ) as outfile: 232 | json.dump(scored_batches, outfile, cls=EnhancedJSONEncoder, indent=4) 233 | if config["OUTPUT"].getboolean("debug_messages"): 234 | print("Total cost: $" + str(all_cost)) 235 | 236 | 237 | if __name__ == "__main__": 238 | config = configparser.ConfigParser() 239 | config.read("configs/config.ini") 240 | # now load the api keys 241 | keyconfig = configparser.ConfigParser() 242 | keyconfig.read("configs/keys.ini") 243 | S2_API_KEY = keyconfig["KEYS"]["semanticscholar"] 244 | openai_client = OpenAI(api_key=keyconfig["KEYS"]["openai"]) 245 | # deal with config parsing 246 | with open("configs/base_prompt.txt", "r") as f: 247 | base_prompt = f.read() 248 | with open("configs/paper_topics.txt", "r") as f: 249 | criterion = f.read() 250 | with open("configs/postfix_prompt.txt", "r") as f: 251 | postfix_prompt = f.read() 252 | # loads papers from 'in/debug_papers.json' and filters them 253 | with open("in/debug_papers.json", "r") as f: 254 | # with open("in/gpt_paper_batches.debug-11-10.json", "r") as f: 255 | paper_list_in_dict = json.load(f) 256 | papers = [ 257 | [ 258 | Paper( 259 | arxiv_id=paper["arxiv_id"], 260 | authors=paper["authors"], 261 | title=paper["title"], 262 | abstract=paper["abstract"], 263 | ) 264 | for paper in batch 265 | ] 266 | for batch in paper_list_in_dict 267 | ] 268 | all_papers = {} 269 | paper_outputs = {} 270 | sort_dict = {} 271 | total_cost = 0 272 | for batch in tqdm(papers): 273 | json_dicts, cost = run_on_batch( 274 
| batch, base_prompt, criterion, postfix_prompt, openai_client, config 275 | ) 276 | total_cost += cost 277 | for paper in batch: 278 | all_papers[paper.arxiv_id] = paper 279 | for jdict in json_dicts: 280 | paper_outputs[jdict["ARXIVID"]] = { 281 | **dataclasses.asdict(all_papers[jdict["ARXIVID"]]), 282 | **jdict, 283 | } 284 | sort_dict[jdict["ARXIVID"]] = jdict["RELEVANCE"] + jdict["NOVELTY"] 285 | 286 | # sort the papers by relevance and novelty 287 | print("total cost:" + str(total_cost)) 288 | keys = list(sort_dict.keys()) 289 | values = list(sort_dict.values()) 290 | 291 | def argsort(seq): 292 | return sorted(range(len(seq)), key=seq.__getitem__) 293 | 294 | sorted_keys = [keys[idx] for idx in argsort(values)[::-1]] 295 | selected_papers = {key: paper_outputs[key] for key in sorted_keys} 296 | 297 | with open( 298 | config["OUTPUT"]["output_path"] + "filter_paper_test.debug.json", "w" 299 | ) as outfile: 300 | json.dump(selected_papers, outfile, cls=EnhancedJSONEncoder, indent=4) 301 | -------------------------------------------------------------------------------- /in/debug_papers.json: -------------------------------------------------------------------------------- 1 | [ 2 | [ 3 | { 4 | "authors": [ 5 | "Sloan Nietert", 6 | "Ziv Goldfeld", 7 | "Soroosh Shafiee" 8 | ], 9 | "title": "Outlier-Robust Wasserstein DRO. ", 10 | "abstract": "Distributionally robust optimization (DRO) is an effective approach for data-driven decision-making in the presence of uncertainty. Geometric uncertainty due to sampling or localized perturbations of data points is captured by Wasserstein DRO (WDRO), which seeks to learn a model that performs uniformly well over a Wasserstein ball centered around the observed data distribution. However, WDRO fails to account for non-geometric perturbations such as adversarial outliers, which can greatly distort the Wasserstein distance measurement and impede the learned model. We address this gap by proposing a novel outlier-robust WDRO framework for decision-making under both geometric (Wasserstein) perturbations and non-geometric (total variation (TV)) contamination that allows an $\\varepsilon$-fraction of data to be arbitrarily corrupted. We design an uncertainty set using a certain robust Wasserstein ball that accounts for both perturbation types and derive minimax optimal excess risk bounds for this procedure that explicitly capture the Wasserstein and TV risks. We prove a strong duality result that enables tractable convex reformulations and efficient computation of our outlier-robust WDRO problem. When the loss function depends only on low-dimensional features of the data, we eliminate certain dimension dependencies from the risk bounds that are unavoidable in the general setting. Finally, we present experiments validating our theory on standard regression and classification tasks. ", 11 | "arxiv_id": "2311.05573", 12 | "ARXIVID": "2311.05573", 13 | "COMMENT": "This paper does not match any of the specified criteria.", 14 | "RELEVANCE": 1, 15 | "NOVELTY": 3 16 | }, 17 | { 18 | "authors": [ 19 | "Tong Zhu", 20 | "Junfei Ren", 21 | "Zijian Yu", 22 | "Mengsong Wu", 23 | "Guoliang Zhang", 24 | "Xiaoye Qu", 25 | "Wenliang Chen", 26 | "Zhefeng Wang", 27 | "Baoxing Huai", 28 | "Min Zhang" 29 | ], 30 | "title": "Mirror: A Universal Framework for Various Information Extraction Tasks. ", 31 | "abstract": "Sharing knowledge between information extraction tasks has always been a challenge due to the diverse data formats and task variations. 
Meanwhile, this divergence leads to information waste and increases difficulties in building complex applications in real scenarios. Recent studies often formulate IE tasks as a triplet extraction problem. However, such a paradigm does not support multi-span and n-ary extraction, leading to weak versatility. To this end, we reorganize IE problems into unified multi-slot tuples and propose a universal framework for various IE tasks, namely Mirror. Specifically, we recast existing IE tasks as a multi-span cyclic graph extraction problem and devise a non-autoregressive graph decoding algorithm to extract all spans in a single step. It is worth noting that this graph structure is incredibly versatile, and it supports not only complex IE tasks, but also machine reading comprehension and classification tasks. We manually construct a corpus containing 57 datasets for model pretraining, and conduct experiments on 30 datasets across 8 downstream tasks. The experimental results demonstrate that our model has decent compatibility and outperforms or reaches competitive performance with SOTA systems under few-shot and zero-shot settings. The code, model weights, and pretraining corpus are available at https://github.com/Spico197/Mirror . ", 32 | "arxiv_id": "2311.05419", 33 | "ARXIVID": "2311.05419", 34 | "COMMENT": "This paper does not match any of the specified criteria as it focuses on information extraction tasks and not on the statistical machine learning or generative modeling in NLP.", 35 | "RELEVANCE": 1, 36 | "NOVELTY": 3 37 | }, 38 | { 39 | "authors": [ 40 | "Luca Beurer-Kellner", 41 | "Mark Niklas M\u00fcller", 42 | "Marc Fischer", 43 | "Martin Vechev" 44 | ], 45 | "title": "Prompt Sketching for Large Language Models. ", 46 | "abstract": "Many recent prompting strategies for large language models (LLMs) query the model multiple times sequentially -- first to produce intermediate results and then the final answer. However, using these methods, both decoder and model are unaware of potential follow-up prompts, leading to disconnected and undesirably wordy intermediate responses. In this work, we address this issue by proposing prompt sketching, a new prompting paradigm in which an LLM does not only respond by completing a prompt, but by predicting values for multiple variables in a template. This way, sketching grants users more control over the generation process, e.g., by providing a reasoning framework via intermediate instructions, leading to better overall results. The key idea enabling sketching with existing, autoregressive models is to adapt the decoding procedure to also score follow-up instructions during text generation, thus optimizing overall template likelihood in inference. Our experiments show that in a zero-shot setting, prompt sketching outperforms existing, sequential prompting schemes such as direct asking or chain-of-thought on 7 out of 8 LLM benchmarking tasks, including state tracking, arithmetic reasoning, and general question answering. To facilitate future use, we release a number of generic, yet effective sketches applicable to many tasks, and an open source library called dclib, powering our sketch-aware decoders. 
", 47 | "arxiv_id": "2311.04954", 48 | "ARXIVID": "2311.04954", 49 | "COMMENT": "Criterion 2: The paper proposes 'prompt sketching', a new prompting paradigm for large language models, which could be considered a methodological improvement to instruction-following.", 50 | "RELEVANCE": 8, 51 | "NOVELTY": 7 52 | }, 53 | { 54 | "authors": [ 55 | "ShengYun Peng", 56 | "Seongmin Lee", 57 | "Xiaojing Wang", 58 | "Rajarajeswari Balasubramaniyan", 59 | "Duen Horng Chau" 60 | ], 61 | "title": "High-Performance Transformers for Table Structure Recognition Need Early Convolutions. ", 62 | "abstract": "Table structure recognition (TSR) aims to convert tabular images into a machine-readable format, where a visual encoder extracts image features and a textual decoder generates table-representing tokens. Existing approaches use classic convolutional neural network (CNN) backbones for the visual encoder and transformers for the textual decoder. However, this hybrid CNN-Transformer architecture introduces a complex visual encoder that accounts for nearly half of the total model parameters, markedly reduces both training and inference speed, and hinders the potential for self-supervised learning in TSR. In this work, we design a lightweight visual encoder for TSR without sacrificing expressive power. We discover that a convolutional stem can match classic CNN backbone performance, with a much simpler model. The convolutional stem strikes an optimal balance between two crucial factors for high-performance TSR: a higher receptive field (RF) ratio and a longer sequence length. This allows it to \"see\" an appropriate portion of the table and \"store\" the complex table structure within sufficient context length for the subsequent transformer. We conducted reproducible ablation studies and open-sourced our code at https://github.com/poloclub/tsr-convstem to enhance transparency, inspire innovations, and facilitate fair comparisons in our domain as tables are a promising modality for representation learning. ", 63 | "arxiv_id": "2311.05565", 64 | "ARXIVID": "2311.05565", 65 | "COMMENT": "This paper does not match any of the specified criteria as it focuses on table structure recognition using transformers and convolutions, which is not related to the statistical machine learning or generative modeling in NLP.", 66 | "RELEVANCE": 1, 67 | "NOVELTY": 3 68 | } 69 | ], 70 | [ 71 | { 72 | "authors": [ 73 | "James Boyko", 74 | "Joseph Cohen", 75 | "Nathan Fox", 76 | "Maria Han Veiga", 77 | "Jennifer I-Hsiu Li", 78 | "Jing Liu", 79 | "Bernardo Modenesi", 80 | "Andreas H. Rauch", 81 | "Kenneth N. Reid", 82 | "Soumi Tribedi", 83 | "Anastasia Visheratina", 84 | "Xin Xie" 85 | ], 86 | "title": "An Interdisciplinary Outlook on Large Language Models for Scientific Research. ", 87 | "abstract": "In this paper, we describe the capabilities and constraints of Large Language Models (LLMs) within disparate academic disciplines, aiming to delineate their strengths and limitations with precision. We examine how LLMs augment scientific inquiry, offering concrete examples such as accelerating literature review by summarizing vast numbers of publications, enhancing code development through automated syntax correction, and refining the scientific writing process. Simultaneously, we articulate the challenges LLMs face, including their reliance on extensive and sometimes biased datasets, and the potential ethical dilemmas stemming from their use. 
Our critical discussion extends to the varying impacts of LLMs across fields, from the natural sciences, where they help model complex biological sequences, to the social sciences, where they can parse large-scale qualitative data. We conclude by offering a nuanced perspective on how LLMs can be both a boon and a boundary to scientific progress. ", 88 | "arxiv_id": "2311.04929", 89 | "ARXIVID": "2311.04929", 90 | "COMMENT": "This paper does not match any of the specified criteria as it provides a general discussion on the capabilities and constraints of LLMs across various academic disciplines.", 91 | "RELEVANCE": 1, 92 | "NOVELTY": 2 93 | }, 94 | { 95 | "authors": [ 96 | "Joey Hong", 97 | "Sergey Levine", 98 | "Anca Dragan" 99 | ], 100 | "title": "Zero-Shot Goal-Directed Dialogue via RL on Imagined Conversations. ", 101 | "abstract": "Large language models (LLMs) have emerged as powerful and general solutions to many natural language tasks. However, many of the most important applications of language generation are interactive, where an agent has to talk to a person to reach a desired outcome. For example, a teacher might try to understand their student's current comprehension level to tailor their instruction accordingly, and a travel agent might ask questions of their customer to understand their preferences in order to recommend activities they might enjoy. LLMs trained with supervised fine-tuning or \"single-step\" RL, as with standard RLHF, might struggle which tasks that require such goal-directed behavior, since they are not trained to optimize for overall conversational outcomes after multiple turns of interaction. In this work, we explore a new method for adapting LLMs with RL for such goal-directed dialogue. Our key insight is that, though LLMs might not effectively solve goal-directed dialogue tasks out of the box, they can provide useful data for solving such tasks by simulating suboptimal but human-like behaviors. Given a textual description of a goal-directed dialogue task, we leverage LLMs to sample diverse synthetic rollouts of hypothetical in-domain human-human interactions. Our algorithm then utilizes this dataset with offline reinforcement learning to train an interactive conversational agent that can optimize goal-directed objectives over multiple turns. In effect, the LLM produces examples of possible interactions, and RL then processes these examples to learn to perform more optimal interactions. Empirically, we show that our proposed approach achieves state-of-the-art performance in various goal-directed dialogue tasks that include teaching and preference elicitation. ", 102 | "arxiv_id": "2311.05584", 103 | "ARXIVID": "2311.05584", 104 | "COMMENT": "Matches criterion 2 as it explores a new method for adapting LLMs with RL for goal-directed dialogue, which is a methodological improvement to RLHF and instruction-following.", 105 | "RELEVANCE": 9, 106 | "NOVELTY": 7 107 | }, 108 | { 109 | "authors": [ 110 | "Erik Schultheis", 111 | "Marek Wydmuch", 112 | "Wojciech Kot\u0142owski", 113 | "Rohit Babbar", 114 | "Krzysztof Dembczy\u0144ski" 115 | ], 116 | "title": "Generalized test utilities for long-tail performance in extreme multi-label classification. ", 117 | "abstract": "Extreme multi-label classification (XMLC) is the task of selecting a small subset of relevant labels from a very large set of possible labels. As such, it is characterized by long-tail labels, i.e., most labels have very few positive instances. 
With standard performance measures such as precision@k, a classifier can ignore tail labels and still report good performance. However, it is often argued that correct predictions in the tail are more interesting or rewarding, but the community has not yet settled on a metric capturing this intuitive concept. The existing propensity-scored metrics fall short on this goal by confounding the problems of long-tail and missing labels. In this paper, we analyze generalized metrics budgeted \"at k\" as an alternative solution. To tackle the challenging problem of optimizing these metrics, we formulate it in the expected test utility (ETU) framework, which aims at optimizing the expected performance on a fixed test set. We derive optimal prediction rules and construct computationally efficient approximations with provable regret guarantees and robustness against model misspecification. Our algorithm, based on block coordinate ascent, scales effortlessly to XMLC problems and obtains promising results in terms of long-tail performance. ", 118 | "arxiv_id": "2311.05081", 119 | "ARXIVID": "2311.05081", 120 | "COMMENT": "This paper does not match any of the specified criteria as it focuses on extreme multi-label classification and does not directly relate to language models or the specific topics of interest.", 121 | "RELEVANCE": 1, 122 | "NOVELTY": 3 123 | }, 124 | { 125 | "authors": [ 126 | "Licheng Wen", 127 | "Xuemeng Yang", 128 | "Daocheng Fu", 129 | "Xiaofeng Wang", 130 | "Pinlong Cai", 131 | "Xin Li", 132 | "Tao Ma", 133 | "Yingxuan Li", 134 | "Linran Xu", 135 | "Dengke Shang", 136 | "Zheng Zhu", 137 | "Shaoyan Sun", 138 | "Yeqi Bai", 139 | "Xinyu Cai", 140 | "Min Dou", 141 | "Shuanglu Hu", 142 | "Botian Shi" 143 | ], 144 | "title": "On the Road with GPT-4V(ision): Early Explorations of Visual-Language Model on Autonomous Driving. ", 145 | "abstract": "The pursuit of autonomous driving technology hinges on the sophisticated integration of perception, decision-making, and control systems. Traditional approaches, both data-driven and rule-based, have been hindered by their inability to grasp the nuance of complex driving environments and the intentions of other road users. This has been a significant bottleneck, particularly in the development of common sense reasoning and nuanced scene understanding necessary for safe and reliable autonomous driving. The advent of Visual Language Models (VLM) represents a novel frontier in realizing fully autonomous vehicle driving. This report provides an exhaustive evaluation of the latest state-of-the-art VLM, \\modelnamefull, and its application in autonomous driving scenarios. We explore the model's abilities to understand and reason about driving scenes, make decisions, and ultimately act in the capacity of a driver. Our comprehensive tests span from basic scene recognition to complex causal reasoning and real-time decision-making under varying conditions. Our findings reveal that \\modelname demonstrates superior performance in scene understanding and causal reasoning compared to existing autonomous systems. It showcases the potential to handle out-of-distribution scenarios, recognize intentions, and make informed decisions in real driving contexts. However, challenges remain, particularly in direction discernment, traffic light recognition, vision grounding, and spatial reasoning tasks. These limitations underscore the need for further research and development. 
Project is now available on GitHub for interested parties to access and utilize: \\url{https://github.com/PJLab-ADG/GPT4V-AD-Exploration} ", 146 | "arxiv_id": "2311.05332", 147 | "ARXIVID": "2311.05332", 148 | "COMMENT": "This paper does not match any of the specified criteria as it focuses on the application of visual-language models in autonomous driving, which is outside the scope of the specified interests.", 149 | "RELEVANCE": 1, 150 | "NOVELTY": 4 151 | }, 152 | { 153 | "authors": [ 154 | "Daniel Galvez", 155 | "Tim Kaldewey" 156 | ], 157 | "title": "GPU-Accelerated WFST Beam Search Decoder for CTC-based Speech Recognition. ", 158 | "abstract": "While Connectionist Temporal Classification (CTC) models deliver state-of-the-art accuracy in automated speech recognition (ASR) pipelines, their performance has been limited by CPU-based beam search decoding. We introduce a GPU-accelerated Weighted Finite State Transducer (WFST) beam search decoder compatible with current CTC models. It increases pipeline throughput and decreases latency, supports streaming inference, and also supports advanced features like utterance-specific word boosting via on-the-fly composition. We provide pre-built DLPack-based python bindings for ease of use with Python-based machine learning frameworks at https://github.com/nvidia-riva/riva-asrlib-decoder. We evaluated our decoder for offline and online scenarios, demonstrating that it is the fastest beam search decoder for CTC models. In the offline scenario it achieves up to 7 times more throughput than the current state-of-the-art CPU decoder and in the online streaming scenario, it achieves nearly 8 times lower latency, with same or better word error rate. ", 159 | "arxiv_id": "2311.04996", 160 | "ARXIVID": "2311.04996", 161 | "COMMENT": "This paper does not match any of the specified criteria as it focuses on GPU-accelerated beam search decoding for CTC-based speech recognition, which is not related to the specified topics of interest in language models.", 162 | "RELEVANCE": 1, 163 | "NOVELTY": 3 164 | } 165 | ], 166 | [ 167 | { 168 | "authors": [ 169 | "Hongjian Zhou", 170 | "Boyang Gu", 171 | "Xinyu Zou", 172 | "Yiru Li", 173 | "Sam S. Chen", 174 | "Peilin Zhou", 175 | "Junling Liu", 176 | "Yining Hua", 177 | "Chengfeng Mao", 178 | "Xian Wu", 179 | "Zheng Li", 180 | "Fenglin Liu" 181 | ], 182 | "title": "A Survey of Large Language Models in Medicine: Progress, Application, and Challenge. ", 183 | "abstract": "Large language models (LLMs), such as ChatGPT, have achieved substantial attention due to their impressive human language understanding and generation capabilities. Therefore, the application of LLMs in medicine to assist physicians and patient care emerges as a promising research direction in both artificial intelligence and clinical medicine. To this end, this survey provides a comprehensive overview of the current progress, applications, and challenges faced by LLMs in medicine. Specifically, we aim to address the following questions: 1) What are LLMs and how can medical LLMs be built? 2) What are the downstream performances of medical LLMs? 3) How can medical LLMs be utilized in real-world clinical practice? 4) What challenges arise from the use of medical LLMs? 5) How can we better construct and utilize medical LLMs? As a result, this survey aims to provide insights into the opportunities and challenges of LLMs in medicine and serve as a valuable resource for constructing practical and effective medical LLMs. 
A regularly updated list of practical guide resources of medical LLMs can be found at https://github.com/AI-in-Health/MedLLMsPracticalGuide. ", 184 | "arxiv_id": "2311.05112", 185 | "ARXIVID": "2311.05112", 186 | "COMMENT": "This paper does not match any of the specified criteria as it is a survey on the application of LLMs in medicine.", 187 | "RELEVANCE": 1, 188 | "NOVELTY": 2 189 | }, 190 | { 191 | "authors": [ 192 | "Jianzhong Wu", 193 | "Mengyang Gu" 194 | ], 195 | "title": "Perfecting Liquid-State Theories with Machine Intelligence. ", 196 | "abstract": "Recent years have seen a significant increase in the use of machine intelligence for predicting electronic structure, molecular force fields, and the physicochemical properties of various condensed systems. However, substantial challenges remain in developing a comprehensive framework capable of handling a wide range of atomic compositions and thermodynamic conditions. This perspective discusses potential future developments in liquid-state theories leveraging on recent advancements of functional machine learning. By harnessing the strengths of theoretical analysis and machine learning techniques including surrogate models, dimension reduction and uncertainty quantification, we envision that liquid-state theories will gain significant improvements in accuracy, scalability and computational efficiency, enabling their broader applications across diverse materials and chemical systems. ", 197 | "arxiv_id": "2311.05167", 198 | "ARXIVID": "2311.05167", 199 | "COMMENT": "This paper does not match any of the specified criteria as it discusses liquid-state theories and machine intelligence, not language models.", 200 | "RELEVANCE": 1, 201 | "NOVELTY": 3 202 | }, 203 | { 204 | "authors": [ 205 | "Yanzhao Zhang", 206 | "Dingkun Long", 207 | "Zehan Li", 208 | "Pengjun Xie" 209 | ], 210 | "title": "Text Representation Distillation via Information Bottleneck Principle. ", 211 | "abstract": "Pre-trained language models (PLMs) have recently shown great success in text representation field. However, the high computational cost and high-dimensional representation of PLMs pose significant challenges for practical applications. To make models more accessible, an effective method is to distill large models into smaller representation models. In order to relieve the issue of performance degradation after distillation, we propose a novel Knowledge Distillation method called IBKD. This approach is motivated by the Information Bottleneck principle and aims to maximize the mutual information between the final representation of the teacher and student model, while simultaneously reducing the mutual information between the student model's representation and the input data. This enables the student model to preserve important learned information while avoiding unnecessary information, thus reducing the risk of over-fitting. Empirical studies on two main downstream applications of text representation (Semantic Textual Similarity and Dense Retrieval tasks) demonstrate the effectiveness of our proposed approach. ", 212 | "arxiv_id": "2311.05472", 213 | "ARXIVID": "2311.05472", 214 | "COMMENT": "This paper does not match any of the specified criteria as it focuses on knowledge distillation for text representation, not on the specified topics of interest.", 215 | "RELEVANCE": 1, 216 | "NOVELTY": 4 217 | }, 218 | { 219 | "authors": [ 220 | "Dipak Dulal", 221 | "Joseph J. 
Charney", 222 | "Michael Gallagher", 223 | "Carmeliza Navasca", 224 | "Nicholas Skowronski" 225 | ], 226 | "title": "Exploring and Analyzing Wildland Fire Data Via Machine Learning Techniques. ", 227 | "abstract": "This research project investigated the correlation between a 10 Hz time series of thermocouple temperatures and turbulent kinetic energy (TKE) computed from wind speeds collected from a small experimental prescribed burn at the Silas Little Experimental Forest in New Jersey, USA. The primary objective of this project was to explore the potential for using thermocouple temperatures as predictors for estimating the TKE produced by a wildland fire. Machine learning models, including Deep Neural Networks, Random Forest Regressor, Gradient Boosting, and Gaussian Process Regressor, are employed to assess the potential for thermocouple temperature perturbations to predict TKE values. Data visualization and correlation analyses reveal patterns and relationships between thermocouple temperatures and TKE, providing insight into the underlying dynamics. The project achieves high accuracy in predicting TKE by employing various machine learning models despite a weak correlation between the predictors and the target variable. The results demonstrate significant success, particularly from regression models, in accurately estimating the TKE. The research findings contribute to fire behavior and smoke modeling science, emphasizing the importance of incorporating machine learning approaches and identifying complex relationships between fine-scale fire behavior and turbulence. Accurate TKE estimation using thermocouple temperatures allows for the refinement of models that can inform decision-making in fire management strategies, facilitate effective risk mitigation, and optimize fire management efforts. This project highlights the valuable role of machine learning techniques in analyzing wildland fire data, showcasing their potential to advance fire research and management practices. ", 228 | "arxiv_id": "2311.05128", 229 | "ARXIVID": "2311.05128", 230 | "COMMENT": "This paper does not match any of the specified criteria as it is about wildland fire data analysis using machine learning, not language models.", 231 | "RELEVANCE": 1, 232 | "NOVELTY": 3 233 | }, 234 | { 235 | "authors": [ 236 | "Simian Luo", 237 | "Yiqin Tan", 238 | "Suraj Patil", 239 | "Daniel Gu", 240 | "Patrick von Platen", 241 | "Apolin\u00e1rio Passos", 242 | "Longbo Huang", 243 | "Jian Li", 244 | "Hang Zhao" 245 | ], 246 | "title": "LCM-LoRA: A Universal Stable-Diffusion Acceleration Module. ", 247 | "abstract": "Latent Consistency Models (LCMs) have achieved impressive performance in accelerating text-to-image generative tasks, producing high-quality images with minimal inference steps. LCMs are distilled from pre-trained latent diffusion models (LDMs), requiring only ~32 A100 GPU training hours. This report further extends LCMs' potential in two aspects: First, by applying LoRA distillation to Stable-Diffusion models including SD-V1.5, SSD-1B, and SDXL, we have expanded LCM's scope to larger models with significantly less memory consumption, achieving superior image generation quality. Second, we identify the LoRA parameters obtained through LCM distillation as a universal Stable-Diffusion acceleration module, named LCM-LoRA. LCM-LoRA can be directly plugged into various Stable-Diffusion fine-tuned models or LoRAs without training, thus representing a universally applicable accelerator for diverse image generation tasks. 
Compared with previous numerical PF-ODE solvers such as DDIM, DPM-Solver, LCM-LoRA can be viewed as a plug-in neural PF-ODE solver that possesses strong generalization abilities. Project page: https://github.com/luosiallen/latent-consistency-model. ", 248 | "arxiv_id": "2311.05556", 249 | "ARXIVID": "2311.05556", 250 | "COMMENT": "Criterion 4 is closely matched as the paper discusses LCM-LoRA, an acceleration module for diffusion language models, which could represent a significant advance in the performance of such models.", 251 | "RELEVANCE": 9, 252 | "NOVELTY": 7 253 | } 254 | ], 255 | [ 256 | { 257 | "authors": [ 258 | "Akshit Jindal", 259 | "Vikram Goyal", 260 | "Saket Anand", 261 | "Chetan Arora" 262 | ], 263 | "title": "Army of Thieves: Enhancing Black-Box Model Extraction via Ensemble based sample selection. ", 264 | "abstract": "Machine Learning (ML) models become vulnerable to Model Stealing Attacks (MSA) when they are deployed as a service. In such attacks, the deployed model is queried repeatedly to build a labelled dataset. This dataset allows the attacker to train a thief model that mimics the original model. To maximize query efficiency, the attacker has to select the most informative subset of data points from the pool of available data. Existing attack strategies utilize approaches like Active Learning and Semi-Supervised learning to minimize costs. However, in the black-box setting, these approaches may select sub-optimal samples as they train only one thief model. Depending on the thief model's capacity and the data it was pretrained on, the model might even select noisy samples that harm the learning process. In this work, we explore the usage of an ensemble of deep learning models as our thief model. We call our attack Army of Thieves(AOT) as we train multiple models with varying complexities to leverage the crowd's wisdom. Based on the ensemble's collective decision, uncertain samples are selected for querying, while the most confident samples are directly included in the training data. Our approach is the first one to utilize an ensemble of thief models to perform model extraction. We outperform the base approaches of existing state-of-the-art methods by at least 3% and achieve a 21% higher adversarial sample transferability than previous work for models trained on the CIFAR-10 dataset. ", 265 | "arxiv_id": "2311.04588", 266 | "ARXIVID": "2311.04588", 267 | "COMMENT": "This paper does not match any of the specified criteria.", 268 | "RELEVANCE": 1, 269 | "NOVELTY": 3 270 | }, 271 | { 272 | "authors": [ 273 | "Wujiang Xu", 274 | "Xuying Ning", 275 | "Wenfang Lin", 276 | "Mingming Ha", 277 | "Qiongxu Ma", 278 | "Linxun Chen", 279 | "Bing Han", 280 | "Minnan Luo" 281 | ], 282 | "title": "Towards Open-world Cross-Domain Sequential Recommendation: A Model-Agnostic Contrastive Denoising Approach. ", 283 | "abstract": "Cross-domain sequential recommendation (CDSR) aims to address the data sparsity problems that exist in traditional sequential recommendation (SR) systems. The existing approaches aim to design a specific cross-domain unit that can transfer and propagate information across multiple domains by relying on overlapping users with abundant behaviors. However, in real-world recommender systems, CDSR scenarios usually consist of a majority of long-tailed users with sparse behaviors and cold-start users who only exist in one domain. This leads to a drop in the performance of existing CDSR methods in the real-world industry platform. 
Therefore, improving the consistency and effectiveness of models in open-world CDSR scenarios is crucial for constructing CDSR models (\\textit{1st} CH). Recently, some SR approaches have utilized auxiliary behaviors to complement the information for long-tailed users. However, these multi-behavior SR methods cannot deliver promising performance in CDSR, as they overlook the semantic gap between target and auxiliary behaviors, as well as user interest deviation across domains (\\textit{2nd} CH). ", 284 | "arxiv_id": "2311.04760", 285 | "ARXIVID": "2311.04760", 286 | "COMMENT": "This paper does not match any of the specified criteria.", 287 | "RELEVANCE": 1, 288 | "NOVELTY": 3 289 | }, 290 | { 291 | "authors": [ 292 | "Lea M. Trenkwalder", 293 | "Eleanor Scerri", 294 | "Thomas E. O'Brien", 295 | "Vedran Dunjko" 296 | ], 297 | "title": "Compilation of product-formula Hamiltonian simulation via reinforcement learning. ", 298 | "abstract": "Hamiltonian simulation is believed to be one of the first tasks where quantum computers can yield a quantum advantage. One of the most popular methods of Hamiltonian simulation is Trotterization, which makes use of the approximation $e^{i\\sum_jA_j}\\sim \\prod_je^{iA_j}$ and higher-order corrections thereto. However, this leaves open the question of the order of operations (i.e. the order of the product over $j$, which is known to affect the quality of approximation). In some cases this order is fixed by the desire to minimise the error of approximation; when it is not the case, we propose that the order can be chosen to optimize compilation to a native quantum architecture. This presents a new compilation problem -- order-agnostic quantum circuit compilation -- which we prove is NP-hard in the worst case. In lieu of an easily-computable exact solution, we turn to methods of heuristic optimization of compilation. We focus on reinforcement learning due to the sequential nature of the compilation task, comparing it to simulated annealing and Monte Carlo tree search. While two of the methods outperform a naive heuristic, reinforcement learning clearly outperforms all others, with a gain of around 12% with respect to the second-best method and of around 50% compared to the naive heuristic in terms of the gate count. We further test the ability of RL to generalize across instances of the compilation problem, and find that a single learner is able to solve entire problem families. This demonstrates the ability of machine learning techniques to provide assistance in an order-agnostic quantum compilation task. ", 299 | "arxiv_id": "2311.04285", 300 | "ARXIVID": "2311.04285", 301 | "COMMENT": "This paper does not match any of the specified criteria.", 302 | "RELEVANCE": 1, 303 | "NOVELTY": 4 304 | }, 305 | { 306 | "authors": [ 307 | "Koyena Pal", 308 | "Jiuding Sun", 309 | "Andrew Yuan", 310 | "Byron C. Wallace", 311 | "David Bau" 312 | ], 313 | "title": "Future Lens: Anticipating Subsequent Tokens from a Single Hidden State. ", 314 | "abstract": "We conjecture that hidden state vectors corresponding to individual input tokens encode information sufficient to accurately predict several tokens ahead. More concretely, in this paper we ask: Given a hidden (internal) representation of a single token at position $t$ in an input, can we reliably anticipate the tokens that will appear at positions $\\geq t + 2$? 
To test this, we measure linear approximation and causal intervention methods in GPT-J-6B to evaluate the degree to which individual hidden states in the network contain signal rich enough to predict future hidden states and, ultimately, token outputs. We find that, at some layers, we can approximate a model's output with more than 48% accuracy with respect to its prediction of subsequent tokens through a single hidden state. Finally we present a \"Future Lens\" visualization that uses these methods to create a new view of transformer states. ", 315 | "arxiv_id": "2311.04897", 316 | "ARXIVID": "2311.04897", 317 | "COMMENT": "This paper does not match any of the specified criteria.", 318 | "RELEVANCE": 1, 319 | "NOVELTY": 5 320 | }, 321 | { 322 | "authors": [ 323 | "Ercong Yu", 324 | "Jinle Zhu", 325 | "Qiang Li", 326 | "Zilong Liu", 327 | "Hongyang Chen", 328 | "Shlomo Shamai (Shitz)", 329 | "H. Vincent Poor" 330 | ], 331 | "title": "Deep Learning Assisted Multiuser MIMO Load Modulated Systems for Enhanced Downlink mmWave Communications. ", 332 | "abstract": "This paper is focused on multiuser load modulation arrays (MU-LMAs) which are attractive due to their low system complexity and reduced cost for millimeter wave (mmWave) multi-input multi-output (MIMO) systems. The existing precoding algorithm for downlink MU-LMA relies on a sub-array structured (SAS) transmitter which may suffer from decreased degrees of freedom and complex system configuration. Furthermore, a conventional LMA codebook with codewords uniformly distributed on a hypersphere may not be channel-adaptive and may lead to increased signal detection complexity. In this paper, we conceive an MU-LMA system employing a full-array structured (FAS) transmitter and propose two algorithms accordingly. The proposed FAS-based system addresses the SAS structural problems and can support larger numbers of users. For LMA-imposed constant-power downlink precoding, we propose an FAS-based normalized block diagonalization (FAS-NBD) algorithm. However, the forced normalization may result in performance degradation. This degradation, together with the aforementioned codebook design problems, is difficult to solve analytically. This motivates us to propose a Deep Learning-enhanced (FAS-DL-NBD) algorithm for adaptive codebook design and codebook-independent decoding. It is shown that the proposed algorithms are robust to imperfect knowledge of channel state information and yield excellent error performance. Moreover, the FAS-DL-NBD algorithm enables signal detection with low complexity as the number of bits per codeword increases. 
", 333 | "arxiv_id": "2311.04537", 334 | "ARXIVID": "2311.04537", 335 | "COMMENT": "This paper does not match any of the specified criteria.", 336 | "RELEVANCE": 1, 337 | "NOVELTY": 3 338 | } 339 | ] 340 | ] -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import json 2 | import configparser 3 | import os 4 | import time 5 | 6 | from openai import OpenAI 7 | from requests import Session 8 | from typing import TypeVar, Generator 9 | import io 10 | 11 | from retry import retry 12 | from tqdm import tqdm 13 | 14 | from arxiv_scraper import get_papers_from_arxiv_rss_api 15 | from filter_papers import filter_by_author, filter_by_gpt 16 | from parse_json_to_md import render_md_string 17 | from push_to_slack import push_to_slack 18 | from arxiv_scraper import EnhancedJSONEncoder 19 | 20 | T = TypeVar("T") 21 | 22 | 23 | def batched(items: list[T], batch_size: int) -> list[T]: 24 | # takes a list and returns a list of list with batch_size 25 | return [items[i : i + batch_size] for i in range(0, len(items), batch_size)] 26 | 27 | 28 | def argsort(seq): 29 | # native python version of an 'argsort' 30 | # http://stackoverflow.com/questions/3071415/efficient-method-to-calculate-the-rank-vector-of-a-list-in-python 31 | return sorted(range(len(seq)), key=seq.__getitem__) 32 | 33 | 34 | def get_paper_batch( 35 | session: Session, 36 | ids: list[str], 37 | S2_API_KEY: str, 38 | fields: str = "paperId,title", 39 | **kwargs, 40 | ) -> list[dict]: 41 | # gets a batch of papers. taken from the sem scholar example. 42 | params = { 43 | "fields": fields, 44 | **kwargs, 45 | } 46 | if S2_API_KEY is None: 47 | headers = {} 48 | else: 49 | headers = { 50 | "X-API-KEY": S2_API_KEY, 51 | } 52 | body = { 53 | "ids": ids, 54 | } 55 | 56 | # https://api.semanticscholar.org/api-docs/graph#tag/Paper-Data/operation/post_graph_get_papers 57 | with session.post( 58 | "https://api.semanticscholar.org/graph/v1/paper/batch", 59 | params=params, 60 | headers=headers, 61 | json=body, 62 | ) as response: 63 | response.raise_for_status() 64 | return response.json() 65 | 66 | 67 | def get_author_batch( 68 | session: Session, 69 | ids: list[str], 70 | S2_API_KEY: str, 71 | fields: str = "name,hIndex,citationCount", 72 | **kwargs, 73 | ) -> list[dict]: 74 | # gets a batch of authors. 
analogous to author batch 75 | params = { 76 | "fields": fields, 77 | **kwargs, 78 | } 79 | if S2_API_KEY is None: 80 | headers = {} 81 | else: 82 | headers = { 83 | "X-API-KEY": S2_API_KEY, 84 | } 85 | body = { 86 | "ids": ids, 87 | } 88 | 89 | with session.post( 90 | "https://api.semanticscholar.org/graph/v1/author/batch", 91 | params=params, 92 | headers=headers, 93 | json=body, 94 | ) as response: 95 | response.raise_for_status() 96 | return response.json() 97 | 98 | 99 | @retry(tries=3, delay=2.0) 100 | def get_one_author(session, author: str, S2_API_KEY: str) -> str: 101 | # query the right endpoint https://api.semanticscholar.org/graph/v1/author/search?query=adam+smith 102 | params = {"query": author, "fields": "authorId,name,hIndex", "limit": "10"} 103 | if S2_API_KEY is None: 104 | headers = {} 105 | else: 106 | headers = { 107 | "X-API-KEY": S2_API_KEY, 108 | } 109 | with session.get( 110 | "https://api.semanticscholar.org/graph/v1/author/search", 111 | params=params, 112 | headers=headers, 113 | ) as response: 114 | # try catch for errors 115 | try: 116 | response.raise_for_status() 117 | response_json = response.json() 118 | if len(response_json["data"]) >= 1: 119 | return response_json["data"] 120 | else: 121 | return None 122 | except Exception as ex: 123 | print("exception happened" + str(ex)) 124 | return None 125 | 126 | 127 | def get_papers( 128 | ids: list[str], S2_API_KEY: str, batch_size: int = 100, **kwargs 129 | ) -> Generator[dict, None, None]: 130 | # gets all papers, doing batching to avoid hitting the max paper limit. 131 | # use a session to reuse the same TCP connection 132 | with Session() as session: 133 | # take advantage of S2 batch paper endpoint 134 | for ids_batch in batched(ids, batch_size=batch_size): 135 | yield from get_paper_batch(session, ids_batch, S2_API_KEY, **kwargs) 136 | 137 | 138 | def get_authors( 139 | all_authors: list[str], S2_API_KEY: str, batch_size: int = 100, **kwargs 140 | ): 141 | # first get the list of all author ids by querying by author names 142 | author_metadata_dict = {} 143 | with Session() as session: 144 | for author in tqdm(all_authors): 145 | auth_map = get_one_author(session, author, S2_API_KEY) 146 | if auth_map is not None: 147 | author_metadata_dict[author] = auth_map 148 | # add a 20ms wait time to avoid rate limiting 149 | # otherwise, semantic scholar aggressively rate limits, so do 1s 150 | if S2_API_KEY is not None: 151 | time.sleep(0.02) 152 | else: 153 | time.sleep(1.0) 154 | return author_metadata_dict 155 | 156 | 157 | def get_papers_from_arxiv(config): 158 | area_list = config["FILTERING"]["arxiv_category"].split(",") 159 | paper_set = set() 160 | for area in area_list: 161 | papers = get_papers_from_arxiv_rss_api(area.strip(), config) 162 | paper_set.update(set(papers)) 163 | if config["OUTPUT"].getboolean("debug_messages"): 164 | print("Number of papers:" + str(len(paper_set))) 165 | return paper_set 166 | 167 | 168 | def parse_authors(lines): 169 | # parse the comma-separated author list, ignoring lines that are empty and starting with # 170 | author_ids = [] 171 | authors = [] 172 | for line in lines: 173 | if line.startswith("#"): 174 | continue 175 | if not line.strip(): 176 | continue 177 | author_split = line.split(",") 178 | author_ids.append(author_split[1].strip()) 179 | authors.append(author_split[0].strip()) 180 | return authors, author_ids 181 | 182 | 183 | if __name__ == "__main__": 184 | # now load config.ini 185 | config = configparser.ConfigParser() 186 | config.read("configs/config.ini") 
187 | 188 | S2_API_KEY = os.environ.get("S2_KEY") 189 | OAI_KEY = os.environ.get("OAI_KEY") 190 | if OAI_KEY is None: 191 | raise ValueError( 192 | "OpenAI key is not set - please set OAI_KEY to your OpenAI key" 193 | ) 194 | openai_client = OpenAI(api_key=OAI_KEY) 195 | # load the author list 196 | with io.open("configs/authors.txt", "r") as fopen: 197 | author_names, author_ids = parse_authors(fopen.readlines()) 198 | author_id_set = set(author_ids) 199 | 200 | papers = list(get_papers_from_arxiv(config)) 201 | # dump all papers for debugging 202 | 203 | all_authors = set() 204 | for paper in papers: 205 | all_authors.update(set(paper.authors)) 206 | if config["OUTPUT"].getboolean("debug_messages"): 207 | print("Getting author info for " + str(len(all_authors)) + " authors") 208 | all_authors = get_authors(list(all_authors), S2_API_KEY) 209 | 210 | if config["OUTPUT"].getboolean("dump_debug_file"): 211 | with open( 212 | config["OUTPUT"]["output_path"] + "papers.debug.json", "w" 213 | ) as outfile: 214 | json.dump(papers, outfile, cls=EnhancedJSONEncoder, indent=4) 215 | with open( 216 | config["OUTPUT"]["output_path"] + "all_authors.debug.json", "w" 217 | ) as outfile: 218 | json.dump(all_authors, outfile, cls=EnhancedJSONEncoder, indent=4) 219 | with open( 220 | config["OUTPUT"]["output_path"] + "author_id_set.debug.json", "w" 221 | ) as outfile: 222 | json.dump(list(author_id_set), outfile, cls=EnhancedJSONEncoder, indent=4) 223 | 224 | selected_papers, all_papers, sort_dict = filter_by_author( 225 | all_authors, papers, author_id_set, config 226 | ) 227 | filter_by_gpt( 228 | all_authors, 229 | papers, 230 | config, 231 | openai_client, 232 | all_papers, 233 | selected_papers, 234 | sort_dict, 235 | ) 236 | 237 | # sort the papers by relevance and novelty 238 | keys = list(sort_dict.keys()) 239 | values = list(sort_dict.values()) 240 | sorted_keys = [keys[idx] for idx in argsort(values)[::-1]] 241 | selected_papers = {key: selected_papers[key] for key in sorted_keys} 242 | if config["OUTPUT"].getboolean("debug_messages"): 243 | print(sort_dict) 244 | print(selected_papers) 245 | 246 | # pick endpoints and push the summaries 247 | if len(papers) > 0: 248 | if config["OUTPUT"].getboolean("dump_json"): 249 | with open(config["OUTPUT"]["output_path"] + "output.json", "w") as outfile: 250 | json.dump(selected_papers, outfile, indent=4) 251 | if config["OUTPUT"].getboolean("dump_md"): 252 | with open(config["OUTPUT"]["output_path"] + "output.md", "w") as f: 253 | f.write(render_md_string(selected_papers)) 254 | # only push to slack for non-empty dicts 255 | if config["OUTPUT"].getboolean("push_to_slack"): 256 | SLACK_KEY = os.environ.get("SLACK_KEY") 257 | if SLACK_KEY is None: 258 | print( 259 | "Warning: push_to_slack is true, but SLACK_KEY is not set - not pushing to slack" 260 | ) 261 | else: 262 | push_to_slack(selected_papers) 263 | -------------------------------------------------------------------------------- /out/.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore everything in this directory 2 | * 3 | # Except this file 4 | !.gitignore -------------------------------------------------------------------------------- /parse_json_to_md.py: -------------------------------------------------------------------------------- 1 | import json 2 | from datetime import datetime 3 | 4 | 5 | def render_paper(paper_entry: dict, idx: int) -> str: 6 | """ 7 | :param paper_entry: is a dict from a json. 
an example is 8 | {"paperId": "2754e70eaa0c2d40972c47c4c23210f0cece8bfc", "externalIds": {"ArXiv": "2310.16834", "CorpusId": 264451832}, "title": "Discrete Diffusion Language Modeling by Estimating the Ratios of the Data Distribution", "abstract": "Despite their groundbreaking performance for many generative modeling tasks, diffusion models have fallen short on discrete data domains such as natural language. Crucially, standard diffusion models rely on the well-established theory of score matching, but efforts to generalize this to discrete structures have not yielded the same empirical gains. In this work, we bridge this gap by proposing score entropy, a novel discrete score matching loss that is more stable than existing methods, forms an ELBO for maximum likelihood training, and can be efficiently optimized with a denoising variant. We scale our Score Entropy Discrete Diffusion models (SEDD) to the experimental setting of GPT-2, achieving highly competitive likelihoods while also introducing distinct algorithmic advantages. In particular, when comparing similarly sized SEDD and GPT-2 models, SEDD attains comparable perplexities (normally within $+10\\%$ of and sometimes outperforming the baseline). Furthermore, SEDD models learn a more faithful sequence distribution (around $4\\times$ better compared to GPT-2 models with ancestral sampling as measured by large models), can trade off compute for generation quality (needing only $16\\times$ fewer network evaluations to match GPT-2), and enables arbitrary infilling beyond the standard left to right prompting.", "year": 2023, "authors": [{"authorId": "2261494043", "name": "Aaron Lou"}, {"authorId": "83262128", "name": "Chenlin Meng"}, {"authorId": "2490652", "name": "Stefano Ermon"}], "ARXIVID": "2310.16834", "COMMENT": "The paper shows a significant advance in the performance of diffusion language models, directly meeting one of the criteria.", "RELEVANCE": 10, "NOVELTY": 8}, "2310.16779": {"paperId": "edc8953d559560d3237fc0b27175cdb1114c0ca5", "externalIds": {"ArXiv": "2310.16779", "CorpusId": 264451949}, "title": "Multi-scale Diffusion Denoised Smoothing", "abstract": "Along with recent diffusion models, randomized smoothing has become one of a few tangible approaches that offers adversarial robustness to models at scale, e.g., those of large pre-trained models. Specifically, one can perform randomized smoothing on any classifier via a simple\"denoise-and-classify\"pipeline, so-called denoised smoothing, given that an accurate denoiser is available - such as diffusion model. In this paper, we investigate the trade-off between accuracy and certified robustness of denoised smoothing: for example, we question on which representation of diffusion model would maximize the certified robustness of denoised smoothing. We consider a new objective that aims collective robustness of smoothed classifiers across multiple noise levels at a shared diffusion model, which also suggests a new way to compensate the cost of accuracy in randomized smoothing for its certified robustness. This objective motivates us to fine-tune diffusion model (a) to perform consistent denoising whenever the original image is recoverable, but (b) to generate rather diverse outputs otherwise. 
Our experiments show that this fine-tuning scheme of diffusion models combined with the multi-scale smoothing enables a strong certified robustness possible at highest noise level while maintaining the accuracy closer to non-smoothed classifiers.", "year": 2023, "authors": [{"authorId": "83125078", "name": "Jongheon Jeong"}, {"authorId": "2261688831", "name": "Jinwoo Shin"}], "ARXIVID": "2310.16779", "COMMENT": "The paper presents an advancement in the performance of diffusion models, specifically in the context of denoised smoothing.", "RELEVANCE": 9, "NOVELTY": 7} 9 | :return: a markdown formatted string showing the arxiv id, title, arxiv url, abstract, authors, score and comment (if those fields exist) 10 | """ 11 | # get the arxiv id 12 | arxiv_id = paper_entry["arxiv_id"] 13 | # get the title 14 | title = paper_entry["title"] 15 | # get the arxiv url 16 | arxiv_url = f"https://arxiv.org/abs/{arxiv_id}" 17 | # get the abstract 18 | abstract = paper_entry["abstract"] 19 | # get the authors 20 | authors = paper_entry["authors"] 21 | paper_string = f'## {idx}. [{title}]({arxiv_url}) \n' 22 | paper_string += f"**ArXiv ID:** {arxiv_id}\n" 23 | paper_string += f'**Authors:** {", ".join(authors)}\n\n' 24 | paper_string += f"**Abstract:** {abstract}\n\n" 25 | if "COMMENT" in paper_entry: 26 | comment = paper_entry["COMMENT"] 27 | paper_string += f"**Comment:** {comment}\n" 28 | if "RELEVANCE" in paper_entry and "NOVELTY" in paper_entry: 29 | # get the relevance and novelty scores 30 | relevance = paper_entry["RELEVANCE"] 31 | novelty = paper_entry["NOVELTY"] 32 | paper_string += f"**Relevance:** {relevance}\n" 33 | paper_string += f"**Novelty:** {novelty}\n" 34 | return paper_string + "\n---\n" 35 | 36 | 37 | def render_title_and_author(paper_entry: dict, idx: int) -> str: 38 | title = paper_entry["title"] 39 | authors = paper_entry["authors"] 40 | paper_string = f"{idx}. 
[{title}](#link{idx})\n" 41 | paper_string += f'**Authors:** {", ".join(authors)}\n' 42 | return paper_string 43 | 44 | 45 | def render_md_string(papers_dict): 46 | # header 47 | with open("configs/paper_topics.txt", "r") as f: 48 | criterion = f.read() 49 | output_string = ( 50 | "# Personalized Daily Arxiv Papers " 51 | + datetime.today().strftime("%m/%d/%Y") 52 | + "\nTotal relevant papers: " 53 | + str(len(papers_dict)) 54 | + "\n\n" 55 | + "Paper selection prompt and criteria at the bottom\n\n" 56 | + "Table of contents with paper titles:\n\n" 57 | ) 58 | title_strings = [ 59 | render_title_and_author(paper, i) 60 | for i, paper in enumerate(papers_dict.values()) 61 | ] 62 | output_string = output_string + "\n".join(title_strings) + "\n---\n" 63 | # render each paper 64 | paper_strings = [ 65 | render_paper(paper, i) for i, paper in enumerate(papers_dict.values()) 66 | ] 67 | # join all papers into one string 68 | output_string = output_string + "\n".join(paper_strings) 69 | output_string += "\n\n---\n\n" 70 | output_string += f"## Paper selection prompt\n{criterion}" 71 | return output_string 72 | 73 | 74 | if __name__ == "__main__": 75 | # parse output.json into a dict 76 | with open("out/output.json", "r") as f: 77 | output = json.load(f) 78 | # write to output.md 79 | with open("out/output.md", "w") as f: 80 | f.write(render_md_string(output)) 81 | -------------------------------------------------------------------------------- /push_to_slack.py: -------------------------------------------------------------------------------- 1 | """ 2 | Code to render the output.json into a format suitable for a slackbot, and to push it to slack using webhooks 3 | """ 4 | import json 5 | import os 6 | from datetime import datetime 7 | 8 | from typing import List, TypeVar 9 | from slack_sdk import WebClient 10 | from slack_sdk.errors import SlackApiError 11 | 12 | from arxiv_scraper import Paper 13 | 14 | T = TypeVar("T") 15 | 16 | 17 | def batched(items: list[T], batch_size: int) -> list[T]: 18 | # takes a list and returns a list of list with batch_size 19 | return [items[i : i + batch_size] for i in range(0, len(items), batch_size)] 20 | 21 | 22 | def send_main_message(block_list: List, channel_id, client): 23 | try: 24 | # Call the conversations.list method using the WebClient 25 | result = client.chat_postMessage( 26 | channel=channel_id, 27 | blocks=block_list, 28 | text="Arxiv update", 29 | unfurl_links=False, 30 | # You could also use a blocks[] array to send richer content 31 | ) 32 | # Print result, which includes information about the message (like TS) 33 | print(result) 34 | return result["ts"] 35 | 36 | except SlackApiError as e: 37 | print(f"Error: {e}") 38 | 39 | 40 | def send_thread(block_list: List, channel_id, thread_id, client): 41 | try: 42 | batches = batched(block_list, 50) 43 | # Call the conversations.list method using the WebClient 44 | for batch in batches: 45 | result = client.chat_postMessage( 46 | thread_ts=thread_id, 47 | text="Arxiv full update", 48 | channel=channel_id, 49 | blocks=batch, 50 | unfurl_links=False, 51 | # You could also use a blocks[] array to send richer content 52 | ) 53 | # Print result, which includes information about the message (like TS) 54 | print(result) 55 | 56 | except SlackApiError as e: 57 | print(f"Error: {e}") 58 | 59 | 60 | def render_paper(paper_entry: Paper, counter: int) -> str: 61 | """ 62 | :param counter: is the position of the paper in the list 63 | :param paper_entry: is a dict from a json. 
an example is 64 | {"paperId": "2754e70eaa0c2d40972c47c4c23210f0cece8bfc", "externalIds": {"ArXiv": "2310.16834", "CorpusId": 264451832}, "title": "Discrete Diffusion Language Modeling by Estimating the Ratios of the Data Distribution", "abstract": "Despite their groundbreaking performance for many generative modeling tasks, diffusion models have fallen short on discrete data domains such as natural language. Crucially, standard diffusion models rely on the well-established theory of score matching, but efforts to generalize this to discrete structures have not yielded the same empirical gains. In this work, we bridge this gap by proposing score entropy, a novel discrete score matching loss that is more stable than existing methods, forms an ELBO for maximum likelihood training, and can be efficiently optimized with a denoising variant. We scale our Score Entropy Discrete Diffusion models (SEDD) to the experimental setting of GPT-2, achieving highly competitive likelihoods while also introducing distinct algorithmic advantages. In particular, when comparing similarly sized SEDD and GPT-2 models, SEDD attains comparable perplexities (normally within $+10\\%$ of and sometimes outperforming the baseline). Furthermore, SEDD models learn a more faithful sequence distribution (around $4\\times$ better compared to GPT-2 models with ancestral sampling as measured by large models), can trade off compute for generation quality (needing only $16\\times$ fewer network evaluations to match GPT-2), and enables arbitrary infilling beyond the standard left to right prompting.", "year": 2023, "authors": [{"authorId": "2261494043", "name": "Aaron Lou"}, {"authorId": "83262128", "name": "Chenlin Meng"}, {"authorId": "2490652", "name": "Stefano Ermon"}], "ARXIVID": "2310.16834", "COMMENT": "The paper shows a significant advance in the performance of diffusion language models, directly meeting one of the criteria.", "RELEVANCE": 10, "NOVELTY": 8}, "2310.16779": {"paperId": "edc8953d559560d3237fc0b27175cdb1114c0ca5", "externalIds": {"ArXiv": "2310.16779", "CorpusId": 264451949}, "title": "Multi-scale Diffusion Denoised Smoothing", "abstract": "Along with recent diffusion models, randomized smoothing has become one of a few tangible approaches that offers adversarial robustness to models at scale, e.g., those of large pre-trained models. Specifically, one can perform randomized smoothing on any classifier via a simple\"denoise-and-classify\"pipeline, so-called denoised smoothing, given that an accurate denoiser is available - such as diffusion model. In this paper, we investigate the trade-off between accuracy and certified robustness of denoised smoothing: for example, we question on which representation of diffusion model would maximize the certified robustness of denoised smoothing. We consider a new objective that aims collective robustness of smoothed classifiers across multiple noise levels at a shared diffusion model, which also suggests a new way to compensate the cost of accuracy in randomized smoothing for its certified robustness. This objective motivates us to fine-tune diffusion model (a) to perform consistent denoising whenever the original image is recoverable, but (b) to generate rather diverse outputs otherwise. 
Our experiments show that this fine-tuning scheme of diffusion models combined with the multi-scale smoothing enables a strong certified robustness possible at highest noise level while maintaining the accuracy closer to non-smoothed classifiers.", "year": 2023, "authors": [{"authorId": "83125078", "name": "Jongheon Jeong"}, {"authorId": "2261688831", "name": "Jinwoo Shin"}], "ARXIVID": "2310.16779", "COMMENT": "The paper presents an advancement in the performance of diffusion models, specifically in the context of denoised smoothing.", "RELEVANCE": 9, "NOVELTY": 7} 65 | :return: a slackbot-appropriate mrkdwn formatted string showing the arxiv id, title, arxiv url, abstract, authors, score and comment (if those fields exist) 66 | """ 67 | # get the arxiv id 68 | arxiv_id = paper_entry["arxiv_id"] 69 | # get the title 70 | title = paper_entry["title"] 71 | # get the arxiv url 72 | arxiv_url = f"https://arxiv.org/abs/{arxiv_id}" 73 | # get the abstract 74 | abstract = paper_entry["abstract"] 75 | # get the authors 76 | authors = paper_entry["authors"] 77 | paper_string = ( 78 | "<" 79 | + arxiv_url 80 | + "|*" 81 | + str(counter) 82 | + ". " 83 | + title.replace("&", "&") 84 | + "*>\n" 85 | ) 86 | paper_string += f'*Authors*: {", ".join(authors)}\n\n' 87 | paper_string += f"*Abstract*: {abstract}\n\n" 88 | if "RELEVANCE" in paper_entry and "NOVELTY" in paper_entry: 89 | # get the relevance and novelty scores 90 | relevance = paper_entry["RELEVANCE"] 91 | novelty = paper_entry["NOVELTY"] 92 | paper_string += f"*Relevance*: {relevance}\t" 93 | paper_string += f"*Novelty*: {novelty}\t" 94 | if "COMMENT" in paper_entry: 95 | # get the comment 96 | comment = paper_entry["COMMENT"] 97 | paper_string += f"*Comment*: {comment}\n" 98 | return paper_string 99 | 100 | 101 | def render_title(paper_entry: Paper, counter: int) -> str: 102 | """ 103 | :param counter: is the position of the paper in the list 104 | :param paper_entry: is a dict from a json. an example is 105 | {"paperId": "2754e70eaa0c2d40972c47c4c23210f0cece8bfc", "externalIds": {"ArXiv": "2310.16834", "CorpusId": 264451832}, "title": "Discrete Diffusion Language Modeling by Estimating the Ratios of the Data Distribution", "abstract": "Despite their groundbreaking performance for many generative modeling tasks, diffusion models have fallen short on discrete data domains such as natural language. Crucially, standard diffusion models rely on the well-established theory of score matching, but efforts to generalize this to discrete structures have not yielded the same empirical gains. In this work, we bridge this gap by proposing score entropy, a novel discrete score matching loss that is more stable than existing methods, forms an ELBO for maximum likelihood training, and can be efficiently optimized with a denoising variant. We scale our Score Entropy Discrete Diffusion models (SEDD) to the experimental setting of GPT-2, achieving highly competitive likelihoods while also introducing distinct algorithmic advantages. In particular, when comparing similarly sized SEDD and GPT-2 models, SEDD attains comparable perplexities (normally within $+10\\%$ of and sometimes outperforming the baseline). 
Furthermore, SEDD models learn a more faithful sequence distribution (around $4\\times$ better compared to GPT-2 models with ancestral sampling as measured by large models), can trade off compute for generation quality (needing only $16\\times$ fewer network evaluations to match GPT-2), and enables arbitrary infilling beyond the standard left to right prompting.", "year": 2023, "authors": [{"authorId": "2261494043", "name": "Aaron Lou"}, {"authorId": "83262128", "name": "Chenlin Meng"}, {"authorId": "2490652", "name": "Stefano Ermon"}], "ARXIVID": "2310.16834", "COMMENT": "The paper shows a significant advance in the performance of diffusion language models, directly meeting one of the criteria.", "RELEVANCE": 10, "NOVELTY": 8}, "2310.16779": {"paperId": "edc8953d559560d3237fc0b27175cdb1114c0ca5", "externalIds": {"ArXiv": "2310.16779", "CorpusId": 264451949}, "title": "Multi-scale Diffusion Denoised Smoothing", "abstract": "Along with recent diffusion models, randomized smoothing has become one of a few tangible approaches that offers adversarial robustness to models at scale, e.g., those of large pre-trained models. Specifically, one can perform randomized smoothing on any classifier via a simple\"denoise-and-classify\"pipeline, so-called denoised smoothing, given that an accurate denoiser is available - such as diffusion model. In this paper, we investigate the trade-off between accuracy and certified robustness of denoised smoothing: for example, we question on which representation of diffusion model would maximize the certified robustness of denoised smoothing. We consider a new objective that aims collective robustness of smoothed classifiers across multiple noise levels at a shared diffusion model, which also suggests a new way to compensate the cost of accuracy in randomized smoothing for its certified robustness. This objective motivates us to fine-tune diffusion model (a) to perform consistent denoising whenever the original image is recoverable, but (b) to generate rather diverse outputs otherwise. Our experiments show that this fine-tuning scheme of diffusion models combined with the multi-scale smoothing enables a strong certified robustness possible at highest noise level while maintaining the accuracy closer to non-smoothed classifiers.", "year": 2023, "authors": [{"authorId": "83125078", "name": "Jongheon Jeong"}, {"authorId": "2261688831", "name": "Jinwoo Shin"}], "ARXIVID": "2310.16779", "COMMENT": "The paper presents an advancement in the performance of diffusion models, specifically in the context of denoised smoothing.", "RELEVANCE": 9, "NOVELTY": 7} 106 | :return: a slackbot-appropriate mrkdwn formatted string showing the arxiv id, title, arxiv url, abstract, authors, score and comment (if those fields exist) 107 | """ 108 | # get the arxiv id 109 | arxiv_id = paper_entry["arxiv_id"] 110 | # get the title 111 | title = paper_entry["title"] 112 | # get the arxiv url 113 | arxiv_url = f"https://arxiv.org/abs/{arxiv_id}" 114 | # get the authors 115 | authors = paper_entry["authors"] 116 | paper_string = ( 117 | "<" 118 | + arxiv_url 119 | + "|*" 120 | + str(counter) 121 | + ". 
" 122 | + title.replace("&", "&") 123 | + "*>\n" 124 | ) 125 | paper_string += f'*Authors*: {", ".join(authors)}\n\n' 126 | return paper_string 127 | 128 | 129 | def build_block_list(title_strings, paper_strings): 130 | """ 131 | builds a list of slack-bot blocks from a list of markdown formatted papers 132 | """ 133 | slack_block_list = [ 134 | { 135 | "type": "header", 136 | "text": { 137 | "type": "plain_text", 138 | "text": "Paper alert bot update on " 139 | + datetime.today().strftime("%m/%d/%Y"), 140 | }, 141 | }, 142 | { 143 | "type": "section", 144 | "text": { 145 | "type": "mrkdwn", 146 | "text": "Total relevant papers (max 50 in thread): " 147 | + str(len(title_strings)) 148 | + "\n Top 20 titles shown below", 149 | }, 150 | }, 151 | {"type": "divider"}, 152 | ] 153 | 154 | for paper in title_strings[:20]: 155 | slack_block_list.append( 156 | {"type": "section", "text": {"type": "mrkdwn", "text": paper}} 157 | ) 158 | 159 | thread_blocks = [] 160 | for paper in paper_strings[:50]: 161 | thread_blocks.append( 162 | {"type": "section", "text": {"type": "mrkdwn", "text": paper}} 163 | ) 164 | thread_blocks.append({"type": "divider"}) 165 | 166 | return slack_block_list, thread_blocks 167 | 168 | 169 | def push_to_slack(papers_dict): 170 | channel_id = os.environ["SLACK_CHANNEL_ID"] 171 | client = WebClient(token=os.environ["SLACK_KEY"]) 172 | # render each paper 173 | if len(papers_dict) == 0: 174 | return 175 | title_strings = [ 176 | render_title(paper, i) for i, paper in enumerate(papers_dict.values()) 177 | ] 178 | paper_strings = [ 179 | render_paper(paper, i) for i, paper in enumerate(papers_dict.values()) 180 | ] 181 | blocks, thread_blocks = build_block_list(title_strings, paper_strings) 182 | # push to slack 183 | ts = send_main_message(blocks, channel_id, client) 184 | send_thread(thread_blocks, channel_id, ts, client) 185 | 186 | 187 | if __name__ == "__main__": 188 | # parse output.json into a dict 189 | with open("out/output.json", "r") as f: 190 | output = json.load(f) 191 | push_to_slack(output) 192 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | openai~=1.2.3 2 | Levenshtein 3 | requests~=2.31.0 4 | tqdm~=4.66.1 5 | feedparser~=6.0.10 6 | retry~=0.9.2 7 | ruff 8 | pre-commit 9 | slack_sdk 10 | arxiv~=2.0.0 --------------------------------------------------------------------------------