├── .github
    └── workflows
    │   ├── download.yml
    │   └── rebuild.yml
├── .gitignore
├── Makefile
├── README.md
├── config.yml
├── configs
    ├── examples.json
    ├── spacy-llm-cohere-lg.cfg
    ├── spacy-llm-cohere-md.cfg
    ├── spacy-llm-cohere-sm.cfg
    ├── spacy-llm-openai-lg.cfg
    ├── spacy-llm-openai-md.cfg
    ├── spacy-llm-openai-sm.cfg
    ├── spacy-llm-openai-xl.cfg
    └── spacy-llm-openai-xxl.cfg
├── data
    ├── annot
    │   ├── active-learn.jsonl
    │   ├── benchmark.jsonl
    │   ├── data-annotation.jsonl
    │   ├── data-quality.jsonl
    │   ├── dev-research.jsonl
    │   ├── llm.jsonl
    │   ├── llms.jsonl
    │   ├── new-dataset.jsonl
    │   ├── prompt-eng.jsonl
    │   └── second-opinion.jsonl
    └── downloads
    │   ├── .gitkeep
    │   ├── 2023-04-08.jsonl
    │   ├── 2023-04-10.jsonl
    │   ├── 2023-04-19.jsonl
    │   ├── 2023-05-11.jsonl
    │   ├── 2023-05-12.jsonl
    │   ├── 2023-05-16.jsonl
    │   ├── 2023-05-17.jsonl
    │   ├── 2023-05-18.jsonl
    │   ├── 2023-05-19.jsonl
    │   ├── 2023-05-22.jsonl
    │   ├── 2023-05-23.jsonl
    │   ├── 2023-05-24.jsonl
    │   ├── 2023-05-25.jsonl
    │   ├── 2023-05-26.jsonl
    │   ├── 2023-05-30.jsonl
    │   ├── 2023-05-31.jsonl
    │   ├── 2023-06-01.jsonl
    │   ├── 2023-06-02.jsonl
    │   ├── 2023-06-06.jsonl
    │   ├── 2023-06-07.jsonl
    │   ├── 2023-06-08.jsonl
    │   ├── 2023-06-09.jsonl
    │   ├── 2023-06-10.jsonl
    │   ├── 2023-06-13.jsonl
    │   ├── 2023-06-14.jsonl
    │   ├── 2023-06-16.jsonl
    │   ├── 2023-06-21.jsonl
    │   ├── 2023-06-22.jsonl
    │   ├── 2023-06-23.jsonl
    │   ├── 2023-06-26.jsonl
    │   ├── 2023-06-27.jsonl
    │   ├── 2023-06-28.jsonl
    │   ├── 2023-06-29.jsonl
    │   ├── 2023-06-30-18h.jsonl
    │   ├── 2023-06-30.jsonl
    │   ├── 2023-07-01-00h.jsonl
    │   ├── 2023-07-01-12h.jsonl
    │   ├── 2023-07-01-21h.jsonl
    │   ├── 2023-07-02-00h.jsonl
    │   ├── 2023-07-02-12h.jsonl
    │   ├── 2023-07-03-12h.jsonl
    │   ├── 2023-07-04-00h.jsonl
    │   ├── 2023-07-04-12h.jsonl
    │   ├── 2023-07-05-00h.jsonl
    │   ├── 2023-07-05-14h.jsonl
    │   ├── 2023-07-06-00h.jsonl
    │   ├── 2023-07-06-12h.jsonl
    │   ├── 2023-07-07-00h.jsonl
    │   ├── 2023-07-07-19h.jsonl
    │   ├── 2023-07-08-00h.jsonl
    │   ├── 2023-07-11-12h.jsonl
    │   ├── 2023-07-12-00h.jsonl
    │   ├── 2023-07-12-12h.jsonl
    │   ├── 2023-07-13-00h.jsonl
    │   ├── 2023-07-13-12h.jsonl
    │   ├── 2023-07-14-00h.jsonl
    │   ├── 2023-07-14-12h.jsonl
    │   ├── 2023-07-15-00h.jsonl
    │   ├── 2023-07-15-12h.jsonl
    │   ├── 2023-07-16-00h.jsonl
    │   ├── 2023-07-16-12h.jsonl
    │   ├── 2023-07-17-00h.jsonl
    │   ├── 2023-07-17-12h.jsonl
    │   ├── 2023-07-17-20h.jsonl
    │   ├── 2023-07-18-12h.jsonl
    │   ├── 2023-07-19-00h.jsonl
    │   ├── 2023-07-19-12h.jsonl
    │   ├── 2023-07-20-00h.jsonl
    │   ├── 2023-07-20-12h.jsonl
    │   ├── 2023-07-21-00h.jsonl
    │   ├── 2023-07-21-12h.jsonl
    │   ├── 2023-07-22-00h.jsonl
    │   ├── 2023-07-22-12h.jsonl
    │   ├── 2023-07-23-00h.jsonl
    │   ├── 2023-07-26-10h.jsonl
    │   ├── 2023-07-26-12h.jsonl
    │   ├── 2023-07-27-00h.jsonl
    │   ├── 2023-07-27-12h.jsonl
    │   ├── 2023-07-28-08h.jsonl
    │   ├── 2023-07-28-16h.jsonl
    │   ├── 2023-08-01-08h.jsonl
    │   ├── 2023-08-02-08h.jsonl
    │   ├── 2023-08-03-08h.jsonl
    │   ├── 2023-08-04-08h.jsonl
    │   ├── 2023-08-08-08h.jsonl
    │   ├── 2023-08-09-08h.jsonl
    │   ├── 2023-08-10-00h.jsonl
    │   ├── 2023-08-10-08h.jsonl
    │   ├── 2023-08-11-08h.jsonl
    │   ├── 2023-08-15-08h.jsonl
    │   ├── 2023-08-16-00h.jsonl
    │   ├── 2023-08-16-08h.jsonl
    │   ├── 2023-08-17-08h.jsonl
    │   ├── 2023-08-22-08h.jsonl
    │   ├── 2023-08-23-08h.jsonl
    │   ├── 2023-08-24-08h.jsonl
    │   ├── 2023-08-25-08h.jsonl
    │   ├── 2023-08-29-08h.jsonl
    │   ├── 2023-08-30-08h.jsonl
    │   ├── 2023-08-31-08h.jsonl
    │   ├── 2023-09-01-08h.jsonl
    │   ├── 2023-09-06-08h.jsonl
    │   ├── 2023-09-07-08h.jsonl
    │   ├── 2023-09-08-08h.jsonl
    │   ├── 2023-09-12-08h.jsonl
    │   ├── 2023-09-13-08h.jsonl
    │   ├── 2023-09-14-16h.jsonl
    │   ├── 2023-09-15-08h.jsonl
    │   ├── 2023-09-19-08h.jsonl
    │   ├── 2023-09-20-00h.jsonl
    │   ├── 2023-09-20-08h.jsonl
    │   ├── 2023-09-21-08h.jsonl
    │   ├── 2023-09-22-08h.jsonl
    │   ├── 2023-09-26-08h.jsonl
    │   ├── 2023-09-27-08h.jsonl
    │   ├── 2023-09-28-08h.jsonl
    │   ├── 2023-09-29-08h.jsonl
    │   ├── 2023-10-04-08h.jsonl
    │   ├── 2023-10-05-08h.jsonl
    │   ├── 2023-10-06-00h.jsonl
    │   ├── 2023-10-06-08h.jsonl
    │   ├── 2023-10-10-16h.jsonl
    │   ├── 2023-10-11-08h.jsonl
    │   ├── 2023-10-12-08h.jsonl
    │   ├── 2023-10-13-16h.jsonl
    │   ├── 2023-10-17-08h.jsonl
    │   ├── 2023-10-18-00h.jsonl
    │   ├── 2023-10-18-08h.jsonl
    │   ├── 2023-10-19-08h.jsonl
    │   ├── 2023-10-20-08h.jsonl
    │   ├── 2023-10-24-08h.jsonl
    │   ├── 2023-10-25-08h.jsonl
    │   ├── 2023-10-26-08h.jsonl
    │   ├── 2023-10-27-08h.jsonl
    │   ├── 2023-10-31-08h.jsonl
    │   ├── 2023-11-01-08h.jsonl
    │   ├── 2023-11-01-16h.jsonl
    │   ├── 2023-11-02-00h.jsonl
    │   ├── 2023-11-02-08h.jsonl
    │   ├── 2023-11-03-08h.jsonl
    │   ├── 2023-11-07-08h.jsonl
    │   ├── 2023-11-08-08h.jsonl
    │   ├── 2023-11-09-08h.jsonl
    │   ├── 2023-11-10-08h.jsonl
    │   ├── 2023-11-14-08h.jsonl
    │   ├── 2023-11-15-08h.jsonl
    │   ├── 2023-11-16-08h.jsonl
    │   ├── 2023-11-17-08h.jsonl
    │   ├── 2023-11-17-16h.jsonl
    │   ├── 2023-11-18-00h.jsonl
    │   ├── 2023-11-21-08h.jsonl
    │   ├── 2023-11-22-08h.jsonl
    │   ├── 2023-11-23-08h.jsonl
    │   ├── 2023-11-28-08h.jsonl
    │   ├── 2023-11-29-08h.jsonl
    │   ├── 2023-11-30-08h.jsonl
    │   ├── 2023-12-01-08h.jsonl
    │   ├── 2023-12-05-08h.jsonl
    │   ├── 2023-12-06-08h.jsonl
    │   ├── 2023-12-07-08h.jsonl
    │   ├── 2023-12-08-08h.jsonl
    │   ├── 2023-12-12-08h.jsonl
    │   ├── 2023-12-13-08h.jsonl
    │   ├── 2023-12-14-08h.jsonl
    │   ├── 2023-12-15-08h.jsonl
    │   ├── 2023-12-19-08h.jsonl
    │   ├── 2023-12-20-08h.jsonl
    │   ├── 2023-12-21-08h.jsonl
    │   ├── 2023-12-22-08h.jsonl
    │   ├── 2023-12-27-08h.jsonl
    │   ├── 2023-12-29-08h.jsonl
    │   ├── 2024-01-03-08h.jsonl
    │   ├── 2024-01-04-08h.jsonl
    │   ├── 2024-01-05-08h.jsonl
    │   ├── 2024-01-09-08h.jsonl
    │   ├── 2024-01-10-08h.jsonl
    │   ├── 2024-01-11-08h.jsonl
    │   ├── 2024-01-12-08h.jsonl
    │   ├── 2024-01-12-16h.jsonl
    │   ├── 2024-01-13-00h.jsonl
    │   ├── 2024-01-17-08h.jsonl
    │   ├── 2024-01-17-16h.jsonl
    │   ├── 2024-01-18-00h.jsonl
    │   ├── 2024-01-18-08h.jsonl
    │   ├── 2024-01-19-08h.jsonl
    │   ├── 2024-01-23-08h.jsonl
    │   ├── 2024-01-24-08h.jsonl
    │   ├── 2024-01-25-08h.jsonl
    │   ├── 2024-01-26-08h.jsonl
    │   ├── 2024-01-30-08h.jsonl
    │   ├── 2024-01-31-08h.jsonl
    │   ├── 2024-02-01-08h.jsonl
    │   ├── 2024-02-02-08h.jsonl
    │   ├── 2024-02-06-08h.jsonl
    │   ├── 2024-02-07-08h.jsonl
    │   ├── 2024-02-08-08h.jsonl
    │   ├── 2024-02-09-08h.jsonl
    │   ├── 2024-02-13-08h.jsonl
    │   ├── 2024-02-14-08h.jsonl
    │   ├── 2024-02-15-08h.jsonl
    │   ├── 2024-02-16-08h.jsonl
    │   ├── 2024-02-20-08h.jsonl
    │   ├── 2024-02-21-08h.jsonl
    │   ├── 2024-02-22-08h.jsonl
    │   ├── 2024-02-23-08h.jsonl
    │   ├── 2024-02-27-08h.jsonl
    │   ├── 2024-02-28-08h.jsonl
    │   ├── 2024-02-29-08h.jsonl
    │   ├── 2024-03-01-08h.jsonl
    │   ├── 2024-03-05-08h.jsonl
    │   ├── 2024-03-06-08h.jsonl
    │   ├── 2024-03-07-08h.jsonl
    │   ├── 2024-03-08-08h.jsonl
    │   ├── 2024-03-08-16h.jsonl
    │   ├── 2024-03-09-00h.jsonl
    │   ├── 2024-03-09-08h.jsonl
    │   ├── 2024-03-09-16h.jsonl
    │   ├── 2024-03-10-00h.jsonl
    │   ├── 2024-03-12-08h.jsonl
    │   ├── 2024-03-13-00h.jsonl
    │   ├── 2024-03-13-08h.jsonl
    │   ├── 2024-03-13-16h.jsonl
    │   ├── 2024-03-14-00h.jsonl
    │   ├── 2024-03-14-08h.jsonl
    │   ├── 2024-03-15-08h.jsonl
    │   ├── 2024-03-19-08h.jsonl
    │   ├── 2024-03-20-00h.jsonl
    │   ├── 2024-03-20-08h.jsonl
    │   ├── 2024-03-21-08h.jsonl
    │   ├── 2024-03-22-08h.jsonl
    │   ├── 2024-03-26-08h.jsonl
    │   ├── 2024-03-26-16h.jsonl
    │   ├── 2024-03-27-00h.jsonl
    │   ├── 2024-03-27-08h.jsonl
    │   ├── 2024-03-28-08h.jsonl
    │   ├── 2024-03-29-08h.jsonl
    │   ├── 2024-04-03-08h.jsonl
    │   ├── 2024-04-04-08h.jsonl
    │   ├── 2024-04-05-00h.jsonl
    │   ├── 2024-04-05-16h.jsonl
    │   ├── 2024-04-09-08h.jsonl
    │   ├── 2024-04-10-08h.jsonl
    │   ├── 2024-04-11-00h.jsonl
    │   ├── 2024-04-11-08h.jsonl
    │   ├── 2024-04-12-08h.jsonl
    │   ├── 2024-04-15-02h.jsonl
    │   ├── 2024-04-16-08h.jsonl
    │   ├── 2024-04-17-08h.jsonl
    │   ├── 2024-04-18-08h.jsonl
    │   ├── 2024-04-19-00h.jsonl
    │   ├── 2024-04-19-08h.jsonl
    │   ├── 2024-04-23-08h.jsonl
    │   ├── 2024-04-24-08h.jsonl
    │   ├── 2024-04-25-08h.jsonl
    │   ├── 2024-04-26-08h.jsonl
    │   ├── 2024-04-30-08h.jsonl
    │   ├── 2024-05-01-08h.jsonl
    │   ├── 2024-05-02-08h.jsonl
    │   ├── 2024-05-03-08h.jsonl
    │   ├── 2024-05-07-08h.jsonl
    │   ├── 2024-05-08-01h.jsonl
    │   ├── 2024-05-08-08h.jsonl
    │   ├── 2024-05-09-08h.jsonl
    │   ├── 2024-05-10-08h.jsonl
    │   ├── 2024-05-14-08h.jsonl
    │   ├── 2024-05-15-08h.jsonl
    │   ├── 2024-05-16-08h.jsonl
    │   ├── 2024-05-17-08h.jsonl
    │   ├── 2024-05-21-08h.jsonl
    │   ├── 2024-05-22-00h.jsonl
    │   ├── 2024-05-22-08h.jsonl
    │   ├── 2024-05-24-08h.jsonl
    │   ├── 2024-05-28-08h.jsonl
    │   ├── 2024-05-29-08h.jsonl
    │   ├── 2024-05-30-08h.jsonl
    │   ├── 2024-05-31-08h.jsonl
    │   ├── 2024-06-05-08h.jsonl
    │   ├── 2024-06-06-08h.jsonl
    │   ├── 2024-06-07-08h.jsonl
    │   ├── 2024-06-11-08h.jsonl
    │   ├── 2024-06-12-08h.jsonl
    │   ├── 2024-06-13-08h.jsonl
    │   ├── 2024-06-14-00h.jsonl
    │   ├── 2024-06-14-16h.jsonl
    │   ├── 2024-06-21-08h.jsonl
    │   ├── 2024-06-25-08h.jsonl
    │   ├── 2024-06-26-08h.jsonl
    │   ├── 2024-06-27-08h.jsonl
    │   ├── 2024-06-28-08h.jsonl
    │   ├── 2024-07-03-08h.jsonl
    │   ├── 2024-07-04-08h.jsonl
    │   ├── 2024-07-09-08h.jsonl
    │   ├── 2024-07-10-08h.jsonl
    │   ├── 2024-07-11-08h.jsonl
    │   ├── 2024-07-12-16h.jsonl
    │   ├── 2024-07-13-00h.jsonl
    │   ├── 2024-07-13-08h.jsonl
    │   ├── 2024-07-16-08h.jsonl
    │   ├── 2024-07-17-08h.jsonl
    │   ├── 2024-07-18-08h.jsonl
    │   ├── 2024-07-19-08h.jsonl
    │   ├── 2024-07-23-08h.jsonl
    │   ├── 2024-07-24-08h.jsonl
    │   ├── 2024-07-25-00h.jsonl
    │   ├── 2024-07-25-08h.jsonl
    │   ├── 2024-07-26-00h.jsonl
    │   ├── 2024-07-26-08h.jsonl
    │   ├── 2024-07-30-08h.jsonl
    │   ├── 2024-07-31-08h.jsonl
    │   ├── 2024-08-01-08h.jsonl
    │   ├── 2024-08-06-08h.jsonl
    │   ├── 2024-08-07-00h.jsonl
    │   ├── 2024-08-07-08h.jsonl
    │   ├── 2024-08-08-08h.jsonl
    │   ├── 2024-08-09-08h.jsonl
    │   ├── 2024-08-12-08h.jsonl
    │   ├── 2024-08-13-08h.jsonl
    │   ├── 2024-08-14-08h.jsonl
    │   ├── 2024-08-15-08h.jsonl
    │   ├── 2024-08-16-08h.jsonl
    │   ├── 2024-08-20-08h.jsonl
    │   ├── 2024-08-21-08h.jsonl
    │   ├── 2024-08-22-08h.jsonl
    │   ├── 2024-08-23-08h.jsonl
    │   ├── 2024-08-27-08h.jsonl
    │   ├── 2024-08-28-08h.jsonl
    │   ├── 2024-08-28-16h.jsonl
    │   ├── 2024-08-29-00h.jsonl
    │   ├── 2024-08-29-08h.jsonl
    │   ├── 2024-08-30-08h.jsonl
    │   ├── 2024-09-05-08h.jsonl
    │   ├── 2024-09-06-00h.jsonl
    │   ├── 2024-09-06-08h.jsonl
    │   ├── 2024-09-10-08h.jsonl
    │   ├── 2024-09-11-00h.jsonl
    │   ├── 2024-09-11-08h.jsonl
    │   ├── 2024-09-12-08h.jsonl
    │   ├── 2024-09-13-08h.jsonl
    │   ├── 2024-09-17-08h.jsonl
    │   ├── 2024-09-18-08h.jsonl
    │   ├── 2024-09-19-08h.jsonl
    │   ├── 2024-09-26-00h.jsonl
    │   ├── 2024-09-26-08h.jsonl
    │   ├── 2024-09-27-16h.jsonl
    │   ├── 2024-10-02-00h.jsonl
    │   ├── 2024-10-03-08h.jsonl
    │   ├── 2024-10-04-08h.jsonl
    │   ├── 2024-10-08-08h.jsonl
    │   ├── 2024-10-10-08h.jsonl
    │   ├── 2024-10-11-16h.jsonl
    │   ├── 2024-10-15-08h.jsonl
    │   ├── 2024-10-16-08h.jsonl
    │   ├── 2024-10-17-08h.jsonl
    │   ├── 2024-10-18-08h.jsonl
    │   ├── 2024-10-22-08h.jsonl
    │   ├── 2024-10-23-08h.jsonl
    │   ├── 2024-10-24-00h.jsonl
    │   ├── 2024-10-24-08h.jsonl
    │   ├── 2024-10-25-08h.jsonl
    │   ├── 2024-10-29-08h.jsonl
    │   ├── 2024-10-30-08h.jsonl
    │   ├── 2024-10-31-00h.jsonl
    │   ├── 2024-10-31-08h.jsonl
    │   ├── 2024-11-01-08h.jsonl
    │   ├── 2024-11-05-08h.jsonl
    │   ├── 2024-11-06-08h.jsonl
    │   ├── 2024-11-07-08h.jsonl
    │   ├── 2024-11-08-08h.jsonl
    │   ├── 2024-11-12-08h.jsonl
    │   ├── 2024-11-13-08h.jsonl
    │   ├── 2024-11-14-16h.jsonl
    │   ├── 2024-11-15-08h.jsonl
    │   ├── 2024-11-15-16h.jsonl
    │   ├── 2024-11-16-00h.jsonl
    │   ├── 2024-11-16-08h.jsonl
    │   ├── 2024-11-16-16h.jsonl
    │   ├── 2024-11-17-00h.jsonl
    │   ├── 2024-11-19-08h.jsonl
    │   ├── 2024-11-20-08h.jsonl
    │   ├── 2024-11-21-08h.jsonl
    │   ├── 2024-11-22-08h.jsonl
    │   ├── 2024-11-26-08h.jsonl
    │   ├── 2024-11-27-08h.jsonl
    │   ├── 2024-11-28-08h.jsonl
    │   ├── 2024-12-04-08h.jsonl
    │   ├── 2024-12-05-08h.jsonl
    │   ├── 2024-12-06-08h.jsonl
    │   ├── 2024-12-10-08h.jsonl
    │   ├── 2024-12-11-08h.jsonl
    │   ├── 2024-12-12-08h.jsonl
    │   ├── 2024-12-13-08h.jsonl
    │   ├── 2024-12-17-08h.jsonl
    │   ├── 2024-12-18-16h.jsonl
    │   ├── 2024-12-19-08h.jsonl
    │   ├── 2024-12-20-08h.jsonl
    │   ├── 2024-12-24-08h.jsonl
    │   ├── 2024-12-25-08h.jsonl
    │   ├── 2024-12-31-08h.jsonl
    │   ├── 2025-01-07-08h.jsonl
    │   ├── 2025-01-08-08h.jsonl
    │   ├── 2025-01-09-08h.jsonl
    │   ├── 2025-01-10-00h.jsonl
    │   ├── 2025-01-10-08h.jsonl
    │   ├── 2025-01-14-16h.jsonl
    │   ├── 2025-01-15-16h.jsonl
    │   ├── 2025-01-16-16h.jsonl
    │   ├── 2025-01-17-08h.jsonl
    │   ├── 2025-01-22-08h.jsonl
    │   ├── 2025-01-23-08h.jsonl
    │   ├── 2025-01-24-08h.jsonl
    │   ├── 2025-01-28-08h.jsonl
    │   ├── 2025-01-29-08h.jsonl
    │   ├── 2025-01-30-08h.jsonl
    │   ├── 2025-01-31-08h.jsonl
    │   ├── 2025-02-05-08h.jsonl
    │   ├── 2025-02-06-08h.jsonl
    │   ├── 2025-02-07-08h.jsonl
    │   ├── 2025-02-11-08h.jsonl
    │   ├── 2025-02-12-08h.jsonl
    │   ├── 2025-02-13-08h.jsonl
    │   ├── 2025-02-14-08h.jsonl
    │   ├── 2025-02-18-08h.jsonl
    │   ├── 2025-02-19-08h.jsonl
    │   ├── 2025-02-20-08h.jsonl
    │   ├── 2025-02-21-08h.jsonl
    │   ├── 2025-02-25-08h.jsonl
    │   ├── 2025-02-26-08h.jsonl
    │   ├── 2025-02-27-08h.jsonl
    │   ├── 2025-02-28-08h.jsonl
    │   ├── 2025-03-05-16h.jsonl
    │   ├── 2025-03-06-08h.jsonl
    │   ├── 2025-03-07-08h.jsonl
    │   ├── 2025-03-11-08h.jsonl
    │   ├── 2025-03-12-00h.jsonl
    │   ├── 2025-03-12-08h.jsonl
    │   ├── 2025-03-13-08h.jsonl
    │   ├── 2025-03-14-08h.jsonl
    │   ├── 2025-03-18-08h.jsonl
    │   ├── 2025-03-19-08h.jsonl
    │   ├── 2025-03-20-08h.jsonl
    │   ├── 2025-03-21-08h.jsonl
    │   ├── 2025-03-25-08h.jsonl
    │   ├── 2025-03-26-08h.jsonl
    │   ├── 2025-03-27-08h.jsonl
    │   ├── 2025-03-28-08h.jsonl
    │   ├── 2025-04-01-08h.jsonl
    │   ├── 2025-04-03-08h.jsonl
    │   ├── 2025-04-04-00h.jsonl
    │   ├── 2025-04-04-08h.jsonl
    │   ├── 2025-04-08-08h.jsonl
    │   ├── 2025-04-09-08h.jsonl
    │   ├── 2025-04-10-08h.jsonl
    │   ├── 2025-04-11-08h.jsonl
    │   ├── 2025-04-15-08h.jsonl
    │   ├── 2025-04-16-08h.jsonl
    │   ├── 2025-04-17-08h.jsonl
    │   ├── 2025-04-18-08h.jsonl
    │   ├── 2025-04-22-08h.jsonl
    │   ├── 2025-04-23-08h.jsonl
    │   ├── 2025-04-24-08h.jsonl
    │   ├── 2025-04-25-08h.jsonl
    │   ├── 2025-04-29-08h.jsonl
    │   ├── 2025-04-30-08h.jsonl
    │   ├── 2025-05-01-08h.jsonl
    │   ├── 2025-05-02-08h.jsonl
    │   ├── 2025-05-06-08h.jsonl
    │   ├── 2025-05-07-00h.jsonl
    │   ├── 2025-05-07-08h.jsonl
    │   ├── 2025-05-08-08h.jsonl
    │   ├── 2025-05-09-08h.jsonl
    │   ├── 2025-05-13-08h.jsonl
    │   ├── 2025-05-14-08h.jsonl
    │   ├── 2025-05-15-08h.jsonl
    │   ├── 2025-05-16-00h.jsonl
    │   ├── 2025-05-16-08h.jsonl
    │   ├── 2025-05-20-08h.jsonl
    │   ├── 2025-05-21-00h.jsonl
    │   ├── 2025-05-21-08h.jsonl
    │   ├── 2025-05-22-00h.jsonl
    │   ├── 2025-05-22-08h.jsonl
    │   ├── 2025-05-23-00h.jsonl
    │   ├── 2025-05-23-08h.jsonl
    │   ├── 2025-05-27-08h.jsonl
    │   ├── 2025-05-27-16h.jsonl
    │   ├── 2025-05-28-00h.jsonl
    │   ├── 2025-05-28-08h.jsonl
    │   ├── 2025-05-29-00h.jsonl
    │   ├── 2025-05-29-08h.jsonl
    │   ├── 2025-05-30-08h.jsonl
    │   ├── 2025-06-04-08h.jsonl
    │   ├── 2025-06-05-00h.jsonl
    │   ├── 2025-06-05-08h.jsonl
    │   ├── 2025-06-06-00h.jsonl
    │   └── 2025-06-06-08h.jsonl
├── frontpage
    ├── __init__.py
    ├── __main__.py
    ├── _benchmark.py
    ├── constants.py
    ├── datastream.py
    ├── download.py
    ├── modelling.py
    ├── recipe.py
    ├── types.py
    └── utils.py
├── images
    ├── active-teaching.png
    ├── multiheads.png
    └── sentence-model.png
├── index.html
├── prodigy.json
├── requirements-build.txt
├── requirements-download.txt
├── setup.py
├── taskfile.yml
└── templates
    └── home.html


/.github/workflows/download.yml:
--------------------------------------------------------------------------------
 1 | # Scheduled scraper: runs `python -m frontpage download` and commits whatever changed.
 2 | name: Download New Data
 3 | 
 4 | on:
 5 |   workflow_dispatch:
 6 |   schedule:
 7 |     # Minute 10 of every 8th hour (00:10, 08:10, 16:10 UTC).
 8 |     - cron:  '10 */8 * * *'
 9 | 
10 | jobs:
11 |   scheduled:
12 |     runs-on: ubuntu-latest
13 |     steps:
14 |     - name: Check out this repo
15 |       uses: actions/checkout@v2
16 |     # No build matrix is defined in this workflow, so the version is spelled out
17 |     # instead of being read from `matrix.python-version` (which would render empty).
18 |     - name: Set up Python 3.9
19 |       # v4 understands the `cache` inputs below; v1 predates them and ignores them.
20 |       uses: actions/setup-python@v4
21 |       with:
22 |         # Quoted so YAML keeps it a string instead of parsing it as a float.
23 |         python-version: '3.9'
24 |         cache: 'pip'
25 |         # The default lookup expects requirements.txt or pyproject.toml; this repo has neither.
26 |         cache-dependency-path: requirements-download.txt
27 |     - name: Install dependencies
28 |       run: |
29 |         python -m pip install --upgrade pip
30 |         python -m pip install -r requirements-download.txt
31 |         python -m spacy download en_core_web_sm
32 |         python -m pip install -e .
33 |     - name: Fetch latest data
34 |       run: python -m frontpage download
35 |     - name: Commit and push if it changed
36 |       run: |-
37 |         git config user.name "Automated"
38 |         git config user.email "actions@users.noreply.github.com"
39 |         git add -A
40 |         timestamp=$(date -u)
41 |         git commit -m "Latest data: ${timestamp}" || exit 0
42 |         git push
43 | 


--------------------------------------------------------------------------------
/.github/workflows/rebuild.yml:
--------------------------------------------------------------------------------
 1 | # Daily site build: runs `frontpage artifact download` and `frontpage build`,
 2 | # then commits the regenerated index.html if it changed.
 3 | name: Rebuild Site
 4 | 
 5 | env:
 6 |   WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }}
 7 | 
 8 | on:
 9 |   workflow_dispatch:
10 |   schedule:
11 |     # Once a day at 09:00 UTC.
12 |     - cron:  '0 9 * * *'
13 | 
14 | jobs:
15 |   scheduled:
16 |     runs-on: ubuntu-latest
17 |     steps:
18 |     - name: Check out this repo
19 |       uses: actions/checkout@v2
20 |     # No build matrix is defined in this workflow, so the version is spelled out
21 |     # instead of being read from `matrix.python-version` (which would render empty).
22 |     - name: Set up Python 3.9
23 |       uses: actions/setup-python@v4
24 |       with:
25 |         # Quoted so YAML keeps it a string instead of parsing it as a float.
26 |         python-version: '3.9'
27 |         cache: 'pip'
28 |         cache-dependency-path: |
29 |             requirements-build.txt
30 |     - name: Install dependencies
31 |       run: |
32 |         python -m pip install --upgrade pip
33 |         python -m pip install -r requirements-build.txt
34 |         python -m pip install -e .
35 |     - name: Build new site
36 |       run: |
37 |         python -m frontpage artifact download
38 |         python -m frontpage build --retrain --preprocess
39 |     - name: Commit and push if it changed
40 |       run: |-
41 |         git config user.name "Automated"
42 |         git config user.email "actions@users.noreply.github.com"
43 |         # Only the generated page is staged; other files touched by the build are left out.
44 |         git add index.html
45 |         timestamp=$(date -u)
46 |         git commit -m "Latest data: ${timestamp}" || exit 0
47 |         git push
48 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | eggs/
 15 | .eggs/
 16 | lib/
 17 | lib64/
 18 | parts/
 19 | sdist/
 20 | var/
 21 | wheels/
 22 | share/python-wheels/
 23 | *.egg-info/
 24 | .installed.cfg
 25 | *.egg
 26 | MANIFEST
 27 | 
 28 | # PyInstaller
 29 | #  Usually these files are written by a python script from a template
 30 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 31 | *.manifest
 32 | *.spec
 33 | 
 34 | # Installer logs
 35 | pip-log.txt
 36 | pip-delete-this-directory.txt
 37 | 
 38 | # Unit test / coverage reports
 39 | htmlcov/
 40 | .tox/
 41 | .nox/
 42 | .coverage
 43 | .coverage.*
 44 | .cache
 45 | nosetests.xml
 46 | coverage.xml
 47 | *.cover
 48 | *.py,cover
 49 | .hypothesis/
 50 | .pytest_cache/
 51 | cover/
 52 | 
 53 | # Translations
 54 | *.mo
 55 | *.pot
 56 | 
 57 | # Django stuff:
 58 | *.log
 59 | local_settings.py
 60 | db.sqlite3
 61 | db.sqlite3-journal
 62 | 
 63 | # Flask stuff:
 64 | instance/
 65 | .webassets-cache
 66 | 
 67 | # Scrapy stuff:
 68 | .scrapy
 69 | 
 70 | # Sphinx documentation
 71 | docs/_build/
 72 | 
 73 | # PyBuilder
 74 | .pybuilder/
 75 | target/
 76 | 
 77 | # Jupyter Notebook
 78 | .ipynb_checkpoints
 79 | 
 80 | # IPython
 81 | profile_default/
 82 | ipython_config.py
 83 | 
 84 | # pyenv
 85 | #   For a library or package, you might want to ignore these files since the code is
 86 | #   intended to run in multiple environments; otherwise, check them in:
 87 | # .python-version
 88 | 
 89 | # pipenv
 90 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 91 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 92 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 93 | #   install all needed dependencies.
 94 | #Pipfile.lock
 95 | 
 96 | # poetry
 97 | #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
 98 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
 99 | #   commonly ignored for libraries.
100 | #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
101 | #poetry.lock
102 | 
103 | # pdm
104 | #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
105 | #pdm.lock
106 | #   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
107 | #   in version control.
108 | #   https://pdm.fming.dev/#use-with-ide
109 | .pdm.toml
110 | 
111 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
112 | __pypackages__/
113 | 
114 | # Celery stuff
115 | celerybeat-schedule
116 | celerybeat.pid
117 | 
118 | # SageMath parsed files
119 | *.sage.py
120 | 
121 | # Environments
122 | .env
123 | .venv
124 | env/
125 | venv/
126 | ENV/
127 | env.bak/
128 | venv.bak/
129 | 
130 | # Spyder project settings
131 | .spyderproject
132 | .spyproject
133 | 
134 | # Rope project settings
135 | .ropeproject
136 | 
137 | # mkdocs documentation
138 | /site
139 | 
140 | # mypy
141 | .mypy_cache/
142 | .dmypy.json
143 | dmypy.json
144 | 
145 | # Pyre type checker
146 | .pyre/
147 | 
148 | # pytype static type analyzer
149 | .pytype/
150 | 
151 | # Cython debug symbols
152 | cython_debug/
153 | 
154 | # PyCharm
155 | #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
156 | #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
157 | #  and can be added to the global gitignore or merged into this file.  For a more nuclear
158 | #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
159 | #.idea/
160 | 
161 | project.lock
162 | indices
163 | raw
164 | training
165 | wandb
166 | cache
167 | cleaned
168 | .DS_Store
169 | .idea
170 | .task
171 | tmp
172 | data/annot/active-learn.jsonl
173 | data/annot/second-opinion.jsonl
174 | configs/*/*.spacy
175 | *.ipynb


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | # `install` names an action, not a file: declare it phony so the recipe
2 | # still runs even if a file or directory called "install" exists.
3 | .PHONY: install
4 | 
5 | # Install build requirements, this package in editable mode, and the small English spaCy model.
6 | install:
7 | 	python -m pip install --upgrade pip
8 | 	python -m pip install -r requirements-build.txt
9 | 	python -m pip install -e .
10 | 	python -m spacy download en_core_web_sm
11 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Arxiv Frontpage 
 2 | 
 3 | Today's frontpage can be viewed here:
 4 | 
 5 | https://koaning.github.io/arxiv-frontpage/
 6 | 
 7 | ## What's this? 
 8 | 
 9 | This project is an attempt at making my own frontpage of Arxiv. Every day this project does [git-scraping](https://simonwillison.net/2020/Oct/9/git-scraping/) on new Arxiv articles via [this Python API](https://pypi.org/project/arxiv/). Then, another cronjob runs a script that attempts to make recommendations based on annotations that reside in this repo. This is then committed as a new `index.html` page which is hosted by Github pages.
10 | 
11 | This project is very much a personal one and may certainly see a bunch of changes in the future. But I figured it would be nice to host it publicly so that it may inspire other folks to make their own feed as well. 
12 | 
13 | ## Contents 
14 | 
15 | - There is a `config.yml` file that contains definitions of the labels. All scripts will make assumptions based on the contents of this file. 
16 | - There is a [taskfile](https://taskfile.dev/) that contains some common commands. 
17 | - There is a `.github` folder that contains all the cronjobs.
18 | - There is a `frontpage` Python module that contains all the logic to prepare data for annotation, to train sentence-models and to build the new site. 
19 | - There are two `benchmark*.ipynb` files that contain some scripts that I've used to run benchmarks. Some attempts were done with LLMs via `spacy-llm`, while others were done with [pretrained embeddings](https://github.com/koaning/embetter).
20 | - This project assumes a `.env` file, which you can use if you intend to use weights and biases to store custom sentence transformers or use external embedding providers.
21 | 
22 | ## Notes 
23 | 
24 | This project also explores how to pragmatically bootstrap a predictive project. There are a few things in particular worth highlighting that feel somewhat unique. 
25 | 
26 | First off, instead of active learning this project assumes active teaching. There are multiple methods available to select a subset of interest which help the user steer the algorithm. 
27 | 
28 | ![](/images/active-teaching.png)
29 | 
30 | If you want to explore the options, you can run:
31 | 
32 | ```
33 | python -m frontpage annotate
34 | ```
35 | 
36 | This will give a menu that you can use to select the subset selection method. You can annotate on a sentence-level or abstract-level and select from a number of tricks to find an interesting subset. 
37 | 
38 | In terms of modelling, this project employs a sentence-model that makes a prediction per sentence. It's possibly not _the_ most performant modelling approach, but it is easy to interpret. It also helps make the model more understandable in the UI.
39 | 
40 | ![](/images/sentence-model.png)
41 | 
42 | This sentence-model can be trained by first finetuning the embedding by using a [setfit](https://github.com/huggingface/setfit)-like approach. We first use all the labels to finetune a new embedding layer, after which we train a classifier head for each label.
43 | 
44 | ![](/images/multiheads.png)
45 | 
46 | I ended up writing custom implementations for a lot of this because my annotations don't really fit the "multi-label classification" setting. Some sentences may have two labels, but many will only have a single one. That means that my label matrix should have lots of `nan`-values, which goes against the assumptions of many libraries out there. 
47 | 
48 | ![](/images/why-custom.png)
49 | 


--------------------------------------------------------------------------------
/config.yml:
--------------------------------------------------------------------------------
 1 | sections:
 2 |   - name: "New Datasets"
 3 |     instructions: | 
 4 |       This label represents arxiv articles that discuss the release of a new dataset. 
 5 |       When annotating, it's important that the sentence in question clearly confirms,
 6 |       without a doubt, that the article is about a new dataset or corpus. New benchmarks
 7 |       on an existing dataset are not of interest.
 8 |     label: "new-dataset"
 9 |     threshold: 0.7
10 |   - name: "Data Quality"
11 |     instructions: |
12 |       This label represents arxiv articles that discuss data quality issues in existing 
13 |       datasets. It may also relate to annotation issues.
14 |     label: "data-quality"
15 |     threshold: 0.6
16 |   - name: "Benchmarks"
17 |     instructions: |
18 |       This label represents the discussion of a benchmark.
19 |     label: "benchmark"
20 |     threshold: 0.6
21 |   - name: "LLMs"
22 |     instructions: |
23 |       This label represents the discussion of large language models.
24 |     label: "llms"
25 |     threshold: 0.6
26 |   - name: "Developer Research"
27 |     instructions: |
28 |       This label represents arxiv articles that discuss research on how to be a developer. 
29 |       Mainly targeted towards quantitative research.
30 |     label: "dev-research"
31 |     threshold: 0.6
32 |   - name: "Data Annotation Techniques"
33 |     instructions: |
34 |       This label represents research related to data annotation. Could be annotator agreement
35 |       stuff, interfaces, annotation processes ... that stuff!
36 |     label: "data-annotation"
37 |     threshold: 0.6


--------------------------------------------------------------------------------
/configs/examples.json:
--------------------------------------------------------------------------------
 1 | [
 2 |   {
 3 |     "text": "We will release the dataset and code to facilitate future endeavors.",
 4 |     "answer": "DATASET"
 5 |   },
 6 |   {
 7 |     "text": "In this paper we introduce a new exciting corpus for machine learning",
 8 |     "answer": "DATASET"
 9 |   },
10 |   {
11 |     "text": "This benchmark demonstrates state of the art performance on five datasets.",
12 |     "answer": "==NONE=="
13 |   }
14 | ]


--------------------------------------------------------------------------------
/configs/spacy-llm-cohere-lg.cfg:
--------------------------------------------------------------------------------
 1 | [nlp]
 2 | lang = "en"
 3 | pipeline = ["llm"]
 4 | 
 5 | [components]
 6 | 
 7 | [components.llm]
 8 | factory = "llm"
 9 | save_io = True 
10 | 
11 | [components.llm.task]
12 | @llm_tasks = "spacy.TextCat.v3"
13 | labels = ["DATASET"]
14 | 
15 | [components.llm.task.label_definitions]
16 | DATASET = "Clear indication that the topic of the abstract is a new dataset"
17 | 
18 | [components.llm.model]
19 | @llm_models = "spacy.Command.v1"
20 | name = "command"
21 | config = {"temperature": 0.3}
22 | 
23 | [components.llm.task.examples]
24 | @misc = "spacy.FewShotReader.v1"
25 | path = "configs/examples.json"
26 | 
27 | [components.llm.cache]
28 | @llm_misc = "spacy.BatchCache.v1"
29 | path = "configs/cohere-cache-lg"
30 | batch_size = 3
31 | max_batches_in_mem = 4
32 | 


--------------------------------------------------------------------------------
/configs/spacy-llm-cohere-md.cfg:
--------------------------------------------------------------------------------
 1 | [nlp]
 2 | lang = "en"
 3 | pipeline = ["llm"]
 4 | 
 5 | [components]
 6 | 
 7 | [components.llm]
 8 | factory = "llm"
 9 | save_io = True 
10 | 
11 | [components.llm.task]
12 | @llm_tasks = "spacy.TextCat.v3"
13 | labels = ["DATASET"]
14 | 
15 | [components.llm.task.label_definitions]
16 | DATASET = "Clear indication that the topic of the abstract is a new dataset"
17 | 
18 | [components.llm.model]
19 | @llm_models = "spacy.Command.v1"
20 | name = "command"
21 | config = {"temperature": 0.3}
22 | 
23 | [components.llm.cache]
24 | @llm_misc = "spacy.BatchCache.v1"
25 | path = "configs/cohere-cache-md"
26 | batch_size = 3
27 | max_batches_in_mem = 4
28 | 


--------------------------------------------------------------------------------
/configs/spacy-llm-cohere-sm.cfg:
--------------------------------------------------------------------------------
 1 | [nlp]
 2 | lang = "en"
 3 | pipeline = ["llm"]
 4 | 
 5 | [components]
 6 | 
 7 | [components.llm]
 8 | factory = "llm"
 9 | save_io = True 
10 | 
11 | [components.llm.task]
12 | @llm_tasks = "spacy.TextCat.v3"
13 | labels = ["DATASET"]
14 | 
15 | [components.llm.model]
16 | @llm_models = "spacy.Command.v1"
17 | name = "command"
18 | config = {"temperature": 0.3}
19 | 
20 | [components.llm.cache]
21 | @llm_misc = "spacy.BatchCache.v1"
22 | path = "configs/cohere-cache-sm"
23 | batch_size = 3
24 | max_batches_in_mem = 4
25 | 


--------------------------------------------------------------------------------
/configs/spacy-llm-openai-lg.cfg:
--------------------------------------------------------------------------------
 1 | [nlp]
 2 | lang = "en"
 3 | pipeline = ["llm"]
 4 | 
 5 | [components]
 6 | 
 7 | [components.llm]
 8 | factory = "llm"
 9 | save_io = True
10 | 
11 | [components.llm.task]
12 | @llm_tasks = "spacy.TextCat.v3"
13 | labels = ["DATASET"]
14 | 
15 | [components.llm.task.label_definitions]
16 | DATASET = "Clear indication that the topic of the abstract is a new dataset"
17 | 
18 | [components.llm.task.examples]
19 | @misc = "spacy.FewShotReader.v1"
20 | path = "configs/examples.json"
21 | 
22 | [components.llm.model]
23 | @llm_models = "spacy.GPT-3-5.v1"
24 | config = {"temperature": 0.3}
25 | 
26 | [components.llm.cache]
27 | @llm_misc = "spacy.BatchCache.v1"
28 | path = "configs/openai-cache-lg"
29 | batch_size = 3
30 | max_batches_in_mem = 4
31 | 


--------------------------------------------------------------------------------
/configs/spacy-llm-openai-md.cfg:
--------------------------------------------------------------------------------
 1 | [nlp]
 2 | lang = "en"
 3 | pipeline = ["llm"]
 4 | 
 5 | [components]
 6 | 
 7 | [components.llm]
 8 | factory = "llm"
 9 | save_io = True 
10 | 
11 | [components.llm.task]
12 | @llm_tasks = "spacy.TextCat.v3"
13 | labels = ["DATASET"]
14 | 
15 | [components.llm.task.label_definitions]
16 | DATASET = "Clear indication that the topic of the abstract is a new dataset"
17 | 
18 | [components.llm.model]
19 | @llm_models = "spacy.GPT-3-5.v1"
20 | config = {"temperature": 0.3}
21 | 
22 | [components.llm.cache]
23 | @llm_misc = "spacy.BatchCache.v1"
24 | path = "configs/openai-cache-md"
25 | batch_size = 3
26 | max_batches_in_mem = 4
27 | 


--------------------------------------------------------------------------------
/configs/spacy-llm-openai-sm.cfg:
--------------------------------------------------------------------------------
 1 | [nlp]
 2 | lang = "en"
 3 | pipeline = ["llm"]
 4 | 
 5 | [components]
 6 | 
 7 | [components.llm]
 8 | factory = "llm"
 9 | save_io = True
10 | 
11 | [components.llm.task]
12 | @llm_tasks = "spacy.TextCat.v3"
13 | labels = ["DATASET"]
14 | 
15 | [components.llm.model]
16 | @llm_models = "spacy.GPT-3-5.v1"
17 | config = {"temperature": 0.3}
18 | 
19 | [components.llm.cache]
20 | @llm_misc = "spacy.BatchCache.v1"
21 | path = "configs/openai-cache-sm"
22 | batch_size = 3
23 | max_batches_in_mem = 4
24 | 


--------------------------------------------------------------------------------
/configs/spacy-llm-openai-xl.cfg:
--------------------------------------------------------------------------------
 1 | [nlp]
 2 | lang = "en"
 3 | pipeline = ["llm"]
 4 | 
 5 | [components]
 6 | 
 7 | [components.llm]
 8 | factory = "llm"
 9 | save_io = True
10 | 
11 | [components.llm.task]
12 | @llm_tasks = "spacy.TextCat.v3"
13 | labels = ["DATASET"]
14 | 
15 | [components.llm.task.label_definitions]
16 | DATASET = "Clear indication that the topic of the abstract is a new dataset"
17 | 
18 | [components.llm.task.examples]
19 | @misc = "spacy.FewShotReader.v1"
20 | path = "configs/examples.json"
21 | 
22 | [components.llm.model]
23 | @llm_models = "spacy.GPT-4.v1"
24 | config = {"temperature": 0.3}
25 | 
26 | [components.llm.cache]
27 | @llm_misc = "spacy.BatchCache.v1"
28 | path = "configs/openai-cache-xl"
29 | batch_size = 3
30 | max_batches_in_mem = 4
31 | 


--------------------------------------------------------------------------------
/configs/spacy-llm-openai-xxl.cfg:
--------------------------------------------------------------------------------
 1 | [nlp]
 2 | lang = "en"
 3 | pipeline = ["llm"]
 4 | 
 5 | [components]
 6 | 
 7 | [components.llm]
 8 | factory = "llm"
 9 | save_io = True
10 | 
11 | [components.llm.task]
12 | @llm_tasks = "spacy.TextCat.v3"
13 | labels = ["DATASET"]
14 | 
15 | [components.llm.task.label_definitions]
16 | DATASET = "Clear indication that the topic of the abstract is a new dataset. It can be fine if the text mentions a benchmark, but only if it is clear that it is applied on a NEW dataset."
17 | 
18 | [components.llm.task.examples]
19 | @misc = "spacy.FewShotReader.v1"
20 | path = "configs/examples.json"
21 | 
22 | [components.llm.model]
23 | @llm_models = "spacy.GPT-4.v1"
24 | config = {"temperature": 0.3}
25 | 
26 | [components.llm.cache]
27 | @llm_misc = "spacy.BatchCache.v1"
28 | path = "configs/openai-cache-xxl"
29 | batch_size = 3
30 | max_batches_in_mem = 4
31 | 


--------------------------------------------------------------------------------
/data/annot/benchmark.jsonl:
--------------------------------------------------------------------------------
 1 | {"text":"We will release the dataset and code to facilitate future endeavors.","cats":{"new-dataset":1,"benchmark":0}}
 2 | {"text":"We release our dataset for others to use and build on.","cats":{"new-dataset":1,"benchmark":0}}
 3 | {"text":"We release the generated dataset and used prompts to facilitate future research.","cats":{"new-dataset":1,"benchmark":0}}
 4 | {"text":"These datasets included the latest second and third generation deepfake datasets.","cats":{"new-dataset":0,"benchmark":0}}
 5 | {"text":"The real-world datasets will be released.","cats":{"new-dataset":1,"benchmark":0}}
 6 | {"text":"Experimental results on multiple benchmark datasets demonstrate the effectiveness of our method.","cats":{"new-dataset":0,"benchmark":1}}
 7 | {"text":"We perform an exhaustive evaluation in two benchmark datasets.","cats":{"new-dataset":0,"benchmark":1}}
 8 | {"text":"We conduct experiments on two benchmark datasets.","cats":{"new-dataset":0,"benchmark":1}}
 9 | {"text":"Extensive experiments conducted on two benchmark datasets show that our approach achieves excellent performance compared to its competitors.","cats":{"new-dataset":0,"benchmark":1}}
10 | {"text":"We validate our scheme with some of the most popular benchmarking datasets.","cats":{"new-dataset":0,"benchmark":1}}
11 | {"text":"Our results improve the state-of-the-art on standard benchmarks.","cats":{"new-dataset":0,"benchmark":1}}
12 | {"text":"In addition, we provide extra annotations for used datasets and introduce our new benchmark.","cats":{"new-dataset":1,"benchmark":1}}
13 | {"text":"We then describe the dataset and the results of benchmarking.","cats":{"new-dataset":0,"benchmark":1}}
14 | {"text":"We finally conduct extensive analyses to understand the effectiveness of our method.","cats":{"new-dataset":0,"benchmark":1}}
15 | {"text":"Our method is effective and presents a significant improvement over the original model.","cats":{"new-dataset":0,"benchmark":1}}
16 | {"text":"Additionally, we employ the self-training strategy to improve the performance of our method further.","cats":{"new-dataset":0,"benchmark":0}}
17 | {"text":"Compared to a variety of baselines, our method achieves superior results.","cats":{"new-dataset":0,"data-quality":0,"benchmark":1}}
18 | {"text":"In order to implement the pretraining phase, we curated an expansive tabular dataset comprising approximately 13 billion samples, meticulously gathered from the Kaggle platform.","cats":{"new-dataset":1,"benchmark":0}}
19 | {"text":"Models and the dataset shall be released at https://github.com/OpenGVLab/All-Seeing, and demo can be seen at https://huggingface.co/spaces/OpenGVLab/all-seeing.","cats":{"new-dataset":0,"benchmark":0}}
20 | {"text":"It covers a wide range of 3.5 million common and rare concepts in the real world, and has 132.2 billion tokens that describe the concepts and their attributes.","cats":{"new-dataset":0,"benchmark":0}}
21 | {"text":"Leveraging this new dataset, we develop the All-Seeing model (ASM), a unified framework for panoptic visual recognition and understanding.","cats":{"new-dataset":0,"benchmark":0}}
22 | {"text":"We hope that this project can serve as a foundation for vision-language artificial general intelligence research.","cats":{"new-dataset":0,"benchmark":0}}
23 | {"text":"Extensive experiments demonstrate that the proposed method can surpass all baselines by a large margin.","cats":{"benchmark":1}}
24 | {"text":"The experimental results show that our method strongly outperforms the baselines.","cats":{"benchmark":1}}
25 | {"text":"Empirical results demonstrate the superiority of our method over other baselines.","cats":{"benchmark":1}}
26 | {"text":"We provide results of standard baseline methods.","cats":{"benchmark":1}}
27 | {"text":"Our method outperforms baselines in most tasks by a large margin.","cats":{"benchmark":1}}
28 | {"text":"Thorough comparisons with multiple baseline methods illustrate the strengths of our proposed methods.","cats":{"benchmark":1}}
29 | {"text":"We propose two specific methods and compare them with a baseline method.","cats":{"benchmark":1}}
30 | {"text":"Experimental results show that the method outperforms the baseline.","cats":{"benchmark":1}}
31 | {"text":"Experimental results show that our proposed method can achieve the accuracy@1 of 88.9\\%, which significantly outperforms other baselines by a large margin.","cats":{"benchmark":1}}
32 | {"text":"We benchmark our method as well as several state-of-the-art baselines and demonstrate the effectiveness of the proposed approach.","cats":{"benchmark":1}}
33 | {"text":"Some methods have achieved better results than baseline methods, and the winning methods have demonstrated superior prediction performance.","cats":{"benchmark":1}}
34 | {"text":"To facilitate research reuse, we release our code, trained model weights, and high quality pseudo-labels for the Argoverse 2 and Waymo Open datasets.","cats":{"benchmark":0}}
35 | {"text":"We present the All-Seeing (AS) project: a large-scale data and model for recognizing and understanding everything in the open world.","cats":{"benchmark":0}}
36 | {"text":"The model is trained with open-ended language prompts and locations, which allows it to generalize to various vision and language tasks with remarkable zero-shot performance, including region-text retrieval, region recognition, captioning, and question-answering.","cats":{"benchmark":0}}
37 | {"text":"Large language models (LLMs) have revolutionized NLP by solving downstream tasks with little to no labeled data.","cats":{"benchmark":0}}
38 | {"text":"Despite their versatile abilities, the larger question of their ability to reason remains ill-understood.","cats":{"benchmark":0}}
39 | {"text":"This paper addresses reasoning in math word problems (MWPs) by studying symbolic versions of the numeric problems, since a symbolic expression is a \"concise explanation\" of the numeric answer.","cats":{"benchmark":0}}
40 | {"text":"Our benchmark is available at https://github.com/FudanSELab/ClassEval.","cats":{"benchmark":1}}
41 | {"text":"Our method has attained better classification accuracy over existing methods with notable margins.","cats":{"benchmark":1}}
42 | {"text":"Returned results, show a decent performance of the proposed algorithm (99 % accuracy) in comparison with others.","cats":{"benchmark":1}}
43 | {"text":"For example, compared with several other related methods, UCDFormer improves performance on the Kappa coefficient by more than 12\\%.","cats":{"benchmark":1}}
44 | {"text":"Numerical results show the superiority of the proposed algorithm over state-of-the-art methods.","cats":{"benchmark":1}}
45 | {"text":"Our results reveal both limitations and promising aspects of adapted KGE methods.","cats":{"benchmark":1}}
46 | {"text":"Several numerical results are presented to illustrate the effectiveness of the proposed methodologies.","cats":{"benchmark":1}}
47 | {"text":"Our method achieves substantial improvements of +6% and","cats":{"benchmark":1}}
48 | {"text":"This is a challenging task in which two popular neural network baselines fail.","cats":{"benchmark":1}}
49 | {"text":"On a randomly selected and manually labeled 200 online reviews, CLAA achieved 92% accuracy while the SOTA baseline achieved 81.5%.","cats":{"benchmark":1}}
50 | {"text":"Existing methods under this perspective are also reviewed.   ","cats":{"benchmark":1}}
51 | {"text":"We demonstrate the capabilities of our approach on 11 different benchmarks.","cats":{"benchmark":1}}
52 | {"text":"We evaluate our method on a newly proposed benchmark.","cats":{"benchmark":1}}
53 | {"text":"Comprehensive experiments indicate that our method achieves state-of-the-art performance on widely-used benchmarks.","cats":{"benchmark":1}}
54 | {"text":"We give preliminary evidence suggesting the viability of the approach on a micro-benchmark.","cats":{"benchmark":1}}
55 | {"text":"Experimental results illustrate the effectiveness of our approach, where state-of-the-art performance is achieved on public benchmarks.","cats":{"benchmark":1}}
56 | {"text":"Extensive experiments on benchmark datasets demonstrate the effectiveness of our proposed method.","cats":{"benchmark":1}}
57 | {"text":"We evaluate our method on a wide range of benchmarks in different scales.","cats":{"benchmark":1}}
58 | {"text":"All benchmarks and all raw results are available1 for further analysis.","cats":{"benchmark":1}}
59 | {"text":"The performance of the algorithm is shown for a well-known benchmark.","cats":{"benchmark":1}}
60 | {"text":"Experiments on three benchmarks demonstrate the effectiveness of our method.","cats":{"benchmark":1}}
61 | {"text":"In the experiments, our framework achieves state-of-the-art results on several main benchmarks.","cats":{"benchmark":1}}
62 | {"text":"The results ascertain the efficacy of our technique.","cats":{"benchmark":1}}
63 | {"text":"Yet, much remains to be understood about how best to develop these techniques.","cats":{"benchmark":0}}
64 | {"text":"Method.","cats":{"benchmark":0}}
65 | {"text":"Methods.","cats":{"benchmark":0}}
66 | {"text":"Experiments demonstrate that the proposed method outperforms other methods.","cats":{"benchmark":1}}
67 | {"text":"METHODS:","cats":{"benchmark":0}}
68 | {"text":"A majority of our experiments were toward optimizing this technique, ensuring a proper representation of the technique's potential, since many of the details were new questions.","cats":{"benchmark":0}}
69 | {"text":"The results demonstrate a significant improvement over previous methods.","cats":{"benchmark":1}}
70 | 


--------------------------------------------------------------------------------
/data/annot/data-annotation.jsonl:
--------------------------------------------------------------------------------
1 | {"text":"Our experiments show that this approach consistently improves inter-annotator agreement and annotation accuracy.","cats":{"data-quality":1,"data-annotation":1}}
2 | {"text":"We advocate for the use of IAA in predicting the labeling quality of individual annotators, leading to cost and time efficiency in data production.","cats":{"data-quality":1,"data-annotation":1}}
3 | {"text":"This paper presents a novel approach of leveraging Inter-Annotator Agreement (IAA), traditionally used for assessing labeling consistency, to optimize Data Management Operations (DMOps).","cats":{"data-quality":1,"data-annotation":0}}
4 | {"text":"We propose and evaluate an additional application of our method leading to the detection of annotation errors.","cats":{"data-quality":1,"data-annotation":1}}
5 | {"text":"The extensive experimental results validate the efficacy of the proposed data annotation pipeline.","cats":{"data-annotation":0}}
6 | 


--------------------------------------------------------------------------------
/data/annot/llm.jsonl:
--------------------------------------------------------------------------------
 1 | {"text":"For stand-alone TPR execution, we perform both automatic and human evaluations on a fine-tuned T5 model, as well as OpenAI's GPT-3 LLMs.","cats":{"llm":1}}
 2 | {"text":"In this paper, we report on our investigation of an early version of GPT-4, when it was still in active development by OpenAI.","cats":{"llm":1}}
 3 | {"text":"GPT-3.5 and GPT-4 are the two most widely used large language model (LLM) services.","cats":{"llm":1}}
 4 | {"text":"The framework includes state-of-the-art open-access LLMs such as LLaMA, BLOOM, OPT, and GPT-J, as well as widely used adapters such as Series adapter, Parallel adapter, and LoRA.","cats":{"llm":1}}
 5 | {"text":"We investigate seven versions of GPT models, including ChatGPT.","cats":{"llm":1}}
 6 | {"text":"After deploying representative open-source LLMs (e.g., GPT-2-base and LLaMA model) at the edge and the cloud, we present the feasibility of NetGPT on the basis of low-rank adaptation-based light-weight fine-tuning.","cats":{"llm":1}}
 7 | {"text":"The latest model developed by OpenAI, GPT-4, was trained using an unprecedented scale of compute and data.","cats":{"llm":1}}
 8 | {"text":"Interestingly, despite being introduced four years ago, T5-based LLMs, such as FLAN-T5, continue to outperform the latest decoder-based LLMs, such as LLAMA and VICUNA, on tasks that require general problem-solving skills.","cats":{"llm":1}}
 9 | {"text":"We report on first experiments using the popular LLM GPT-3 and deliver some promising results.","cats":{"llm":1}}
10 | {"text":"Our model is a GPT2-like architecture with 350m parameters.","cats":{"llm":1}}
11 | {"text":"ChatGPT, developed by OpenAI, is one of the milestone large language models (LLMs) with 6 billion parameters.","cats":{"llm":1}}
12 | {"text":"We demonstrate that the proposed model consistently outperforms the baselines.","cats":{"llm":0}}
13 | {"text":"We validate our method on five datasets, empirically demonstrating that it outperforms the baseline methods in most cases and is valid over a wider range of training budgets.","cats":{"llm":0}}
14 | {"text":"Our empirical experiments show that \\autoknow~outperforms strong baselines by a significant margin on all datasets.","cats":{"llm":0}}
15 | {"text":"Empirically, our method achieves better performance than all baselines on multiple datasets.","cats":{"llm":0}}
16 | {"text":"Extensive experiments on six datasets show substantial improvements to the baseline.","cats":{"llm":0}}
17 | {"text":"Experiments on two datasets demonstrate that our proposed method outperforms the baselines and achieves new state-of-the-art performance.","cats":{"llm":0}}
18 | {"text":"Extensive experiments demonstrate that the proposed method can surpass all baselines by a large margin.","cats":{"llm":0,"benchmark":1}}
19 | 


--------------------------------------------------------------------------------
/data/annot/llms.jsonl:
--------------------------------------------------------------------------------
 1 | {"text":"Moreover, we find that LLMs might not be a fair judge if different LLMs are used for agents.","cats":{"llms":1}}
 2 | {"text":"To enable further research on PEFT methods of LLMs, this paper presents LLM-Adapters, an easy-to-use framework that integrates various adapters into LLMs and can execute these adapter-based PEFT methods of LLMs for different tasks.","cats":{"llms":1}}
 3 | {"text":"Our methods successfully identify families of LLMs and accurately cluster LLMs into meaningful subgroups.","cats":{"llms":1}}
 4 | {"text":"This paper introduces a novel human-LLM interaction framework, Low-code LLM.","cats":{"llms":1}}
 5 | {"text":"Consequently, evaluating the safety of LLMs has become an essential task for facilitating the broad applications of LLMs.","cats":{"llms":1}}
 6 | {"text":"Vision-LLMs instead post-hoc condition LLMs to `understand' the output of an image encoder.","cats":{"llms":1}}
 7 | {"text":"However, do LLMs explain themselves?","cats":{"llms":1}}
 8 | {"text":"We analyze how the proportion of LLM papers is increasing; the LLM-related topics receiving the most attention; the authors writing LLM papers; how authors' research topics correlate with their backgrounds; the factors distinguishing highly cited LLM papers; and the patterns of international collaboration.","cats":{"llms":1}}
 9 | {"text":"Previous methods, which primarily rely on model logits, have become less suitable for LLMs and even infeasible with the rise of closed-source LLMs (e.g., commercialized LLM APIs).","cats":{"llms":1}}
10 | {"text":"Instead of using public APIs of LLMs, we instruction tune an open-source LLM (3B Flan-T5-XL), in order to better adapt LLMs to recommender systems.","cats":{"llms":1}}
11 | {"text":"We believe SafetyBench will enable fast and comprehensive evaluation of LLMs' safety, and foster the development of safer LLMs.","cats":{"llms":1}}
12 | {"text":"With the emergence of large language models (LLMs), researchers have explored LLMs' potential as alternatives for human evaluation.","cats":{"llms":1}}
13 | {"text":"Yet, the two LLMs are closed source, and little is known about the LLMs' performance in real-world use cases.","cats":{"llms":1}}
14 | {"text":"This workshop will demonstrate the capabilities of LLMs to help attendees evaluate whether and how LLMs might be integrated into their pedagogy and research.","cats":{"llms":1}}
15 | {"text":"We reflect on human and LLMs' different sensitivities to instructions, stress the importance of enabling human-facing safeguards for LLMs, and discuss the potential of training humans and LLMs with complementary skill sets.","cats":{"llms":1}}
16 | {"text":"Based on our analysis of these LLMs, we claim that the average and peak power utilization in LLM clusters for inference should not be very high.","cats":{"llms":1}}
17 | {"text":"While uses of LLMs for CODL are valuable standalone, they are particularly valuable as part of LLM applications such as AI chatbots.","cats":{"llms":1}}
18 | {"text":"This work highlights the challenges and opportunities of discourse modeling for LLMs, which we hope can inspire the future design and evaluation of LLMs.","cats":{"llms":1}}
19 | {"text":"This paper proposes a LLM-Augmenter system, which augments a black-box LLM with a set of plug-and-play modules.","cats":{"llms":1}}
20 | {"text":"However, there is no comprehensive index of LLMs available.","cats":{"llms":1}}
21 | {"text":"However, it is not yet known the performance of LLMs on CLS.","cats":{"llms":1}}
22 | {"text":"We present ImageBind-LLM, a multi-modality instruction tuning method of large language models (LLMs) via ImageBind.","cats":{"llms":1}}
23 | {"text":"To bridge this gap, we propose LLMRec, a LLM-based recommender system designed for benchmarking LLMs on various recommendation tasks.","cats":{"llms":1}}
24 | {"text":"Given the huge influx of LLMs, it is of interest to know which LLM backbones, settings, training methods, and families are popular or trending.","cats":{"llms":1}}
25 | {"text":"Our aim is to offer invaluable insights to researchers in the realm of LLMs evaluation, thereby aiding the development of more proficient LLMs.","cats":{"llms":1}}
26 | {"text":"KGLLM provides a solution to enhance LLMs' factual reasoning ability, opening up new avenues for LLM research.","cats":{"llms":1}}
27 | {"text":"Among the regression models, Regularized Linear Regression was the most accurate for estimating MP a, and Polynomial Regression was the most accurate for estimating MP b.","cats":{"llms":0}}
28 | {"text":"We analyze the bit complexity of efficient algorithms for fundamental optimization problems, such as linear regression, $p$-norm regression, and linear programming (LP).","cats":{"llms":0}}
29 | {"text":"We study the problem of regression in a generalized linear model (GLM) with multiple signals and latent variables.","cats":{"llms":0}}
30 | {"text":"In this study we introduce a new online linear regression approach.","cats":{"llms":0}}
31 | {"text":"The linear regression model is trained using SGD, tracking weights and loss separately and zipping them finally.","cats":{"llms":0}}
32 | {"text":"The theoretical results are validated by numerical simulations for mixed linear regression, max-affine regression, and mixture-of-experts.","cats":{"llms":0}}
33 | {"text":"We input those features into linear regression models to infer 5 ECAS sub-scores and the total score.","cats":{"llms":0}}
34 | {"text":"This research enhances linear regression models by integrating a Kalman filter and analysing curve areas to minimize loss.","cats":{"llms":0}}
35 | 


--------------------------------------------------------------------------------
/data/annot/prompt-eng.jsonl:
--------------------------------------------------------------------------------
 1 | {"text":"To tackle these issues, we introduce DialogStudio: the largest and most diverse collection of dialogue datasets, unified under a consistent format while preserving their original information.","cats":{"new-dataset":1,"prompt-eng":0}}
 2 | {"text":"Our collection encompasses data from open-domain dialogues, task-oriented dialogues, natural language understanding, conversational recommendation, dialogue summarization, and knowledge-grounded dialogues, making it an incredibly rich and diverse resource for dialogue research and model training.","cats":{"new-dataset":1,"prompt-eng":0}}
 3 | {"text":"Large Language Models (LLMs) are popular for their impressive abilities, but the need for model-specific fine-tuning or task-specific prompt engineering can hinder their generalization.","cats":{"prompt-eng":1}}
 4 | {"text":"The specific assignment prompted students to define and explain their career goals as engineers.","cats":{"prompt-eng":0}}
 5 | {"text":"Prompt engineering typically requires hand-crafting a set of prompts for individual downstream tasks.","cats":{"prompt-eng":1}}
 6 | {"text":"It turns out that the key challenge lies in designing the most effective prompt for the LLM, a task called prompt engineering.","cats":{"prompt-eng":1}}
 7 | {"text":"Specifically, we add a set of ``task prompts'', each corresponding to a different task, and let each prompt predict task-related annotations.","cats":{"prompt-eng":1}}
 8 | {"text":"To accomplish this task, we employ prompt engineering, a technique that involves designing prompts to guide the LLMs towards the desired output.","cats":{"prompt-eng":1}}
 9 | {"text":"The context of our task leverages a generative model as an IR engine to evaluate the prompts' performance on image retrieval tasks.","cats":{"prompt-eng":1}}
10 | {"text":"Specifically, we first perform supervised fine-tuning with a pretrained language model on a small collection of manually engineered prompts.","cats":{"prompt-eng":1}}
11 | {"text":"We first devise a learnable universal prompt to describe the correlations among all tasks and then convert this prompt and image features into a task-specific prompt, which is fed to the decoder as a part of its input.","cats":{"prompt-eng":1}}
12 | {"text":"Our studies offer a deeper understanding of prompt engineering thereby opening up avenues for research on the future of prompt engineering.","cats":{"prompt-eng":1}}
13 | {"text":"Designing suitable prompts for specific visual tasks has emerged as a meaningful research direction.","cats":{"prompt-eng":0}}
14 | {"text":"We design task-specific prompts, by either leveraging another large-scale model, or simply manipulating the special tokens in the default prompts.","cats":{"prompt-eng":1}}
15 | {"text":"We adopt a specific prompting approach to solving the ranking task by LLMs: we carefully design the prompting template by including the sequential interaction history, the candidate items, and the ranking instruction.","cats":{"prompt-eng":1}}
16 | {"text":"To be specific, we design a set of prompts to fine-tune the pre-trained image captioner.","cats":{"prompt-eng":1}}
17 | {"text":"However, these approaches are task-specific; designing algorithms for new tasks is a cumbersome process.","cats":{"prompt-eng":0}}
18 | {"text":"Therefore, no further task-specific reward design is needed.","cats":{"prompt-eng":0}}
19 | {"text":"In this report, we aim to further mine ChatGPT's translation ability by revisiting several aspects: temperature, task information, and domain information, and correspondingly propose two (simple but effective) prompts: Task-Specific Prompts (TSP) and Domain-Specific Prompts (DSP).","cats":{"prompt-eng":1}}
20 | {"text":"However, existing methods struggle with either computational complexity or model expressivity, rendering the maximum sequence length restricted.","cats":{"prompt-eng":0,"data-quality":0}}
21 | {"text":"LongNet has significant advantages: 1) it has a linear computation complexity and a logarithm dependency between tokens; 2) it can be served as a distributed trainer for extremely long sequences; 3) its dilated attention is a drop-in replacement for standard attention, which can be seamlessly integrated with the existing Transformer-based optimization.","cats":{"prompt-eng":0}}
22 | {"text":"In the case where the region is the entire infinite triangular grid, we prove that the existence of a solution can be solved with an algorithm of complexity $O(|X|^3)$ where $X$ is the set of input edges.","cats":{"prompt-eng":0}}
23 | {"text":"Please check out our webpage at https://sites.google.com/view/automatic-prompt-engineer.","cats":{"prompt-eng":1}}
24 | {"text":"Creating a high-quality prompt that consists of a subject and several modifiers can be time-consuming and costly.","cats":{"prompt-eng":1}}
25 | {"text":"We refer to this approach as ``Promptonomy'', since the prompts model task-related structure.","cats":{"prompt-eng":1}}
26 | {"text":"Overall, our approach provides a robust and fundamental theoretical framework for selecting simple and effective prompts.","cats":{"prompt-eng":1}}
27 | {"text":"To address this issue in prompt engineering, we propose a new and effective approach called Prompt Space.","cats":{"prompt-eng":1}}
28 | {"text":"Prompt-based language models have produced encouraging results in numerous applications, including Named Entity Recognition (NER) tasks.","cats":{"prompt-eng":1}}
29 | {"text":"We evaluate different prompt designs in zero- and few-shot settings and experiment with providing task definitions and detailed instructions to the model.","cats":{"prompt-eng":1}}
30 | {"text":"In our evaluation, we focus on zero-shot prompting, comparing four prompt variants in two modes, based on the availability of the reference.","cats":{"prompt-eng":1}}
31 | {"text":"Based on these findings, we also propose a method for generating prompts using only unlabeled data, outperforming strong baselines by an average of 7.0% accuracy across three tasks.","cats":{"prompt-eng":1}}
32 | {"text":"However, the strong performance of most available NER approaches is heavily dependent on the design of discrete prompts and a verbalizer to map the model-predicted outputs to entity categories, which are complicated undertakings.","cats":{"prompt-eng":1}}
33 | {"text":"Methodologically, PCR-Chain is backed up by the underlying global-level prompt architecture (which combines three design ideas: hierarchical task breakdown, prompt composition, and a mix of prompt-based AI and non-AI units) and the local-level prompt design.","cats":{"prompt-eng":1}}
34 | {"text":"Prompts are also a form of programming that can customize the outputs and interactions with an LLM.","cats":{"prompt-eng":0}}
35 | {"text":"We approach this problem by attempting to modify the predictions of a prompt, rather than the prompt itself.","cats":{"prompt-eng":1}}
36 | {"text":"Our user study evaluated and demonstrated the efficiency and correctness of Prompt Sapper.","cats":{"prompt-eng":1}}
37 | {"text":"In this work, we aim to automate this prompt engineering and improve zero-shot accuracy through prompt ensembling.","cats":{"prompt-eng":1}}
38 | {"text":"Instead of laborious human engineering, we propose prompt adaptation, a general framework that automatically adapts original user input to model-preferred prompts.","cats":{"prompt-eng":1}}
39 | {"text":"Recently, their generalizability has been further extended by incorporating trainable prompts, borrowed from the natural language processing literature.","cats":{"prompt-eng":1}}
40 | {"text":"Prompt Engineering has gained significant relevance in recent years, fueled by advancements in pre-trained and large language models.","cats":{"prompt-eng":0}}
41 | {"text":"Large language models that are capable of zero or few-shot prompting approaches have given rise to the new research area of prompt engineering.","cats":{"prompt-eng":0}}
42 | {"text":"Recently, prompt learning emerged as an NLP paradigm that can lead to more generalizable results without any (zero-shot) or few labeled samples (few-shot).","cats":{"prompt-eng":0}}
43 | {"text":"Extensive experiments show that PromptClass achieves overall better performance than existing strong baselines on four benchmark datasets and even achieves similar performance to fully-supervised classifiers on sentiment classification tasks.","cats":{"prompt-eng":1}}
44 | {"text":"Furthermore, it generates more detailed and comprehensible assessments than traditional text classification methods.","cats":{"prompt-eng":0}}
45 | {"text":"Despite advancements in conversational AI, language models encounter challenges to handle diverse conversational tasks, and existing dialogue dataset collections often lack diversity and comprehensiveness.","cats":{"prompt-eng":0}}
46 | {"text":"To further enhance the utility of DialogStudio, we identify the licenses for each dataset and design domain-aware prompts for selected dialogues to facilitate instruction-aware fine-tuning.","cats":{"prompt-eng":0}}
47 | {"text":"Furthermore, we develop conversational AI models using the dataset collection, and our experiments in both zero-shot and few-shot learning scenarios demonstrate the superiority of DialogStudio.","cats":{"prompt-eng":0}}
48 | {"text":"To improve transparency and support dataset and task-based research, as well as language model pre-training, all datasets, licenses, codes, and models associated with DialogStudio are made publicly accessible at https://github.com/salesforce/DialogStudio","cats":{"prompt-eng":0}}
49 | 


--------------------------------------------------------------------------------
/data/downloads/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koaning/arxiv-frontpage/260eaf2a63af68045367f0579161b6b414fc97ac/data/downloads/.gitkeep


--------------------------------------------------------------------------------
/data/downloads/2023-05-22.jsonl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koaning/arxiv-frontpage/260eaf2a63af68045367f0579161b6b414fc97ac/data/downloads/2023-05-22.jsonl


--------------------------------------------------------------------------------
/data/downloads/2023-06-10.jsonl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koaning/arxiv-frontpage/260eaf2a63af68045367f0579161b6b414fc97ac/data/downloads/2023-06-10.jsonl


--------------------------------------------------------------------------------
/data/downloads/2023-07-02-12h.jsonl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koaning/arxiv-frontpage/260eaf2a63af68045367f0579161b6b414fc97ac/data/downloads/2023-07-02-12h.jsonl


--------------------------------------------------------------------------------
/data/downloads/2023-07-03-12h.jsonl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koaning/arxiv-frontpage/260eaf2a63af68045367f0579161b6b414fc97ac/data/downloads/2023-07-03-12h.jsonl


--------------------------------------------------------------------------------
/data/downloads/2023-07-04-00h.jsonl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koaning/arxiv-frontpage/260eaf2a63af68045367f0579161b6b414fc97ac/data/downloads/2023-07-04-00h.jsonl


--------------------------------------------------------------------------------
/data/downloads/2023-07-04-12h.jsonl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koaning/arxiv-frontpage/260eaf2a63af68045367f0579161b6b414fc97ac/data/downloads/2023-07-04-12h.jsonl


--------------------------------------------------------------------------------
/data/downloads/2023-07-05-00h.jsonl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koaning/arxiv-frontpage/260eaf2a63af68045367f0579161b6b414fc97ac/data/downloads/2023-07-05-00h.jsonl


--------------------------------------------------------------------------------
/data/downloads/2023-07-16-12h.jsonl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koaning/arxiv-frontpage/260eaf2a63af68045367f0579161b6b414fc97ac/data/downloads/2023-07-16-12h.jsonl


--------------------------------------------------------------------------------
/data/downloads/2023-07-17-00h.jsonl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koaning/arxiv-frontpage/260eaf2a63af68045367f0579161b6b414fc97ac/data/downloads/2023-07-17-00h.jsonl


--------------------------------------------------------------------------------
/data/downloads/2023-07-17-12h.jsonl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koaning/arxiv-frontpage/260eaf2a63af68045367f0579161b6b414fc97ac/data/downloads/2023-07-17-12h.jsonl


--------------------------------------------------------------------------------
/data/downloads/2023-07-17-20h.jsonl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koaning/arxiv-frontpage/260eaf2a63af68045367f0579161b6b414fc97ac/data/downloads/2023-07-17-20h.jsonl


--------------------------------------------------------------------------------
/data/downloads/2023-08-10-00h.jsonl:
--------------------------------------------------------------------------------
1 | {"created":"2023-08-08 16:18:20","title":"Cumulative Reasoning with Large Language Models","abstract":"While language models are powerful and versatile, they often fail to address highly complex problems. This is because solving complex problems requires deliberate thinking, which has been only minimally guided during training. In this paper, we propose a new method called Cumulative Reasoning (CR), which employs language models in a cumulative and iterative manner to emulate human thought processes. By decomposing tasks into smaller components, CR streamlines the problem-solving process, rendering it both more manageable and effective. For logical inference tasks, CR consistently outperforms existing methods with an improvement up to 9.3%, and achieves the astonishing accuracy of 98.04% on the curated FOLIO wiki dataset. In the context of the Game of 24, CR achieves an accuracy of 94%, which signifies a substantial enhancement of 20% over the previous state-of-the-art method.","sentences":["While language models are powerful and versatile, they often fail to address highly complex problems.","This is because solving complex problems requires deliberate thinking, which has been only minimally guided during training.","In this paper, we propose a new method called Cumulative Reasoning (CR), which employs language models in a cumulative and iterative manner to emulate human thought processes.","By decomposing tasks into smaller components, CR streamlines the problem-solving process, rendering it both more manageable and effective.","For logical inference tasks, CR consistently outperforms existing methods with an improvement up to 9.3%, and achieves the astonishing accuracy of 98.04% on the curated FOLIO wiki dataset.","In the context of the Game of 24, CR achieves an accuracy of 94%, which signifies a substantial enhancement of 20% over the previous state-of-the-art method."],"url":"http://arxiv.org/abs/2308.04371v2"}
2 | {"created":"2023-08-08 15:01:51","title":"Apple Vision Pro for Healthcare: \"The Ultimate Display\"? -- Entering the Wonderland of Precision","abstract":"At the Worldwide Developers Conference (WWDC) in June 2023, Apple introduced the Vision Pro. The Vision Pro is a Mixed Reality (MR) headset, more specifically it is a Virtual Reality (VR) device with an additional Video See-Through (VST) capability. The VST capability turns the Vision Pro also into an Augmented Reality (AR) device. The AR feature is enabled by streaming the real world via cameras to the (VR) screens in front of the user's eyes. This is of course not unique and similar to other devices, like the Varjo XR-3. Nevertheless, the Vision Pro has some interesting features, like an inside-out screen that can show the headset wearers' eyes to \"outsiders\" or a button on the top, called \"Digital Crown\", that allows you to seamlessly blend digital content with your physical space by turning it. In addition, it is untethered, except for the cable to the battery, which makes the headset more agile, compared to the Varjo XR-3. This could actually come closer to the \"Ultimate Display\", which Ivan Sutherland had already sketched in 1965. 
Not available to the public yet, like the Ultimate Display, we want to take a look into the crystal ball in this perspective to see if it can overcome some clinical challenges that - especially - AR still faces in the medical domain, but also go beyond and discuss if the Vision Pro could support clinicians in essential tasks to spend more time with their patients.","sentences":["At the Worldwide Developers Conference (WWDC) in June 2023, Apple introduced the Vision Pro.","The Vision Pro is a Mixed Reality (MR) headset, more specifically it is a Virtual Reality (VR) device with an additional Video See-Through (VST) capability.","The VST capability turns the Vision Pro also into an Augmented Reality (AR) device.","The AR feature is enabled by streaming the real world via cameras to the (VR) screens in front of the user's eyes.","This is of course not unique and similar to other devices, like the Varjo XR-3.","Nevertheless, the Vision Pro has some interesting features, like an inside-out screen that can show the headset wearers' eyes to \"outsiders\" or a button on the top, called \"Digital Crown\", that allows you to seamlessly blend digital content with your physical space by turning it.","In addition, it is untethered, except for the cable to the battery, which makes the headset more agile, compared to the Varjo XR-3.","This could actually come closer to the \"Ultimate Display\", which Ivan Sutherland had already sketched in 1965.","Not available to the public yet, like the Ultimate Display, we want to take a look into the crystal ball in this perspective to see if it can overcome some clinical challenges that - especially - AR still faces in the medical domain, but also go beyond and discuss if the Vision Pro could support clinicians in essential tasks to spend more time with their patients."],"url":"http://arxiv.org/abs/2308.04313v2"}
3 | 


--------------------------------------------------------------------------------
/data/downloads/2023-08-16-00h.jsonl:
--------------------------------------------------------------------------------
1 | {"created":"2023-08-14 17:30:03","title":"Why Not? Explaining Missing Entailments with Evee (Technical Report)","abstract":"Understanding logical entailments derived by a description logic reasoner is not always straight-forward for ontology users. For this reason, various methods for explaining entailments using justifications and proofs have been developed and implemented as plug-ins for the ontology editor Prot\\'eg\\'e. However, when the user expects a missing consequence to hold, it is equally important to explain why it does not follow from the ontology. In this paper, we describe a new version of $\\rm E{\\scriptsize VEE}$, a Prot\\'eg\\'e plugin that now also provides explanations for missing consequences, via existing and new techniques based on abduction and counterexamples.","sentences":["Understanding logical entailments derived by a description logic reasoner is not always straight-forward for ontology users.","For this reason, various methods for explaining entailments using justifications and proofs have been developed and implemented as plug-ins for the ontology editor Prot\\'eg\\'e.","However, when the user expects a missing consequence to hold, it is equally important to explain why it does not follow from the ontology.","In this paper, we describe a new version of $\\rm E{\\scriptsize VEE}$, a Prot\\'eg\\'e plugin that now also provides explanations for missing consequences, via existing and new techniques based on abduction and counterexamples."],"url":"http://arxiv.org/abs/2308.07294v2"}
2 | 


--------------------------------------------------------------------------------
/data/downloads/2023-09-20-00h.jsonl:
--------------------------------------------------------------------------------
1 | {"created":"2023-09-18 16:39:51","title":"Graph topological property recovery with heat and wave dynamics-based features on graphs","abstract":"In this paper, we propose Graph Differential Equation Network (GDeNet), an approach that harnesses the expressive power of solutions to PDEs on a graph to obtain continuous node- and graph-level representations for various downstream tasks. We derive theoretical results connecting the dynamics of heat and wave equations to the spectral properties of the graph and to the behavior of continuous-time random walks on graphs. We demonstrate experimentally that these dynamics are able to capture salient aspects of graph geometry and topology by recovering generating parameters of random graphs, Ricci curvature, and persistent homology. Furthermore, we demonstrate the superior performance of GDeNet on real-world datasets including citation graphs, drug-like molecules, and proteins.","sentences":["In this paper, we propose Graph Differential Equation Network (GDeNet), an approach that harnesses the expressive power of solutions to PDEs on a graph to obtain continuous node- and graph-level representations for various downstream tasks.","We derive theoretical results connecting the dynamics of heat and wave equations to the spectral properties of the graph and to the behavior of continuous-time random walks on graphs.","We demonstrate experimentally that these dynamics are able to capture salient aspects of graph geometry and topology by recovering generating parameters of random graphs, Ricci curvature, and persistent homology.","Furthermore, we demonstrate the superior performance of GDeNet on real-world datasets including citation graphs, drug-like molecules, and proteins."],"url":"http://arxiv.org/abs/2309.09924v2"}
2 | {"created":"2023-09-18 14:55:21","title":"HypR: A comprehensive study for ASR hypothesis revising with a reference corpus","abstract":"With the development of deep learning, automatic speech recognition (ASR) has made significant progress. To further enhance the performance, revising recognition results is one of the lightweight but efficient manners. Various methods can be roughly classified into N-best reranking methods and error correction models. The former aims to select the hypothesis with the lowest error rate from a set of candidates generated by ASR for a given input speech. The latter focuses on detecting recognition errors in a given hypothesis and correcting these errors to obtain an enhanced result. However, we observe that these studies are hardly comparable to each other as they are usually evaluated on different corpora, paired with different ASR models, and even use different datasets to train the models. Accordingly, we first concentrate on releasing an ASR hypothesis revising (HypR) dataset in this study. HypR contains several commonly used corpora (AISHELL-1, TED-LIUM 2, and LibriSpeech) and provides 50 recognition hypotheses for each speech utterance. The checkpoint models of the ASR are also published. In addition, we implement and compare several classic and representative methods, showing the recent research progress in revising speech recognition results. 
We hope the publicly available HypR dataset can become a reference benchmark for subsequent research and promote the school of research to an advanced level.","sentences":["With the development of deep learning, automatic speech recognition (ASR) has made significant progress.","To further enhance the performance, revising recognition results is one of the lightweight but efficient manners.","Various methods can be roughly classified into N-best reranking methods and error correction models.","The former aims to select the hypothesis with the lowest error rate from a set of candidates generated by ASR for a given input speech.","The latter focuses on detecting recognition errors in a given hypothesis and correcting these errors to obtain an enhanced result.","However, we observe that these studies are hardly comparable to each other as they are usually evaluated on different corpora, paired with different ASR models, and even use different datasets to train the models.","Accordingly, we first concentrate on releasing an ASR hypothesis revising (HypR) dataset in this study.","HypR contains several commonly used corpora (AISHELL-1, TED-LIUM 2, and LibriSpeech) and provides 50 recognition hypotheses for each speech utterance.","The checkpoint models of the ASR are also published.","In addition, we implement and compare several classic and representative methods, showing the recent research progress in revising speech recognition results.","We hope the publicly available HypR dataset can become a reference benchmark for subsequent research and promote the school of research to an advanced level."],"url":"http://arxiv.org/abs/2309.09838v2"}
3 | 


--------------------------------------------------------------------------------
/data/downloads/2023-10-06-00h.jsonl:
--------------------------------------------------------------------------------
1 | {"created":"2023-10-04 16:58:25","title":"Co-modeling the Sequential and Graphical Routes for Peptide Representation Learning","abstract":"Peptides are formed by the dehydration condensation of multiple amino acids. The primary structure of a peptide can be represented either as an amino acid sequence or as a molecular graph consisting of atoms and chemical bonds. Previous studies have indicated that deep learning routes specific to sequential and graphical peptide forms exhibit comparable performance on downstream tasks. Despite the fact that these models learn representations of the same modality of peptides, we find that they explain their predictions differently. Considering sequential and graphical models as two experts making inferences from different perspectives, we work on fusing expert knowledge to enrich the learned representations for improving the discriminative performance. To achieve this, we propose a peptide co-modeling method, RepCon, which employs a contrastive learning-based framework to enhance the mutual information of representations from decoupled sequential and graphical end-to-end models. It considers representations from the sequential encoder and the graphical encoder for the same peptide sample as a positive pair and learns to enhance the consistency of representations between positive sample pairs and to repel representations between negative pairs. Empirical studies of RepCon and other co-modeling methods are conducted on open-source discriminative datasets, including aggregation propensity, retention time, antimicrobial peptide prediction, and family classification from Peptide Database. Our results demonstrate the superiority of the co-modeling approach over independent modeling, as well as the superiority of RepCon over other methods under the co-modeling framework. 
In addition, the attribution on RepCon further corroborates the validity of the approach at the level of model explanation.","sentences":["Peptides are formed by the dehydration condensation of multiple amino acids.","The primary structure of a peptide can be represented either as an amino acid sequence or as a molecular graph consisting of atoms and chemical bonds.","Previous studies have indicated that deep learning routes specific to sequential and graphical peptide forms exhibit comparable performance on downstream tasks.","Despite the fact that these models learn representations of the same modality of peptides, we find that they explain their predictions differently.","Considering sequential and graphical models as two experts making inferences from different perspectives, we work on fusing expert knowledge to enrich the learned representations for improving the discriminative performance.","To achieve this, we propose a peptide co-modeling method, RepCon, which employs a contrastive learning-based framework to enhance the mutual information of representations from decoupled sequential and graphical end-to-end models.","It considers representations from the sequential encoder and the graphical encoder for the same peptide sample as a positive pair and learns to enhance the consistency of representations between positive sample pairs and to repel representations between negative pairs.","Empirical studies of RepCon and other co-modeling methods are conducted on open-source discriminative datasets, including aggregation propensity, retention time, antimicrobial peptide prediction, and family classification from Peptide Database.","Our results demonstrate the superiority of the co-modeling approach over independent modeling, as well as the superiority of RepCon over other methods under the co-modeling framework.","In addition, the attribution on RepCon further corroborates the validity of the approach at the level of model 
explanation."],"url":"http://arxiv.org/abs/2310.02964v2"}
2 | 


--------------------------------------------------------------------------------
/data/downloads/2023-10-18-00h.jsonl:
--------------------------------------------------------------------------------
1 | {"created":"2023-10-16 15:25:14","title":"ReMax: A Simple, Effective, and Efficient Reinforcement Learning Method for Aligning Large Language Models","abstract":"Alignment is of critical importance for training large language models (LLMs). The predominant strategy to address this is through Reinforcement Learning from Human Feedback (RLHF), where PPO serves as the de-facto algorithm. Yet, PPO is known to suffer from computational inefficiency, which is a challenge that this paper aims to address. We identify three important properties in RLHF tasks: fast simulation, deterministic transitions, and trajectory-level rewards, which are not leveraged in PPO. Based on such observations, we develop a new algorithm tailored for RLHF, called ReMax. The algorithm design of ReMax is built on a celebrated algorithm REINFORCE but is equipped with a new variance-reduction technique.   Our method has three-fold advantages over PPO: first, ReMax is simple to implement and removes many hyper-parameters in PPO, which are scale-sensitive and laborious to tune. Second, ReMax saves about 50% memory usage in principle. As a result, PPO runs out-of-memory when fine-tuning a Llama2 (7B) model on 8xA100-40GB GPUs, whereas ReMax can afford training. This memory improvement is achieved by removing the value model in PPO. Third, based on our calculations, we find that even assuming PPO can afford the training of Llama2 (7B), it would still run about 2x slower than ReMax. This is due to the computational overhead of the value model, which does not exist in ReMax. Importantly, the above computational improvements do not sacrifice the performance. We hypothesize these advantages can be maintained in larger-scaled models. 
Our implementation of ReMax is available at https://github.com/liziniu/ReMax","sentences":["Alignment is of critical importance for training large language models (LLMs).","The predominant strategy to address this is through Reinforcement Learning from Human Feedback (RLHF), where PPO serves as the de-facto algorithm.","Yet, PPO is known to suffer from computational inefficiency, which is a challenge that this paper aims to address.","We identify three important properties in RLHF tasks: fast simulation, deterministic transitions, and trajectory-level rewards, which are not leveraged in PPO.","Based on such observations, we develop a new algorithm tailored for RLHF, called ReMax.","The algorithm design of ReMax is built on a celebrated algorithm REINFORCE but is equipped with a new variance-reduction technique.   ","Our method has three-fold advantages over PPO:","first, ReMax is simple to implement and removes many hyper-parameters in PPO, which are scale-sensitive and laborious to tune.","Second, ReMax saves about 50% memory usage in principle.","As a result, PPO runs out-of-memory when fine-tuning a Llama2 (7B) model on 8xA100-40GB GPUs, whereas ReMax can afford training.","This memory improvement is achieved by removing the value model in PPO.","Third, based on our calculations, we find that even assuming PPO can afford the training of Llama2 (7B), it would still run about 2x slower than ReMax.","This is due to the computational overhead of the value model, which does not exist in ReMax.","Importantly, the above computational improvements do not sacrifice the performance.","We hypothesize these advantages can be maintained in larger-scaled models.","Our implementation of ReMax is available at https://github.com/liziniu/ReMax"],"url":"http://arxiv.org/abs/2310.10505v2"}
2 | 


--------------------------------------------------------------------------------
/data/downloads/2023-11-01-16h.jsonl:
--------------------------------------------------------------------------------
1 | {"created":"2023-10-31 15:53:14","title":"Taking control: Policies to address extinction risks from advanced AI","abstract":"This paper provides policy recommendations to reduce extinction risks from advanced artificial intelligence (AI). First, we briefly provide background information about extinction risks from AI. Second, we argue that voluntary commitments from AI companies would be an inappropriate and insufficient response. Third, we describe three policy proposals that would meaningfully address the threats from advanced AI: (1) establishing a Multinational AGI Consortium to enable democratic oversight of advanced AI (MAGIC), (2) implementing a global cap on the amount of computing power used to train an AI system (global compute cap), and (3) requiring affirmative safety evaluations to ensure that risks are kept below acceptable levels (gating critical experiments). MAGIC would be a secure, safety-focused, internationally-governed institution responsible for reducing risks from advanced AI and performing research to safely harness the benefits of AI. MAGIC would also maintain emergency response infrastructure (kill switch) to swiftly halt AI development or withdraw model deployment in the event of an AI-related emergency. The global compute cap would end the corporate race toward dangerous AI systems while enabling the vast majority of AI innovation to continue unimpeded. Gating critical experiments would ensure that companies developing powerful AI systems are required to present affirmative evidence that these models keep extinction risks below an acceptable threshold. 
After describing these recommendations, we propose intermediate steps that the international community could take to implement these proposals and lay the groundwork for international coordination around advanced AI.","sentences":["This paper provides policy recommendations to reduce extinction risks from advanced artificial intelligence (AI).","First, we briefly provide background information about extinction risks from AI.","Second, we argue that voluntary commitments from AI companies would be an inappropriate and insufficient response.","Third, we describe three policy proposals that would meaningfully address the threats from advanced AI: (1) establishing a Multinational AGI Consortium to enable democratic oversight of advanced AI (MAGIC), (2) implementing a global cap on the amount of computing power used to train an AI system (global compute cap), and (3) requiring affirmative safety evaluations to ensure that risks are kept below acceptable levels (gating critical experiments).","MAGIC would be a secure, safety-focused, internationally-governed institution responsible for reducing risks from advanced AI and performing research to safely harness the benefits of AI.","MAGIC would also maintain emergency response infrastructure (kill switch) to swiftly halt AI development or withdraw model deployment in the event of an AI-related emergency.","The global compute cap would end the corporate race toward dangerous AI systems while enabling the vast majority of AI innovation to continue unimpeded.","Gating critical experiments would ensure that companies developing powerful AI systems are required to present affirmative evidence that these models keep extinction risks below an acceptable threshold.","After describing these recommendations, we propose intermediate steps that the international community could take to implement these proposals and lay the groundwork for international coordination around advanced AI."],"url":"http://arxiv.org/abs/2310.20563v1"}
2 | 


--------------------------------------------------------------------------------
/data/downloads/2024-03-08-16h.jsonl:
--------------------------------------------------------------------------------
1 | {"created":"2024-03-07 18:14:02","title":"QRtree -- Decision Tree dialect specification of QRscript","abstract":"This specification document specifies the syntax and semantics of QRtree, which is a specific dialect of QRscript particularly suited to represent decision trees without chance nodes. The term dialect identifies one of the possible sub-languages that can be encoded inside of an eQR code via QRscript. This specification will describe an intermediate representation of QRtree, made through a language derived by the three-address code. It will then define the transformation rules from the intermediate representation to a binary code. The latter is a binary representation called eQRtreebytecode. These rules can also be applied inversely to transform the eQRtreeBytecode into the intermediate representation. This specification document will pay particular attention to the creation of a compact eQRtreebytecode, as the maximum number of bits that can be stored in a QR code is, at the time of writing, equal to 2953 bytes (in the case of QR code version 40 with a \"low\" error correction level).","sentences":["This specification document specifies the syntax and semantics of QRtree, which is a specific dialect of QRscript particularly suited to represent decision trees without chance nodes.","The term dialect identifies one of the possible sub-languages that can be encoded inside of an eQR code via QRscript.","This specification will describe an intermediate representation of QRtree, made through a language derived by the three-address code.","It will then define the transformation rules from the intermediate representation to a binary code.","The latter is a binary representation called eQRtreebytecode.","These rules can also be applied inversely to transform the eQRtreeBytecode into the intermediate representation.","This specification document will pay particular attention to the creation of a compact eQRtreebytecode, as the maximum number of bits that can 
be stored in a QR code is, at the time of writing, equal to 2953 bytes (in the case of QR code version 40 with a \"low\" error correction level)."],"url":"http://arxiv.org/abs/2403.04716v1"}
2 | 


--------------------------------------------------------------------------------
/data/downloads/2024-03-09-08h.jsonl:
--------------------------------------------------------------------------------
1 | {"created":"2024-03-07 18:14:02","title":"QRtree -- Decision Tree dialect specification of QRscript","abstract":"This specification document specifies the syntax and semantics of QRtree, which is a specific dialect of QRscript particularly suited to represent decision trees without chance nodes. The term dialect identifies one of the possible sub-languages that can be encoded inside of an eQR code via QRscript. This specification will describe an intermediate representation of QRtree, made through a language derived by the three-address code. It will then define the transformation rules from the intermediate representation to a binary code. The latter is a binary representation called eQRtreebytecode. These rules can also be applied inversely to transform the eQRtreeBytecode into the intermediate representation. This specification document will pay particular attention to the creation of a compact eQRtreebytecode, as the maximum number of bits that can be stored in a QR code is, at the time of writing, equal to 2953 bytes (in the case of QR code version 40 with a \"low\" error correction level).","sentences":["This specification document specifies the syntax and semantics of QRtree, which is a specific dialect of QRscript particularly suited to represent decision trees without chance nodes.","The term dialect identifies one of the possible sub-languages that can be encoded inside of an eQR code via QRscript.","This specification will describe an intermediate representation of QRtree, made through a language derived by the three-address code.","It will then define the transformation rules from the intermediate representation to a binary code.","The latter is a binary representation called eQRtreebytecode.","These rules can also be applied inversely to transform the eQRtreeBytecode into the intermediate representation.","This specification document will pay particular attention to the creation of a compact eQRtreebytecode, as the maximum number of bits that can 
be stored in a QR code is, at the time of writing, equal to 2953 bytes (in the case of QR code version 40 with a \"low\" error correction level)."],"url":"http://arxiv.org/abs/2403.04716v1"}
2 | 


--------------------------------------------------------------------------------
/data/downloads/2024-03-10-00h.jsonl:
--------------------------------------------------------------------------------
1 | {"created":"2024-03-07 18:14:02","title":"QRtree -- Decision Tree dialect specification of QRscript","abstract":"This specification document specifies the syntax and semantics of QRtree, which is a specific dialect of QRscript particularly suited to represent decision trees without chance nodes. The term dialect identifies one of the possible sub-languages that can be encoded inside of an eQR code via QRscript. This specification will describe an intermediate representation of QRtree, made through a language derived by the three-address code. It will then define the transformation rules from the intermediate representation to a binary code. The latter is a binary representation called eQRtreebytecode. These rules can also be applied inversely to transform the eQRtreeBytecode into the intermediate representation. This specification document will pay particular attention to the creation of a compact eQRtreebytecode, as the maximum number of bits that can be stored in a QR code is, at the time of writing, equal to 2953 bytes (in the case of QR code version 40 with a \"low\" error correction level).","sentences":["This specification document specifies the syntax and semantics of QRtree, which is a specific dialect of QRscript particularly suited to represent decision trees without chance nodes.","The term dialect identifies one of the possible sub-languages that can be encoded inside of an eQR code via QRscript.","This specification will describe an intermediate representation of QRtree, made through a language derived by the three-address code.","It will then define the transformation rules from the intermediate representation to a binary code.","The latter is a binary representation called eQRtreebytecode.","These rules can also be applied inversely to transform the eQRtreeBytecode into the intermediate representation.","This specification document will pay particular attention to the creation of a compact eQRtreebytecode, as the maximum number of bits that can 
be stored in a QR code is, at the time of writing, equal to 2953 bytes (in the case of QR code version 40 with a \"low\" error correction level)."],"url":"http://arxiv.org/abs/2403.04716v1"}
2 | 


--------------------------------------------------------------------------------
/data/downloads/2024-03-13-00h.jsonl:
--------------------------------------------------------------------------------
1 | {"created":"2024-03-11 16:45:53","title":"Numerical simulation of individual coil placement -- A proof-of-concept study for the prediction of recurrence after aneurysm coiling","abstract":"Rupture of intracranial aneurysms results in severe subarachnoidal hemorrhage, which is associated with high morbidity and mortality. Neurointerventional occlusion of the aneurysm through coiling has evolved to a therapeutical standard. The choice of the specific coil has an important influence on secondary regrowth requiring retreatment. Aneurysm occlusion was simulated either through virtual implantation of a preshaped 3D coil or with a porous media approach. In this study, we used a recently developed numerical approach to simulate aneurysm shapes in specific challenging aneurysm anatomies and correlated these with aneurysm recurrence 6 months after treatment. The simulation showed a great variety of coil shapes depending on the variability in possible microcatheter positions. Aneurysms with a later recurrence showed a tendency for more successful coiling attempts. Results revealed further trends suggesting lower simulated packing densities in aneurysms with reoccurrence. Simulated packing densities did not correlate with those calculated by conventional software, indicating the potential for our approach to offer additional predictive value. Our study, therefore, pioneers a comprehensive numerical model for simulating aneurysm coiling, providing insights into individualized treatment strategies and outcome prediction. Future directions involve expanding the model's capabilities to simulate intraprocedural outcomes and long-term predictions, aiming to refine occlusion quality criteria and validate prediction parameters in larger patient cohorts. 
This simulation framework holds promise for enhancing clinical decision-making and optimizing patient outcomes in endovascular aneurysm treatment.","sentences":["Rupture of intracranial aneurysms results in severe subarachnoidal hemorrhage, which is associated with high morbidity and mortality.","Neurointerventional occlusion of the aneurysm through coiling has evolved to a therapeutical standard.","The choice of the specific coil has an important influence on secondary regrowth requiring retreatment.","Aneurysm occlusion was simulated either through virtual implantation of a preshaped 3D coil or with a porous media approach.","In this study, we used a recently developed numerical approach to simulate aneurysm shapes in specific challenging aneurysm anatomies and correlated these with aneurysm recurrence 6 months after treatment.","The simulation showed a great variety of coil shapes depending on the variability in possible microcatheter positions.","Aneurysms with a later recurrence showed a tendency for more successful coiling attempts.","Results revealed further trends suggesting lower simulated packing densities in aneurysms with reoccurrence.","Simulated packing densities did not correlate with those calculated by conventional software, indicating the potential for our approach to offer additional predictive value.","Our study, therefore, pioneers a comprehensive numerical model for simulating aneurysm coiling, providing insights into individualized treatment strategies and outcome prediction.","Future directions involve expanding the model's capabilities to simulate intraprocedural outcomes and long-term predictions, aiming to refine occlusion quality criteria and validate prediction parameters in larger patient cohorts.","This simulation framework holds promise for enhancing clinical decision-making and optimizing patient outcomes in endovascular aneurysm treatment."],"url":"http://arxiv.org/abs/2403.06889v1"}
2 | 


--------------------------------------------------------------------------------
/data/downloads/2024-03-13-16h.jsonl:
--------------------------------------------------------------------------------
1 | {"created":"2024-03-12 15:13:12","title":"DSEG-LIME - Improving Image Explanation by Hierarchical Data-Driven Segmentation","abstract":"Explainable Artificial Intelligence is critical in unraveling decision-making processes in complex machine learning models. LIME (Local Interpretable Model-agnostic Explanations) is a well-known XAI framework for image analysis. It utilizes image segmentation to create features to identify relevant areas for classification. Consequently, poor segmentation can compromise the consistency of the explanation and undermine the importance of the segments, affecting the overall interpretability. Addressing these challenges, we introduce DSEG-LIME (Data-Driven Segmentation LIME), featuring: i) a data-driven segmentation for human-recognized feature generation, and ii) a hierarchical segmentation procedure through composition. We benchmark DSEG-LIME on pre-trained models with images from the ImageNet dataset - scenarios without domain-specific knowledge. The analysis includes a quantitative evaluation using established XAI metrics, complemented by a qualitative assessment through a user study. Our findings demonstrate that DSEG outperforms in most of the XAI metrics and enhances the alignment of explanations with human-recognized concepts, significantly improving interpretability. The code is available under: https://github. 
com/patrick-knab/DSEG-LIME","sentences":["Explainable Artificial Intelligence is critical in unraveling decision-making processes in complex machine learning models.","LIME (Local Interpretable Model-agnostic Explanations) is a well-known XAI framework for image analysis.","It utilizes image segmentation to create features to identify relevant areas for classification.","Consequently, poor segmentation can compromise the consistency of the explanation and undermine the importance of the segments, affecting the overall interpretability.","Addressing these challenges, we introduce DSEG-LIME (Data-Driven Segmentation LIME), featuring: i) a data-driven segmentation for human-recognized feature generation, and ii) a hierarchical segmentation procedure through composition.","We benchmark DSEG-LIME on pre-trained models with images from the ImageNet dataset - scenarios without domain-specific knowledge.","The analysis includes a quantitative evaluation using established XAI metrics, complemented by a qualitative assessment through a user study.","Our findings demonstrate that DSEG outperforms in most of the XAI metrics and enhances the alignment of explanations with human-recognized concepts, significantly improving interpretability.","The code is available under: https://github. com/patrick-knab/DSEG-LIME"],"url":"http://arxiv.org/abs/2403.07733v1"}
2 | 


--------------------------------------------------------------------------------
/data/downloads/2024-03-20-00h.jsonl:
--------------------------------------------------------------------------------
1 | {"created":"2024-03-18 07:14:21","title":"State-Separated SARSA: A Practical Sequential Decision-Making Algorithm with Recovering Rewards","abstract":"While many multi-armed bandit algorithms assume that rewards for all arms are constant across rounds, this assumption does not hold in many real-world scenarios. This paper considers the setting of recovering bandits (Pike-Burke & Grunewalder, 2019), where the reward depends on the number of rounds elapsed since the last time an arm was pulled. We propose a new reinforcement learning (RL) algorithm tailored to this setting, named the State-Separate SARSA (SS-SARSA) algorithm, which treats rounds as states. The SS-SARSA algorithm achieves efficient learning by reducing the number of state combinations required for Q-learning/SARSA, which often suffers from combinatorial issues for large-scale RL problems. Additionally, it makes minimal assumptions about the reward structure and offers lower computational complexity. Furthermore, we prove asymptotic convergence to an optimal policy under mild assumptions. 
Simulation studies demonstrate the superior performance of our algorithm across various settings.","sentences":["While many multi-armed bandit algorithms assume that rewards for all arms are constant across rounds, this assumption does not hold in many real-world scenarios.","This paper considers the setting of recovering bandits (Pike-Burke & Grunewalder, 2019), where the reward depends on the number of rounds elapsed since the last time an arm was pulled.","We propose a new reinforcement learning (RL) algorithm tailored to this setting, named the State-Separate SARSA (SS-SARSA) algorithm, which treats rounds as states.","The SS-SARSA algorithm achieves efficient learning by reducing the number of state combinations required for Q-learning/SARSA, which often suffers from combinatorial issues for large-scale RL problems.","Additionally, it makes minimal assumptions about the reward structure and offers lower computational complexity.","Furthermore, we prove asymptotic convergence to an optimal policy under mild assumptions.","Simulation studies demonstrate the superior performance of our algorithm across various settings."],"url":"http://arxiv.org/abs/2403.11520v1"}
2 | 


--------------------------------------------------------------------------------
/data/downloads/2024-04-05-00h.jsonl:
--------------------------------------------------------------------------------
1 | {"created":"2024-04-03 14:07:02","title":"AQuA - Combining Experts' and Non-Experts' Views To Assess Deliberation Quality in Online Discussions Using LLMs","abstract":"Measuring the quality of contributions in political online discussions is crucial in deliberation research and computer science. Research has identified various indicators to assess online discussion quality, and with deep learning advancements, automating these measures has become feasible. While some studies focus on analyzing specific quality indicators, a comprehensive quality score incorporating various deliberative aspects is often preferred. In this work, we introduce AQuA, an additive score that calculates a unified deliberative quality score from multiple indices for each discussion post. Unlike other singular scores, AQuA preserves information on the deliberative aspects present in comments, enhancing model transparency. We develop adapter models for 20 deliberative indices, and calculate correlation coefficients between experts' annotations and the perceived deliberativeness by non-experts to weigh the individual indices into a single deliberative score. We demonstrate that the AQuA score can be computed easily from pre-trained adapters and aligns well with annotations on other datasets that have not be seen during training. The analysis of experts' vs. 
non-experts' annotations confirms theoretical findings in the social science literature.","sentences":["Measuring the quality of contributions in political online discussions is crucial in deliberation research and computer science.","Research has identified various indicators to assess online discussion quality, and with deep learning advancements, automating these measures has become feasible.","While some studies focus on analyzing specific quality indicators, a comprehensive quality score incorporating various deliberative aspects is often preferred.","In this work, we introduce AQuA, an additive score that calculates a unified deliberative quality score from multiple indices for each discussion post.","Unlike other singular scores, AQuA preserves information on the deliberative aspects present in comments, enhancing model transparency.","We develop adapter models for 20 deliberative indices, and calculate correlation coefficients between experts' annotations and the perceived deliberativeness by non-experts to weigh the individual indices into a single deliberative score.","We demonstrate that the AQuA score can be computed easily from pre-trained adapters and aligns well with annotations on other datasets that have not be seen during training.","The analysis of experts' vs. non-experts' annotations confirms theoretical findings in the social science literature."],"url":"http://arxiv.org/abs/2404.02761v2"}
2 | 


--------------------------------------------------------------------------------
/data/downloads/2024-04-11-00h.jsonl:
--------------------------------------------------------------------------------
1 | {"created":"2024-04-09 17:48:52","title":"Flying with Photons: Rendering Novel Views of Propagating Light","abstract":"We present an imaging and neural rendering technique that seeks to synthesize videos of light propagating through a scene from novel, moving camera viewpoints. Our approach relies on a new ultrafast imaging setup to capture a first-of-its kind, multi-viewpoint video dataset with picosecond-level temporal resolution. Combined with this dataset, we introduce an efficient neural volume rendering framework based on the transient field. This field is defined as a mapping from a 3D point and 2D direction to a high-dimensional, discrete-time signal that represents time-varying radiance at ultrafast timescales. Rendering with transient fields naturally accounts for effects due to the finite speed of light, including viewpoint-dependent appearance changes caused by light propagation delays to the camera. We render a range of complex effects, including scattering, specular reflection, refraction, and diffraction. 
Additionally, we demonstrate removing viewpoint-dependent propagation delays using a time warping procedure, rendering of relativistic effects, and video synthesis of direct and global components of light transport.","sentences":["We present an imaging and neural rendering technique that seeks to synthesize videos of light propagating through a scene from novel, moving camera viewpoints.","Our approach relies on a new ultrafast imaging setup to capture a first-of-its kind, multi-viewpoint video dataset with picosecond-level temporal resolution.","Combined with this dataset, we introduce an efficient neural volume rendering framework based on the transient field.","This field is defined as a mapping from a 3D point and 2D direction to a high-dimensional, discrete-time signal that represents time-varying radiance at ultrafast timescales.","Rendering with transient fields naturally accounts for effects due to the finite speed of light, including viewpoint-dependent appearance changes caused by light propagation delays to the camera.","We render a range of complex effects, including scattering, specular reflection, refraction, and diffraction.","Additionally, we demonstrate removing viewpoint-dependent propagation delays using a time warping procedure, rendering of relativistic effects, and video synthesis of direct and global components of light transport."],"url":"http://arxiv.org/abs/2404.06493v2"}
2 | 


--------------------------------------------------------------------------------
/data/downloads/2024-04-19-00h.jsonl:
--------------------------------------------------------------------------------
1 | {"created":"2024-04-17 17:59:55","title":"Dynamic Typography: Bringing Text to Life via Video Diffusion Prior","abstract":"Text animation serves as an expressive medium, transforming static communication into dynamic experiences by infusing words with motion to evoke emotions, emphasize meanings, and construct compelling narratives. Crafting animations that are semantically aware poses significant challenges, demanding expertise in graphic design and animation. We present an automated text animation scheme, termed \"Dynamic Typography\", which combines two challenging tasks. It deforms letters to convey semantic meaning and infuses them with vibrant movements based on user prompts. Our technique harnesses vector graphics representations and an end-to-end optimization-based framework. This framework employs neural displacement fields to convert letters into base shapes and applies per-frame motion, encouraging coherence with the intended textual concept. Shape preservation techniques and perceptual loss regularization are employed to maintain legibility and structural integrity throughout the animation process. We demonstrate the generalizability of our approach across various text-to-video models and highlight the superiority of our end-to-end methodology over baseline methods, which might comprise separate tasks. Through quantitative and qualitative evaluations, we demonstrate the effectiveness of our framework in generating coherent text animations that faithfully interpret user prompts while maintaining readability. 
Our code is available at: https://animate-your-word.github.io/demo/.","sentences":["Text animation serves as an expressive medium, transforming static communication into dynamic experiences by infusing words with motion to evoke emotions, emphasize meanings, and construct compelling narratives.","Crafting animations that are semantically aware poses significant challenges, demanding expertise in graphic design and animation.","We present an automated text animation scheme, termed \"Dynamic Typography\", which combines two challenging tasks.","It deforms letters to convey semantic meaning and infuses them with vibrant movements based on user prompts.","Our technique harnesses vector graphics representations and an end-to-end optimization-based framework.","This framework employs neural displacement fields to convert letters into base shapes and applies per-frame motion, encouraging coherence with the intended textual concept.","Shape preservation techniques and perceptual loss regularization are employed to maintain legibility and structural integrity throughout the animation process.","We demonstrate the generalizability of our approach across various text-to-video models and highlight the superiority of our end-to-end methodology over baseline methods, which might comprise separate tasks.","Through quantitative and qualitative evaluations, we demonstrate the effectiveness of our framework in generating coherent text animations that faithfully interpret user prompts while maintaining readability.","Our code is available at: https://animate-your-word.github.io/demo/."],"url":"http://arxiv.org/abs/2404.11614v2"}
2 | 


--------------------------------------------------------------------------------
/data/downloads/2024-05-08-01h.jsonl:
--------------------------------------------------------------------------------
1 | {"created":"2024-05-06 15:48:22","title":"Understanding Read-Write Wait-Free Coverings in the Fully-Anonymous Shared-Memory Model","abstract":"In the fully-anonymous (shared-memory) model, inspired by a biological setting, processors have no identifiers and memory locations are anonymous. This means that there is no pre-existing agreement among processors on any naming of the memory locations. In this work, we ask fundamental questions about the fully-anonymous model in the hope to obtain a better understanding of the role of naming and anonymity in distributed computing.   First, we ask what it means to solve a task under processor anonymity. With tasks such as renaming, the traditional notion obviously does not apply. Instead of restricting ourselves to colorless tasks, we propose using the notion of group solvability, which allows transferring any task to processor-anonymous models.   Second, the difficulty with anonymity is that processors can hardly avoid covering and then overwriting each other's writes, erasing information written by their predecessors. To get to the bottom of this phenomenon, we ask what system configurations are stable when processors keep reading and writing ad infinitum. Resolving this question leads us to a wait-free solution to the snapshot task, which then allows us to solve renaming and obstruction-free consensus.","sentences":["In the fully-anonymous (shared-memory) model, inspired by a biological setting, processors have no identifiers and memory locations are anonymous.","This means that there is no pre-existing agreement among processors on any naming of the memory locations.","In this work, we ask fundamental questions about the fully-anonymous model in the hope to obtain a better understanding of the role of naming and anonymity in distributed computing.   
","First, we ask what it means to solve a task under processor anonymity.","With tasks such as renaming, the traditional notion obviously does not apply.","Instead of restricting ourselves to colorless tasks, we propose using the notion of group solvability, which allows transferring any task to processor-anonymous models.   ","Second, the difficulty with anonymity is that processors can hardly avoid covering and then overwriting each other's writes, erasing information written by their predecessors.","To get to the bottom of this phenomenon, we ask what system configurations are stable when processors keep reading and writing ad infinitum.","Resolving this question leads us to a wait-free solution to the snapshot task, which then allows us to solve renaming and obstruction-free consensus."],"url":"http://arxiv.org/abs/2405.03573v2"}
2 | 


--------------------------------------------------------------------------------
/data/downloads/2024-05-22-00h.jsonl:
--------------------------------------------------------------------------------
1 | {"created":"2024-05-20 17:17:44","title":"Training Data Attribution via Approximate Unrolled Differentiation","abstract":"Many training data attribution (TDA) methods aim to estimate how a model's behavior would change if one or more data points were removed from the training set. Methods based on implicit differentiation, such as influence functions, can be made computationally efficient, but fail to account for underspecification, the implicit bias of the optimization algorithm, or multi-stage training pipelines. By contrast, methods based on unrolling address these issues but face scalability challenges. In this work, we connect the implicit-differentiation-based and unrolling-based approaches and combine their benefits by introducing Source, an approximate unrolling-based TDA method that is computed using an influence-function-like formula. While being computationally efficient compared to unrolling-based approaches, Source is suitable in cases where implicit-differentiation-based approaches struggle, such as in non-converged models and multi-stage training pipelines. 
Empirically, Source outperforms existing TDA techniques in counterfactual prediction, especially in settings where implicit-differentiation-based approaches fall short.","sentences":["Many training data attribution (TDA) methods aim to estimate how a model's behavior would change if one or more data points were removed from the training set.","Methods based on implicit differentiation, such as influence functions, can be made computationally efficient, but fail to account for underspecification, the implicit bias of the optimization algorithm, or multi-stage training pipelines.","By contrast, methods based on unrolling address these issues but face scalability challenges.","In this work, we connect the implicit-differentiation-based and unrolling-based approaches and combine their benefits by introducing Source, an approximate unrolling-based TDA method that is computed using an influence-function-like formula.","While being computationally efficient compared to unrolling-based approaches, Source is suitable in cases where implicit-differentiation-based approaches struggle, such as in non-converged models and multi-stage training pipelines.","Empirically, Source outperforms existing TDA techniques in counterfactual prediction, especially in settings where implicit-differentiation-based approaches fall short."],"url":"http://arxiv.org/abs/2405.12186v2"}
2 | 


--------------------------------------------------------------------------------
/data/downloads/2024-06-14-00h.jsonl:
--------------------------------------------------------------------------------
1 | {"created":"2024-06-12 16:20:58","title":"APSeg: Auto-Prompt Network for Cross-Domain Few-Shot Semantic Segmentation","abstract":"Few-shot semantic segmentation (FSS) endeavors to segment unseen classes with only a few labeled samples. Current FSS methods are commonly built on the assumption that their training and application scenarios share similar domains, and their performances degrade significantly while applied to a distinct domain. To this end, we propose to leverage the cutting-edge foundation model, the Segment Anything Model (SAM), for generalization enhancement. The SAM however performs unsatisfactorily on domains that are distinct from its training data, which primarily comprise natural scene images, and it does not support automatic segmentation of specific semantics due to its interactive prompting mechanism. In our work, we introduce APSeg, a novel auto-prompt network for cross-domain few-shot semantic segmentation (CD-FSS), which is designed to be auto-prompted for guiding cross-domain segmentation. Specifically, we propose a Dual Prototype Anchor Transformation (DPAT) module that fuses pseudo query prototypes extracted based on cycle-consistency with support prototypes, allowing features to be transformed into a more stable domain-agnostic space. Additionally, a Meta Prompt Generator (MPG) module is introduced to automatically generate prompt embeddings, eliminating the need for manual visual prompts. We build an efficient model which can be applied directly to target domains without fine-tuning. 
Extensive experiments on four cross-domain datasets show that our model outperforms the state-of-the-art CD-FSS method by 5.24% and 3.10% in average accuracy on 1-shot and 5-shot settings, respectively.","sentences":["Few-shot semantic segmentation (FSS) endeavors to segment unseen classes with only a few labeled samples.","Current FSS methods are commonly built on the assumption that their training and application scenarios share similar domains, and their performances degrade significantly while applied to a distinct domain.","To this end, we propose to leverage the cutting-edge foundation model, the Segment Anything Model (SAM), for generalization enhancement.","The SAM however performs unsatisfactorily on domains that are distinct from its training data, which primarily comprise natural scene images, and it does not support automatic segmentation of specific semantics due to its interactive prompting mechanism.","In our work, we introduce APSeg, a novel auto-prompt network for cross-domain few-shot semantic segmentation (CD-FSS), which is designed to be auto-prompted for guiding cross-domain segmentation.","Specifically, we propose a Dual Prototype Anchor Transformation (DPAT) module that fuses pseudo query prototypes extracted based on cycle-consistency with support prototypes, allowing features to be transformed into a more stable domain-agnostic space.","Additionally, a Meta Prompt Generator (MPG) module is introduced to automatically generate prompt embeddings, eliminating the need for manual visual prompts.","We build an efficient model which can be applied directly to target domains without fine-tuning.","Extensive experiments on four cross-domain datasets show that our model outperforms the state-of-the-art CD-FSS method by 5.24% and 3.10% in average accuracy on 1-shot and 5-shot settings, respectively."],"url":"http://arxiv.org/abs/2406.08372v2"}
2 | 


--------------------------------------------------------------------------------
/data/downloads/2024-07-25-00h.jsonl:
--------------------------------------------------------------------------------
1 | {"created":"2024-07-23 17:48:17","title":"Goedel logics: Prenex fragments","abstract":"In this paper, we provide a complete classification for the first-order Goedel logics concerning the property that the formulas admit logically equivalent prenex normal forms. We show that the only first-order Goedel logics that admit such prenex forms are those with finite truth value sets since they allow all quantifier-shift rules and the logic $G_\\uparrow$ with only one accumulation point at 1 in the infinite truth value set. In all the other cases, there are generally no logically equivalent prenex normal forms. We will also see that $G_\\uparrow$ is the intersection of all finite first-order Goedel logics.   The second part of this paper investigates the existence of effective equivalence between the validity of a formula and the validity of some prenex normal form. The existence of such a normal form is obvious for finite valued Goedel logic and $G_\\uparrow$. Goedel logics with an uncountable truth value set admit the prenex normal forms if and only if every surrounding of 0 is uncountable or 0 is an isolated point. Otherwise, uncountable Goedel logics are not recursively enumerable, however, the prenex fragment is always recursively enumerable. Therefore, there is no effective translation between the valid formula and the valid prenex normal form. 
However, the existence of effectively constructible validity equivalent prenex forms for the countable case is still up for debate.","sentences":["In this paper, we provide a complete classification for the first-order Goedel logics concerning the property that the formulas admit logically equivalent prenex normal forms.","We show that the only first-order Goedel logics that admit such prenex forms are those with finite truth value sets since they allow all quantifier-shift rules and the logic $G_\\uparrow$ with only one accumulation point at 1 in the infinite truth value set.","In all the other cases, there are generally no logically equivalent prenex normal forms.","We will also see that $G_\\uparrow$ is the intersection of all finite first-order Goedel logics.   ","The second part of this paper investigates the existence of effective equivalence between the validity of a formula and the validity of some prenex normal form.","The existence of such a normal form is obvious for finite valued Goedel logic and $G_\\uparrow$. Goedel logics with an uncountable truth value set admit the prenex normal forms if and only if every surrounding of 0 is uncountable or 0 is an isolated point.","Otherwise, uncountable Goedel logics are not recursively enumerable, however, the prenex fragment is always recursively enumerable.","Therefore, there is no effective translation between the valid formula and the valid prenex normal form.","However, the existence of effectively constructible validity equivalent prenex forms for the countable case is still up for debate."],"url":"http://arxiv.org/abs/2407.16683v2"}
2 | 


--------------------------------------------------------------------------------
/data/downloads/2024-07-26-00h.jsonl:
--------------------------------------------------------------------------------
1 | {"created":"2024-07-24 13:42:46","title":"SoK: Bridging Trust into the Blockchain. A Systematic Review on On-Chain Identity","abstract":"The ongoing regulation of blockchain-based services and applications requires the identification of users who are issuing transactions on the blockchain. This systematic review explores the current status, identifies research gaps, and outlines future research directions for establishing trusted and privacy-compliant identities on the blockchain (on-chain identity). A systematic search term was applied across various scientific databases, collecting 2232 potentially relevant research papers. These papers were narrowed down in two methodologically executed steps to 98 and finally to 13 relevant sources. The relevant articles were then systematically analyzed based on a set of screening questions. The results of the selected studies have provided insightful findings on the mechanisms of on-chain identities. On-chain identities are established using zero-knowledge proofs, public key infrastructure/certificates, and web of trust approaches. The technologies and architectures used by the authors are also highlighted. Trust has emerged as a key research gap, manifesting in two ways: firstly, a gap in how to trust the digital identity representation of a physical human; secondly, a gap in how to trust identity providers that issue identity confirmations on-chain. 
Potential future research avenues are suggested to help fill the current gaps in establishing trust and on-chain identities.","sentences":["The ongoing regulation of blockchain-based services and applications requires the identification of users who are issuing transactions on the blockchain.","This systematic review explores the current status, identifies research gaps, and outlines future research directions for establishing trusted and privacy-compliant identities on the blockchain (on-chain identity).","A systematic search term was applied across various scientific databases, collecting 2232 potentially relevant research papers.","These papers were narrowed down in two methodologically executed steps to 98 and finally to 13 relevant sources.","The relevant articles were then systematically analyzed based on a set of screening questions.","The results of the selected studies have provided insightful findings on the mechanisms of on-chain identities.","On-chain identities are established using zero-knowledge proofs, public key infrastructure/certificates, and web of trust approaches.","The technologies and architectures used by the authors are also highlighted.","Trust has emerged as a key research gap, manifesting in two ways: firstly, a gap in how to trust the digital identity representation of a physical human; secondly, a gap in how to trust identity providers that issue identity confirmations on-chain.","Potential future research avenues are suggested to help fill the current gaps in establishing trust and on-chain identities."],"url":"http://arxiv.org/abs/2407.17276v2"}
2 | 


--------------------------------------------------------------------------------
/data/downloads/2024-08-07-00h.jsonl:
--------------------------------------------------------------------------------
1 | {"created":"2024-08-05 14:49:12","title":"Explaining Reinforcement Learning: A Counterfactual Shapley Values Approach","abstract":"This paper introduces a novel approach Counterfactual Shapley Values (CSV), which enhances explainability in reinforcement learning (RL) by integrating counterfactual analysis with Shapley Values. The approach aims to quantify and compare the contributions of different state dimensions to various action choices. To more accurately analyze these impacts, we introduce new characteristic value functions, the ``Counterfactual Difference Characteristic Value\" and the ``Average Counterfactual Difference Characteristic Value.\" These functions help calculate the Shapley values to evaluate the differences in contributions between optimal and non-optimal actions. Experiments across several RL domains, such as GridWorld, FrozenLake, and Taxi, demonstrate the effectiveness of the CSV method. The results show that this method not only improves transparency in complex RL systems but also quantifies the differences across various decisions.","sentences":["This paper introduces a novel approach Counterfactual Shapley Values (CSV), which enhances explainability in reinforcement learning (RL) by integrating counterfactual analysis with Shapley Values.","The approach aims to quantify and compare the contributions of different state dimensions to various action choices.","To more accurately analyze these impacts, we introduce new characteristic value functions, the ``Counterfactual Difference Characteristic Value\" and the ``Average Counterfactual Difference Characteristic Value.\"","These functions help calculate the Shapley values to evaluate the differences in contributions between optimal and non-optimal actions.","Experiments across several RL domains, such as GridWorld, FrozenLake, and Taxi, demonstrate the effectiveness of the CSV method.","The results show that this method not only improves transparency in complex RL systems but also 
quantifies the differences across various decisions."],"url":"http://arxiv.org/abs/2408.02529v2"}
2 | 


--------------------------------------------------------------------------------
/data/downloads/2024-08-12-08h.jsonl:
--------------------------------------------------------------------------------
1 | {"created":"2024-08-10 05:41:19","title":"Preserving Privacy in Large Language Models: A Survey on Current Threats and Solutions","abstract":"Large Language Models (LLMs) represent a significant advancement in artificial intelligence, finding applications across various domains. However, their reliance on massive internet-sourced datasets for training brings notable privacy issues, which are exacerbated in critical domains (e.g., healthcare). Moreover, certain application-specific scenarios may require fine-tuning these models on private data. This survey critically examines the privacy threats associated with LLMs, emphasizing the potential for these models to memorize and inadvertently reveal sensitive information. We explore current threats by reviewing privacy attacks on LLMs and propose comprehensive solutions for integrating privacy mechanisms throughout the entire learning pipeline. These solutions range from anonymizing training datasets to implementing differential privacy during training or inference and machine unlearning after training. Our comprehensive review of existing literature highlights ongoing challenges, available tools, and future directions for preserving privacy in LLMs. 
This work aims to guide the development of more secure and trustworthy AI systems by providing a thorough understanding of privacy preservation methods and their effectiveness in mitigating risks.","sentences":["Large Language Models (LLMs) represent a significant advancement in artificial intelligence, finding applications across various domains.","However, their reliance on massive internet-sourced datasets for training brings notable privacy issues, which are exacerbated in critical domains (e.g., healthcare).","Moreover, certain application-specific scenarios may require fine-tuning these models on private data.","This survey critically examines the privacy threats associated with LLMs, emphasizing the potential for these models to memorize and inadvertently reveal sensitive information.","We explore current threats by reviewing privacy attacks on LLMs and propose comprehensive solutions for integrating privacy mechanisms throughout the entire learning pipeline.","These solutions range from anonymizing training datasets to implementing differential privacy during training or inference and machine unlearning after training.","Our comprehensive review of existing literature highlights ongoing challenges, available tools, and future directions for preserving privacy in LLMs.","This work aims to guide the development of more secure and trustworthy AI systems by providing a thorough understanding of privacy preservation methods and their effectiveness in mitigating risks."],"url":"http://arxiv.org/abs/2408.05212v1"}
2 | 


--------------------------------------------------------------------------------
/data/downloads/2024-08-28-16h.jsonl:
--------------------------------------------------------------------------------
1 | {"created":"2024-08-27 15:03:20","title":"Machine Learning for Methane Detection and Quantification from Space - A survey","abstract":"Methane ($CH_4$) is a potent anthropogenic greenhouse gas, contributing 86 times more to global warming than Carbon Dioxide ($CO_2$) over 20 years, and it also acts as an air pollutant. Given its high radiative forcing potential and relatively short atmospheric lifetime (9$\\pm$1 years), methane has important implications for climate change, therefore, cutting methane emissions is crucial for effective climate change mitigation. This work expands existing information on operational methane point source detection sensors in the Short-Wave Infrared (SWIR) bands. It reviews the state-of-the-art for traditional as well as Machine Learning (ML) approaches. The architecture and data used in such ML models will be discussed separately for methane plume segmentation and emission rate estimation. Traditionally, experts rely on labor-intensive manually adjusted methods for methane detection. However, ML approaches offer greater scalability. Our analysis reveals that ML models outperform traditional methods, particularly those based on convolutional neural networks (CNN), which are based on the U-net and transformer architectures. These ML models extract valuable information from methane-sensitive spectral data, enabling a more accurate detection. Challenges arise when comparing these methods due to variations in data, sensor specifications, and evaluation metrics. To address this, we discuss existing datasets and metrics, providing an overview of available resources and identifying open research problems. 
Finally, we explore potential future advances in ML, emphasizing approaches for model comparability, large dataset creation, and the European Union's forthcoming methane strategy.","sentences":["Methane ($CH_4$) is a potent anthropogenic greenhouse gas, contributing 86 times more to global warming than Carbon Dioxide ($CO_2$) over 20 years, and it also acts as an air pollutant.","Given its high radiative forcing potential and relatively short atmospheric lifetime (9$\\pm$1 years), methane has important implications for climate change, therefore, cutting methane emissions is crucial for effective climate change mitigation.","This work expands existing information on operational methane point source detection sensors in the Short-Wave Infrared (SWIR) bands.","It reviews the state-of-the-art for traditional as well as Machine Learning (ML) approaches.","The architecture and data used in such ML models will be discussed separately for methane plume segmentation and emission rate estimation.","Traditionally, experts rely on labor-intensive manually adjusted methods for methane detection.","However, ML approaches offer greater scalability.","Our analysis reveals that ML models outperform traditional methods, particularly those based on convolutional neural networks (CNN), which are based on the U-net and transformer architectures.","These ML models extract valuable information from methane-sensitive spectral data, enabling a more accurate detection.","Challenges arise when comparing these methods due to variations in data, sensor specifications, and evaluation metrics.","To address this, we discuss existing datasets and metrics, providing an overview of available resources and identifying open research problems.","Finally, we explore potential future advances in ML, emphasizing approaches for model comparability, large dataset creation, and the European Union's forthcoming methane strategy."],"url":"http://arxiv.org/abs/2408.15122v1"}
2 | 


--------------------------------------------------------------------------------
/data/downloads/2024-09-06-00h.jsonl:
--------------------------------------------------------------------------------
1 | {"created":"2024-09-04 14:01:48","title":"Pooling And Attention: What Are Effective Designs For LLM-Based Embedding Models?","abstract":"The significant advancements of Large Language Models (LLMs) in generative tasks have led to a growing body of work exploring LLM-based embedding models. While these models, employing different pooling and attention strategies, have achieved state-of-the-art performance on public embedding benchmarks, questions still arise about what constitutes an effective design for LLM-based embedding models. However, these models are often trained on different datasets, using different LLM base models or training settings. Moreover, evaluations on public embedding benchmarks often fail to report statistical significance, making it difficult to determine which designs truly contribute to final performance. This complicates the process for practitioners seeking optimal training recipes for LLM-based embedding models. In this study, we conduct a large-scale experiment by training a series of LLM-based embedding models using the same training data and base model but differing in their pooling and attention strategies. The results show that there is no one-size-fits-all solution: while bidirectional attention and an additional trainable pooling layer outperform in text similarity and information retrieval tasks, they do not significantly surpass simpler designs like EOS-last token pooling and default causal attention in clustering and classification tasks. Furthermore, we propose a new pooling strategy, Multi-Layers Trainable Pooling, which transforms the outputs of all hidden layers, rather than just the last layer, using a cross-attention network. This method proves to be statistically superior in text similarity and retrieval tasks compared to existing pooling methods. 
Overall, this paper sheds light on effective training strategies for LLM-based embedding models.","sentences":["The significant advancements of Large Language Models (LLMs) in generative tasks have led to a growing body of work exploring LLM-based embedding models.","While these models, employing different pooling and attention strategies, have achieved state-of-the-art performance on public embedding benchmarks, questions still arise about what constitutes an effective design for LLM-based embedding models.","However, these models are often trained on different datasets, using different LLM base models or training settings.","Moreover, evaluations on public embedding benchmarks often fail to report statistical significance, making it difficult to determine which designs truly contribute to final performance.","This complicates the process for practitioners seeking optimal training recipes for LLM-based embedding models.","In this study, we conduct a large-scale experiment by training a series of LLM-based embedding models using the same training data and base model but differing in their pooling and attention strategies.","The results show that there is no one-size-fits-all solution: while bidirectional attention and an additional trainable pooling layer outperform in text similarity and information retrieval tasks, they do not significantly surpass simpler designs like EOS-last token pooling and default causal attention in clustering and classification tasks.","Furthermore, we propose a new pooling strategy, Multi-Layers Trainable Pooling, which transforms the outputs of all hidden layers, rather than just the last layer, using a cross-attention network.","This method proves to be statistically superior in text similarity and retrieval tasks compared to existing pooling methods.","Overall, this paper sheds light on effective training strategies for LLM-based embedding models."],"url":"http://arxiv.org/abs/2409.02727v2"}
2 | 


--------------------------------------------------------------------------------
/data/downloads/2024-09-11-00h.jsonl:
--------------------------------------------------------------------------------
1 | {"created":"2024-09-09 16:48:09","title":"NeurLZ: On Enhancing Lossy Compression Performance based on Error-Controlled Neural Learning for Scientific Data","abstract":"Large-scale scientific simulations generate massive datasets that pose significant challenges for storage and I/O. While traditional lossy compression techniques can improve performance, balancing compression ratio, data quality, and throughput remains difficult. To address this, we propose NeurLZ, a novel cross-field learning-based and error-controlled compression framework for scientific data. By integrating skipping DNN models, cross-field learning, and error control, our framework aims to substantially enhance lossy compression performance. Our contributions are three-fold: (1) We design a lightweight skipping model to provide high-fidelity detail retention, further improving prediction accuracy. (2) We adopt a cross-field learning approach to significantly improve data prediction accuracy, resulting in a substantially improved compression ratio. (3) We develop an error control approach to provide strict error bounds according to user requirements. We evaluated NeurLZ on several real-world HPC application datasets, including Nyx (cosmological simulation), Miranda (large turbulence simulation), and Hurricane (weather simulation). 
Experiments demonstrate that our framework achieves up to a 90% relative reduction in bit rate under the same data distortion, compared to the best existing approach.","sentences":["Large-scale scientific simulations generate massive datasets that pose significant challenges for storage and I/O.","While traditional lossy compression techniques can improve performance, balancing compression ratio, data quality, and throughput remains difficult.","To address this, we propose NeurLZ, a novel cross-field learning-based and error-controlled compression framework for scientific data.","By integrating skipping DNN models, cross-field learning, and error control, our framework aims to substantially enhance lossy compression performance.","Our contributions are three-fold: (1) We design a lightweight skipping model to provide high-fidelity detail retention, further improving prediction accuracy.","(2) We adopt a cross-field learning approach to significantly improve data prediction accuracy, resulting in a substantially improved compression ratio.","(3) We develop an error control approach to provide strict error bounds according to user requirements.","We evaluated NeurLZ on several real-world HPC application datasets, including Nyx (cosmological simulation), Miranda (large turbulence simulation), and Hurricane (weather simulation).","Experiments demonstrate that our framework achieves up to a 90% relative reduction in bit rate under the same data distortion, compared to the best existing approach."],"url":"http://arxiv.org/abs/2409.05785v2"}
2 | 


--------------------------------------------------------------------------------
/data/downloads/2024-10-24-00h.jsonl:
--------------------------------------------------------------------------------
1 | {"created":"2024-10-22 17:13:38","title":"Non-myopic Generation of Language Model for Reasoning and Planning","abstract":"Large Language Models have demonstrated remarkable abilities in reasoning and planning by breaking down complex problems into sequential steps. Despite their success in various domains like mathematical problem-solving and coding, LLMs face challenges in ensuring reliable and optimal planning due to their inherent myopic nature of autoregressive decoding. This paper revisits LLM reasoning from an optimal-control perspective, proposing a novel method, Predictive-Decoding, that leverages Model Predictive Control to enhance planning accuracy. By re-weighting LLM distributions based on foresight trajectories, Predictive-Decoding aims to mitigate early errors and promote non-myopic planning. Our experiments show significant improvements in a wide range of tasks for math, coding, and agents. Furthermore, Predictive-Decoding demonstrates computational efficiency, outperforming search baselines with reduced computational resources. 
This study provides insights into optimizing LLM planning capabilities.","sentences":["Large Language Models have demonstrated remarkable abilities in reasoning and planning by breaking down complex problems into sequential steps.","Despite their success in various domains like mathematical problem-solving and coding, LLMs face challenges in ensuring reliable and optimal planning due to their inherent myopic nature of autoregressive decoding.","This paper revisits LLM reasoning from an optimal-control perspective, proposing a novel method, Predictive-Decoding, that leverages Model Predictive Control to enhance planning accuracy.","By re-weighting LLM distributions based on foresight trajectories, Predictive-Decoding aims to mitigate early errors and promote non-myopic planning.","Our experiments show significant improvements in a wide range of tasks for math, coding, and agents.","Furthermore, Predictive-Decoding demonstrates computational efficiency, outperforming search baselines with reduced computational resources.","This study provides insights into optimizing LLM planning capabilities."],"url":"http://arxiv.org/abs/2410.17195v2"}
2 | 


--------------------------------------------------------------------------------
/data/downloads/2024-10-31-00h.jsonl:
--------------------------------------------------------------------------------
1 | {"created":"2024-10-29 17:58:13","title":"Robots Pre-train Robots: Manipulation-Centric Robotic Representation from Large-Scale Robot Datasets","abstract":"The pre-training of visual representations has enhanced the efficiency of robot learning. Due to the lack of large-scale in-domain robotic datasets, prior works utilize in-the-wild human videos to pre-train robotic visual representation. Despite their promising results, representations from human videos are inevitably subject to distribution shifts and lack the dynamics information crucial for task completion. We first evaluate various pre-trained representations in terms of their correlation to the downstream robotic manipulation tasks (i.e., manipulation centricity). Interestingly, we find that the \"manipulation centricity\" is a strong indicator of success rates when applied to downstream tasks. Drawing from these findings, we propose Manipulation Centric Representation (MCR), a foundation representation learning framework capturing both visual features and the dynamics information such as actions and proprioceptions of manipulation tasks to improve manipulation centricity. Specifically, we pre-train a visual encoder on the DROID robotic dataset and leverage motion-relevant data such as robot proprioceptive states and actions. We introduce a novel contrastive loss that aligns visual observations with the robot's proprioceptive state-action dynamics, combined with a behavior cloning (BC)-like actor loss to predict actions during pre-training, along with a time contrastive loss. Empirical results across 4 simulation domains with 20 tasks verify that MCR outperforms the strongest baseline method by 14.8%. Moreover, MCR boosts the performance of data-efficient learning with a UR5e arm on 3 real-world tasks by 76.9%. 
Project website: https://robots-pretrain-robots.github.io/.","sentences":["The pre-training of visual representations has enhanced the efficiency of robot learning.","Due to the lack of large-scale in-domain robotic datasets, prior works utilize in-the-wild human videos to pre-train robotic visual representation.","Despite their promising results, representations from human videos are inevitably subject to distribution shifts and lack the dynamics information crucial for task completion.","We first evaluate various pre-trained representations in terms of their correlation to the downstream robotic manipulation tasks (i.e., manipulation centricity).","Interestingly, we find that the \"manipulation centricity\" is a strong indicator of success rates when applied to downstream tasks.","Drawing from these findings, we propose Manipulation Centric Representation (MCR), a foundation representation learning framework capturing both visual features and the dynamics information such as actions and proprioceptions of manipulation tasks to improve manipulation centricity.","Specifically, we pre-train a visual encoder on the DROID robotic dataset and leverage motion-relevant data such as robot proprioceptive states and actions.","We introduce a novel contrastive loss that aligns visual observations with the robot's proprioceptive state-action dynamics, combined with a behavior cloning (BC)-like actor loss to predict actions during pre-training, along with a time contrastive loss.","Empirical results across 4 simulation domains with 20 tasks verify that MCR outperforms the strongest baseline method by 14.8%.","Moreover, MCR boosts the performance of data-efficient learning with a UR5e arm on 3 real-world tasks by 76.9%.","Project website: https://robots-pretrain-robots.github.io/."],"url":"http://arxiv.org/abs/2410.22325v2"}
2 | 


--------------------------------------------------------------------------------
/data/downloads/2024-11-15-16h.jsonl:
--------------------------------------------------------------------------------
1 | {"created":"2024-11-14 18:44:31","title":"NeuralDEM -- Real-time Simulation of Industrial Particulate Flows","abstract":"Advancements in computing power have made it possible to numerically simulate large-scale fluid-mechanical and/or particulate systems, many of which are integral to core industrial processes. Among the different numerical methods available, the discrete element method (DEM) provides one of the most accurate representations of a wide range of physical systems involving granular and discontinuous materials. Consequently, DEM has become a widely accepted approach for tackling engineering problems connected to granular flows and powder mechanics. Additionally, DEM can be integrated with grid-based computational fluid dynamics (CFD) methods, enabling the simulation of chemical processes taking place, e.g., in fluidized beds. However, DEM is computationally intensive because of the intrinsic multiscale nature of particulate systems, restricting simulation duration or number of particles. Towards this end, NeuralDEM presents an end-to-end approach to replace slow numerical DEM routines with fast, adaptable deep learning surrogates. NeuralDEM is capable of picturing long-term transport processes across different regimes using macroscopic observables without any reference to microscopic model parameters. First, NeuralDEM treats the Lagrangian discretization of DEM as an underlying continuous field, while simultaneously modeling macroscopic behavior directly as additional auxiliary fields. Second, NeuralDEM introduces multi-branch neural operators scalable to real-time modeling of industrially-sized scenarios - from slow and pseudo-steady to fast and transient. Such scenarios have previously posed insurmountable challenges for deep learning models. Notably, NeuralDEM faithfully models coupled CFD-DEM fluidized bed reactors of 160k CFD cells and 500k DEM particles for trajectories of 28s. 
NeuralDEM will open many new doors to advanced engineering and much faster process cycles.","sentences":["Advancements in computing power have made it possible to numerically simulate large-scale fluid-mechanical and/or particulate systems, many of which are integral to core industrial processes.","Among the different numerical methods available, the discrete element method (DEM) provides one of the most accurate representations of a wide range of physical systems involving granular and discontinuous materials.","Consequently, DEM has become a widely accepted approach for tackling engineering problems connected to granular flows and powder mechanics.","Additionally, DEM can be integrated with grid-based computational fluid dynamics (CFD) methods, enabling the simulation of chemical processes taking place, e.g., in fluidized beds.","However, DEM is computationally intensive because of the intrinsic multiscale nature of particulate systems, restricting simulation duration or number of particles.","Towards this end, NeuralDEM presents an end-to-end approach to replace slow numerical DEM routines with fast, adaptable deep learning surrogates.","NeuralDEM is capable of picturing long-term transport processes across different regimes using macroscopic observables without any reference to microscopic model parameters.","First, NeuralDEM treats the Lagrangian discretization of DEM as an underlying continuous field, while simultaneously modeling macroscopic behavior directly as additional auxiliary fields.","Second, NeuralDEM introduces multi-branch neural operators scalable to real-time modeling of industrially-sized scenarios - from slow and pseudo-steady to fast and transient.","Such scenarios have previously posed insurmountable challenges for deep learning models.","Notably, NeuralDEM faithfully models coupled CFD-DEM fluidized bed reactors of 160k CFD cells and 500k DEM particles for trajectories of 28s.","NeuralDEM will open many new doors to advanced engineering and 
much faster process cycles."],"url":"http://arxiv.org/abs/2411.09678v1"}
2 | 


--------------------------------------------------------------------------------
/data/downloads/2024-11-16-08h.jsonl:
--------------------------------------------------------------------------------
1 | {"created":"2024-11-14 18:44:31","title":"NeuralDEM -- Real-time Simulation of Industrial Particulate Flows","abstract":"Advancements in computing power have made it possible to numerically simulate large-scale fluid-mechanical and/or particulate systems, many of which are integral to core industrial processes. Among the different numerical methods available, the discrete element method (DEM) provides one of the most accurate representations of a wide range of physical systems involving granular and discontinuous materials. Consequently, DEM has become a widely accepted approach for tackling engineering problems connected to granular flows and powder mechanics. Additionally, DEM can be integrated with grid-based computational fluid dynamics (CFD) methods, enabling the simulation of chemical processes taking place, e.g., in fluidized beds. However, DEM is computationally intensive because of the intrinsic multiscale nature of particulate systems, restricting simulation duration or number of particles. Towards this end, NeuralDEM presents an end-to-end approach to replace slow numerical DEM routines with fast, adaptable deep learning surrogates. NeuralDEM is capable of picturing long-term transport processes across different regimes using macroscopic observables without any reference to microscopic model parameters. First, NeuralDEM treats the Lagrangian discretization of DEM as an underlying continuous field, while simultaneously modeling macroscopic behavior directly as additional auxiliary fields. Second, NeuralDEM introduces multi-branch neural operators scalable to real-time modeling of industrially-sized scenarios - from slow and pseudo-steady to fast and transient. Such scenarios have previously posed insurmountable challenges for deep learning models. Notably, NeuralDEM faithfully models coupled CFD-DEM fluidized bed reactors of 160k CFD cells and 500k DEM particles for trajectories of 28s. 
NeuralDEM will open many new doors to advanced engineering and much faster process cycles.","sentences":["Advancements in computing power have made it possible to numerically simulate large-scale fluid-mechanical and/or particulate systems, many of which are integral to core industrial processes.","Among the different numerical methods available, the discrete element method (DEM) provides one of the most accurate representations of a wide range of physical systems involving granular and discontinuous materials.","Consequently, DEM has become a widely accepted approach for tackling engineering problems connected to granular flows and powder mechanics.","Additionally, DEM can be integrated with grid-based computational fluid dynamics (CFD) methods, enabling the simulation of chemical processes taking place, e.g., in fluidized beds.","However, DEM is computationally intensive because of the intrinsic multiscale nature of particulate systems, restricting simulation duration or number of particles.","Towards this end, NeuralDEM presents an end-to-end approach to replace slow numerical DEM routines with fast, adaptable deep learning surrogates.","NeuralDEM is capable of picturing long-term transport processes across different regimes using macroscopic observables without any reference to microscopic model parameters.","First, NeuralDEM treats the Lagrangian discretization of DEM as an underlying continuous field, while simultaneously modeling macroscopic behavior directly as additional auxiliary fields.","Second, NeuralDEM introduces multi-branch neural operators scalable to real-time modeling of industrially-sized scenarios - from slow and pseudo-steady to fast and transient.","Such scenarios have previously posed insurmountable challenges for deep learning models.","Notably, NeuralDEM faithfully models coupled CFD-DEM fluidized bed reactors of 160k CFD cells and 500k DEM particles for trajectories of 28s.","NeuralDEM will open many new doors to advanced engineering and 
much faster process cycles."],"url":"http://arxiv.org/abs/2411.09678v1"}
2 | 


--------------------------------------------------------------------------------
/data/downloads/2024-11-17-00h.jsonl:
--------------------------------------------------------------------------------
1 | {"created":"2024-11-14 18:44:31","title":"NeuralDEM -- Real-time Simulation of Industrial Particulate Flows","abstract":"Advancements in computing power have made it possible to numerically simulate large-scale fluid-mechanical and/or particulate systems, many of which are integral to core industrial processes. Among the different numerical methods available, the discrete element method (DEM) provides one of the most accurate representations of a wide range of physical systems involving granular and discontinuous materials. Consequently, DEM has become a widely accepted approach for tackling engineering problems connected to granular flows and powder mechanics. Additionally, DEM can be integrated with grid-based computational fluid dynamics (CFD) methods, enabling the simulation of chemical processes taking place, e.g., in fluidized beds. However, DEM is computationally intensive because of the intrinsic multiscale nature of particulate systems, restricting simulation duration or number of particles. Towards this end, NeuralDEM presents an end-to-end approach to replace slow numerical DEM routines with fast, adaptable deep learning surrogates. NeuralDEM is capable of picturing long-term transport processes across different regimes using macroscopic observables without any reference to microscopic model parameters. First, NeuralDEM treats the Lagrangian discretization of DEM as an underlying continuous field, while simultaneously modeling macroscopic behavior directly as additional auxiliary fields. Second, NeuralDEM introduces multi-branch neural operators scalable to real-time modeling of industrially-sized scenarios - from slow and pseudo-steady to fast and transient. Such scenarios have previously posed insurmountable challenges for deep learning models. Notably, NeuralDEM faithfully models coupled CFD-DEM fluidized bed reactors of 160k CFD cells and 500k DEM particles for trajectories of 28s. 
NeuralDEM will open many new doors to advanced engineering and much faster process cycles.","sentences":["Advancements in computing power have made it possible to numerically simulate large-scale fluid-mechanical and/or particulate systems, many of which are integral to core industrial processes.","Among the different numerical methods available, the discrete element method (DEM) provides one of the most accurate representations of a wide range of physical systems involving granular and discontinuous materials.","Consequently, DEM has become a widely accepted approach for tackling engineering problems connected to granular flows and powder mechanics.","Additionally, DEM can be integrated with grid-based computational fluid dynamics (CFD) methods, enabling the simulation of chemical processes taking place, e.g., in fluidized beds.","However, DEM is computationally intensive because of the intrinsic multiscale nature of particulate systems, restricting simulation duration or number of particles.","Towards this end, NeuralDEM presents an end-to-end approach to replace slow numerical DEM routines with fast, adaptable deep learning surrogates.","NeuralDEM is capable of picturing long-term transport processes across different regimes using macroscopic observables without any reference to microscopic model parameters.","First, NeuralDEM treats the Lagrangian discretization of DEM as an underlying continuous field, while simultaneously modeling macroscopic behavior directly as additional auxiliary fields.","Second, NeuralDEM introduces multi-branch neural operators scalable to real-time modeling of industrially-sized scenarios - from slow and pseudo-steady to fast and transient.","Such scenarios have previously posed insurmountable challenges for deep learning models.","Notably, NeuralDEM faithfully models coupled CFD-DEM fluidized bed reactors of 160k CFD cells and 500k DEM particles for trajectories of 28s.","NeuralDEM will open many new doors to advanced engineering and 
much faster process cycles."],"url":"http://arxiv.org/abs/2411.09678v1"}
2 | 


--------------------------------------------------------------------------------
/data/downloads/2025-01-10-00h.jsonl:
--------------------------------------------------------------------------------
1 | {"created":"2025-01-08 18:42:48","title":"Towards System 2 Reasoning in LLMs: Learning How to Think With Meta Chain-of-Thought","abstract":"We propose a novel framework, Meta Chain-of-Thought (Meta-CoT), which extends traditional Chain-of-Thought (CoT) by explicitly modeling the underlying reasoning required to arrive at a particular CoT. We present empirical evidence from state-of-the-art models exhibiting behaviors consistent with in-context search, and explore methods for producing Meta-CoT via process supervision, synthetic data generation, and search algorithms. Finally, we outline a concrete pipeline for training a model to produce Meta-CoTs, incorporating instruction tuning with linearized search traces and reinforcement learning post-training. Finally, we discuss open research questions, including scaling laws, verifier roles, and the potential for discovering novel reasoning algorithms. This work provides a theoretical and practical roadmap to enable Meta-CoT in LLMs, paving the way for more powerful and human-like reasoning in artificial intelligence.","sentences":["We propose a novel framework, Meta Chain-of-Thought (Meta-CoT), which extends traditional Chain-of-Thought (CoT) by explicitly modeling the underlying reasoning required to arrive at a particular CoT. 
We present empirical evidence from state-of-the-art models exhibiting behaviors consistent with in-context search, and explore methods for producing Meta-CoT via process supervision, synthetic data generation, and search algorithms.","Finally, we outline a concrete pipeline for training a model to produce Meta-CoTs, incorporating instruction tuning with linearized search traces and reinforcement learning post-training.","Finally, we discuss open research questions, including scaling laws, verifier roles, and the potential for discovering novel reasoning algorithms.","This work provides a theoretical and practical roadmap to enable Meta-CoT in LLMs, paving the way for more powerful and human-like reasoning in artificial intelligence."],"url":"http://arxiv.org/abs/2501.04682v1"}
2 | 


--------------------------------------------------------------------------------
/data/downloads/2025-04-04-00h.jsonl:
--------------------------------------------------------------------------------
1 | {"created":"2025-04-02 17:59:38","title":"Toward Real-world BEV Perception: Depth Uncertainty Estimation via Gaussian Splatting","abstract":"Bird's-eye view (BEV) perception has gained significant attention because it provides a unified representation to fuse multiple view images and enables a wide range of down-stream autonomous driving tasks, such as forecasting and planning. Recent state-of-the-art models utilize projection-based methods which formulate BEV perception as query learning to bypass explicit depth estimation. While we observe promising advancements in this paradigm, they still fall short of real-world applications because of the lack of uncertainty modeling and expensive computational requirement. In this work, we introduce GaussianLSS, a novel uncertainty-aware BEV perception framework that revisits unprojection-based methods, specifically the Lift-Splat-Shoot (LSS) paradigm, and enhances them with depth un-certainty modeling. GaussianLSS represents spatial dispersion by learning a soft depth mean and computing the variance of the depth distribution, which implicitly captures object extents. We then transform the depth distribution into 3D Gaussians and rasterize them to construct uncertainty-aware BEV features. We evaluate GaussianLSS on the nuScenes dataset, achieving state-of-the-art performance compared to unprojection-based methods. 
In particular, it provides significant advantages in speed, running 2.5x faster, and in memory efficiency, using 0.3x less memory compared to projection-based methods, while achieving competitive performance with only a 0.4% IoU difference.","sentences":["Bird's-eye view (BEV) perception has gained significant attention because it provides a unified representation to fuse multiple view images and enables a wide range of down-stream autonomous driving tasks, such as forecasting and planning.","Recent state-of-the-art models utilize projection-based methods which formulate BEV perception as query learning to bypass explicit depth estimation.","While we observe promising advancements in this paradigm, they still fall short of real-world applications because of the lack of uncertainty modeling and expensive computational requirement.","In this work, we introduce GaussianLSS, a novel uncertainty-aware BEV perception framework that revisits unprojection-based methods, specifically the Lift-Splat-Shoot (LSS) paradigm, and enhances them with depth un-certainty modeling.","GaussianLSS represents spatial dispersion by learning a soft depth mean and computing the variance of the depth distribution, which implicitly captures object extents.","We then transform the depth distribution into 3D Gaussians and rasterize them to construct uncertainty-aware BEV features.","We evaluate GaussianLSS on the nuScenes dataset, achieving state-of-the-art performance compared to unprojection-based methods.","In particular, it provides significant advantages in speed, running 2.5x faster, and in memory efficiency, using 0.3x less memory compared to projection-based methods, while achieving competitive performance with only a 0.4% IoU difference."],"url":"http://arxiv.org/abs/2504.01957v2"}
2 | 


--------------------------------------------------------------------------------
/data/downloads/2025-05-07-00h.jsonl:
--------------------------------------------------------------------------------
1 | {"created":"2025-05-05 14:57:16","title":"VGLD: Visually-Guided Linguistic Disambiguation for Monocular Depth Scale Recovery","abstract":"We propose a robust method for monocular depth scale recovery. Monocular depth estimation can be divided into two main directions: (1) relative depth estimation, which provides normalized or inverse depth without scale information, and (2) metric depth estimation, which involves recovering depth with absolute scale. To obtain absolute scale information for practical downstream tasks, utilizing textual information to recover the scale of a relative depth map is a highly promising approach. However, since a single image can have multiple descriptions from different perspectives or with varying styles, it has been shown that different textual descriptions can significantly affect the scale recovery process. To address this issue, our method, VGLD, stabilizes the influence of textual information by incorporating high-level semantic information from the corresponding image alongside the textual description. This approach resolves textual ambiguities and robustly outputs a set of linear transformation parameters (scalars) that can be globally applied to the relative depth map, ultimately generating depth predictions with metric-scale accuracy. We validate our method across several popular relative depth models(MiDas, DepthAnything), using both indoor scenes (NYUv2) and outdoor scenes (KITTI). Our results demonstrate that VGLD functions as a universal alignment module when trained on multiple datasets, achieving strong performance even in zero-shot scenarios. 
Code is available at: https://github.com/pakinwu/VGLD.","sentences":["We propose a robust method for monocular depth scale recovery.","Monocular depth estimation can be divided into two main directions: (1) relative depth estimation, which provides normalized or inverse depth without scale information, and (2) metric depth estimation, which involves recovering depth with absolute scale.","To obtain absolute scale information for practical downstream tasks, utilizing textual information to recover the scale of a relative depth map is a highly promising approach.","However, since a single image can have multiple descriptions from different perspectives or with varying styles, it has been shown that different textual descriptions can significantly affect the scale recovery process.","To address this issue, our method, VGLD, stabilizes the influence of textual information by incorporating high-level semantic information from the corresponding image alongside the textual description.","This approach resolves textual ambiguities and robustly outputs a set of linear transformation parameters (scalars) that can be globally applied to the relative depth map, ultimately generating depth predictions with metric-scale accuracy.","We validate our method across several popular relative depth models(MiDas, DepthAnything), using both indoor scenes (NYUv2) and outdoor scenes (KITTI).","Our results demonstrate that VGLD functions as a universal alignment module when trained on multiple datasets, achieving strong performance even in zero-shot scenarios.","Code is available at: https://github.com/pakinwu/VGLD."],"url":"http://arxiv.org/abs/2505.02704v2"}
2 | 


--------------------------------------------------------------------------------
/data/downloads/2025-05-16-00h.jsonl:
--------------------------------------------------------------------------------
1 | {"created":"2025-05-14 14:37:32","title":"Establishing Linear Surrogate Regret Bounds for Convex Smooth Losses via Convolutional Fenchel-Young Losses","abstract":"Surrogate regret bounds, also known as excess risk bounds, bridge the gap between the convergence rates of surrogate and target losses, with linear bounds favorable for their lossless regret transfer. While convex smooth surrogate losses are appealing in particular due to the efficient estimation and optimization, the existence of a trade-off between the smoothness and linear regret bound has been believed in the community. That being said, the better optimization and estimation properties of convex smooth surrogate losses may inevitably deteriorate after undergoing the regret transfer onto a target loss. We overcome this dilemma for arbitrary discrete target losses by constructing a convex smooth surrogate loss, which entails a linear surrogate regret bound composed with a tailored prediction link. The construction is based on Fenchel-Young losses generated by the convolutional negentropy, which are equivalent to the infimal convolution of a generalized negentropy and the target Bayes risk. Consequently, the infimal convolution enables us to derive a smooth loss while maintaining the surrogate regret bound linear. We additionally benefit from the infimal convolution to have a consistent estimator of the underlying class probability. 
Our results are overall a novel demonstration of how convex analysis penetrates into optimization and statistical efficiency in risk minimization.","sentences":["Surrogate regret bounds, also known as excess risk bounds, bridge the gap between the convergence rates of surrogate and target losses, with linear bounds favorable for their lossless regret transfer.","While convex smooth surrogate losses are appealing in particular due to the efficient estimation and optimization, the existence of a trade-off between the smoothness and linear regret bound has been believed in the community.","That being said, the better optimization and estimation properties of convex smooth surrogate losses may inevitably deteriorate after undergoing the regret transfer onto a target loss.","We overcome this dilemma for arbitrary discrete target losses by constructing a convex smooth surrogate loss, which entails a linear surrogate regret bound composed with a tailored prediction link.","The construction is based on Fenchel-Young losses generated by the convolutional negentropy, which are equivalent to the infimal convolution of a generalized negentropy and the target Bayes risk.","Consequently, the infimal convolution enables us to derive a smooth loss while maintaining the surrogate regret bound linear.","We additionally benefit from the infimal convolution to have a consistent estimator of the underlying class probability.","Our results are overall a novel demonstration of how convex analysis penetrates into optimization and statistical efficiency in risk minimization."],"url":"http://arxiv.org/abs/2505.09432v2"}
2 | 


--------------------------------------------------------------------------------
/data/downloads/2025-05-21-00h.jsonl:
--------------------------------------------------------------------------------
1 | {"created":"2025-05-19 16:06:13","title":"Rank, Chunk and Expand: Lineage-Oriented Reasoning for Taxonomy Expansion","abstract":"Taxonomies are hierarchical knowledge graphs crucial for recommendation systems, and web applications. As data grows, expanding taxonomies is essential, but existing methods face key challenges: (1) discriminative models struggle with representation limits and generalization, while (2) generative methods either process all candidates at once, introducing noise and exceeding context limits, or discard relevant entities by selecting noisy candidates. We propose LORex ($\\textbf{L}$ineage-$\\textbf{O}$riented $\\textbf{Re}$asoning for Taxonomy E$\\textbf{x}$pansion), a plug-and-play framework that combines discriminative ranking and generative reasoning for efficient taxonomy expansion. Unlike prior methods, LORex ranks and chunks candidate terms into batches, filtering noise and iteratively refining selections by reasoning candidates' hierarchy to ensure contextual efficiency. 
Extensive experiments across four benchmarks and twelve baselines show that LORex improves accuracy by 12% and Wu & Palmer similarity by 5% over state-of-the-art methods.","sentences":["Taxonomies are hierarchical knowledge graphs crucial for recommendation systems, and web applications.","As data grows, expanding taxonomies is essential, but existing methods face key challenges: (1) discriminative models struggle with representation limits and generalization, while (2) generative methods either process all candidates at once, introducing noise and exceeding context limits, or discard relevant entities by selecting noisy candidates.","We propose LORex ($\\textbf{L}$ineage-$\\textbf{O}$riented $\\textbf{Re}$asoning for Taxonomy E$\\textbf{x}$pansion), a plug-and-play framework that combines discriminative ranking and generative reasoning for efficient taxonomy expansion.","Unlike prior methods, LORex ranks and chunks candidate terms into batches, filtering noise and iteratively refining selections by reasoning candidates' hierarchy to ensure contextual efficiency.","Extensive experiments across four benchmarks and twelve baselines show that LORex improves accuracy by 12% and Wu & Palmer similarity by 5% over state-of-the-art methods."],"url":"http://arxiv.org/abs/2505.13282v2"}
2 | 


--------------------------------------------------------------------------------
/data/downloads/2025-05-22-00h.jsonl:
--------------------------------------------------------------------------------
1 | {"created":"2025-05-20 15:44:54","title":"Sparc3D: Sparse Representation and Construction for High-Resolution 3D Shapes Modeling","abstract":"High-fidelity 3D object synthesis remains significantly more challenging than 2D image generation due to the unstructured nature of mesh data and the cubic complexity of dense volumetric grids. Existing two-stage pipelines-compressing meshes with a VAE (using either 2D or 3D supervision), followed by latent diffusion sampling-often suffer from severe detail loss caused by inefficient representations and modality mismatches introduced in VAE. We introduce Sparc3D, a unified framework that combines a sparse deformable marching cubes representation Sparcubes with a novel encoder Sparconv-VAE. Sparcubes converts raw meshes into high-resolution ($1024^3$) surfaces with arbitrary topology by scattering signed distance and deformation fields onto a sparse cube, allowing differentiable optimization. Sparconv-VAE is the first modality-consistent variational autoencoder built entirely upon sparse convolutional networks, enabling efficient and near-lossless 3D reconstruction suitable for high-resolution generative modeling through latent diffusion. Sparc3D achieves state-of-the-art reconstruction fidelity on challenging inputs, including open surfaces, disconnected components, and intricate geometry. 
It preserves fine-grained shape details, reduces training and inference cost, and integrates naturally with latent diffusion models for scalable, high-resolution 3D generation.","sentences":["High-fidelity 3D object synthesis remains significantly more challenging than 2D image generation due to the unstructured nature of mesh data and the cubic complexity of dense volumetric grids.","Existing two-stage pipelines-compressing meshes with a VAE (using either 2D or 3D supervision), followed by latent diffusion sampling-often suffer from severe detail loss caused by inefficient representations and modality mismatches introduced in VAE.","We introduce Sparc3D, a unified framework that combines a sparse deformable marching cubes representation Sparcubes with a novel encoder Sparconv-VAE.","Sparcubes converts raw meshes into high-resolution ($1024^3$) surfaces with arbitrary topology by scattering signed distance and deformation fields onto a sparse cube, allowing differentiable optimization.","Sparconv-VAE is the first modality-consistent variational autoencoder built entirely upon sparse convolutional networks, enabling efficient and near-lossless 3D reconstruction suitable for high-resolution generative modeling through latent diffusion.","Sparc3D achieves state-of-the-art reconstruction fidelity on challenging inputs, including open surfaces, disconnected components, and intricate geometry.","It preserves fine-grained shape details, reduces training and inference cost, and integrates naturally with latent diffusion models for scalable, high-resolution 3D generation."],"url":"http://arxiv.org/abs/2505.14521v2"}
2 | 


--------------------------------------------------------------------------------
/data/downloads/2025-05-23-00h.jsonl:
--------------------------------------------------------------------------------
1 | {"created":"2025-05-21 16:11:31","title":"An Efficient Data Structure and Algorithm for Long-Match Query in Run-Length Compressed BWT","abstract":"In this paper, we describe a new type of match between a pattern and a text that aren't necessarily maximal in the query, but still contain useful matching information: locally maximal exact matches (LEMs). There are usually a large amount of LEMs, so we only consider those above some length threshold $\\mathcal{L}$. These are referred to as long LEMs. The purpose of long LEMs is to capture substring matches between a query and a text that are not necessarily maximal in the pattern but still long enough to be important. Therefore efficient long LEMs finding algorithms are desired for these datasets. However, these datasets are too large to query on traditional string indexes. Fortunately, these datasets are very repetitive. Recently, compressed string indexes that take advantage of the redundancy in the data but retain efficient querying capability have been proposed as a solution. We therefore give an efficient algorithm for computing all the long LEMs of a query and a text in a BWT runs compressed string index. We describe an $O(m+occ)$ expected time algorithm that relies on an $O(r)$ words space string index for outputting all long LEMs of a pattern with respect to a text given the matching statistics of the pattern with respect to the text. Here $m$ is the length of the query, $occ$ is the number of long LEMs outputted, and $r$ is the number of runs in the BWT of the text. The $O(r)$ space string index we describe relies on an adaptation of the move data structure by Nishimoto and Tabei. We are able to support $LCP[i]$ queries in constant time given $SA[i]$. In other words, we answer $PLCP[i]$ queries in constant time. Long LEMs may provide useful similarity information between a pattern and a text that MEMs may ignore. 
This information is particularly useful in pangenome and biobank scale haplotype panel contexts.","sentences":["In this paper, we describe a new type of match between a pattern and a text that aren't necessarily maximal in the query, but still contain useful matching information: locally maximal exact matches (LEMs).","There are usually a large amount of LEMs, so we only consider those above some length threshold $\\mathcal{L}$. These are referred to as long LEMs.","The purpose of long LEMs is to capture substring matches between a query and a text that are not necessarily maximal in the pattern but still long enough to be important.","Therefore efficient long LEMs finding algorithms are desired for these datasets.","However, these datasets are too large to query on traditional string indexes.","Fortunately, these datasets are very repetitive.","Recently, compressed string indexes that take advantage of the redundancy in the data but retain efficient querying capability have been proposed as a solution.","We therefore give an efficient algorithm for computing all the long LEMs of a query and a text in a BWT runs compressed string index.","We describe an $O(m+occ)$ expected time algorithm that relies on an $O(r)$ words space string index for outputting all long LEMs of a pattern with respect to a text given the matching statistics of the pattern with respect to the text.","Here $m$ is the length of the query, $occ$ is the number of long LEMs outputted, and $r$ is the number of runs in the BWT of the text.","The $O(r)$ space string index we describe relies on an adaptation of the move data structure by Nishimoto and Tabei.","We are able to support $LCP[i]$ queries in constant time given $SA[i]$. 
In other words, we answer $PLCP[i]$ queries in constant time.","Long LEMs may provide useful similarity information between a pattern and a text that MEMs may ignore.","This information is particularly useful in pangenome and biobank scale haplotype panel contexts."],"url":"http://arxiv.org/abs/2505.15698v1"}
2 | 


--------------------------------------------------------------------------------
/data/downloads/2025-05-29-00h.jsonl:
--------------------------------------------------------------------------------
1 | {"created":"2025-05-27 15:48:17","title":"Towards Robust Automated Perceptual Voice Quality Assessment with Speech Foundation Models","abstract":"Perceptual voice quality assessment is essential for diagnosing and monitoring voice disorders. Traditionally, expert raters use scales such as the CAPE-V and GRBAS. However, these are subjective and prone to inter-rater variability, motivating the need for automated, objective assessment methods. This study proposes VOQANet, a deep learning framework with an attention mechanism that leverages a Speech Foundation Model (SFM) to extract high-level acoustic and prosodic information from raw speech. To improve robustness and interpretability, we introduce VOQANet+, which integrates handcrafted acoustic features such as jitter, shimmer, and harmonics-to-noise ratio (HNR) with SFM embeddings into a hybrid representation. Unlike prior work focusing only on vowel-based phonation (PVQD-A subset) from the Perceptual Voice Quality Dataset (PVQD), we evaluate our models on both vowel-based and sentence-level speech (PVQD-S subset) for better generalizability. Results show that sentence-based input outperforms vowel-based input, particularly at the patient level, highlighting the benefit of longer utterances for capturing voice attributes. VOQANet consistently surpasses baseline methods in root mean squared error and Pearson correlation across CAPE-V and GRBAS dimensions, with VOQANet+ achieving further improvements. Additional tests under noisy conditions show that VOQANet+ maintains high prediction accuracy, supporting its use in real-world and telehealth settings. 
These findings demonstrate the value of combining SFM embeddings with domain-informed acoustic features for interpretable and robust voice quality assessment.","sentences":["Perceptual voice quality assessment is essential for diagnosing and monitoring voice disorders.","Traditionally, expert raters use scales such as the CAPE-V and GRBAS.","However, these are subjective and prone to inter-rater variability, motivating the need for automated, objective assessment methods.","This study proposes VOQANet, a deep learning framework with an attention mechanism that leverages a Speech Foundation Model (SFM) to extract high-level acoustic and prosodic information from raw speech.","To improve robustness and interpretability, we introduce VOQANet+, which integrates handcrafted acoustic features such as jitter, shimmer, and harmonics-to-noise ratio (HNR) with SFM embeddings into a hybrid representation.","Unlike prior work focusing only on vowel-based phonation (PVQD-A subset) from the Perceptual Voice Quality Dataset (PVQD), we evaluate our models on both vowel-based and sentence-level speech (PVQD-S subset) for better generalizability.","Results show that sentence-based input outperforms vowel-based input, particularly at the patient level, highlighting the benefit of longer utterances for capturing voice attributes.","VOQANet consistently surpasses baseline methods in root mean squared error and Pearson correlation across CAPE-V and GRBAS dimensions, with VOQANet+ achieving further improvements.","Additional tests under noisy conditions show that VOQANet+ maintains high prediction accuracy, supporting its use in real-world and telehealth settings.","These findings demonstrate the value of combining SFM embeddings with domain-informed acoustic features for interpretable and robust voice quality assessment."],"url":"http://arxiv.org/abs/2505.21356v2"}
2 | 


--------------------------------------------------------------------------------
/data/downloads/2025-06-05-00h.jsonl:
--------------------------------------------------------------------------------
1 | {"created":"2025-06-03 15:00:18","title":"PC-MoE: Memory-Efficient and Privacy-Preserving Collaborative Training for Mixture-of-Experts LLMs","abstract":"Mixture-of-Experts (MoE) has been gaining popularity due to its successful adaptation to large language models (LLMs). In this work, we introduce Privacy-preserving Collaborative Mixture-of-Experts (PC-MoE), which leverages the sparsity of the MoE architecture for memory-efficient decentralized collaborative LLM training, enabling multiple parties with limited GPU-memory and data resources to collectively train more capable LLMs than they could achieve individually. At the same time, this approach protects training data privacy of each participant by keeping training data, as well as parts of the forward pass signal and gradients locally within each party. By design, PC-MoE synergistically combines the strengths of distributed computation with strong confidentiality assurances. Unlike most privacy-preserving schemes, which pay for confidentiality with lower task accuracy, our framework breaks that trade-off: across seven popular LLM benchmarks, it almost matches (and sometimes exceeds) the performance and convergence rate of a fully centralized model, enjoys near 70% peak GPU RAM reduction, while being fully robust against reconstruction attacks.","sentences":["Mixture-of-Experts (MoE) has been gaining popularity due to its successful adaptation to large language models (LLMs).","In this work, we introduce Privacy-preserving Collaborative Mixture-of-Experts (PC-MoE), which leverages the sparsity of the MoE architecture for memory-efficient decentralized collaborative LLM training, enabling multiple parties with limited GPU-memory and data resources to collectively train more capable LLMs than they could achieve individually.","At the same time, this approach protects training data privacy of each participant by keeping training data, as well as parts of the forward pass signal and gradients locally within each 
party.","By design, PC-MoE synergistically combines the strengths of distributed computation with strong confidentiality assurances.","Unlike most privacy-preserving schemes, which pay for confidentiality with lower task accuracy, our framework breaks that trade-off: across seven popular LLM benchmarks, it almost matches (and sometimes exceeds) the performance and convergence rate of a fully centralized model, enjoys near 70% peak GPU RAM reduction, while being fully robust against reconstruction attacks."],"url":"http://arxiv.org/abs/2506.02965v2"}
2 | 


--------------------------------------------------------------------------------
/data/downloads/2025-06-06-00h.jsonl:
--------------------------------------------------------------------------------
1 | {"created":"2025-06-04 15:49:53","title":"Complexity and Manipulation of International Kidney Exchange Programmes with Country-Specific Parameters","abstract":"Kidney Exchange Programmes (KEPs) facilitate the exchange of kidneys, and larger pools of recipient-donor pairs tend to yield proportionally more transplants, leading to the proposal of international KEPs (IKEPs). However, as studied by \\citet{mincu2021ip}, practical limitations must be considered in IKEPs to ensure that countries remain willing to participate. Thus, we study IKEPs with country-specific parameters, represented by a tuple $\\Gamma$, restricting the selected transplants to be feasible for the countries to conduct, e.g., imposing an upper limit on the number of consecutive exchanges within a country's borders. We provide a complete complexity dichotomy for the problem of finding a feasible (according to the constraints given by $\\Gamma$) cycle packing with the maximum number of transplants, for every possible $\\Gamma$. We also study the potential for countries to misreport their parameters to increase their allocation. As manipulation can harm the total number of transplants, we propose a novel individually rational and incentive compatible mechanism $\\mathcal{M}_{\\text{order}}$. We first give a theoretical approximation ratio for $\\mathcal{M}_{\\text{order}}$ in terms of the number of transplants, and show that the approximation ratio of $\\mathcal{M}_{\\text{order}}$ is asymptotically optimal. 
We then use simulations which suggest that, in practice, the performance of $\\mathcal{M}_{\\text{order}}$ is significantly better than this worst-case ratio.","sentences":["Kidney Exchange Programmes (KEPs) facilitate the exchange of kidneys, and larger pools of recipient-donor pairs tend to yield proportionally more transplants, leading to the proposal of international KEPs (IKEPs).","However, as studied by \\citet{mincu2021ip}, practical limitations must be considered in IKEPs to ensure that countries remain willing to participate.","Thus, we study IKEPs with country-specific parameters, represented by a tuple $\\Gamma$, restricting the selected transplants to be feasible for the countries to conduct, e.g., imposing an upper limit on the number of consecutive exchanges within a country's borders.","We provide a complete complexity dichotomy for the problem of finding a feasible (according to the constraints given by $\\Gamma$) cycle packing with the maximum number of transplants, for every possible $\\Gamma$. We also study the potential for countries to misreport their parameters to increase their allocation.","As manipulation can harm the total number of transplants, we propose a novel individually rational and incentive compatible mechanism $\\mathcal{M}_{\\text{order}}$. We first give a theoretical approximation ratio for $\\mathcal{M}_{\\text{order}}$ in terms of the number of transplants, and show that the approximation ratio of $\\mathcal{M}_{\\text{order}}$ is asymptotically optimal.","We then use simulations which suggest that, in practice, the performance of $\\mathcal{M}_{\\text{order}}$ is significantly better than this worst-case ratio."],"url":"http://arxiv.org/abs/2506.04092v2"}
2 | 


--------------------------------------------------------------------------------
/frontpage/__init__.py:
--------------------------------------------------------------------------------
  1 | # from tqdm.rich import tqdm
  2 | # import jinja2 
  3 | # import datetime as dt 
  4 | # import json 
  5 | # import itertools as it
  6 | # from pathlib import Path
  7 | # from functools import cached_property
  8 | 
  9 | # import srsly
 10 | # import questionary
 11 | # from lunr import lunr
 12 | # from wasabi import Printer
 13 | # from lazylines import LazyLines, read_jsonl
 14 | # from embetter.utils import cached
 15 | # import warnings
 16 | # from tqdm import TqdmExperimentalWarning
 17 | 
 18 | # from .pipeline import dedup_stream, add_rownum, attach_docs
 19 | # from .datastream import DataStream
 20 | 
 21 | # warnings.filterwarnings("ignore", category=TqdmExperimentalWarning)
 22 | 
 23 | # msg = Printer()
 24 | 
 25 | # TRAINED_FOLDER_FOLDER = "training"
 26 | # TEMPLATE_PATH = "templates/home.html"
 27 | # CONFIG_FILE = "config.yml"
 28 | # DATA_LEVELS = ["sentence", "abstract"]
 29 | 
 30 | 
 31 | # class Frontpage:
 32 | #     """This is the main object that contains all project verbs."""
 33 | 
 34 | #     def __init__(self):
 35 | #         self.config = srsly.read_yaml(CONFIG_FILE)
 36 | #         self.sections = self.config["sections"]
 37 | #         self.labels = [s["label"] for s in self.config["sections"]]
 38 | #         self.datastream = DataStream()
 39 | 
 40 | #     @cached_property
 41 | #     def encoder(self):
 42 | #         from embetter.text import SentenceEncoder
 43 | #         encoder = SentenceEncoder()
 44 | #         encoder = cached(f"cache/{str(type(encoder))}", encoder)
 45 | #         return encoder
 46 | 
 47 | #     @cached_property
 48 | #     def nlp(self):
 49 | #         import spacy
 50 | #         return spacy.load("en_core_web_sm", disable=["ner", "lemmatizer", "tagger"])
 51 | 
 52 | #     @cached_property
 53 | #     def model(self):
 54 | #         from ._model import SentenceModel
 55 | #         return SentenceModel.from_disk(TRAINED_FOLDER_FOLDER, encoder=self.encoder)
 56 | 
 57 | #     def _dataset_name(self, label:str, view:str) -> str:
 58 | #         return f"{view}-{label}"
 59 |     
 60 | #     @property
 61 | #     def _annotation_views(self):
 62 | #         return ["sentence", "abstract"]
 63 | 
 64 | #     def content_stream(self, view:str):
 65 | #         # Fetch all downloaded files, make sure most recent ones come first
 66 | #         glob = reversed(list(Path("downloads").glob("**/*.jsonl")))
 67 | #         # Make lazy generator for all the items
 68 | #         stream = it.chain(*list(srsly.read_jsonl(file) for file in glob))
 69 | #         # Generate two streams lazily
 70 | #         abstract_stream = ({"text": ex["abstract"], "meta": {"url": ex["url"], "title": ex["title"], "created": ex["created"][:10]}} 
 71 | #                            for ex in stream)
 72 | #         sentences_stream = ({"text": sent, "meta": {"url": ex["url"]}} 
 73 | #                             for ex in stream for sent in ex['sentences'])
 74 | #         return dedup_stream(abstract_stream) if view == "abstract" else dedup_stream(sentences_stream)
 75 | 
 76 | #     def index(self):
 77 | #         """Index annotation examples for quick annotation."""
 78 | #         from simsity import create_index
 79 | 
 80 | #         for view in self._annotation_views:
 81 | #             msg.info(f"Preparing simsity index for {view}")
 82 | #             stream = (LazyLines(self.content_stream(view=view)).map(lambda d: d['text']))
 83 | #             create_index(list(stream), self.encoder, path=self._index_path(kind="simsity", view=view), pbar=True)
 84 | 
 85 | #             msg.info(f"Preparing lunr index for {view}")
 86 | #             stream = (LazyLines(self.content_stream(view=view)).pipe(add_rownum))
 87 | #             index = lunr(ref='idx', fields=('text',), documents=list(stream))
 88 | #             serialized = index.serialize()
 89 | #             with open(self._index_path(kind="lunr", view=view), 'w') as fd:
 90 | #                 json.dump(serialized, fd)
 91 | 
 92 | 
 93 | #     def annotate(self):
 94 | #         """
 95 | #         Methods for abstract level.
 96 | #             - second opinion
 97 | #             - simsity
 98 | #             - search-engine
 99 | #             - random
100 | 
101 | #         Methods for sentence level.
102 | #             - filter by patterns
103 | #             - active learn by sentence
104 | #             - simsity
105 | #             - search-engine
106 | #             - random
107 | #         """
108 |         
109 | 
110 | #         from .recipe import arxiv_sentence, arxiv_abstract
111 | #         from prodigy.app import server 
112 | #         from prodigy.core import Controller
113 | 
114 | #         dataset_name = f"{results['label']}-{results['view']}"
115 | #         name = "textcat.arxiv.sentence" if results['view'] == 'sentence' else "textcat.arxiv.abstract"
116 | #         if results['view'] == 'sentence':
117 | #             ctrl_data = arxiv_sentence(dataset_name, results['label'], results['tactic'], results['setting'])
118 | #         else:
119 | #             ctrl_data = arxiv_abstract(dataset_name, results['label'], results['tactic'], results['setting'])
120 | #         controller = Controller.from_components(name, ctrl_data)
121 | #         server(controller, controller.config)   
122 | 
123 | #     @cached_property
124 | #     def db(self):
125 | #         from prodigy.components.db import connect
126 |         
127 | #         db = connect()
128 | #         return db
129 | 
130 | #     def fetch_annotated_data(self):
131 | #         train_data = {}
132 | #         found_tags = []
133 | #         for tag in self.tags:
134 | #             tag = f"{tag}-sentence"
135 | #             if tag in self.db.datasets:
136 | #                 if len(self.db.get_dataset_examples(tag)) == 0:
137 | #                     msg.warn(f"Skipping training for {tag}. No training examples.")
138 | #                 else:
139 | #                     msg.info(f"Preparing data for {tag}.")
140 | #                     found_tags.append(tag)
141 | #                     for ex in self.db.get_dataset_examples(tag):
142 | #                         if ex["answer"] != "ignore":
143 | #                             h = ex["_input_hash"]
144 | #                             if h not in train_data:
145 | #                                 train_data[h] = {"text": ex["text"]}
146 | #                             train_data[h][tag] = int(ex["answer"] == "accept")
147 | 
148 | #         return train_data.values(), found_tags
149 |         
150 |     
151 | #     def train(self):
152 | #         from ._model import SentenceModel
153 | 
154 | #         annotated_data, found_tags = self.fetch_annotated_data()
155 | 
156 | #         model = SentenceModel(encoder=self.encoder, tasks=found_tags)
157 | #         model.update(annotated_data)
158 | #         model.to_disk(TRAINED_FOLDER_FOLDER)
159 |     
160 | #     def fetch_tag_candidate_stream(self, tag:str):
161 | #         from frontpage._model import SentenceModel
162 | 
163 | #         # TODO: 
164 | #         # so yeah, this can be made way faster. we're looping over the same 
165 | #         # thing again and again here, which is wasteful. we should also introduce
166 | #         # a max lookup to the
167 | 
168 | #         model = SentenceModel.from_disk(TRAINED_FOLDER_FOLDER, encoder=self.encoder)
169 | #         stream = self.content_stream(view="abstract")
170 | 
171 | #         def render_html(doc):
172 | #             text = doc.text
173 | #             for span in doc.spans["sc"]:
174 | #                 text = text.replace(span.text, f"<span style='background-color: rgb(254 240 138);'>{span.text}</span>")
175 | #             return f"<p>{text}</p>"
176 | 
177 | #         return (LazyLines(stream)
178 | #             .mutate(abstract=lambda d: d['text'])
179 | #             .pipe(attach_docs, nlp=self.nlp, model=model)
180 | #             .mutate(cats = lambda d: d['doc'].cats)
181 | #             .keep(lambda d: d['cats'].get(tag, 0.0) > 0.6)
182 | #             .mutate(html=lambda d: render_html(d['doc']),
183 | #                     n_sents=lambda d: len(d['doc'].spans["sc"]),
184 | #                     link=lambda d: d['meta']['url']))
185 | 
186 | #     def build(self):
187 | #         config = srsly.read_yaml(CONFIG_FILE)
188 | #         for section in tqdm(config['sections'], desc="Looping over tags."):
189 | #             print(section['name'])
190 | #             section["content"] = self.fetch_tag_candidate_stream(tag=section['tag']).head(20).collect()
191 | #         import pprint
192 | #         pprint.pprint(section["content"])
193 |         
194 | #         template = jinja2.Template(Path(TEMPLATE_PATH).read_text())
195 | #         Path("site.html").write_text(template.render(sections=config['sections'], today=dt.date.today()))
196 | 
197 |     
198 | #     def evaluate(self):
199 | #         from ._benchmark import benchmark
200 | #         annotated, found_tags = self.fetch_annotated_data()
201 | #         benchmark(annotated, tags=["new-dataset", "data-quality"])
202 | 
203 | #     def push_wandb(self):
204 | #         ...
205 | 
206 | #     def pull_wandb(self):
207 | #         ...
208 | 
209 | 
210 | # if __name__ == "__main__":
211 | #     Frontpage(config=srsly.read_yaml("config.yml")).build()
212 | 


--------------------------------------------------------------------------------
/frontpage/__main__.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import datetime as dt 
  3 | from pathlib import Path 
  4 | 
  5 | from jinja2 import Template
  6 | from radicli import Radicli, Arg
  7 | 
  8 | from .utils import console
  9 | from .constants import TEMPLATE_PATH, TRAINED_FOLDER, SITE_PATH
 10 | 
# CLI application object; every command in this module registers itself on it
# through the @cli.command decorator, and cli.run() dispatches to them.
cli = Radicli()
 12 | 
 13 | 
 14 | @cli.command("download")
 15 | def download():
 16 |     """Download new data."""
 17 |     from .download import main as download_data
 18 |     download_data()
 19 | 
 20 | 
 21 | @cli.command("index", 
 22 |              kind=Arg(help="Can be lunr/simsity"), 
 23 |              level=Arg(help="Can be sentence/abstract")
 24 | )
 25 | def index_cli(kind:str, level:str):
 26 |     """Creates index for annotation."""
 27 |     from .datastream import DataStream
 28 | 
 29 |     DataStream().create_index(level=level, kind=kind)
 30 | 
 31 | 
 32 | @cli.command("preprocess")
 33 | def preprocess_cli():
 34 |     """Dedup and process data for faster processing."""
 35 |     from .datastream import DataStream
 36 |     DataStream().save_clean_download_stream()
 37 | 
 38 | 
 39 | @cli.command("annotate")
 40 | def annotate():
 41 |     """Annotate new examples."""
 42 |     def run_questions():
 43 |         import questionary
 44 |         from .constants import LABELS, DATA_LEVELS
 45 |         results = {}
 46 |         results["label"] = questionary.select(
 47 |             "Which label do you want to annotate?",
 48 |             choices=LABELS,
 49 |         ).ask()
 50 | 
 51 |         results["level"] = questionary.select(
 52 |             "What view of the data do you want to take?",
 53 |             choices=DATA_LEVELS,
 54 |         ).ask()
 55 | 
 56 |         if results["level"] == "abstract":
 57 |             choices = ["second-opinion", "search-engine", "simsity", "random"]
 58 |         else:
 59 |             choices = ["simsity", "search-engine", "active-learning", "random"]
 60 | 
 61 |         results["tactic"] = questionary.select(
 62 |             "Which tactic do you want to apply?",
 63 |             choices=choices,
 64 |         ).ask()
 65 | 
 66 |         results['setting'] = ''
 67 |         if results["tactic"] in ["simsity", "search-engine"]:
 68 |             results["setting"] = questionary.text(
 69 |                 "What query would you like to use?", ""
 70 |             ).ask()
 71 | 
 72 |         if results["tactic"] == "active-learning":
 73 |             results["setting"] = questionary.select(
 74 |                 "What should the active learning method prefer?",
 75 |                 choices=["positive class", "uncertainty", "negative class"],
 76 |             ).ask()
 77 |         return results 
 78 |     
 79 |     results = run_questions()
 80 |     from .recipe import annotate_prodigy
 81 |     annotate_prodigy(results)
 82 | 
 83 | @cli.command("annotprep")
 84 | def annotprep():
 85 |     """Prepares data for training."""
 86 |     from .datastream import DataStream
 87 |     DataStream().save_train_stream()
 88 | 
 89 | 
 90 | @cli.command("train")
 91 | def train():
 92 |     """Trains a new model on the data."""
 93 |     from .datastream import DataStream
 94 |     from .modelling import SentenceModel
 95 |     examples = DataStream().get_train_stream()
 96 |     SentenceModel().train(examples=examples).to_disk()
 97 | 
 98 | 
 99 | @cli.command("pretrain")
100 | def pretrain():
101 |     """Trains a new featurizer, set-fit style."""
102 |     from .datastream import DataStream
103 |     from .modelling import SentenceModel
104 |     examples = DataStream().get_train_stream()
105 |     SentenceModel().pretrain(examples=examples)
106 | 
107 | 
@cli.command("stats")
def stats():
    """Show annotation stats"""
    from .datastream import DataStream

    stream = DataStream()
    stream.show_annot_stats()
113 | 
114 | 
@cli.command(
    "build",
    retrain=Arg("--retrain", "-rt", help="Retrain model?"),
    prep=Arg("--preprocess", "-pr", help="Preprocess again?"),
)
def build(retrain: bool = False, prep: bool = False):
    """Build a new site"""
    from .datastream import DataStream

    # Optional refresh steps, run in this order before the site is rendered.
    if prep:
        preprocess_cli()
    if retrain:
        train()

    console.log("Starting site build process")
    site_sections = DataStream().get_site_content()
    page_template = Template(Path(TEMPLATE_PATH).read_text())
    html = page_template.render(sections=site_sections, today=dt.date.today())
    SITE_PATH.write_text(html)
    console.log("Site built.")
133 | 
134 | 
@cli.command("artifact",
    action=Arg(help="Can be upload/download"),
)
def artifact(action:str):
    """Upload/download from wandb"""
    import wandb
    from dotenv import load_dotenv
    from frontpage.constants import PRETRAINED_FOLDER

    # Reject unknown actions up front instead of silently doing nothing.
    if action not in ("upload", "download"):
        raise ValueError(f"Unknown action {action!r}. Can be upload/download.")

    load_dotenv()
    # Authenticate explicitly. The API key must go through wandb.login();
    # passing it positionally to wandb.init() would store the secret in a run
    # field and start an extra, unused run.
    wandb.login(key=os.getenv("WANDB_API_KEY"))

    if action == "upload":
        # Package the pretrained folder as a model artifact and log it.
        model_artifact = wandb.Artifact(name='custom-sbert-emb', type="model")
        model_artifact.add_dir(local_path=PRETRAINED_FOLDER)
        run = wandb.init(project="arxiv-frontpage", job_type="upload")
        run.log_artifact(model_artifact)
        return

    # action == "download": only fetch when the folder is not there yet.
    if PRETRAINED_FOLDER.exists():
        console.log(f"{PRETRAINED_FOLDER} already exists. Skip wandb download.")
        return
    run = wandb.init(project="arxiv-frontpage", job_type="download")
    model_artifact = run.use_artifact('custom-sbert-emb:latest')
    console.log(f"Could not find {PRETRAINED_FOLDER}. So will download from wandb.")
    model_artifact.download(PRETRAINED_FOLDER)
158 | 
159 | 
@cli.command("search")
def search():
    """Interactively query the simsity sentence index. Empty query exits."""
    import questionary
    from simsity import load_index
    from .modelling import SentenceModel
    enc = SentenceModel().encoder
    index = load_index("indices/simsity/sentence", encoder=enc)
    while True:
        query = questionary.text("Query:").ask()
        # ask() yields None when the prompt is cancelled (e.g. Ctrl-C); an
        # empty answer is treated the same way so the loop has a clean exit
        # and None is never handed to the index.
        if not query:
            break
        texts, _dists = index.query([query], n=5)
        for t in texts:
            print(t)

if __name__ == "__main__":
    cli.run()
176 | 


--------------------------------------------------------------------------------
/frontpage/_benchmark.py:
--------------------------------------------------------------------------------
  1 | from pathlib import Path
  2 | import itertools as it 
  3 | 
  4 | import tqdm
  5 | import srsly
  6 | import numpy as np
  7 | import polars as pl
  8 | from dotenv import load_dotenv
  9 | from sklearn.svm import SVC
 10 | from sklearn.pipeline import make_pipeline
 11 | from sklearn.linear_model import LogisticRegression
 12 | from sklearn.feature_extraction.text import HashingVectorizer
 13 | from sklearn.metrics import classification_report
 14 | from sklearn.model_selection import StratifiedKFold, train_test_split
 15 | from embetter.text import SentenceEncoder, spaCyEncoder
 16 | from embetter.external import CohereEncoder, OpenAIEncoder
 17 | from embetter.utils import cached
 18 | from sklearn.pipeline import make_pipeline, make_union
 19 | from sklearn.decomposition import TruncatedSVD
 20 | from embetter.finetune import ForwardFinetuner, ContrastiveFinetuner
 21 | from sklearn.preprocessing import FunctionTransformer
 22 | 
 23 | from frontpage.datastream import DataStream
 24 | 
 25 | 
# Load variables from a local .env file into the process environment before
# the encoders further down are constructed.
# NOTE(review): presumably this supplies the API keys for the OpenAI/Cohere
# encoders - confirm.
load_dotenv()
 27 | 
 28 | def grid(**kwargs):
 29 |     res = [{k: v for k, v in zip(kwargs.keys(), prod)} 
 30 |             for prod in it.product(*[v for v in kwargs.values()])]
 31 |     return tqdm.tqdm(res)
 32 |         
 33 | 
 34 | datastream = DataStream()
 35 | 
 36 | k_folder = StratifiedKFold(n_splits=10)
 37 | 
 38 | encoders = {
 39 |     "spacy": spaCyEncoder("en_core_web_md"),
 40 |     "sbert": SentenceEncoder(),
 41 |     "hash_lg": HashingVectorizer(),
 42 |     "hash_sm": HashingVectorizer(n_features=2**14),
 43 |     "openai": OpenAIEncoder(),
 44 |     "cohere": CohereEncoder(),
 45 | }
 46 | 
 47 | encoders["multi"] = make_union(
 48 |     encoders["sbert"], 
 49 |     make_pipeline(
 50 |         HashingVectorizer(n_features=10_000), 
 51 |         TruncatedSVD(),
 52 |     )
 53 | )
 54 | 
 55 | tuners = {
 56 |     "forward": lambda: ForwardFinetuner(hidden_dim=300), 
 57 |     "contrast": lambda: ContrastiveFinetuner(hidden_dim=300),
 58 |     "none": lambda: FunctionTransformer()
 59 | }
 60 | 
 61 | for name, enc in encoders.items():
 62 |     if name not in ["multi", "hash_lg", "hash_sm"]:
 63 |         encoders[name] = cached(f"cache/{str(type(enc))}", enc)
 64 | 
 65 | models = {
 66 |     "logistic": LogisticRegression(class_weight="balanced", max_iter=1000), 
 67 |     "svm": SVC(class_weight="balanced")
 68 | }
 69 | 
 70 | def calc_stats(pred_valid, y_valid):
 71 |     return {**classification_report(pred_valid, y_valid, output_dict=True)['1'],  "accuracy": float(np.mean(pred_valid == y_valid))}
 72 | 
 73 | 
 74 | def run_benchmark_k_fold(label, model, encoder, tuner):
 75 |     res = {"label": label, "model": model, "encoder": encoder, "tuner": tuner, "method": "k_fold"}
 76 |     pipe = make_pipeline(encoders[encoder], tuners[tuner](), models[model])
 77 |     examples = datastream.get_train_stream()
 78 |     X = [ex['text'] for ex in examples if label in ex['cats']]
 79 |     y = [ex['cats'][label] for ex in examples if label in ex['cats']]
 80 |     folds = k_folder.split(X, y)
 81 |     for i, (train_idx, valid_idx) in enumerate(folds):
 82 |         X_train = [str(x) for x in np.array(X)[train_idx]]
 83 |         X_valid = [str(x) for x in np.array(X)[valid_idx]]
 84 |         y_train = np.array(y)[train_idx]
 85 |         y_valid = np.array(y)[valid_idx]
 86 |         pipe.fit(X_train, y_train)
 87 |         valid_pred = pipe.predict(X_valid)
 88 |         stats = calc_stats(valid_pred, y_valid)
 89 |         res = {**res, **stats, "data_size": len(y), "i": i}
 90 |         yield res
 91 | 
 92 | 
 93 | def run_benchmark_train_size(label, model, encoder, tuner):
 94 |     res = {"label": label, "model": model, "encoder": encoder, "tuner": tuner, "method": "train_size"}
 95 |     pipe = make_pipeline(encoders[encoder], tuners[tuner](), models[model])
 96 |     examples = datastream.get_train_stream()
 97 |     X = [ex['text'] for ex in examples if label in ex['cats']]
 98 |     y = [ex['cats'][label] for ex in examples if label in ex['cats']]
 99 |     X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2)
100 |     for p in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]:
101 |         idx = int(len(X_train) * p)
102 |         X_train_use = [str(x) for x in np.array(X_train)[:idx]]
103 |         y_train_use = np.array(y_train)[:idx]
104 |         pipe.fit(X_train_use, y_train_use)
105 |         valid_pred = pipe.predict(X_valid)
106 |         stats = calc_stats(valid_pred, y_valid)
107 |         res = {**res, **stats, "data_size": len(y), "p": p}
108 |         yield res
109 | 
110 | 
if __name__ == "__main__":
    settings = grid(
        label=["new-dataset"],
        encoder=["sbert", "openai", "cohere", "multi"],
        model=["logistic", "svm"],
        tuner=["contrast", "forward", "none"]
    )

    # Each benchmark streams its rows lazily into its own JSONL file; any
    # previous output file is removed before it is rewritten.
    benchmark_runs = (
        ("benchmark_kfold.jsonl", run_benchmark_k_fold),
        ("benchmark_train_size.jsonl", run_benchmark_train_size),
    )
    for out_name, runner in benchmark_runs:
        stats = (ex for setting in settings for ex in runner(**setting))

        out_path = Path(out_name)
        if out_path.exists():
            out_path.unlink()
        srsly.write_jsonl(out_name, stats)

    pl.Config.set_tbl_rows(100)
    pl.Config.set_tbl_width_chars(1000)

    # print(
    #     pl.read_ndjson("benchmark.jsonl")
    #     .groupby("label","model","encoder","tuner")
    #     .agg(
    #         pl.mean("recall"), 
    #         pl.mean("precision"), 
    #         pl.mean("f1-score"),
    #         pl.mean("accuracy"),
    #     ).sort("f1-score")
    # )
144 | 
145 | 


--------------------------------------------------------------------------------
/frontpage/constants.py:
--------------------------------------------------------------------------------
 1 | import srsly
 2 | from pathlib import Path 
 3 | from typing import Literal 
 4 | 
 5 | from .types import Config
 6 | 
 7 | # Paths and folders
 8 | DATA_FOLDER = Path("data")
 9 | ANNOT_FOLDER = DATA_FOLDER / "annot"
10 | INDICES_FOLDER = Path("indices")
11 | CLEAN_DOWNLOADS_FOLDER = DATA_FOLDER / Path("cleaned")
12 | DOWNLOADS_FOLDER = DATA_FOLDER / "downloads"
13 | ANNOT_PATH = ANNOT_FOLDER / "annotations.jsonl"
14 | ACTIVE_LEARN_PATH = ANNOT_FOLDER / "active-learn.jsonl"
15 | SECOND_OPINION_PATH = ANNOT_FOLDER / "second-opinion.jsonl"
16 | TRAINED_FOLDER = Path("training")
17 | TEMPLATE_PATH = Path("templates/home.html")
18 | CONFIG_FILE = "config.yml"
19 | PRETRAINED_FOLDER = TRAINED_FOLDER / "custom-sbert-emb"
20 | 
21 | # Cache paths
22 | EMBETTER_CACHE = Path("cache") / "embetter"
23 | 
24 | # Possible values
25 | DATA_LEVELS = ["sentence", "abstract"]
26 | DATA_LEVELS_TYPE = Literal["sentence", "abstract"]
27 | CONFIG = Config(**srsly.read_yaml(CONFIG_FILE))
28 | LABELS = [s.label for s in CONFIG.sections]
29 | THRESHOLDS = {s.label: s.threshold for s in CONFIG.sections}
30 | SITE_PATH = Path("index.html")
31 | 


--------------------------------------------------------------------------------
/frontpage/download.py:
--------------------------------------------------------------------------------
 1 | import datetime as dt 
 2 | from pathlib import Path
 3 | from typing import List
 4 | 
 5 | import srsly
 6 | import tqdm
 7 | import arxiv
 8 | from arxiv import Result
 9 | from retry import retry 
10 | import spacy
11 | from spacy.language import Language
12 | from .types import ArxivArticle
13 | from rich.console import Console 
14 | 
15 | console = Console()
16 | 
17 | 
def age_in_days(res: Result) -> float:
    """Return the age of an Arxiv result in (fractional) days.

    The age is the time elapsed between ``res.published`` and the current
    UTC time. ``res.published`` has to be timezone-aware, because it is
    subtracted from a timezone-aware "now".
    """
    elapsed = dt.datetime.now(dt.timezone.utc) - res.published
    hours = elapsed.total_seconds() / 3600
    return hours / 24
22 | 
23 | 
def parse(res: Result, nlp: Language) -> ArxivArticle:
    """Convert an Arxiv result into an ``ArxivArticle``.

    Newlines in the summary are replaced by spaces before the text is run
    through ``nlp``; the sentences come from the resulting ``doc.sents``.
    The creation timestamp is the first 19 characters of the string form of
    ``res.published``.
    """
    abstract = res.summary.replace("\n", " ")
    sentences = [sent.text for sent in nlp(abstract).sents]
    fields = {
        "created": str(res.published)[:19],
        "title": str(res.title),
        "abstract": abstract,
        "sentences": sentences,
        "url": str(res.entry_id),
    }
    return ArxivArticle(**fields)
37 | 
@retry(tries=5, delay=1, backoff=2)
def main():
    """Download recent ``cs`` articles from Arxiv and store the new ones.

    The 200 most recently submitted results are fetched, filtered down to
    articles younger than 2.5 days whose primary category starts with "cs",
    and compared by title against the most recent file in
    ``data/downloads``. Articles not present in that file are written to a
    new ``<date>-<hour>h.jsonl`` file in the same folder.

    The whole function is retried (with exponential backoff) on any
    exception, see the ``retry`` decorator.
    """
    downloads_folder = Path("data") / "downloads"

    nlp = spacy.load("en_core_web_sm", disable=["ner", "lemmatizer", "tagger"])
    console.log("Starting arxiv search.")
    items = arxiv.Search(
        query="and",
        max_results=200,
        sort_by=arxiv.SortCriterion.SubmittedDate,
    )

    results = list(items.results())

    console.log(f"Found {len(results)} results.")

    articles = [dict(parse(r, nlp=nlp))
                for r in tqdm.tqdm(results)
                if age_in_days(r) < 2.5 and r.primary_category.startswith("cs")]

    dist = [age_in_days(r) for r in results]
    if dist:
        console.log(f"Minimum article age: {min(dist)}")
        console.log(f"Maximum article age: {max(dist)}")

    # Keyed by title, so duplicate titles inside one batch collapse.
    articles_dict = {ex['title']: ex for ex in articles}

    # Only the most recent download (by file name) is used for deduplication.
    # On a fresh checkout there may be no earlier file at all; in that case
    # every article counts as new instead of crashing on an empty list.
    previous_files = sorted(downloads_folder.glob("*.jsonl"))
    if previous_files:
        old_titles = {ex['title'] for ex in srsly.read_jsonl(previous_files[-1])}
    else:
        old_titles = set()

    new_articles = [ex for title, ex in articles_dict.items() if title not in old_titles]
    old_articles = [ex for title, ex in articles_dict.items() if title in old_titles]
    if old_articles:
        console.log(f"Found {len(old_articles)} old articles in current batch. Skipping.")
    if new_articles:
        console.log(f"Found {len(new_articles)} new articles in current batch to write.")
        # "YYYY-MM-DD-HH" taken from the local timestamp, e.g. 2023-07-01-12h.jsonl
        filename = str(dt.datetime.now()).replace(" ", "-")[:13] + "h.jsonl"
        downloads_folder.mkdir(parents=True, exist_ok=True)
        srsly.write_jsonl(downloads_folder / filename, new_articles)
        console.log(f"Wrote {len(new_articles)} articles into {filename}.")
73 | 


--------------------------------------------------------------------------------
/frontpage/modelling.py:
--------------------------------------------------------------------------------
  1 | from typing import List, Dict
  2 | from pathlib import Path
  3 | from functools import cached_property
  4 | 
  5 | from wasabi import Printer
  6 | import numpy as np
  7 | from skops.io import dump, load
  8 | from embetter.utils import cached
  9 | from sklearn.linear_model import LogisticRegression
 10 | 
 11 | from .constants import TRAINED_FOLDER, LABELS, EMBETTER_CACHE, PRETRAINED_FOLDER
 12 | from .utils import console 
 13 | 
 14 | msg = Printer()
 15 | 
 16 | 
 17 | class SentenceModel:
 18 |     def __init__(self, labels=LABELS) -> None:
 19 |         self.labels = labels 
 20 |         self._models = {k: LogisticRegression(class_weight="balanced") for k in self.labels}
 21 | 
 22 |     def train(self, examples):
 23 |         X = self.featurizer.transform([ex["text"] for ex in examples])
 24 |         for task, model in self._models.items():
 25 |             xs = np.array([X[i] for i, ex in enumerate(examples) if task in ex['cats']])
 26 |             ys = np.array(
 27 |                 [ex['cats'][task] for ex in examples if task in ex['cats']], dtype=int
 28 |             )
 29 |             model.fit(xs, ys)
 30 |             console.log(f"Trained the [bold]{task}[/bold] task, using {len(xs)} examples.")
 31 |         return self
 32 | 
 33 |     def __call__(self, text:str) -> Dict:
 34 |         result = {}
 35 |         X = self.featurizer.transform([text])
 36 |         for label in self.labels:
 37 |             proba = self._models[label].predict_proba(X)[0, 1]
 38 |             result[label] = float(proba)
 39 |         return result
 40 |     
 41 |     def predict(self, texts: List[str]) -> List[Dict]:
 42 |         X = self.featurizer.transform(texts)
 43 |         result = [{} for _ in texts]
 44 |         for label in self.labels:
 45 |             probas = self._models[label].predict_proba(X)[:, 1]
 46 |             for i, proba in enumerate(probas):
 47 |                 result[i][label] = float(proba)
 48 |         return result
 49 | 
 50 |     @cached_property
 51 |     def encoder(self):
 52 |         from embetter.text import SentenceEncoder
 53 |         encoder = SentenceEncoder()
 54 |         encoder = cached(EMBETTER_CACHE / "sbert", encoder)
 55 |         return encoder
 56 | 
 57 |     @cached_property
 58 |     def featurizer(self):
 59 |         from embetter.text import SentenceEncoder
 60 |         if not Path(PRETRAINED_FOLDER).exists():
 61 |             console.log("Did not find pretrained model. Falling back.")
 62 |             return self.encoder
 63 |         console.log(f"Will use custom model found in {PRETRAINED_FOLDER}")
 64 |         return SentenceEncoder(PRETRAINED_FOLDER)
 65 | 
 66 |     def pretrain(self, examples):
 67 |         from sentence_transformers import SentenceTransformer, InputExample, losses
 68 |         from torch.utils.data import DataLoader
 69 |         from embetter.finetune._contrastive import generate_pairs_batch
 70 |         
 71 |         console.log("Starting pretraining sequence.")
 72 |         all_pairs = []
 73 |         for label in LABELS:
 74 |             subset = [ex for ex in examples if label in ex['cats']]
 75 |             pairs = generate_pairs_batch([ex['cats'][label] for ex in subset], n_neg=1)
 76 |             all_pairs.extend(pairs)
 77 |         
 78 |         input_examples = []
 79 |         for pair in pairs:
 80 |             text1 = examples[pair.i1]['text']
 81 |             text2 = examples[pair.i2]['text']
 82 |             input_examples.append(InputExample(texts=[text1, text2], label=float(pair.label)))
 83 |         model = SentenceTransformer('all-MiniLM-L6-v2')
 84 | 
 85 |         train_dataloader = DataLoader(input_examples, shuffle=True, batch_size=16)
 86 |         train_loss = losses.CosineSimilarityLoss(model)
 87 | 
 88 |         console.log("Pairs generated. About to tune model.")
 89 |         model.fit(
 90 |             train_objectives=[(train_dataloader, train_loss)], 
 91 |             epochs=3, 
 92 |             warmup_steps=100, 
 93 |             output_path=str(PRETRAINED_FOLDER)
 94 |         )
 95 |         console.log(f"New encoder saved at {PRETRAINED_FOLDER}")
 96 |     
 97 |     @cached_property
 98 |     def nlp(self):
 99 |         import spacy
100 |         return spacy.load("en_core_web_sm", disable=["ner", "lemmatizer", "tagger"])
101 | 
102 |     def to_disk(self, path: Path=TRAINED_FOLDER):
103 |         if not Path(path).exists():
104 |             Path(path).mkdir(exist_ok=True, parents=True)
105 |         # Delete old files
106 |         if Path(path).exists():
107 |             for p in Path(path).glob("*.h5"):
108 |                 p.unlink()
109 |         # Write new files
110 |         for name, clf in self._models.items():
111 |             dump(clf, Path(path) / f"{name}.h5")
112 |         console.log(f"Model saved in folder: [bold]{path}[/bold].")
113 | 
114 |     @classmethod
115 |     def from_disk(cls, path: Path=TRAINED_FOLDER):
116 |         if not Path(path).exists():
117 |             raise RuntimeError("You need to train a model beforehand.")
118 |         models = {}
119 |         for f in Path(path).glob("*.h5"):
120 |             models[f.stem] = load(f, trusted=True)
121 | 
122 |         model = SentenceModel(labels=models.keys())
123 |         model._models = models
124 |         console.log(f"Model loaded from: [bold]{path}[/bold].")
125 |         return model
126 | 


--------------------------------------------------------------------------------
/frontpage/recipe.py:
--------------------------------------------------------------------------------
  1 | import spacy
  2 | 
  3 | import prodigy
  4 | 
  5 | from .datastream import DataStream
  6 | from .utils import console
  7 | datastream = DataStream()
  8 | 
  9 | 
 10 | 
 11 | @prodigy.recipe("textcat.arxiv.sentence",
 12 |     dataset=("The dataset to save", "positional", None, str),
 13 |     label=("The label to annotate", "positional", None, str),
 14 |     tactic=("The tactic to retreive relevant examples", "positional", None, str),
 15 |     setting=("Additional setting for the tactic", "positional", None, str),
 16 | )
 17 | def arxiv_sentence(dataset, label, tactic, setting):
 18 |     """Very general recipe to annotate sentences, using different ordering techniques."""
 19 |     from prodigy import set_hashes
 20 |     if tactic == "simsity":
 21 |         console.log("Setting up simsity stream")
 22 |         stream = datastream.get_ann_stream(query=setting, level="sentence")
 23 |     elif tactic == "random":
 24 |         console.log("Setting up randomized stream")
 25 |         stream = datastream.get_random_stream(level="sentence")
 26 |     elif tactic == "active-learning":
 27 |         console.log("Setting up active learning")
 28 |         stream = datastream.get_active_learn_stream(label=label, preference=setting)
 29 |     elif tactic == "search-engine":
 30 |         console.log("Setting up lunr query")
 31 |         stream = datastream.get_lunr_stream(query=setting, level="sentence")
 32 |     else:
 33 |         raise ValueError("This should never happen.")
 34 |     
 35 |     return {
 36 |         "dataset": dataset,
 37 |         "stream": (set_hashes({**ex, "label": label}) for ex in stream),
 38 |         "view_id": "classification",
 39 |         "config":{
 40 |             "exclude_by": "input"
 41 |         }
 42 |     }
 43 | 
 44 | 
 45 | @prodigy.recipe("textcat.arxiv.abstract",
 46 |     dataset=("The dataset to save", "positional", None, str),
 47 |     label=("The label to annotate", "positional", None, str),
 48 |     tactic=("The tactic to retreive relevant examples", "positional", None, str),
 49 |     setting=("Additional setting for the tactic", "positional", None, str),
 50 | )
 51 | def arxiv_abstract(dataset, label, tactic, setting):
 52 |     """Very general recipe to annotate sentences, using different ordering techniques."""
 53 |     from prodigy.components.preprocess import add_tokens
 54 |     from prodigy import set_hashes
 55 | 
 56 |     if tactic == "simsity":
 57 |         console.log("Setting up simsity stream")
 58 |         stream = datastream.get_ann_stream(query=setting, level="abstract")
 59 |     elif tactic == "random":
 60 |         console.log("Setting up randomized stream")
 61 |         stream = datastream.get_random_stream(level="abstract")
 62 |     elif tactic == "search-engine":
 63 |         console.log("Setting up lunr query")
 64 |         stream = datastream.get_lunr_stream(query=setting, level="abstract")
 65 |     elif tactic == "second-opinion":
 66 |         console.log("Setting up second opinion")
 67 |         stream = datastream.get_second_opinion_stream(label=label, min_sents=1, max_sents=2)
 68 |     else:
 69 |         raise ValueError("This should never happen.")
 70 |     
 71 |     nlp = spacy.blank("en")
 72 |     stream = ({**ex, "label": label} for ex in stream)
 73 |     stream = add_tokens(nlp, stream)
 74 |     return {
 75 |         "dataset": dataset,
 76 |         "stream": (set_hashes(ex) for ex in stream),
 77 |         "view_id": "blocks",
 78 |         "config": {
 79 |             "labels": [label],
 80 |             "blocks": [
 81 |                 {"view_id": "ner_manual"},
 82 |             ],
 83 |             "exclude_by": "input"
 84 |         }
 85 |     }
 86 | 
 87 | 
 88 | def annotate_prodigy(results):
 89 |     from prodigy.app import server 
 90 |     from prodigy.core import Controller
 91 | 
 92 |     dataset_name = datastream.get_dataset_name(results['label'], results['level'])
 93 |     name = "textcat.arxiv.sentence" if results['level'] == 'sentence' else "textcat.arxiv.abstract"
 94 |     if results['level'] == 'sentence':
 95 |         ctrl_data = arxiv_sentence(dataset_name, results['label'], results['tactic'], results['setting'])
 96 |     else:
 97 |         ctrl_data = arxiv_abstract(dataset_name, results['label'], results['tactic'], results['setting'])
 98 |     controller = Controller.from_components(name, ctrl_data)
 99 |     server(controller, controller.config)
100 | 


--------------------------------------------------------------------------------
/frontpage/types.py:
--------------------------------------------------------------------------------
 1 | from typing import List
 2 | from pydantic import BaseModel 
 3 | 
 4 | 
class ArxivArticle(BaseModel):
    """One Arxiv article: its metadata plus the abstract in two forms."""
    # Publication timestamp as text.
    created: str
    title: str
    # Full abstract as one string.
    abstract: str
    # The abstract split into sentences.
    sentences: List[str]
    url: str
11 | 
12 | 
class LabelConfig(BaseModel):
    """Settings for a single label; one entry of ``Config.sections``."""
    # Display name — presumably the section heading on the page; confirm against the build code.
    name: str
    instructions: str
    # Identifier of the label.
    label: str
    # Score threshold for this label; 0.7 unless configured otherwise.
    threshold: float = 0.7
18 | 
19 | 
class Config(BaseModel):
    """Top-level project configuration."""
    # Presumably the number of articles shown per section — TODO confirm.
    n_per_section: int = 20
    # One LabelConfig per label.
    sections: List[LabelConfig]
23 | 


--------------------------------------------------------------------------------
/frontpage/utils.py:
--------------------------------------------------------------------------------
 1 | from rich.console import Console 
 2 | import itertools as it
 3 | 
 4 | from spacy.tokens import Span
 5 | 
 6 | 
 7 | console = Console()
 8 | 
 9 | 
def batched(iterable, n=56):
    """Yield tuples of up to ``n`` consecutive items from ``iterable``.

    Every tuple has exactly ``n`` items except possibly the last one. Since
    this is a generator, the ``ValueError`` for ``n < 1`` is raised when the
    first batch is requested, not when the function is called.
    """
    if n < 1:
        raise ValueError('n must be at least one')
    source = iter(iterable)

    def next_chunk():
        return tuple(it.islice(source, n))

    # Keep pulling chunks until an empty tuple signals exhaustion.
    yield from iter(next_chunk, ())
17 | 
18 | 
def dedup_stream(stream, key="text"):
    """Yield the examples of ``stream`` with duplicates on ``ex[key]`` removed.

    The whole stream is consumed before anything is yielded. For every
    distinct value the *last* example carrying it is kept, at the position
    where the value was first seen.

    The value itself is the dictionary key rather than ``hash(value)``:
    different values can share a hash (``hash(-1) == hash(-2)`` in CPython),
    which would silently drop one of them. The value has to be hashable.
    """
    uniq = {}
    for ex in stream:
        uniq[ex[key]] = ex
    yield from uniq.values()
25 | 
26 | 
def add_rownum(stream):
    """Reduce every example to its ``"text"`` plus a running ``"idx"`` from 0."""
    position = 0
    for example in stream:
        yield {"text": example["text"], "idx": position}
        position += 1
30 | 
31 | 
def attach_docs(lines, nlp, label):
    """Attach a classified ``doc`` to every example in ``lines``.

    The texts are run through ``nlp.pipe``; each resulting doc is handed to
    ``sentence_classifier`` with the example's ``"preds"`` and stored under
    ``"doc"``. The examples are modified in place and yielded.
    """
    def text_with_context():
        for example in lines:
            yield example['text'], example

    for doc, example in nlp.pipe(text_with_context(), as_tuples=True):
        example['doc'] = sentence_classifier(doc, example['preds'], label)
        yield example
37 | 
38 | 
def sentence_classifier(doc, preds, label, threshold=0.6):
    """Mark the sentences of ``doc`` that score high enough for ``label``.

    Args:
        doc: parsed document; its sentences are paired with ``preds`` in order.
        preds: one ``{label: score}`` dict per sentence.
        label: the only label that is considered.
        threshold: minimum score (inclusive) for a sentence to be marked.
            Defaults to 0.6, the value that used to be hard-coded.

    Returns:
        The same ``doc``. ``doc.spans["sc"]`` is reset and then holds one
        span per qualifying sentence; ``doc.cats[label]`` is raised to the
        highest qualifying score.
    """
    doc.spans["sc"] = []
    for sent, pred in zip(doc.sents, preds):
        # Only `label` matters, so look it up instead of scanning every key.
        score = pred.get(label)
        if score is not None and score >= threshold:
            doc.spans["sc"].append(Span(doc, sent.start, sent.end, label))
            doc.cats[label] = max(doc.cats.get(label, 0.0), score)
    return doc
48 | 
49 | 
def attach_spans(stream, label, min_spans=1, max_spans=1):
    """Replace each example's ``"doc"`` by a list of span dicts under ``"spans"``.

    Every span of every span group on the doc becomes one dict, labelled
    with ``label``. ``token_end`` is the index of the span's last token
    (``span.end - 1``). Examples are modified in place; only those whose
    span count lies within ``[min_spans, max_spans]`` are yielded.
    """
    for ex in stream:
        doc = ex.pop("doc")
        ex["spans"] = [
            {
                "token_start": span.start,
                "token_end": span.end - 1,
                "start": span.start_char,
                "end": span.end_char,
                "text": span.text,
                "label": label,
            }
            for group in doc.spans.values()
            for span in group
        ]
        if min_spans <= len(ex["spans"]) <= max_spans:
            yield ex
70 | 
71 | 
def add_predictions(stream, model):
    """Add model predictions to each example and shorten its timestamp.

    ``model.predict`` is called with the example's ``"sentences"`` and the
    result is stored under ``"preds"``. ``"created"`` is cut down to its
    first 10 characters. Examples are modified in place and yielded.
    """
    for example in stream:
        sentence_scores = model.predict(example['sentences'])
        example['preds'] = sentence_scores
        example['created'] = example['created'][:10]
        yield example
78 | 
79 | 
80 | def _abstract_single_annot_to_sent(example, nlp, label):
81 |     """Takens an annotation from abstract level and turns it into a training example"""
82 |     text = example['text']
83 |     if example['answer'] == "accept" and "spans" in example:
84 |         for span in example['spans']:
85 |             yield {"text": text[span['start']: span['end']], label: 1}
86 |         for span in example['spans']:
87 |             text = text.replace(text[span['start']: span['end']], "")
88 |         for sent in nlp(text).sents:
89 |             if len(sent.text) > 5:
90 |                 yield {"text": sent.text, label: 0}
91 | 
92 | 
def abstract_annot_to_sent(examples, nlp, label):
    """Convert abstract-level annotations into sentence-level training examples.

    Every example is handed to ``_abstract_single_annot_to_sent`` and the
    results are yielded in order.
    """
    for example in examples:
        yield from _abstract_single_annot_to_sent(example, nlp, label)
98 | 


--------------------------------------------------------------------------------
/images/active-teaching.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koaning/arxiv-frontpage/260eaf2a63af68045367f0579161b6b414fc97ac/images/active-teaching.png


--------------------------------------------------------------------------------
/images/multiheads.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koaning/arxiv-frontpage/260eaf2a63af68045367f0579161b6b414fc97ac/images/multiheads.png


--------------------------------------------------------------------------------
/images/sentence-model.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koaning/arxiv-frontpage/260eaf2a63af68045367f0579161b6b414fc97ac/images/sentence-model.png


--------------------------------------------------------------------------------
/prodigy.json:
--------------------------------------------------------------------------------
1 | {
2 |     "exclude_by": "input"
3 | }


--------------------------------------------------------------------------------
/requirements-build.txt:
--------------------------------------------------------------------------------
 1 | srsly==2.4.6
 2 | tqdm==4.65.0
 3 | arxiv==1.4.2
 4 | httpx==0.23.3
 5 | spacy==3.5.0
 6 | retry==0.9.2
 7 | wandb==0.14.2
 8 | embetter==0.5.0
 9 | sentence-transformers==2.2.2
10 | simsity==0.5.5
11 | questionary==1.10.0
12 | radicli==0.0.25
13 | lazylines==0.0.4
14 | rich==13.4.2
15 | lunr==0.6.2
16 | skops==0.6.0
17 | python-dotenv==1.0.0
18 | scikit-learn==1.3
19 | numpy==1.26.4
20 | huggingface_hub==0.24.7
21 | 


--------------------------------------------------------------------------------
/requirements-download.txt:
--------------------------------------------------------------------------------
 1 | srsly==2.4.6
 2 | tqdm==4.65.0
 3 | arxiv==1.4.2
 4 | httpx==0.23.3
 5 | spacy==3.5.0
 6 | retry==0.9.2
 7 | rich==13.4.2
 8 | radicli==0.0.25
 9 | numpy==1.26
10 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup, find_packages
 2 | 
 3 | 
# Packaging metadata for the `frontpage` package.
setup(
    name="frontpage",
    version="0.1.0",
    description="Your frontpage.",
    # Packages named `notebooks` or `tests` are left out of the distribution.
    packages=find_packages(exclude=["notebooks", "tests"]),
)
10 | 


--------------------------------------------------------------------------------
/taskfile.yml:
--------------------------------------------------------------------------------
 1 | version: '3'
 2 | 
 3 | tasks:
 4 |   index:
 5 |     deps: [lunr-sentence, lunr-abstract]
 6 |     desc: Creates the simsity/lunr indices.
 7 |     sources:
 8 |       - data/cleaned/*.jsonl
 9 |     cmds:
10 |       - python -m frontpage index simsity sentence
11 |       - python -m frontpage index simsity abstract
12 | 
13 |   lunr-sentence:
14 |     desc: Creates the sentence-level lunr index.
15 |     cmds:
16 |       - python -m frontpage index lunr sentence
17 |     sources:
18 |       - data/cleaned/*.jsonl
19 |     generates:
20 |       - indices/lunr/sentence.json
21 |   
22 |   lunr-abstract:
23 |     desc: Creates the abstract-level lunr index.
24 |     cmds:
25 |       - python -m frontpage index lunr abstract
26 |     sources:
27 |       - data/cleaned/*.jsonl
28 |     generates:
29 |       - indices/lunr/abstract.json
30 |   
31 |   clean:
32 |     desc: Cleans the downloaded data.
33 |     cmds:
34 |       - python -c "from frontpage.datastream import DataStream; DataStream().save_clean_download_stream()"
35 |     sources:
36 |       - data/downloads/*.jsonl
37 | 
38 |   build-learned-streams:
39 |     desc: Generates files for the active-learn-y annotation recipes.
40 |     cmds:
41 |       - python -c "from frontpage.datastream import DataStream; DataStream().build_active_learn_stream()"
42 |       - python -c "from frontpage.datastream import DataStream; DataStream().build_second_opinion_stream()"
43 |     sources:
44 |       - data/cleaned/*.jsonl
45 |       - training
46 |   
47 |   stats-out:
48 |     desc: Runs `prodigy stats`, which could indicate that there are new annotations.
49 |     cmds: 
50 |       - python -m prodigy stats -nf > /tmp/stats.jsonl
51 |     generates:
52 |       - tmp/stats.jsonl
53 | 
54 |   prepare-annotations:
55 |     desc: Export the annotations from Prodigy so ML can be trained on it.
56 |     cmds:
57 |       - python -m frontpage annotprep
58 |     sources:
59 |       - /tmp/stats.jsonl
60 |     generates:
61 |       - data/annot/annotations.jsonl
62 |   
63 |   train:
64 |     desc: Trains new classification heads on top of pretrained SBERT layer.
65 |     cmds:
66 |       - python -m frontpage train
67 |     sources:
68 |       - data/annot/annotations.jsonl
69 |       - training/custom-sbert-emb
70 |   
71 |   pretrain:
72 |     desc: Trains new SBERT representations. Can be expensive. Will also upload to wandb.
73 |     cmds:
74 |       - python -m frontpage pretrain
75 |       - python -m frontpage artifact upload
76 | 
77 |   prepare:
78 |     desc: Runs all the steps required to update the streams for annotation.
79 |     cmds:
80 |       - task: clean
81 |       - task: index
82 |       - task: stats-out
83 |       - task: prepare-annotations
84 |       - task: train
85 |       - task: build-learned-streams
86 | 
87 |   build:
88 |     desc: Constuct a new frontpage. Always retrains heads just in case.
89 |     cmds:
90 |     - task: clean
91 |     # - python -m frontpage artifact download 
92 |     - task: train
93 |     - python -m frontpage build
94 |     - python -m http.server
95 | 


--------------------------------------------------------------------------------
/templates/home.html:
--------------------------------------------------------------------------------
 1 | <!doctype html>
 2 | <html>
 3 | <head>
 4 |   <meta charset="UTF-8">
 5 |   <meta name="viewport" content="width=device-width, initial-scale=1.0">
 6 |   <script src="https://cdn.tailwindcss.com"></script>
 7 |   <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/katex@0.16.4/dist/katex.min.css" integrity="sha384-vKruj+a13U8yHIkAyGgK1J3ArTLzrFGBbBc0tDp4ad/EyewESeXE/Iv67Aj8gKZ0" crossorigin="anonymous">
 8 |   <script defer src="https://cdn.jsdelivr.net/npm/katex@0.16.4/dist/katex.min.js" integrity="sha384-PwRUT/YqbnEjkZO0zZxNqcxACrXe+j766U2amXcgMg5457rve2Y7I6ZJSm2A0mS4" crossorigin="anonymous"></script>
 9 |   <script defer src="https://cdn.jsdelivr.net/npm/katex@0.16.4/dist/contrib/auto-render.min.js" integrity="sha384-+VBxd3r6XgURycqtZ117nYw44OOcIax56Z4dCRWbxyPt0Koah1uHoK0o4+/RRE05" crossorigin="anonymous"></script>
10 |   <script defer src="https://cdn.jsdelivr.net/npm/@alpinejs/collapse@3.x.x/dist/cdn.min.js"></script>
11 |   <script defer src="https://cdn.jsdelivr.net/npm/alpinejs@3.x.x/dist/cdn.min.js"></script>
12 | </head>
13 | <body>
14 |   <div class="relative mx-auto h-full max-w-2xl text-md">
15 |     <table class="table-auto">
16 |       <tbody>
17 |         <tr>
18 |           <td></td>
19 |           <td>
20 |             <h1 class="text-4xl pt-4 font-bold"><span class="underline">Vincent's</span> Arxiv FrontPage</h1>
21 |             <br>
22 |             <p>Generated on {{today}}.</p><br/>
23 |             <p class="text-sm text-gray-500 pt-2">This frontpage is made by scraping arxiv and by running a sentence-model that detects if the abstract describes a paper about a topic of interest. One cool feature: it all pretty much runs via Github Actions. </p>
24 |             <br>
25 |           </td>
26 |         </tr>
27 |         {%- for section in sections -%}
28 |         <tr>
29 |           <td></td>
30 |           <td>
31 |             <h2 class="text-2xl tracking-tight pt-4 font-bold">{{section['name']}}</h2>
32 |           </td>
33 |         </tr>
34 |         {%- for content in section['content'] -%}
35 |         <tr>
36 |           <td class="inline-block">
37 |             <p class='font-bold text-black px-2 mx-1 text-xs w-24'>{{content['created']}}</p>
38 |           </td>
39 |           <td>
40 |             <div x-data="{open: false}">
41 |               <span @click="open = ! open" class="hover:underline cursor-pointer decoration-2 decoration-green-600 text-gray-800 text-sm">
42 |                 {{content['title']}}
43 |               </span>
44 |               <div x-show="open" x-collapse.duration.500ms class="text-sm text-gray-500 pt-2">
45 |                 <div class="text-center pt-2">
46 |                   {%- for key in content['cats'] -%}
47 |                   <span class='text-black px-2 mx-1 bg-blue-200 text-sm'>cls:{{key}}: 0.{{(content['cats'][key] * 1000) | int | round}}</span>
48 |                   {%- endfor -%}
49 |                 </div>
50 |                 <p class="pt-2">
51 |                   {{content['html']}}
52 |                 </p>
53 |               <p class="pb-2 pt-2 text-center">
54 |                 <a class="underline decoration-2 text-green-600 text-md pt-2" href='{{content["url"]}}' target="_blank">
55 |                   link
56 |                 </a>
57 |               </p>
58 |             </div>
59 |           </div>
60 |         </td>
61 |       </tr>
62 |       {%- endfor -%}  
63 |       {%- endfor -%}
64 |     </tbody>
65 |   </table>
66 |   <br><br>
67 | </div>
68 | </div>
<script>
  // Once the DOM is ready, let the KaTeX auto-render extension typeset
  // every piece of math it finds in the page body.
  document.addEventListener("DOMContentLoaded", function() {
    renderMathInElement(document.body, {
      // Delimiters that mark math, and whether each pair is rendered
      // in display mode or inline.
      delimiters: [
      {left: '$$', right: '$$', display: true},
      {left: '$', right: '$', display: false},
      {left: '\\(', right: '\\)', display: false},
      {left: '\\[', right: '\\]', display: true}
      ],
      // Passed through to KaTeX: do not throw on math that fails to parse.
      throwOnError : false
    });
  });
</script>
85 | </body>
86 | </html>


--------------------------------------------------------------------------------