├── readme_images
│   ├── fork.png
│   ├── UIarxiv.png
│   ├── artifact.png
│   ├── banner.png
│   ├── openai.png
│   ├── secrets.png
│   ├── settings.png
│   ├── trigger.png
│   ├── example_1.png
│   ├── example_2.png
│   ├── hf_example.png
│   ├── main_banner.png
│   ├── example_report.png
│   └── example_custom_1.png
├── src
│   ├── design_finder
│   │   ├── __init__.py
│   │   ├── __main__.py
│   │   └── main.py
│   ├── paths.py
│   ├── relevancy_filter_prompt.txt
│   ├── design
│   │   ├── find_design_papers.sh
│   │   ├── get_design_papers.sh
│   │   ├── README.md
│   │   ├── design_finder.py
│   │   └── find_design_papers.py
│   ├── relevancy_prompt.txt
│   ├── download_new_papers.py
│   ├── fix_parser.py
│   ├── design_papers_crawler.py
│   ├── interpretability_analysis.py
│   ├── gemini_utils.py
│   ├── utils.py
│   ├── design_automation.py
│   ├── action.py
│   ├── anthropic_utils.py
│   └── model_manager.py
├── run.sh
├── requirements.txt
├── find_design_papers.sh
├── .env.template
├── LICENSE
├── config.yaml
├── .github
│   └── workflows
│       └── daily_pipeline.yaml
├── .gitignore
├── advanced_usage.md
└── README.md
/readme_images/fork.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linhkid/ArxivDigest-extra/HEAD/readme_images/fork.png
--------------------------------------------------------------------------------
/readme_images/UIarxiv.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linhkid/ArxivDigest-extra/HEAD/readme_images/UIarxiv.png
--------------------------------------------------------------------------------
/readme_images/artifact.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linhkid/ArxivDigest-extra/HEAD/readme_images/artifact.png
--------------------------------------------------------------------------------
/readme_images/banner.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linhkid/ArxivDigest-extra/HEAD/readme_images/banner.png
--------------------------------------------------------------------------------
/readme_images/openai.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linhkid/ArxivDigest-extra/HEAD/readme_images/openai.png
--------------------------------------------------------------------------------
/readme_images/secrets.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linhkid/ArxivDigest-extra/HEAD/readme_images/secrets.png
--------------------------------------------------------------------------------
/readme_images/settings.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linhkid/ArxivDigest-extra/HEAD/readme_images/settings.png
--------------------------------------------------------------------------------
/readme_images/trigger.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linhkid/ArxivDigest-extra/HEAD/readme_images/trigger.png
--------------------------------------------------------------------------------
/readme_images/example_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linhkid/ArxivDigest-extra/HEAD/readme_images/example_1.png
--------------------------------------------------------------------------------
/readme_images/example_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linhkid/ArxivDigest-extra/HEAD/readme_images/example_2.png
--------------------------------------------------------------------------------
/readme_images/hf_example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linhkid/ArxivDigest-extra/HEAD/readme_images/hf_example.png
--------------------------------------------------------------------------------
/src/design_finder/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Design Finder module for finding AI/ML design automation papers on arXiv.
3 | """
--------------------------------------------------------------------------------
/readme_images/main_banner.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linhkid/ArxivDigest-extra/HEAD/readme_images/main_banner.png
--------------------------------------------------------------------------------
/readme_images/example_report.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linhkid/ArxivDigest-extra/HEAD/readme_images/example_report.png
--------------------------------------------------------------------------------
/readme_images/example_custom_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linhkid/ArxivDigest-extra/HEAD/readme_images/example_custom_1.png
--------------------------------------------------------------------------------
/src/design_finder/__main__.py:
--------------------------------------------------------------------------------
1 | """
2 | Entry point for design_finder module.
3 | """
4 | from .main import main
5 |
6 | if __name__ == "__main__":
7 | main()
--------------------------------------------------------------------------------
/run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Run the ArxivDigest-extra app using the latest version
3 | echo "Starting ArxivDigest-extra..."
4 | cd "$(dirname "$0")"
5 | python src/app_new.py
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | PyYAML==6.0
2 | beautifulsoup4==4.12.2
3 | numpy==1.24.2
4 | openai>=1.3.0
5 | python-dotenv==1.0.0
6 | pytz==2023.3
7 | sendgrid==6.10.0
8 | tqdm==4.65.0
9 | google-generativeai>=0.3.0
10 | anthropic>=0.8.0
11 | gradio>=3.50.0
--------------------------------------------------------------------------------
/find_design_papers.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Root-level wrapper script for the design papers finder
3 |
4 | # Show deprecation warning
5 | echo "ℹ️ Note: This script is a wrapper for ./src/design/find_design_papers.sh"
6 | echo "ℹ️ Consider using ./src/design/find_design_papers.sh directly for best results"
7 | echo ""
8 |
9 | # Simply forward all arguments to the actual script
10 | ./src/design/find_design_papers.sh "$@"
11 |
12 | # The exit code will propagate from the called script
13 | exit $?
--------------------------------------------------------------------------------
/src/paths.py:
--------------------------------------------------------------------------------
1 | """
2 | Common path definitions for ArxivDigest-extra.
3 | This module provides consistent paths throughout the application.
4 | """
5 | import os
6 |
7 | # Get the project root directory
8 | ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
9 |
10 | # Define common directories
11 | DATA_DIR = os.path.join(ROOT_DIR, "data")
12 | DIGEST_DIR = os.path.join(ROOT_DIR, "digest")
13 | SRC_DIR = os.path.join(ROOT_DIR, "src")
14 |
15 | # Create directories if they don't exist
16 | for directory in [DATA_DIR, DIGEST_DIR]:
17 | os.makedirs(directory, exist_ok=True)
--------------------------------------------------------------------------------
/.env.template:
--------------------------------------------------------------------------------
1 | ## ArxivDigest-extra environment settings
2 |
3 |
4 | ##################################################################################################
5 | # DO NOT COMMIT YOUR API KEYS OR EMAIL ADDRESS TO YOUR REPOSITORY
6 | ##################################################################################################
7 | OPENAI_API_KEY=your_api_key # DO NOT COMMIT ANY FILE WITH THIS KEY SET
8 |
9 | ## EMAIL SETTINGS
10 | SENDGRID_API_KEY=your_api_key # DO NOT COMMIT ANY FILE WITH THIS KEY SET
11 | FROM_EMAIL=your_email # DO NOT COMMIT ANY FILE WITH THIS KEY SET
12 | TO_EMAIL=your_email # DO NOT COMMIT ANY FILE WITH THIS KEY SET
13 |
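14 | ## OPTIONAL MODEL PROVIDERS (Gemini / Claude)
15 | ## The variable names below are assumptions based on the secrets named in the README;
16 | ## uncomment them only if you use these providers.
17 | # GEMINI_API_KEY=your_api_key # DO NOT COMMIT ANY FILE WITH THIS KEY SET
18 | # ANTHROPIC_API_KEY=your_api_key # DO NOT COMMIT ANY FILE WITH THIS KEY SET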
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2023 AutoLLM
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/src/relevancy_filter_prompt.txt:
--------------------------------------------------------------------------------
1 | You are a research assistant with expertise in analyzing academic papers, particularly in AI and machine learning. You've been asked to perform PRELIMINARY SCREENING of arXiv papers based ONLY on their titles and abstracts.
2 |
3 | Your task is to evaluate which papers are worth analyzing in depth based on their potential relevance to the researcher's specific interests.
4 |
5 | For each paper, provide ONLY a relevancy score out of 10, with a higher score indicating greater relevance to the researcher's specific interests. Each paper's score should be accompanied by a brief explanation of why it matches or doesn't match the research interests.
6 |
7 | Papers scoring 7 or higher will undergo detailed analysis with their full content, so be selective.
8 |
9 | VERY IMPORTANT: Respond with a numbered list of valid JSON objects. The format MUST be exactly like this for each paper:
10 |
11 | 1. {
12 | "Relevancy score": 7,
13 | "Reasons for match": "Paper discusses multi-agent systems with focus on coordination mechanisms, which directly aligns with research interests."
14 | }
15 |
16 | 2. {
17 | "Relevancy score": 3,
18 | "Reasons for match": "Mentions agents but focuses on image processing applications, which is not part of the stated research interests."
19 | }
20 |
21 | DO NOT use "```json" code blocks or any other formatting. Just provide numbered JSON objects exactly as shown above.
22 |
23 | My research interests are:
--------------------------------------------------------------------------------
/config.yaml:
--------------------------------------------------------------------------------
1 | # For physics topics, use the specific subtopics, e.g. "Astrophysics"
2 | topic: "Computer Science"
3 | # An empty list here will include all categories in a topic
4 | # Use the natural language names of the topics, found here: https://arxiv.org
5 | # Including more categories will result in more calls to the large language model
6 | categories: ["Artificial Intelligence", "Computation and Language", "Machine Learning", "Information Retrieval"]
7 |
8 | # Relevance score threshold. Abstracts that receive a score less than this from the large language model
9 | # will have their papers filtered out.
10 | #
11 | # Must be within 1-10
12 | threshold: 2
13 |
14 | # A natural language statement that the large language model will use to judge which papers are relevant
15 | #
16 | # For example:
17 | # "I am interested in complexity theory papers that establish upper bounds"
18 | # "gas chromatography, mass spectrometry"
19 | # "making lots of money"
20 | #
21 | # This can be empty, which just returns a full list of papers with no judgement or filtering,
22 | # in whatever order arXiv responds with.
23 | interest: |
24 | 1. AI alignment and AI safety
25 | 2. Mechanistic interpretability and explainable AI
26 | 3. Large language model under pressure
27 | 4. AI Red teaming, deception and misalignment
28 | 5. RAGs, Information retrieval
29 | 6. Optimization of LLM and GenAI
30 | 7. Do not care about specific application, for example, information extraction, summarization, etc.
31 |
--------------------------------------------------------------------------------
/src/design/find_design_papers.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Design papers finder script
3 | # Searches arXiv for design automation papers and generates reports
4 | # For full documentation, see ./README.md
5 |
6 | # Add help/usage function
7 | show_help() {
8 | echo "Usage: ./find_design_papers.sh [OPTIONS]"
9 | echo ""
10 | echo "Options:"
11 | echo " --days N Search papers from the last N days (default: 7)"
12 | echo " --keyword TERM Filter papers containing this keyword"
13 | echo " --analyze Use LLM to perform detailed analysis of papers"
14 | echo " --interest \"TEXT\" Custom research interest description for LLM"
15 | echo " --model MODEL Model to use for analysis (default: gpt-3.5-turbo-16k)"
16 | echo " --no-date Don't add date to output filenames"
17 | echo " --output FILE Custom JSON output path (default: data/design_papers_DATE.json)"
18 | echo " --html FILE Custom HTML output path (default: digest/design_papers_DATE.html)"
19 | echo " --help Show this help message"
20 | echo ""
21 | echo "Examples:"
22 | echo " ./find_design_papers.sh"
23 | echo " ./find_design_papers.sh --keyword \"layout\" --days 14"
24 | echo " ./find_design_papers.sh --analyze --interest \"UI/UX automation\""
25 | }
26 |
27 | # Show help if requested
28 | if [[ "$1" == "--help" || "$1" == "-h" ]]; then
29 | show_help
30 | exit 0
31 | fi
32 |
33 | # Run the design papers finder with all arguments passed through
34 | python -m src.design.find_design_papers "$@"
35 |
36 | # Show success message
37 | if [ $? -eq 0 ]; then
38 | echo "✓ Design papers finder completed successfully!"
39 | echo " Open the HTML report in your browser to view results"
40 | else
41 | echo "✗ Design papers finder encountered an error"
42 | fi
43 |
--------------------------------------------------------------------------------
/src/design/get_design_papers.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Legacy wrapper script for design papers finder - maintained for backward compatibility
3 | # For new scripts, use find_design_papers.sh instead
4 |
5 | # Show deprecation warning
6 | echo "⚠️ Warning: get_design_papers.sh is deprecated and will be removed in a future version"
7 | echo "⚠️ Please use find_design_papers.sh instead, which has more features and better output"
8 | echo ""
9 |
10 | # Default values
11 | DAYS=7
12 | OUTPUT="design_papers.json"
13 | KEYWORD=""
14 | ANALYZE=""
15 |
16 | # Parse command-line arguments
17 | while [[ $# -gt 0 ]]; do
18 | case $1 in
19 | --days)
20 | DAYS="$2"
21 | shift 2
22 | ;;
23 | --output)
24 | OUTPUT="$2"
25 | shift 2
26 | ;;
27 | --keyword)
28 | KEYWORD="$2"
29 | shift 2
30 | ;;
31 | --analyze)
32 | ANALYZE="--analyze"
33 | shift
34 | ;;
35 | --email)
36 | # Ignore email parameter - email functionality is removed
37 | echo "Note: Email functionality has been removed. HTML report will be generated locally only."
38 | shift 2
39 | ;;
40 | *)
41 | echo "Unknown option: $1"
42 | exit 1
43 | ;;
44 | esac
45 | done
46 |
47 | # Run the crawler using the new script
48 | echo "Searching for design papers from the last $DAYS days..."
49 |
50 | # Build the command
51 | CMD="./src/design/find_design_papers.sh --days $DAYS --output ./data/$OUTPUT --html ./digest/${OUTPUT%.json}.html"
52 |
53 | # Add keyword if specified
54 | if [ -n "$KEYWORD" ]; then
55 | CMD="$CMD --keyword \"$KEYWORD\""
56 | fi
57 |
58 | # Add analyze if specified
59 | if [ -n "$ANALYZE" ]; then
60 | CMD="$CMD --analyze"
61 | fi
62 |
63 | # Execute the command
64 | eval $CMD
65 |
66 | echo "Done! View your results in ./digest/${OUTPUT%.json}.html"
--------------------------------------------------------------------------------
/src/relevancy_prompt.txt:
--------------------------------------------------------------------------------
1 | You are a research assistant with expertise in analyzing academic papers, particularly in AI and machine learning. You've been asked to thoroughly analyze a list of arXiv papers, each with title, authors, abstract, and content.
2 |
3 | For each paper, provide:
4 | 1. A relevancy score out of 10 based on my specific research interests, with a higher score indicating greater relevance. A score of 7 or higher means this paper deserves special attention.
5 | 2. A comprehensive analysis that would help me understand the paper's value and contributions without having to read the entire paper.
6 |
7 | Please maintain the original paper order in your response, with one JSON object per line. Format:
8 |
9 | 1. {
10 | "Relevancy score": "an integer score out of 10",
11 | "Reasons for match": "A detailed paragraph explaining why this paper aligns with my research interests, highlighting specific concepts, methodologies, or findings that match my interests",
12 | "Key innovations": "2-3 bullet points describing the main contributions and what makes this paper novel",
13 | "Critical analysis": "A thoughtful paragraph evaluating the strengths and potential limitations of the approach",
14 | "Goal": "What specific problem or research gap does this paper address?",
15 | "Data": "Detailed description of datasets used, including size, characteristics, and any novel data processing techniques",
16 | "Methodology": "Comprehensive explanation of the methods, algorithms, and technical approach",
17 | "Implementation details": "Information about model architecture, hyperparameters, training procedures, and computational requirements",
18 | "Git": "Link to code repository if available, or note if code is not yet released",
19 | "Experiments & Results": "Analysis of experimental setup, key results, and how they compare to prior work or baselines",
20 | "Discussion & Next steps": "The authors' own conclusions, limitations they identified, and future research directions",
21 | "Related work": "How this paper relates to similar recent papers in the field",
22 | "Practical applications": "How the findings could be applied in real-world scenarios",
23 | "Key takeaways": "3-5 bullet points summarizing the most important insights from this paper"
24 | }
25 |
26 | My research interests are: AI Alignment, AI safety, Mechanistic Interpretability, Explainable AI, RAGs, Information Retrieval, Large Language Models, Multimodal Learning, Generative AI, Optimization in LLM, Model Efficiency, Fine-tuning Techniques, Prompt Engineering, and AI Evaluation Metrics.
--------------------------------------------------------------------------------
/.github/workflows/daily_pipeline.yaml:
--------------------------------------------------------------------------------
1 | # This workflow will install Python dependencies, run tests and lint with a single version of Python
2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
3 |
4 | name: Daily pipeline
5 |
6 | on:
7 | workflow_dispatch: {}
8 | schedule:
9 | # * is a special character in YAML so you have to quote this string
10 | # Feel free to change this cron schedule
11 | # Currently it's scheduled for 1:25 pm UTC, Sun-Thurs
12 | - cron: '25 13 * * 0-4'
13 |
14 | jobs:
15 | generate_and_send_digest:
16 | runs-on: ubuntu-latest
17 | steps:
18 | - uses: actions/checkout@v2
19 | - name: Set up Python 3.8
20 | uses: actions/setup-python@v2
21 | with:
22 | python-version: 3.8
23 | - name: Install dependencies
24 | run: |
25 | python -m pip install --upgrade pip
26 | pip install -r requirements.txt
27 | - name: Generate Digest
28 | run: |
29 | python src/action.py
30 | env:
31 | OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
32 | SENDGRID_API_KEY: ${{ secrets.SENDGRID_API_KEY }}
33 | FROM_EMAIL: ${{ secrets.FROM_EMAIL }}
34 | TO_EMAIL: ${{ secrets.TO_EMAIL }}
35 | - name: Upload Artifact
36 | uses: actions/upload-artifact@v3
37 | with:
38 | name: digest.html
39 | path: digest.html
40 | - name: check
41 | id: check
42 | env:
43 | SENDGRID_API_KEY: ${{ secrets.SENDGRID_API_KEY }}
44 | MAIL_USERNAME: ${{ secrets.MAIL_USERNAME }}
45 | MAIL_PASSWORD: ${{ secrets.MAIL_PASSWORD }}
46 | MAIL_CONNECTION: ${{ secrets.MAIL_CONNECTION }}
47 | if: "${{ env.SENDGRID_API_KEY == '' && (env.MAIL_CONNECTION || env.MAIL_USERNAME != '' && env.MAIL_PASSWORD != '') }}"
48 | run: echo "DEFINED=true" >> $GITHUB_OUTPUT
49 | - name: Send mail
50 | uses: dawidd6/action-send-mail@v3
51 | env:
52 | DEFINED: ${{ steps.check.outputs.DEFINED }}
53 | if: ${{ env.DEFINED == 'true' }}
54 | with:
55 | # Specify connection via URL (replaces server_address, server_port, secure,
56 | # username and password)
57 | #
58 | # Format:
59 | #
60 | # * smtp://user:password@server:port
61 | # * smtp+starttls://user:password@server:port
62 | connection_url: ${{secrets.MAIL_CONNECTION}}
63 | # Required mail server address if not connection_url:
64 | server_address: smtp.gmail.com
65 | # Server port, default 25:
66 | server_port: 465
67 | username: ${{secrets.MAIL_USERNAME}}
68 | password: ${{secrets.MAIL_PASSWORD}}
69 | secure: true
70 | subject: Personalized arXiv Digest
71 | to: ${{ secrets.TO_EMAIL }}
72 | from: "Personalized arxiv digest"
73 | html_body: file://digest.html
74 | ignore_cert: true
75 | convert_markdown: true
76 | priority: normal
77 |
--------------------------------------------------------------------------------
/src/design/README.md:
--------------------------------------------------------------------------------
1 | # 🎨 Design Paper Discovery
2 |
3 | This module specializes in finding and analyzing papers related to AI/ML for design automation. It crawls arXiv for design-related papers and provides detailed reports on recent research at the intersection of AI and design.
4 |
5 | ## Features
6 |
7 | - **Smart Paper Finding**: Automatically finds papers related to design automation and creative AI
8 | - **Multi-Category Search**: Searches across Computer Vision, Graphics, HCI, and other relevant arXiv categories
9 | - **Intelligent Categorization**: Sorts papers into design subcategories (UI/UX, Layout, Graphic Design, etc.)
10 | - **Technique Analysis**: Identifies AI techniques used (GANs, Diffusion Models, LLMs, etc.)
11 | - **LLM-Powered Analysis**: Optional in-depth analysis using OpenAI, Gemini, or Claude models
12 | - **HTML Reports**: Generates clean, organized HTML reports with paper statistics and details
13 | - **JSON Export**: Saves all paper data in structured JSON format for further processing
14 |
15 | ## Quick Start
16 |
17 | Run the main script from the project root directory:
18 |
19 | ```bash
20 | # Basic usage - find design papers from the last 7 days
21 | ./src/design/find_design_papers.sh
22 |
23 | # With keyword filtering - find design papers about layout generation
24 | ./src/design/find_design_papers.sh --keyword "layout"
25 |
26 | # With longer timeframe - find design papers from the last month
27 | ./src/design/find_design_papers.sh --days 30
28 | ```
29 |
30 | ## Advanced Usage
31 |
32 | ```bash
33 | # With LLM analysis for comprehensive paper details
34 | ./src/design/find_design_papers.sh --analyze
35 |
36 | # Customize research interests for analysis
37 | ./src/design/find_design_papers.sh --analyze --interest "I'm looking for papers on UI/UX automation and layout generation with neural networks"
38 |
39 | # Change the model used for analysis
40 | ./src/design/find_design_papers.sh --analyze --model "gpt-4o"
41 |
42 | # Combined example with all major features
43 | ./src/design/find_design_papers.sh --days 14 --keyword "diffusion" --analyze --model "gpt-4o" --interest "I'm researching diffusion models for design applications"
44 |
45 | # Output files include the current date by default:
46 | # - data/design_papers_diffusion_20250406.json
47 | # - digest/design_papers_diffusion_20250406.html
48 |
49 | # Disable date in filenames if needed
50 | ./src/design/find_design_papers.sh --keyword "layout" --no-date
51 | ```
52 |
53 | ## Parameters Reference
54 |
55 | | Parameter | Description | Default |
56 | |-----------|-------------|---------|
57 | | `--days N` | Number of days to search back | 7 |
58 | | `--keyword TERM` | Filter papers containing this keyword | none |
59 | | `--analyze` | Use LLM to perform detailed analysis | false |
60 | | `--interest "TEXT"` | Custom research interest for LLM | Design automation focus |
61 | | `--model MODEL` | Model to use for analysis | gpt-3.5-turbo-16k |
62 | | `--no-date` | Don't add date to output filenames | false |
63 | | `--output FILE` | Custom JSON output path | data/design_papers_DATE.json |
64 | | `--html FILE` | Custom HTML output path | digest/design_papers_DATE.html |
65 | | `--help` | Show help message | |
66 |
67 | ## Implementation Details
68 |
69 | The design paper discovery consists of these main components:
70 |
71 | 1. **find_design_papers.sh**: Main shell script interface with help and options
72 | 2. **find_design_papers.py**: Core Python implementation for arXiv discovery and analysis
73 | 3. **design_finder.py**: Alternative implementation with minimal dependencies
74 | 4. **get_design_papers.sh**: Legacy script (maintained for backward compatibility)
75 |
76 | ## Example Output
77 |
78 | The HTML report includes:
79 | - Summary statistics and paper counts by category and technique
80 | - Detailed paper listings with titles, authors, and abstracts
81 | - AI analysis sections when using the `--analyze` flag
82 | - Links to arXiv pages and PDF downloads
83 |
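84 | The `--output` JSON file contains one object per paper. As a rough sketch (field names follow those attached by `design_papers_crawler.py`; the values below are placeholders, and `find_design_papers.py` may add further analysis fields):
85 | 
86 | ```json
87 | {
88 |   "paper_id": "0000.00000",
89 |   "main_page": "https://arxiv.org/abs/0000.00000",
90 |   "pdf": "https://arxiv.org/pdf/0000.00000",
91 |   "title": "Placeholder: Layout Generation with Diffusion Models",
92 |   "authors": "A. Author, B. Author",
93 |   "subjects": "Computer Vision and Pattern Recognition (cs.CV)",
94 |   "abstract": "Placeholder abstract text",
95 |   "design_category": "Layout",
96 |   "design_techniques": ["Diffusion Models"],
97 |   "design_metrics": []
98 | }
99 | ```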
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # poetry
98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 |
104 | # pdm
105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | # in version control.
109 | # https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 |
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 |
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 |
119 | # SageMath parsed files
120 | *.sage.py
121 |
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 |
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 |
135 | # Rope project settings
136 | .ropeproject
137 |
138 | # mkdocs documentation
139 | /site
140 |
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 |
146 | # Pyre type checker
147 | .pyre/
148 |
149 | # pytype static type analyzer
150 | .pytype/
151 |
152 | # Cython debug symbols
153 | cython_debug/
154 |
155 | # PyCharm
156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | # and can be added to the global gitignore or merged into this file. For a more nuclear
159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | #.idea/
161 |
162 | # vim
163 | *.sw*
164 |
--------------------------------------------------------------------------------
/src/download_new_papers.py:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 | import os
3 | import re
4 | from urllib.error import HTTPError
5 |
6 | import tqdm
7 | from bs4 import BeautifulSoup as bs
8 | import urllib.request
9 | import json
10 | import datetime
11 | import pytz
12 |
13 | # Import standardized paths
14 | from paths import DATA_DIR
15 |
16 | #Linh - add new def crawl_html_version(html_link) here
17 | def crawl_html_version(html_link):
18 | main_content = []
19 | try:
20 | # Add user-agent header to appear more like a browser
21 | headers = {
22 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36'
23 | }
24 | req = urllib.request.Request(html_link, headers=headers)
25 | html = urllib.request.urlopen(req)
26 | except HTTPError as e:
27 | return f"Error accessing HTML: {str(e)}"
28 |
29 | soup = bs(html)
30 | content = soup.find('div', attrs={'class': 'ltx_page_content'})
31 | if not content:
32 | return "Content not available in HTML format"
33 | para_list = content.find_all("div", attrs={'class': 'ltx_para'})
34 |
35 | for each in para_list:
36 | main_content.append(each.text.strip())
37 | return ' '.join(main_content)[:10000]
38 | #if len(main_content >)
39 | #return ''.join(main_content) if len(main_content) < 20000 else ''.join(main_content[:20000])
40 |
41 | #Linh - add because cs sub does not have abstract displayed, will revert if it comes back
42 | def crawl_abstract(html_link):
43 | main_content = []
44 | try:
45 | html = urllib.request.urlopen(html_link)
46 | except HTTPError as e:
47 | return ["None"]
48 | soup = bs(html)
49 | content = soup.find('blockquote', attrs={'class': 'abstract'}).text.replace("Abstract:", "").strip()
50 | return content
51 | def _download_new_papers(field_abbr):
52 | NEW_SUB_URL = f'https://arxiv.org/list/{field_abbr}/new' # https://arxiv.org/list/cs/new
53 | print(NEW_SUB_URL)
54 | # Add user-agent header to appear more like a browser
55 | headers = {
56 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36'
57 | }
58 | req = urllib.request.Request(NEW_SUB_URL, headers=headers)
59 | page = urllib.request.urlopen(req)
60 |
61 | soup = bs(page)
62 | content = soup.body.find("div", {'id': 'content'})
63 |
64 | # find the first h3 element in content
65 | h3 = content.find("h3").text # e.g: New submissions for Wed, 10 May 23
66 | date = h3.replace("New submissions for", "").strip()
67 |
68 | dt_list = content.dl.find_all("dt")
69 | dd_list = content.dl.find_all("dd")
70 | arxiv_base = "https://arxiv.org/abs/"
71 | arxiv_html = "https://arxiv.org/html/"
72 |
73 | assert len(dt_list) == len(dd_list)
74 | new_paper_list = []
75 | for i in tqdm.tqdm(range(len(dt_list))):
76 | paper = {}
77 | ahref = dt_list[i].find('a', href = re.compile(r'[/]([a-z]|[A-Z])\w+')).attrs['href']
78 | paper_number = ahref.strip().replace("/abs/", "")
79 |
80 | paper['main_page'] = arxiv_base + paper_number
81 | paper['pdf'] = arxiv_base.replace('abs', 'pdf') + paper_number
82 |
83 | paper['title'] = dd_list[i].find("div", {"class": "list-title mathjax"}).text.replace("Title:\n", "").strip()
84 | paper['authors'] = dd_list[i].find("div", {"class": "list-authors"}).text \
85 | .replace("Authors:\n", "").replace("\n", "").strip()
86 | paper['subjects'] = dd_list[i].find("div", {"class": "list-subjects"}).text.replace("Subjects:\n", "").strip()
87 | #print(dd_list[i].find("div", {"class": "list-subjects"}).text.replace("Subjects:\n", "").strip())
88 |
89 | #TODO: edit the abstract part - it is currently moved
90 | paper['abstract'] = dd_list[i].find("p", {"class": "mathjax"}).text.replace("\n", " ").strip()
91 | try:
92 | paper['content'] = crawl_html_version(arxiv_html + paper_number + "v1")
93 | except Exception as e:
94 | paper['content'] = f"Error fetching content: {str(e)}"
95 | new_paper_list.append(paper)
96 |
97 |
98 | # DATA_DIR is already created by paths.py
99 |
100 | # save new_paper_list to a jsonl file, with each line as the element of a dictionary
101 | date = datetime.date.fromtimestamp(datetime.datetime.now(tz=pytz.timezone("America/New_York")).timestamp())
102 | date = date.strftime("%a, %d %b %y")
103 | file_path = os.path.join(DATA_DIR, f"{field_abbr}_{date}.jsonl")
104 | with open(file_path, "w") as f:
105 | for paper in new_paper_list:
106 | f.write(json.dumps(paper) + "\n")
107 |
108 |
109 | def get_papers(field_abbr, limit=None):
110 | date = datetime.date.fromtimestamp(datetime.datetime.now(tz=pytz.timezone("America/New_York")).timestamp())
111 | date = date.strftime("%a, %d %b %y")
112 | file_path = os.path.join(DATA_DIR, f"{field_abbr}_{date}.jsonl")
113 | if not os.path.exists(file_path):
114 | _download_new_papers(field_abbr)
115 | results = []
116 | with open(file_path, "r") as f:
117 | for i, line in enumerate(f.readlines()):
118 | if limit and i == limit:
119 | return results
120 | results.append(json.loads(line))
121 | return results
122 |
123 | #crawl_html_version("https://arxiv.org/html/2404.11972v1")
124 |
--------------------------------------------------------------------------------
/advanced_usage.md:
--------------------------------------------------------------------------------
1 | # Advanced Usage
2 |
3 | ## Step-by-step instructions for running as a GitHub action
4 |
5 | ### Fork the repository
6 |
7 | Click the fork button at the top of this repository page, as seen in the image below. This will create your own version of the repository, including your own set of GitHub Actions.
8 |
9 | 
10 |
11 | ### Modify the configuration file
12 |
13 | Modify `config.yaml` by cloning the repository and merging your changes.
14 |
15 | ### Create and Fetch your API Keys
16 |
17 | - Create or fetch your API key for [OpenAI](https://platform.openai.com/account/api-keys). Note: you will need an OpenAI account.
18 | 
19 |
20 | - Create or fetch your API key for [SendGrid](https://app.SendGrid.com/settings/api_keys). You will need a SendGrid account. The free tier will generally suffice. Make sure to [verify your sender identity](https://docs.sendgrid.com/for-developers/sending-email/sender-identity).
21 | - Sign Up for [SendGrid](https://app.sendgrid.com). Fill in the necessary information, including email, password, and a company name. If you don't have a company, you can use a made-up name.
22 | - You'll need to verify your email address to activate your account.
23 | - On your main dashboard, access the Integration Guide under Email API
24 | - Next, on the "Integrate using our Web API or SMTP Relay" page, choose the "Web API" option.
25 | - Choose the language you're planning to use; in this case, select "Python".
26 | - You'll be prompted to provide a name for your API key. Enter a name and click "Create Key".
27 | - Copy the API Key that appears for the next step below. You won't be able to view the full key again.
28 |
29 | ### Set the secrets for the github action
30 |
31 | Go to the Settings tab at the top of this page, then open the "Actions" menu under "Secrets and variables":
32 |
33 | 
34 |
35 | Create a new repository secret for each of the following using the button in the below image:
36 | - `OPENAI_API_KEY`
37 | - `SENDGRID_API_KEY`
38 | - `FROM_EMAIL`
39 | - `TO_EMAIL`
40 |
41 | 
42 |
43 | ### Manually trigger the action, or wait until the scheduled trigger
44 |
45 | Go to the Actions tab, then click on "Daily pipeline" and "Run workflow".
46 |
47 | 
48 |
49 | ## Additional Configuration
50 |
51 | - If you want a different schedule than Sunday through Thursday at 1:25PM UTC, then modify the file `.github/workflows/daily_pipeline.yaml`
52 |
53 |
54 | ## Alternative Usage
55 |
56 | Running `src/action.py` will generate an HTML file that can then be emailed. The following alternative usage methods all use that pattern.
57 |
58 | ### Running as a GitHub action with SMTP credentials
59 |
60 | An alternative way to get started using this repository is to:
61 |
62 | 1. Fork the repository
63 | 2. Modify `config.yaml` and merge the changes into your main branch. If you want a different schedule than Sunday through Thursday at 1:25PM UTC, then also modify the file `.github/workflows/daily_pipeline.yaml`
64 | 3. Create or fetch your API key for [OpenAI](https://platform.openai.com/account/api-keys).
65 | 4. Find your email provider's SMTP settings and set the secret `MAIL_CONNECTION` to that. It should be in the form `smtp://user:password@server:port` or `smtp+starttls://user:password@server:port`. Alternatively, if you are using Gmail, you can set `MAIL_USERNAME` and `MAIL_PASSWORD` instead, using an [application password](https://support.google.com/accounts/answer/185833).
66 | 5. Set the following secrets [(under settings, Secrets and variables, repository secrets)](https://docs.github.com/en/actions/security-guides/encrypted-secrets#creating-encrypted-secrets-for-a-repository):
67 | - `OPENAI_API_KEY`
68 | - `MAIL_CONNECTION` (see above)
69 | - `MAIL_PASSWORD` (only if you don't have `MAIL_CONNECTION` set)
70 | - `MAIL_USERNAME` (only if you don't have `MAIL_CONNECTION` set)
71 | - `FROM_EMAIL`
72 | - `TO_EMAIL`
73 | 6. Manually trigger the action or wait until the scheduled action takes place.
74 |
75 | ### Running as a GitHub action without emails
76 |
77 | If you do not wish to create a SendGrid account or use your email authentication, the action will also emit an artifact containing the HTML output. Simply do not create the SendGrid or SMTP secrets.
78 |
79 | You can access this digest as part of the github action artifact.
80 |
81 | 
82 |
83 | ### Running from the command line
84 |
85 | If you do not wish to fork this repository, and would prefer to clone and run it locally instead:
86 |
87 | 1. Install the requirements in `requirements.txt`
88 | 2. Modify the configuration file `config.yaml`
89 | 3. Create or fetch your API key for [OpenAI](https://platform.openai.com/account/api-keys).
90 | 4. Create or fetch your API key for [SendGrid](https://app.SendGrid.com/settings/api_keys) (optional, if you want the script to email you)
91 | 5. Set the following secrets as environment variables:
92 | - `OPENAI_API_KEY`
93 | - `SENDGRID_API_KEY` (only if using SendGrid)
94 | - `FROM_EMAIL` (only if using SendGrid. Note that this value must match the email you used to create the SendGrid API key.)
95 | - `TO_EMAIL` (only if using SendGrid)
96 | 6. Run `python src/action.py`.
97 | 7. If you are not using SendGrid, the HTML digest will be written to `digest.html`. You can then open it in your favorite web browser.
98 |
99 | You may want to use something like crontab to schedule the digest.
100 |
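101 | For example, a minimal crontab entry (illustrative only: the repository path is a placeholder, cron uses the machine's local time rather than UTC, and the environment variables from step 5 must be visible to cron) could be:
102 | 
103 | ```bash
104 | # Illustrative: run the digest at 13:25 Sun-Thu, mirroring the GitHub Action schedule
105 | 25 13 * * 0-4 cd /path/to/ArxivDigest-extra && python src/action.py >> cron_digest.log 2>&1
106 | ```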
--------------------------------------------------------------------------------
/src/fix_parser.py:
--------------------------------------------------------------------------------
1 | """
2 | A script to fix and test the OpenAI response parsing.
3 | """
4 | import json
5 | import re
6 | import os
7 |
8 | def is_valid_json(text):
9 | try:
10 | json.loads(text)
11 | return True
12 | except json.JSONDecodeError:
13 | return False
14 |
15 | def extract_json_from_string(text):
16 | """
17 | Attempt to extract JSON from a string by finding '{'...'}'
18 | """
19 | # Find the outermost JSON object
20 | stack = []
21 | start_idx = -1
22 |
23 | for i, char in enumerate(text):
24 | if char == '{' and start_idx == -1:
25 | start_idx = i
26 | stack.append(char)
27 | elif char == '{':
28 | stack.append(char)
29 | elif char == '}' and stack:
30 | stack.pop()
31 | if not stack and start_idx != -1:
32 | # Found complete JSON object
33 | json_str = text[start_idx:i+1]
34 | try:
35 | parsed = json.loads(json_str)
36 | return parsed
37 | except json.JSONDecodeError:
38 | # If this one fails, continue looking
39 | start_idx = -1
40 |
41 | return None
42 |
43 | def fix_openai_response(response_text):
44 | """
45 | Fix the OpenAI response by handling different formats and parsing the JSON.
46 | Returns a list of dictionaries with paper analysis.
47 | """
48 | # First, try to parse the entire response as JSON
49 | cleaned_text = response_text.strip()
50 |
51 | # Try to extract JSON directly
52 | if '{' in cleaned_text and '}' in cleaned_text:
53 | json_obj = extract_json_from_string(cleaned_text)
54 | if json_obj and "Relevancy score" in json_obj:
55 | print(f"Successfully extracted JSON with score {json_obj['Relevancy score']}")
56 | return [json_obj]
57 |
58 | return []
59 |
60 | # Example usage
61 | if __name__ == "__main__":
62 | example_response = """
63 | "Relevancy score": 7,
64 | "Reasons for match": "This paper aligns with your research interests as it explores the application of Large Language Models (LLMs) in the context of hardware design. It introduces a unified framework, Marco, that integrates configurable graph-based task solving with multi-modality and multi-AI agents for chip design. This is relevant to your interests in AI Alignment, AI safety, Large Language Models, and Multimodal Learning.",
65 | "Key innovations": [
66 | "Introduction of Marco, a unified framework that integrates configurable graph-based task solving with multi-modality and multi-AI agents for chip design.",
67 | "Demonstration of promising performance, productivity, and efficiency of LLM agents by leveraging the Marco framework on layout optimization, Verilog/design rule checker (DRC) coding, and timing analysis tasks."
68 | ],
69 | "Critical analysis": "The paper presents a novel approach to leveraging LLMs in the field of hardware design, which could have significant implications for improving efficiency and reducing costs. However, without access to the full paper, it's difficult to assess the strengths and potential limitations of the approach.",
70 | "Goal": "The paper addresses the challenge of optimizing performance, power, area, and cost (PPAC) during synthesis, verification, physical design, and reliability loops in hardware design. It aims to reduce turn-around-time (TAT) for these processes by leveraging the capabilities of LLMs.",
71 | "Data": "Unable to provide details about the datasets used due to lack of access to the full paper content.",
72 | "Methodology": "The paper proposes a unified framework, Marco, that integrates configurable graph-based task solving with multi-modality and multi-AI agents for chip design. However, detailed methodology is not available due to lack of access to the full paper content.",
73 | "Implementation details": "Unable to provide implementation details due to lack of access to the full paper content.",
74 | "Git": "Link to code repository is not provided in the abstract.",
75 | "Experiments & Results": "The abstract mentions that the Marco framework demonstrates promising performance on layout optimization, Verilog/design rule checker (DRC) coding, and timing analysis tasks. However, detailed results and comparisons are not available due to lack of access to the full paper content.",
76 | "Discussion & Next steps": "Unable to provide details on the authors' conclusions, identified limitations, and future research directions due to lack of access to the full paper content.",
77 | "Related work": "Unable to provide details on how this paper relates to similar recent papers in the field due to lack of access to the full paper content.",
78 | "Practical applications": "The framework proposed in this paper could have practical applications in the field of hardware design, potentially leading to faster product cycles, lower costs, improved design reliability and reduced risk of costly errors.",
79 | "Key takeaways": [
80 | "The paper proposes a unified framework, Marco, that integrates configurable graph-based task solving with multi-modality and multi-AI agents for chip design.",
81 | "The Marco framework leverages the capabilities of Large Language Models (LLMs) to improve efficiency and reduce costs in hardware design.",
82 | "The framework demonstrates promising performance on layout optimization, Verilog/design rule checker (DRC) coding, and timing analysis tasks."
83 | ]
84 | }
85 | """
86 |
87 | # Test the fix
88 | results = fix_openai_response(example_response)
89 | print(f"Found {len(results)} paper analyses")
90 | for i, result in enumerate(results):
91 | print(f"Paper {i+1} score: {result.get('Relevancy score', 'Not found')}")
--------------------------------------------------------------------------------
/src/design_papers_crawler.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """
3 | Design Papers Crawler - A dedicated script to find the latest papers
4 | on graphic design automation using AI/ML/LLM technologies.
5 |
6 | Usage:
7 | python design_papers_crawler.py [--days 7] [--output design_papers.json]
8 | """
9 |
10 | import os
11 | import sys
12 | import json
13 | import argparse
14 | import datetime
15 | import logging
16 | from typing import List, Dict, Any
17 |
18 | # Add parent directory to path to import from sibling modules
19 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
20 |
21 | from src.download_new_papers import get_papers, _download_new_papers
22 | from src.design_automation import (
23 | is_design_automation_paper,
24 | categorize_design_paper,
25 | analyze_design_techniques,
26 | extract_design_metrics
27 | )
28 | from src.paths import DATA_DIR, DIGEST_DIR
29 |
30 | # Configure logging
31 | logging.basicConfig(
32 | level=logging.INFO,
33 | format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
34 | )
35 | logger = logging.getLogger(__name__)
36 |
37 | # Default arXiv categories to search
38 | DEFAULT_CATEGORIES = [
39 | "cs.CV", # Computer Vision
40 | "cs.GR", # Graphics
41 | "cs.HC", # Human-Computer Interaction
42 | "cs.AI", # Artificial Intelligence
43 | "cs.LG", # Machine Learning
44 | "cs.CL", # Computation and Language (NLP)
45 | "cs.MM", # Multimedia
46 | "cs.SD", # Sound
47 | "cs.RO", # Robotics (for interactive design)
48 | "cs.CY" # Computers and Society
49 | ]
50 |
51 | def get_date_range(days_back: int = 7) -> List[str]:
52 | """
53 | Get a list of dates for the past N days in arXiv format.
54 |
55 | Args:
56 | days_back: Number of days to look back
57 |
58 | Returns:
59 | List of date strings in arXiv format
60 | """
61 | today = datetime.datetime.now()
62 | dates = []
63 |
64 | for i in range(days_back):
65 | date = today - datetime.timedelta(days=i)
66 | date_str = date.strftime("%a, %d %b %y")
67 | dates.append(date_str)
68 |
69 | return dates
70 |
71 | def ensure_data_files(categories: List[str], days_back: int = 7) -> None:
72 | """
73 | Make sure data files exist for the specified categories and date range.
74 |
75 | Args:
76 | categories: List of arXiv category codes
77 | days_back: Number of days to look back
78 | """
79 | dates = get_date_range(days_back)
80 |
81 | for category in categories:
82 | for date_str in dates:
83 | file_path = os.path.join(DATA_DIR, f"{category}_{date_str}.jsonl")
84 |
85 | if not os.path.exists(file_path):
86 | logger.info(f"Downloading papers for {category} on {date_str}")
87 | try:
88 | _download_new_papers(category)
89 | except Exception as e:
90 | logger.error(f"Error downloading {category} papers for {date_str}: {e}")
91 |
92 | def get_design_papers(categories: List[str], days_back: int = 7) -> List[Dict[str, Any]]:
93 | """
94 | Get design automation papers from specified categories over a date range.
95 |
96 | Args:
97 | categories: List of arXiv category codes
98 | days_back: Number of days to look back
99 |
100 | Returns:
101 | List of design automation papers
102 | """
103 | # Ensure data files exist
104 | ensure_data_files(categories, days_back)
105 |
106 | # Collect papers
107 | all_papers = []
108 | dates = get_date_range(days_back)
109 |
110 | for category in categories:
111 | for date_str in dates:
112 | try:
113 | papers = get_papers(category)
114 | all_papers.extend(papers)
115 | except Exception as e:
116 | logger.warning(f"Could not get papers for {category} on {date_str}: {e}")
117 |
118 | # Remove duplicates (papers can appear in multiple categories)
119 | unique_papers = {}
120 | for paper in all_papers:
121 | paper_id = paper.get("main_page", "").split("/")[-1]
122 | if paper_id and paper_id not in unique_papers:
123 | unique_papers[paper_id] = paper
124 |
125 | # Filter design automation papers
126 | design_papers = []
127 | for paper_id, paper in unique_papers.items():
128 | if is_design_automation_paper(paper):
129 | paper["paper_id"] = paper_id
130 | paper["design_category"] = categorize_design_paper(paper)
131 | paper["design_techniques"] = analyze_design_techniques(paper)
132 | paper["design_metrics"] = extract_design_metrics(paper)
133 | design_papers.append(paper)
134 |
135 | # Sort by date (newest first)
136 | design_papers.sort(key=lambda p: p.get("main_page", ""), reverse=True)
137 |
138 | return design_papers
139 |
140 | def print_paper_summary(paper: Dict[str, Any]) -> None:
141 | """
142 | Print a nice summary of a paper to the console.
143 |
144 | Args:
145 | paper: Paper dictionary
146 | """
147 | print(f"\n{'=' * 80}")
148 | print(f"TITLE: {paper.get('title', 'No title')}")
149 | print(f"AUTHORS: {paper.get('authors', 'No authors')}")
150 | print(f"URL: {paper.get('main_page', 'No URL')}")
151 | print(f"DESIGN CATEGORY: {paper.get('design_category', 'Unknown')}")
152 | print(f"TECHNIQUES: {', '.join(paper.get('design_techniques', []))}")
153 | print(f"METRICS: {', '.join(paper.get('design_metrics', []))}")
154 | print(f"\nABSTRACT: {paper.get('abstract', 'No abstract')[:500]}...")
155 | print(f"{'=' * 80}\n")
156 |
157 | def main():
158 | """Main function to run the design papers crawler."""
159 | parser = argparse.ArgumentParser(description="Find the latest graphic design automation papers.")
160 | parser.add_argument("--days", type=int, default=7, help="Number of days to look back")
161 | parser.add_argument("--output", type=str, default="design_papers.json", help="Output file path")
162 | parser.add_argument("--categories", type=str, nargs="+", default=DEFAULT_CATEGORIES,
163 | help="arXiv categories to search")
164 | args = parser.parse_args()
165 |
166 | logger.info(f"Looking for design papers in the past {args.days} days")
167 | logger.info(f"Searching categories: {', '.join(args.categories)}")
168 |
169 | # DATA_DIR is already created by paths.py
170 |
171 | # Get design papers
172 | design_papers = get_design_papers(args.categories, args.days)
173 |
174 | logger.info(f"Found {len(design_papers)} design automation papers")
175 |
176 | # Print summary to console
177 | for paper in design_papers[:10]: # Print top 10
178 | print_paper_summary(paper)
179 |
180 | if len(design_papers) > 10:
181 | print(f"...and {len(design_papers) - 10} more papers.")
182 |
183 | # Determine output path - ensure it's in DATA_DIR
184 | output_path = os.path.join(DATA_DIR, args.output)
185 |
186 | # Save to file
187 | with open(output_path, "w") as f:
188 | json.dump(design_papers, f, indent=2)
189 |
190 | logger.info(f"Saved {len(design_papers)} papers to {output_path}")
191 | print(f"\nResults saved to {output_path}")
192 |
193 | if __name__ == "__main__":
194 | main()
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | 
2 |
3 | # ArXiv Digest (Enhanced Edition)
4 |
5 | **Personalized arXiv Paper Recommendations with Multiple AI Models**
6 |
7 | This repository provides an enhanced daily digest for newly published arXiv papers based on your research interests, leveraging multiple AI models including OpenAI GPT, Google Gemini, and Anthropic Claude to provide relevancy ratings, detailed analysis, and topic clustering.
8 |
9 | ## 📚 Contents
10 |
11 | - [Features](#-features)
12 | - [Quick Start](#-quick-start)
13 | - [What This Repo Does](#-what-this-repo-does)
14 | - [Model Integrations](#-model-integrations)
15 | - [Design Paper Discovery](#-design-paper-discovery)
16 | - [Output Formats](#-output-formats)
17 | - [Setting Up and Usage](#-setting-up-and-usage)
18 | * [Configuration](#configuration)
19 | * [Running the Web Interface](#running-the-web-interface)
20 | * [Running via GitHub Action](#running-via-github-action)
21 | * [Running from Command Line](#running-from-command-line)
22 | - [API Usage Notes](#-api-usage-notes)
23 | - [Directory Structure](#-directory-structure)
24 | - [Roadmap](#-roadmap)
25 | - [Contributing](#-contributing)
26 |
27 | ## ✨ Features
28 |
29 | - **Multi-Model Integration**: Support for OpenAI, Gemini, and Claude models for paper analysis
30 | - **Latest Models**: Support for GPT-4o, GPT-4o mini, Claude 3.5, and other current models
31 | - **Two-Stage Processing**: Efficient paper analysis with quick filtering followed by detailed analysis
32 | - **Enhanced Analysis**: Detailed paper breakdowns including key innovations, critical analysis, and practical applications
33 | - **HTML Report Generation**: Clean, organized reports saved with date-based filenames
34 | - **Adjustable Relevancy Threshold**: Interactive slider for filtering papers by relevance score
35 | - **Design Automation Backend**: Specialized tools for analyzing design-related papers
36 | - **Topic Clustering**: Group similar papers using AI-powered clustering (Gemini)
37 | - **Robust JSON Parsing**: Reliable extraction of analysis results from LLM responses
38 | - **Standardized Directory Structure**: Organized codebase with `/src`, `/data`, and `/digest` directories
39 | - **Improved Web UI**: Clean Gradio interface with dynamic topic selection and error handling
40 |
41 | 
42 |
43 | ## 🚀 Quick Start
44 |
45 | Try it out on [Hugging Face](https://huggingface.co/spaces/linhkid91/ArxivDigest-extra) using your own API keys.
46 |
47 | ## 🔍 What This Repo Does
48 |
49 | Staying up to date on [arXiv](https://arxiv.org) papers is time-consuming, with hundreds of new papers published daily. Even with the [official daily digest service](https://info.arxiv.org/help/subscribe.html), categories like [cs.AI](https://arxiv.org/list/cs.AI/recent) still contain 50-100 papers per day.
50 |
51 | This repository creates a personalized daily digest by:
52 |
53 | 1. **Crawling arXiv** for recent papers in your areas of interest
54 | 2. **Analyzing papers** in-depth using AI models (OpenAI, Gemini, or Claude)
55 | 3. **Two-stage processing** for efficiency (sketched below):
56 | - Stage 1: Quick relevancy filtering using only title and abstract
57 | - Stage 2: Detailed analysis of papers that meet the relevancy threshold
58 | 4. **Scoring relevance** on a scale of 1-10 based on your research interests
59 | 5. **Providing detailed analysis** of each paper, including:
60 | - Key innovations
61 | - Critical analysis
62 | - Implementation details
63 | - Practical applications
64 | - Related work
65 | 6. **Generating reports** in HTML format with clean organization
66 |
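A minimal sketch of the two-stage flow from steps 3-5, with placeholder scoring standing in for the actual LLM calls (illustrative only; the real logic lives in the `src/` modules):

```python
from typing import Dict, List

def quick_relevancy(title: str, abstract: str, interests: str) -> int:
    """Stage 1 placeholder: a real run asks an LLM for a 1-10 score from title + abstract."""
    text = (title + " " + abstract).lower()
    return 10 if any(word.lower() in text for word in interests.split()) else 1

def detailed_analysis(paper: Dict[str, str], interests: str) -> Dict[str, str]:
    """Stage 2 placeholder: a real run requests the full JSON analysis for each shortlisted paper."""
    return {"title": paper["title"], "summary": f"Analyzed against interests: {interests}"}

papers: List[Dict[str, str]] = [
    {"title": "Sparse autoencoders for interpretability", "abstract": "We study circuits..."},
    {"title": "A new sorting network", "abstract": "We prove tighter bounds..."},
]
interests = "interpretability alignment"
threshold = 5

# Stage 1: cheap filter on title + abstract; Stage 2: detailed analysis of survivors
shortlist = [p for p in papers
             if quick_relevancy(p["title"], p["abstract"], interests) >= threshold]
reports = [detailed_analysis(p, interests) for p in shortlist]
print(f"{len(shortlist)} of {len(papers)} papers passed the quick filter")
```
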
67 | ## 🤖 Model Integrations
68 |
69 | The system supports three major AI providers:
70 |
71 | - **OpenAI GPT** (gpt-3.5-turbo-16k, gpt-4, gpt-4-turbo, gpt-4o, gpt-4o-mini)
72 | - **Google Gemini** (gemini-1.5-flash, gemini-1.5-pro, gemini-2.0-flash)
73 | - **Anthropic Claude** (claude-3-haiku, claude-3-sonnet, claude-3-opus, claude-3.5-sonnet)
74 |
75 | You can use any combination of these models, allowing you to compare results or choose based on your needs.
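
Whichever combination you choose, supply the matching API key(s) as environment variables (the same names used for the GitHub Action secrets below), or place them in a `.env` file based on `.env.template`; for example:

```bash
# Set only the keys for the providers you plan to use
export OPENAI_API_KEY="sk-..."
export GEMINI_API_KEY="..."
export ANTHROPIC_API_KEY="..."
```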
76 |
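## 🎨 Design Paper Discovery

Alongside the general digest, the tools under `src/design/` scan recent arXiv listings for papers on design automation (layout generation, UI/UX design, graphic design, creative AI tools, and related topics), categorize them by sub-area and technique, and can optionally run a detailed analysis:

```bash
# Find design-related papers from the last 7 days and analyze them
./src/design/find_design_papers.sh --days 7 --analyze
```

Results are saved as JSON in the `/data` directory, and a summary of the top matches is printed to the console.
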
77 | ## 📊 Output Formats
78 |
79 | Reports are generated in multiple formats:
80 |
81 | - **HTML Reports**: Clean, organized reports saved to the `/digest` directory with date-based filenames
82 | - **Console Output**: Summary information displayed in the terminal
83 | - **JSON Data**: Raw paper data saved to the `/data` directory
84 |
85 | Every HTML report includes:
86 | - Paper title, authors, and link to arXiv
87 | - Relevancy score with explanation
88 | - Abstract and key innovations
89 | - Critical analysis and implementation details
90 | - Experiments, results, and discussion points
91 | - Related work and practical applications
92 |
93 | Example HTML report:
94 |
95 | 
96 | ## 💡 Setting Up and Usage
97 |
98 | ### Configuration
99 |
100 | Modify `config.yaml` with your preferences:
101 |
102 | ```yaml
103 | # Main research area
104 | topic: "Computer Science"
105 |
106 | # Specific categories to monitor
107 | categories: ["Artificial Intelligence", "Computation and Language", "Machine Learning", "Information Retrieval"]
108 |
109 | # Minimum relevance score (1-10)
110 | threshold: 2
111 |
112 | # Your research interests in natural language
113 | interest: |
114 | 1. AI alignment and AI safety
115 | 2. Mechanistic interpretability and explainable AI
116 | 3. Large language model optimization
117 | 4. RAGs, Information retrieval
118 | 5. AI Red teaming, deception and misalignment
119 | ```
120 |
121 | ### Running the Web Interface
122 |
123 | To run locally with the simplified UI (the steps are collected as shell commands below):
124 |
125 | 1. Install requirements: `pip install -r requirements.txt`
126 | 2. Run the app: `python src/app_new.py`
127 | 3. Open the URL displayed in your terminal
128 | 4. Enter your API key(s) and configure your preferences
129 | 5. Use the relevancy threshold slider to adjust paper filtering (default is 2)
130 |
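The same steps as shell commands, run from the repository root:

```bash
pip install -r requirements.txt
python src/app_new.py
# then open the printed local URL in your browser and enter your API key(s)
```
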
131 | ### Running via GitHub Action
132 |
133 | To set up automated daily digests:
134 |
135 | 1. Fork this repository
136 | 2. Update `config.yaml` with your preferences
137 | 3. Set the following secrets in your repository settings:
138 | - `OPENAI_API_KEY` (and/or `GEMINI_API_KEY` or `ANTHROPIC_API_KEY`)
139 | 4. The GitHub Action will run on schedule or can be triggered manually (for example via the GitHub CLI, shown below)
140 |
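If you use the GitHub CLI, a manual run of the workflow in `.github/workflows/daily_pipeline.yaml` looks roughly like this (assuming the workflow accepts manual dispatch, which the manual-trigger option implies):

```bash
# From your fork, kick off the daily pipeline manually
gh workflow run daily_pipeline.yaml
```
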
141 | ### Running from Command Line
142 |
143 | For advanced users:
144 |
145 | ```bash
146 | # Regular paper digests with simplified UI
147 | python src/app_new.py
148 |
149 | # Design paper finder
150 | ./src/design/find_design_papers.sh --days 7 --analyze
151 | ```
152 |
153 | ## ⚠️ API Usage Notes
154 |
155 | This tool respects arXiv's robots.txt and implements proper rate limiting. If you encounter 403 Forbidden errors:
156 |
157 | 1. Wait a few hours before trying again
158 | 2. Consider reducing the number of categories you're fetching
159 | 3. Increase the delay between requests in the code (a minimal sketch follows)
160 |
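For example, a conservative fetch loop with a fixed pause between requests (an illustrative sketch, not the actual crawler in `src/download_new_papers.py`):

```python
import time
import urllib.request

# Hypothetical list of arXiv listing URLs, fetched one at a time
urls = [
    "https://arxiv.org/list/cs.AI/recent",
    "https://arxiv.org/list/cs.CL/recent",
]

pages = []
for url in urls:
    with urllib.request.urlopen(url) as response:  # a single request at a time
        pages.append(response.read())
    time.sleep(5)  # increase this pause if you keep seeing 403 responses
```
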
161 | ## 📁 Directory Structure
162 |
163 | The repository is organized as follows:
164 |
165 | - `/src` - All Python source code
166 | - `app_new.py` - Simplified interface with improved threshold handling and UI
167 | - `download_new_papers.py` - arXiv crawler
168 | - `relevancy.py` - Paper scoring and analysis with robust JSON parsing
169 | - `model_manager.py` - Multi-model integration
170 | - `gemini_utils.py` - Gemini API integration
171 | - `anthropic_utils.py` - Claude API integration
172 | - `design/` - Design automation tools
173 | - `paths.py` - Standardized path handling
174 | - `/data` - JSON data files (auto-created)
175 | - `/digest` - HTML report files (auto-created)
176 |
177 | ## ✅ Roadmap
178 |
179 | - [x] Support multiple AI models (OpenAI, Gemini, Claude)
180 | - [x] Generate comprehensive HTML reports with date-based filenames
181 | - [x] Specialized analysis for design automation papers
182 | - [x] Topic clustering via Gemini
183 | - [x] Standardized directory structure
184 | - [x] Enhanced HTML reports with detailed analysis sections
185 | - [x] Pre-filtering of arXiv categories for efficiency
186 | - [x] Adjustable relevancy threshold with UI slider
187 | - [x] Robust JSON parsing for reliable LLM response handling
188 | - [x] Simplified UI focused on core functionality
189 | - [x] Dynamic topic selection UI with improved error handling
190 | - [x] Support for newer models (GPT-4o, GPT-4o mini, Claude 3.5)
191 | - [x] Two-stage paper processing for efficiency (quick filtering followed by detailed analysis)
192 | - [x] Removed email functionality in favor of local HTML reports
193 | - [ ] Full PDF content analysis
194 | - [ ] Author-based ranking and filtering
195 | - [ ] Fine-tuned open-source model support: Ollama, LocalAI...
196 |
197 | ## 💁 Contributing
198 |
199 | You're encouraged to modify this code for your personal needs. If your modifications would be useful to others, please submit a pull request.
200 |
201 | Valuable contributions include:
202 | - Additional AI model integrations
203 | - New analysis capabilities
204 | - UI improvements
205 | - Prompt engineering enhancements
206 |
--------------------------------------------------------------------------------
/src/interpretability_analysis.py:
--------------------------------------------------------------------------------
1 | """
2 | Specialized module for mechanistic interpretability and technical AI safety analysis.
3 | """
4 | import json
5 | import logging
6 | from typing import Dict, Any, List, Optional
7 |
8 | # Configure logging
9 | logging.basicConfig(level=logging.INFO)
10 | logger = logging.getLogger(__name__)
11 |
12 | # Prompts for specialized analysis
13 | MECHANISTIC_INTERPRETABILITY_PROMPT = """
14 | You are a research assistant specializing in mechanistic interpretability of AI systems.
15 |
16 | Analyze this paper from the perspective of mechanistic interpretability:
17 |
18 | Title: {title}
19 | Authors: {authors}
20 | Abstract: {abstract}
21 | Content: {content}
22 |
23 | Please provide a detailed analysis covering:
24 |
25 | 1. Relevance to mechanistic interpretability: How does this paper contribute to understanding the internal workings of models?
26 | 2. Interpretability techniques: What specific methods or approaches does the paper use to explain model behavior?
27 | 3. Circuit analysis: Does the paper identify specific circuits or computational components within models?
28 | 4. Attribution methods: What techniques are used to attribute model outputs to internal components?
29 | 5. Novel insights: What new understanding does this paper bring to model internals?
30 | 6. Limitations: What are the limitations of the approach from an interpretability perspective?
31 | 7. Future directions: What follow-up work would be valuable?
32 | 8. Connections to other interpretability research: How does this relate to other work in the field?
33 |
34 | Format your response as JSON with these fields.
35 | """
36 |
37 | TECHNICAL_AI_SAFETY_PROMPT = """
38 | You are a research assistant specializing in technical AI safety.
39 |
40 | Analyze this paper from the perspective of technical AI safety:
41 |
42 | Title: {title}
43 | Authors: {authors}
44 | Abstract: {abstract}
45 | Content: {content}
46 |
47 | Please provide a detailed analysis covering:
48 |
49 | 1. Relevance to AI safety: How does this paper contribute to building safer AI systems?
50 | 2. Safety approaches: What specific methods or approaches does the paper use to improve AI safety?
51 | 3. Robustness: How does the paper address model robustness to distribution shifts or adversarial attacks?
52 | 4. Alignment: Does the paper discuss techniques for aligning AI systems with human values?
53 | 5. Risk assessment: What potential risks or failure modes does the paper address?
54 | 6. Monitoring and oversight: What methods are proposed for monitoring or controlling AI systems?
55 | 7. Limitations: What are the limitations of the approach from a safety perspective?
56 | 8. Future directions: What follow-up work would be valuable for improving safety?
57 |
58 | Format your response as JSON with these fields.
59 | """
60 |
61 | PROMPT_TEMPLATES = {
62 | "mechanistic_interpretability": MECHANISTIC_INTERPRETABILITY_PROMPT,
63 | "technical_ai_safety": TECHNICAL_AI_SAFETY_PROMPT
64 | }
65 |
66 | def extract_json_from_text(text: str) -> Dict[str, Any]:
67 | """
68 | Attempt to extract JSON from text, handling various formats.
69 |
70 | Args:
71 | text: String potentially containing JSON
72 |
73 | Returns:
74 | Extracted JSON as a dictionary, or error dictionary
75 | """
76 | try:
77 | # Look for JSON-like structures
78 | start_idx = text.find('{')
79 | end_idx = text.rfind('}') + 1
80 |
81 | if start_idx >= 0 and end_idx > start_idx:
82 | json_str = text[start_idx:end_idx]
83 | return json.loads(json_str)
84 | else:
85 | return {"error": "Could not find JSON in text", "raw_text": text}
86 | except json.JSONDecodeError:
87 | return {"error": "Failed to parse as JSON", "raw_text": text}
88 |
89 | def create_analysis_prompt(paper: Dict[str, Any], analysis_type: str) -> str:
90 | """
91 | Create a prompt for specialized analysis.
92 |
93 | Args:
94 | paper: Dictionary with paper details
95 | analysis_type: Type of analysis to perform
96 |
97 | Returns:
98 | Formatted prompt string
99 | """
100 | if analysis_type not in PROMPT_TEMPLATES:
101 | raise ValueError(f"Unknown analysis type: {analysis_type}")
102 |
103 | prompt_template = PROMPT_TEMPLATES[analysis_type]
104 |
105 | return prompt_template.format(
106 | title=paper.get("title", ""),
107 | authors=paper.get("authors", ""),
108 | abstract=paper.get("abstract", ""),
109 | content=paper.get("content", "")[:10000] # Limit content length
110 | )
111 |
112 | def analyze_interpretability_circuits(paper: Dict[str, Any], response: Dict[str, Any]) -> Dict[str, Any]:
113 | """
114 | Perform additional circuit analysis based on paper content and initial response.
115 |
116 | Args:
117 | paper: Dictionary with paper details
118 | response: Initial analysis response
119 |
120 | Returns:
121 | Enhanced analysis with circuit information
122 | """
123 | # This is a placeholder for more sophisticated circuit analysis
124 | # In a real implementation, this would use specialized tools to analyze
125 | # neural network circuits mentioned in the paper
126 |
127 | # Extract potential circuit descriptions from paper content
128 | circuit_mentions = []
129 |
130 | content = paper.get("content", "").lower()
131 | circuit_keywords = ["circuit", "attention head", "neuron", "mlp", "weight", "activation"]
132 |
133 | for keyword in circuit_keywords:
134 | if keyword in content:
135 | # Very simple extraction - in reality would use more sophisticated NLP
136 | start_idx = content.find(keyword)
137 | if start_idx >= 0:
138 | excerpt = content[max(0, start_idx-50):min(len(content), start_idx+100)]
139 | circuit_mentions.append(excerpt)
140 |
141 | # Add circuit information to response
142 | enhanced_response = response.copy()
143 | enhanced_response["circuit_mentions"] = circuit_mentions[:5] # Limit to 5 mentions
144 | enhanced_response["circuit_analysis_performed"] = len(circuit_mentions) > 0
145 |
146 | return enhanced_response
147 |
148 | def get_paper_relation_to_ai_safety(paper: Dict[str, Any]) -> str:
149 | """
150 | Determine how a paper relates to AI safety research.
151 |
152 | Args:
153 | paper: Dictionary with paper details
154 |
155 | Returns:
156 | Description of relation to AI safety
157 | """
158 | # Simple keyword-based approach
159 | safety_keywords = {
160 | "alignment": "AI alignment",
161 | "safety": "AI safety",
162 | "robustness": "Model robustness",
163 | "adversarial": "Adversarial robustness",
164 | "bias": "Bias mitigation",
165 | "fairness": "Fairness",
166 | "transparency": "Transparency",
167 | "interpretability": "Interpretability",
168 | "explainability": "Explainability",
169 | "oversight": "AI oversight",
170 | "control": "AI control",
171 | "verification": "Formal verification",
172 | "monitoring": "AI monitoring"
173 | }
174 |
175 | relation = []
176 | content = (paper.get("abstract", "") + " " + paper.get("title", "")).lower()
177 |
178 | for keyword, category in safety_keywords.items():
179 | if keyword in content:
180 | relation.append(category)
181 |
182 | if relation:
183 | return ", ".join(set(relation))
184 | else:
185 | return "No direct relation to AI safety identified"
186 |
187 | def analyze_multi_agent_safety(paper: Dict[str, Any]) -> Dict[str, Any]:
188 | """
189 | Analyze multi-agent safety aspects of a paper.
190 |
191 | Args:
192 | paper: Dictionary with paper details
193 |
194 | Returns:
195 | Multi-agent safety analysis
196 | """
197 | # Check if paper mentions multi-agent systems
198 | content = (paper.get("abstract", "") + " " + paper.get("title", "")).lower()
199 |
200 | multi_agent_keywords = [
201 | "multi-agent", "multiagent", "agent cooperation", "agent competition",
202 | "game theory", "nash equilibrium", "cooperative ai", "collaborative ai"
203 | ]
204 |
205 | is_multi_agent = any(keyword in content for keyword in multi_agent_keywords)
206 |
207 | if not is_multi_agent:
208 | return {"is_multi_agent_focused": False}
209 |
210 | # Simple analysis of multi-agent safety aspects
211 | safety_aspects = []
212 |
213 | if "cooperation" in content or "collaborative" in content or "coordination" in content:
214 | safety_aspects.append("Agent cooperation")
215 |
216 | if "competition" in content or "adversarial" in content:
217 | safety_aspects.append("Agent competition")
218 |
219 | if "equilibrium" in content or "game theory" in content:
220 | safety_aspects.append("Game theoretic analysis")
221 |
222 | if "incentive" in content or "reward" in content:
223 | safety_aspects.append("Incentive design")
224 |
225 | if "communication" in content:
226 | safety_aspects.append("Agent communication")
227 |
228 | return {
229 | "is_multi_agent_focused": True,
230 | "multi_agent_safety_aspects": safety_aspects,
231 | "summary": f"This paper focuses on multi-agent systems, specifically addressing: {', '.join(safety_aspects)}" if safety_aspects else "This paper discusses multi-agent systems but doesn't specifically address safety aspects."
232 | }
--------------------------------------------------------------------------------
/src/gemini_utils.py:
--------------------------------------------------------------------------------
1 | """
2 | Gemini API integration for ArxivDigest.
3 | This module provides functions to work with Google's Gemini API for paper analysis.
4 | """
5 | import os
6 | import json
7 | import logging
8 | import time
9 | from typing import List, Dict, Any, Optional
10 |
11 | try:
12 | import google.generativeai as genai
13 | from google.api_core.exceptions import GoogleAPIError
14 | GEMINI_AVAILABLE = True
15 | except ImportError:
16 | GEMINI_AVAILABLE = False
17 |
18 | # Configure logging
19 | logging.basicConfig(level=logging.INFO)
20 | logger = logging.getLogger(__name__)
21 |
22 | class GeminiConfig:
23 | """Configuration for Gemini API calls."""
24 | def __init__(
25 | self,
26 | temperature: float = 0.4,
27 | max_output_tokens: int = 2048,
28 | top_p: float = 0.95,
29 | top_k: int = 40
30 | ):
31 | self.temperature = temperature
32 | self.max_output_tokens = max_output_tokens
33 | self.top_p = top_p
34 | self.top_k = top_k
35 |
36 | def setup_gemini_api(api_key: str) -> bool:
37 | """
38 | Setup the Gemini API with the provided API key.
39 |
40 | Args:
41 | api_key: Gemini API key
42 |
43 | Returns:
44 | bool: True if setup was successful, False otherwise
45 | """
46 | if not GEMINI_AVAILABLE:
47 | logger.error("Gemini package not installed. Run 'pip install google-generativeai'")
48 | return False
49 |
50 | if not api_key:
51 | logger.error("No Gemini API key provided")
52 | return False
53 |
54 | try:
55 | genai.configure(api_key=api_key)
56 | # Test API connection
57 | models = genai.list_models()
58 | logger.info(f"Successfully connected to Gemini API. Available models: {[m.name for m in models if 'generateContent' in m.supported_generation_methods]}")
59 | return True
60 | except Exception as e:
61 | logger.error(f"Failed to setup Gemini API: {e}")
62 | return False
63 |
64 | def get_gemini_model(model_name: str = "gemini-1.5-flash"):
65 | """
66 | Get a Gemini model by name.
67 |
68 | Args:
69 | model_name: Name of the Gemini model
70 |
71 | Returns:
72 | Model object or None if not available
73 | """
74 | if not GEMINI_AVAILABLE:
75 | return None
76 |
77 | try:
78 | model = genai.GenerativeModel(model_name)
79 | return model
80 | except Exception as e:
81 | logger.error(f"Failed to get Gemini model: {e}")
82 | return None
83 |
84 | def analyze_papers_with_gemini(
85 | papers: List[Dict[str, Any]],
86 | query: Dict[str, str],
87 | config: Optional[GeminiConfig] = None,
88 | model_name: str = "gemini-1.5-flash"
89 | ) -> List[Dict[str, Any]]:
90 | """
91 | Analyze papers using the Gemini model.
92 |
93 | Args:
94 | papers: List of paper dictionaries
95 | query: Dictionary with 'interest' key describing research interests
96 | config: GeminiConfig object
97 | model_name: Name of the Gemini model to use
98 |
99 | Returns:
100 | List of papers with added analysis
101 | """
102 | if not GEMINI_AVAILABLE:
103 | logger.error("Gemini package not installed. Cannot analyze papers.")
104 | return papers
105 |
106 | if not config:
107 | config = GeminiConfig()
108 |
109 | model = get_gemini_model(model_name)
110 | if not model:
111 | return papers
112 |
113 | analyzed_papers = []
114 |
115 | for paper in papers:
116 | try:
117 | # Prepare prompt
118 | prompt = f"""
119 | You are a research assistant analyzing academic papers in AI and ML.
120 |
121 | Analyze this paper and provide insights based on the user's research interests.
122 |
123 | Research interests: {query['interest']}
124 |
125 | Paper details:
126 | Title: {paper['title']}
127 | Authors: {paper['authors']}
128 | Abstract: {paper['abstract']}
129 | Content: {paper['content'][:5000]}
130 |
131 | Please provide your response as a single JSON object with the following structure:
132 | {{
133 | "Relevancy score": 1-10 (higher = more relevant),
134 | "Reasons for match": "Detailed explanation of why this paper matches the interests",
135 | "Key innovations": "List the main contributions of the paper",
136 | "Critical analysis": "Evaluate strengths and weaknesses",
137 | "Goal": "What problem does the paper address?",
138 | "Data": "Description of datasets used",
139 | "Methodology": "Technical approach and methods",
140 | "Implementation details": "Model architecture, hyperparameters, etc.",
141 | "Experiments & Results": "Key findings and comparisons",
142 | "Discussion & Next steps": "Limitations and future work",
143 | "Related work": "Connection to similar research",
144 | "Practical applications": "Real-world uses of this research",
145 | "Key takeaways": ["Point 1", "Point 2", "Point 3"]
146 | }}
147 |
148 | Format your response as a valid JSON object and nothing else.
149 | """
150 |
151 | # Just log that we're sending a prompt to Gemini
152 | print(f"Sending prompt to Gemini for paper: {paper['title'][:50]}...")
153 |
154 | generation_config = {
155 | "temperature": config.temperature,
156 | "top_p": config.top_p,
157 | "top_k": config.top_k,
158 | "max_output_tokens": config.max_output_tokens,
159 | }
160 |
161 | response = model.generate_content(
162 | prompt,
163 | generation_config=generation_config
164 | )
165 |
166 | # Extract and parse the response
167 | response_text = response.text
168 |
169 | # Try to extract JSON
170 | try:
171 | start_idx = response_text.find('{')
172 | end_idx = response_text.rfind('}') + 1
173 | if start_idx >= 0 and end_idx > start_idx:
174 | json_str = response_text[start_idx:end_idx]
175 | gemini_analysis = json.loads(json_str)
176 |
177 | # Add Gemini analysis to paper
178 | paper['gemini_analysis'] = gemini_analysis
179 |
180 | # Directly copy fields to paper
181 | for key, value in gemini_analysis.items():
182 | paper[key] = value
183 | else:
184 | logger.warning(f"Could not extract JSON from Gemini response for paper {paper['title']}")
185 | paper['gemini_analysis'] = {"error": "Failed to parse response"}
186 | except json.JSONDecodeError:
187 | logger.warning(f"Failed to parse Gemini response as JSON for paper {paper['title']}")
188 | paper['gemini_analysis'] = {"error": "Failed to parse response"}
189 |
190 | analyzed_papers.append(paper)
191 |
192 | # Avoid rate limiting
193 | time.sleep(1)
194 |
195 | except GoogleAPIError as e:
196 | logger.error(f"Gemini API error: {e}")
197 | paper['gemini_analysis'] = {"error": f"Gemini API error: {str(e)}"}
198 | analyzed_papers.append(paper)
199 |
200 | except Exception as e:
201 | logger.error(f"Error analyzing paper with Gemini: {e}")
202 | paper['gemini_analysis'] = {"error": f"Error: {str(e)}"}
203 | analyzed_papers.append(paper)
204 |
205 | return analyzed_papers
206 |
207 | def get_topic_clustering(papers: List[Dict[str, Any]], model_name: str = "gemini-1.5-flash"):
208 | """
209 | Cluster papers by topic using Gemini.
210 |
211 | Args:
212 | papers: List of paper dictionaries
213 | model_name: Name of the Gemini model to use
214 |
215 | Returns:
216 | Dictionary with topic clusters
217 | """
218 | if not GEMINI_AVAILABLE:
219 | logger.error("Gemini package not installed. Cannot cluster papers.")
220 | return {}
221 |
222 | model = get_gemini_model(model_name)
223 | if not model:
224 | return {}
225 |
226 | # Create a condensed representation of the papers
227 | paper_summaries = []
228 | for i, paper in enumerate(papers):
229 | paper_summaries.append(f"{i+1}. Title: {paper['title']}\nAbstract: {paper['abstract'][:300]}...")
230 |
231 | paper_text = "\n\n".join(paper_summaries)
232 |
233 | prompt = f"""
234 | You are a research librarian organizing academic papers into topic clusters.
235 |
236 | Analyze these papers and group them into 3-7 thematic clusters:
237 |
238 | {paper_text}
239 |
240 | For each cluster:
241 | 1. Provide a descriptive name for the cluster
242 | 2. List the paper numbers that belong to this cluster
243 | 3. Explain why these papers belong together
244 |
245 | Format your response as JSON with these fields: "clusters" (an array of objects with "name", "papers", and "description" fields).
246 | """
247 |
248 | try:
249 | response = model.generate_content(prompt)
250 | response_text = response.text
251 |
252 | # Try to extract JSON
253 | try:
254 | start_idx = response_text.find('{')
255 | end_idx = response_text.rfind('}') + 1
256 | if start_idx >= 0 and end_idx > start_idx:
257 | json_str = response_text[start_idx:end_idx]
258 | cluster_data = json.loads(json_str)
259 | return cluster_data
260 | else:
261 | logger.warning("Could not extract JSON from Gemini clustering response")
262 | return {"error": "Failed to parse clustering response"}
263 | except json.JSONDecodeError:
264 | logger.warning("Failed to parse Gemini clustering response as JSON")
265 | return {"error": "Failed to parse clustering response"}
266 |
267 | except Exception as e:
268 | logger.error(f"Error clustering papers with Gemini: {e}")
269 | return {"error": f"Clustering error: {str(e)}"}
--------------------------------------------------------------------------------
/src/utils.py:
--------------------------------------------------------------------------------
1 | import dataclasses
2 | import logging
3 | import math
4 | import os
5 | import io
6 | import sys
7 | import time
8 | import json
9 | from typing import Optional, Sequence, Union, Dict, Any
10 |
11 | import openai
12 | import tqdm
13 | import copy
14 |
15 | # Handle both old and new OpenAI SDK versions
16 | try:
17 | from openai import openai_object
18 | StrOrOpenAIObject = Union[str, openai_object.OpenAIObject]
19 | OPENAI_OLD_API = True
20 | except ImportError:
21 | StrOrOpenAIObject = Union[str, Dict[str, Any]]
22 | OPENAI_OLD_API = False
23 |
24 |
25 | openai_org = os.getenv("OPENAI_ORG")
26 | if openai_org is not None:
27 | openai.organization = openai_org
28 | logging.warning(f"Switching to organization: {openai_org} for OAI API key.")
29 |
30 |
31 | @dataclasses.dataclass
32 | class OpenAIDecodingArguments(object):
33 | #max_tokens: int = 1800
34 | max_tokens: int = 5400
35 | temperature: float = 0.2
36 | top_p: float = 1.0
37 | n: int = 1
38 | stream: bool = False
39 | stop: Optional[Sequence[str]] = None
40 | presence_penalty: float = 0.0
41 | frequency_penalty: float = 0.0
42 | # logprobs: Optional[int] = None
43 |
44 |
45 | def openai_completion(
46 | prompts, #: Union[str, Sequence[str], Sequence[dict[str, str]], dict[str, str]],
47 | decoding_args: OpenAIDecodingArguments,
48 | model_name="text-davinci-003",
49 | sleep_time=15,
50 | batch_size=1,
51 | max_instances=sys.maxsize,
52 | max_batches=sys.maxsize,
53 | return_text=False,
54 | **decoding_kwargs,
55 | ) -> Union[Union[StrOrOpenAIObject], Sequence[StrOrOpenAIObject], Sequence[Sequence[StrOrOpenAIObject]],]:
56 | """Decode with OpenAI API.
57 |
58 | Args:
59 | prompts: A string or a list of strings to complete. If it is a chat model the strings should be formatted
60 | as explained here: https://github.com/openai/openai-python/blob/main/chatml.md. If it is a chat model
61 | it can also be a dictionary (or list thereof) as explained here:
62 | https://github.com/openai/openai-cookbook/blob/main/examples/How_to_format_inputs_to_ChatGPT_models.ipynb
63 | decoding_args: Decoding arguments.
64 | model_name: Model name. Can be either in the format of "org/model" or just "model".
65 | sleep_time: Time to sleep once the rate-limit is hit.
66 | batch_size: Number of prompts to send in a single request. Only for non chat model.
67 | max_instances: Maximum number of prompts to decode.
68 | max_batches: Maximum number of batches to decode. This argument will be deprecated in the future.
69 | return_text: If True, return text instead of full completion object (which contains things like logprob).
70 | decoding_kwargs: Additional decoding arguments. Pass in `best_of` and `logit_bias` if you need them.
71 |
72 | Returns:
73 | A completion or a list of completions.
74 | Depending on return_text, return_openai_object, and decoding_args.n, the completion type can be one of
75 | - a string (if return_text is True)
76 | - an openai_object.OpenAIObject object (if return_text is False)
77 | - a list of objects of the above types (if decoding_args.n > 1)
78 | """
79 | is_chat_model = "gpt-3.5" in model_name or "gpt-4" in model_name
80 | is_single_prompt = isinstance(prompts, (str, dict))
81 | if is_single_prompt:
82 | prompts = [prompts]
83 |
84 | if max_batches < sys.maxsize:
85 | logging.warning(
86 | "`max_batches` will be deprecated in the future, please use `max_instances` instead."
87 | "Setting `max_instances` to `max_batches * batch_size` for now."
88 | )
89 | max_instances = max_batches * batch_size
90 |
91 | prompts = prompts[:max_instances]
92 | num_prompts = len(prompts)
93 | prompt_batches = [
94 | prompts[batch_id * batch_size : (batch_id + 1) * batch_size]
95 | for batch_id in range(int(math.ceil(num_prompts / batch_size)))
96 | ]
97 |
98 | completions = []
99 | for batch_id, prompt_batch in tqdm.tqdm(
100 | enumerate(prompt_batches),
101 | desc="prompt_batches",
102 | total=len(prompt_batches),
103 | ):
104 | batch_decoding_args = copy.deepcopy(decoding_args) # cloning the decoding_args
105 |
106 | backoff = 5
107 |
108 | while True:
109 | try:
110 | time.sleep(3)
111 | shared_kwargs = dict(
112 | model=model_name,
113 | **batch_decoding_args.__dict__,
114 | **decoding_kwargs,
115 | )
116 |
117 | if OPENAI_OLD_API:
118 | # Use old API format
119 | if is_chat_model:
120 | completion_batch = openai.ChatCompletion.create(
121 | messages=[
122 | {"role": "system", "content": "You are a helpful assistant."},
123 | {"role": "user", "content": prompt_batch[0]}
124 | ],
125 | **shared_kwargs
126 | )
127 | else:
128 | completion_batch = openai.Completion.create(prompt=prompt_batch, **shared_kwargs)
129 |
130 | choices = completion_batch.choices
131 |
132 | for choice in choices:
133 | choice["total_tokens"] = completion_batch.usage.total_tokens
134 | else:
135 | # Use new API format
136 | client = openai.OpenAI()
137 |
138 | if is_chat_model:
139 | completion_batch = client.chat.completions.create(
140 | model=model_name,
141 | messages=[
142 | {"role": "system", "content": "You are a helpful assistant."},
143 | {"role": "user", "content": prompt_batch[0]}
144 | ],
145 | temperature=batch_decoding_args.temperature,
146 | max_tokens=batch_decoding_args.max_tokens,
147 | top_p=batch_decoding_args.top_p,
148 | n=batch_decoding_args.n,
149 | stream=batch_decoding_args.stream,
150 | presence_penalty=batch_decoding_args.presence_penalty,
151 | frequency_penalty=batch_decoding_args.frequency_penalty,
152 | **decoding_kwargs
153 | )
154 |
155 | # Convert completion to dictionary format for consistency
156 | choices = []
157 | for choice in completion_batch.choices:
158 | choice_dict = {
159 | "message": {
160 | "content": choice.message.content,
161 | "role": choice.message.role
162 | },
163 | "index": choice.index,
164 | "finish_reason": choice.finish_reason,
165 | "total_tokens": completion_batch.usage.total_tokens
166 | }
167 | choices.append(choice_dict)
168 | else:
169 | completion_batch = client.completions.create(
170 | model=model_name,
171 | prompt=prompt_batch,
172 | temperature=batch_decoding_args.temperature,
173 | max_tokens=batch_decoding_args.max_tokens,
174 | top_p=batch_decoding_args.top_p,
175 | n=batch_decoding_args.n,
176 | stream=batch_decoding_args.stream,
177 | presence_penalty=batch_decoding_args.presence_penalty,
178 | frequency_penalty=batch_decoding_args.frequency_penalty,
179 | **decoding_kwargs
180 | )
181 |
182 | # Convert completion to dictionary format for consistency
183 | choices = []
184 | for choice in completion_batch.choices:
185 | choice_dict = {
186 | "text": choice.text,
187 | "index": choice.index,
188 | "finish_reason": choice.finish_reason,
189 | "total_tokens": completion_batch.usage.total_tokens
190 | }
191 | choices.append(choice_dict)
192 |
193 | completions.extend(choices)
194 | break
195 | except Exception as e:
196 | logging.warning(f"OpenAI API Error: {e}.")
197 | if "Please reduce your prompt" in str(e):
198 | batch_decoding_args.max_tokens = int(batch_decoding_args.max_tokens * 0.8)
199 | logging.warning(f"Reducing target length to {batch_decoding_args.max_tokens}, Retrying...")
200 | elif not backoff:
201 | logging.error("Hit too many failures, exiting")
202 | raise e
203 | else:
204 | backoff -= 1
205 | logging.warning("Hit request rate limit; retrying...")
206 | time.sleep(sleep_time) # Annoying rate limit on requests.
207 | continue
208 |
209 | if return_text:
210 | if is_chat_model:
211 | completions = [completion.get("message", {}).get("content", "") for completion in completions]
212 | else:
213 | completions = [completion.get("text", "") for completion in completions]
214 |
215 | if decoding_args.n > 1:
216 | # make completions a nested list, where each entry is a consecutive decoding_args.n of original entries.
217 | completions = [completions[i : i + decoding_args.n] for i in range(0, len(completions), decoding_args.n)]
218 | if is_single_prompt:
219 | # Return non-tuple if only 1 input and 1 generation.
220 | (completions,) = completions
221 | return completions
222 |
223 |
224 | def write_ans_to_file(ans_data, file_prefix, output_dir="./output"):
225 | if not os.path.exists(output_dir):
226 | os.makedirs(output_dir)
227 | filename = os.path.join(output_dir, file_prefix + ".txt")
228 | with open(filename, "w") as f:
229 | for ans in ans_data:
230 | f.write(ans + "\n")
231 |
--------------------------------------------------------------------------------
/src/design_automation.py:
--------------------------------------------------------------------------------
1 | """
2 | Module for analyzing papers related to AI/ML for graphic design automation.
3 | This module helps identify and analyze papers on automated design, layout generation,
4 | creative AI tools, and related topics.
5 | """
6 | import logging
7 | import json
8 | from typing import Dict, Any, List, Optional
9 |
10 | # Configure logging
11 | logging.basicConfig(level=logging.INFO)
12 | logger = logging.getLogger(__name__)
13 |
14 | # Design automation keywords for paper filtering
15 | DESIGN_AUTOMATION_KEYWORDS = [
16 | "design automation", "layout generation", "visual design", "graphic design",
17 | "creative AI", "generative design", "UI generation", "UX automation",
18 | "design system", "composition", "creative workflow", "automated design",
19 | "design tool", "design assistant", "design optimization", "content-aware",
20 | "user interface generation", "visual layout", "image composition"
21 | ]
22 |
23 | DESIGN_AUTOMATION_PROMPT = """
24 | You are a specialized research assistant focused on AI/ML for graphic design automation.
25 |
26 | Analyze this paper from the perspective of AI for graphic design and creative automation:
27 |
28 | Title: {title}
29 | Authors: {authors}
30 | Abstract: {abstract}
31 | Content: {content}
32 |
33 | Please provide a detailed analysis covering:
34 |
35 | 1. Design automation focus: What aspect of design does this paper attempt to automate or enhance?
36 | 2. Technical approach: What AI/ML techniques are used in the paper for design automation?
37 | 3. Visual outputs: What kind of visual artifacts does the system generate?
38 | 4. Designer interaction: How does the system interact with human designers?
39 | 5. Data requirements: What data does the system use for training or operation?
40 | 6. Evaluation metrics: How is the system's design quality evaluated?
41 | 7. Real-world applicability: How practical is this approach for professional design workflows?
42 | 8. Novelty: What makes this approach unique compared to other design automation systems?
43 | 9. Limitations: What are the current limitations of this approach?
44 | 10. Future directions: What improvements or extensions are suggested?
45 |
46 | Format your response as JSON with these fields.
47 | """
48 |
49 | def is_design_automation_paper(paper: Dict[str, Any]) -> bool:
50 | """
51 | Check if a paper is related to design automation based on keywords.
52 |
53 | Args:
54 | paper: Dictionary with paper details
55 |
56 | Returns:
57 | Boolean indicating if paper is related to design automation
58 | """
59 | text = (
60 | (paper.get("title", "") + " " +
61 | paper.get("abstract", "") + " " +
62 | paper.get("subjects", "")).lower()
63 | )
64 |
65 | return any(keyword.lower() in text for keyword in DESIGN_AUTOMATION_KEYWORDS)
66 |
67 | def categorize_design_paper(paper: Dict[str, Any]) -> str:
68 | """
69 | Categorize design automation paper into subcategories.
70 |
71 | Args:
72 | paper: Dictionary with paper details
73 |
74 | Returns:
75 | Category name string
76 | """
77 | text = (paper.get("title", "") + " " + paper.get("abstract", "")).lower()
78 |
79 | categories = {
80 | "Layout Generation": ["layout", "composition", "arrange", "grid"],
81 | "UI/UX Design": ["user interface", "ui", "ux", "interface design", "website"],
82 | "Graphic Design": ["graphic design", "poster", "visual design", "typography"],
83 | "Image Manipulation": ["image editing", "photo", "manipulation", "style transfer"],
84 | "Design Tools": ["tool", "assistant", "workflow", "productivity"],
85 | "3D Design": ["3d", "modeling", "cad", "product design"],
86 | "Multimodal Design": ["multimodal", "text-to-image", "image-to-code"]
87 | }
88 |
89 | matches = []
90 | for category, keywords in categories.items():
91 | if any(keyword.lower() in text for keyword in keywords):
92 | matches.append(category)
93 |
94 | if matches:
95 | return ", ".join(matches)
96 | return "General Design Automation"
97 |
98 | def analyze_design_techniques(paper: Dict[str, Any]) -> List[str]:
99 | """
100 | Extract AI/ML techniques used for design automation in the paper.
101 |
102 | Args:
103 | paper: Dictionary with paper details
104 |
105 | Returns:
106 | List of techniques
107 | """
108 | text = (paper.get("title", "") + " " + paper.get("abstract", "")).lower()
109 |
110 | techniques = []
111 | technique_keywords = {
112 | "Generative Adversarial Networks": ["gan", "generative adversarial"],
113 | "Diffusion Models": ["diffusion", "ddpm", "stable diffusion"],
114 | "Transformers": ["transformer", "attention mechanism"],
115 | "Reinforcement Learning": ["reinforcement learning", "rl"],
116 | "Computer Vision": ["computer vision", "vision", "cnn"],
117 | "Graph Neural Networks": ["graph neural", "gnn"],
118 | "Large Language Models": ["llm", "large language model", "gpt"],
119 | "Neural Style Transfer": ["style transfer", "neural style"],
120 | "Evolutionary Algorithms": ["genetic algorithm", "evolutionary"]
121 | }
122 |
123 | for technique, keywords in technique_keywords.items():
124 | if any(keyword in text for keyword in keywords):
125 | techniques.append(technique)
126 |
127 | return techniques
128 |
129 | def extract_design_metrics(paper: Dict[str, Any]) -> List[str]:
130 | """
131 | Extract evaluation metrics used for design quality assessment.
132 |
133 | Args:
134 | paper: Dictionary with paper details
135 |
136 | Returns:
137 | List of metrics
138 | """
139 | text = (paper.get("title", "") + " " + paper.get("abstract", "")).lower()
140 |
141 | metrics = []
142 | metric_keywords = {
143 | "User Studies": ["user study", "user evaluation", "human evaluation"],
144 | "Aesthetic Measures": ["aesthetic", "beauty", "visual quality"],
145 | "Design Principles": ["design principle", "balance", "harmony", "contrast"],
146 | "Technical Metrics": ["fid", "inception score", "clip score", "psnr"],
147 | "Efficiency Metrics": ["time", "speed", "efficiency"],
148 | "Usability": ["usability", "user experience", "ux", "ease of use"]
149 | }
150 |
151 | for metric, keywords in metric_keywords.items():
152 | if any(keyword in text for keyword in keywords):
153 | metrics.append(metric)
154 |
155 | return metrics
156 |
157 | def get_related_design_papers(paper_id: str, papers: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
158 | """
159 | Find papers related to a specific design automation paper.
160 |
161 | Args:
162 | paper_id: ID of the target paper
163 | papers: List of paper dictionaries
164 |
165 | Returns:
166 | List of related papers
167 | """
168 | target_paper = next((p for p in papers if p.get("main_page", "").endswith(paper_id)), None)
169 | if not target_paper:
170 | return []
171 |
172 | # Get techniques used in target paper
173 | target_techniques = analyze_design_techniques(target_paper)
174 | target_category = categorize_design_paper(target_paper)
175 |
176 | related_papers = []
177 | for paper in papers:
178 | if paper.get("main_page", "") == target_paper.get("main_page", ""):
179 | continue
180 |
181 | # Check if paper is on design automation
182 | if not is_design_automation_paper(paper):
183 | continue
184 |
185 | # Check if techniques or categories overlap
186 | paper_techniques = analyze_design_techniques(paper)
187 | paper_category = categorize_design_paper(paper)
188 |
189 | technique_overlap = len(set(target_techniques) & set(paper_techniques))
190 | category_match = paper_category == target_category
191 |
192 | if technique_overlap > 0 or category_match:
193 | paper["relevance_reason"] = []
194 |
195 | if technique_overlap > 0:
196 | paper["relevance_reason"].append(f"Uses similar techniques: {', '.join(set(target_techniques) & set(paper_techniques))}")
197 |
198 | if category_match:
199 | paper["relevance_reason"].append(f"Same design category: {paper_category}")
200 |
201 | paper["relevance_score"] = (technique_overlap * 2) + (2 if category_match else 0)
202 | related_papers.append(paper)
203 |
204 | # Sort by relevance score
205 | related_papers.sort(key=lambda x: x.get("relevance_score", 0), reverse=True)
206 | return related_papers[:5] # Return top 5 related papers
207 |
208 | def create_design_analysis_prompt(paper: Dict[str, Any]) -> str:
209 | """
210 | Create a prompt for analyzing a design automation paper.
211 |
212 | Args:
213 | paper: Dictionary with paper details
214 |
215 | Returns:
216 | Formatted prompt string
217 | """
218 | return DESIGN_AUTOMATION_PROMPT.format(
219 | title=paper.get("title", ""),
220 | authors=paper.get("authors", ""),
221 | abstract=paper.get("abstract", ""),
222 | content=paper.get("content", "")[:10000] # Limit content length
223 | )
224 |
225 | def extract_design_capabilities(analysis: Dict[str, Any]) -> Dict[str, Any]:
226 | """
227 | Extract specific design capabilities from an analysis.
228 |
229 | Args:
230 | analysis: Dictionary with design paper analysis
231 |
232 | Returns:
233 | Dictionary of design capabilities
234 | """
235 | capabilities = {}
236 |
237 | # Extract design areas
238 | if "Design automation focus" in analysis:
239 | capabilities["design_areas"] = analysis["Design automation focus"]
240 |
241 | # Extract tools that could be replaced
242 | tools = []
243 | tools_keywords = {
244 | "Adobe Photoshop": ["photoshop", "photo editing", "image manipulation"],
245 | "Adobe Illustrator": ["illustrator", "vector", "illustration"],
246 | "Figma": ["figma", "ui design", "interface design"],
247 | "Sketch": ["sketch", "ui design", "interface design"],
248 | "InDesign": ["indesign", "layout", "publishing"],
249 | "Canva": ["canva", "simple design", "templates"]
250 | }
251 |
252 | for text_field in ["Technical approach", "Design automation focus", "Real-world applicability"]:
253 | if text_field in analysis:
254 | text = analysis[text_field].lower()
255 | for tool, keywords in tools_keywords.items():
256 | if any(keyword in text for keyword in keywords):
257 | tools.append(tool)
258 |
259 | capabilities["replaceable_tools"] = list(set(tools))
260 |
261 | # Extract human-in-the-loop vs fully automated
262 | if "Designer interaction" in analysis:
263 | text = analysis["Designer interaction"].lower()
264 | if "fully automated" in text or "automatic" in text or "without human" in text:
265 | capabilities["automation_level"] = "Fully automated"
266 | elif "human-in-the-loop" in text or "collaboration" in text or "assists" in text:
267 | capabilities["automation_level"] = "Human-in-the-loop"
268 | else:
269 | capabilities["automation_level"] = "Hybrid"
270 |
271 | # Extract if it's ready for production
272 | if "Real-world applicability" in analysis:
273 | text = analysis["Real-world applicability"].lower()
274 | if "production ready" in text or "commercially viable" in text or "can be used in real" in text:
275 | capabilities["production_ready"] = True
276 | elif "prototype" in text or "proof of concept" in text or "research" in text or "limitations" in text:
277 | capabilities["production_ready"] = False
278 | else:
279 | capabilities["production_ready"] = "Unclear"
280 |
281 | return capabilities
--------------------------------------------------------------------------------
/src/action.py:
--------------------------------------------------------------------------------
1 | from sendgrid import SendGridAPIClient
2 | from sendgrid.helpers.mail import Mail, Email, To, Content
3 |
4 | import argparse
5 | import yaml
6 | import os
7 | from dotenv import load_dotenv
8 | import openai
9 | from relevancy import generate_relevance_score, process_subject_fields
10 | from download_new_papers import get_papers
11 | from datetime import date
12 |
13 | import ssl
14 |
15 | ssl._create_default_https_context = ssl._create_stdlib_context
16 |
17 | # Hackathon quality code. Don't judge too harshly.
18 | # Feel free to submit pull requests to improve the code.
19 |
20 | topics = {
21 | "Physics": "",
22 | "Mathematics": "math",
23 | "Computer Science": "cs",
24 | "Quantitative Biology": "q-bio",
25 | "Quantitative Finance": "q-fin",
26 | "Statistics": "stat",
27 | "Electrical Engineering and Systems Science": "eess",
28 | "Economics": "econ",
29 | }
30 |
31 | physics_topics = {
32 | "Astrophysics": "astro-ph",
33 | "Condensed Matter": "cond-mat",
34 | "General Relativity and Quantum Cosmology": "gr-qc",
35 | "High Energy Physics - Experiment": "hep-ex",
36 | "High Energy Physics - Lattice": "hep-lat",
37 | "High Energy Physics - Phenomenology": "hep-ph",
38 | "High Energy Physics - Theory": "hep-th",
39 | "Mathematical Physics": "math-ph",
40 | "Nonlinear Sciences": "nlin",
41 | "Nuclear Experiment": "nucl-ex",
42 | "Nuclear Theory": "nucl-th",
43 | "Physics": "physics",
44 | "Quantum Physics": "quant-ph",
45 | }
46 |
47 |
48 | # TODO: surely theres a better way
49 | category_map = {
50 | "Astrophysics": [
51 | "Astrophysics of Galaxies",
52 | "Cosmology and Nongalactic Astrophysics",
53 | "Earth and Planetary Astrophysics",
54 | "High Energy Astrophysical Phenomena",
55 | "Instrumentation and Methods for Astrophysics",
56 | "Solar and Stellar Astrophysics",
57 | ],
58 | "Condensed Matter": [
59 | "Disordered Systems and Neural Networks",
60 | "Materials Science",
61 | "Mesoscale and Nanoscale Physics",
62 | "Other Condensed Matter",
63 | "Quantum Gases",
64 | "Soft Condensed Matter",
65 | "Statistical Mechanics",
66 | "Strongly Correlated Electrons",
67 | "Superconductivity",
68 | ],
69 | "General Relativity and Quantum Cosmology": ["None"],
70 | "High Energy Physics - Experiment": ["None"],
71 | "High Energy Physics - Lattice": ["None"],
72 | "High Energy Physics - Phenomenology": ["None"],
73 | "High Energy Physics - Theory": ["None"],
74 | "Mathematical Physics": ["None"],
75 | "Nonlinear Sciences": [
76 | "Adaptation and Self-Organizing Systems",
77 | "Cellular Automata and Lattice Gases",
78 | "Chaotic Dynamics",
79 | "Exactly Solvable and Integrable Systems",
80 | "Pattern Formation and Solitons",
81 | ],
82 | "Nuclear Experiment": ["None"],
83 | "Nuclear Theory": ["None"],
84 | "Physics": [
85 | "Accelerator Physics",
86 | "Applied Physics",
87 | "Atmospheric and Oceanic Physics",
88 | "Atomic and Molecular Clusters",
89 | "Atomic Physics",
90 | "Biological Physics",
91 | "Chemical Physics",
92 | "Classical Physics",
93 | "Computational Physics",
94 | "Data Analysis, Statistics and Probability",
95 | "Fluid Dynamics",
96 | "General Physics",
97 | "Geophysics",
98 | "History and Philosophy of Physics",
99 | "Instrumentation and Detectors",
100 | "Medical Physics",
101 | "Optics",
102 | "Physics and Society",
103 | "Physics Education",
104 | "Plasma Physics",
105 | "Popular Physics",
106 | "Space Physics",
107 | ],
108 | "Quantum Physics": ["None"],
109 | "Mathematics": [
110 | "Algebraic Geometry",
111 | "Algebraic Topology",
112 | "Analysis of PDEs",
113 | "Category Theory",
114 | "Classical Analysis and ODEs",
115 | "Combinatorics",
116 | "Commutative Algebra",
117 | "Complex Variables",
118 | "Differential Geometry",
119 | "Dynamical Systems",
120 | "Functional Analysis",
121 | "General Mathematics",
122 | "General Topology",
123 | "Geometric Topology",
124 | "Group Theory",
125 | "History and Overview",
126 | "Information Theory",
127 | "K-Theory and Homology",
128 | "Logic",
129 | "Mathematical Physics",
130 | "Metric Geometry",
131 | "Number Theory",
132 | "Numerical Analysis",
133 | "Operator Algebras",
134 | "Optimization and Control",
135 | "Probability",
136 | "Quantum Algebra",
137 | "Representation Theory",
138 | "Rings and Algebras",
139 | "Spectral Theory",
140 | "Statistics Theory",
141 | "Symplectic Geometry",
142 | ],
143 | "Computer Science": [
144 | "Artificial Intelligence",
145 | "Computation and Language",
146 | "Computational Complexity",
147 | "Computational Engineering, Finance, and Science",
148 | "Computational Geometry",
149 | "Computer Science and Game Theory",
150 | "Computer Vision and Pattern Recognition",
151 | "Computers and Society",
152 | "Cryptography and Security",
153 | "Data Structures and Algorithms",
154 | "Databases",
155 | "Digital Libraries",
156 | "Discrete Mathematics",
157 | "Distributed, Parallel, and Cluster Computing",
158 | "Emerging Technologies",
159 | "Formal Languages and Automata Theory",
160 | "General Literature",
161 | "Graphics",
162 | "Hardware Architecture",
163 | "Human-Computer Interaction",
164 | "Information Retrieval",
165 | "Information Theory",
166 | "Logic in Computer Science",
167 | "Machine Learning",
168 | "Mathematical Software",
169 | "Multiagent Systems",
170 | "Multimedia",
171 | "Networking and Internet Architecture",
172 | "Neural and Evolutionary Computing",
173 | "Numerical Analysis",
174 | "Operating Systems",
175 | "Other Computer Science",
176 | "Performance",
177 | "Programming Languages",
178 | "Robotics",
179 | "Social and Information Networks",
180 | "Software Engineering",
181 | "Sound",
182 | "Symbolic Computation",
183 | "Systems and Control",
184 | ],
185 | "Quantitative Biology": [
186 | "Biomolecules",
187 | "Cell Behavior",
188 | "Genomics",
189 | "Molecular Networks",
190 | "Neurons and Cognition",
191 | "Other Quantitative Biology",
192 | "Populations and Evolution",
193 | "Quantitative Methods",
194 | "Subcellular Processes",
195 | "Tissues and Organs",
196 | ],
197 | "Quantitative Finance": [
198 | "Computational Finance",
199 | "Economics",
200 | "General Finance",
201 | "Mathematical Finance",
202 | "Portfolio Management",
203 | "Pricing of Securities",
204 | "Risk Management",
205 | "Statistical Finance",
206 | "Trading and Market Microstructure",
207 | ],
208 | "Statistics": [
209 | "Applications",
210 | "Computation",
211 | "Machine Learning",
212 | "Methodology",
213 | "Other Statistics",
214 | "Statistics Theory",
215 | ],
216 | "Electrical Engineering and Systems Science": [
217 | "Audio and Speech Processing",
218 | "Image and Video Processing",
219 | "Signal Processing",
220 | "Systems and Control",
221 | ],
222 | "Economics": ["Econometrics", "General Economics", "Theoretical Economics"],
223 | }
224 |
225 |
226 | def generate_body(topic, categories, interest, threshold):
227 | f_papers = []
228 | if topic == "Physics":
229 | raise RuntimeError("You must choose a physics subtopic.")
230 | elif topic in physics_topics:
231 | abbr = physics_topics[topic]
232 | elif topic in topics:
233 | abbr = topics[topic]
234 | else:
235 | raise RuntimeError(f"Invalid topic {topic}")
236 | if categories:
237 | for category in categories:
238 | if category not in category_map[topic]:
239 | raise RuntimeError(f"{category} is not a category of {topic}")
240 | papers = get_papers(abbr)
241 |
242 |         papers = [
243 |             t
244 |             for t in papers
245 |             if bool(set(process_subject_fields(t["subjects"])) & set(categories))
246 |         ]
247 |
248 | else:
249 | papers = get_papers(abbr)
250 | if interest:
251 | relevancy, hallucination = generate_relevance_score(
252 | papers,
253 | query={"interest": interest},
254 | threshold_score=threshold,
255 | num_paper_in_prompt=2,
256 | )
257 |
258 |         body = "\n".join(
259 |             [
260 |                 f'Subject: {paper["subjects"]}\nTitle: {paper["title"]}\nAuthors: {paper["authors"]}\n'
261 |                 f'Score: {paper["Relevancy score"]}\nReason: {paper["Reasons for match"]}\n'
262 |                 f'Goal: {paper["Goal"]}\nData: {paper["Data"]}\nMethodology: {paper["Methodology"]}\n'
263 |                 f'Experiments & Results: {paper["Experiments & Results"]}\nGit: {paper["Git"]}\n'
264 |                 f'Discussion & Next steps: {paper["Discussion & Next steps"]}'
265 |                 for paper in relevancy
266 |             ]
267 |         )
268 |         if hallucination:
269 |             body = (
270 |                 "Warning: the model hallucinated some papers. We have tried to remove them, but the scores may not be accurate.\n"
271 |                 + body
272 |             )
273 |     else:
274 |         body = "\n".join(
275 |             [
276 |                 f'Title: {paper["title"]}\nAuthors: {paper["authors"]}'
277 |                 for paper in papers
278 |             ]
279 |         )
280 | return body
281 |
282 | def get_date():
283 | today = date.today()
284 | formatted_date = today.strftime("%d%m%Y")
285 | return formatted_date
286 |
287 | if __name__ == "__main__":
288 | # Load the .env file.
289 | load_dotenv()
290 | parser = argparse.ArgumentParser()
291 | parser.add_argument(
292 | "--config", help="yaml config file to use", default="config.yaml"
293 | )
294 | args = parser.parse_args()
295 | with open(args.config, "r") as f:
296 | config = yaml.safe_load(f)
297 |
298 | if "OPENAI_API_KEY" not in os.environ:
299 | raise RuntimeError("No openai api key found")
300 | openai.api_key = os.environ.get("OPENAI_API_KEY")
301 |
302 | topic = config["topic"]
303 | categories = config["categories"]
304 | from_email = os.environ.get("FROM_EMAIL")
305 | to_email = os.environ.get("TO_EMAIL")
306 | threshold = config["threshold"]
307 | interest = config["interest"]
308 | body = generate_body(topic, categories, interest, threshold)
309 | today_date = get_date()
310 | with open(f"digest_{today_date}.html", "w") as f:
311 | f.write(body)
312 | if os.environ.get("SENDGRID_API_KEY", None):
313 | sg = SendGridAPIClient(api_key=os.environ.get("SENDGRID_API_KEY"))
314 | from_email = Email(from_email) # Change to your verified sender
315 | to_email = To(to_email)
316 | subject = date.today().strftime("Personalized arXiv Digest, %d %b %Y")
317 | content = Content("text/html", body)
318 | mail = Mail(from_email, to_email, subject, content)
319 | mail_json = mail.get()
320 |
321 | # Send an HTTP POST request to /mail/send
322 | response = sg.client.mail.send.post(request_body=mail_json)
323 |         if 200 <= response.status_code < 300:
324 |             print("Send test email: Success!")
325 |         else:
326 |             print(f"Send test email: Failure ({response.status_code}, {response.text})")
327 | else:
328 | print("No sendgrid api key found. Skipping email")
329 |
--------------------------------------------------------------------------------
/src/anthropic_utils.py:
--------------------------------------------------------------------------------
1 | """
2 | Anthropic/Claude API integration for ArxivDigest.
3 | This module provides functions to work with Anthropic's Claude API for paper analysis.
4 | """
5 | import os
6 | import json
7 | import logging
8 | import time
9 | from typing import List, Dict, Any, Optional
10 |
11 | try:
12 | import anthropic
13 | from anthropic.types import MessageParam
14 | ANTHROPIC_AVAILABLE = True
15 | except ImportError:
16 | ANTHROPIC_AVAILABLE = False
17 |
18 | # Configure logging
19 | logging.basicConfig(level=logging.INFO)
20 | logger = logging.getLogger(__name__)
21 |
22 | class ClaudeConfig:
23 | """Configuration for Claude API calls."""
24 | def __init__(
25 | self,
26 | temperature: float = 0.5,
27 | max_tokens: int = 4000,
28 | top_p: float = 0.95,
29 | top_k: int = 40
30 | ):
31 | self.temperature = temperature
32 | self.max_tokens = max_tokens
33 | self.top_p = top_p
34 | self.top_k = top_k
35 |
36 | def setup_anthropic_api(api_key: str) -> bool:
37 | """
38 | Setup the Anthropic API with the provided API key.
39 |
40 | Args:
41 | api_key: Anthropic API key
42 |
43 | Returns:
44 | bool: True if setup was successful, False otherwise
45 | """
46 | if not ANTHROPIC_AVAILABLE:
47 | logger.error("Anthropic package not installed. Run 'pip install anthropic'")
48 | return False
49 |
50 | if not api_key:
51 | logger.error("No Anthropic API key provided")
52 | return False
53 |
54 | try:
55 | # Initialize client to test connection
56 | client = anthropic.Anthropic(api_key=api_key)
57 | # Test API connection by listing models
58 | models = client.models.list()
59 | available_models = [model.id for model in models.data]
60 | logger.info(f"Successfully connected to Anthropic API. Available models: {available_models}")
61 | return True
62 | except Exception as e:
63 | logger.error(f"Failed to setup Anthropic API: {e}")
64 | return False
65 |
66 | def get_claude_client(api_key: str) -> Optional[anthropic.Anthropic]:
67 | """
68 | Get an Anthropic client with the given API key.
69 |
70 | Args:
71 | api_key: Anthropic API key
72 |
73 | Returns:
74 | Anthropic client or None if not available
75 | """
76 | if not ANTHROPIC_AVAILABLE:
77 | return None
78 |
79 | try:
80 | client = anthropic.Anthropic(api_key=api_key)
81 | return client
82 | except Exception as e:
83 | logger.error(f"Failed to get Anthropic client: {e}")
84 | return None
85 |
86 | def analyze_papers_with_claude(
87 | papers: List[Dict[str, Any]],
88 | query: Dict[str, str],
89 | config: Optional[ClaudeConfig] = None,
90 | model_name: str = "claude-3-5-sonnet-20240620",
91 | api_key: str = None
92 | ) -> List[Dict[str, Any]]:
93 | """
94 | Analyze papers using Claude.
95 |
96 | Args:
97 | papers: List of paper dictionaries
98 | query: Dictionary with 'interest' key describing research interests
99 | config: ClaudeConfig object
100 | model_name: Name of the Claude model to use
101 | api_key: Anthropic API key (optional if already configured elsewhere)
102 |
103 | Returns:
104 | List of papers with added analysis
105 | """
106 | if not ANTHROPIC_AVAILABLE:
107 | logger.error("Anthropic package not installed. Cannot analyze papers.")
108 | return papers
109 |
110 | if not config:
111 | config = ClaudeConfig()
112 |
113 | # Get client
114 | if api_key:
115 | client = get_claude_client(api_key)
116 | else:
117 | # Try to get from environment
118 | api_key = os.environ.get("ANTHROPIC_API_KEY", "")
119 | if not api_key:
120 | logger.error("No Anthropic API key provided")
121 | return papers
122 | client = get_claude_client(api_key)
123 |
124 | if not client:
125 | return papers
126 |
127 | analyzed_papers = []
128 |
129 | for paper in papers:
130 | try:
131 | # Prepare system prompt
132 | system_prompt = f"""
133 | You are a research assistant analyzing academic papers in AI and ML.
134 | You provide comprehensive, accurate and unbiased analysis based on the user's research interests.
135 | Your responses should be well-structured and factual, focusing on the paper's strengths, weaknesses, and relevance.
136 | """
137 |
138 | # Prepare user prompt
139 | user_prompt = f"""
140 | Analyze this paper and provide insights based on the following research interests:
141 |
142 | Research interests: {query['interest']}
143 |
144 | Paper details:
145 | Title: {paper['title']}
146 | Authors: {paper['authors']}
147 | Abstract: {paper['abstract']}
148 | Content: {paper['content'][:5000] if 'content' in paper else 'Not available'}
149 |
150 | Please provide your response as a single JSON object with the following structure:
151 | {{
152 | "Relevancy score": 1-10 (higher = more relevant),
153 | "Reasons for match": "Detailed explanation of why this paper matches the interests",
154 | "Key innovations": "List the main contributions of the paper",
155 | "Critical analysis": "Evaluate strengths and weaknesses",
156 | "Goal": "What problem does the paper address?",
157 | "Data": "Description of datasets used",
158 | "Methodology": "Technical approach and methods",
159 | "Implementation details": "Model architecture, hyperparameters, etc.",
160 | "Experiments & Results": "Key findings and comparisons",
161 | "Discussion & Next steps": "Limitations and future work",
162 | "Related work": "Connection to similar research",
163 | "Practical applications": "Real-world uses of this research",
164 | "Key takeaways": ["Point 1", "Point 2", "Point 3"]
165 | }}
166 |
167 | Format your response as a valid JSON object and nothing else.
168 | """
169 |
170 | # Just log that we're sending a prompt to Claude
171 | print(f"Sending prompt to Claude for paper: {paper['title'][:50]}...")
172 |
173 | # Create message
174 | messages: List[MessageParam] = [
175 | {
176 | "role": "user",
177 | "content": user_prompt
178 | }
179 | ]
180 |
181 | # Call the API
182 | response = client.messages.create(
183 | model=model_name,
184 | max_tokens=config.max_tokens,
185 | temperature=config.temperature,
186 | system=system_prompt,
187 | messages=messages
188 | )
189 |
190 | # Extract and parse the response
191 | response_text = response.content[0].text if response.content else ""
192 |
193 | # Try to extract JSON
194 | try:
195 | start_idx = response_text.find('{')
196 | end_idx = response_text.rfind('}') + 1
197 | if start_idx >= 0 and end_idx > start_idx:
198 | json_str = response_text[start_idx:end_idx]
199 | claude_analysis = json.loads(json_str)
200 |
201 | # Add Claude analysis to paper
202 | paper['claude_analysis'] = claude_analysis
203 |
204 | # Directly copy fields to paper
205 | for key, value in claude_analysis.items():
206 | paper[key] = value
207 | else:
208 | logger.warning(f"Could not extract JSON from Claude response for paper {paper['title']}")
209 | paper['claude_analysis'] = {"error": "Failed to parse response"}
210 | except json.JSONDecodeError:
211 | logger.warning(f"Failed to parse Claude response as JSON for paper {paper['title']}")
212 | paper['claude_analysis'] = {"error": "Failed to parse response"}
213 |
214 | analyzed_papers.append(paper)
215 |
216 | # Avoid rate limiting
217 | time.sleep(1)
218 |
219 | except Exception as e:
220 | logger.error(f"Claude API error: {e}")
221 | paper['claude_analysis'] = {"error": f"Claude API error: {str(e)}"}
222 | analyzed_papers.append(paper)
223 |
224 | return analyzed_papers
225 |
226 | def get_claude_interpretability_analysis(paper: Dict[str, Any], model_name: str = "claude-3-5-sonnet-20240620", api_key: str = None) -> Dict[str, Any]:
227 | """
228 | Get specialized mechanistic interpretability analysis for a paper using Claude.
229 |
230 | Args:
231 | paper: Paper dictionary
232 | model_name: Claude model to use
233 | api_key: Anthropic API key (optional if already configured elsewhere)
234 |
235 | Returns:
236 | Dictionary with interpretability analysis
237 | """
238 | if not ANTHROPIC_AVAILABLE:
239 | return {"error": "Anthropic package not installed"}
240 |
241 | # Get client
242 | if api_key:
243 | client = get_claude_client(api_key)
244 | else:
245 | # Try to get from environment
246 | api_key = os.environ.get("ANTHROPIC_API_KEY", "")
247 | if not api_key:
248 | return {"error": "No Anthropic API key provided"}
249 | client = get_claude_client(api_key)
250 |
251 | if not client:
252 | return {"error": "Failed to initialize Anthropic client"}
253 |
254 | try:
255 | # Prepare system prompt
256 | system_prompt = """
257 | You are a specialist in mechanistic interpretability and AI alignment.
258 | Provide a thorough analysis of research papers with focus on interpretability methods,
259 | circuit analysis, and how the work relates to understanding AI systems.
260 | """
261 |
262 | # Prepare the prompt
263 | user_prompt = f"""
264 | Analyze this paper from a mechanistic interpretability perspective:
265 |
266 | Title: {paper['title']}
267 | Authors: {paper['authors']}
268 | Abstract: {paper['abstract']}
269 | Content: {paper['content'][:7000] if 'content' in paper else paper['abstract']}
270 |
271 | Please return your analysis as a JSON object with the following fields:
272 |
273 | {{
274 | "interpretability_score": 1-10 (how relevant is this to mechanistic interpretability),
275 | "key_methods": "Main interpretability techniques used or proposed",
276 | "circuit_analysis": "Any findings about neural circuits or components",
277 | "relevance_to_alignment": "How this work contributes to AI alignment",
278 | "novel_insights": "New perspectives on model internals",
279 | "limitations": "Limitations of the interpretability methods",
280 | "potential_extensions": "How this work could be extended",
281 | "connection_to_other_work": "Relationship to other interpretability papers"
282 | }}
283 |
284 | Respond with only the JSON.
285 | """
286 |
287 | # Create message
288 | messages: List[MessageParam] = [
289 | {
290 | "role": "user",
291 | "content": user_prompt
292 | }
293 | ]
294 |
295 | # Call the API
296 | response = client.messages.create(
297 | model=model_name,
298 | max_tokens=4000,
299 | temperature=0.3,
300 | system=system_prompt,
301 | messages=messages
302 | )
303 |
304 | # Extract and parse the response
305 | response_text = response.content[0].text if response.content else ""
306 |
307 | # Try to extract JSON
308 | try:
309 | # Find the JSON part in the response
310 | start_idx = response_text.find('{')
311 | end_idx = response_text.rfind('}') + 1
312 | if start_idx >= 0 and end_idx > start_idx:
313 | json_str = response_text[start_idx:end_idx]
314 | analysis = json.loads(json_str)
315 | return analysis
316 | else:
317 | return {"error": "Could not extract JSON from response"}
318 | except json.JSONDecodeError:
319 | return {"error": "Failed to parse response as JSON"}
320 |
321 | except Exception as e:
322 | return {"error": f"Claude API error: {str(e)}"}
--------------------------------------------------------------------------------
/src/design_finder/main.py:
--------------------------------------------------------------------------------
1 | """
2 | Main module for design_finder.
3 | Run with: python -m src.design_finder
4 | """
5 | import os
6 | import sys
7 | import json
8 | import argparse
9 | import datetime
10 | import logging
11 | from typing import List, Dict, Any
12 | import time
13 | # Add parent directory to path to import from sibling modules
14 | current_dir = os.path.dirname(os.path.abspath(__file__))
15 | parent_dir = os.path.dirname(os.path.dirname(current_dir))
16 | if parent_dir not in sys.path:
17 | sys.path.append(parent_dir)
18 |
19 | from src.download_new_papers import get_papers, _download_new_papers
20 | from src.design_automation import (
21 | is_design_automation_paper,
22 | categorize_design_paper,
23 | analyze_design_techniques,
24 | extract_design_metrics
25 | )
26 | from src.paths import DATA_DIR, DIGEST_DIR
27 |
28 | # Configure logging
29 | logging.basicConfig(
30 | level=logging.INFO,
31 | format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
32 | )
33 | logger = logging.getLogger(__name__)
34 |
35 | # Default arXiv categories to search
36 | DEFAULT_CATEGORIES = [
37 | "cs.CV", # Computer Vision
38 | "cs.GR", # Graphics
39 | "cs.HC", # Human-Computer Interaction
40 | "cs.AI", # Artificial Intelligence
41 | "cs.LG", # Machine Learning
42 | "cs.CL", # Computation and Language (NLP)
43 | "cs.MM", # Multimedia
44 | "cs.SD", # Sound
45 | "cs.RO", # Robotics (for interactive design)
46 | "cs.CY" # Computers and Society
47 | ]
48 |
49 | def get_date_range(days_back: int = 7) -> List[str]:
50 | """
51 | Get a list of dates for the past N days in arXiv format.
52 |
53 | Args:
54 | days_back: Number of days to look back
55 |
56 | Returns:
57 | List of date strings in arXiv format
58 | """
59 | today = datetime.datetime.now()
60 | dates = []
61 |
62 | for i in range(days_back):
63 | date = today - datetime.timedelta(days=i)
64 | date_str = date.strftime("%a, %d %b %y")
65 | dates.append(date_str)
66 |
67 | return dates
68 |
69 | def ensure_data_files(categories: List[str], days_back: int = 7) -> None:
70 | """
71 | Make sure data files exist for the specified categories and date range.
72 |
73 | Args:
74 | categories: List of arXiv category codes
75 | days_back: Number of days to look back
76 | """
77 | dates = get_date_range(days_back)
78 |
79 | for category in categories:
80 | for date_str in dates:
81 | # Add a delay between requests to avoid being blocked
82 | time.sleep(1)
83 | file_path = os.path.join(DATA_DIR, f"{category}_{date_str}.jsonl")
84 |
85 | if not os.path.exists(file_path):
86 | logger.info(f"Downloading papers for {category} on {date_str}")
87 | try:
88 | _download_new_papers(category)
89 | except Exception as e:
90 | logger.error(f"Error downloading {category} papers for {date_str}: {e}")
91 |
92 | def get_design_papers(categories: List[str], days_back: int = 7) -> List[Dict[str, Any]]:
93 | """
94 | Get design automation papers from specified categories over a date range.
95 |
96 | Args:
97 | categories: List of arXiv category codes
98 | days_back: Number of days to look back
99 |
100 | Returns:
101 | List of design automation papers
102 | """
103 | # Ensure data files exist
104 | ensure_data_files(categories, days_back)
105 |
106 | # Collect papers
107 | all_papers = []
108 | dates = get_date_range(days_back)
109 |
110 | for category in categories:
111 | for date_str in dates:
112 | try:
113 | papers = get_papers(category)
114 | all_papers.extend(papers)
115 | except Exception as e:
116 | logger.warning(f"Could not get papers for {category} on {date_str}: {e}")
117 |
118 | # Remove duplicates (papers can appear in multiple categories)
119 | unique_papers = {}
120 | for paper in all_papers:
121 | paper_id = paper.get("main_page", "").split("/")[-1]
122 | if paper_id and paper_id not in unique_papers:
123 | unique_papers[paper_id] = paper
124 |
125 | # Filter design automation papers
126 | design_papers = []
127 | for paper_id, paper in unique_papers.items():
128 | if is_design_automation_paper(paper):
129 | paper["paper_id"] = paper_id
130 | paper["design_category"] = categorize_design_paper(paper)
131 | paper["design_techniques"] = analyze_design_techniques(paper)
132 | paper["design_metrics"] = extract_design_metrics(paper)
133 | design_papers.append(paper)
134 |
135 | # Sort by date (newest first)
136 | design_papers.sort(key=lambda p: p.get("main_page", ""), reverse=True)
137 |
138 | return design_papers
139 |
140 | def print_paper_summary(paper: Dict[str, Any]) -> None:
141 | """
142 | Print a nice summary of a paper to the console.
143 |
144 | Args:
145 | paper: Paper dictionary
146 | """
147 | print(f"\n{'=' * 80}")
148 | print(f"TITLE: {paper.get('title', 'No title')}")
149 | print(f"AUTHORS: {paper.get('authors', 'No authors')}")
150 | print(f"URL: {paper.get('main_page', 'No URL')}")
151 | print(f"DESIGN CATEGORY: {paper.get('design_category', 'Unknown')}")
152 | print(f"TECHNIQUES: {', '.join(paper.get('design_techniques', []))}")
153 | print(f"METRICS: {', '.join(paper.get('design_metrics', []))}")
154 | print(f"\nABSTRACT: {paper.get('abstract', 'No abstract')[:500]}...")
155 | print(f"{'=' * 80}\n")
156 |
157 | def generate_html_report(papers: List[Dict[str, Any]], output_file: str) -> None:
158 | """
159 | Generate an HTML report from papers.
160 |
161 | Args:
162 | papers: List of paper dictionaries
163 | output_file: Path to output HTML file
164 | """
165 | html = f"""
166 |
167 |
168 |
169 |
170 | Design Automation Papers
171 |
186 |
187 |
188 | Design Automation Papers
189 |
190 | Found {len(papers)} papers related to graphic design automation with AI/ML
191 | Generated on {datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}
192 |
193 | """
194 |
195 | # Count categories and techniques
196 | categories = {}
197 | techniques = {}
198 |
199 | for paper in papers:
200 | category = paper.get("design_category", "Uncategorized")
201 | if category in categories:
202 | categories[category] += 1
203 | else:
204 | categories[category] = 1
205 |
206 | for technique in paper.get("design_techniques", []):
207 | if technique in techniques:
208 | techniques[technique] += 1
209 | else:
210 | techniques[technique] = 1
211 |
212 | # Add summary statistics
213 | html += "Summary Statistics
"
214 |
215 | html += "
Categories:
"
216 | for category, count in sorted(categories.items(), key=lambda x: x[1], reverse=True):
217 | html += f"- {category}: {count} papers
"
218 | html += "
"
219 |
220 | html += "
Techniques:
"
221 | for technique, count in sorted(techniques.items(), key=lambda x: x[1], reverse=True):
222 | html += f"- {technique}: {count} papers
"
223 | html += "
"
224 |
225 | # Add papers
226 | for paper in papers:
227 | publish_date = paper.get("main_page", "").split("/")[-1][:4] # Extract YYMM from id
228 |
229 | html += f"""
230 |
231 |
232 | {paper.get("authors", "Unknown authors")}
233 | arXiv ID: {paper.get("paper_id", "Unknown")}
234 | Category: {paper.get("design_category", "General")} | Subject: {paper.get("subjects", "N/A")}
235 | Techniques: {', '.join(paper.get("design_techniques", ["None identified"]))}
236 | Evaluation metrics: {', '.join(paper.get("design_metrics", ["None identified"]))}
237 | Abstract: {paper.get("abstract", "No abstract available")}
238 |
239 | """
240 |
241 | html += """
242 |
245 |
246 |
247 | """
248 |
249 | with open(output_file, "w") as f:
250 | f.write(html)
251 |
252 | logger.info(f"HTML report generated: {output_file}")
253 |
254 | def main():
255 | """Main function for the design finder module."""
256 | parser = argparse.ArgumentParser(description="Find the latest graphic design automation papers.")
257 | parser.add_argument("--days", type=int, default=7, help="Number of days to look back")
258 | parser.add_argument("--output", type=str, default="design_papers.json", help="Output JSON file path")
259 | parser.add_argument("--html", type=str, default="design_papers.html", help="Output HTML file path")
260 | parser.add_argument("--categories", type=str, nargs="+", default=DEFAULT_CATEGORIES,
261 | help="arXiv categories to search")
262 | parser.add_argument("--keyword", type=str, help="Additional keyword to filter papers")
263 | parser.add_argument("--technique", type=str, help="Filter by specific technique")
264 | parser.add_argument("--category", type=str, help="Filter by specific design category")
265 | args = parser.parse_args()
266 |
267 | logger.info(f"Looking for design papers in the past {args.days} days")
268 | logger.info(f"Searching categories: {', '.join(args.categories)}")
269 |
270 | # DATA_DIR is already created by paths.py
271 |
272 | # Get design papers
273 | design_papers = get_design_papers(args.categories, args.days)
274 |
275 | # Apply additional filters if specified
276 | if args.keyword:
277 | keyword = args.keyword.lower()
278 | design_papers = [
279 | p for p in design_papers
280 | if keyword in p.get("title", "").lower() or
281 | keyword in p.get("abstract", "").lower()
282 | ]
283 | logger.info(f"Filtered by keyword '{args.keyword}': {len(design_papers)} papers remaining")
284 |
285 | if args.technique:
286 | technique = args.technique.lower()
287 | design_papers = [
288 | p for p in design_papers
289 | if any(technique in t.lower() for t in p.get("design_techniques", []))
290 | ]
291 | logger.info(f"Filtered by technique '{args.technique}': {len(design_papers)} papers remaining")
292 |
293 | if args.category:
294 | category = args.category.lower()
295 | design_papers = [
296 | p for p in design_papers
297 | if category in p.get("design_category", "").lower()
298 | ]
299 | logger.info(f"Filtered by category '{args.category}': {len(design_papers)} papers remaining")
300 |
301 | logger.info(f"Found {len(design_papers)} design automation papers")
302 |
303 | # Print summary to console
304 | for paper in design_papers[:10]: # Print top 10
305 | print_paper_summary(paper)
306 |
307 | if len(design_papers) > 10:
308 | print(f"...and {len(design_papers) - 10} more papers.")
309 |
310 | # Save to JSON file in data directory
311 | output_path = os.path.join(DATA_DIR, args.output)
312 | with open(output_path, "w") as f:
313 | json.dump(design_papers, f, indent=2)
314 |
315 | logger.info(f"Saved {len(design_papers)} papers to {output_path}")
316 |
317 | # Generate HTML report in digest directory
318 | html_path = os.path.join(DIGEST_DIR, args.html)
319 | generate_html_report(design_papers, html_path)
320 |
321 | print(f"\nResults saved to {output_path} and {html_path}")
322 |
323 | if __name__ == "__main__":
324 | main()
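325 |
326 | # Usage sketch (illustrative): calling get_design_papers programmatically rather
327 | # than through the CLI. The categories and look-back window are arbitrary
328 | # examples; this helper is not invoked anywhere in the module.
329 | def _example_programmatic_use():
330 |     papers = get_design_papers(["cs.CV", "cs.HC"], days_back=3)
331 |     for paper in papers[:5]:
332 |         print_paper_summary(paper)
333 |     return papers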
--------------------------------------------------------------------------------
/src/model_manager.py:
--------------------------------------------------------------------------------
1 | """
2 | Model Manager module to handle different LLM providers.
3 | This provides a unified interface for working with different LLM providers.
4 | """
5 | import os
6 | import json
7 | import logging
8 | import time
9 | from typing import Dict, List, Any, Optional, Union, Tuple
10 | from enum import Enum
11 |
12 | import openai
13 | try:
14 | import google.generativeai as genai
15 | GEMINI_AVAILABLE = True
16 | except ImportError:
17 | GEMINI_AVAILABLE = False
18 |
19 | try:
20 | import anthropic
21 | ANTHROPIC_AVAILABLE = True
22 | except ImportError:
23 | ANTHROPIC_AVAILABLE = False
24 |
25 | # Configure logging
26 | logging.basicConfig(level=logging.INFO)
27 | logger = logging.getLogger(__name__)
28 |
29 | class ModelProvider(Enum):
30 | OPENAI = "openai"
31 | GEMINI = "gemini"
32 | ANTHROPIC = "anthropic"
33 |
34 | class ModelManager:
35 | """Manager for handling different LLM providers."""
36 |
37 | def __init__(self):
38 | self.providers = {}
39 | self.available_models = {}
40 |
41 | def register_openai(self, api_key: str) -> bool:
42 | """Register OpenAI as a provider."""
43 | if not api_key:
44 | logger.error("No OpenAI API key provided")
45 | return False
46 |
47 | try:
48 | openai.api_key = api_key
49 | # Test API connection
50 | models = openai.Model.list()
51 | self.providers[ModelProvider.OPENAI] = True
52 | self.available_models[ModelProvider.OPENAI] = [model.id for model in models.data]
53 | logger.info(f"Successfully connected to OpenAI API. Available models: {self.available_models[ModelProvider.OPENAI]}")
54 | return True
55 | except Exception as e:
56 | logger.error(f"Failed to setup OpenAI API: {e}")
57 | return False
58 |
59 | def register_gemini(self, api_key: str) -> bool:
60 | """Register Gemini as a provider."""
61 | if not GEMINI_AVAILABLE:
62 | logger.error("Gemini package not installed. Run 'pip install google-generativeai'")
63 | return False
64 |
65 | if not api_key:
66 | logger.error("No Gemini API key provided")
67 | return False
68 |
69 | try:
70 | genai.configure(api_key=api_key)
71 | # Test API connection
72 | models = genai.list_models()
73 | self.providers[ModelProvider.GEMINI] = True
74 | self.available_models[ModelProvider.GEMINI] = [m.name for m in models if 'generateContent' in m.supported_generation_methods]
75 | logger.info(f"Successfully connected to Gemini API. Available models: {self.available_models[ModelProvider.GEMINI]}")
76 | return True
77 | except Exception as e:
78 | logger.error(f"Failed to setup Gemini API: {e}")
79 | return False
80 |
81 | def register_anthropic(self, api_key: str) -> bool:
82 | """Register Anthropic/Claude as a provider."""
83 | if not ANTHROPIC_AVAILABLE:
84 | logger.error("Anthropic package not installed. Run 'pip install anthropic'")
85 | return False
86 |
87 | if not api_key:
88 | logger.error("No Anthropic API key provided")
89 | return False
90 |
91 | try:
92 | self.anthropic_client = anthropic.Anthropic(api_key=api_key)
93 | # Test API connection by listing models
94 | models = self.anthropic_client.models.list()
95 | self.providers[ModelProvider.ANTHROPIC] = True
96 | self.available_models[ModelProvider.ANTHROPIC] = [model.id for model in models.data]
97 | logger.info(f"Successfully connected to Anthropic API. Available models: {self.available_models[ModelProvider.ANTHROPIC]}")
98 | return True
99 | except Exception as e:
100 | logger.error(f"Failed to setup Anthropic API: {e}")
101 | return False
102 |
103 | def is_provider_available(self, provider: ModelProvider) -> bool:
104 | """Check if a provider is available."""
105 | return provider in self.providers and self.providers[provider]
106 |
107 | def get_available_providers(self) -> List[ModelProvider]:
108 | """Get a list of available providers."""
109 | return [provider for provider in self.providers if self.providers[provider]]
110 |
111 | def get_provider_models(self, provider: ModelProvider) -> List[str]:
112 | """Get available models for a provider."""
113 | if provider in self.available_models:
114 | return self.available_models[provider]
115 | return []
116 |
117 | def analyze_papers(
118 | self,
119 | papers: List[Dict[str, Any]],
120 | query: Dict[str, str],
121 | providers: List[ModelProvider] = None,
122 | model_names: Dict[ModelProvider, str] = None,
123 | threshold_score: int = 7,
124 | ) -> Tuple[List[Dict[str, Any]], bool]:
125 | """
126 | Analyze papers using multiple model providers.
127 |
128 | Args:
129 | papers: List of paper dictionaries
130 | query: Dictionary with 'interest' key describing research interests
131 | providers: List of providers to use (defaults to all available)
132 | model_names: Dictionary mapping providers to model names
133 | threshold_score: Minimum score for a paper to be considered relevant
134 |
135 | Returns:
136 | Tuple of (list of papers with analysis, hallucination flag)
137 | """
138 | if not providers:
139 | providers = self.get_available_providers()
140 |
141 | if not model_names:
142 | model_names = {}
143 |
144 | # Default model names if not specified
145 | default_models = {
146 | ModelProvider.OPENAI: "gpt-3.5-turbo-16k",
147 | ModelProvider.GEMINI: "gemini-1.5-flash",
148 | ModelProvider.ANTHROPIC: "claude-3-5-sonnet-20240620"
149 | }
150 |
151 | # Use default models if not specified
152 | for provider in providers:
153 | if provider not in model_names:
154 | model_names[provider] = default_models.get(provider)
155 |
156 | # Check if any providers are available
157 | if not any(self.is_provider_available(provider) for provider in providers):
158 | logger.error("No available providers for paper analysis")
159 | return papers, False
160 |
161 | analyzed_papers = []
162 | hallucination = False
163 |
164 | # Import the modules here to avoid circular imports
165 | if ModelProvider.OPENAI in providers and self.is_provider_available(ModelProvider.OPENAI):
166 | from relevancy import generate_relevance_score
167 | try:
168 | analyzed_papers, hallu = generate_relevance_score(
169 | papers,
170 | query=query,
171 | model_name=model_names[ModelProvider.OPENAI],
172 | threshold_score=threshold_score,
173 | num_paper_in_prompt=2
174 | )
175 | hallucination = hallucination or hallu
176 | except Exception as e:
177 | logger.error(f"Error analyzing papers with OpenAI: {e}")
178 |
179 | # Add Gemini analysis if available
180 | if ModelProvider.GEMINI in providers and self.is_provider_available(ModelProvider.GEMINI):
181 | # Import locally to avoid circular imports
182 | from gemini_utils import analyze_papers_with_gemini
183 |
184 | try:
185 | if not analyzed_papers: # If OpenAI analysis failed or was not used
186 | analyzed_papers = papers
187 |
188 | analyzed_papers = analyze_papers_with_gemini(
189 | analyzed_papers,
190 | query=query,
191 | model_name=model_names[ModelProvider.GEMINI]
192 | )
193 | except Exception as e:
194 | logger.error(f"Error analyzing papers with Gemini: {e}")
195 |
196 | # Add Anthropic/Claude analysis if available
197 | if ModelProvider.ANTHROPIC in providers and self.is_provider_available(ModelProvider.ANTHROPIC):
198 | # Import locally to avoid circular imports
199 | from anthropic_utils import analyze_papers_with_claude
200 |
201 | try:
202 | if not analyzed_papers: # If previous analyses failed or were not used
203 | analyzed_papers = papers
204 |
205 | analyzed_papers = analyze_papers_with_claude(
206 | analyzed_papers,
207 | query=query,
208 | model_name=model_names[ModelProvider.ANTHROPIC]
209 | )
210 | except Exception as e:
211 | logger.error(f"Error analyzing papers with Claude: {e}")
212 |
213 | return analyzed_papers, hallucination
214 |
215 | def get_mechanistic_interpretability_analysis(
216 | self,
217 | paper: Dict[str, Any],
218 | provider: ModelProvider = None,
219 | model_name: str = None
220 | ) -> Dict[str, Any]:
221 | """
222 | Get specialized mechanistic interpretability analysis for a paper.
223 |
224 | Args:
225 | paper: Paper dictionary
226 | provider: Provider to use (defaults to first available)
227 | model_name: Model name to use
228 |
229 | Returns:
230 | Dictionary with mechanistic interpretability analysis
231 | """
232 | # Import interpretability analysis functions
233 | from interpretability_analysis import (
234 | create_analysis_prompt,
235 | extract_json_from_text,
236 | analyze_interpretability_circuits,
237 | get_paper_relation_to_ai_safety
238 | )
239 |
240 | if not provider:
241 | available_providers = self.get_available_providers()
242 | if not available_providers:
243 | logger.error("No available providers for mechanistic interpretability analysis")
244 | return {"error": "No available providers"}
245 | provider = available_providers[0]
246 |
247 | if not model_name:
248 | # Use more powerful models for specialized analysis
249 | default_models = {
250 | ModelProvider.OPENAI: "gpt-4o",
251 | ModelProvider.GEMINI: "gemini-2.0-flash",
252 | ModelProvider.ANTHROPIC: "claude-3-5-sonnet-20240620"
253 | }
254 | model_name = default_models.get(provider)
255 |
256 | if not self.is_provider_available(provider):
257 | logger.error(f"Provider {provider} is not available")
258 | return {"error": f"Provider {provider} is not available"}
259 |
260 | # Get specialized prompt
261 | prompt = create_analysis_prompt(paper, "mechanistic_interpretability")
262 |
263 | # Process based on provider
264 | if provider == ModelProvider.OPENAI:
265 | try:
266 | response = openai.ChatCompletion.create(
267 | model=model_name,
268 | messages=[
269 | {"role": "system", "content": "You are a specialist in mechanistic interpretability and AI safety."},
270 | {"role": "user", "content": prompt}
271 | ],
272 | temperature=0.3,
273 | max_tokens=2048
274 | )
275 |
276 | # Extract JSON from response
277 | content = response.choices[0].message.content
278 | analysis = extract_json_from_text(content)
279 |
280 | # Add additional circuit analysis if there's no error
281 | if "error" not in analysis:
282 | analysis = analyze_interpretability_circuits(paper, analysis)
283 | analysis["ai_safety_relation"] = get_paper_relation_to_ai_safety(paper)
284 |
285 | return analysis
286 |
287 | except Exception as e:
288 | logger.error(f"Error getting mechanistic interpretability analysis with OpenAI: {e}")
289 | return {"error": f"OpenAI error: {str(e)}"}
290 |
291 | elif provider == ModelProvider.GEMINI and GEMINI_AVAILABLE:
292 | try:
293 | model = genai.GenerativeModel(model_name)
294 | response = model.generate_content(prompt)
295 |
296 | # Extract JSON from response
297 | content = response.text
298 | analysis = extract_json_from_text(content)
299 |
300 | # Add additional circuit analysis if there's no error
301 | if "error" not in analysis:
302 | analysis = analyze_interpretability_circuits(paper, analysis)
303 | analysis["ai_safety_relation"] = get_paper_relation_to_ai_safety(paper)
304 |
305 | return analysis
306 |
307 | except Exception as e:
308 | logger.error(f"Error getting mechanistic interpretability analysis with Gemini: {e}")
309 | return {"error": f"Gemini error: {str(e)}"}
310 |
311 | elif provider == ModelProvider.ANTHROPIC and ANTHROPIC_AVAILABLE:
312 | try:
313 | response = self.anthropic_client.messages.create(
314 | model=model_name,
315 | max_tokens=2048,
316 | temperature=0.3,
317 | system="You are a specialist in mechanistic interpretability and AI safety.",
318 | messages=[
319 | {"role": "user", "content": prompt}
320 | ]
321 | )
322 |
323 | # Extract JSON from response
324 | content = response.content[0].text
325 | analysis = extract_json_from_text(content)
326 |
327 | # Add additional circuit analysis if there's no error
328 | if "error" not in analysis:
329 | analysis = analyze_interpretability_circuits(paper, analysis)
330 | analysis["ai_safety_relation"] = get_paper_relation_to_ai_safety(paper)
331 |
332 | return analysis
333 |
334 | except Exception as e:
335 | logger.error(f"Error getting mechanistic interpretability analysis with Claude: {e}")
336 | return {"error": f"Claude error: {str(e)}"}
337 |
338 | return {"error": "Unsupported provider or configuration"}
339 |
340 | def analyze_design_automation(
341 | self,
342 | paper: Dict[str, Any],
343 | provider: ModelProvider = None,
344 | model_name: str = None
345 | ) -> Dict[str, Any]:
346 | """
347 | Get specialized analysis for design automation papers.
348 |
349 | Args:
350 | paper: Paper dictionary
351 | provider: Provider to use (defaults to first available)
352 | model_name: Model name to use
353 |
354 | Returns:
355 | Dictionary with design automation analysis
356 | """
357 | # Import design automation functions
358 | from design_automation import (
359 | create_design_analysis_prompt,
360 | extract_design_capabilities
361 | )
362 | from interpretability_analysis import extract_json_from_text
363 |
364 | if not provider:
365 | available_providers = self.get_available_providers()
366 | if not available_providers:
367 | logger.error("No available providers for design automation analysis")
368 | return {"error": "No available providers"}
369 | provider = available_providers[0]
370 |
371 | if not model_name:
372 | # Use appropriate models for design analysis
373 | default_models = {
374 | ModelProvider.OPENAI: "gpt-4o",
375 | ModelProvider.GEMINI: "gemini-2.0-flash",
376 | ModelProvider.ANTHROPIC: "claude-3-5-sonnet-20240620"
377 | }
378 | model_name = default_models.get(provider)
379 |
380 | if not self.is_provider_available(provider):
381 | logger.error(f"Provider {provider} is not available")
382 | return {"error": f"Provider {provider} is not available"}
383 |
384 | # Get specialized prompt
385 | prompt = create_design_analysis_prompt(paper)
386 |
387 | # Process based on provider
388 | try:
389 | analysis = None
390 |
391 | if provider == ModelProvider.OPENAI:
392 | response = openai.ChatCompletion.create(
393 | model=model_name,
394 | messages=[
395 | {"role": "system", "content": "You are a specialist in AI for design automation."},
396 | {"role": "user", "content": prompt}
397 | ],
398 | temperature=0.3,
399 | max_tokens=2048
400 | )
401 | content = response.choices[0].message.content
402 | analysis = extract_json_from_text(content)
403 |
404 | elif provider == ModelProvider.GEMINI and GEMINI_AVAILABLE:
405 | model = genai.GenerativeModel(model_name)
406 | response = model.generate_content(prompt)
407 | content = response.text
408 | analysis = extract_json_from_text(content)
409 |
410 | elif provider == ModelProvider.ANTHROPIC and ANTHROPIC_AVAILABLE:
411 | response = self.anthropic_client.messages.create(
412 | model=model_name,
413 | max_tokens=2048,
414 | temperature=0.3,
415 | system="You are a specialist in AI for design automation.",
416 | messages=[
417 | {"role": "user", "content": prompt}
418 | ]
419 | )
420 | content = response.content[0].text
421 | analysis = extract_json_from_text(content)
422 |
423 | # Enhance analysis with design capabilities if successful
424 | if analysis and "error" not in analysis:
425 | capabilities = extract_design_capabilities(analysis)
426 | analysis["capabilities"] = capabilities
427 |
428 | return analysis or {"error": "Failed to generate analysis"}
429 |
430 | except Exception as e:
431 | logger.error(f"Error analyzing design automation paper: {e}")
432 | return {"error": f"Analysis error: {str(e)}"}
433 |
434 | # Create a singleton instance
435 | model_manager = ModelManager()
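436 |
437 | # Usage sketch (illustrative, not part of the pipeline): register whichever
438 | # providers have keys in the environment, then run a combined analysis.
439 | # OPENAI_API_KEY and ANTHROPIC_API_KEY are used elsewhere in this repo; the
440 | # GEMINI_API_KEY name and the sample paper below are assumptions/placeholders.
441 | if __name__ == "__main__":
442 |     if os.environ.get("OPENAI_API_KEY"):
443 |         model_manager.register_openai(os.environ["OPENAI_API_KEY"])
444 |     if os.environ.get("ANTHROPIC_API_KEY"):
445 |         model_manager.register_anthropic(os.environ["ANTHROPIC_API_KEY"])
446 |     if os.environ.get("GEMINI_API_KEY"):
447 |         model_manager.register_gemini(os.environ["GEMINI_API_KEY"])
448 |     sample = [{"title": "Placeholder title", "authors": "A. Author", "abstract": "Placeholder abstract."}]
449 |     analyzed, hallucinated = model_manager.analyze_papers(
450 |         sample, query={"interest": "AI for graphic design automation"}
451 |     )
452 |     print(f"Analyzed {len(analyzed)} papers (hallucination flag: {hallucinated})")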
--------------------------------------------------------------------------------
/src/design/design_finder.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """
3 | Design Finder - A self-contained script to find AI/ML design automation papers on arXiv.
4 |
5 | This script requires only Python standard libraries and BeautifulSoup, making it very easy to run
6 | without complex dependencies.
7 |
8 | Usage:
9 | python design_finder.py [--days 7] [--output design_papers.json]
10 | """
11 |
12 | import os
13 | import sys
14 | import json
15 | import argparse
16 | import datetime
17 | import re
18 | import time
19 | import urllib.request
20 | from typing import List, Dict, Any
21 |
22 | # Check for BeautifulSoup
23 | try:
24 | from bs4 import BeautifulSoup as bs
25 | except ImportError:
26 | print("BeautifulSoup not found. Installing...")
27 | import subprocess
28 | subprocess.check_call([sys.executable, "-m", "pip", "install", "beautifulsoup4"])
29 | from bs4 import BeautifulSoup as bs
30 |
31 | # Default arXiv categories to search
32 | DEFAULT_CATEGORIES = [
33 | "cs.CV", # Computer Vision
34 | "cs.GR", # Graphics
35 | "cs.HC", # Human-Computer Interaction
36 | "cs.AI", # Artificial Intelligence
37 | "cs.LG", # Machine Learning
38 | "cs.CL", # Computation and Language (NLP)
39 | "cs.MM" # Multimedia
40 | ]
41 |
42 | # Design automation keywords for paper filtering
43 | DESIGN_AUTOMATION_KEYWORDS = [
44 | "design automation", "layout generation", "visual design", "graphic design",
45 | "creative AI", "generative design", "UI generation", "UX automation",
46 | "design system", "composition", "creative workflow", "automated design",
47 | "design tool", "design assistant", "design optimization", "content-aware",
48 | "user interface generation", "visual layout", "image composition", "AI design"
49 | ]
50 |
51 | class DesignPaperFinder:
52 | def __init__(self, days_back=7, categories=None, output_file="design_papers.json",
53 | html_file="design_papers.html", keyword=None, verbose=True):
54 | self.days_back = days_back
55 | self.categories = categories or DEFAULT_CATEGORIES
56 | self.output_file = output_file
57 | self.html_file = html_file
58 | self.keyword = keyword
59 | self.verbose = verbose
60 | self.papers = []
61 |
62 | # Data directory is already created by paths.py module
63 |
64 | def log(self, message):
65 | """Print a message if verbose mode is enabled."""
66 | if self.verbose:
67 | print(message)
68 |
69 | def get_date_range(self) -> List[str]:
70 | """Get list of dates to search in arXiv format."""
71 | today = datetime.datetime.now()
72 | dates = []
73 |
74 | for i in range(self.days_back):
75 | date = today - datetime.timedelta(days=i)
76 | date_str = date.strftime("%a, %d %b %y")
77 | dates.append(date_str)
78 |
79 | return dates
80 |
81 | def download_papers(self, category: str, date_str: str) -> List[Dict[str, Any]]:
82 | """Download papers for a specific category and date."""
83 | # Check if we already have this data
84 | # Import data directory at runtime to avoid circular imports
85 | import sys
86 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
87 | from paths import DATA_DIR
88 | file_path = os.path.join(DATA_DIR, f"{category}_{date_str}.jsonl")
89 | if os.path.exists(file_path):
90 | self.log(f"Loading cached papers for {category} on {date_str}")
91 | papers = []
92 | with open(file_path, "r") as f:
93 | for line in f:
94 | papers.append(json.loads(line))
95 | return papers
96 |
97 | # Download new papers
98 | self.log(f"Downloading papers for {category} on {date_str}")
99 | NEW_SUB_URL = f'https://arxiv.org/list/{category}/new'
100 |
101 | try:
102 | page = urllib.request.urlopen(NEW_SUB_URL)
103 | except Exception as e:
104 | self.log(f"Error downloading from {NEW_SUB_URL}: {e}")
105 | return []
106 |
107 | soup = bs(page, 'html.parser')
108 | content = soup.body.find("div", {'id': 'content'})
109 |
110 | # Find the date heading
111 | h3 = content.find("h3").text # e.g: New submissions for Wed, 10 May 23
112 | date_from_page = h3.replace("New submissions for", "").strip()
113 |
114 | # Find all papers
115 | dt_list = content.dl.find_all("dt")
116 | dd_list = content.dl.find_all("dd")
117 | arxiv_base = "https://arxiv.org/abs/"
118 | arxiv_html = "https://arxiv.org/html/"
119 |
120 | papers = []
121 | for i in range(len(dt_list)):
122 | try:
123 | paper = {}
124 | ahref = dt_list[i].find('a', href=re.compile(r'[/]([a-z]|[A-Z])\w+')).attrs['href']
125 | paper_number = ahref.strip().replace("/abs/", "")
126 |
127 | paper['main_page'] = arxiv_base + paper_number
128 | paper['pdf'] = arxiv_base.replace('abs', 'pdf') + paper_number
129 |
130 | paper['title'] = dd_list[i].find("div", {"class": "list-title mathjax"}).text.replace("Title:\n", "").strip()
131 | paper['authors'] = dd_list[i].find("div", {"class": "list-authors"}).text.replace("Authors:\n", "").replace("\n", "").strip()
132 | paper['subjects'] = dd_list[i].find("div", {"class": "list-subjects"}).text.replace("Subjects:\n", "").strip()
133 | paper['abstract'] = dd_list[i].find("p", {"class": "mathjax"}).text.replace("\n", " ").strip()
134 |
135 | # Get a short excerpt of content (optional)
136 | try:
137 | html = urllib.request.urlopen(arxiv_html + paper_number + "v1")
138 | soup_content = bs(html, 'html.parser')
139 | content_div = soup_content.find('div', attrs={'class': 'ltx_page_content'})
140 | if content_div:
141 | para_list = content_div.find_all("div", attrs={'class': 'ltx_para'})
142 | excerpt = ' '.join([p.text.strip() for p in para_list[:3]]) # Get first 3 paragraphs
143 | paper['content_excerpt'] = excerpt[:1000] + "..." if len(excerpt) > 1000 else excerpt
144 | else:
145 | paper['content_excerpt'] = "Content not available"
146 | except Exception:
147 | paper['content_excerpt'] = ""
148 |
149 | papers.append(paper)
150 | except Exception as e:
151 | if self.verbose:
152 | self.log(f"Error processing paper {i}: {e}")
153 |
154 | # Save papers to file
155 | with open(file_path, "w") as f:
156 | for paper in papers:
157 | f.write(json.dumps(paper) + "\n")
158 |
159 | return papers
160 |
161 | def is_design_automation_paper(self, paper: Dict[str, Any]) -> bool:
162 | """Check if a paper is related to design automation based on keywords."""
163 | text = (
164 | (paper.get("title", "") + " " +
165 | paper.get("abstract", "") + " " +
166 | paper.get("subjects", "")).lower()
167 | )
168 |
169 | return any(keyword.lower() in text for keyword in DESIGN_AUTOMATION_KEYWORDS)
170 |
171 | def categorize_design_paper(self, paper: Dict[str, Any]) -> str:
172 | """Categorize design automation paper into subcategories."""
173 | text = (paper.get("title", "") + " " + paper.get("abstract", "")).lower()
174 |
175 | categories = {
176 | "Layout Generation": ["layout", "composition", "arrange", "grid"],
177 | "UI/UX Design": ["user interface", "ui", "ux", "interface design", "website"],
178 | "Graphic Design": ["graphic design", "poster", "visual design", "typography"],
179 | "Image Manipulation": ["image editing", "photo", "manipulation", "style transfer"],
180 | "Design Tools": ["tool", "assistant", "workflow", "productivity"],
181 | "3D Design": ["3d", "modeling", "cad", "product design"],
182 | "Multimodal Design": ["multimodal", "text-to-image", "image-to-code"]
183 | }
184 |
185 | matches = []
186 | for category, keywords in categories.items():
187 | if any(keyword in text for keyword in keywords):
188 | matches.append(category)
189 |
190 | if matches:
191 | return ", ".join(matches)
192 | return "General Design Automation"
193 |
194 | def analyze_design_techniques(self, paper: Dict[str, Any]) -> List[str]:
195 | """Extract AI/ML techniques used for design automation in the paper."""
196 | text = (paper.get("title", "") + " " + paper.get("abstract", "")).lower()
197 |
198 | techniques = []
199 | technique_keywords = {
200 | "Generative Adversarial Networks": ["gan", "generative adversarial"],
201 | "Diffusion Models": ["diffusion", "ddpm", "stable diffusion"],
202 | "Transformers": ["transformer", "attention mechanism"],
203 | "Reinforcement Learning": ["reinforcement learning", "rl"],
204 | "Computer Vision": ["computer vision", "vision", "cnn"],
205 | "Graph Neural Networks": ["graph neural", "gnn"],
206 | "Large Language Models": ["llm", "large language model", "gpt", "chatgpt"],
207 | "Neural Style Transfer": ["style transfer", "neural style"],
208 | "Evolutionary Algorithms": ["genetic algorithm", "evolutionary"]
209 | }
210 |
211 | for technique, keywords in technique_keywords.items():
212 | if any(keyword in text for keyword in keywords):
213 | techniques.append(technique)
214 |
215 | return techniques
216 |
217 | def find_papers(self):
218 | """Find design automation papers from arXiv."""
219 | self.log(f"Looking for design papers in the past {self.days_back} days")
220 | self.log(f"Searching categories: {', '.join(self.categories)}")
221 |
222 | # Get papers for each category and date
223 | dates = self.get_date_range()
224 | all_papers = []
225 |
226 | for category in self.categories:
227 | for date_str in dates:
228 | try:
229 | papers = self.download_papers(category, date_str)
230 | all_papers.extend(papers)
231 | # Avoid hitting arXiv rate limits
232 | time.sleep(3)
233 | except Exception as e:
234 | self.log(f"Error downloading papers for {category} on {date_str}: {e}")
235 |
236 | # Remove duplicates (papers can appear in multiple categories)
237 | unique_papers = {}
238 | for paper in all_papers:
239 | paper_id = paper.get("main_page", "").split("/")[-1]
240 | if paper_id and paper_id not in unique_papers:
241 | unique_papers[paper_id] = paper
242 |
243 | all_papers = list(unique_papers.values())
244 |
245 | # Filter for design automation papers
246 | design_papers = []
247 | for paper in all_papers:
248 | if self.is_design_automation_paper(paper):
249 | paper["design_category"] = self.categorize_design_paper(paper)
250 | paper["design_techniques"] = self.analyze_design_techniques(paper)
251 | design_papers.append(paper)
252 |
253 | # Additional keyword filtering if specified
254 | if self.keyword:
255 | keyword = self.keyword.lower()
256 | design_papers = [
257 | p for p in design_papers
258 | if keyword in p.get("title", "").lower() or
259 | keyword in p.get("abstract", "").lower()
260 | ]
261 |
262 | # Sort by date
263 | design_papers.sort(key=lambda p: p.get("main_page", ""), reverse=True)
264 |
265 | self.papers = design_papers
266 | self.log(f"Found {len(design_papers)} design automation papers")
267 | return design_papers
268 |
269 | def print_paper_summary(self, paper: Dict[str, Any]):
270 | """Print a nice summary of a paper to the console."""
271 | print(f"\n{'=' * 80}")
272 | print(f"TITLE: {paper.get('title', 'No title')}")
273 | print(f"AUTHORS: {paper.get('authors', 'No authors')}")
274 | print(f"URL: {paper.get('main_page', 'No URL')}")
275 | print(f"DESIGN CATEGORY: {paper.get('design_category', 'Unknown')}")
276 | print(f"TECHNIQUES: {', '.join(paper.get('design_techniques', []))}")
277 | print(f"\nABSTRACT: {paper.get('abstract', 'No abstract')[:500]}...")
278 | print(f"{'=' * 80}\n")
279 |
280 | def generate_html_report(self):
281 | """Generate an HTML report from papers."""
282 | if not self.papers:
283 | self.log("No papers to generate HTML report from")
284 | return
285 |
286 | html = f"""
287 |
288 |
289 |
290 |
291 | Design Automation Papers
292 |
309 |
310 |
311 | Design Automation Papers
312 |
316 |
317 | Found {len(self.papers)} papers related to graphic design automation with AI/ML
318 | Generated on {datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}
319 | Keywords: {', '.join(DESIGN_AUTOMATION_KEYWORDS[:5])}...
320 |
321 | """
322 |
323 | # Count categories and techniques
324 | categories = {}
325 | techniques = {}
326 |
327 | for paper in self.papers:
328 | category = paper.get("design_category", "Uncategorized")
329 | if category in categories:
330 | categories[category] += 1
331 | else:
332 | categories[category] = 1
333 |
334 | for technique in paper.get("design_techniques", []):
335 | if technique in techniques:
336 | techniques[technique] += 1
337 | else:
338 | techniques[technique] = 1
339 |
340 | # Add summary statistics
341 | html += 'Summary Statistics'
342 |
343 | html += "Categories:"
344 | for category, count in sorted(categories.items(), key=lambda x: x[1], reverse=True):
345 | html += f"- {category}: {count} papers"
346 | html += ""
347 |
348 | html += "Techniques:"
349 | for technique, count in sorted(techniques.items(), key=lambda x: x[1], reverse=True):
350 | html += f"- {technique}: {count} papers"
351 | html += ""
352 |
353 | # Add papers
354 | html += 'Papers'
355 | for paper in self.papers:
356 | html += f"""
357 |
358 |
359 | {paper.get("authors", "Unknown authors")}
360 | Category: {paper.get("design_category", "General")} | Subject: {paper.get("subjects", "N/A")}
361 | Techniques: {', '.join(paper.get("design_techniques", ["None identified"]))}
362 | Abstract: {paper.get("abstract", "No abstract available")}
363 |
367 |
368 | """
369 |
370 | html += """
371 |
374 |
375 |
376 | """
377 |
378 | with open(self.html_file, "w") as f:
379 | f.write(html)
380 |
381 | self.log(f"HTML report generated: {self.html_file}")
382 |
383 | def save_json(self):
384 | """Save papers to JSON file."""
385 | if not self.papers:
386 | self.log("No papers to save")
387 | return
388 |
389 | with open(self.output_file, "w") as f:
390 | json.dump(self.papers, f, indent=2)
391 |
392 | self.log(f"Saved {len(self.papers)} papers to {self.output_file}")
393 |
394 | def run(self):
395 | """Run the full paper finding process."""
396 | self.find_papers()
397 |
398 | if not self.papers:
399 | print("No design automation papers found.")
400 | return
401 |
402 | # Print summary of top papers
403 | for paper in self.papers[:10]: # Print top 10
404 | self.print_paper_summary(paper)
405 |
406 | if len(self.papers) > 10:
407 | print(f"...and {len(self.papers) - 10} more papers.")
408 |
409 | # Save outputs
410 | self.save_json()
411 | self.generate_html_report()
412 |
413 | print(f"\nResults saved to {self.output_file} and {self.html_file}")
414 | print(f"Open {self.html_file} in your browser to view the report.")
415 |
416 | def main():
417 | parser = argparse.ArgumentParser(description="Find the latest graphic design automation papers.")
418 | parser.add_argument("--days", type=int, default=7, help="Number of days to look back")
419 | parser.add_argument("--output", type=str, default="design_papers.json", help="Output file path")
420 | parser.add_argument("--html", type=str, default="design_papers.html", help="HTML output file path")
421 | parser.add_argument("--categories", type=str, nargs="+", default=DEFAULT_CATEGORIES,
422 | help="arXiv categories to search")
423 | parser.add_argument("--keyword", type=str, help="Additional keyword to filter papers")
424 | parser.add_argument("--quiet", action="store_true", help="Suppress progress messages")
425 | args = parser.parse_args()
426 |
427 | finder = DesignPaperFinder(
428 | days_back=args.days,
429 | categories=args.categories,
430 | output_file=args.output,
431 | html_file=args.html,
432 | keyword=args.keyword,
433 | verbose=not args.quiet
434 | )
435 |
436 | finder.run()
437 |
438 | if __name__ == "__main__":
439 | main()
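440 |
441 | # Usage sketch (illustrative): driving DesignPaperFinder programmatically with
442 | # arbitrary example arguments; this helper is not invoked anywhere in the script.
443 | def _example_programmatic_run():
444 |     finder = DesignPaperFinder(days_back=3, keyword="poster", verbose=False)
445 |     finder.run()
446 |     return finder.papers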
--------------------------------------------------------------------------------
/src/design/find_design_papers.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """
3 | Standalone Design Papers Crawler - A simple script to find the latest papers
4 | on graphic design automation using AI/ML/LLM technologies.
5 |
6 | This version has minimal dependencies and doesn't require the full model setup.
7 |
8 | Usage:
9 | python find_design_papers.py [--days 7] [--output design_papers.json]
10 | """
11 |
12 | import os
13 | import sys
14 | import json
15 | import argparse
16 | import datetime
17 | import logging
18 | import re
19 | import urllib.request
20 | import time
21 | from typing import List, Dict, Any, Optional, Tuple
22 | from bs4 import BeautifulSoup as bs
23 |
24 | # Add parent directory to path to allow imports from sibling modules
25 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
26 | from paths import DATA_DIR, DIGEST_DIR
27 | from model_manager import model_manager, ModelProvider
28 |
29 | # Configure logging
30 | logging.basicConfig(
31 | level=logging.INFO,
32 | format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
33 | )
34 | logger = logging.getLogger(__name__)
35 |
36 | # Default arXiv categories to search
37 | DEFAULT_CATEGORIES = [
38 | "cs.CV", # Computer Vision
39 | "cs.GR", # Graphics
40 | "cs.HC", # Human-Computer Interaction
41 | "cs.AI", # Artificial Intelligence
42 | "cs.LG", # Machine Learning
43 | "cs.CL", # Computation and Language (NLP)
44 | "cs.MM" # Multimedia
45 | ]
46 |
47 | # Design automation keywords for paper filtering
48 | DESIGN_AUTOMATION_KEYWORDS = [
49 | "design automation", "layout generation", "visual design", "graphic design",
50 | "creative AI", "generative design", "UI generation", "UX automation",
51 | "design system", "composition", "creative workflow", "automated design",
52 | "design tool", "design assistant", "design optimization", "content-aware",
53 | "user interface generation", "visual layout", "image composition"
54 | ]
55 |
56 | def download_papers(category: str, date_str: str = None) -> List[Dict[str, Any]]:
57 | """
58 | Download papers for a specific category and date.
59 |
60 | Args:
61 | category: arXiv category code
62 | date_str: Date string in arXiv format (default: today)
63 |
64 | Returns:
65 | List of paper dictionaries
66 | """
67 | if not date_str:
68 | date = datetime.datetime.now()
69 | date_str = date.strftime("%a, %d %b %y")
70 |
71 | # Data directory is already created by paths.py
73 |
74 | # Check if we already have this data
75 | file_path = os.path.join(DATA_DIR, f"{category}_{date_str}.jsonl")
76 | if os.path.exists(file_path):
77 | papers = []
78 | with open(file_path, "r") as f:
79 | for line in f:
80 | papers.append(json.loads(line))
81 | return papers
82 |
83 | # Download new papers
84 | logger.info(f"Downloading papers for {category} on {date_str}")
85 | NEW_SUB_URL = f'https://arxiv.org/list/{category}/new'
86 |
87 | try:
88 | # Add user-agent header to appear more like a browser
89 | headers = {
90 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36'
91 | }
92 | req = urllib.request.Request(NEW_SUB_URL, headers=headers)
93 | page = urllib.request.urlopen(req)
94 | except Exception as e:
95 | logger.error(f"Error downloading from {NEW_SUB_URL}: {e}")
96 | return []
97 |
98 | soup = bs(page, 'html.parser')
99 | content = soup.body.find("div", {'id': 'content'})
100 |
101 | # Find the date heading
102 | h3 = content.find("h3").text # e.g: New submissions for Wed, 10 May 23
103 | date_from_page = h3.replace("New submissions for", "").strip()
104 |
105 | # Find all papers
106 | dt_list = content.dl.find_all("dt")
107 | dd_list = content.dl.find_all("dd")
108 | arxiv_base = "https://arxiv.org/abs/"
109 | arxiv_html = "https://arxiv.org/html/"
110 |
111 | papers = []
112 | for i in range(len(dt_list)):
113 | try:
114 | paper = {}
115 | ahref = dt_list[i].find('a', href=re.compile(r'[/]([a-z]|[A-Z])\w+')).attrs['href']
116 | paper_number = ahref.strip().replace("/abs/", "")
117 |
118 | paper['main_page'] = arxiv_base + paper_number
119 | paper['pdf'] = arxiv_base.replace('abs', 'pdf') + paper_number
120 |
121 | paper['title'] = dd_list[i].find("div", {"class": "list-title mathjax"}).text.replace("Title:\n", "").strip()
122 | paper['authors'] = dd_list[i].find("div", {"class": "list-authors"}).text.replace("Authors:\n", "").replace("\n", "").strip()
123 | paper['subjects'] = dd_list[i].find("div", {"class": "list-subjects"}).text.replace("Subjects:\n", "").strip()
124 | paper['abstract'] = dd_list[i].find("p", {"class": "mathjax"}).text.replace("\n", " ").strip()
125 |
126 | # Get a short excerpt of content (optional)
127 | try:
128 | # Add user-agent header to appear more like a browser
129 | headers = {
130 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36'
131 | }
132 | req = urllib.request.Request(arxiv_html + paper_number + "v1", headers=headers)
133 | html = urllib.request.urlopen(req)
134 | soup_content = bs(html, 'html.parser')
135 | content_div = soup_content.find('div', attrs={'class': 'ltx_page_content'})
136 | if content_div:
137 | para_list = content_div.find_all("div", attrs={'class': 'ltx_para'})
138 | excerpt = ' '.join([p.text.strip() for p in para_list[:3]]) # Get first 3 paragraphs
139 | paper['content_excerpt'] = excerpt[:1000] + "..." if len(excerpt) > 1000 else excerpt
140 | else:
141 | paper['content_excerpt'] = "Content not available"
142 | except Exception as e:
143 | paper['content_excerpt'] = f"Error extracting content: {str(e)}"
144 |
145 | papers.append(paper)
146 | except Exception as e:
147 | logger.warning(f"Error processing paper {i}: {e}")
148 |
149 | # Save papers to file
150 | with open(file_path, "w") as f:
151 | for paper in papers:
152 | f.write(json.dumps(paper) + "\n")
153 |
154 | return papers
155 |
156 | def is_design_automation_paper(paper: Dict[str, Any]) -> bool:
157 | """
158 | Check if a paper is related to design automation based on keywords.
159 |
160 | Args:
161 | paper: Dictionary with paper details
162 |
163 | Returns:
164 | Boolean indicating if paper is related to design automation
165 | """
166 | text = (
167 | (paper.get("title", "") + " " +
168 | paper.get("abstract", "") + " " +
169 | paper.get("subjects", "")).lower()
170 | )
171 |
172 | return any(keyword.lower() in text for keyword in DESIGN_AUTOMATION_KEYWORDS)
173 |
174 | def categorize_design_paper(paper: Dict[str, Any]) -> str:
175 | """
176 | Categorize design automation paper into subcategories.
177 |
178 | Args:
179 | paper: Dictionary with paper details
180 |
181 | Returns:
182 | Category name string
183 | """
184 | text = (paper.get("title", "") + " " + paper.get("abstract", "")).lower()
185 |
186 | categories = {
187 | "Layout Generation": ["layout", "composition", "arrange", "grid"],
188 | "UI/UX Design": ["user interface", "ui", "ux", "interface design", "website"],
189 | "Graphic Design": ["graphic design", "poster", "visual design", "typography"],
190 | "Image Manipulation": ["image editing", "photo", "manipulation", "style transfer"],
191 | "Design Tools": ["tool", "assistant", "workflow", "productivity"],
192 | "3D Design": ["3d", "modeling", "cad", "product design"],
193 | "Multimodal Design": ["multimodal", "text-to-image", "image-to-code"]
194 | }
195 |
196 | matches = []
197 | for category, keywords in categories.items():
198 | if any(keyword in text for keyword in keywords):
199 | matches.append(category)
200 |
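# A paper mentioning both "layout" and "typography" ends up labeled
# "Layout Generation, Graphic Design"; papers matching no keyword group
# fall through to the generic label below.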
201 | if matches:
202 | return ", ".join(matches)
203 | return "General Design Automation"
204 |
205 | def analyze_design_techniques(paper: Dict[str, Any]) -> List[str]:
206 | """
207 | Extract AI/ML techniques used for design automation in the paper.
208 |
209 | Args:
210 | paper: Dictionary with paper details
211 |
212 | Returns:
213 | List of techniques
214 | """
215 | text = (paper.get("title", "") + " " + paper.get("abstract", "")).lower()
216 |
217 | techniques = []
218 | technique_keywords = {
219 | "Generative Adversarial Networks": ["gan", "generative adversarial"],
220 | "Diffusion Models": ["diffusion", "ddpm", "stable diffusion"],
221 | "Transformers": ["transformer", "attention mechanism"],
222 | "Reinforcement Learning": ["reinforcement learning", "rl"],
223 | "Computer Vision": ["computer vision", "vision", "cnn"],
224 | "Graph Neural Networks": ["graph neural", "gnn"],
225 | "Large Language Models": ["llm", "large language model", "gpt"],
226 | "Neural Style Transfer": ["style transfer", "neural style"],
227 | "Evolutionary Algorithms": ["genetic algorithm", "evolutionary"]
228 | }
229 |
230 | for technique, keywords in technique_keywords.items():
231 | if any(keyword in text for keyword in keywords):
232 | techniques.append(technique)
233 |
234 | return techniques
235 |
236 | def get_date_range(days_back: int = 7) -> List[str]:
237 | """
238 | Get a list of dates for the past N days in arXiv format.
239 |
240 | Args:
241 | days_back: Number of days to look back
242 |
243 | Returns:
244 | List of date strings in arXiv format
245 | """
246 | today = datetime.datetime.now()
247 | dates = []
248 |
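# Produces newest-first date strings such as "Wed, 10 May 23", matching the
# "New submissions for ..." heading format parsed earlier in this module.
# For days_back=3 on that date: ["Wed, 10 May 23", "Tue, 09 May 23", "Mon, 08 May 23"].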
249 | for i in range(days_back):
250 | date = today - datetime.timedelta(days=i)
251 | date_str = date.strftime("%a, %d %b %y")
252 | dates.append(date_str)
253 |
254 | return dates
255 |
256 | def generate_html_report(papers: List[Dict[str, Any]], output_file: str, keyword: str = None, days_back: int = 7) -> None:
257 | """
258 | Generate an HTML report from papers.
259 |
260 | Args:
261 | papers: List of paper dictionaries
262 | output_file: Path to output HTML file
263 | keyword: Optional keyword used for filtering
264 | days_back: Number of days searched
265 | """
266 | # Ensure the output directory exists
267 | output_dir = os.path.dirname(output_file)
268 | if output_dir and not os.path.exists(output_dir):
269 | os.makedirs(output_dir, exist_ok=True)
270 |
271 | # Create a title that includes any keywords and date
272 | title_date = datetime.datetime.now().strftime("%B %d, %Y")
273 | page_title = "Design Automation Papers"
274 | if keyword:
275 | page_title = f"Design Automation Papers - {keyword.title()} - {title_date}"
276 | else:
277 | page_title = f"Design Automation Papers - {title_date}"
278 |
279 | html = f"""
280 | <!DOCTYPE html>
281 | <html>
282 | <head>
283 | <meta charset="utf-8">
284 | <title>{page_title}</title>
285 | <style>
304 | </style>
305 | </head><body>
306 | <h1>Design Automation Papers</h1>
307 |
308 | <p>Found {len(papers)} papers related to graphic design automation with AI/ML</p>
309 | <p>Generated on {datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}</p>
310 |
311 | """
312 |
313 | # Count categories and techniques
314 | categories = {}
315 | techniques = {}
316 |
317 | for paper in papers:
318 | category = paper.get("design_category", "Uncategorized")
319 | if category in categories:
320 | categories[category] += 1
321 | else:
322 | categories[category] = 1
323 |
324 | for technique in paper.get("design_techniques", []):
325 | if technique in techniques:
326 | techniques[technique] += 1
327 | else:
328 | techniques[technique] = 1
329 |
330 | # Add summary statistics
331 | html += "<h2>Summary Statistics</h2>"
332 |
333 | html += "<h3>Categories:</h3><ul>"
334 | for category, count in sorted(categories.items(), key=lambda x: x[1], reverse=True):
335 | html += f"<li>{category}: {count} papers</li>"
336 | html += "</ul>"
337 |
338 | html += "<h3>Techniques:</h3><ul>"
339 | for technique, count in sorted(techniques.items(), key=lambda x: x[1], reverse=True):
340 | html += f"<li>{technique}: {count} papers</li>"
341 | html += "</ul>"
342 |
343 | # Add papers
344 | for paper in papers:
345 | html += f"""
346 | <div class="paper">
347 | <h2><a href="{paper.get('main_page', '#')}">{paper.get('title', 'No title')}</a></h2>
348 | <p>{paper.get("authors", "Unknown authors")}</p>
349 | <p>Category: {paper.get("design_category", "General")} | Subject: {paper.get("subjects", "N/A")}</p>
350 | <p>Techniques: {', '.join(paper.get("design_techniques", ["None identified"]))}</p>
351 | """
352 |
353 | # Add relevancy score and reasons if available
354 | if "Relevancy score" in paper:
355 | html += f'<p>Relevancy Score: {paper.get("Relevancy score", "N/A")}</p>'
356 |
357 | if "Reasons for match" in paper:
358 | html += f'<p>Reason: {paper.get("Reasons for match", "")}</p>'
359 |
360 | # Add abstract
361 | if "abstract" in paper:
362 | html += f'<p>Abstract: {paper.get("abstract", "")}</p>'
363 |
364 | # Add all the additional analysis sections
365 | for key, value in paper.items():
366 | if key in ["title", "authors", "subjects", "main_page", "Relevancy score", "Reasons for match",
367 | "design_category", "design_techniques", "content", "abstract"]:
368 | continue
369 |
370 | if isinstance(value, str) and value.strip():
371 | html += f'<p><strong>{key}:</strong> {value}</p>'
372 |
373 | # Add links
374 | html += f"""
375 | <p><a href="{paper.get('main_page', '#')}">Abstract</a> | <a href="{paper.get('pdf', '#')}">PDF</a></p>
379 | </div>
380 | """
381 |
382 | html += f"""
383 |
388 | </body>
389 | </html>
390 | """
391 |
392 | with open(output_file, "w") as f:
393 | f.write(html)
394 |
395 | logger.info(f"HTML report generated: {output_file}")
396 |
397 | def print_paper_summary(paper: Dict[str, Any]) -> None:
398 | """
399 | Print a nice summary of a paper to the console.
400 |
401 | Args:
402 | paper: Paper dictionary
403 | """
404 | print(f"\n{'=' * 80}")
405 | print(f"TITLE: {paper.get('title', 'No title')}")
406 | print(f"AUTHORS: {paper.get('authors', 'No authors')}")
407 | print(f"URL: {paper.get('main_page', 'No URL')}")
408 | print(f"DESIGN CATEGORY: {paper.get('design_category', 'Unknown')}")
409 | print(f"TECHNIQUES: {', '.join(paper.get('design_techniques', []))}")
410 | print(f"\nABSTRACT: {paper.get('abstract', 'No abstract')[:500]}...")
411 | print(f"{'=' * 80}\n")
412 |
413 | def analyze_papers_with_llm(papers: List[Dict[str, Any]], research_interest: str, model_name: str = "gpt-3.5-turbo-16k") -> List[Dict[str, Any]]:
414 | """
415 | Analyze papers using LLM to provide detailed analysis
416 |
417 | Args:
418 | papers: List of paper dictionaries
419 | research_interest: Description of research interests
420 |
421 | Returns:
422 | Enhanced list of papers with detailed analysis
423 | """
424 | if not papers:
425 | return papers
426 |
427 | # Check if model_manager is properly initialized
428 | if not model_manager.is_provider_available(ModelProvider.OPENAI):
429 | # Try to get OpenAI key from environment
430 | import os
431 | openai_key = os.environ.get("OPENAI_API_KEY")
432 | if openai_key:
433 | model_manager.register_openai(openai_key)
434 | else:
435 | logger.warning("No OpenAI API key available. Skipping detailed analysis.")
436 | return papers
437 |
438 | logger.info(f"Analyzing {len(papers)} papers with LLM...")
439 |
440 | # Default research interest for design papers if none provided
441 | if not research_interest:
442 | research_interest = """
443 | I'm interested in papers that use AI/ML for design automation, including:
444 | 1. Generative design systems for graphics, UI/UX, and layouts
445 | 2. ML-enhanced creative tools and design assistants
446 | 3. Novel techniques for automating design processes
447 | 4. Human-AI collaborative design workflows
448 | 5. Applications of LLMs, diffusion models, and GANs to design tasks
449 | """
450 |
451 | # Analyze papers using model_manager
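# The analysis is expected to add fields such as "Relevancy score", "Reasons for match"
# and "Key innovations" to each paper dict; the HTML report and the debug check in main()
# look for exactly these keys.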
452 | try:
453 | analyzed_papers, _ = model_manager.analyze_papers(
454 | papers,
455 | query={"interest": research_interest},
456 | providers=[ModelProvider.OPENAI],
457 | model_names={ModelProvider.OPENAI: model_name},
458 | threshold_score=0 # Include all papers, even low scored ones
459 | )
460 | return analyzed_papers
461 | except Exception as e:
462 | logger.error(f"Error during LLM analysis: {e}")
463 | return papers
464 |
465 | def pre_filter_category(category: str, keyword: str = None) -> bool:
466 | """
467 | Check if a category is likely to contain design-related papers
468 | to avoid downloading irrelevant categories.
469 |
470 | Args:
471 | category: arXiv category code
472 | keyword: Optional search keyword
473 |
474 | Returns:
475 | Boolean indicating whether to include this category
476 | """
477 | # Always include these categories as they're highly relevant
478 | high_relevance = ["cs.GR", "cs.HC", "cs.CV", "cs.MM", "cs.SD"]
479 |
480 | if category in high_relevance:
481 | return True
482 |
483 | # If we have a keyword, we need to be less strict to avoid missing papers
484 | if keyword:
485 | return True
486 |
487 | # Medium relevance categories - include for comprehensive searches
488 | medium_relevance = ["cs.AI", "cs.LG", "cs.CL", "cs.RO", "cs.CY"]
489 | return category in medium_relevance
490 |
491 | def main():
492 | parser = argparse.ArgumentParser(description="Find the latest graphic design automation papers.")
493 | parser.add_argument("--days", type=int, default=7, help="Number of days to look back")
494 | parser.add_argument("--output", type=str, help="Output JSON file path (date will be added automatically)")
495 | parser.add_argument("--html", type=str, help="HTML output file path (date will be added automatically)")
496 | parser.add_argument("--categories", type=str, nargs="+", default=DEFAULT_CATEGORIES,
497 | help="arXiv categories to search")
498 | parser.add_argument("--keyword", type=str, help="Additional keyword to filter papers")
499 | parser.add_argument("--analyze", action="store_true", help="Use LLM to perform detailed analysis of papers")
500 | parser.add_argument("--interest", type=str, help="Research interest description for LLM analysis")
501 | parser.add_argument("--model", type=str, default="gpt-3.5-turbo-16k", help="Model to use for analysis")
502 | parser.add_argument("--no-date", action="store_true", help="Disable adding date to filenames")
503 | args = parser.parse_args()
504 |
505 | # Generate date string for filenames
506 | current_date = datetime.datetime.now().strftime("%Y%m%d")
507 |
508 | # Set default filenames with dates if not provided
509 | if args.output is None:
510 | base_filename = "design_papers"
511 | if args.keyword:
512 | # Add keyword to filename if provided
513 | base_filename = f"design_papers_{args.keyword.lower().replace(' ', '_')}"
514 |
515 | if not args.no_date:
516 | args.output = os.path.join(DATA_DIR, f"{base_filename}_{current_date}.json")
517 | else:
518 | args.output = os.path.join(DATA_DIR, f"{base_filename}.json")
519 |
520 | if args.html is None:
521 | base_filename = "design_papers"
522 | if args.keyword:
523 | # Add keyword to filename if provided
524 | base_filename = f"design_papers_{args.keyword.lower().replace(' ', '_')}"
525 |
526 | if not args.no_date:
527 | args.html = os.path.join(DIGEST_DIR, f"{base_filename}_{current_date}.html")
528 | else:
529 | args.html = os.path.join(DIGEST_DIR, f"{base_filename}.html")
530 |
531 | logger.info(f"Looking for design papers in the past {args.days} days")
532 |
533 | # Apply pre-filtering to categories
534 | filtered_categories = [cat for cat in args.categories if pre_filter_category(cat, args.keyword)]
535 | logger.info(f"Pre-filtered categories: {', '.join(filtered_categories)}")
536 |
537 | # Get papers for each category and date
538 | dates = get_date_range(args.days)
539 | all_papers = []
540 |
541 | for category in filtered_categories:
542 | for date_str in dates:
543 | try:
544 | papers = download_papers(category, date_str)
545 | # Apply keyword filter immediately if provided
546 | if args.keyword:
547 | keyword = args.keyword.lower()
548 | papers = [
549 | p for p in papers
550 | if keyword in p.get("title", "").lower() or
551 | keyword in p.get("abstract", "").lower() or
552 | keyword in p.get("subjects", "").lower()
553 | ]
554 | logger.info(f"Found {len(papers)} papers matching keyword '{args.keyword}' in {category}")
555 |
556 | all_papers.extend(papers)
557 | # Avoid hitting arXiv rate limits
558 | time.sleep(5)
559 | except Exception as e:
560 | logger.error(f"Error downloading papers for {category} on {date_str}: {e}")
561 |
562 | # Remove duplicates (papers can appear in multiple categories)
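# e.g. a paper listed under both cs.GR and cs.CV collapses to a single entry keyed by
# its arXiv identifier (the last path segment of its abstract URL).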
563 | unique_papers = {}
564 | for paper in all_papers:
565 | paper_id = paper.get("main_page", "").split("/")[-1]
566 | if paper_id and paper_id not in unique_papers:
567 | unique_papers[paper_id] = paper
568 |
569 | all_papers = list(unique_papers.values())
570 |
571 | # Filter for design automation papers
572 | design_papers = []
573 | for paper in all_papers:
574 | if is_design_automation_paper(paper):
575 | paper["design_category"] = categorize_design_paper(paper)
576 | paper["design_techniques"] = analyze_design_techniques(paper)
577 | design_papers.append(paper)
578 |
579 | # Sort by arXiv ID (newest first), which serves as a proxy for submission date
580 | design_papers.sort(key=lambda p: p.get("main_page", ""), reverse=True)
581 | logger.info(f"Found {len(design_papers)} design automation papers")
582 |
583 | # Add detailed analysis with LLM if requested
584 | if args.analyze and design_papers:
585 | design_papers = analyze_papers_with_llm(design_papers, args.interest, args.model)
586 | logger.info("Completed LLM analysis of papers")
587 |
588 | # Debug: Print out the analysis fields for the first paper
589 | if design_papers:
590 | logger.info(f"Paper analysis fields: {list(design_papers[0].keys())}")
591 | # If 'Key innovations' is present, it confirms we have the detailed analysis
592 | if 'Key innovations' in design_papers[0]:
593 | logger.info("Detailed analysis fields present!")
594 | else:
595 | logger.warning("Detailed analysis fields missing!")
596 |
597 | # Print summary to console
598 | for paper in design_papers[:10]: # Print top 10
599 | print_paper_summary(paper)
600 |
601 | if len(design_papers) > 10:
602 | print(f"...and {len(design_papers) - 10} more papers.")
603 |
604 | # Ensure output directory exists
605 | output_dir = os.path.dirname(args.output)
606 | if output_dir and not os.path.exists(output_dir):
607 | os.makedirs(output_dir, exist_ok=True)
608 |
609 | # Save to file
610 | with open(args.output, "w") as f:
611 | json.dump(design_papers, f, indent=2)
612 |
613 | # Generate HTML report
614 | generate_html_report(design_papers, args.html, args.keyword, args.days)
615 |
616 | logger.info(f"Saved {len(design_papers)} papers to {args.output}")
617 | print(f"\nResults saved to {args.output} and {args.html}")
618 |
619 | if args.analyze:
620 | print("\nPapers have been analyzed with LLM for detailed information.")
621 | print("The HTML report includes comprehensive analysis of each paper.")
622 |
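# Example invocations (script filename assumed; adjust to this module's actual name,
# and use a --model value your OpenAI key can access):
#   python find_design_papers.py --days 3 --keyword "layout"
#   python find_design_papers.py --analyze --interest "AI for graphic design" --model gpt-4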
623 | if __name__ == "__main__":
624 | main()
--------------------------------------------------------------------------------