├── readme_images ├── fork.png ├── UIarxiv.png ├── artifact.png ├── banner.png ├── openai.png ├── secrets.png ├── settings.png ├── trigger.png ├── example_1.png ├── example_2.png ├── hf_example.png ├── main_banner.png ├── example_report.png └── example_custom_1.png ├── src ├── design_finder │ ├── __init__.py │ ├── __main__.py │ └── main.py ├── paths.py ├── relevancy_filter_prompt.txt ├── design │ ├── find_design_papers.sh │ ├── get_design_papers.sh │ ├── README.md │ ├── design_finder.py │ └── find_design_papers.py ├── relevancy_prompt.txt ├── download_new_papers.py ├── fix_parser.py ├── design_papers_crawler.py ├── interpretability_analysis.py ├── gemini_utils.py ├── utils.py ├── design_automation.py ├── action.py ├── anthropic_utils.py └── model_manager.py ├── run.sh ├── requirements.txt ├── find_design_papers.sh ├── .env.template ├── LICENSE ├── config.yaml ├── .github └── workflows │ └── daily_pipeline.yaml ├── .gitignore ├── advanced_usage.md └── README.md /readme_images/fork.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linhkid/ArxivDigest-extra/HEAD/readme_images/fork.png -------------------------------------------------------------------------------- /readme_images/UIarxiv.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linhkid/ArxivDigest-extra/HEAD/readme_images/UIarxiv.png -------------------------------------------------------------------------------- /readme_images/artifact.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linhkid/ArxivDigest-extra/HEAD/readme_images/artifact.png -------------------------------------------------------------------------------- /readme_images/banner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linhkid/ArxivDigest-extra/HEAD/readme_images/banner.png -------------------------------------------------------------------------------- /readme_images/openai.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linhkid/ArxivDigest-extra/HEAD/readme_images/openai.png -------------------------------------------------------------------------------- /readme_images/secrets.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linhkid/ArxivDigest-extra/HEAD/readme_images/secrets.png -------------------------------------------------------------------------------- /readme_images/settings.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linhkid/ArxivDigest-extra/HEAD/readme_images/settings.png -------------------------------------------------------------------------------- /readme_images/trigger.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linhkid/ArxivDigest-extra/HEAD/readme_images/trigger.png -------------------------------------------------------------------------------- /readme_images/example_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linhkid/ArxivDigest-extra/HEAD/readme_images/example_1.png -------------------------------------------------------------------------------- /readme_images/example_2.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/linhkid/ArxivDigest-extra/HEAD/readme_images/example_2.png -------------------------------------------------------------------------------- /readme_images/hf_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linhkid/ArxivDigest-extra/HEAD/readme_images/hf_example.png -------------------------------------------------------------------------------- /src/design_finder/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Design Finder module for finding AI/ML design automation papers on arXiv. 3 | """ -------------------------------------------------------------------------------- /readme_images/main_banner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linhkid/ArxivDigest-extra/HEAD/readme_images/main_banner.png -------------------------------------------------------------------------------- /readme_images/example_report.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linhkid/ArxivDigest-extra/HEAD/readme_images/example_report.png -------------------------------------------------------------------------------- /readme_images/example_custom_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linhkid/ArxivDigest-extra/HEAD/readme_images/example_custom_1.png -------------------------------------------------------------------------------- /src/design_finder/__main__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Entry point for design_finder module. 3 | """ 4 | from .main import main 5 | 6 | if __name__ == "__main__": 7 | main() -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Run the ArxivDigest-extra app using the latest version 3 | echo "Starting ArxivDigest-extra..." 4 | cd "$(dirname "$0")" 5 | python src/app_new.py -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | PyYAML==6.0 2 | beautifulsoup4==4.12.2 3 | numpy==1.24.2 4 | openai>=1.3.0 5 | python-dotenv==1.0.0 6 | pytz==2023.3 7 | sendgrid==6.10.0 8 | tqdm==4.65.0 9 | google-generativeai>=0.3.0 10 | anthropic>=0.8.0 11 | gradio>=3.50.0 -------------------------------------------------------------------------------- /find_design_papers.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Root-level wrapper script for the design papers finder 3 | 4 | # Show deprecation warning 5 | echo "ℹ️ Note: This script is a wrapper for ./src/design/find_design_papers.sh" 6 | echo "ℹ️ Consider using ./src/design/find_design_papers.sh directly for best results" 7 | echo "" 8 | 9 | # Simply forward all arguments to the actual script 10 | ./src/design/find_design_papers.sh "$@" 11 | 12 | # The exit code will propagate from the called script 13 | exit $? 
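The files above (`requirements.txt`, `run.sh`, and the `.env.template` listed in the directory tree) suggest a local quick-start along these lines. This is a sketch only: it assumes Python 3.8+ and a POSIX shell, and that the app reads its keys from a local `.env` file, which is what the template and the `python-dotenv` dependency imply.

```bash
# Create an isolated environment and install the pinned dependencies
python -m venv .venv
source .venv/bin/activate
pip install -r requirements.txt

# Copy the template and fill in your own keys; never commit the resulting .env
cp .env.template .env

# Launch the web UI
./run.sh
```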
-------------------------------------------------------------------------------- /src/paths.py: -------------------------------------------------------------------------------- 1 | """ 2 | Common path definitions for ArxivDigest-extra. 3 | This module provides consistent paths throughout the application. 4 | """ 5 | import os 6 | 7 | # Get the project root directory 8 | ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 9 | 10 | # Define common directories 11 | DATA_DIR = os.path.join(ROOT_DIR, "data") 12 | DIGEST_DIR = os.path.join(ROOT_DIR, "digest") 13 | SRC_DIR = os.path.join(ROOT_DIR, "src") 14 | 15 | # Create directories if they don't exist 16 | for directory in [DATA_DIR, DIGEST_DIR]: 17 | os.makedirs(directory, exist_ok=True) -------------------------------------------------------------------------------- /.env.template: -------------------------------------------------------------------------------- 1 | ## ArxivDigest environment settings 2 | 3 | 4 | ################################################################################################## 5 | # DO NOT COMMIT YOUR API KEYS OR EMAIL ADDRESS TO YOUR REPOSITORY 6 | ################################################################################################## 7 | OPENAI_API_KEY=your_api_key # DO NOT COMMIT ANY FILE WITH THIS KEY SET 8 | 9 | ## EMAIL SETTINGS 10 | SENDGRID_API_KEY=your_api_key # DO NOT COMMIT ANY FILE WITH THIS KEY SET 11 | FROM_EMAIL=your_email # DO NOT COMMIT ANY FILE WITH THIS KEY SET 12 | TO_EMAIL=your_email # DO NOT COMMIT ANY FILE WITH THIS KEY SET 13 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 AutoLLM 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /src/relevancy_filter_prompt.txt: -------------------------------------------------------------------------------- 1 | You are a research assistant with expertise in analyzing academic papers, particularly in AI and machine learning. You've been asked to perform PRELIMINARY SCREENING of arXiv papers based ONLY on their titles and abstracts. 2 | 3 | Your task is to evaluate which papers are worth analyzing in depth based on their potential relevance to the researcher's specific interests.
4 | 5 | For each paper, provide ONLY a relevancy score out of 10, with a higher score indicating greater relevance to the researcher's specific interests. Each paper's score should be accompanied by a brief explanation of why it matches or doesn't match the research interests. 6 | 7 | Papers scoring 7 or higher will undergo detailed analysis with their full content, so be selective. 8 | 9 | VERY IMPORTANT: Respond with a numbered list of valid JSON objects. The format MUST be exactly like this for each paper: 10 | 11 | 1. { 12 | "Relevancy score": 7, 13 | "Reasons for match": "Paper discusses multi-agent systems with focus on coordination mechanisms, which directly aligns with research interests." 14 | } 15 | 16 | 2. { 17 | "Relevancy score": 3, 18 | "Reasons for match": "Mentions agents but focuses on image processing applications, which is not part of the stated research interests." 19 | } 20 | 21 | DO NOT use "```json" code blocks or any other formatting. Just provide numbered JSON objects exactly as shown above. 22 | 23 | My research interests are: -------------------------------------------------------------------------------- /config.yaml: -------------------------------------------------------------------------------- 1 | # For physics topics, use the specific subtopics, e.g. "Astrophysics" 2 | topic: "Computer Science" 3 | # An empty list here will include all categories in a topic 4 | # Use the natural language names of the topics, found here: https://arxiv.org 5 | # Including more categories will result in more calls to the large language model 6 | categories: ["Artificial Intelligence", "Computation and Language", "Machine Learning", "Information Retrieval"] 7 | 8 | # Relevance score threshold. abstracts that receive a score less than this from the large language model 9 | # will have their papers filtered out. 10 | # 11 | # Must be within 1-10 12 | threshold: 2 13 | 14 | # A natural language statement that the large language model will use to judge which papers are relevant 15 | # 16 | # For example: 17 | # "I am interested in complexity theory papers that establish upper bounds" 18 | # "gas chromatography, mass spectrometry" 19 | # "making lots of money" 20 | # 21 | # This can be empty, which just return a full list of papers with no judgement or filtering, 22 | # in whatever order arXiv responds with. 23 | interest: | 24 | 1. AI alignment and AI safety 25 | 2. Mechanistic interpretability and explainable AI 26 | 3. Large language model under pressure 27 | 4. AI Red teaming, deception and misalignment 28 | 5. RAGs, Information retrieval 29 | 6. Optimization of LLM and GenAI 30 | 7. Do not care about specific application, for example, information extraction, summarization, etc. 
31 | -------------------------------------------------------------------------------- /src/design/find_design_papers.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Design papers finder script 3 | # Searches arXiv for design automation papers and generates reports 4 | # For full documentation, see ./README.md 5 | 6 | # Add help/usage function 7 | show_help() { 8 | echo "Usage: ./find_design_papers.sh [OPTIONS]" 9 | echo "" 10 | echo "Options:" 11 | echo " --days N Search papers from the last N days (default: 7)" 12 | echo " --keyword TERM Filter papers containing this keyword" 13 | echo " --analyze Use LLM to perform detailed analysis of papers" 14 | echo " --interest \"TEXT\" Custom research interest description for LLM" 15 | echo " --model MODEL Model to use for analysis (default: gpt-3.5-turbo-16k)" 16 | echo " --no-date Don't add date to output filenames" 17 | echo " --output FILE Custom JSON output path (default: data/design_papers_DATE.json)" 18 | echo " --html FILE Custom HTML output path (default: digest/design_papers_DATE.html)" 19 | echo " --help Show this help message" 20 | echo "" 21 | echo "Examples:" 22 | echo " ./find_design_papers.sh" 23 | echo " ./find_design_papers.sh --keyword \"layout\" --days 14" 24 | echo " ./find_design_papers.sh --analyze --interest \"UI/UX automation\"" 25 | } 26 | 27 | # Show help if requested 28 | if [[ "$1" == "--help" || "$1" == "-h" ]]; then 29 | show_help 30 | exit 0 31 | fi 32 | 33 | # Run the design papers finder with all arguments passed through 34 | python -m src.design.find_design_papers "$@" 35 | 36 | # Show success message 37 | if [ $? -eq 0 ]; then 38 | echo "✓ Design papers finder completed successfully!" 39 | echo " Open the HTML report in your browser to view results" 40 | else 41 | echo "✗ Design papers finder encountered an error" 42 | fi 43 | -------------------------------------------------------------------------------- /src/design/get_design_papers.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Legacy wrapper script for design papers finder - maintained for backward compatibility 3 | # For new scripts, use find_design_papers.sh instead 4 | 5 | # Show deprecation warning 6 | echo "⚠️ Warning: get_design_papers.sh is deprecated and will be removed in a future version" 7 | echo "⚠️ Please use find_design_papers.sh instead, which has more features and better output" 8 | echo "" 9 | 10 | # Default values 11 | DAYS=7 12 | OUTPUT="design_papers.json" 13 | KEYWORD="" 14 | ANALYZE="" 15 | 16 | # Parse command-line arguments 17 | while [[ $# -gt 0 ]]; do 18 | case $1 in 19 | --days) 20 | DAYS="$2" 21 | shift 2 22 | ;; 23 | --output) 24 | OUTPUT="$2" 25 | shift 2 26 | ;; 27 | --keyword) 28 | KEYWORD="$2" 29 | shift 2 30 | ;; 31 | --analyze) 32 | ANALYZE="--analyze" 33 | shift 34 | ;; 35 | --email) 36 | # Ignore email parameter - email functionality is removed 37 | echo "Note: Email functionality has been removed. HTML report will be generated locally only." 38 | shift 2 39 | ;; 40 | *) 41 | echo "Unknown option: $1" 42 | exit 1 43 | ;; 44 | esac 45 | done 46 | 47 | # Run the crawler using the new script 48 | echo "Searching for design papers from the last $DAYS days..." 
49 | 50 | # Build the command 51 | CMD="./src/design/find_design_papers.sh --days $DAYS --output ./data/$OUTPUT --html ./digest/${OUTPUT%.json}.html" 52 | 53 | # Add keyword if specified 54 | if [ -n "$KEYWORD" ]; then 55 | CMD="$CMD --keyword \"$KEYWORD\"" 56 | fi 57 | 58 | # Add analyze if specified 59 | if [ -n "$ANALYZE" ]; then 60 | CMD="$CMD --analyze" 61 | fi 62 | 63 | # Execute the command 64 | eval $CMD 65 | 66 | echo "Done! View your results in ./digest/${OUTPUT%.json}.html" -------------------------------------------------------------------------------- /src/relevancy_prompt.txt: -------------------------------------------------------------------------------- 1 | You are a research assistant with expertise in analyzing academic papers, particularly in AI and machine learning. You've been asked to thoroughly analyze a list of arXiv papers, each with title, authors, abstract, and content. 2 | 3 | For each paper, provide: 4 | 1. A relevancy score out of 10 based on my specific research interests, with a higher score indicating greater relevance. A score of 7 or higher means this paper deserves special attention. 5 | 2. A comprehensive analysis that would help me understand the paper's value and contributions without having to read the entire paper. 6 | 7 | Please maintain the original paper order in your response, with one JSON object per line. Format: 8 | 9 | 1. { 10 | "Relevancy score": "an integer score out of 10", 11 | "Reasons for match": "A detailed paragraph explaining why this paper aligns with my research interests, highlighting specific concepts, methodologies, or findings that match my interests", 12 | "Key innovations": "2-3 bullet points describing the main contributions and what makes this paper novel", 13 | "Critical analysis": "A thoughtful paragraph evaluating the strengths and potential limitations of the approach", 14 | "Goal": "What specific problem or research gap does this paper address?", 15 | "Data": "Detailed description of datasets used, including size, characteristics, and any novel data processing techniques", 16 | "Methodology": "Comprehensive explanation of the methods, algorithms, and technical approach", 17 | "Implementation details": "Information about model architecture, hyperparameters, training procedures, and computational requirements", 18 | "Git": "Link to code repository if available, or note if code is not yet released", 19 | "Experiments & Results": "Analysis of experimental setup, key results, and how they compare to prior work or baselines", 20 | "Discussion & Next steps": "The authors' own conclusions, limitations they identified, and future research directions", 21 | "Related work": "How this paper relates to similar recent papers in the field", 22 | "Practical applications": "How the findings could be applied in real-world scenarios", 23 | "Key takeaways": "3-5 bullet points summarizing the most important insights from this paper" 24 | } 25 | 26 | My research interests are: AI Alignment, AI safety, Mechanistic Interpretability, Explainable AI, RAGs, Information Retrieval, Large Language Models, Multimodal Learning, Generative AI, Optimization in LLM, Model Efficiency, Fine-tuning Techniques, Prompt Engineering, and AI Evaluation Metrics. 
-------------------------------------------------------------------------------- /.github/workflows/daily_pipeline.yaml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a single version of Python 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Daily pipeline 5 | 6 | on: 7 | workflow_dispatch: {} 8 | schedule: 9 | # * is a special character in YAML so you have to quote this string 10 | # Feel free to change this cron schedule 11 | # Currently its scheduled for 1:25 pm UTC, Sun-Thurs 12 | - cron: '25 13 * * 0-4' 13 | 14 | jobs: 15 | generate_and_send_digest: 16 | runs-on: ubuntu-latest 17 | steps: 18 | - uses: actions/checkout@v2 19 | - name: Set up Python 3.8 20 | uses: actions/setup-python@v2 21 | with: 22 | python-version: 3.8 23 | - name: Install dependencies 24 | run: | 25 | python -m pip install --upgrade pip 26 | pip install -r requirements.txt 27 | - name: Generate Digest 28 | run: | 29 | python src/action.py 30 | env: 31 | OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} 32 | SENDGRID_API_KEY: ${{ secrets.SENDGRID_API_KEY }} 33 | FROM_EMAIL: ${{ secrets.FROM_EMAIL }} 34 | TO_EMAIL: ${{ secrets.TO_EMAIL }} 35 | - name: Upload Artifact 36 | uses: actions/upload-artifact@v3 37 | with: 38 | name: digest.html 39 | path: digest.html 40 | - name: check 41 | id: check 42 | env: 43 | SENDGRID_API_KEY: ${{ secrets.SENDGRID_API_KEY }} 44 | MAIL_USERNAME: ${{ secrets.MAIL_USERNAME }} 45 | MAIL_PASSWORD: ${{ secrets.MAIL_PASSWORD }} 46 | MAIL_CONNECTION: ${{ secrets.MAIL_CONNECTION }} 47 | if: "${{ env.SENDGRID_API_KEY == '' && (env.MAIL_CONNECTION || env.MAIL_USERNAME != '' && env.MAIL_PASSWORD != '') }}" 48 | run: echo "DEFINED=true" >> $GITHUB_OUTPUT 49 | - name: Send mail 50 | uses: dawidd6/action-send-mail@v3 51 | env: 52 | DEFINED: ${{ steps.check.outputs.DEFINED }} 53 | if: ${{ env.DEFINED == 'true' }} 54 | with: 55 | # Specify connection via URL (replaces server_address, server_port, secure, 56 | # username and password) 57 | # 58 | # Format: 59 | # 60 | # * smtp://user:password@server:port 61 | # * smtp+starttls://user:password@server:port 62 | connection_url: ${{secrets.MAIL_CONNECTION}} 63 | # Required mail server address if not connection_url: 64 | server_address: smtp.gmail.com 65 | # Server port, default 25: 66 | server_port: 465 67 | username: ${{secrets.MAIL_USERNAME}} 68 | password: ${{secrets.MAIL_PASSWORD}} 69 | secure: true 70 | subject: Personalized arXiv Digest 71 | to: ${{ secrets.TO_EMAIL }} 72 | from: "Personalized arxiv digest" 73 | html_body: file://digest.html 74 | ignore_cert: true 75 | convert_markdown: true 76 | priority: normal 77 | -------------------------------------------------------------------------------- /src/design/README.md: -------------------------------------------------------------------------------- 1 | # 🎨 Design Paper Discovery 2 | 3 | This module specializes in finding and analyzing papers related to AI/ML for design automation. It crawls arXiv for design-related papers and provides detailed reports on recent research at the intersection of AI and design. 
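The shell entry point is a thin wrapper: `find_design_papers.sh` simply forwards its arguments to a Python module, so an equivalent direct invocation (assuming you run it from the project root, with the module layout used in this repository) is:

```bash
# Same effect as ./src/design/find_design_papers.sh --days 14 --keyword "layout"
python -m src.design.find_design_papers --days 14 --keyword "layout"
```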
4 | 5 | ## Features 6 | 7 | - **Smart Paper Finding**: Automatically finds papers related to design automation and creative AI 8 | - **Multi-Category Search**: Searches across Computer Vision, Graphics, HCI, and other relevant arXiv categories 9 | - **Intelligent Categorization**: Sorts papers into design subcategories (UI/UX, Layout, Graphic Design, etc.) 10 | - **Technique Analysis**: Identifies AI techniques used (GANs, Diffusion Models, LLMs, etc.) 11 | - **LLM-Powered Analysis**: Optional in-depth analysis using OpenAI, Gemini, or Claude models 12 | - **HTML Reports**: Generates clean, organized HTML reports with paper statistics and details 13 | - **JSON Export**: Saves all paper data in structured JSON format for further processing 14 | 15 | ## Quick Start 16 | 17 | Run the main script from the project root directory: 18 | 19 | ```bash 20 | # Basic usage - find design papers from the last 7 days 21 | ./src/design/find_design_papers.sh 22 | 23 | # With keyword filtering - find design papers about layout generation 24 | ./src/design/find_design_papers.sh --keyword "layout" 25 | 26 | # With longer timeframe - find design papers from the last month 27 | ./src/design/find_design_papers.sh --days 30 28 | ``` 29 | 30 | ## Advanced Usage 31 | 32 | ```bash 33 | # With LLM analysis for comprehensive paper details 34 | ./src/design/find_design_papers.sh --analyze 35 | 36 | # Customize research interests for analysis 37 | ./src/design/find_design_papers.sh --analyze --interest "I'm looking for papers on UI/UX automation and layout generation with neural networks" 38 | 39 | # Change the model used for analysis 40 | ./src/design/find_design_papers.sh --analyze --model "gpt-4o" 41 | 42 | # Combined example with all major features 43 | ./src/design/find_design_papers.sh --days 14 --keyword "diffusion" --analyze --model "gpt-4o" --interest "I'm researching diffusion models for design applications" 44 | 45 | # Output files include the current date by default: 46 | # - data/design_papers_diffusion_20250406.json 47 | # - digest/design_papers_diffusion_20250406.html 48 | 49 | # Disable date in filenames if needed 50 | ./src/design/find_design_papers.sh --keyword "layout" --no-date 51 | ``` 52 | 53 | ## Parameters Reference 54 | 55 | | Parameter | Description | Default | 56 | |-----------|-------------|---------| 57 | | `--days N` | Number of days to search back | 7 | 58 | | `--keyword TERM` | Filter papers containing this keyword | none | 59 | | `--analyze` | Use LLM to perform detailed analysis | false | 60 | | `--interest "TEXT"` | Custom research interest for LLM | Design automation focus | 61 | | `--model MODEL` | Model to use for analysis | gpt-3.5-turbo-16k | 62 | | `--no-date` | Don't add date to output filenames | false | 63 | | `--output FILE` | Custom JSON output path | data/design_papers_DATE.json | 64 | | `--html FILE` | Custom HTML output path | digest/design_papers_DATE.html | 65 | | `--help` | Show help message | | 66 | 67 | ## Implementation Details 68 | 69 | The design paper discovery consists of these main components: 70 | 71 | 1. **find_design_papers.sh**: Main shell script interface with help and options 72 | 2. **find_design_papers.py**: Core Python implementation for arXiv discovery and analysis 73 | 3. **design_finder.py**: Alternative implementation with minimal dependencies 74 | 4. 
**get_design_papers.sh**: Legacy script (maintained for backward compatibility) 75 | 76 | ## Example Output 77 | 78 | The HTML report includes: 79 | - Summary statistics and paper counts by category and technique 80 | - Detailed paper listings with titles, authors, and abstracts 81 | - AI analysis sections when using the `--analyze` flag 82 | - Links to arXiv pages and PDF downloads 83 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | 162 | # vim 163 | *.sw* 164 | -------------------------------------------------------------------------------- /src/download_new_papers.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | import os 3 | import re 4 | from urllib.error import HTTPError 5 | 6 | import tqdm 7 | from bs4 import BeautifulSoup as bs 8 | import urllib.request 9 | import json 10 | import datetime 11 | import pytz 12 | 13 | # Import standardized paths 14 | from paths import DATA_DIR 15 | 16 | #Linh - add new def crawl_html_version(html_link) here 17 | def crawl_html_version(html_link): 18 | main_content = [] 19 | try: 20 | # Add user-agent header to appear more like a browser 21 | headers = { 22 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36' 23 | } 24 | req = urllib.request.Request(html_link, headers=headers) 25 | html = urllib.request.urlopen(req) 26 | except HTTPError as e: 27 | return f"Error accessing HTML: {str(e)}" 28 | 29 | soup = bs(html) 30 | content = soup.find('div', attrs={'class': 'ltx_page_content'}) 31 | if not content: 32 | return "Content not available in HTML format" 33 | para_list = content.find_all("div", attrs={'class': 'ltx_para'}) 34 | 35 | for each in para_list: 36 | main_content.append(each.text.strip()) 37 | return ' '.join(main_content)[:10000] 38 | #if len(main_content >) 39 | #return ''.join(main_content) if len(main_content) < 20000 else ''.join(main_content[:20000]) 40 | 41 | #Linh - add because cs sub does not have abstract displayed, will revert if it comes back 42 | def crawl_abstract(html_link): 43 | main_content = [] 44 | try: 45 | html = urllib.request.urlopen(html_link) 46 | except HTTPError as e: 47 | return ["None"] 48 | soup = bs(html) 49 | content = soup.find('blockquote', attrs={'class': 'abstract'}).text.replace("Abstract:", "").strip() 50 | return content 51 | def _download_new_papers(field_abbr): 52 | NEW_SUB_URL = f'https://arxiv.org/list/{field_abbr}/new' # https://arxiv.org/list/cs/new 53 | print(NEW_SUB_URL) 54 | # Add user-agent header to appear more like a browser 55 | headers = { 56 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36' 57 | } 58 | req = urllib.request.Request(NEW_SUB_URL, headers=headers) 
59 | page = urllib.request.urlopen(req) 60 | 61 | soup = bs(page) 62 | content = soup.body.find("div", {'id': 'content'}) 63 | 64 | # find the first h3 element in content 65 | h3 = content.find("h3").text # e.g: New submissions for Wed, 10 May 23 66 | date = h3.replace("New submissions for", "").strip() 67 | 68 | dt_list = content.dl.find_all("dt") 69 | dd_list = content.dl.find_all("dd") 70 | arxiv_base = "https://arxiv.org/abs/" 71 | arxiv_html = "https://arxiv.org/html/" 72 | 73 | assert len(dt_list) == len(dd_list) 74 | new_paper_list = [] 75 | for i in tqdm.tqdm(range(len(dt_list))): 76 | paper = {} 77 | ahref = dt_list[i].find('a', href = re.compile(r'[/]([a-z]|[A-Z])\w+')).attrs['href'] 78 | paper_number = ahref.strip().replace("/abs/", "") 79 | 80 | paper['main_page'] = arxiv_base + paper_number 81 | paper['pdf'] = arxiv_base.replace('abs', 'pdf') + paper_number 82 | 83 | paper['title'] = dd_list[i].find("div", {"class": "list-title mathjax"}).text.replace("Title:\n", "").strip() 84 | paper['authors'] = dd_list[i].find("div", {"class": "list-authors"}).text \ 85 | .replace("Authors:\n", "").replace("\n", "").strip() 86 | paper['subjects'] = dd_list[i].find("div", {"class": "list-subjects"}).text.replace("Subjects:\n", "").strip() 87 | #print(dd_list[i].find("div", {"class": "list-subjects"}).text.replace("Subjects:\n", "").strip()) 88 | 89 | #TODO: edit the abstract part - it is currently moved 90 | paper['abstract'] = dd_list[i].find("p", {"class": "mathjax"}).text.replace("\n", " ").strip() 91 | try: 92 | paper['content'] = crawl_html_version(arxiv_html + paper_number + "v1") 93 | except Exception as e: 94 | paper['content'] = f"Error fetching content: {str(e)}" 95 | new_paper_list.append(paper) 96 | 97 | 98 | # DATA_DIR is already created by paths.py 99 | 100 | # save new_paper_list to a jsonl file, with each line as the element of a dictionary 101 | date = datetime.date.fromtimestamp(datetime.datetime.now(tz=pytz.timezone("America/New_York")).timestamp()) 102 | date = date.strftime("%a, %d %b %y") 103 | file_path = os.path.join(DATA_DIR, f"{field_abbr}_{date}.jsonl") 104 | with open(file_path, "w") as f: 105 | for paper in new_paper_list: 106 | f.write(json.dumps(paper) + "\n") 107 | 108 | 109 | def get_papers(field_abbr, limit=None): 110 | date = datetime.date.fromtimestamp(datetime.datetime.now(tz=pytz.timezone("America/New_York")).timestamp()) 111 | date = date.strftime("%a, %d %b %y") 112 | file_path = os.path.join(DATA_DIR, f"{field_abbr}_{date}.jsonl") 113 | if not os.path.exists(file_path): 114 | _download_new_papers(field_abbr) 115 | results = [] 116 | with open(file_path, "r") as f: 117 | for i, line in enumerate(f.readlines()): 118 | if limit and i == limit: 119 | return results 120 | results.append(json.loads(line)) 121 | return results 122 | 123 | #crawl_html_version("https://arxiv.org/html/2404.11972v1") 124 | -------------------------------------------------------------------------------- /advanced_usage.md: -------------------------------------------------------------------------------- 1 | # Advanced Usage 2 | 3 | ## Step-by-step instructions for running as a Github action 4 | 5 | ### Fork the repository 6 | 7 | Click the fork button at the top of this repository page, as seen on the below image. 
This will create your own version of the repository, including your own set of GitHub Actions. 8 | 9 | ![fork](./readme_images/fork.png) 10 | 11 | ### Modify the configuration file 12 | 13 | Modify `config.yaml` by cloning the repository and merging your changes. 14 | 15 | ### Create and Fetch your API Keys 16 | 17 | - Create or fetch your API key for [OpenAI](https://platform.openai.com/account/api-keys). Note: you will need an OpenAI account. 18 | ![fork](./readme_images/openai.png) 19 | 20 | - Create or fetch your API key for [SendGrid](https://app.SendGrid.com/settings/api_keys). You will need a SendGrid account. The free tier will generally suffice. Make sure to [verify your sender identity](https://docs.sendgrid.com/for-developers/sending-email/sender-identity). 21 | - Sign up for [SendGrid](https://app.sendgrid.com). Fill in the necessary information, including email, password, and a company name. If you don't have a company, you can use a made-up name. 22 | - You'll need to verify your email address to activate your account. 23 | - On your main dashboard, access the Integration Guide under Email API. 24 | - Next, on the "Integrate using our Web API or SMTP Relay" page, choose the "Web API" option. 25 | - Choose the language you're planning to use; in this case, select "Python". 26 | - You'll be prompted to provide a name for your API key. Enter a name and click "Create Key". 27 | - Copy the API key that appears for the next step below. You won't be able to view the full key again. 28 | 29 | ### Set the secrets for the GitHub Action 30 | 31 | Go to the Settings tab at the top of this page, and then the "Actions" menu under "Secrets and variables": 32 | 33 | ![settings](./readme_images/settings.png) 34 | 35 | Create a new repository secret for each of the following using the button shown in the image below: 36 | - `OPENAI_API_KEY` 37 | - `SENDGRID_API_KEY` 38 | - `FROM_EMAIL` 39 | - `TO_EMAIL` 40 | 41 | ![secret](./readme_images/secrets.png) 42 | 43 | ### Manually trigger the action, or wait until the scheduled trigger 44 | 45 | Go to the Actions tab, and then click on "Daily pipeline" and "Run workflow" 46 | 47 | ![trigger](./readme_images/trigger.png) 48 | 49 | ## Additional Configuration 50 | 51 | - If you want a different schedule than Sunday through Thursday at 1:25PM UTC, then modify the file `.github/workflows/daily_pipeline.yaml` 52 | 53 | 54 | ## Alternative Usage 55 | 56 | Running `src/action.py` will generate an HTML file that can then be emailed. The following alternative usage methods all use that pattern. 57 | 58 | ### Running as a GitHub Action with SMTP credentials 59 | 60 | An alternative way to get started using this repository is to: 61 | 62 | 1. Fork the repository 63 | 2. Modify `config.yaml` and merge the changes into your main branch. If you want a different schedule than Sunday through Thursday at 1:25PM UTC, then also modify the file `.github/workflows/daily_pipeline.yaml` 64 | 3. Create or fetch your API key for [OpenAI](https://platform.openai.com/account/api-keys). 65 | 4. Find your email provider's SMTP settings and set the secret `MAIL_CONNECTION` to that. It should be in the form `smtp://user:password@server:port` or `smtp+starttls://user:password@server:port`. Alternatively, if you are using Gmail, you can set `MAIL_USERNAME` and `MAIL_PASSWORD` instead, using an [application password](https://support.google.com/accounts/answer/185833). 66 | 5.
Set the following secrets [(under settings, Secrets and variables, repository secrets)](https://docs.github.com/en/actions/security-guides/encrypted-secrets#creating-encrypted-secrets-for-a-repository): 67 | - `OPENAI_API_KEY` 68 | - `MAIL_CONNECTION` (see above) 69 | - `MAIL_PASSWORD` (only if you don't have `MAIL_CONNECTION` set) 70 | - `MAIL_USERNAME` (only if you don't have `MAIL_CONNECTION` set) 71 | - `FROM_EMAIL` 72 | - `TO_EMAIL` 73 | 6. Manually trigger the action or wait until the scheduled action takes place. 74 | 75 | ### Running as a GitHub Action without emails 76 | 77 | If you do not wish to create a SendGrid account or use your email authentication, the action will also emit an artifact containing the HTML output. Simply do not create the SendGrid or SMTP secrets. 78 | 79 | You can access this digest as part of the GitHub Action artifact. 80 | 81 | ![artifact](./readme_images/artifact.png) 82 | 83 | ### Running from the command line 84 | 85 | If you do not wish to fork this repository, and would prefer to clone and run it locally instead: 86 | 87 | 1. Install the requirements in `requirements.txt` 88 | 2. Modify the configuration file `config.yaml` 89 | 3. Create or fetch your API key for [OpenAI](https://platform.openai.com/account/api-keys). 90 | 4. Create or fetch your API key for [SendGrid](https://app.SendGrid.com/settings/api_keys) (optional, if you want the script to email you) 91 | 5. Set the following secrets as environment variables: 92 | - `OPENAI_API_KEY` 93 | - `SENDGRID_API_KEY` (only if using SendGrid) 94 | - `FROM_EMAIL` (only if using SendGrid. Note that this value must match the email you used to create the SendGrid API key.) 95 | - `TO_EMAIL` (only if using SendGrid) 96 | 6. Run `python src/action.py`. 97 | 7. If you are not using SendGrid, the HTML of the digest will be written to `digest.html`. You can then use your favorite web browser to view it. 98 | 99 | You may want to use something like crontab to schedule the digest. 100 | -------------------------------------------------------------------------------- /src/fix_parser.py: -------------------------------------------------------------------------------- 1 | """ 2 | A script to fix and test the OpenAI response parsing. 3 | """ 4 | import json 5 | import re 6 | import os 7 | 8 | def is_valid_json(text): 9 | try: 10 | json.loads(text) 11 | return True 12 | except json.JSONDecodeError: 13 | return False 14 | 15 | def extract_json_from_string(text): 16 | """ 17 | Attempt to extract JSON from a string by finding '{'...'}' 18 | """ 19 | # Find the outermost JSON object 20 | stack = [] 21 | start_idx = -1 22 | 23 | for i, char in enumerate(text): 24 | if char == '{' and start_idx == -1: 25 | start_idx = i 26 | stack.append(char) 27 | elif char == '{': 28 | stack.append(char) 29 | elif char == '}' and stack: 30 | stack.pop() 31 | if not stack and start_idx != -1: 32 | # Found complete JSON object 33 | json_str = text[start_idx:i+1] 34 | try: 35 | parsed = json.loads(json_str) 36 | return parsed 37 | except json.JSONDecodeError: 38 | # If this one fails, continue looking 39 | start_idx = -1 40 | 41 | return None 42 | 43 | def fix_openai_response(response_text): 44 | """ 45 | Fix the OpenAI response by handling different formats and parsing the JSON. 46 | Returns a list of dictionaries with paper analysis.
47 | """ 48 | # First, try to parse the entire response as JSON 49 | cleaned_text = response_text.strip() 50 | 51 | # Try to extract JSON directly 52 | if '{' in cleaned_text and '}' in cleaned_text: 53 | json_obj = extract_json_from_string(cleaned_text) 54 | if json_obj and "Relevancy score" in json_obj: 55 | print(f"Successfully extracted JSON with score {json_obj['Relevancy score']}") 56 | return [json_obj] 57 | 58 | return [] 59 | 60 | # Example usage 61 | if __name__ == "__main__": 62 | example_response = """ 63 | "Relevancy score": 7, 64 | "Reasons for match": "This paper aligns with your research interests as it explores the application of Large Language Models (LLMs) in the context of hardware design. It introduces a unified framework, Marco, that integrates configurable graph-based task solving with multi-modality and multi-AI agents for chip design. This is relevant to your interests in AI Alignment, AI safety, Large Language Models, and Multimodal Learning.", 65 | "Key innovations": [ 66 | "Introduction of Marco, a unified framework that integrates configurable graph-based task solving with multi-modality and multi-AI agents for chip design.", 67 | "Demonstration of promising performance, productivity, and efficiency of LLM agents by leveraging the Marco framework on layout optimization, Verilog/design rule checker (DRC) coding, and timing analysis tasks." 68 | ], 69 | "Critical analysis": "The paper presents a novel approach to leveraging LLMs in the field of hardware design, which could have significant implications for improving efficiency and reducing costs. However, without access to the full paper, it's difficult to assess the strengths and potential limitations of the approach.", 70 | "Goal": "The paper addresses the challenge of optimizing performance, power, area, and cost (PPAC) during synthesis, verification, physical design, and reliability loops in hardware design. It aims to reduce turn-around-time (TAT) for these processes by leveraging the capabilities of LLMs.", 71 | "Data": "Unable to provide details about the datasets used due to lack of access to the full paper content.", 72 | "Methodology": "The paper proposes a unified framework, Marco, that integrates configurable graph-based task solving with multi-modality and multi-AI agents for chip design. However, detailed methodology is not available due to lack of access to the full paper content.", 73 | "Implementation details": "Unable to provide implementation details due to lack of access to the full paper content.", 74 | "Git": "Link to code repository is not provided in the abstract.", 75 | "Experiments & Results": "The abstract mentions that the Marco framework demonstrates promising performance on layout optimization, Verilog/design rule checker (DRC) coding, and timing analysis tasks. 
However, detailed results and comparisons are not available due to lack of access to the full paper content.", 76 | "Discussion & Next steps": "Unable to provide details on the authors' conclusions, identified limitations, and future research directions due to lack of access to the full paper content.", 77 | "Related work": "Unable to provide details on how this paper relates to similar recent papers in the field due to lack of access to the full paper content.", 78 | "Practical applications": "The framework proposed in this paper could have practical applications in the field of hardware design, potentially leading to faster product cycles, lower costs, improved design reliability and reduced risk of costly errors.", 79 | "Key takeaways": [ 80 | "The paper proposes a unified framework, Marco, that integrates configurable graph-based task solving with multi-modality and multi-AI agents for chip design.", 81 | "The Marco framework leverages the capabilities of Large Language Models (LLMs) to improve efficiency and reduce costs in hardware design.", 82 | "The framework demonstrates promising performance on layout optimization, Verilog/design rule checker (DRC) coding, and timing analysis tasks." 83 | ] 84 | } 85 | """ 86 | 87 | # Test the fix 88 | results = fix_openai_response(example_response) 89 | print(f"Found {len(results)} paper analyses") 90 | for i, result in enumerate(results): 91 | print(f"Paper {i+1} score: {result.get('Relevancy score', 'Not found')}") -------------------------------------------------------------------------------- /src/design_papers_crawler.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Design Papers Crawler - A dedicated script to find the latest papers 4 | on graphic design automation using AI/ML/LLM technologies. 5 | 6 | Usage: 7 | python design_papers_crawler.py [--days 7] [--output design_papers.json] 8 | """ 9 | 10 | import os 11 | import sys 12 | import json 13 | import argparse 14 | import datetime 15 | import logging 16 | from typing import List, Dict, Any 17 | 18 | # Add parent directory to path to import from sibling modules 19 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 20 | 21 | from src.download_new_papers import get_papers, _download_new_papers 22 | from src.design_automation import ( 23 | is_design_automation_paper, 24 | categorize_design_paper, 25 | analyze_design_techniques, 26 | extract_design_metrics 27 | ) 28 | from src.paths import DATA_DIR, DIGEST_DIR 29 | 30 | # Configure logging 31 | logging.basicConfig( 32 | level=logging.INFO, 33 | format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' 34 | ) 35 | logger = logging.getLogger(__name__) 36 | 37 | # Default arXiv categories to search 38 | DEFAULT_CATEGORIES = [ 39 | "cs.CV", # Computer Vision 40 | "cs.GR", # Graphics 41 | "cs.HC", # Human-Computer Interaction 42 | "cs.AI", # Artificial Intelligence 43 | "cs.LG", # Machine Learning 44 | "cs.CL", # Computation and Language (NLP) 45 | "cs.MM", # Multimedia 46 | "cs.SD", # Sound 47 | "cs.RO", # Robotics (for interactive design) 48 | "cs.CY" # Computers and Society 49 | ] 50 | 51 | def get_date_range(days_back: int = 7) -> List[str]: 52 | """ 53 | Get a list of dates for the past N days in arXiv format. 
54 | 55 | Args: 56 | days_back: Number of days to look back 57 | 58 | Returns: 59 | List of date strings in arXiv format 60 | """ 61 | today = datetime.datetime.now() 62 | dates = [] 63 | 64 | for i in range(days_back): 65 | date = today - datetime.timedelta(days=i) 66 | date_str = date.strftime("%a, %d %b %y") 67 | dates.append(date_str) 68 | 69 | return dates 70 | 71 | def ensure_data_files(categories: List[str], days_back: int = 7) -> None: 72 | """ 73 | Make sure data files exist for the specified categories and date range. 74 | 75 | Args: 76 | categories: List of arXiv category codes 77 | days_back: Number of days to look back 78 | """ 79 | dates = get_date_range(days_back) 80 | 81 | for category in categories: 82 | for date_str in dates: 83 | file_path = os.path.join(DATA_DIR, f"{category}_{date_str}.jsonl") 84 | 85 | if not os.path.exists(file_path): 86 | logger.info(f"Downloading papers for {category} on {date_str}") 87 | try: 88 | _download_new_papers(category) 89 | except Exception as e: 90 | logger.error(f"Error downloading {category} papers for {date_str}: {e}") 91 | 92 | def get_design_papers(categories: List[str], days_back: int = 7) -> List[Dict[str, Any]]: 93 | """ 94 | Get design automation papers from specified categories over a date range. 95 | 96 | Args: 97 | categories: List of arXiv category codes 98 | days_back: Number of days to look back 99 | 100 | Returns: 101 | List of design automation papers 102 | """ 103 | # Ensure data files exist 104 | ensure_data_files(categories, days_back) 105 | 106 | # Collect papers 107 | all_papers = [] 108 | dates = get_date_range(days_back) 109 | 110 | for category in categories: 111 | for date_str in dates: 112 | try: 113 | papers = get_papers(category) 114 | all_papers.extend(papers) 115 | except Exception as e: 116 | logger.warning(f"Could not get papers for {category} on {date_str}: {e}") 117 | 118 | # Remove duplicates (papers can appear in multiple categories) 119 | unique_papers = {} 120 | for paper in all_papers: 121 | paper_id = paper.get("main_page", "").split("/")[-1] 122 | if paper_id and paper_id not in unique_papers: 123 | unique_papers[paper_id] = paper 124 | 125 | # Filter design automation papers 126 | design_papers = [] 127 | for paper_id, paper in unique_papers.items(): 128 | if is_design_automation_paper(paper): 129 | paper["paper_id"] = paper_id 130 | paper["design_category"] = categorize_design_paper(paper) 131 | paper["design_techniques"] = analyze_design_techniques(paper) 132 | paper["design_metrics"] = extract_design_metrics(paper) 133 | design_papers.append(paper) 134 | 135 | # Sort by date (newest first) 136 | design_papers.sort(key=lambda p: p.get("main_page", ""), reverse=True) 137 | 138 | return design_papers 139 | 140 | def print_paper_summary(paper: Dict[str, Any]) -> None: 141 | """ 142 | Print a nice summary of a paper to the console. 
143 | 144 | Args: 145 | paper: Paper dictionary 146 | """ 147 | print(f"\n{'=' * 80}") 148 | print(f"TITLE: {paper.get('title', 'No title')}") 149 | print(f"AUTHORS: {paper.get('authors', 'No authors')}") 150 | print(f"URL: {paper.get('main_page', 'No URL')}") 151 | print(f"DESIGN CATEGORY: {paper.get('design_category', 'Unknown')}") 152 | print(f"TECHNIQUES: {', '.join(paper.get('design_techniques', []))}") 153 | print(f"METRICS: {', '.join(paper.get('design_metrics', []))}") 154 | print(f"\nABSTRACT: {paper.get('abstract', 'No abstract')[:500]}...") 155 | print(f"{'=' * 80}\n") 156 | 157 | def main(): 158 | """Main function to run the design papers crawler.""" 159 | parser = argparse.ArgumentParser(description="Find the latest graphic design automation papers.") 160 | parser.add_argument("--days", type=int, default=7, help="Number of days to look back") 161 | parser.add_argument("--output", type=str, default="design_papers.json", help="Output file path") 162 | parser.add_argument("--categories", type=str, nargs="+", default=DEFAULT_CATEGORIES, 163 | help="arXiv categories to search") 164 | args = parser.parse_args() 165 | 166 | logger.info(f"Looking for design papers in the past {args.days} days") 167 | logger.info(f"Searching categories: {', '.join(args.categories)}") 168 | 169 | # DATA_DIR is already created by paths.py 170 | 171 | # Get design papers 172 | design_papers = get_design_papers(args.categories, args.days) 173 | 174 | logger.info(f"Found {len(design_papers)} design automation papers") 175 | 176 | # Print summary to console 177 | for paper in design_papers[:10]: # Print top 10 178 | print_paper_summary(paper) 179 | 180 | if len(design_papers) > 10: 181 | print(f"...and {len(design_papers) - 10} more papers.") 182 | 183 | # Determine output path - ensure it's in DATA_DIR 184 | output_path = os.path.join(DATA_DIR, args.output) 185 | 186 | # Save to file 187 | with open(output_path, "w") as f: 188 | json.dump(design_papers, f, indent=2) 189 | 190 | logger.info(f"Saved {len(design_papers)} papers to {output_path}") 191 | print(f"\nResults saved to {output_path}") 192 | 193 | if __name__ == "__main__": 194 | main() -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | 3 | # ArXiv Digest (Enhanced Edition) 4 | 5 | **Personalized arXiv Paper Recommendations with Multiple AI Models** 6 | 7 | This repository provides an enhanced daily digest for newly published arXiv papers based on your research interests, leveraging multiple AI models including OpenAI GPT, Google Gemini, and Anthropic Claude to provide relevancy ratings, detailed analysis, and topic clustering. 8 | 9 | ## 📚 Contents 10 | 11 | - [Features](#-features) 12 | - [Quick Start](#-quick-start) 13 | - [What This Repo Does](#-what-this-repo-does) 14 | - [Model Integrations](#-model-integrations) 15 | - [Design Paper Discovery](#-design-paper-discovery) 16 | - [Output Formats](#-output-formats) 17 | - [Setting Up and Usage](#-setting-up-and-usage) 18 | * [Configuration](#configuration) 19 | * [Running the Web Interface](#running-the-web-interface) 20 | * [Running via GitHub Action](#running-via-github-action) 21 | * [Running from Command Line](#running-from-command-line) 22 | - [API Usage Notes](#-api-usage-notes) 23 | - [Directory Structure](#-directory-structure) 24 | - [Roadmap](#-roadmap) 25 | - [Contributing](#-contributing) 26 | 27 | ## ✨ Features 28 | 29 | - **Multi-Model Integration**: Support for OpenAI, Gemini, and Claude models for paper analysis 30 | - **Latest Models**: Support for GPT-4o, GPT-4o mini, Claude 3.5, and other current models 31 | - **Two-Stage Processing**: Efficient paper analysis with quick filtering followed by detailed analysis 32 | - **Enhanced Analysis**: Detailed paper breakdowns including key innovations, critical analysis, and practical applications 33 | - **HTML Report Generation**: Clean, organized reports saved with date-based filenames 34 | - **Adjustable Relevancy Threshold**: Interactive slider for filtering papers by relevance score 35 | - **Design Automation Backend**: Specialized tools for analyzing design-related papers 36 | - **Topic Clustering**: Group similar papers using AI-powered clustering (Gemini) 37 | - **Robust JSON Parsing**: Reliable extraction of analysis results from LLM responses 38 | - **Standardized Directory Structure**: Organized codebase with `/src`, `/data`, and `/digest` directories 39 | - **Improved Web UI**: Clean Gradio interface with dynamic topic selection and error handling 40 | 41 | ![](./readme_images/UIarxiv.png) 42 | 43 | ## 🚀 Quick Start 44 | 45 | Try it out on [Hugging Face](https://huggingface.co/spaces/linhkid91/ArxivDigest-extra) using your own API keys. 46 | 47 | ## 🔍 What This Repo Does 48 | 49 | Staying up to date on [arXiv](https://arxiv.org) papers is time-consuming, with hundreds of new papers published daily. Even with the [official daily digest service](https://info.arxiv.org/help/subscribe.html), categories like [cs.AI](https://arxiv.org/list/cs.AI/recent) still contain 50-100 papers per day. 50 | 51 | This repository creates a personalized daily digest by: 52 | 53 | 1. **Crawling arXiv** for recent papers in your areas of interest 54 | 2. **Analyzing papers** in-depth using AI models (OpenAI, Gemini, or Claude) 55 | 3. **Two-stage processing** for efficiency: 56 | - Stage 1: Quick relevancy filtering using only title and abstract 57 | - Stage 2: Detailed analysis of papers that meet the relevancy threshold 58 | 4. **Scoring relevance** on a scale of 1-10 based on your research interests 59 | 5. **Providing detailed analysis** of each paper, including: 60 | - Key innovations 61 | - Critical analysis 62 | - Implementation details 63 | - Practical applications 64 | - Related work 65 | 6. 
**Generating reports** in HTML format with clean organization 66 | 67 | ## 🤖 Model Integrations 68 | 69 | The system supports three major AI providers: 70 | 71 | - **OpenAI GPT** (gpt-3.5-turbo-16k, gpt-4, gpt-4-turbo, gpt-4o, gpt-4o-mini) 72 | - **Google Gemini** (gemini-1.5-flash, gemini-1.5-pro, gemini-2.0-flash) 73 | - **Anthropic Claude** (claude-3-haiku, claude-3-sonnet, claude-3-opus, claude-3.5-sonnet) 74 | 75 | You can use any combination of these models, allowing you to compare results or choose based on your needs. 76 | 77 | ## 📊 Output Formats 78 | 79 | Reports are generated in multiple formats: 80 | 81 | - **HTML Reports**: Clean, organized reports saved to the `/digest` directory with date-based filenames 82 | - **Console Output**: Summary information displayed in the terminal 83 | - **JSON Data**: Raw paper data saved to the `/data` directory 84 | 85 | Every HTML report includes: 86 | - Paper title, authors, and link to arXiv 87 | - Relevancy score with explanation 88 | - Abstract and key innovations 89 | - Critical analysis and implementation details 90 | - Experiments, results, and discussion points 91 | - Related work and practical applications 92 | 93 | Example HTML report: 94 | 95 | ![](/readme_images/example_report.png) 96 | ## 💡 Setting Up and Usage 97 | 98 | ### Configuration 99 | 100 | Modify `config.yaml` with your preferences: 101 | 102 | ```yaml 103 | # Main research area 104 | topic: "Computer Science" 105 | 106 | # Specific categories to monitor 107 | categories: ["Artificial Intelligence", "Computation and Language", "Machine Learning", "Information Retrieval"] 108 | 109 | # Minimum relevance score (1-10) 110 | threshold: 2 111 | 112 | # Your research interests in natural language 113 | interest: | 114 | 1. AI alignment and AI safety 115 | 2. Mechanistic interpretability and explainable AI 116 | 3. Large language model optimization 117 | 4. RAGs, Information retrieval 118 | 5. AI Red teaming, deception and misalignment 119 | ``` 120 | 121 | ### Running the Web Interface 122 | 123 | To run locally with the simplified UI: 124 | 125 | 1. Install requirements: `pip install -r requirements.txt` 126 | 2. Run the app: `python src/app_new.py` 127 | 3. Open the URL displayed in your terminal 128 | 4. Enter your API key(s) and configure your preferences 129 | 5. Use the relevancy threshold slider to adjust paper filtering (default is 2) 130 | 131 | ### Running via GitHub Action 132 | 133 | To set up automated daily digests: 134 | 135 | 1. Fork this repository 136 | 2. Update `config.yaml` with your preferences 137 | 3. Set the following secrets in your repository settings: 138 | - `OPENAI_API_KEY` (and/or `GEMINI_API_KEY` or `ANTHROPIC_API_KEY`) 139 | 4. The GitHub Action will run on schedule or can be triggered manually 140 | 141 | ### Running from Command Line 142 | 143 | For advanced users: 144 | 145 | ```bash 146 | # Regular paper digests with simplified UI 147 | python src/app_new.py 148 | 149 | # Design paper finder 150 | ./src/design/find_design_papers.sh --days 7 --analyze 151 | ``` 152 | 153 | ## ⚠️ API Usage Notes 154 | 155 | This tool respects arXiv's robots.txt and implements proper rate limiting. If you encounter 403 Forbidden errors: 156 | 157 | 1. Wait a few hours before trying again 158 | 2. Consider reducing the number of categories you're fetching 159 | 3. 
Increase the delay between requests in the code 160 | 161 | ## 📁 Directory Structure 162 | 163 | The repository is organized as follows: 164 | 165 | - `/src` - All Python source code 166 | - `app_new.py` - Simplified interface with improved threshold handling and UI 167 | - `download_new_papers.py` - arXiv crawler 168 | - `relevancy.py` - Paper scoring and analysis with robust JSON parsing 169 | - `model_manager.py` - Multi-model integration 170 | - `gemini_utils.py` - Gemini API integration 171 | - `anthropic_utils.py` - Claude API integration 172 | - `design/` - Design automation tools 173 | - `paths.py` - Standardized path handling 174 | - `/data` - JSON data files (auto-created) 175 | - `/digest` - HTML report files (auto-created) 176 | 177 | ## ✅ Roadmap 178 | 179 | - [x] Support multiple AI models (OpenAI, Gemini, Claude) 180 | - [x] Generate comprehensive HTML reports with date-based filenames 181 | - [x] Specialized analysis for design automation papers 182 | - [x] Topic clustering via Gemini 183 | - [x] Standardized directory structure 184 | - [x] Enhanced HTML reports with detailed analysis sections 185 | - [x] Pre-filtering of arXiv categories for efficiency 186 | - [x] Adjustable relevancy threshold with UI slider 187 | - [x] Robust JSON parsing for reliable LLM response handling 188 | - [x] Simplified UI focused on core functionality 189 | - [x] Dynamic topic selection UI with improved error handling 190 | - [x] Support for newer models (GPT-4o, GPT-4o mini, Claude 3.5) 191 | - [x] Two-stage paper processing for efficiency (quick filtering followed by detailed analysis) 192 | - [x] Removed email functionality in favor of local HTML reports 193 | - [ ] Full PDF content analysis 194 | - [ ] Author-based ranking and filtering 195 | - [ ] Fine-tuned open-source model support: Ollama, LocalAI... 196 | 197 | ## 💁 Contributing 198 | 199 | You're encouraged to modify this code for your personal needs. If your modifications would be useful to others, please submit a pull request. 200 | 201 | Valuable contributions include: 202 | - Additional AI model integrations 203 | - New analysis capabilities 204 | - UI improvements 205 | - Prompt engineering enhancements 206 | -------------------------------------------------------------------------------- /src/interpretability_analysis.py: -------------------------------------------------------------------------------- 1 | """ 2 | Specialized module for mechanistic interpretability and technical AI safety analysis. 3 | """ 4 | import json 5 | import logging 6 | from typing import Dict, Any, List, Optional 7 | 8 | # Configure logging 9 | logging.basicConfig(level=logging.INFO) 10 | logger = logging.getLogger(__name__) 11 | 12 | # Prompts for specialized analysis 13 | MECHANISTIC_INTERPRETABILITY_PROMPT = """ 14 | You are a research assistant specializing in mechanistic interpretability of AI systems. 15 | 16 | Analyze this paper from the perspective of mechanistic interpretability: 17 | 18 | Title: {title} 19 | Authors: {authors} 20 | Abstract: {abstract} 21 | Content: {content} 22 | 23 | Please provide a detailed analysis covering: 24 | 25 | 1. Relevance to mechanistic interpretability: How does this paper contribute to understanding the internal workings of models? 26 | 2. Interpretability techniques: What specific methods or approaches does the paper use to explain model behavior? 27 | 3. Circuit analysis: Does the paper identify specific circuits or computational components within models? 28 | 4. 
Attribution methods: What techniques are used to attribute model outputs to internal components? 29 | 5. Novel insights: What new understanding does this paper bring to model internals? 30 | 6. Limitations: What are the limitations of the approach from an interpretability perspective? 31 | 7. Future directions: What follow-up work would be valuable? 32 | 8. Connections to other interpretability research: How does this relate to other work in the field? 33 | 34 | Format your response as JSON with these fields. 35 | """ 36 | 37 | TECHNICAL_AI_SAFETY_PROMPT = """ 38 | You are a research assistant specializing in technical AI safety. 39 | 40 | Analyze this paper from the perspective of technical AI safety: 41 | 42 | Title: {title} 43 | Authors: {authors} 44 | Abstract: {abstract} 45 | Content: {content} 46 | 47 | Please provide a detailed analysis covering: 48 | 49 | 1. Relevance to AI safety: How does this paper contribute to building safer AI systems? 50 | 2. Safety approaches: What specific methods or approaches does the paper use to improve AI safety? 51 | 3. Robustness: How does the paper address model robustness to distribution shifts or adversarial attacks? 52 | 4. Alignment: Does the paper discuss techniques for aligning AI systems with human values? 53 | 5. Risk assessment: What potential risks or failure modes does the paper address? 54 | 6. Monitoring and oversight: What methods are proposed for monitoring or controlling AI systems? 55 | 7. Limitations: What are the limitations of the approach from a safety perspective? 56 | 8. Future directions: What follow-up work would be valuable for improving safety? 57 | 58 | Format your response as JSON with these fields. 59 | """ 60 | 61 | PROMPT_TEMPLATES = { 62 | "mechanistic_interpretability": MECHANISTIC_INTERPRETABILITY_PROMPT, 63 | "technical_ai_safety": TECHNICAL_AI_SAFETY_PROMPT 64 | } 65 | 66 | def extract_json_from_text(text: str) -> Dict[str, Any]: 67 | """ 68 | Attempt to extract JSON from text, handling various formats. 69 | 70 | Args: 71 | text: String potentially containing JSON 72 | 73 | Returns: 74 | Extracted JSON as a dictionary, or error dictionary 75 | """ 76 | try: 77 | # Look for JSON-like structures 78 | start_idx = text.find('{') 79 | end_idx = text.rfind('}') + 1 80 | 81 | if start_idx >= 0 and end_idx > start_idx: 82 | json_str = text[start_idx:end_idx] 83 | return json.loads(json_str) 84 | else: 85 | return {"error": "Could not find JSON in text", "raw_text": text} 86 | except json.JSONDecodeError: 87 | return {"error": "Failed to parse as JSON", "raw_text": text} 88 | 89 | def create_analysis_prompt(paper: Dict[str, Any], analysis_type: str) -> str: 90 | """ 91 | Create a prompt for specialized analysis. 92 | 93 | Args: 94 | paper: Dictionary with paper details 95 | analysis_type: Type of analysis to perform 96 | 97 | Returns: 98 | Formatted prompt string 99 | """ 100 | if analysis_type not in PROMPT_TEMPLATES: 101 | raise ValueError(f"Unknown analysis type: {analysis_type}") 102 | 103 | prompt_template = PROMPT_TEMPLATES[analysis_type] 104 | 105 | return prompt_template.format( 106 | title=paper.get("title", ""), 107 | authors=paper.get("authors", ""), 108 | abstract=paper.get("abstract", ""), 109 | content=paper.get("content", "")[:10000] # Limit content length 110 | ) 111 | 112 | def analyze_interpretability_circuits(paper: Dict[str, Any], response: Dict[str, Any]) -> Dict[str, Any]: 113 | """ 114 | Perform additional circuit analysis based on paper content and initial response. 
115 | 116 | Args: 117 | paper: Dictionary with paper details 118 | response: Initial analysis response 119 | 120 | Returns: 121 | Enhanced analysis with circuit information 122 | """ 123 | # This is a placeholder for more sophisticated circuit analysis 124 | # In a real implementation, this would use specialized tools to analyze 125 | # neural network circuits mentioned in the paper 126 | 127 | # Extract potential circuit descriptions from paper content 128 | circuit_mentions = [] 129 | 130 | content = paper.get("content", "").lower() 131 | circuit_keywords = ["circuit", "attention head", "neuron", "mlp", "weight", "activation"] 132 | 133 | for keyword in circuit_keywords: 134 | if keyword in content: 135 | # Very simple extraction - in reality would use more sophisticated NLP 136 | start_idx = content.find(keyword) 137 | if start_idx >= 0: 138 | excerpt = content[max(0, start_idx-50):min(len(content), start_idx+100)] 139 | circuit_mentions.append(excerpt) 140 | 141 | # Add circuit information to response 142 | enhanced_response = response.copy() 143 | enhanced_response["circuit_mentions"] = circuit_mentions[:5] # Limit to 5 mentions 144 | enhanced_response["circuit_analysis_performed"] = len(circuit_mentions) > 0 145 | 146 | return enhanced_response 147 | 148 | def get_paper_relation_to_ai_safety(paper: Dict[str, Any]) -> str: 149 | """ 150 | Determine how a paper relates to AI safety research. 151 | 152 | Args: 153 | paper: Dictionary with paper details 154 | 155 | Returns: 156 | Description of relation to AI safety 157 | """ 158 | # Simple keyword-based approach 159 | safety_keywords = { 160 | "alignment": "AI alignment", 161 | "safety": "AI safety", 162 | "robustness": "Model robustness", 163 | "adversarial": "Adversarial robustness", 164 | "bias": "Bias mitigation", 165 | "fairness": "Fairness", 166 | "transparency": "Transparency", 167 | "interpretability": "Interpretability", 168 | "explainability": "Explainability", 169 | "oversight": "AI oversight", 170 | "control": "AI control", 171 | "verification": "Formal verification", 172 | "monitoring": "AI monitoring" 173 | } 174 | 175 | relation = [] 176 | content = (paper.get("abstract", "") + " " + paper.get("title", "")).lower() 177 | 178 | for keyword, category in safety_keywords.items(): 179 | if keyword in content: 180 | relation.append(category) 181 | 182 | if relation: 183 | return ", ".join(set(relation)) 184 | else: 185 | return "No direct relation to AI safety identified" 186 | 187 | def analyze_multi_agent_safety(paper: Dict[str, Any]) -> Dict[str, Any]: 188 | """ 189 | Analyze multi-agent safety aspects of a paper. 
190 | 191 | Args: 192 | paper: Dictionary with paper details 193 | 194 | Returns: 195 | Multi-agent safety analysis 196 | """ 197 | # Check if paper mentions multi-agent systems 198 | content = (paper.get("abstract", "") + " " + paper.get("title", "")).lower() 199 | 200 | multi_agent_keywords = [ 201 | "multi-agent", "multiagent", "agent cooperation", "agent competition", 202 | "game theory", "nash equilibrium", "cooperative ai", "collaborative ai" 203 | ] 204 | 205 | is_multi_agent = any(keyword in content for keyword in multi_agent_keywords) 206 | 207 | if not is_multi_agent: 208 | return {"is_multi_agent_focused": False} 209 | 210 | # Simple analysis of multi-agent safety aspects 211 | safety_aspects = [] 212 | 213 | if "cooperation" in content or "collaborative" in content or "coordination" in content: 214 | safety_aspects.append("Agent cooperation") 215 | 216 | if "competition" in content or "adversarial" in content: 217 | safety_aspects.append("Agent competition") 218 | 219 | if "equilibrium" in content or "game theory" in content: 220 | safety_aspects.append("Game theoretic analysis") 221 | 222 | if "incentive" in content or "reward" in content: 223 | safety_aspects.append("Incentive design") 224 | 225 | if "communication" in content: 226 | safety_aspects.append("Agent communication") 227 | 228 | return { 229 | "is_multi_agent_focused": True, 230 | "multi_agent_safety_aspects": safety_aspects, 231 | "summary": f"This paper focuses on multi-agent systems, specifically addressing: {', '.join(safety_aspects)}" if safety_aspects else "This paper discusses multi-agent systems but doesn't specifically address safety aspects." 232 | } -------------------------------------------------------------------------------- /src/gemini_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Gemini API integration for ArxivDigest. 3 | This module provides functions to work with Google's Gemini API for paper analysis. 4 | """ 5 | import os 6 | import json 7 | import logging 8 | import time 9 | from typing import List, Dict, Any, Optional 10 | 11 | try: 12 | import google.generativeai as genai 13 | from google.api_core.exceptions import GoogleAPIError 14 | GEMINI_AVAILABLE = True 15 | except ImportError: 16 | GEMINI_AVAILABLE = False 17 | 18 | # Configure logging 19 | logging.basicConfig(level=logging.INFO) 20 | logger = logging.getLogger(__name__) 21 | 22 | class GeminiConfig: 23 | """Configuration for Gemini API calls.""" 24 | def __init__( 25 | self, 26 | temperature: float = 0.4, 27 | max_output_tokens: int = 2048, 28 | top_p: float = 0.95, 29 | top_k: int = 40 30 | ): 31 | self.temperature = temperature 32 | self.max_output_tokens = max_output_tokens 33 | self.top_p = top_p 34 | self.top_k = top_k 35 | 36 | def setup_gemini_api(api_key: str) -> bool: 37 | """ 38 | Setup the Gemini API with the provided API key. 39 | 40 | Args: 41 | api_key: Gemini API key 42 | 43 | Returns: 44 | bool: True if setup was successful, False otherwise 45 | """ 46 | if not GEMINI_AVAILABLE: 47 | logger.error("Gemini package not installed. Run 'pip install google-generativeai'") 48 | return False 49 | 50 | if not api_key: 51 | logger.error("No Gemini API key provided") 52 | return False 53 | 54 | try: 55 | genai.configure(api_key=api_key) 56 | # Test API connection 57 | models = genai.list_models() 58 | logger.info(f"Successfully connected to Gemini API. 
Available models: {[m.name for m in models if 'generateContent' in m.supported_generation_methods]}") 59 | return True 60 | except Exception as e: 61 | logger.error(f"Failed to setup Gemini API: {e}") 62 | return False 63 | 64 | def get_gemini_model(model_name: str = "gemini-1.5-flash"): 65 | """ 66 | Get a Gemini model by name. 67 | 68 | Args: 69 | model_name: Name of the Gemini model 70 | 71 | Returns: 72 | Model object or None if not available 73 | """ 74 | if not GEMINI_AVAILABLE: 75 | return None 76 | 77 | try: 78 | model = genai.GenerativeModel(model_name) 79 | return model 80 | except Exception as e: 81 | logger.error(f"Failed to get Gemini model: {e}") 82 | return None 83 | 84 | def analyze_papers_with_gemini( 85 | papers: List[Dict[str, Any]], 86 | query: Dict[str, str], 87 | config: Optional[GeminiConfig] = None, 88 | model_name: str = "gemini-1.5-flash" 89 | ) -> List[Dict[str, Any]]: 90 | """ 91 | Analyze papers using the Gemini model. 92 | 93 | Args: 94 | papers: List of paper dictionaries 95 | query: Dictionary with 'interest' key describing research interests 96 | config: GeminiConfig object 97 | model_name: Name of the Gemini model to use 98 | 99 | Returns: 100 | List of papers with added analysis 101 | """ 102 | if not GEMINI_AVAILABLE: 103 | logger.error("Gemini package not installed. Cannot analyze papers.") 104 | return papers 105 | 106 | if not config: 107 | config = GeminiConfig() 108 | 109 | model = get_gemini_model(model_name) 110 | if not model: 111 | return papers 112 | 113 | analyzed_papers = [] 114 | 115 | for paper in papers: 116 | try: 117 | # Prepare prompt 118 | prompt = f""" 119 | You are a research assistant analyzing academic papers in AI and ML. 120 | 121 | Analyze this paper and provide insights based on the user's research interests. 122 | 123 | Research interests: {query['interest']} 124 | 125 | Paper details: 126 | Title: {paper['title']} 127 | Authors: {paper['authors']} 128 | Abstract: {paper['abstract']} 129 | Content: {paper['content'][:5000]} 130 | 131 | Please provide your response as a single JSON object with the following structure: 132 | {{ 133 | "Relevancy score": 1-10 (higher = more relevant), 134 | "Reasons for match": "Detailed explanation of why this paper matches the interests", 135 | "Key innovations": "List the main contributions of the paper", 136 | "Critical analysis": "Evaluate strengths and weaknesses", 137 | "Goal": "What problem does the paper address?", 138 | "Data": "Description of datasets used", 139 | "Methodology": "Technical approach and methods", 140 | "Implementation details": "Model architecture, hyperparameters, etc.", 141 | "Experiments & Results": "Key findings and comparisons", 142 | "Discussion & Next steps": "Limitations and future work", 143 | "Related work": "Connection to similar research", 144 | "Practical applications": "Real-world uses of this research", 145 | "Key takeaways": ["Point 1", "Point 2", "Point 3"] 146 | }} 147 | 148 | Format your response as a valid JSON object and nothing else. 
149 | """ 150 | 151 | # Just log that we're sending a prompt to Gemini 152 | print(f"Sending prompt to Gemini for paper: {paper['title'][:50]}...") 153 | 154 | generation_config = { 155 | "temperature": config.temperature, 156 | "top_p": config.top_p, 157 | "top_k": config.top_k, 158 | "max_output_tokens": config.max_output_tokens, 159 | } 160 | 161 | response = model.generate_content( 162 | prompt, 163 | generation_config=generation_config 164 | ) 165 | 166 | # Extract and parse the response 167 | response_text = response.text 168 | 169 | # Try to extract JSON 170 | try: 171 | start_idx = response_text.find('{') 172 | end_idx = response_text.rfind('}') + 1 173 | if start_idx >= 0 and end_idx > start_idx: 174 | json_str = response_text[start_idx:end_idx] 175 | gemini_analysis = json.loads(json_str) 176 | 177 | # Add Gemini analysis to paper 178 | paper['gemini_analysis'] = gemini_analysis 179 | 180 | # Directly copy fields to paper 181 | for key, value in gemini_analysis.items(): 182 | paper[key] = value 183 | else: 184 | logger.warning(f"Could not extract JSON from Gemini response for paper {paper['title']}") 185 | paper['gemini_analysis'] = {"error": "Failed to parse response"} 186 | except json.JSONDecodeError: 187 | logger.warning(f"Failed to parse Gemini response as JSON for paper {paper['title']}") 188 | paper['gemini_analysis'] = {"error": "Failed to parse response"} 189 | 190 | analyzed_papers.append(paper) 191 | 192 | # Avoid rate limiting 193 | time.sleep(1) 194 | 195 | except GoogleAPIError as e: 196 | logger.error(f"Gemini API error: {e}") 197 | paper['gemini_analysis'] = {"error": f"Gemini API error: {str(e)}"} 198 | analyzed_papers.append(paper) 199 | 200 | except Exception as e: 201 | logger.error(f"Error analyzing paper with Gemini: {e}") 202 | paper['gemini_analysis'] = {"error": f"Error: {str(e)}"} 203 | analyzed_papers.append(paper) 204 | 205 | return analyzed_papers 206 | 207 | def get_topic_clustering(papers: List[Dict[str, Any]], model_name: str = "gemini-1.5-flash"): 208 | """ 209 | Cluster papers by topic using Gemini. 210 | 211 | Args: 212 | papers: List of paper dictionaries 213 | model_name: Name of the Gemini model to use 214 | 215 | Returns: 216 | Dictionary with topic clusters 217 | """ 218 | if not GEMINI_AVAILABLE: 219 | logger.error("Gemini package not installed. Cannot cluster papers.") 220 | return {} 221 | 222 | model = get_gemini_model(model_name) 223 | if not model: 224 | return {} 225 | 226 | # Create a condensed representation of the papers 227 | paper_summaries = [] 228 | for i, paper in enumerate(papers): 229 | paper_summaries.append(f"{i+1}. Title: {paper['title']}\nAbstract: {paper['abstract'][:300]}...") 230 | 231 | paper_text = "\n\n".join(paper_summaries) 232 | 233 | prompt = f""" 234 | You are a research librarian organizing academic papers into topic clusters. 235 | 236 | Analyze these papers and group them into 3-7 thematic clusters: 237 | 238 | {paper_text} 239 | 240 | For each cluster: 241 | 1. Provide a descriptive name for the cluster 242 | 2. List the paper numbers that belong to this cluster 243 | 3. Explain why these papers belong together 244 | 245 | Format your response as JSON with these fields: "clusters" (an array of objects with "name", "papers", and "description" fields). 
246 | """ 247 | 248 | try: 249 | response = model.generate_content(prompt) 250 | response_text = response.text 251 | 252 | # Try to extract JSON 253 | try: 254 | start_idx = response_text.find('{') 255 | end_idx = response_text.rfind('}') + 1 256 | if start_idx >= 0 and end_idx > start_idx: 257 | json_str = response_text[start_idx:end_idx] 258 | cluster_data = json.loads(json_str) 259 | return cluster_data 260 | else: 261 | logger.warning("Could not extract JSON from Gemini clustering response") 262 | return {"error": "Failed to parse clustering response"} 263 | except json.JSONDecodeError: 264 | logger.warning("Failed to parse Gemini clustering response as JSON") 265 | return {"error": "Failed to parse clustering response"} 266 | 267 | except Exception as e: 268 | logger.error(f"Error clustering papers with Gemini: {e}") 269 | return {"error": f"Clustering error: {str(e)}"} -------------------------------------------------------------------------------- /src/utils.py: -------------------------------------------------------------------------------- 1 | import dataclasses 2 | import logging 3 | import math 4 | import os 5 | import io 6 | import sys 7 | import time 8 | import json 9 | from typing import Optional, Sequence, Union, Dict, Any 10 | 11 | import openai 12 | import tqdm 13 | import copy 14 | 15 | # Handle both old and new OpenAI SDK versions 16 | try: 17 | from openai import openai_object 18 | StrOrOpenAIObject = Union[str, openai_object.OpenAIObject] 19 | OPENAI_OLD_API = True 20 | except ImportError: 21 | StrOrOpenAIObject = Union[str, Dict[str, Any]] 22 | OPENAI_OLD_API = False 23 | 24 | 25 | openai_org = os.getenv("OPENAI_ORG") 26 | if openai_org is not None: 27 | openai.organization = openai_org 28 | logging.warning(f"Switching to organization: {openai_org} for OAI API key.") 29 | 30 | 31 | @dataclasses.dataclass 32 | class OpenAIDecodingArguments(object): 33 | #max_tokens: int = 1800 34 | max_tokens: int = 5400 35 | temperature: float = 0.2 36 | top_p: float = 1.0 37 | n: int = 1 38 | stream: bool = False 39 | stop: Optional[Sequence[str]] = None 40 | presence_penalty: float = 0.0 41 | frequency_penalty: float = 0.0 42 | # logprobs: Optional[int] = None 43 | 44 | 45 | def openai_completion( 46 | prompts, #: Union[str, Sequence[str], Sequence[dict[str, str]], dict[str, str]], 47 | decoding_args: OpenAIDecodingArguments, 48 | model_name="text-davinci-003", 49 | sleep_time=15, 50 | batch_size=1, 51 | max_instances=sys.maxsize, 52 | max_batches=sys.maxsize, 53 | return_text=False, 54 | **decoding_kwargs, 55 | ) -> Union[Union[StrOrOpenAIObject], Sequence[StrOrOpenAIObject], Sequence[Sequence[StrOrOpenAIObject]],]: 56 | """Decode with OpenAI API. 57 | 58 | Args: 59 | prompts: A string or a list of strings to complete. If it is a chat model the strings should be formatted 60 | as explained here: https://github.com/openai/openai-python/blob/main/chatml.md. If it is a chat model 61 | it can also be a dictionary (or list thereof) as explained here: 62 | https://github.com/openai/openai-cookbook/blob/main/examples/How_to_format_inputs_to_ChatGPT_models.ipynb 63 | decoding_args: Decoding arguments. 64 | model_name: Model name. Can be either in the format of "org/model" or just "model". 65 | sleep_time: Time to sleep once the rate-limit is hit. 66 | batch_size: Number of prompts to send in a single request. Only for non chat model. 67 | max_instances: Maximum number of prompts to decode. 68 | max_batches: Maximum number of batches to decode. 
This argument will be deprecated in the future. 69 | return_text: If True, return text instead of full completion object (which contains things like logprob). 70 | decoding_kwargs: Additional decoding arguments. Pass in `best_of` and `logit_bias` if you need them. 71 | 72 | Returns: 73 | A completion or a list of completions. 74 | Depending on return_text, return_openai_object, and decoding_args.n, the completion type can be one of 75 | - a string (if return_text is True) 76 | - an openai_object.OpenAIObject object (if return_text is False) 77 | - a list of objects of the above types (if decoding_args.n > 1) 78 | """ 79 | is_chat_model = "gpt-3.5" in model_name or "gpt-4" in model_name 80 | is_single_prompt = isinstance(prompts, (str, dict)) 81 | if is_single_prompt: 82 | prompts = [prompts] 83 | 84 | if max_batches < sys.maxsize: 85 | logging.warning( 86 | "`max_batches` will be deprecated in the future, please use `max_instances` instead." 87 | "Setting `max_instances` to `max_batches * batch_size` for now." 88 | ) 89 | max_instances = max_batches * batch_size 90 | 91 | prompts = prompts[:max_instances] 92 | num_prompts = len(prompts) 93 | prompt_batches = [ 94 | prompts[batch_id * batch_size : (batch_id + 1) * batch_size] 95 | for batch_id in range(int(math.ceil(num_prompts / batch_size))) 96 | ] 97 | 98 | completions = [] 99 | for batch_id, prompt_batch in tqdm.tqdm( 100 | enumerate(prompt_batches), 101 | desc="prompt_batches", 102 | total=len(prompt_batches), 103 | ): 104 | batch_decoding_args = copy.deepcopy(decoding_args) # cloning the decoding_args 105 | 106 | backoff = 5 107 | 108 | while True: 109 | try: 110 | time.sleep(3) 111 | shared_kwargs = dict( 112 | model=model_name, 113 | **batch_decoding_args.__dict__, 114 | **decoding_kwargs, 115 | ) 116 | 117 | if OPENAI_OLD_API: 118 | # Use old API format 119 | if is_chat_model: 120 | completion_batch = openai.ChatCompletion.create( 121 | messages=[ 122 | {"role": "system", "content": "You are a helpful assistant."}, 123 | {"role": "user", "content": prompt_batch[0]} 124 | ], 125 | **shared_kwargs 126 | ) 127 | else: 128 | completion_batch = openai.Completion.create(prompt=prompt_batch, **shared_kwargs) 129 | 130 | choices = completion_batch.choices 131 | 132 | for choice in choices: 133 | choice["total_tokens"] = completion_batch.usage.total_tokens 134 | else: 135 | # Use new API format 136 | client = openai.OpenAI() 137 | 138 | if is_chat_model: 139 | completion_batch = client.chat.completions.create( 140 | model=model_name, 141 | messages=[ 142 | {"role": "system", "content": "You are a helpful assistant."}, 143 | {"role": "user", "content": prompt_batch[0]} 144 | ], 145 | temperature=batch_decoding_args.temperature, 146 | max_tokens=batch_decoding_args.max_tokens, 147 | top_p=batch_decoding_args.top_p, 148 | n=batch_decoding_args.n, 149 | stream=batch_decoding_args.stream, 150 | presence_penalty=batch_decoding_args.presence_penalty, 151 | frequency_penalty=batch_decoding_args.frequency_penalty, 152 | **decoding_kwargs 153 | ) 154 | 155 | # Convert completion to dictionary format for consistency 156 | choices = [] 157 | for choice in completion_batch.choices: 158 | choice_dict = { 159 | "message": { 160 | "content": choice.message.content, 161 | "role": choice.message.role 162 | }, 163 | "index": choice.index, 164 | "finish_reason": choice.finish_reason, 165 | "total_tokens": completion_batch.usage.total_tokens 166 | } 167 | choices.append(choice_dict) 168 | else: 169 | completion_batch = client.completions.create( 170 | 
model=model_name, 171 | prompt=prompt_batch, 172 | temperature=batch_decoding_args.temperature, 173 | max_tokens=batch_decoding_args.max_tokens, 174 | top_p=batch_decoding_args.top_p, 175 | n=batch_decoding_args.n, 176 | stream=batch_decoding_args.stream, 177 | presence_penalty=batch_decoding_args.presence_penalty, 178 | frequency_penalty=batch_decoding_args.frequency_penalty, 179 | **decoding_kwargs 180 | ) 181 | 182 | # Convert completion to dictionary format for consistency 183 | choices = [] 184 | for choice in completion_batch.choices: 185 | choice_dict = { 186 | "text": choice.text, 187 | "index": choice.index, 188 | "finish_reason": choice.finish_reason, 189 | "total_tokens": completion_batch.usage.total_tokens 190 | } 191 | choices.append(choice_dict) 192 | 193 | completions.extend(choices) 194 | break 195 | except Exception as e: 196 | logging.warning(f"OpenAI API Error: {e}.") 197 | if "Please reduce your prompt" in str(e): 198 | batch_decoding_args.max_tokens = int(batch_decoding_args.max_tokens * 0.8) 199 | logging.warning(f"Reducing target length to {batch_decoding_args.max_tokens}, Retrying...") 200 | elif not backoff: 201 | logging.error("Hit too many failures, exiting") 202 | raise e 203 | else: 204 | backoff -= 1 205 | logging.warning("Hit request rate limit; retrying...") 206 | time.sleep(sleep_time) # Annoying rate limit on requests. 207 | continue 208 | 209 | if return_text: 210 | if is_chat_model: 211 | completions = [completion.get("message", {}).get("content", "") for completion in completions] 212 | else: 213 | completions = [completion.get("text", "") for completion in completions] 214 | 215 | if decoding_args.n > 1: 216 | # make completions a nested list, where each entry is a consecutive decoding_args.n of original entries. 217 | completions = [completions[i : i + decoding_args.n] for i in range(0, len(completions), decoding_args.n)] 218 | if is_single_prompt: 219 | # Return non-tuple if only 1 input and 1 generation. 220 | (completions,) = completions 221 | return completions 222 | 223 | 224 | def write_ans_to_file(ans_data, file_prefix, output_dir="./output"): 225 | if not os.path.exists(output_dir): 226 | os.makedirs(output_dir) 227 | filename = os.path.join(output_dir, file_prefix + ".txt") 228 | with open(filename, "w") as f: 229 | for ans in ans_data: 230 | f.write(ans + "\n") 231 | -------------------------------------------------------------------------------- /src/design_automation.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module for analyzing papers related to AI/ML for graphic design automation. 3 | This module helps identify and analyze papers on automated design, layout generation, 4 | creative AI tools, and related topics. 
5 | """ 6 | import logging 7 | import json 8 | from typing import Dict, Any, List, Optional 9 | 10 | # Configure logging 11 | logging.basicConfig(level=logging.INFO) 12 | logger = logging.getLogger(__name__) 13 | 14 | # Design automation keywords for paper filtering 15 | DESIGN_AUTOMATION_KEYWORDS = [ 16 | "design automation", "layout generation", "visual design", "graphic design", 17 | "creative AI", "generative design", "UI generation", "UX automation", 18 | "design system", "composition", "creative workflow", "automated design", 19 | "design tool", "design assistant", "design optimization", "content-aware", 20 | "user interface generation", "visual layout", "image composition" 21 | ] 22 | 23 | DESIGN_AUTOMATION_PROMPT = """ 24 | You are a specialized research assistant focused on AI/ML for graphic design automation. 25 | 26 | Analyze this paper from the perspective of AI for graphic design and creative automation: 27 | 28 | Title: {title} 29 | Authors: {authors} 30 | Abstract: {abstract} 31 | Content: {content} 32 | 33 | Please provide a detailed analysis covering: 34 | 35 | 1. Design automation focus: What aspect of design does this paper attempt to automate or enhance? 36 | 2. Technical approach: What AI/ML techniques are used in the paper for design automation? 37 | 3. Visual outputs: What kind of visual artifacts does the system generate? 38 | 4. Designer interaction: How does the system interact with human designers? 39 | 5. Data requirements: What data does the system use for training or operation? 40 | 6. Evaluation metrics: How is the system's design quality evaluated? 41 | 7. Real-world applicability: How practical is this approach for professional design workflows? 42 | 8. Novelty: What makes this approach unique compared to other design automation systems? 43 | 9. Limitations: What are the current limitations of this approach? 44 | 10. Future directions: What improvements or extensions are suggested? 45 | 46 | Format your response as JSON with these fields. 47 | """ 48 | 49 | def is_design_automation_paper(paper: Dict[str, Any]) -> bool: 50 | """ 51 | Check if a paper is related to design automation based on keywords. 52 | 53 | Args: 54 | paper: Dictionary with paper details 55 | 56 | Returns: 57 | Boolean indicating if paper is related to design automation 58 | """ 59 | text = ( 60 | (paper.get("title", "") + " " + 61 | paper.get("abstract", "") + " " + 62 | paper.get("subjects", "")).lower() 63 | ) 64 | 65 | return any(keyword.lower() in text for keyword in DESIGN_AUTOMATION_KEYWORDS) 66 | 67 | def categorize_design_paper(paper: Dict[str, Any]) -> str: 68 | """ 69 | Categorize design automation paper into subcategories. 
70 | 71 | Args: 72 | paper: Dictionary with paper details 73 | 74 | Returns: 75 | Category name string 76 | """ 77 | text = (paper.get("title", "") + " " + paper.get("abstract", "")).lower() 78 | 79 | categories = { 80 | "Layout Generation": ["layout", "composition", "arrange", "grid"], 81 | "UI/UX Design": ["user interface", "ui", "ux", "interface design", "website"], 82 | "Graphic Design": ["graphic design", "poster", "visual design", "typography"], 83 | "Image Manipulation": ["image editing", "photo", "manipulation", "style transfer"], 84 | "Design Tools": ["tool", "assistant", "workflow", "productivity"], 85 | "3D Design": ["3d", "modeling", "cad", "product design"], 86 | "Multimodal Design": ["multimodal", "text-to-image", "image-to-code"] 87 | } 88 | 89 | matches = [] 90 | for category, keywords in categories.items(): 91 | if any(keyword.lower() in text for keyword in keywords): 92 | matches.append(category) 93 | 94 | if matches: 95 | return ", ".join(matches) 96 | return "General Design Automation" 97 | 98 | def analyze_design_techniques(paper: Dict[str, Any]) -> List[str]: 99 | """ 100 | Extract AI/ML techniques used for design automation in the paper. 101 | 102 | Args: 103 | paper: Dictionary with paper details 104 | 105 | Returns: 106 | List of techniques 107 | """ 108 | text = (paper.get("title", "") + " " + paper.get("abstract", "")).lower() 109 | 110 | techniques = [] 111 | technique_keywords = { 112 | "Generative Adversarial Networks": ["gan", "generative adversarial"], 113 | "Diffusion Models": ["diffusion", "ddpm", "stable diffusion"], 114 | "Transformers": ["transformer", "attention mechanism"], 115 | "Reinforcement Learning": ["reinforcement learning", "rl"], 116 | "Computer Vision": ["computer vision", "vision", "cnn"], 117 | "Graph Neural Networks": ["graph neural", "gnn"], 118 | "Large Language Models": ["llm", "large language model", "gpt"], 119 | "Neural Style Transfer": ["style transfer", "neural style"], 120 | "Evolutionary Algorithms": ["genetic algorithm", "evolutionary"] 121 | } 122 | 123 | for technique, keywords in technique_keywords.items(): 124 | if any(keyword in text for keyword in keywords): 125 | techniques.append(technique) 126 | 127 | return techniques 128 | 129 | def extract_design_metrics(paper: Dict[str, Any]) -> List[str]: 130 | """ 131 | Extract evaluation metrics used for design quality assessment. 132 | 133 | Args: 134 | paper: Dictionary with paper details 135 | 136 | Returns: 137 | List of metrics 138 | """ 139 | text = (paper.get("title", "") + " " + paper.get("abstract", "")).lower() 140 | 141 | metrics = [] 142 | metric_keywords = { 143 | "User Studies": ["user study", "user evaluation", "human evaluation"], 144 | "Aesthetic Measures": ["aesthetic", "beauty", "visual quality"], 145 | "Design Principles": ["design principle", "balance", "harmony", "contrast"], 146 | "Technical Metrics": ["fid", "inception score", "clip score", "psnr"], 147 | "Efficiency Metrics": ["time", "speed", "efficiency"], 148 | "Usability": ["usability", "user experience", "ux", "ease of use"] 149 | } 150 | 151 | for metric, keywords in metric_keywords.items(): 152 | if any(keyword in text for keyword in keywords): 153 | metrics.append(metric) 154 | 155 | return metrics 156 | 157 | def get_related_design_papers(paper_id: str, papers: List[Dict[str, Any]]) -> List[Dict[str, Any]]: 158 | """ 159 | Find papers related to a specific design automation paper. 
160 | 161 | Args: 162 | paper_id: ID of the target paper 163 | papers: List of paper dictionaries 164 | 165 | Returns: 166 | List of related papers 167 | """ 168 | target_paper = next((p for p in papers if p.get("main_page", "").endswith(paper_id)), None) 169 | if not target_paper: 170 | return [] 171 | 172 | # Get techniques used in target paper 173 | target_techniques = analyze_design_techniques(target_paper) 174 | target_category = categorize_design_paper(target_paper) 175 | 176 | related_papers = [] 177 | for paper in papers: 178 | if paper.get("main_page", "") == target_paper.get("main_page", ""): 179 | continue 180 | 181 | # Check if paper is on design automation 182 | if not is_design_automation_paper(paper): 183 | continue 184 | 185 | # Check if techniques or categories overlap 186 | paper_techniques = analyze_design_techniques(paper) 187 | paper_category = categorize_design_paper(paper) 188 | 189 | technique_overlap = len(set(target_techniques) & set(paper_techniques)) 190 | category_match = paper_category == target_category 191 | 192 | if technique_overlap > 0 or category_match: 193 | paper["relevance_reason"] = [] 194 | 195 | if technique_overlap > 0: 196 | paper["relevance_reason"].append(f"Uses similar techniques: {', '.join(set(target_techniques) & set(paper_techniques))}") 197 | 198 | if category_match: 199 | paper["relevance_reason"].append(f"Same design category: {paper_category}") 200 | 201 | paper["relevance_score"] = (technique_overlap * 2) + (2 if category_match else 0) 202 | related_papers.append(paper) 203 | 204 | # Sort by relevance score 205 | related_papers.sort(key=lambda x: x.get("relevance_score", 0), reverse=True) 206 | return related_papers[:5] # Return top 5 related papers 207 | 208 | def create_design_analysis_prompt(paper: Dict[str, Any]) -> str: 209 | """ 210 | Create a prompt for analyzing a design automation paper. 211 | 212 | Args: 213 | paper: Dictionary with paper details 214 | 215 | Returns: 216 | Formatted prompt string 217 | """ 218 | return DESIGN_AUTOMATION_PROMPT.format( 219 | title=paper.get("title", ""), 220 | authors=paper.get("authors", ""), 221 | abstract=paper.get("abstract", ""), 222 | content=paper.get("content", "")[:10000] # Limit content length 223 | ) 224 | 225 | def extract_design_capabilities(analysis: Dict[str, Any]) -> Dict[str, Any]: 226 | """ 227 | Extract specific design capabilities from an analysis. 
228 | 229 | Args: 230 | analysis: Dictionary with design paper analysis 231 | 232 | Returns: 233 | Dictionary of design capabilities 234 | """ 235 | capabilities = {} 236 | 237 | # Extract design areas 238 | if "Design automation focus" in analysis: 239 | capabilities["design_areas"] = analysis["Design automation focus"] 240 | 241 | # Extract tools that could be replaced 242 | tools = [] 243 | tools_keywords = { 244 | "Adobe Photoshop": ["photoshop", "photo editing", "image manipulation"], 245 | "Adobe Illustrator": ["illustrator", "vector", "illustration"], 246 | "Figma": ["figma", "ui design", "interface design"], 247 | "Sketch": ["sketch", "ui design", "interface design"], 248 | "InDesign": ["indesign", "layout", "publishing"], 249 | "Canva": ["canva", "simple design", "templates"] 250 | } 251 | 252 | for text_field in ["Technical approach", "Design automation focus", "Real-world applicability"]: 253 | if text_field in analysis: 254 | text = analysis[text_field].lower() 255 | for tool, keywords in tools_keywords.items(): 256 | if any(keyword in text for keyword in keywords): 257 | tools.append(tool) 258 | 259 | capabilities["replaceable_tools"] = list(set(tools)) 260 | 261 | # Extract human-in-the-loop vs fully automated 262 | if "Designer interaction" in analysis: 263 | text = analysis["Designer interaction"].lower() 264 | if "fully automated" in text or "automatic" in text or "without human" in text: 265 | capabilities["automation_level"] = "Fully automated" 266 | elif "human-in-the-loop" in text or "collaboration" in text or "assists" in text: 267 | capabilities["automation_level"] = "Human-in-the-loop" 268 | else: 269 | capabilities["automation_level"] = "Hybrid" 270 | 271 | # Extract if it's ready for production 272 | if "Real-world applicability" in analysis: 273 | text = analysis["Real-world applicability"].lower() 274 | if "production ready" in text or "commercially viable" in text or "can be used in real" in text: 275 | capabilities["production_ready"] = True 276 | elif "prototype" in text or "proof of concept" in text or "research" in text or "limitations" in text: 277 | capabilities["production_ready"] = False 278 | else: 279 | capabilities["production_ready"] = "Unclear" 280 | 281 | return capabilities -------------------------------------------------------------------------------- /src/action.py: -------------------------------------------------------------------------------- 1 | from sendgrid import SendGridAPIClient 2 | from sendgrid.helpers.mail import Mail, Email, To, Content 3 | 4 | import argparse 5 | import yaml 6 | import os 7 | from dotenv import load_dotenv 8 | import openai 9 | from relevancy import generate_relevance_score, process_subject_fields 10 | from download_new_papers import get_papers 11 | from datetime import date 12 | 13 | import ssl 14 | 15 | ssl._create_default_https_context = ssl._create_stdlib_context 16 | 17 | # Hackathon quality code. Don't judge too harshly. 18 | # Feel free to submit pull requests to improve the code. 
19 | 20 | topics = { 21 | "Physics": "", 22 | "Mathematics": "math", 23 | "Computer Science": "cs", 24 | "Quantitative Biology": "q-bio", 25 | "Quantitative Finance": "q-fin", 26 | "Statistics": "stat", 27 | "Electrical Engineering and Systems Science": "eess", 28 | "Economics": "econ", 29 | } 30 | 31 | physics_topics = { 32 | "Astrophysics": "astro-ph", 33 | "Condensed Matter": "cond-mat", 34 | "General Relativity and Quantum Cosmology": "gr-qc", 35 | "High Energy Physics - Experiment": "hep-ex", 36 | "High Energy Physics - Lattice": "hep-lat", 37 | "High Energy Physics - Phenomenology": "hep-ph", 38 | "High Energy Physics - Theory": "hep-th", 39 | "Mathematical Physics": "math-ph", 40 | "Nonlinear Sciences": "nlin", 41 | "Nuclear Experiment": "nucl-ex", 42 | "Nuclear Theory": "nucl-th", 43 | "Physics": "physics", 44 | "Quantum Physics": "quant-ph", 45 | } 46 | 47 | 48 | # TODO: surely theres a better way 49 | category_map = { 50 | "Astrophysics": [ 51 | "Astrophysics of Galaxies", 52 | "Cosmology and Nongalactic Astrophysics", 53 | "Earth and Planetary Astrophysics", 54 | "High Energy Astrophysical Phenomena", 55 | "Instrumentation and Methods for Astrophysics", 56 | "Solar and Stellar Astrophysics", 57 | ], 58 | "Condensed Matter": [ 59 | "Disordered Systems and Neural Networks", 60 | "Materials Science", 61 | "Mesoscale and Nanoscale Physics", 62 | "Other Condensed Matter", 63 | "Quantum Gases", 64 | "Soft Condensed Matter", 65 | "Statistical Mechanics", 66 | "Strongly Correlated Electrons", 67 | "Superconductivity", 68 | ], 69 | "General Relativity and Quantum Cosmology": ["None"], 70 | "High Energy Physics - Experiment": ["None"], 71 | "High Energy Physics - Lattice": ["None"], 72 | "High Energy Physics - Phenomenology": ["None"], 73 | "High Energy Physics - Theory": ["None"], 74 | "Mathematical Physics": ["None"], 75 | "Nonlinear Sciences": [ 76 | "Adaptation and Self-Organizing Systems", 77 | "Cellular Automata and Lattice Gases", 78 | "Chaotic Dynamics", 79 | "Exactly Solvable and Integrable Systems", 80 | "Pattern Formation and Solitons", 81 | ], 82 | "Nuclear Experiment": ["None"], 83 | "Nuclear Theory": ["None"], 84 | "Physics": [ 85 | "Accelerator Physics", 86 | "Applied Physics", 87 | "Atmospheric and Oceanic Physics", 88 | "Atomic and Molecular Clusters", 89 | "Atomic Physics", 90 | "Biological Physics", 91 | "Chemical Physics", 92 | "Classical Physics", 93 | "Computational Physics", 94 | "Data Analysis, Statistics and Probability", 95 | "Fluid Dynamics", 96 | "General Physics", 97 | "Geophysics", 98 | "History and Philosophy of Physics", 99 | "Instrumentation and Detectors", 100 | "Medical Physics", 101 | "Optics", 102 | "Physics and Society", 103 | "Physics Education", 104 | "Plasma Physics", 105 | "Popular Physics", 106 | "Space Physics", 107 | ], 108 | "Quantum Physics": ["None"], 109 | "Mathematics": [ 110 | "Algebraic Geometry", 111 | "Algebraic Topology", 112 | "Analysis of PDEs", 113 | "Category Theory", 114 | "Classical Analysis and ODEs", 115 | "Combinatorics", 116 | "Commutative Algebra", 117 | "Complex Variables", 118 | "Differential Geometry", 119 | "Dynamical Systems", 120 | "Functional Analysis", 121 | "General Mathematics", 122 | "General Topology", 123 | "Geometric Topology", 124 | "Group Theory", 125 | "History and Overview", 126 | "Information Theory", 127 | "K-Theory and Homology", 128 | "Logic", 129 | "Mathematical Physics", 130 | "Metric Geometry", 131 | "Number Theory", 132 | "Numerical Analysis", 133 | "Operator Algebras", 134 | "Optimization and 
Control", 135 | "Probability", 136 | "Quantum Algebra", 137 | "Representation Theory", 138 | "Rings and Algebras", 139 | "Spectral Theory", 140 | "Statistics Theory", 141 | "Symplectic Geometry", 142 | ], 143 | "Computer Science": [ 144 | "Artificial Intelligence", 145 | "Computation and Language", 146 | "Computational Complexity", 147 | "Computational Engineering, Finance, and Science", 148 | "Computational Geometry", 149 | "Computer Science and Game Theory", 150 | "Computer Vision and Pattern Recognition", 151 | "Computers and Society", 152 | "Cryptography and Security", 153 | "Data Structures and Algorithms", 154 | "Databases", 155 | "Digital Libraries", 156 | "Discrete Mathematics", 157 | "Distributed, Parallel, and Cluster Computing", 158 | "Emerging Technologies", 159 | "Formal Languages and Automata Theory", 160 | "General Literature", 161 | "Graphics", 162 | "Hardware Architecture", 163 | "Human-Computer Interaction", 164 | "Information Retrieval", 165 | "Information Theory", 166 | "Logic in Computer Science", 167 | "Machine Learning", 168 | "Mathematical Software", 169 | "Multiagent Systems", 170 | "Multimedia", 171 | "Networking and Internet Architecture", 172 | "Neural and Evolutionary Computing", 173 | "Numerical Analysis", 174 | "Operating Systems", 175 | "Other Computer Science", 176 | "Performance", 177 | "Programming Languages", 178 | "Robotics", 179 | "Social and Information Networks", 180 | "Software Engineering", 181 | "Sound", 182 | "Symbolic Computation", 183 | "Systems and Control", 184 | ], 185 | "Quantitative Biology": [ 186 | "Biomolecules", 187 | "Cell Behavior", 188 | "Genomics", 189 | "Molecular Networks", 190 | "Neurons and Cognition", 191 | "Other Quantitative Biology", 192 | "Populations and Evolution", 193 | "Quantitative Methods", 194 | "Subcellular Processes", 195 | "Tissues and Organs", 196 | ], 197 | "Quantitative Finance": [ 198 | "Computational Finance", 199 | "Economics", 200 | "General Finance", 201 | "Mathematical Finance", 202 | "Portfolio Management", 203 | "Pricing of Securities", 204 | "Risk Management", 205 | "Statistical Finance", 206 | "Trading and Market Microstructure", 207 | ], 208 | "Statistics": [ 209 | "Applications", 210 | "Computation", 211 | "Machine Learning", 212 | "Methodology", 213 | "Other Statistics", 214 | "Statistics Theory", 215 | ], 216 | "Electrical Engineering and Systems Science": [ 217 | "Audio and Speech Processing", 218 | "Image and Video Processing", 219 | "Signal Processing", 220 | "Systems and Control", 221 | ], 222 | "Economics": ["Econometrics", "General Economics", "Theoretical Economics"], 223 | } 224 | 225 | 226 | def generate_body(topic, categories, interest, threshold): 227 | f_papers = [] 228 | if topic == "Physics": 229 | raise RuntimeError("You must choose a physics subtopic.") 230 | elif topic in physics_topics: 231 | abbr = physics_topics[topic] 232 | elif topic in topics: 233 | abbr = topics[topic] 234 | else: 235 | raise RuntimeError(f"Invalid topic {topic}") 236 | if categories: 237 | for category in categories: 238 | if category not in category_map[topic]: 239 | raise RuntimeError(f"{category} is not a category of {topic}") 240 | papers = get_papers(abbr) 241 | 242 | papers = [ 243 | t 244 | for t in papers 245 | if bool(set(process_subject_fields(t["subjects"])) & set(categories)) 246 | ] 247 | 248 | else: 249 | papers = get_papers(abbr) 250 | if interest: 251 | relevancy, hallucination = generate_relevance_score( 252 | papers, 253 | query={"interest": interest}, 254 | threshold_score=threshold, 
255 | num_paper_in_prompt=2, 256 | ) 257 | 258 | body = "<br><br>".join( 259 | [ 260 | f'Subject: {paper["subjects"]}<br>Title: {paper["title"]}<br>Authors: {paper["authors"]}<br>' 261 | f'Score: {paper["Relevancy score"]}<br>Reason: {paper["Reasons for match"]}<br>' 262 | f'Goal: {paper["Goal"]}<br>Data: {paper["Data"]}<br>Methodology: {paper["Methodology"]}<br>' 263 | f'Experiments & Results: {paper["Experiments & Results"]}<br>Git: {paper["Git"]}<br>' 264 | f'Discussion & Next steps: {paper["Discussion & Next steps"]}' 265 | for paper in relevancy 266 | ] 267 | ) 268 | if hallucination: 269 | body = ( 270 | "Warning: the model hallucinated some papers. We have tried to remove them, but the scores may not be accurate.<br><br>" 271 | + body 272 | ) 273 | else: 274 | body = "<br><br>".join( 275 | [
276 | f'Title: {paper["title"]}<br>Authors: {paper["authors"]}' 277 | for paper in papers 278 | ] 279 | ) 280 | return body 281 | 282 | def get_date(): 283 | today = date.today() 284 | formatted_date = today.strftime("%d%m%Y") 285 | return formatted_date 286 | 287 | if __name__ == "__main__": 288 | # Load the .env file. 289 | load_dotenv() 290 | parser = argparse.ArgumentParser() 291 | parser.add_argument( 292 | "--config", help="yaml config file to use", default="config.yaml" 293 | ) 294 | args = parser.parse_args() 295 | with open(args.config, "r") as f: 296 | config = yaml.safe_load(f) 297 | 298 | if "OPENAI_API_KEY" not in os.environ: 299 | raise RuntimeError("No openai api key found") 300 | openai.api_key = os.environ.get("OPENAI_API_KEY") 301 | 302 | topic = config["topic"] 303 | categories = config["categories"] 304 | from_email = os.environ.get("FROM_EMAIL") 305 | to_email = os.environ.get("TO_EMAIL") 306 | threshold = config["threshold"] 307 | interest = config["interest"] 308 | body = generate_body(topic, categories, interest, threshold) 309 | today_date = get_date() 310 | with open(f"digest_{today_date}.html", "w") as f: 311 | f.write(body) 312 | if os.environ.get("SENDGRID_API_KEY", None): 313 | sg = SendGridAPIClient(api_key=os.environ.get("SENDGRID_API_KEY")) 314 | from_email = Email(from_email) # Change to your verified sender 315 | to_email = To(to_email) 316 | subject = date.today().strftime("Personalized arXiv Digest, %d %b %Y") 317 | content = Content("text/html", body) 318 | mail = Mail(from_email, to_email, subject, content) 319 | mail_json = mail.get() 320 | 321 | # Send an HTTP POST request to /mail/send 322 | response = sg.client.mail.send.post(request_body=mail_json) 323 | if response.status_code >= 200 and response.status_code <= 300: 324 | print("Send test email: Success!") 325 | else: 326 | print(f"Send test email: Failure ({response.status_code}, {response.text})") 327 | else: 328 | print("No sendgrid api key found. Skipping email") 329 | -------------------------------------------------------------------------------- /src/anthropic_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Anthropic/Claude API integration for ArxivDigest. 3 | This module provides functions to work with Anthropic's Claude API for paper analysis. 4 | """ 5 | import os 6 | import json 7 | import logging 8 | import time 9 | from typing import List, Dict, Any, Optional 10 | 11 | try: 12 | import anthropic 13 | from anthropic.types import MessageParam 14 | ANTHROPIC_AVAILABLE = True 15 | except ImportError: 16 | ANTHROPIC_AVAILABLE = False 17 | 18 | # Configure logging 19 | logging.basicConfig(level=logging.INFO) 20 | logger = logging.getLogger(__name__) 21 | 22 | class ClaudeConfig: 23 | """Configuration for Claude API calls.""" 24 | def __init__( 25 | self, 26 | temperature: float = 0.5, 27 | max_tokens: int = 4000, 28 | top_p: float = 0.95, 29 | top_k: int = 40 30 | ): 31 | self.temperature = temperature 32 | self.max_tokens = max_tokens 33 | self.top_p = top_p 34 | self.top_k = top_k 35 | 36 | def setup_anthropic_api(api_key: str) -> bool: 37 | """ 38 | Setup the Anthropic API with the provided API key. 39 | 40 | Args: 41 | api_key: Anthropic API key 42 | 43 | Returns: 44 | bool: True if setup was successful, False otherwise 45 | """ 46 | if not ANTHROPIC_AVAILABLE: 47 | logger.error("Anthropic package not installed.
Run 'pip install anthropic'") 48 | return False 49 | 50 | if not api_key: 51 | logger.error("No Anthropic API key provided") 52 | return False 53 | 54 | try: 55 | # Initialize client to test connection 56 | client = anthropic.Anthropic(api_key=api_key) 57 | # Test API connection by listing models 58 | models = client.models.list() 59 | available_models = [model.id for model in models.data] 60 | logger.info(f"Successfully connected to Anthropic API. Available models: {available_models}") 61 | return True 62 | except Exception as e: 63 | logger.error(f"Failed to setup Anthropic API: {e}") 64 | return False 65 | 66 | def get_claude_client(api_key: str) -> Optional[anthropic.Anthropic]: 67 | """ 68 | Get an Anthropic client with the given API key. 69 | 70 | Args: 71 | api_key: Anthropic API key 72 | 73 | Returns: 74 | Anthropic client or None if not available 75 | """ 76 | if not ANTHROPIC_AVAILABLE: 77 | return None 78 | 79 | try: 80 | client = anthropic.Anthropic(api_key=api_key) 81 | return client 82 | except Exception as e: 83 | logger.error(f"Failed to get Anthropic client: {e}") 84 | return None 85 | 86 | def analyze_papers_with_claude( 87 | papers: List[Dict[str, Any]], 88 | query: Dict[str, str], 89 | config: Optional[ClaudeConfig] = None, 90 | model_name: str = "claude-3.5-sonnet-20240620", 91 | api_key: str = None 92 | ) -> List[Dict[str, Any]]: 93 | """ 94 | Analyze papers using Claude. 95 | 96 | Args: 97 | papers: List of paper dictionaries 98 | query: Dictionary with 'interest' key describing research interests 99 | config: ClaudeConfig object 100 | model_name: Name of the Claude model to use 101 | api_key: Anthropic API key (optional if already configured elsewhere) 102 | 103 | Returns: 104 | List of papers with added analysis 105 | """ 106 | if not ANTHROPIC_AVAILABLE: 107 | logger.error("Anthropic package not installed. Cannot analyze papers.") 108 | return papers 109 | 110 | if not config: 111 | config = ClaudeConfig() 112 | 113 | # Get client 114 | if api_key: 115 | client = get_claude_client(api_key) 116 | else: 117 | # Try to get from environment 118 | api_key = os.environ.get("ANTHROPIC_API_KEY", "") 119 | if not api_key: 120 | logger.error("No Anthropic API key provided") 121 | return papers 122 | client = get_claude_client(api_key) 123 | 124 | if not client: 125 | return papers 126 | 127 | analyzed_papers = [] 128 | 129 | for paper in papers: 130 | try: 131 | # Prepare system prompt 132 | system_prompt = f""" 133 | You are a research assistant analyzing academic papers in AI and ML. 134 | You provide comprehensive, accurate and unbiased analysis based on the user's research interests. 135 | Your responses should be well-structured and factual, focusing on the paper's strengths, weaknesses, and relevance. 
136 | """ 137 | 138 | # Prepare user prompt 139 | user_prompt = f""" 140 | Analyze this paper and provide insights based on the following research interests: 141 | 142 | Research interests: {query['interest']} 143 | 144 | Paper details: 145 | Title: {paper['title']} 146 | Authors: {paper['authors']} 147 | Abstract: {paper['abstract']} 148 | Content: {paper['content'][:5000] if 'content' in paper else 'Not available'} 149 | 150 | Please provide your response as a single JSON object with the following structure: 151 | {{ 152 | "Relevancy score": 1-10 (higher = more relevant), 153 | "Reasons for match": "Detailed explanation of why this paper matches the interests", 154 | "Key innovations": "List the main contributions of the paper", 155 | "Critical analysis": "Evaluate strengths and weaknesses", 156 | "Goal": "What problem does the paper address?", 157 | "Data": "Description of datasets used", 158 | "Methodology": "Technical approach and methods", 159 | "Implementation details": "Model architecture, hyperparameters, etc.", 160 | "Experiments & Results": "Key findings and comparisons", 161 | "Discussion & Next steps": "Limitations and future work", 162 | "Related work": "Connection to similar research", 163 | "Practical applications": "Real-world uses of this research", 164 | "Key takeaways": ["Point 1", "Point 2", "Point 3"] 165 | }} 166 | 167 | Format your response as a valid JSON object and nothing else. 168 | """ 169 | 170 | # Just log that we're sending a prompt to Claude 171 | print(f"Sending prompt to Claude for paper: {paper['title'][:50]}...") 172 | 173 | # Create message 174 | messages: List[MessageParam] = [ 175 | { 176 | "role": "user", 177 | "content": user_prompt 178 | } 179 | ] 180 | 181 | # Call the API 182 | response = client.messages.create( 183 | model=model_name, 184 | max_tokens=config.max_tokens, 185 | temperature=config.temperature, 186 | system=system_prompt, 187 | messages=messages 188 | ) 189 | 190 | # Extract and parse the response 191 | response_text = response.content[0].text if response.content else "" 192 | 193 | # Try to extract JSON 194 | try: 195 | start_idx = response_text.find('{') 196 | end_idx = response_text.rfind('}') + 1 197 | if start_idx >= 0 and end_idx > start_idx: 198 | json_str = response_text[start_idx:end_idx] 199 | claude_analysis = json.loads(json_str) 200 | 201 | # Add Claude analysis to paper 202 | paper['claude_analysis'] = claude_analysis 203 | 204 | # Directly copy fields to paper 205 | for key, value in claude_analysis.items(): 206 | paper[key] = value 207 | else: 208 | logger.warning(f"Could not extract JSON from Claude response for paper {paper['title']}") 209 | paper['claude_analysis'] = {"error": "Failed to parse response"} 210 | except json.JSONDecodeError: 211 | logger.warning(f"Failed to parse Claude response as JSON for paper {paper['title']}") 212 | paper['claude_analysis'] = {"error": "Failed to parse response"} 213 | 214 | analyzed_papers.append(paper) 215 | 216 | # Avoid rate limiting 217 | time.sleep(1) 218 | 219 | except Exception as e: 220 | logger.error(f"Claude API error: {e}") 221 | paper['claude_analysis'] = {"error": f"Claude API error: {str(e)}"} 222 | analyzed_papers.append(paper) 223 | 224 | return analyzed_papers 225 | 226 | def get_claude_interpretability_analysis(paper: Dict[str, Any], model_name: str = "claude-3.5-sonnet-20240620", api_key: str = None) -> Dict[str, Any]: 227 | """ 228 | Get specialized mechanistic interpretability analysis for a paper using Claude. 
229 | 230 | Args: 231 | paper: Paper dictionary 232 | model_name: Claude model to use 233 | api_key: Anthropic API key (optional if already configured elsewhere) 234 | 235 | Returns: 236 | Dictionary with interpretability analysis 237 | """ 238 | if not ANTHROPIC_AVAILABLE: 239 | return {"error": "Anthropic package not installed"} 240 | 241 | # Get client 242 | if api_key: 243 | client = get_claude_client(api_key) 244 | else: 245 | # Try to get from environment 246 | api_key = os.environ.get("ANTHROPIC_API_KEY", "") 247 | if not api_key: 248 | return {"error": "No Anthropic API key provided"} 249 | client = get_claude_client(api_key) 250 | 251 | if not client: 252 | return {"error": "Failed to initialize Anthropic client"} 253 | 254 | try: 255 | # Prepare system prompt 256 | system_prompt = """ 257 | You are a specialist in mechanistic interpretability and AI alignment. 258 | Provide a thorough analysis of research papers with focus on interpretability methods, 259 | circuit analysis, and how the work relates to understanding AI systems. 260 | """ 261 | 262 | # Prepare the prompt 263 | user_prompt = f""" 264 | Analyze this paper from a mechanistic interpretability perspective: 265 | 266 | Title: {paper['title']} 267 | Authors: {paper['authors']} 268 | Abstract: {paper['abstract']} 269 | Content: {paper['content'][:7000] if 'content' in paper else paper['abstract']} 270 | 271 | Please return your analysis as a JSON object with the following fields: 272 | 273 | {{ 274 | "interpretability_score": 1-10 (how relevant is this to mechanistic interpretability), 275 | "key_methods": "Main interpretability techniques used or proposed", 276 | "circuit_analysis": "Any findings about neural circuits or components", 277 | "relevance_to_alignment": "How this work contributes to AI alignment", 278 | "novel_insights": "New perspectives on model internals", 279 | "limitations": "Limitations of the interpretability methods", 280 | "potential_extensions": "How this work could be extended", 281 | "connection_to_other_work": "Relationship to other interpretability papers" 282 | }} 283 | 284 | Respond with only the JSON. 285 | """ 286 | 287 | # Create message 288 | messages: List[MessageParam] = [ 289 | { 290 | "role": "user", 291 | "content": user_prompt 292 | } 293 | ] 294 | 295 | # Call the API 296 | response = client.messages.create( 297 | model=model_name, 298 | max_tokens=4000, 299 | temperature=0.3, 300 | system=system_prompt, 301 | messages=messages 302 | ) 303 | 304 | # Extract and parse the response 305 | response_text = response.content[0].text if response.content else "" 306 | 307 | # Try to extract JSON 308 | try: 309 | # Find the JSON part in the response 310 | start_idx = response_text.find('{') 311 | end_idx = response_text.rfind('}') + 1 312 | if start_idx >= 0 and end_idx > start_idx: 313 | json_str = response_text[start_idx:end_idx] 314 | analysis = json.loads(json_str) 315 | return analysis 316 | else: 317 | return {"error": "Could not extract JSON from response"} 318 | except json.JSONDecodeError: 319 | return {"error": "Failed to parse response as JSON"} 320 | 321 | except Exception as e: 322 | return {"error": f"Claude API error: {str(e)}"} -------------------------------------------------------------------------------- /src/design_finder/main.py: -------------------------------------------------------------------------------- 1 | """ 2 | Main module for design_finder. 
3 | Run with: python -m src.design_finder 4 | """ 5 | import os 6 | import sys 7 | import json 8 | import argparse 9 | import datetime 10 | import logging 11 | from typing import List, Dict, Any 12 | 13 | # Add parent directory to path to import from sibling modules 14 | current_dir = os.path.dirname(os.path.abspath(__file__)) 15 | parent_dir = os.path.dirname(os.path.dirname(current_dir)) 16 | if parent_dir not in sys.path: 17 | sys.path.append(parent_dir) 18 | 19 | from src.download_new_papers import get_papers, _download_new_papers 20 | from src.design_automation import ( 21 | is_design_automation_paper, 22 | categorize_design_paper, 23 | analyze_design_techniques, 24 | extract_design_metrics 25 | ) 26 | from src.paths import DATA_DIR, DIGEST_DIR 27 | 28 | # Configure logging 29 | logging.basicConfig( 30 | level=logging.INFO, 31 | format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' 32 | ) 33 | logger = logging.getLogger(__name__) 34 | 35 | # Default arXiv categories to search 36 | DEFAULT_CATEGORIES = [ 37 | "cs.CV", # Computer Vision 38 | "cs.GR", # Graphics 39 | "cs.HC", # Human-Computer Interaction 40 | "cs.AI", # Artificial Intelligence 41 | "cs.LG", # Machine Learning 42 | "cs.CL", # Computation and Language (NLP) 43 | "cs.MM", # Multimedia 44 | "cs.SD", # Sound 45 | "cs.RO", # Robotics (for interactive design) 46 | "cs.CY" # Computers and Society 47 | ] 48 | 49 | def get_date_range(days_back: int = 7) -> List[str]: 50 | """ 51 | Get a list of dates for the past N days in arXiv format. 52 | 53 | Args: 54 | days_back: Number of days to look back 55 | 56 | Returns: 57 | List of date strings in arXiv format 58 | """ 59 | today = datetime.datetime.now() 60 | dates = [] 61 | 62 | for i in range(days_back): 63 | date = today - datetime.timedelta(days=i) 64 | date_str = date.strftime("%a, %d %b %y") 65 | dates.append(date_str) 66 | 67 | return dates 68 | 69 | def ensure_data_files(categories: List[str], days_back: int = 7) -> None: 70 | """ 71 | Make sure data files exist for the specified categories and date range. 72 | 73 | Args: 74 | categories: List of arXiv category codes 75 | days_back: Number of days to look back 76 | """ 77 | dates = get_date_range(days_back) 78 | 79 | for category in categories: 80 | for date_str in dates: 81 | # Add a delay between requests to avoid being blocked 82 | time.sleep(1) 83 | file_path = os.path.join(DATA_DIR, f"{category}_{date_str}.jsonl") 84 | 85 | if not os.path.exists(file_path): 86 | logger.info(f"Downloading papers for {category} on {date_str}") 87 | try: 88 | _download_new_papers(category) 89 | except Exception as e: 90 | logger.error(f"Error downloading {category} papers for {date_str}: {e}") 91 | 92 | def get_design_papers(categories: List[str], days_back: int = 7) -> List[Dict[str, Any]]: 93 | """ 94 | Get design automation papers from specified categories over a date range. 
95 | 96 | Args: 97 | categories: List of arXiv category codes 98 | days_back: Number of days to look back 99 | 100 | Returns: 101 | List of design automation papers 102 | """ 103 | # Ensure data files exist 104 | ensure_data_files(categories, days_back) 105 | 106 | # Collect papers 107 | all_papers = [] 108 | dates = get_date_range(days_back) 109 | 110 | for category in categories: 111 | for date_str in dates: 112 | try: 113 | papers = get_papers(category) 114 | all_papers.extend(papers) 115 | except Exception as e: 116 | logger.warning(f"Could not get papers for {category} on {date_str}: {e}") 117 | 118 | # Remove duplicates (papers can appear in multiple categories) 119 | unique_papers = {} 120 | for paper in all_papers: 121 | paper_id = paper.get("main_page", "").split("/")[-1] 122 | if paper_id and paper_id not in unique_papers: 123 | unique_papers[paper_id] = paper 124 | 125 | # Filter design automation papers 126 | design_papers = [] 127 | for paper_id, paper in unique_papers.items(): 128 | if is_design_automation_paper(paper): 129 | paper["paper_id"] = paper_id 130 | paper["design_category"] = categorize_design_paper(paper) 131 | paper["design_techniques"] = analyze_design_techniques(paper) 132 | paper["design_metrics"] = extract_design_metrics(paper) 133 | design_papers.append(paper) 134 | 135 | # Sort by date (newest first) 136 | design_papers.sort(key=lambda p: p.get("main_page", ""), reverse=True) 137 | 138 | return design_papers 139 | 140 | def print_paper_summary(paper: Dict[str, Any]) -> None: 141 | """ 142 | Print a nice summary of a paper to the console. 143 | 144 | Args: 145 | paper: Paper dictionary 146 | """ 147 | print(f"\n{'=' * 80}") 148 | print(f"TITLE: {paper.get('title', 'No title')}") 149 | print(f"AUTHORS: {paper.get('authors', 'No authors')}") 150 | print(f"URL: {paper.get('main_page', 'No URL')}") 151 | print(f"DESIGN CATEGORY: {paper.get('design_category', 'Unknown')}") 152 | print(f"TECHNIQUES: {', '.join(paper.get('design_techniques', []))}") 153 | print(f"METRICS: {', '.join(paper.get('design_metrics', []))}") 154 | print(f"\nABSTRACT: {paper.get('abstract', 'No abstract')[:500]}...") 155 | print(f"{'=' * 80}\n") 156 | 157 | def generate_html_report(papers: List[Dict[str, Any]], output_file: str) -> None: 158 | """ 159 | Generate an HTML report from papers. 160 | 161 | Args: 162 | papers: List of paper dictionaries 163 | output_file: Path to output HTML file 164 | """ 165 | html = f""" 166 | 167 | 168 | 169 | 170 | Design Automation Papers 171 | 186 | 187 | 188 |

<h1>Design Automation Papers</h1>
189 |     <div class="summary">
190 |     <p>Found {len(papers)} papers related to graphic design automation with AI/ML</p>
191 |     <p>Generated on {datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}</p>
192 |     </div>
193 |     """
194 | 
195 |     # Count categories and techniques
196 |     categories = {}
197 |     techniques = {}
198 | 
199 |     for paper in papers:
200 |         category = paper.get("design_category", "Uncategorized")
201 |         if category in categories:
202 |             categories[category] += 1
203 |         else:
204 |             categories[category] = 1
205 | 
206 |         for technique in paper.get("design_techniques", []):
207 |             if technique in techniques:
208 |                 techniques[technique] += 1
209 |             else:
210 |                 techniques[technique] = 1
211 | 
212 |     # Add summary statistics
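    # At this point `categories` and `techniques` each map a label to the number of
    # matching papers (e.g. {"Layout Generation": 3}); the summary lists rendered
    # below are built directly from these tallies.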

213 |     html += "<h2>Summary Statistics</h2>"
214 | 
215 |     html += "<h3>Categories:</h3><ul>" + "".join(
216 |         f"<li>{category}: {count}</li>"
217 |         for category, count in categories.items()
218 |     ) + "</ul>"
219 | 
220 |     html += "<h3>Techniques:</h3><ul>" + "".join(
221 |         f"<li>{technique}: {count}</li>"
222 |         for technique, count in techniques.items()
223 |     ) + "</ul>"
224 | 
225 |     # Add papers
226 |     for paper in papers:
227 |         publish_date = paper.get("main_page", "").split("/")[-1][:4]  # Extract YYMM from id
228 | 
229 |         html += f"""
230 |     <div class="paper">
231 |         <h2>{paper.get("title", "No title")}</h2>
232 |         <p class="authors">{paper.get("authors", "Unknown authors")}</p>
233 |         <p>arXiv ID: {paper.get("paper_id", "Unknown")}</p>
234 |         <p>Category: {paper.get("design_category", "General")} | Subject: {paper.get("subjects", "N/A")}</p>
235 |         <p>Techniques: {', '.join(paper.get("design_techniques", ["None identified"]))}</p>
236 |         <p>Evaluation metrics: {', '.join(paper.get("design_metrics", ["None identified"]))}</p>
237 |         <p>Abstract: {paper.get("abstract", "No abstract available")}</p>
238 |     </div>
239 | """ 240 | 241 | html += """ 242 | 245 | 246 | 247 | """ 248 | 249 | with open(output_file, "w") as f: 250 | f.write(html) 251 | 252 | logger.info(f"HTML report generated: {output_file}") 253 | 254 | def main(): 255 | """Main function for the design finder module.""" 256 | parser = argparse.ArgumentParser(description="Find the latest graphic design automation papers.") 257 | parser.add_argument("--days", type=int, default=7, help="Number of days to look back") 258 | parser.add_argument("--output", type=str, default="design_papers.json", help="Output JSON file path") 259 | parser.add_argument("--html", type=str, default="design_papers.html", help="Output HTML file path") 260 | parser.add_argument("--categories", type=str, nargs="+", default=DEFAULT_CATEGORIES, 261 | help="arXiv categories to search") 262 | parser.add_argument("--keyword", type=str, help="Additional keyword to filter papers") 263 | parser.add_argument("--technique", type=str, help="Filter by specific technique") 264 | parser.add_argument("--category", type=str, help="Filter by specific design category") 265 | args = parser.parse_args() 266 | 267 | logger.info(f"Looking for design papers in the past {args.days} days") 268 | logger.info(f"Searching categories: {', '.join(args.categories)}") 269 | 270 | # DATA_DIR is already created by paths.py 271 | 272 | # Get design papers 273 | design_papers = get_design_papers(args.categories, args.days) 274 | 275 | # Apply additional filters if specified 276 | if args.keyword: 277 | keyword = args.keyword.lower() 278 | design_papers = [ 279 | p for p in design_papers 280 | if keyword in p.get("title", "").lower() or 281 | keyword in p.get("abstract", "").lower() 282 | ] 283 | logger.info(f"Filtered by keyword '{args.keyword}': {len(design_papers)} papers remaining") 284 | 285 | if args.technique: 286 | technique = args.technique.lower() 287 | design_papers = [ 288 | p for p in design_papers 289 | if any(technique in t.lower() for t in p.get("design_techniques", [])) 290 | ] 291 | logger.info(f"Filtered by technique '{args.technique}': {len(design_papers)} papers remaining") 292 | 293 | if args.category: 294 | category = args.category.lower() 295 | design_papers = [ 296 | p for p in design_papers 297 | if category in p.get("design_category", "").lower() 298 | ] 299 | logger.info(f"Filtered by category '{args.category}': {len(design_papers)} papers remaining") 300 | 301 | logger.info(f"Found {len(design_papers)} design automation papers") 302 | 303 | # Print summary to console 304 | for paper in design_papers[:10]: # Print top 10 305 | print_paper_summary(paper) 306 | 307 | if len(design_papers) > 10: 308 | print(f"...and {len(design_papers) - 10} more papers.") 309 | 310 | # Save to JSON file in data directory 311 | output_path = os.path.join(DATA_DIR, args.output) 312 | with open(output_path, "w") as f: 313 | json.dump(design_papers, f, indent=2) 314 | 315 | logger.info(f"Saved {len(design_papers)} papers to {output_path}") 316 | 317 | # Generate HTML report in digest directory 318 | html_path = os.path.join(DIGEST_DIR, args.html) 319 | generate_html_report(design_papers, html_path) 320 | 321 | print(f"\nResults saved to {output_path} and {html_path}") 322 | 323 | if __name__ == "__main__": 324 | main() -------------------------------------------------------------------------------- /src/model_manager.py: -------------------------------------------------------------------------------- 1 | """ 2 | Model Manager module to handle different LLM providers. 
3 | This provides a unified interface for working with different LLM providers. 4 | """ 5 | import os 6 | import json 7 | import logging 8 | import time 9 | from typing import Dict, List, Any, Optional, Union, Tuple 10 | from enum import Enum 11 | 12 | import openai 13 | try: 14 | import google.generativeai as genai 15 | GEMINI_AVAILABLE = True 16 | except ImportError: 17 | GEMINI_AVAILABLE = False 18 | 19 | try: 20 | import anthropic 21 | ANTHROPIC_AVAILABLE = True 22 | except ImportError: 23 | ANTHROPIC_AVAILABLE = False 24 | 25 | # Configure logging 26 | logging.basicConfig(level=logging.INFO) 27 | logger = logging.getLogger(__name__) 28 | 29 | class ModelProvider(Enum): 30 | OPENAI = "openai" 31 | GEMINI = "gemini" 32 | ANTHROPIC = "anthropic" 33 | 34 | class ModelManager: 35 | """Manager for handling different LLM providers.""" 36 | 37 | def __init__(self): 38 | self.providers = {} 39 | self.available_models = {} 40 | 41 | def register_openai(self, api_key: str) -> bool: 42 | """Register OpenAI as a provider.""" 43 | if not api_key: 44 | logger.error("No OpenAI API key provided") 45 | return False 46 | 47 | try: 48 | openai.api_key = api_key 49 | # Test API connection 50 | models = openai.Model.list() 51 | self.providers[ModelProvider.OPENAI] = True 52 | self.available_models[ModelProvider.OPENAI] = [model.id for model in models.data] 53 | logger.info(f"Successfully connected to OpenAI API. Available models: {self.available_models[ModelProvider.OPENAI]}") 54 | return True 55 | except Exception as e: 56 | logger.error(f"Failed to setup OpenAI API: {e}") 57 | return False 58 | 59 | def register_gemini(self, api_key: str) -> bool: 60 | """Register Gemini as a provider.""" 61 | if not GEMINI_AVAILABLE: 62 | logger.error("Gemini package not installed. Run 'pip install google-generativeai'") 63 | return False 64 | 65 | if not api_key: 66 | logger.error("No Gemini API key provided") 67 | return False 68 | 69 | try: 70 | genai.configure(api_key=api_key) 71 | # Test API connection 72 | models = genai.list_models() 73 | self.providers[ModelProvider.GEMINI] = True 74 | self.available_models[ModelProvider.GEMINI] = [m.name for m in models if 'generateContent' in m.supported_generation_methods] 75 | logger.info(f"Successfully connected to Gemini API. Available models: {self.available_models[ModelProvider.GEMINI]}") 76 | return True 77 | except Exception as e: 78 | logger.error(f"Failed to setup Gemini API: {e}") 79 | return False 80 | 81 | def register_anthropic(self, api_key: str) -> bool: 82 | """Register Anthropic/Claude as a provider.""" 83 | if not ANTHROPIC_AVAILABLE: 84 | logger.error("Anthropic package not installed. Run 'pip install anthropic'") 85 | return False 86 | 87 | if not api_key: 88 | logger.error("No Anthropic API key provided") 89 | return False 90 | 91 | try: 92 | self.anthropic_client = anthropic.Anthropic(api_key=api_key) 93 | # Test API connection by listing models 94 | models = self.anthropic_client.models.list() 95 | self.providers[ModelProvider.ANTHROPIC] = True 96 | self.available_models[ModelProvider.ANTHROPIC] = [model.id for model in models.data] 97 | logger.info(f"Successfully connected to Anthropic API. 
Available models: {self.available_models[ModelProvider.ANTHROPIC]}") 98 | return True 99 | except Exception as e: 100 | logger.error(f"Failed to setup Anthropic API: {e}") 101 | return False 102 | 103 | def is_provider_available(self, provider: ModelProvider) -> bool: 104 | """Check if a provider is available.""" 105 | return provider in self.providers and self.providers[provider] 106 | 107 | def get_available_providers(self) -> List[ModelProvider]: 108 | """Get a list of available providers.""" 109 | return [provider for provider in self.providers if self.providers[provider]] 110 | 111 | def get_provider_models(self, provider: ModelProvider) -> List[str]: 112 | """Get available models for a provider.""" 113 | if provider in self.available_models: 114 | return self.available_models[provider] 115 | return [] 116 | 117 | def analyze_papers( 118 | self, 119 | papers: List[Dict[str, Any]], 120 | query: Dict[str, str], 121 | providers: List[ModelProvider] = None, 122 | model_names: Dict[ModelProvider, str] = None, 123 | threshold_score: int = 7, 124 | ) -> Tuple[List[Dict[str, Any]], bool]: 125 | """ 126 | Analyze papers using multiple model providers. 127 | 128 | Args: 129 | papers: List of paper dictionaries 130 | query: Dictionary with 'interest' key describing research interests 131 | providers: List of providers to use (defaults to all available) 132 | model_names: Dictionary mapping providers to model names 133 | threshold_score: Minimum score for a paper to be considered relevant 134 | 135 | Returns: 136 | Tuple of (list of papers with analysis, hallucination flag) 137 | """ 138 | if not providers: 139 | providers = self.get_available_providers() 140 | 141 | if not model_names: 142 | model_names = {} 143 | 144 | # Default model names if not specified 145 | default_models = { 146 | ModelProvider.OPENAI: "gpt-3.5-turbo-16k", 147 | ModelProvider.GEMINI: "gemini-1.5-flash", 148 | ModelProvider.ANTHROPIC: "claude-3.5-sonnet-20240620" 149 | } 150 | 151 | # Use default models if not specified 152 | for provider in providers: 153 | if provider not in model_names: 154 | model_names[provider] = default_models.get(provider) 155 | 156 | # Check if any providers are available 157 | if not any(self.is_provider_available(provider) for provider in providers): 158 | logger.error("No available providers for paper analysis") 159 | return papers, False 160 | 161 | analyzed_papers = [] 162 | hallucination = False 163 | 164 | # Import the modules here to avoid circular imports 165 | if ModelProvider.OPENAI in providers and self.is_provider_available(ModelProvider.OPENAI): 166 | from relevancy import generate_relevance_score 167 | try: 168 | analyzed_papers, hallu = generate_relevance_score( 169 | papers, 170 | query=query, 171 | model_name=model_names[ModelProvider.OPENAI], 172 | threshold_score=threshold_score, 173 | num_paper_in_prompt=2 174 | ) 175 | hallucination = hallucination or hallu 176 | except Exception as e: 177 | logger.error(f"Error analyzing papers with OpenAI: {e}") 178 | 179 | # Add Gemini analysis if available 180 | if ModelProvider.GEMINI in providers and self.is_provider_available(ModelProvider.GEMINI): 181 | # Import locally to avoid circular imports 182 | from gemini_utils import analyze_papers_with_gemini 183 | 184 | try: 185 | if not analyzed_papers: # If OpenAI analysis failed or was not used 186 | analyzed_papers = papers 187 | 188 | analyzed_papers = analyze_papers_with_gemini( 189 | analyzed_papers, 190 | query=query, 191 | model_name=model_names[ModelProvider.GEMINI] 192 | ) 193 | 
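                # The same `analyzed_papers` list is threaded through each provider,
                # so the Claude pass below operates on papers already annotated by the
                # OpenAI and Gemini analyses above.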
except Exception as e: 194 | logger.error(f"Error analyzing papers with Gemini: {e}") 195 | 196 | # Add Anthropic/Claude analysis if available 197 | if ModelProvider.ANTHROPIC in providers and self.is_provider_available(ModelProvider.ANTHROPIC): 198 | # Import locally to avoid circular imports 199 | from anthropic_utils import analyze_papers_with_claude 200 | 201 | try: 202 | if not analyzed_papers: # If previous analyses failed or were not used 203 | analyzed_papers = papers 204 | 205 | analyzed_papers = analyze_papers_with_claude( 206 | analyzed_papers, 207 | query=query, 208 | model_name=model_names[ModelProvider.ANTHROPIC] 209 | ) 210 | except Exception as e: 211 | logger.error(f"Error analyzing papers with Claude: {e}") 212 | 213 | return analyzed_papers, hallucination 214 | 215 | def get_mechanistic_interpretability_analysis( 216 | self, 217 | paper: Dict[str, Any], 218 | provider: ModelProvider = None, 219 | model_name: str = None 220 | ) -> Dict[str, Any]: 221 | """ 222 | Get specialized mechanistic interpretability analysis for a paper. 223 | 224 | Args: 225 | paper: Paper dictionary 226 | provider: Provider to use (defaults to first available) 227 | model_name: Model name to use 228 | 229 | Returns: 230 | Dictionary with mechanistic interpretability analysis 231 | """ 232 | # Import interpretability analysis functions 233 | from interpretability_analysis import ( 234 | create_analysis_prompt, 235 | extract_json_from_text, 236 | analyze_interpretability_circuits, 237 | get_paper_relation_to_ai_safety 238 | ) 239 | 240 | if not provider: 241 | available_providers = self.get_available_providers() 242 | if not available_providers: 243 | logger.error("No available providers for mechanistic interpretability analysis") 244 | return {"error": "No available providers"} 245 | provider = available_providers[0] 246 | 247 | if not model_name: 248 | # Use more powerful models for specialized analysis 249 | default_models = { 250 | ModelProvider.OPENAI: "gpt-4o", 251 | ModelProvider.GEMINI: "gemini-2.0-flash", 252 | ModelProvider.ANTHROPIC: "claude-3.5-sonnet-20240620" 253 | } 254 | model_name = default_models.get(provider) 255 | 256 | if not self.is_provider_available(provider): 257 | logger.error(f"Provider {provider} is not available") 258 | return {"error": f"Provider {provider} is not available"} 259 | 260 | # Get specialized prompt 261 | prompt = create_analysis_prompt(paper, "mechanistic_interpretability") 262 | 263 | # Process based on provider 264 | if provider == ModelProvider.OPENAI: 265 | try: 266 | response = openai.ChatCompletion.create( 267 | model=model_name, 268 | messages=[ 269 | {"role": "system", "content": "You are a specialist in mechanistic interpretability and AI safety."}, 270 | {"role": "user", "content": prompt} 271 | ], 272 | temperature=0.3, 273 | max_tokens=2048 274 | ) 275 | 276 | # Extract JSON from response 277 | content = response.choices[0].message.content 278 | analysis = extract_json_from_text(content) 279 | 280 | # Add additional circuit analysis if there's no error 281 | if "error" not in analysis: 282 | analysis = analyze_interpretability_circuits(paper, analysis) 283 | analysis["ai_safety_relation"] = get_paper_relation_to_ai_safety(paper) 284 | 285 | return analysis 286 | 287 | except Exception as e: 288 | logger.error(f"Error getting mechanistic interpretability analysis with OpenAI: {e}") 289 | return {"error": f"OpenAI error: {str(e)}"} 290 | 291 | elif provider == ModelProvider.GEMINI and GEMINI_AVAILABLE: 292 | try: 293 | model = 
genai.GenerativeModel(model_name) 294 | response = model.generate_content(prompt) 295 | 296 | # Extract JSON from response 297 | content = response.text 298 | analysis = extract_json_from_text(content) 299 | 300 | # Add additional circuit analysis if there's no error 301 | if "error" not in analysis: 302 | analysis = analyze_interpretability_circuits(paper, analysis) 303 | analysis["ai_safety_relation"] = get_paper_relation_to_ai_safety(paper) 304 | 305 | return analysis 306 | 307 | except Exception as e: 308 | logger.error(f"Error getting mechanistic interpretability analysis with Gemini: {e}") 309 | return {"error": f"Gemini error: {str(e)}"} 310 | 311 | elif provider == ModelProvider.ANTHROPIC and ANTHROPIC_AVAILABLE: 312 | try: 313 | response = self.anthropic_client.messages.create( 314 | model=model_name, 315 | max_tokens=2048, 316 | temperature=0.3, 317 | system="You are a specialist in mechanistic interpretability and AI safety.", 318 | messages=[ 319 | {"role": "user", "content": prompt} 320 | ] 321 | ) 322 | 323 | # Extract JSON from response 324 | content = response.content[0].text 325 | analysis = extract_json_from_text(content) 326 | 327 | # Add additional circuit analysis if there's no error 328 | if "error" not in analysis: 329 | analysis = analyze_interpretability_circuits(paper, analysis) 330 | analysis["ai_safety_relation"] = get_paper_relation_to_ai_safety(paper) 331 | 332 | return analysis 333 | 334 | except Exception as e: 335 | logger.error(f"Error getting mechanistic interpretability analysis with Claude: {e}") 336 | return {"error": f"Claude error: {str(e)}"} 337 | 338 | return {"error": "Unsupported provider or configuration"} 339 | 340 | def analyze_design_automation( 341 | self, 342 | paper: Dict[str, Any], 343 | provider: ModelProvider = None, 344 | model_name: str = None 345 | ) -> Dict[str, Any]: 346 | """ 347 | Get specialized analysis for design automation papers. 
348 | 349 | Args: 350 | paper: Paper dictionary 351 | provider: Provider to use (defaults to first available) 352 | model_name: Model name to use 353 | 354 | Returns: 355 | Dictionary with design automation analysis 356 | """ 357 | # Import design automation functions 358 | from design_automation import ( 359 | create_design_analysis_prompt, 360 | extract_design_capabilities 361 | ) 362 | from interpretability_analysis import extract_json_from_text 363 | 364 | if not provider: 365 | available_providers = self.get_available_providers() 366 | if not available_providers: 367 | logger.error("No available providers for design automation analysis") 368 | return {"error": "No available providers"} 369 | provider = available_providers[0] 370 | 371 | if not model_name: 372 | # Use appropriate models for design analysis 373 | default_models = { 374 | ModelProvider.OPENAI: "gpt-4o", 375 | ModelProvider.GEMINI: "gemini-2.0-flash", 376 | ModelProvider.ANTHROPIC: "claude-3.5-sonnet-20240620" 377 | } 378 | model_name = default_models.get(provider) 379 | 380 | if not self.is_provider_available(provider): 381 | logger.error(f"Provider {provider} is not available") 382 | return {"error": f"Provider {provider} is not available"} 383 | 384 | # Get specialized prompt 385 | prompt = create_design_analysis_prompt(paper) 386 | 387 | # Process based on provider 388 | try: 389 | analysis = None 390 | 391 | if provider == ModelProvider.OPENAI: 392 | response = openai.ChatCompletion.create( 393 | model=model_name, 394 | messages=[ 395 | {"role": "system", "content": "You are a specialist in AI for design automation."}, 396 | {"role": "user", "content": prompt} 397 | ], 398 | temperature=0.3, 399 | max_tokens=2048 400 | ) 401 | content = response.choices[0].message.content 402 | analysis = extract_json_from_text(content) 403 | 404 | elif provider == ModelProvider.GEMINI and GEMINI_AVAILABLE: 405 | model = genai.GenerativeModel(model_name) 406 | response = model.generate_content(prompt) 407 | content = response.text 408 | analysis = extract_json_from_text(content) 409 | 410 | elif provider == ModelProvider.ANTHROPIC and ANTHROPIC_AVAILABLE: 411 | response = self.anthropic_client.messages.create( 412 | model=model_name, 413 | max_tokens=2048, 414 | temperature=0.3, 415 | system="You are a specialist in AI for design automation.", 416 | messages=[ 417 | {"role": "user", "content": prompt} 418 | ] 419 | ) 420 | content = response.content[0].text 421 | analysis = extract_json_from_text(content) 422 | 423 | # Enhance analysis with design capabilities if successful 424 | if analysis and "error" not in analysis: 425 | capabilities = extract_design_capabilities(analysis) 426 | analysis["capabilities"] = capabilities 427 | 428 | return analysis or {"error": "Failed to generate analysis"} 429 | 430 | except Exception as e: 431 | logger.error(f"Error analyzing design automation paper: {e}") 432 | return {"error": f"Analysis error: {str(e)}"} 433 | 434 | # Create a singleton instance 435 | model_manager = ModelManager() -------------------------------------------------------------------------------- /src/design/design_finder.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Design Finder - A self-contained script to find AI/ML design automation papers on arXiv. 4 | 5 | This script requires only Python standard libraries and BeautifulSoup, making it very easy to run 6 | without complex dependencies. 
7 | 8 | Usage: 9 | python design_finder.py [--days 7] [--output design_papers.json] 10 | """ 11 | 12 | import os 13 | import sys 14 | import json 15 | import argparse 16 | import datetime 17 | import re 18 | import time 19 | import urllib.request 20 | from typing import List, Dict, Any 21 | 22 | # Check for BeautifulSoup 23 | try: 24 | from bs4 import BeautifulSoup as bs 25 | except ImportError: 26 | print("BeautifulSoup not found. Installing...") 27 | import subprocess 28 | subprocess.check_call([sys.executable, "-m", "pip", "install", "beautifulsoup4"]) 29 | from bs4 import BeautifulSoup as bs 30 | 31 | # Default arXiv categories to search 32 | DEFAULT_CATEGORIES = [ 33 | "cs.CV", # Computer Vision 34 | "cs.GR", # Graphics 35 | "cs.HC", # Human-Computer Interaction 36 | "cs.AI", # Artificial Intelligence 37 | "cs.LG", # Machine Learning 38 | "cs.CL", # Computation and Language (NLP) 39 | "cs.MM" # Multimedia 40 | ] 41 | 42 | # Design automation keywords for paper filtering 43 | DESIGN_AUTOMATION_KEYWORDS = [ 44 | "design automation", "layout generation", "visual design", "graphic design", 45 | "creative AI", "generative design", "UI generation", "UX automation", 46 | "design system", "composition", "creative workflow", "automated design", 47 | "design tool", "design assistant", "design optimization", "content-aware", 48 | "user interface generation", "visual layout", "image composition", "AI design" 49 | ] 50 | 51 | class DesignPaperFinder: 52 | def __init__(self, days_back=7, categories=None, output_file="design_papers.json", 53 | html_file="design_papers.html", keyword=None, verbose=True): 54 | self.days_back = days_back 55 | self.categories = categories or DEFAULT_CATEGORIES 56 | self.output_file = output_file 57 | self.html_file = html_file 58 | self.keyword = keyword 59 | self.verbose = verbose 60 | self.papers = [] 61 | 62 | # Data directory is already created by paths.py module 63 | 64 | def log(self, message): 65 | """Print a message if verbose mode is enabled.""" 66 | if self.verbose: 67 | print(message) 68 | 69 | def get_date_range(self) -> List[str]: 70 | """Get list of dates to search in arXiv format.""" 71 | today = datetime.datetime.now() 72 | dates = [] 73 | 74 | for i in range(self.days_back): 75 | date = today - datetime.timedelta(days=i) 76 | date_str = date.strftime("%a, %d %b %y") 77 | dates.append(date_str) 78 | 79 | return dates 80 | 81 | def download_papers(self, category: str, date_str: str) -> List[Dict[str, Any]]: 82 | """Download papers for a specific category and date.""" 83 | # Check if we already have this data 84 | # Import data directory at runtime to avoid circular imports 85 | import sys 86 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 87 | from paths import DATA_DIR 88 | file_path = os.path.join(DATA_DIR, f"{category}_{date_str}.jsonl") 89 | if os.path.exists(file_path): 90 | self.log(f"Loading cached papers for {category} on {date_str}") 91 | papers = [] 92 | with open(file_path, "r") as f: 93 | for line in f: 94 | papers.append(json.loads(line)) 95 | return papers 96 | 97 | # Download new papers 98 | self.log(f"Downloading papers for {category} on {date_str}") 99 | NEW_SUB_URL = f'https://arxiv.org/list/{category}/new' 100 | 101 | try: 102 | page = urllib.request.urlopen(NEW_SUB_URL) 103 | except Exception as e: 104 | self.log(f"Error downloading from {NEW_SUB_URL}: {e}") 105 | return [] 106 | 107 | soup = bs(page, 'html.parser') 108 | content = soup.body.find("div", {'id': 'content'}) 109 | 110 | # Find the date 
heading 111 | h3 = content.find("h3").text # e.g: New submissions for Wed, 10 May 23 112 | date_from_page = h3.replace("New submissions for", "").strip() 113 | 114 | # Find all papers 115 | dt_list = content.dl.find_all("dt") 116 | dd_list = content.dl.find_all("dd") 117 | arxiv_base = "https://arxiv.org/abs/" 118 | arxiv_html = "https://arxiv.org/html/" 119 | 120 | papers = [] 121 | for i in range(len(dt_list)): 122 | try: 123 | paper = {} 124 | ahref = dt_list[i].find('a', href=re.compile(r'[/]([a-z]|[A-Z])\w+')).attrs['href'] 125 | paper_number = ahref.strip().replace("/abs/", "") 126 | 127 | paper['main_page'] = arxiv_base + paper_number 128 | paper['pdf'] = arxiv_base.replace('abs', 'pdf') + paper_number 129 | 130 | paper['title'] = dd_list[i].find("div", {"class": "list-title mathjax"}).text.replace("Title:\n", "").strip() 131 | paper['authors'] = dd_list[i].find("div", {"class": "list-authors"}).text.replace("Authors:\n", "").replace("\n", "").strip() 132 | paper['subjects'] = dd_list[i].find("div", {"class": "list-subjects"}).text.replace("Subjects:\n", "").strip() 133 | paper['abstract'] = dd_list[i].find("p", {"class": "mathjax"}).text.replace("\n", " ").strip() 134 | 135 | # Get a short excerpt of content (optional) 136 | try: 137 | html = urllib.request.urlopen(arxiv_html + paper_number + "v1") 138 | soup_content = bs(html, 'html.parser') 139 | content_div = soup_content.find('div', attrs={'class': 'ltx_page_content'}) 140 | if content_div: 141 | para_list = content_div.find_all("div", attrs={'class': 'ltx_para'}) 142 | excerpt = ' '.join([p.text.strip() for p in para_list[:3]]) # Get first 3 paragraphs 143 | paper['content_excerpt'] = excerpt[:1000] + "..." if len(excerpt) > 1000 else excerpt 144 | else: 145 | paper['content_excerpt'] = "Content not available" 146 | except Exception: 147 | paper['content_excerpt'] = "" 148 | 149 | papers.append(paper) 150 | except Exception as e: 151 | if self.verbose: 152 | self.log(f"Error processing paper {i}: {e}") 153 | 154 | # Save papers to file 155 | with open(file_path, "w") as f: 156 | for paper in papers: 157 | f.write(json.dumps(paper) + "\n") 158 | 159 | return papers 160 | 161 | def is_design_automation_paper(self, paper: Dict[str, Any]) -> bool: 162 | """Check if a paper is related to design automation based on keywords.""" 163 | text = ( 164 | (paper.get("title", "") + " " + 165 | paper.get("abstract", "") + " " + 166 | paper.get("subjects", "")).lower() 167 | ) 168 | 169 | return any(keyword.lower() in text for keyword in DESIGN_AUTOMATION_KEYWORDS) 170 | 171 | def categorize_design_paper(self, paper: Dict[str, Any]) -> str: 172 | """Categorize design automation paper into subcategories.""" 173 | text = (paper.get("title", "") + " " + paper.get("abstract", "")).lower() 174 | 175 | categories = { 176 | "Layout Generation": ["layout", "composition", "arrange", "grid"], 177 | "UI/UX Design": ["user interface", "ui", "ux", "interface design", "website"], 178 | "Graphic Design": ["graphic design", "poster", "visual design", "typography"], 179 | "Image Manipulation": ["image editing", "photo", "manipulation", "style transfer"], 180 | "Design Tools": ["tool", "assistant", "workflow", "productivity"], 181 | "3D Design": ["3d", "modeling", "cad", "product design"], 182 | "Multimodal Design": ["multimodal", "text-to-image", "image-to-code"] 183 | } 184 | 185 | matches = [] 186 | for category, keywords in categories.items(): 187 | if any(keyword in text for keyword in keywords): 188 | matches.append(category) 189 | 190 | if matches: 191 
| return ", ".join(matches) 192 | return "General Design Automation" 193 | 194 | def analyze_design_techniques(self, paper: Dict[str, Any]) -> List[str]: 195 | """Extract AI/ML techniques used for design automation in the paper.""" 196 | text = (paper.get("title", "") + " " + paper.get("abstract", "")).lower() 197 | 198 | techniques = [] 199 | technique_keywords = { 200 | "Generative Adversarial Networks": ["gan", "generative adversarial"], 201 | "Diffusion Models": ["diffusion", "ddpm", "stable diffusion"], 202 | "Transformers": ["transformer", "attention mechanism"], 203 | "Reinforcement Learning": ["reinforcement learning", "rl"], 204 | "Computer Vision": ["computer vision", "vision", "cnn"], 205 | "Graph Neural Networks": ["graph neural", "gnn"], 206 | "Large Language Models": ["llm", "large language model", "gpt", "chatgpt"], 207 | "Neural Style Transfer": ["style transfer", "neural style"], 208 | "Evolutionary Algorithms": ["genetic algorithm", "evolutionary"] 209 | } 210 | 211 | for technique, keywords in technique_keywords.items(): 212 | if any(keyword in text for keyword in keywords): 213 | techniques.append(technique) 214 | 215 | return techniques 216 | 217 | def find_papers(self): 218 | """Find design automation papers from arXiv.""" 219 | self.log(f"Looking for design papers in the past {self.days_back} days") 220 | self.log(f"Searching categories: {', '.join(self.categories)}") 221 | 222 | # Get papers for each category and date 223 | dates = self.get_date_range() 224 | all_papers = [] 225 | 226 | for category in self.categories: 227 | for date_str in dates: 228 | try: 229 | papers = self.download_papers(category, date_str) 230 | all_papers.extend(papers) 231 | # Avoid hitting arXiv rate limits 232 | time.sleep(3) 233 | except Exception as e: 234 | self.log(f"Error downloading papers for {category} on {date_str}: {e}") 235 | 236 | # Remove duplicates (papers can appear in multiple categories) 237 | unique_papers = {} 238 | for paper in all_papers: 239 | paper_id = paper.get("main_page", "").split("/")[-1] 240 | if paper_id and paper_id not in unique_papers: 241 | unique_papers[paper_id] = paper 242 | 243 | all_papers = list(unique_papers.values()) 244 | 245 | # Filter for design automation papers 246 | design_papers = [] 247 | for paper in all_papers: 248 | if self.is_design_automation_paper(paper): 249 | paper["design_category"] = self.categorize_design_paper(paper) 250 | paper["design_techniques"] = self.analyze_design_techniques(paper) 251 | design_papers.append(paper) 252 | 253 | # Additional keyword filtering if specified 254 | if self.keyword: 255 | keyword = self.keyword.lower() 256 | design_papers = [ 257 | p for p in design_papers 258 | if keyword in p.get("title", "").lower() or 259 | keyword in p.get("abstract", "").lower() 260 | ] 261 | 262 | # Sort by date 263 | design_papers.sort(key=lambda p: p.get("main_page", ""), reverse=True) 264 | 265 | self.papers = design_papers 266 | self.log(f"Found {len(design_papers)} design automation papers") 267 | return design_papers 268 | 269 | def print_paper_summary(self, paper: Dict[str, Any]): 270 | """Print a nice summary of a paper to the console.""" 271 | print(f"\n{'=' * 80}") 272 | print(f"TITLE: {paper.get('title', 'No title')}") 273 | print(f"AUTHORS: {paper.get('authors', 'No authors')}") 274 | print(f"URL: {paper.get('main_page', 'No URL')}") 275 | print(f"DESIGN CATEGORY: {paper.get('design_category', 'Unknown')}") 276 | print(f"TECHNIQUES: {', '.join(paper.get('design_techniques', []))}") 277 | print(f"\nABSTRACT: 
{paper.get('abstract', 'No abstract')[:500]}...") 278 | print(f"{'=' * 80}\n") 279 | 280 | def generate_html_report(self): 281 | """Generate an HTML report from papers.""" 282 | if not self.papers: 283 | self.log("No papers to generate HTML report from") 284 | return 285 | 286 | html = f""" 287 | 288 | 289 | 290 | 291 | Design Automation Papers 292 | 309 | 310 | 311 |

<h1>Design Automation Papers</h1>
312 |     <div class="summary">
316 | 
317 |     <p>Found {len(self.papers)} papers related to graphic design automation with AI/ML</p>
318 |     <p>Generated on {datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}</p>
319 |     <p>Keywords: {', '.join(DESIGN_AUTOMATION_KEYWORDS[:5])}...</p>
320 |     </div>
321 |         """
322 | 
323 |         # Count categories and techniques
324 |         categories = {}
325 |         techniques = {}
326 | 
327 |         for paper in self.papers:
328 |             category = paper.get("design_category", "Uncategorized")
329 |             if category in categories:
330 |                 categories[category] += 1
331 |             else:
332 |                 categories[category] = 1
333 | 
334 |             for technique in paper.get("design_techniques", []):
335 |                 if technique in techniques:
336 |                     techniques[technique] += 1
337 |                 else:
338 |                     techniques[technique] = 1
339 | 
340 |         # Add summary statistics
341 |         html += '<h2>Summary Statistics</h2>'
342 | 
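        # Summary statistics come first (category/technique tallies), followed by
        # one card per paper with links to the arXiv abstract and PDF.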

343 |         html += "<h3>Categories:</h3><ul>" + "".join(
344 |             f"<li>{category}: {count}</li>"
345 |             for category, count in categories.items()
346 |         ) + "</ul>"
347 | 
348 |         html += "<h3>Techniques:</h3><ul>" + "".join(
349 |             f"<li>{technique}: {count}</li>"
350 |             for technique, count in techniques.items()
351 |         ) + "</ul>"
352 | 
353 |         # Add papers
354 |         html += '<h2>Papers</h2>'
355 |         for paper in self.papers:
356 |             html += f"""
357 |     <div class="paper">
358 |         <h2>{paper.get("title", "No title")}</h2>
359 |         <p class="authors">{paper.get("authors", "Unknown authors")}</p>
360 |         <p>Category: {paper.get("design_category", "General")} | Subject: {paper.get("subjects", "N/A")}</p>
361 |         <p>Techniques: {', '.join(paper.get("design_techniques", ["None identified"]))}</p>
362 |         <p>Abstract: {paper.get("abstract", "No abstract available")}</p>
363 |         <p class="links">
364 |             <a href="{paper.get("pdf", "#")}">PDF</a> |
365 |             <a href="{paper.get("main_page", "#")}">arXiv</a>
366 |         </p>
367 |     </div>
368 | """ 369 | 370 | html += """ 371 | 374 | 375 | 376 | """ 377 | 378 | with open(self.html_file, "w") as f: 379 | f.write(html) 380 | 381 | self.log(f"HTML report generated: {self.html_file}") 382 | 383 | def save_json(self): 384 | """Save papers to JSON file.""" 385 | if not self.papers: 386 | self.log("No papers to save") 387 | return 388 | 389 | with open(self.output_file, "w") as f: 390 | json.dump(self.papers, f, indent=2) 391 | 392 | self.log(f"Saved {len(self.papers)} papers to {self.output_file}") 393 | 394 | def run(self): 395 | """Run the full paper finding process.""" 396 | self.find_papers() 397 | 398 | if not self.papers: 399 | print("No design automation papers found.") 400 | return 401 | 402 | # Print summary of top papers 403 | for paper in self.papers[:10]: # Print top 10 404 | self.print_paper_summary(paper) 405 | 406 | if len(self.papers) > 10: 407 | print(f"...and {len(self.papers) - 10} more papers.") 408 | 409 | # Save outputs 410 | self.save_json() 411 | self.generate_html_report() 412 | 413 | print(f"\nResults saved to {self.output_file} and {self.html_file}") 414 | print(f"Open {self.html_file} in your browser to view the report.") 415 | 416 | def main(): 417 | parser = argparse.ArgumentParser(description="Find the latest graphic design automation papers.") 418 | parser.add_argument("--days", type=int, default=7, help="Number of days to look back") 419 | parser.add_argument("--output", type=str, default="design_papers.json", help="Output file path") 420 | parser.add_argument("--html", type=str, default="design_papers.html", help="HTML output file path") 421 | parser.add_argument("--categories", type=str, nargs="+", default=DEFAULT_CATEGORIES, 422 | help="arXiv categories to search") 423 | parser.add_argument("--keyword", type=str, help="Additional keyword to filter papers") 424 | parser.add_argument("--quiet", action="store_true", help="Suppress progress messages") 425 | args = parser.parse_args() 426 | 427 | finder = DesignPaperFinder( 428 | days_back=args.days, 429 | categories=args.categories, 430 | output_file=args.output, 431 | html_file=args.html, 432 | keyword=args.keyword, 433 | verbose=not args.quiet 434 | ) 435 | 436 | finder.run() 437 | 438 | if __name__ == "__main__": 439 | main() -------------------------------------------------------------------------------- /src/design/find_design_papers.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Standalone Design Papers Crawler - A simple script to find the latest papers 4 | on graphic design automation using AI/ML/LLM technologies. 5 | 6 | This version has minimal dependencies and doesn't require the full model setup. 
7 | 8 | Usage: 9 | python find_design_papers.py [--days 7] [--output design_papers.json] 10 | """ 11 | 12 | import os 13 | import sys 14 | import json 15 | import argparse 16 | import datetime 17 | import logging 18 | import re 19 | import urllib.request 20 | import time 21 | from typing import List, Dict, Any, Optional, Tuple 22 | from bs4 import BeautifulSoup as bs 23 | 24 | # Add parent directory to path to allow imports from sibling modules 25 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 26 | from paths import DATA_DIR, DIGEST_DIR 27 | from model_manager import model_manager, ModelProvider 28 | 29 | # Configure logging 30 | logging.basicConfig( 31 | level=logging.INFO, 32 | format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' 33 | ) 34 | logger = logging.getLogger(__name__) 35 | 36 | # Default arXiv categories to search 37 | DEFAULT_CATEGORIES = [ 38 | "cs.CV", # Computer Vision 39 | "cs.GR", # Graphics 40 | "cs.HC", # Human-Computer Interaction 41 | "cs.AI", # Artificial Intelligence 42 | "cs.LG", # Machine Learning 43 | "cs.CL", # Computation and Language (NLP) 44 | "cs.MM" # Multimedia 45 | ] 46 | 47 | # Design automation keywords for paper filtering 48 | DESIGN_AUTOMATION_KEYWORDS = [ 49 | "design automation", "layout generation", "visual design", "graphic design", 50 | "creative AI", "generative design", "UI generation", "UX automation", 51 | "design system", "composition", "creative workflow", "automated design", 52 | "design tool", "design assistant", "design optimization", "content-aware", 53 | "user interface generation", "visual layout", "image composition" 54 | ] 55 | 56 | def download_papers(category: str, date_str: str = None) -> List[Dict[str, Any]]: 57 | """ 58 | Download papers for a specific category and date. 
59 | 60 | Args: 61 | category: arXiv category code 62 | date_str: Date string in arXiv format (default: today) 63 | 64 | Returns: 65 | List of paper dictionaries 66 | """ 67 | if not date_str: 68 | date = datetime.datetime.now() 69 | date_str = date.strftime("%a, %d %b %y") 70 | 71 | # Data directory is already created by paths.py 72 | pass 73 | 74 | # Check if we already have this data 75 | file_path = os.path.join(DATA_DIR, f"{category}_{date_str}.jsonl") 76 | if os.path.exists(file_path): 77 | papers = [] 78 | with open(file_path, "r") as f: 79 | for line in f: 80 | papers.append(json.loads(line)) 81 | return papers 82 | 83 | # Download new papers 84 | logger.info(f"Downloading papers for {category} on {date_str}") 85 | NEW_SUB_URL = f'https://arxiv.org/list/{category}/new' 86 | 87 | try: 88 | # Add user-agent header to appear more like a browser 89 | headers = { 90 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36' 91 | } 92 | req = urllib.request.Request(NEW_SUB_URL, headers=headers) 93 | page = urllib.request.urlopen(req) 94 | except Exception as e: 95 | logger.error(f"Error downloading from {NEW_SUB_URL}: {e}") 96 | return [] 97 | 98 | soup = bs(page, 'html.parser') 99 | content = soup.body.find("div", {'id': 'content'}) 100 | 101 | # Find the date heading 102 | h3 = content.find("h3").text # e.g: New submissions for Wed, 10 May 23 103 | date_from_page = h3.replace("New submissions for", "").strip() 104 | 105 | # Find all papers 106 | dt_list = content.dl.find_all("dt") 107 | dd_list = content.dl.find_all("dd") 108 | arxiv_base = "https://arxiv.org/abs/" 109 | arxiv_html = "https://arxiv.org/html/" 110 | 111 | papers = [] 112 | for i in range(len(dt_list)): 113 | try: 114 | paper = {} 115 | ahref = dt_list[i].find('a', href=re.compile(r'[/]([a-z]|[A-Z])\w+')).attrs['href'] 116 | paper_number = ahref.strip().replace("/abs/", "") 117 | 118 | paper['main_page'] = arxiv_base + paper_number 119 | paper['pdf'] = arxiv_base.replace('abs', 'pdf') + paper_number 120 | 121 | paper['title'] = dd_list[i].find("div", {"class": "list-title mathjax"}).text.replace("Title:\n", "").strip() 122 | paper['authors'] = dd_list[i].find("div", {"class": "list-authors"}).text.replace("Authors:\n", "").replace("\n", "").strip() 123 | paper['subjects'] = dd_list[i].find("div", {"class": "list-subjects"}).text.replace("Subjects:\n", "").strip() 124 | paper['abstract'] = dd_list[i].find("p", {"class": "mathjax"}).text.replace("\n", " ").strip() 125 | 126 | # Get a short excerpt of content (optional) 127 | try: 128 | # Add user-agent header to appear more like a browser 129 | headers = { 130 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36' 131 | } 132 | req = urllib.request.Request(arxiv_html + paper_number + "v1", headers=headers) 133 | html = urllib.request.urlopen(req) 134 | soup_content = bs(html, 'html.parser') 135 | content_div = soup_content.find('div', attrs={'class': 'ltx_page_content'}) 136 | if content_div: 137 | para_list = content_div.find_all("div", attrs={'class': 'ltx_para'}) 138 | excerpt = ' '.join([p.text.strip() for p in para_list[:3]]) # Get first 3 paragraphs 139 | paper['content_excerpt'] = excerpt[:1000] + "..." 
if len(excerpt) > 1000 else excerpt 140 | else: 141 | paper['content_excerpt'] = "Content not available" 142 | except Exception as e: 143 | paper['content_excerpt'] = f"Error extracting content: {str(e)}" 144 | 145 | papers.append(paper) 146 | except Exception as e: 147 | logger.warning(f"Error processing paper {i}: {e}") 148 | 149 | # Save papers to file 150 | with open(file_path, "w") as f: 151 | for paper in papers: 152 | f.write(json.dumps(paper) + "\n") 153 | 154 | return papers 155 | 156 | def is_design_automation_paper(paper: Dict[str, Any]) -> bool: 157 | """ 158 | Check if a paper is related to design automation based on keywords. 159 | 160 | Args: 161 | paper: Dictionary with paper details 162 | 163 | Returns: 164 | Boolean indicating if paper is related to design automation 165 | """ 166 | text = ( 167 | (paper.get("title", "") + " " + 168 | paper.get("abstract", "") + " " + 169 | paper.get("subjects", "")).lower() 170 | ) 171 | 172 | return any(keyword.lower() in text for keyword in DESIGN_AUTOMATION_KEYWORDS) 173 | 174 | def categorize_design_paper(paper: Dict[str, Any]) -> str: 175 | """ 176 | Categorize design automation paper into subcategories. 177 | 178 | Args: 179 | paper: Dictionary with paper details 180 | 181 | Returns: 182 | Category name string 183 | """ 184 | text = (paper.get("title", "") + " " + paper.get("abstract", "")).lower() 185 | 186 | categories = { 187 | "Layout Generation": ["layout", "composition", "arrange", "grid"], 188 | "UI/UX Design": ["user interface", "ui", "ux", "interface design", "website"], 189 | "Graphic Design": ["graphic design", "poster", "visual design", "typography"], 190 | "Image Manipulation": ["image editing", "photo", "manipulation", "style transfer"], 191 | "Design Tools": ["tool", "assistant", "workflow", "productivity"], 192 | "3D Design": ["3d", "modeling", "cad", "product design"], 193 | "Multimodal Design": ["multimodal", "text-to-image", "image-to-code"] 194 | } 195 | 196 | matches = [] 197 | for category, keywords in categories.items(): 198 | if any(keyword in text for keyword in keywords): 199 | matches.append(category) 200 | 201 | if matches: 202 | return ", ".join(matches) 203 | return "General Design Automation" 204 | 205 | def analyze_design_techniques(paper: Dict[str, Any]) -> List[str]: 206 | """ 207 | Extract AI/ML techniques used for design automation in the paper. 
208 | 209 | Args: 210 | paper: Dictionary with paper details 211 | 212 | Returns: 213 | List of techniques 214 | """ 215 | text = (paper.get("title", "") + " " + paper.get("abstract", "")).lower() 216 | 217 | techniques = [] 218 | technique_keywords = { 219 | "Generative Adversarial Networks": ["gan", "generative adversarial"], 220 | "Diffusion Models": ["diffusion", "ddpm", "stable diffusion"], 221 | "Transformers": ["transformer", "attention mechanism"], 222 | "Reinforcement Learning": ["reinforcement learning", "rl"], 223 | "Computer Vision": ["computer vision", "vision", "cnn"], 224 | "Graph Neural Networks": ["graph neural", "gnn"], 225 | "Large Language Models": ["llm", "large language model", "gpt"], 226 | "Neural Style Transfer": ["style transfer", "neural style"], 227 | "Evolutionary Algorithms": ["genetic algorithm", "evolutionary"] 228 | } 229 | 230 | for technique, keywords in technique_keywords.items(): 231 | if any(keyword in text for keyword in keywords): 232 | techniques.append(technique) 233 | 234 | return techniques 235 | 236 | def get_date_range(days_back: int = 7) -> List[str]: 237 | """ 238 | Get a list of dates for the past N days in arXiv format. 239 | 240 | Args: 241 | days_back: Number of days to look back 242 | 243 | Returns: 244 | List of date strings in arXiv format 245 | """ 246 | today = datetime.datetime.now() 247 | dates = [] 248 | 249 | for i in range(days_back): 250 | date = today - datetime.timedelta(days=i) 251 | date_str = date.strftime("%a, %d %b %y") 252 | dates.append(date_str) 253 | 254 | return dates 255 | 256 | def generate_html_report(papers: List[Dict[str, Any]], output_file: str, keyword: str = None, days_back: int = 7) -> None: 257 | """ 258 | Generate an HTML report from papers. 259 | 260 | Args: 261 | papers: List of paper dictionaries 262 | output_file: Path to output HTML file 263 | keyword: Optional keyword used for filtering 264 | days_back: Number of days searched 265 | """ 266 | # Ensure the output directory exists 267 | output_dir = os.path.dirname(output_file) 268 | if output_dir and not os.path.exists(output_dir): 269 | os.makedirs(output_dir, exist_ok=True) 270 | 271 | # Create a title that includes any keywords and date 272 | title_date = datetime.datetime.now().strftime("%B %d, %Y") 273 | page_title = "Design Automation Papers" 274 | if keyword: 275 | page_title = f"Design Automation Papers - {keyword.title()} - {title_date}" 276 | else: 277 | page_title = f"Design Automation Papers - {title_date}" 278 | 279 | html = f""" 280 | 281 | 282 | 283 | 284 | {page_title} 285 | 304 | 305 | 306 |

def generate_html_report(papers: List[Dict[str, Any]], output_file: str, keyword: str = None, days_back: int = 7) -> None:
    """
    Generate an HTML report from papers.

    Args:
        papers: List of paper dictionaries
        output_file: Path to output HTML file
        keyword: Optional keyword used for filtering
        days_back: Number of days searched
    """
    # Ensure the output directory exists
    output_dir = os.path.dirname(output_file)
    if output_dir and not os.path.exists(output_dir):
        os.makedirs(output_dir, exist_ok=True)

    # Create a title that includes any keywords and date
    title_date = datetime.datetime.now().strftime("%B %d, %Y")
    page_title = "Design Automation Papers"
    if keyword:
        page_title = f"Design Automation Papers - {keyword.title()} - {title_date}"
    else:
        page_title = f"Design Automation Papers - {title_date}"

    # Document head and report header
    html = f"""
    <!DOCTYPE html>
    <html>
    <head>
        <meta charset="utf-8">
        <title>{page_title}</title>
    </head>
    <body>
        <h1>Design Automation Papers</h1>
        <p>Found {len(papers)} papers related to graphic design automation with AI/ML</p>
        <p>Generated on {datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}</p>
    """

    # Count categories and techniques
    categories = {}
    techniques = {}

    for paper in papers:
        category = paper.get("design_category", "Uncategorized")
        if category in categories:
            categories[category] += 1
        else:
            categories[category] = 1

        for technique in paper.get("design_techniques", []):
            if technique in techniques:
                techniques[technique] += 1
            else:
                techniques[technique] = 1

    # Add summary statistics
    html += "<h2>Summary Statistics</h2>"

    html += "<h3>Categories:</h3><ul>"
    for category, count in categories.items():
        html += f"<li>{category}: {count}</li>"
    html += "</ul>"

    html += "<h3>Techniques:</h3><ul>"
    for technique, count in techniques.items():
        html += f"<li>{technique}: {count}</li>"
    html += "</ul>"

    # Add papers
    for paper in papers:
        html += f"""
        <div class="paper">
        <h2>{paper.get("title", "No title")}</h2>
        <p>{paper.get("authors", "Unknown authors")}</p>
        <p>Category: {paper.get("design_category", "General")} | Subject: {paper.get("subjects", "N/A")}</p>
        <p>Techniques: {', '.join(paper.get("design_techniques", ["None identified"]))}</p>
        """

        # Add relevancy score and reasons if available
        if "Relevancy score" in paper:
            html += f'<p>Relevancy Score: {paper.get("Relevancy score", "N/A")}</p>'

        if "Reasons for match" in paper:
            html += f'<p>Reason: {paper.get("Reasons for match", "")}</p>'

        # Add abstract
        if "abstract" in paper:
            html += f'<p>Abstract: {paper.get("abstract", "")}</p>'

        # Add all the additional analysis sections
        for key, value in paper.items():
            if key in ["title", "authors", "subjects", "main_page", "Relevancy score", "Reasons for match",
                       "design_category", "design_techniques", "content", "abstract"]:
                continue

            if isinstance(value, str) and value.strip():
                html += f'<p>{key}: {value}</p>'

        # Add links
        html += f"""
        <p><a href="{paper.get('main_page', '')}">View on arXiv</a></p>
        </div>
        """

    html += f"""
    </body>
    </html>
    """

    with open(output_file, "w") as f:
        f.write(html)

    logger.info(f"HTML report generated: {output_file}")
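
# Illustrative usage sketch (not part of the original module): the report relies on the
# fields added earlier in the pipeline (design_category, design_techniques) plus optional
# LLM fields such as "Relevancy score"; any other string-valued key is rendered as an
# extra analysis section. The paper dict and output path below are hypothetical.
#
#   demo = [{
#       "title": "LayoutGAN for poster design", "authors": "A. Author", "subjects": "cs.CV",
#       "abstract": "...", "main_page": "https://arxiv.org/abs/0000.00000",
#       "design_category": "Layout Generation",
#       "design_techniques": ["Generative Adversarial Networks"],
#   }]
#   generate_html_report(demo, "digest/demo_report.html", keyword="layout", days_back=3)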

def print_paper_summary(paper: Dict[str, Any]) -> None:
    """
    Print a nice summary of a paper to the console.

    Args:
        paper: Paper dictionary
    """
    print(f"\n{'=' * 80}")
    print(f"TITLE: {paper.get('title', 'No title')}")
    print(f"AUTHORS: {paper.get('authors', 'No authors')}")
    print(f"URL: {paper.get('main_page', 'No URL')}")
    print(f"DESIGN CATEGORY: {paper.get('design_category', 'Unknown')}")
    print(f"TECHNIQUES: {', '.join(paper.get('design_techniques', []))}")
    print(f"\nABSTRACT: {paper.get('abstract', 'No abstract')[:500]}...")
    print(f"{'=' * 80}\n")

def analyze_papers_with_llm(papers: List[Dict[str, Any]], research_interest: str) -> List[Dict[str, Any]]:
    """
    Analyze papers using LLM to provide detailed analysis.

    Args:
        papers: List of paper dictionaries
        research_interest: Description of research interests

    Returns:
        Enhanced list of papers with detailed analysis
    """
    if not papers:
        return papers

    # Check if model_manager is properly initialized
    if not model_manager.is_provider_available(ModelProvider.OPENAI):
        # Try to get OpenAI key from environment
        import os
        openai_key = os.environ.get("OPENAI_API_KEY")
        if openai_key:
            model_manager.register_openai(openai_key)
        else:
            logger.warning("No OpenAI API key available. Skipping detailed analysis.")
            return papers

    logger.info(f"Analyzing {len(papers)} papers with LLM...")

    # Default research interest for design papers if none provided
    if not research_interest:
        research_interest = """
        I'm interested in papers that use AI/ML for design automation, including:
        1. Generative design systems for graphics, UI/UX, and layouts
        2. ML-enhanced creative tools and design assistants
        3. Novel techniques for automating design processes
        4. Human-AI collaborative design workflows
        5. Applications of LLMs, diffusion models, and GANs to design tasks
        """

    # Analyze papers using model_manager
    try:
        analyzed_papers, _ = model_manager.analyze_papers(
            papers,
            query={"interest": research_interest},
            providers=[ModelProvider.OPENAI],
            model_names={ModelProvider.OPENAI: "gpt-3.5-turbo-16k"},
            threshold_score=0  # Include all papers, even low scored ones
        )
        return analyzed_papers
    except Exception as e:
        logger.error(f"Error during LLM analysis: {e}")
        return papers

def pre_filter_category(category: str, keyword: str = None) -> bool:
    """
    Check if a category is likely to contain design-related papers
    to avoid downloading irrelevant categories.

    Args:
        category: arXiv category code
        keyword: Optional search keyword

    Returns:
        Boolean indicating whether to include this category
    """
    # Always include these categories as they're highly relevant
    high_relevance = ["cs.GR", "cs.HC", "cs.CV", "cs.MM", "cs.SD"]

    if category in high_relevance:
        return True

    # If we have a keyword, we need to be less strict to avoid missing papers
    if keyword:
        return True

    # Medium relevance categories - include for comprehensive searches
    medium_relevance = ["cs.AI", "cs.LG", "cs.CL", "cs.RO", "cs.CY"]
    return category in medium_relevance
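
# Illustrative usage sketch (not part of the original module): without a keyword only the
# high- and medium-relevance arXiv categories survive pre-filtering; passing a keyword
# forces every category through so no matching papers are missed.
#
#   pre_filter_category("cs.GR")                      # True  (high relevance)
#   pre_filter_category("cs.LG")                      # True  (medium relevance)
#   pre_filter_category("math.CO")                    # False (skipped to limit downloads)
#   pre_filter_category("math.CO", keyword="layout")  # True  (keyword forces inclusion)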

def main():
    parser = argparse.ArgumentParser(description="Find the latest graphic design automation papers.")
    parser.add_argument("--days", type=int, default=7, help="Number of days to look back")
    parser.add_argument("--output", type=str, help="Output JSON file path (date will be added automatically)")
    parser.add_argument("--html", type=str, help="HTML output file path (date will be added automatically)")
    parser.add_argument("--categories", type=str, nargs="+", default=DEFAULT_CATEGORIES,
                        help="arXiv categories to search")
    parser.add_argument("--keyword", type=str, help="Additional keyword to filter papers")
    parser.add_argument("--analyze", action="store_true", help="Use LLM to perform detailed analysis of papers")
    parser.add_argument("--interest", type=str, help="Research interest description for LLM analysis")
    parser.add_argument("--model", type=str, default="gpt-3.5-turbo-16k", help="Model to use for analysis")
    parser.add_argument("--no-date", action="store_true", help="Disable adding date to filenames")
    args = parser.parse_args()

    # Generate date string for filenames
    current_date = datetime.datetime.now().strftime("%Y%m%d")

    # Set default filenames with dates if not provided
    if args.output is None:
        base_filename = "design_papers"
        if args.keyword:
            # Add keyword to filename if provided
            base_filename = f"design_papers_{args.keyword.lower().replace(' ', '_')}"

        if not args.no_date:
            args.output = os.path.join(DATA_DIR, f"{base_filename}_{current_date}.json")
        else:
            args.output = os.path.join(DATA_DIR, f"{base_filename}.json")

    if args.html is None:
        base_filename = "design_papers"
        if args.keyword:
            # Add keyword to filename if provided
            base_filename = f"design_papers_{args.keyword.lower().replace(' ', '_')}"

        if not args.no_date:
            args.html = os.path.join(DIGEST_DIR, f"{base_filename}_{current_date}.html")
        else:
            args.html = os.path.join(DIGEST_DIR, f"{base_filename}.html")

    logger.info(f"Looking for design papers in the past {args.days} days")

    # Apply pre-filtering to categories
    filtered_categories = [cat for cat in args.categories if pre_filter_category(cat, args.keyword)]
    logger.info(f"Pre-filtered categories: {', '.join(filtered_categories)}")

    # Get papers for each category and date
    dates = get_date_range(args.days)
    all_papers = []

    for category in filtered_categories:
        for date_str in dates:
            try:
                papers = download_papers(category, date_str)
                # Apply keyword filter immediately if provided
                if args.keyword:
                    keyword = args.keyword.lower()
                    papers = [
                        p for p in papers
                        if keyword in p.get("title", "").lower() or
                           keyword in p.get("abstract", "").lower() or
                           keyword in p.get("subjects", "").lower()
                    ]
                    logger.info(f"Found {len(papers)} papers matching keyword '{args.keyword}' in {category}")

                all_papers.extend(papers)
                # Avoid hitting arXiv rate limits
                time.sleep(5)
            except Exception as e:
                logger.error(f"Error downloading papers for {category} on {date_str}: {e}")

    # Remove duplicates (papers can appear in multiple categories)
    unique_papers = {}
    for paper in all_papers:
        paper_id = paper.get("main_page", "").split("/")[-1]
        if paper_id and paper_id not in unique_papers:
            unique_papers[paper_id] = paper

    all_papers = list(unique_papers.values())

    # Filter for design automation papers
    design_papers = []
    for paper in all_papers:
        if is_design_automation_paper(paper):
            paper["design_category"] = categorize_design_paper(paper)
            paper["design_techniques"] = analyze_design_techniques(paper)
            design_papers.append(paper)

    # Sort by date
    design_papers.sort(key=lambda p: p.get("main_page", ""), reverse=True)
    logger.info(f"Found {len(design_papers)} design automation papers")

    # Add detailed analysis with LLM if requested
    if args.analyze and design_papers:
        design_papers = analyze_papers_with_llm(design_papers, args.interest)
        logger.info("Completed LLM analysis of papers")

        # Debug: Print out the analysis fields for the first paper
        if design_papers:
            logger.info(f"Paper analysis fields: {list(design_papers[0].keys())}")
            # If 'Key innovations' is present, it confirms we have the detailed analysis
            if 'Key innovations' in design_papers[0]:
                logger.info("Detailed analysis fields present!")
            else:
                logger.warning("Detailed analysis fields missing!")

    # Print summary to console
    for paper in design_papers[:10]:  # Print top 10
        print_paper_summary(paper)

    if len(design_papers) > 10:
        print(f"...and {len(design_papers) - 10} more papers.")

    # Ensure output directory exists
    output_dir = os.path.dirname(args.output)
    if output_dir and not os.path.exists(output_dir):
        os.makedirs(output_dir, exist_ok=True)

    # Save to file
    with open(args.output, "w") as f:
        json.dump(design_papers, f, indent=2)

    # Generate HTML report
    generate_html_report(design_papers, args.html, args.keyword, args.days)

    logger.info(f"Saved {len(design_papers)} papers to {args.output}")
    print(f"\nResults saved to {args.output} and {args.html}")

    if args.analyze:
        print("\nPapers have been analyzed with LLM for detailed information.")
        print("The HTML report includes comprehensive analysis of each paper.")

if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------