├── tests
│   ├── __init__.py
│   └── test_util.py
├── requirements.txt
├── LICENSE
├── pytest.ini
├── parameters_ar.yaml
├── util
│   ├── parser.py
│   ├── error_standards.py
│   └── logging_standards.py
├── .gitignore
├── parameters_sys.yaml
├── clients
│   ├── client_factory.py
│   ├── base_client.py
│   ├── apis
│   │   ├── generic.py
│   │   └── xploreapi.py
│   ├── core.py
│   ├── arxiv.py
│   └── springer.py
├── templates
│   ├── basic_search_template.yaml
│   ├── advanced_research_template.yaml
│   └── machine_learning_template.yaml
├── docs
│   ├── quick_start_guide.md
│   └── configuration_guide.md
└── README.md
/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # Tests package for SaLS project 2 | # This package contains all test modules for the project 3 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | PyYAML==6.0 2 | pandas==1.4.3 3 | pydantic==1.7.4 4 | spacy-langdetect==0.1.2 5 | spacy==3.6 6 | scipy==1.11.2 7 | gensim==4.2.0 8 | nltk==3.7 9 | lxml==4.9.1 10 | rich==12.6.0 11 | numpy~=1.23.1 12 | requests~=2.31.0 13 | beautifulsoup4==4.12.2 14 | sentence-transformers==2.5.1 15 | tqdm>=4.66.3 16 | pyparsing==3.1.2 17 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Christian Cabrera 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE.
22 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [tool:pytest] 2 | # Pytest configuration for SaLS project 3 | # This file configures pytest behavior and test discovery 4 | 5 | # Test discovery patterns 6 | testpaths = tests 7 | python_files = test_*.py 8 | python_classes = Test* 9 | python_functions = test_* 10 | 11 | # Output and reporting 12 | addopts = 13 | -v # Verbose output 14 | --tb=short # Short traceback format 15 | --strict-markers # Strict marker validation 16 | --disable-warnings # Disable warning display during tests 17 | --color=yes # Colored output 18 | 19 | # Markers for test categorization 20 | markers = 21 | unit: Unit tests for individual functions 22 | integration: Integration tests for workflows 23 | slow: Tests that take longer to run 24 | api: Tests that require external API access 25 | config: Configuration and validation tests 26 | error_handling: Error handling and recovery tests 27 | 28 | # Minimum version requirements 29 | minversion = 6.0 30 | 31 | # Test timeout (seconds) 32 | timeout = 300 33 | 34 | # Coverage configuration (if pytest-cov is installed) 35 | # addopts = --cov=util --cov=analysis --cov=clients --cov-report=html --cov-report=term-missing 36 | -------------------------------------------------------------------------------- /parameters_ar.yaml: -------------------------------------------------------------------------------- 1 | # List of queries in the format : "". & (and operator) ¦ (or operator). 2 | queries: 3 | - augmented reality: "'augmented reality' & 'edge' & 'orchestration' & 'placement'" 4 | #Synonyms of the keywords to expand the queries. 5 | augmented reality: 6 | - ar 7 | - virtual reality 8 | - vr 9 | - game 10 | orchestration: 11 | - service orchestration 12 | - service composition 13 | - composition 14 | - service choreography 15 | - choreography 16 | placement: 17 | - service placement 18 | - service offloading 19 | - offloading 20 | - resource allocation 21 | edge: 22 | - edge computing 23 | - fog computing 24 | - fog 25 | - iot 26 | # Databases where to search for papers. arXiv and Semantic Scholar by default as they are open. You can use other 27 | # repositories by uncommenting the respective lines. You should add the API access keys to the ./config.json file 28 | # in order to use them. (See step 7 in the How to run it? instructions) 29 | databases: 30 | - arxiv 31 | - semantic_scholar 32 | - springer 33 | - ieeexplore 34 | - scopus 35 | - core 36 | # Search time interval YYYY-mm-dd. If you do not want to include search dates comment start_date and 37 | # end_date parameters. 
38 | #start_date: 2010-01-01 39 | #end_date: 2022-08-01 40 | # Date of the search and folder name where the outputs will be stored 41 | search_date: 2025-09-01 42 | folder_name: ar_search 43 | -------------------------------------------------------------------------------- /util/parser.py: -------------------------------------------------------------------------------- 1 | from pyparsing import Word, alphanums, quotedString, oneOf, infixNotation, opAssoc, ParseException 2 | import logging 3 | 4 | logger = logging.getLogger('logger') 5 | # Define grammar elements 6 | # Allow common identifier characters without requiring quotes (e.g., hyphens, underscores, slashes, colons, plus, dots) 7 | identifier = Word(alphanums + "-_./:+*") 8 | string_literal = quotedString.setParseAction(lambda t: t[0][1:-1]) 9 | and_operator = oneOf("& && AND and")  # AND operator forms, following the README query syntax 10 | or_operator = oneOf("| || ¦ OR or")  # OR operator forms, following the README query syntax 11 | 12 | # Define expression grammar 13 | expression = infixNotation( 14 | identifier | string_literal, 15 | [ 16 | (and_operator, 2, opAssoc.LEFT), 17 | (or_operator, 2, opAssoc.LEFT), 18 | ] 19 | ) 20 | 21 | 22 | # Parse boolean expression function 23 | def parse_boolean_expression(expression_str): 24 | # Quick parentheses balance check ignoring text inside quotes 25 | def _balanced_parentheses(s: str) -> bool: 26 | depth = 0 27 | in_quote = False 28 | q = '' 29 | for ch in s: 30 | if in_quote: 31 | if ch == q: 32 | in_quote = False 33 | continue 34 | if ch in ('"', '\''): 35 | in_quote = True 36 | q = ch 37 | elif ch == '(': 38 | depth += 1 39 | elif ch == ')': 40 | depth -= 1 41 | if depth < 0: 42 | return False 43 | return depth == 0 and not in_quote 44 | 45 | if not _balanced_parentheses(expression_str): 46 | logger.info('Error parsing expression: unbalanced parentheses or unterminated quotes') 47 | return [], False 48 | 49 | try: 50 | parsed = expression.parseString(expression_str, parseAll=True) 51 | return parsed[0], True 52 | except ParseException as e: 53 | # Build a caret marker to indicate where parsing failed 54 | try: 55 | line, col = e.line, e.column 56 | caret = ' ' * (col - 1) + '^' 57 | logger.info('Error parsing expression at column ' + str(col) + ':') 58 | logger.info(line) 59 | logger.info(caret) 60 | logger.info(str(e)) 61 | except Exception: 62 | logger.info('Error parsing expression: ' + str(e)) 63 | return [], False 64 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | *.pyc 132 | 133 | # Pycharm 134 | .idea/ 135 | dump.json 136 | logs/ 137 | data/ 138 | .vscode/ 139 | 140 | # API Keys File 141 | config.json 142 | 143 | # Papers 144 | papers/ 145 | 146 | # Local parameter files 147 | local/ 148 | -------------------------------------------------------------------------------- /parameters_sys.yaml: -------------------------------------------------------------------------------- 1 | # List of queries in the format : "". & (and operator) ¦ (or operator). 2 | queries: 3 | - systems engineering: "'systems engineering' & ('generative ai' ¦ 'artificial intelligence')" 4 | # Syntactic filters are used to restrict the number of papers that massive repositories (e.g., springer, scopus) return 5 | # The exact terms and their synonyms must appear in the paper title or abstract to be retrieved. 6 | # If the number of returned papers is too big and syntactic filters are not provided, 7 | # the platform will skip the repository to avoid quota errors or overloading the APIs. 8 | # We advise including syntactic filters to make the retrieval process more feasible. 9 | syntactic_filters: 10 | - 'systems engineering' 11 | - 'generative ai' 12 | # Semantic filters uses LLMs to match abstracts with the provided description. 13 | # The type parameter corresponds to the embedding the semantic search uses. Bert is the open option it currently uses. 14 | # The description parameter is the text that describes the papers you are looking for. A way to create such description is to think of the ideal abstract a selected paper should have. 15 | # The score parameter is the similarity degree between the queries and the included papers. 16 | # Papers with greater or equal score are included. 
17 | semantic_filters: 18 | - type: "bert" 19 | - description: "This paper proposes a systems engineering approach to analyse, design, implement, evaluate, and deploy systems based on artificial intelligence. AI-based systems are complex, dependable, data-driven, and critical systems containing one or more components based on AI or machine learning. The systems engineering approach helps to address the challenges that AI-based components generate in the systems (e.g., lack of explainability, security issues, unreliable behaviour, lack of alignment, etc.). This help relies on systems engineering practices and principles that define models, methodologies, techniques, architectural patterns, to facilitate the integration of AI into systems." 20 | - score: 0.8 21 | #Synonyms of the keywords to expand the queries. 22 | systems engineering: 23 | - systems thinking 24 | - dependable systems 25 | - engineering ai 26 | generative ai: 27 | - llm 28 | - large language model 29 | artificial intelligence: 30 | - ai 31 | - machine learning 32 | - ml 33 | - deep learning 34 | # Databases where to search for papers. arXiv and Semantic Scholar by default as they are open. You can use other 35 | # repositories by uncommenting the respective lines. You should add the API access keys to the ./config.json file 36 | # in order to use them. (See step 7 in the How to run it? instructions) 37 | databases: 38 | - arxiv 39 | - semantic_scholar 40 | - springer 41 | - ieeexplore 42 | - scopus 43 | - core 44 | # Search time interval YYYY-mm-dd. If you do not want to include search dates comment start_date and 45 | # end_date parameters. 46 | start_date: 2017-01-01 47 | end_date: 2025-08-18 48 | # Date of the search and folder name where the outputs will be stored 49 | search_date: 2025-08-18 50 | folder_name: sys_search 51 | -------------------------------------------------------------------------------- /clients/client_factory.py: -------------------------------------------------------------------------------- 1 | from .base_client import DatabaseClient 2 | from .arxiv import ArxivClient 3 | from .springer import SpringerClient 4 | from .ieeexplore import IeeeXploreClient 5 | from .core import CoreClient 6 | from .elsevier import ElsevierClient 7 | from .semantic_scholar import SemanticScholarClient 8 | from util.error_standards import ErrorHandler, create_error_context, ErrorSeverity, ErrorCategory, get_standard_error_info 9 | from util.logging_standards import LogCategory 10 | 11 | class DatabaseClientFactory: 12 | """ 13 | Factory class for creating database clients. 14 | 15 | This provides a clean interface for creating the right client 16 | based on the database name, and makes it easy to add new databases. 17 | """ 18 | 19 | def __init__(self): 20 | self._clients = { 21 | 'arxiv': ArxivClient, 22 | 'springer': SpringerClient, 23 | 'ieeexplore': IeeeXploreClient, 24 | 'core': CoreClient, 25 | 'elsevier': ElsevierClient, 26 | 'semantic_scholar': SemanticScholarClient, 27 | } 28 | 29 | def create_client(self, database_name: str) -> DatabaseClient: 30 | """ 31 | Create a client for the specified database. 
32 | 33 | Args: 34 | database_name: Name of the database (e.g., 'arxiv', 'springer') 35 | 36 | Returns: 37 | DatabaseClient instance or None if database not supported 38 | """ 39 | if database_name not in self._clients: 40 | return None 41 | 42 | try: 43 | return self._clients[database_name]() 44 | except (ValueError, TypeError) as e: 45 | # Log the error but don't crash the system 46 | import logging 47 | logger = logging.getLogger('logger') 48 | context = create_error_context( 49 | "client_factory", "create_client", 50 | ErrorSeverity.WARNING, 51 | ErrorCategory.DATA, 52 | f"Failed to create client for {database_name} due to data type error: {type(e).__name__}: {str(e)}" 53 | ) 54 | error_info = get_standard_error_info("data_validation_failed") 55 | ErrorHandler.handle_error(e, context, error_info, logger) 56 | return None 57 | except Exception as ex: 58 | # Log the error but don't crash the system 59 | import logging 60 | logger = logging.getLogger('logger') 61 | context = create_error_context( 62 | "client_factory", "create_client", 63 | ErrorSeverity.ERROR, 64 | ErrorCategory.SYSTEM, 65 | f"Failed to create client for {database_name} due to unexpected error: {type(ex).__name__}: {str(ex)}" 66 | ) 67 | error_info = get_standard_error_info("unexpected_error") 68 | ErrorHandler.handle_error(ex, context, error_info, logger) 69 | return None 70 | 71 | def get_supported_databases(self) -> list: 72 | """Get list of supported database names.""" 73 | return list(self._clients.keys()) 74 | 75 | def is_supported(self, database_name: str) -> bool: 76 | """Check if a database is supported.""" 77 | return database_name in self._clients 78 | 79 | def register_client(self, database_name: str, client_class): 80 | """ 81 | Register a new client class for a database. 82 | 83 | This allows for dynamic registration of new clients 84 | without modifying the factory code. 
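        Example (a minimal sketch of dynamic registration; ``CrossrefClient`` is a
        hypothetical DatabaseClient subclass used only for illustration, not part
        of the codebase):

            factory = DatabaseClientFactory()
            factory.register_client('crossref', CrossrefClient)
            client = factory.create_client('crossref')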
85 | """ 86 | if not issubclass(client_class, DatabaseClient): 87 | raise ValueError(f"Client class must inherit from DatabaseClient") 88 | 89 | self._clients[database_name] = client_class 90 | -------------------------------------------------------------------------------- /templates/basic_search_template.yaml: -------------------------------------------------------------------------------- 1 | # ============================================================================= 2 | # SaLS Basic Search Configuration Template 3 | # ============================================================================= 4 | # This template provides a minimal configuration for basic literature searches 5 | # Copy this file and modify the values according to your research needs 6 | # ============================================================================= 7 | 8 | # REQUIRED: Search queries in the format : "" 9 | # Supported operators: & (AND), | (OR), and parentheses for grouping 10 | # Use quotes around multi-word terms: 'machine learning' & 'edge computing' 11 | queries: 12 | - machine learning: "'machine learning' & 'edge computing'" 13 | - artificial intelligence: "'artificial intelligence' | 'AI'" 14 | 15 | # OPTIONAL: Synonyms to expand your search queries 16 | # These will be automatically added to your searches to increase coverage 17 | machine learning: 18 | - ml 19 | - deep learning 20 | - neural networks 21 | - supervised learning 22 | - unsupervised learning 23 | 24 | artificial intelligence: 25 | - ai 26 | - machine intelligence 27 | - cognitive computing 28 | 29 | # REQUIRED: Databases to search for papers 30 | # Open databases (no API key required): arxiv, semantic_scholar 31 | # Commercial databases (API key required): springer, ieeexplore, scopus, core 32 | # See README.md for API key setup instructions 33 | databases: 34 | - arxiv # Open access, no API key needed 35 | - semantic_scholar # Open access, no API key needed 36 | # - springer # Uncomment and add API key to config.json 37 | # - ieeexplore # Uncomment and add API key to config.json 38 | # - scopus # Uncomment and add API key to config.json 39 | # - core # Uncomment and add API key to config.json 40 | 41 | # OPTIONAL: Date range for your search (YYYY-MM-DD format) 42 | # Comment out both lines to search all available papers 43 | # Including dates can significantly reduce search time and improve relevance 44 | # start_date: 2020-01-01 # Papers published from this date 45 | # end_date: 2024-12-31 # Papers published until this date 46 | 47 | # REQUIRED: Search metadata 48 | search_date: 2024-12-15 # Date when this search was performed 49 | folder_name: basic_search # Output folder name (will be created automatically) 50 | 51 | # ============================================================================= 52 | # ADVANCED FEATURES (Optional - uncomment and configure as needed) 53 | # ============================================================================= 54 | 55 | # OPTIONAL: Syntactic filters for more precise control 56 | # These terms will be required in the paper content 57 | # syntactic_filters: 58 | # - edge computing 59 | # - distributed systems 60 | 61 | # OPTIONAL: Semantic filters using AI-powered similarity matching 62 | # Format: filter_name: "detailed description of what you're looking for" 63 | # semantic_filters: 64 | # - edge computing: "Papers about edge computing, fog computing, and distributed edge systems" 65 | # - machine learning: "Research on machine learning algorithms and applications in edge 
environments" 66 | 67 | # ============================================================================= 68 | # BEST PRACTICES 69 | # ============================================================================= 70 | # 1. Start with simple queries and refine based on results 71 | # 2. Use specific terms rather than broad concepts 72 | # 3. Include synonyms to catch related research 73 | # 4. Set reasonable date ranges to focus on recent work 74 | # 5. Test with open databases first before adding commercial ones 75 | # 6. Use semantic filters for complex research areas 76 | # ============================================================================= 77 | -------------------------------------------------------------------------------- /docs/quick_start_guide.md: -------------------------------------------------------------------------------- 1 | # SaLS Quick Start Guide 2 | 3 | ## Get Started in 5 Minutes 4 | 5 | This guide will help you run your first literature search with SaLS in just a few minutes. 6 | 7 | ## Prerequisites 8 | 9 | - Python 3.8 or higher 10 | - Internet connection 11 | - Basic understanding of YAML files 12 | 13 | ## Step 1: Setup (2 minutes) 14 | 15 | ### 1.1 Clone and Install 16 | ```bash 17 | git clone https://github.com/cabrerac/semi-automatic-literature-survey.git 18 | cd semi-automatic-literature-survey 19 | python -m venv venv 20 | source venv/bin/activate # On Windows: venv\Scripts\activate 21 | pip install -r requirements.txt 22 | python -m spacy download en_core_web_sm 23 | ``` 24 | 25 | ### 1.2 Test Installation 26 | ```bash 27 | python -c "from util import util; print('✅ SaLS installed successfully!')" 28 | ``` 29 | 30 | ## Step 2: Create Your First Configuration (2 minutes) 31 | 32 | ### 2.1 Copy a Template 33 | ```bash 34 | cp templates/basic_search_template.yaml my_first_search.yaml 35 | ``` 36 | 37 | ### 2.2 Edit the Configuration 38 | Open `my_first_search.yaml` and modify these lines: 39 | 40 | ```yaml 41 | queries: 42 | - your_topic: "'your research topic' & 'key concept'" 43 | 44 | search_date: 2024-12-15 # Today's date 45 | folder_name: my_first_search # Your project name 46 | ``` 47 | 48 | **Example for machine learning research**: 49 | ```yaml 50 | queries: 51 | - deep learning: "'deep learning' & 'computer vision'" 52 | 53 | search_date: 2024-12-15 54 | folder_name: deep_learning_cv 55 | ``` 56 | 57 | ## Step 3: Run Your First Search (1 minute) 58 | 59 | ```bash 60 | python main.py my_first_search.yaml 61 | ``` 62 | 63 | ## What Happens Next? 64 | 65 | 1. **Paper Retrieval**: SaLS searches selected databases 66 | 2. **Preprocessing**: Papers are cleaned and deduplicated 67 | 3. **Semantic Filtering**: AI-powered relevance scoring (if configured) 68 | 4. **Manual Review**: You review abstracts and full papers 69 | 5. **Results**: Final paper list saved to `./papers/` folder 70 | 71 | ## Expected Output 72 | 73 | ``` 74 | 0. Retrieving papers from the databases... 75 | ✅ Retrieved 150 papers from arxiv 76 | ✅ Retrieved 89 papers from semantic_scholar 77 | 78 | 1. Preprocessing papers... 79 | ✅ Preprocessing results can be found at: 1_preprocessed_papers.csv 80 | 81 | 2. Manual filtering by abstract... 
82 | [Interactive review process starts] 83 | ``` 84 | 85 | ## Common First-Time Issues 86 | 87 | ### Issue: "Configuration validation failed" 88 | **Solution**: SaLS will show exactly what's wrong and how to fix it 89 | - **Critical errors** (🔴) must be fixed before continuing 90 | - **Warnings** (🟡) allow the pipeline to continue with defaults 91 | - Follow the provided examples to fix issues quickly 92 | 93 | ### Issue: "No papers found" 94 | **Solution**: Try broader queries or different databases 95 | 96 | ### Issue: "API key required" 97 | **Solution**: Use only `arxiv` and `semantic_scholar` (no API key needed) 98 | 99 | ### Issue: Missing optional fields 100 | **Solution**: SaLS automatically provides sensible defaults: 101 | - Missing databases → defaults to open databases 102 | - Missing search_date → defaults to current date 103 | - Missing folder_name → defaults to filename-based 104 | 105 | ## Next Steps 106 | 107 | 1. **Review Results**: Check the generated CSV files 108 | 2. **Refine Queries**: Adjust based on initial results 109 | 3. **Add Filters**: Use syntactic and semantic filters 110 | 4. **Expand Databases**: Add commercial databases with API keys 111 | 112 | ## Need Help? 113 | 114 | - **Configuration Guide**: `docs/configuration_guide.md` 115 | - **Templates**: `templates/` directory 116 | - **Examples**: `parameters_ar.yaml` (working example) 117 | 118 | ## Quick Configuration Examples 119 | 120 | ### Simple Search 121 | ```yaml 122 | queries: 123 | - ai: "'artificial intelligence'" 124 | databases: 125 | - arxiv 126 | - semantic_scholar 127 | search_date: 2024-12-15 128 | folder_name: ai_search 129 | ``` 130 | 131 | ### Focused Search 132 | ```yaml 133 | queries: 134 | - ml_edge: "'machine learning' & 'edge computing'" 135 | databases: 136 | - arxiv 137 | - semantic_scholar 138 | start_date: 2020-01-01 139 | end_date: 2024-12-31 140 | search_date: 2024-12-15 141 | folder_name: ml_edge_search 142 | ``` 143 | 144 | ### Advanced Search 145 | ```yaml 146 | queries: 147 | - systems: "'systems engineering' & ('AI' | 'machine learning')" 148 | databases: 149 | - arxiv 150 | - semantic_scholar 151 | - springer 152 | syntactic_filters: 153 | - systems 154 | - engineering 155 | semantic_filters: 156 | - ai_systems: "Research on AI and machine learning in systems engineering contexts" 157 | search_date: 2024-12-15 158 | folder_name: ai_systems_search 159 | ``` 160 | 161 | ## Success Checklist 162 | 163 | - [ ] SaLS runs without errors 164 | - [ ] Papers are retrieved from databases 165 | - [ ] Results are saved to CSV files 166 | - [ ] You can review and filter papers 167 | - [ ] Final paper list is generated 168 | 169 | ## Congratulations! 170 | 171 | You've successfully completed your first literature search with SaLS. The system is now ready for your research needs. 172 | 173 | **Tip**: Start with simple searches and gradually add complexity as you become familiar with the system. 
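## Adding Commercial Databases Later

When you are ready to enable commercial databases (e.g., `springer`), SaLS reads API keys from a `./config.json` file in the project root (see the README for the full list of key names). A minimal sketch with only the Springer key, following the README's config.json format:

```json
{
  "api_access_springer": "SPRINGER_API_ACCESS_KEY"
}
```

After adding the key, list the corresponding database under `databases:` in your parameters file.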
174 | -------------------------------------------------------------------------------- /templates/advanced_research_template.yaml: -------------------------------------------------------------------------------- 1 | # ============================================================================= 2 | # SaLS Advanced Research Configuration Template 3 | # ============================================================================= 4 | # This template demonstrates all SaLS features for comprehensive literature reviews 5 | # Use this for systematic literature reviews, PhD research, or complex research projects 6 | # ============================================================================= 7 | 8 | # REQUIRED: Primary research queries 9 | # Use complex boolean expressions with parentheses for precise control 10 | # Supported operators: & (AND), | (OR), and parentheses for grouping 11 | queries: 12 | - systems engineering: "'systems engineering' & ('large language models' | 'LLM' | 'generative AI')" 13 | - edge orchestration: "'edge computing' & ('orchestration' | 'composition' | 'choreography') & 'placement'" 14 | - machine learning systems: "'machine learning systems' & ('production' | 'deployment' | 'operations')" 15 | 16 | # REQUIRED: Comprehensive synonyms for query expansion 17 | # These synonyms will be automatically added to increase search coverage 18 | # Group related terms under meaningful categories 19 | systems engineering: 20 | - systems thinking 21 | - system architecture 22 | - system design 23 | - system integration 24 | - system lifecycle 25 | - requirements engineering 26 | - verification and validation 27 | 28 | large language models: 29 | - llm 30 | - transformer models 31 | - language models 32 | - neural language models 33 | - foundation models 34 | - generative models 35 | 36 | edge computing: 37 | - edge 38 | - fog computing 39 | - mobile edge computing 40 | - multi-access edge computing 41 | - edge intelligence 42 | - edge AI 43 | 44 | orchestration: 45 | - service orchestration 46 | - service composition 47 | - service choreography 48 | - resource orchestration 49 | - workflow orchestration 50 | - microservice orchestration 51 | 52 | placement: 53 | - service placement 54 | - resource placement 55 | - workload placement 56 | - task placement 57 | - service offloading 58 | - resource allocation 59 | 60 | machine learning systems: 61 | - ml systems 62 | - production ml 63 | - mlops 64 | - machine learning operations 65 | - ml infrastructure 66 | - ml deployment 67 | 68 | # REQUIRED: Database selection for comprehensive coverage 69 | # Mix open and commercial databases for best results 70 | # Open databases: arxiv, semantic_scholar (no API key needed) 71 | # Commercial databases: springer, ieeexplore, scopus, core (API key required) 72 | databases: 73 | - arxiv # Open access, excellent for recent research 74 | - semantic_scholar # Open access, good citation analysis 75 | - springer # Commercial, high-quality journals 76 | - ieeexplore # Commercial, excellent for engineering 77 | - scopus # Commercial, comprehensive coverage 78 | - core # Commercial, open access repository 79 | # Additional databases available: crossref, europe_pmc, pubmed, openalex 80 | 81 | # OPTIONAL: Date range for focused research (YYYY-MM-DD format) 82 | # Use date ranges to focus on recent developments or specific time periods 83 | # Comment out both lines to search all available papers 84 | start_date: 2018-01-01 # Focus on recent research (last 6 years) 85 | end_date: 2024-12-31 # Include 
current year 86 | 87 | # REQUIRED: Search metadata 88 | search_date: 2024-12-15 # Date when this search was performed 89 | folder_name: advanced_research # Output folder name (will be created automatically) 90 | 91 | # OPTIONAL: Syntactic filters for precise control 92 | # These terms must appear in the paper content 93 | # Use for filtering out irrelevant papers early in the process 94 | syntactic_filters: 95 | - systems 96 | - engineering 97 | - architecture 98 | - design 99 | - implementation 100 | - evaluation 101 | - analysis 102 | 103 | # OPTIONAL: Semantic filters using AI-powered similarity matching 104 | # These use BERT-based models to find semantically similar papers 105 | # Format: filter_name: "detailed description of what you're looking for" 106 | # Be specific and descriptive for best results 107 | semantic_filters: 108 | - systems engineering: "Research on systems engineering methodologies, frameworks, and approaches applied to complex systems, including requirements engineering, system architecture, design patterns, and lifecycle management" 109 | 110 | - edge orchestration: "Papers about orchestration, composition, and choreography of edge computing resources and services, including placement strategies, resource allocation, and workload distribution in edge environments" 111 | 112 | - production ml: "Research on machine learning systems in production environments, including MLOps, deployment strategies, monitoring, scaling, and operational challenges of ML systems" 113 | 114 | # ============================================================================= 115 | # ADVANCED CONFIGURATION OPTIONS 116 | # ============================================================================= 117 | 118 | # OPTIONAL: Custom fields and types (advanced users) 119 | # fields: ['title', 'abstract', 'keywords', 'full_text'] 120 | # types: ['conferences', 'journals', 'preprints', 'reports'] 121 | 122 | # OPTIONAL: Custom search parameters (advanced users) 123 | # max_papers_per_database: 1000 124 | # search_timeout: 300 125 | # retry_attempts: 3 126 | 127 | # ============================================================================= 128 | # BEST PRACTICES FOR ADVANCED RESEARCH 129 | # ============================================================================= 130 | # 1. QUERIES: Start broad, then narrow down based on initial results 131 | # 2. SYNONYMS: Include variations, abbreviations, and related terminology 132 | # 3. DATABASES: Use a mix of open and commercial for comprehensive coverage 133 | # 4. DATES: Set reasonable ranges to focus on relevant time periods 134 | # 5. FILTERS: Use syntactic filters for precision, semantic filters for recall 135 | # 6. ITERATION: Run multiple searches with refined parameters 136 | # 7. VALIDATION: Check results manually to ensure quality 137 | # 8. 
DOCUMENTATION: Keep track of search parameters and results 138 | # ============================================================================= 139 | 140 | # ============================================================================= 141 | # TROUBLESHOOTING TIPS 142 | # ============================================================================= 143 | # - If you get too many results: Add more specific terms or use date ranges 144 | # - If you get too few results: Broaden queries or add synonyms 145 | # - If semantic filtering fails: Check that descriptions are detailed and specific 146 | # - If API errors occur: Verify API keys in config.json or use open databases 147 | # - If validation fails: Check YAML syntax and required field formats 148 | # ============================================================================= 149 | -------------------------------------------------------------------------------- /templates/machine_learning_template.yaml: -------------------------------------------------------------------------------- 1 | # ============================================================================= 2 | # SaLS Machine Learning Research Configuration Template 3 | # ============================================================================= 4 | # Specialized template for machine learning, AI, and data science research 5 | # Optimized for finding papers on ML algorithms, systems, and applications 6 | # ============================================================================= 7 | 8 | # REQUIRED: ML-focused research queries 9 | # Use specific ML terminology and combine with application domains 10 | queries: 11 | - deep learning: "'deep learning' & ('neural networks' | 'CNN' | 'RNN' | 'transformer')" 12 | - reinforcement learning: "'reinforcement learning' & ('Q-learning' | 'policy gradient' | 'actor-critic')" 13 | - ml systems: "'machine learning systems' & ('production' | 'deployment' | 'scaling' | 'mlops')" 14 | - computer vision: "'computer vision' & ('image recognition' | 'object detection' | 'segmentation')" 15 | - natural language processing: "'natural language processing' | 'NLP' & ('text analysis' | 'language models')" 16 | 17 | # REQUIRED: ML-specific synonyms and terminology 18 | # Include abbreviations, alternative names, and related concepts 19 | deep learning: 20 | - deep neural networks 21 | - deep learning models 22 | - deep architectures 23 | - deep learning algorithms 24 | - deep learning frameworks 25 | - deep learning applications 26 | 27 | neural networks: 28 | - artificial neural networks 29 | - neural network models 30 | - neural architectures 31 | - neural network training 32 | - neural network optimization 33 | 34 | reinforcement learning: 35 | - rl 36 | - reinforcement learning algorithms 37 | - rl methods 38 | - rl frameworks 39 | - rl applications 40 | - rl optimization 41 | 42 | machine learning systems: 43 | - ml systems 44 | - production ml 45 | - ml infrastructure 46 | - ml deployment 47 | - ml operations 48 | - mlops 49 | - ml engineering 50 | 51 | computer vision: 52 | - cv 53 | - image processing 54 | - visual recognition 55 | - image understanding 56 | - visual analysis 57 | - computer vision systems 58 | 59 | natural language processing: 60 | - nlp 61 | - text processing 62 | - language understanding 63 | - text analysis 64 | - language models 65 | - nlp systems 66 | 67 | # REQUIRED: Databases optimized for ML research 68 | # ML research is well-covered across all major databases 69 | databases: 70 | - arxiv # Excellent for recent ML 
preprints 71 | - semantic_scholar # Good for ML citations and impact 72 | - springer # High-quality ML journals 73 | - ieeexplore # Excellent for ML conferences and journals 74 | - scopus # Comprehensive ML coverage 75 | - core # Open access ML papers 76 | 77 | # OPTIONAL: Date range for ML research (YYYY-MM-DD format) 78 | # ML field evolves rapidly - consider focusing on recent years 79 | # Comment out both lines to search all available papers 80 | start_date: 2019-01-01 # Focus on recent ML developments 81 | end_date: 2024-12-31 # Include current year 82 | 83 | # REQUIRED: Search metadata 84 | search_date: 2024-12-15 # Date when this search was performed 85 | folder_name: ml_research # Output folder name (will be created automatically) 86 | 87 | # OPTIONAL: ML-specific syntactic filters 88 | # These terms must appear in the paper content 89 | syntactic_filters: 90 | - machine learning 91 | - artificial intelligence 92 | - deep learning 93 | - neural networks 94 | - algorithm 95 | - model 96 | - dataset 97 | - evaluation 98 | - performance 99 | - accuracy 100 | 101 | # OPTIONAL: ML-specific semantic filters using AI-powered similarity matching 102 | # Be specific about ML subfields and applications 103 | semantic_filters: 104 | - deep learning systems: "Research on deep learning systems, architectures, and frameworks including neural network design, training methodologies, optimization techniques, and deployment strategies for deep learning models" 105 | 106 | - reinforcement learning applications: "Papers about reinforcement learning applications in robotics, game playing, autonomous systems, recommendation systems, and other real-world domains with focus on practical implementation and performance evaluation" 107 | 108 | - production ml systems: "Research on machine learning systems in production environments including MLOps, model deployment, monitoring, scaling, A/B testing, and operational challenges of ML systems in industry settings" 109 | 110 | - computer vision applications: "Papers about computer vision applications in autonomous vehicles, medical imaging, surveillance, augmented reality, robotics, and other domains with focus on real-world deployment and performance" 111 | 112 | # ============================================================================= 113 | # ML-SPECIFIC BEST PRACTICES 114 | # ============================================================================= 115 | # 1. QUERIES: Use specific ML terminology (e.g., 'CNN' not just 'neural networks') 116 | # 2. SYNONYMS: Include abbreviations and alternative names commonly used in ML 117 | # 3. DATES: ML field evolves rapidly - focus on recent years for cutting-edge research 118 | # 4. FILTERS: Use ML-specific terms to filter out non-ML papers 119 | # 5. ITERATION: ML research has many subfields - refine queries based on results 120 | # 6. VALIDATION: Check that results are actually ML research, not just mentions 121 | # 7. COVERAGE: ML papers appear in many venues - use multiple databases 122 | # 8. 
TERMINOLOGY: Stay current with ML terminology and naming conventions 123 | # ============================================================================= 124 | 125 | # ============================================================================= 126 | # ML RESEARCH SUBFIELDS TO CONSIDER 127 | # ============================================================================= 128 | # - Supervised Learning: classification, regression, structured prediction 129 | # - Unsupervised Learning: clustering, dimensionality reduction, generative models 130 | # - Reinforcement Learning: Q-learning, policy methods, multi-agent systems 131 | # - Deep Learning: CNNs, RNNs, transformers, attention mechanisms 132 | # - Computer Vision: image recognition, object detection, segmentation 133 | # - Natural Language Processing: text analysis, language models, translation 134 | # - ML Systems: MLOps, production deployment, scaling, monitoring 135 | # - ML Applications: healthcare, finance, autonomous systems, recommendation 136 | # ============================================================================= 137 | 138 | # ============================================================================= 139 | # COMMON ML CONFERENCES AND JOURNALS 140 | # ============================================================================= 141 | # Conferences: NeurIPS, ICML, ICLR, CVPR, ICCV, ACL, EMNLP, KDD, AAAI, IJCAI 142 | # Journals: JMLR, TPAMI, TMLR, AIJ, MLJ, JAIR, TACL, Computational Linguistics 143 | # ============================================================================= 144 | 145 | # ============================================================================= 146 | # TROUBLESHOOTING FOR ML RESEARCH 147 | # ============================================================================= 148 | # - Too many results: Add specific ML subfield terms or application domains 149 | # - Too few results: Broaden ML terminology or remove overly specific filters 150 | # - Irrelevant results: Use more specific ML terms and syntactic filters 151 | # - Missing recent papers: Check date ranges and ensure recent databases are included 152 | # - API errors: Use open databases (arxiv, semantic_scholar) if commercial ones fail 153 | # ============================================================================= 154 | -------------------------------------------------------------------------------- /clients/base_client.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | import pandas as pd 3 | import logging 4 | import time 5 | from tqdm import tqdm 6 | from os.path import exists 7 | from util import util 8 | from util.error_standards import ErrorHandler, create_error_context, ErrorSeverity, ErrorCategory, get_standard_error_info 9 | from util.logging_standards import LogCategory, get_current_sals_logger, get_compat_logger 10 | 11 | 12 | class DatabaseClient(ABC): 13 | """ 14 | Abstract base class for database clients using the Template Method pattern. 15 | 16 | This class defines the workflow for retrieving papers from any database: 17 | 1. Check if file exists 18 | 2. Plan requests 19 | 3. Execute requests 20 | 4. Filter papers 21 | 5. Clean papers 22 | 6. 
Save results 23 | """ 24 | 25 | def __init__(self, database_name: str, max_papers: int = 1000, waiting_time: int = 2, max_retries: int = 3, 26 | client_fields: dict = None, offset_limit: int = None, quota: int = None): 27 | self.database_name = database_name 28 | self.max_papers = max_papers 29 | self.waiting_time = waiting_time 30 | self.max_retries = max_retries 31 | self.client_fields = client_fields or {} 32 | self.offset_limit = offset_limit 33 | self.quota = quota 34 | # Use the standardized SaLS logger if available; fallback to std logger 35 | sals = get_current_sals_logger() 36 | self.logger = get_compat_logger() 37 | self.file_handler = '' 38 | 39 | def get_papers(self, query, syntactic_filters, synonyms, fields, types, dates, start_date, end_date, folder_name, search_date): 40 | """ 41 | Template method that defines the paper retrieval workflow. 42 | """ 43 | # Set up file handler for logging 44 | # Resolve attached file handler, if any 45 | try: 46 | for h in getattr(self.logger, 'handlers', []): 47 | if hasattr(h, 'baseFilename'): 48 | self.file_handler = h.baseFilename 49 | break 50 | except Exception: 51 | self.file_handler = '' 52 | 53 | query_name = list(query.keys())[0] 54 | query_value = query[query_name] 55 | 56 | # Generate file name for this query and database 57 | file_name = self._generate_file_name(folder_name, search_date, query_name) 58 | 59 | # Check if file already exists 60 | if exists(file_name): 61 | self.logger.info(LogCategory.FILE, "base_client", "get_papers", "File already exists.") 62 | return 63 | 64 | # Check if API access is available 65 | if not self._has_api_access(): 66 | self.logger.info(LogCategory.DATABASE, "base_client", "get_papers", "API key access not provided. Skipping this client...") 67 | return 68 | 69 | # Execute the paper retrieval workflow 70 | try: 71 | # Step 1: Plan requests 72 | self.logger.info(LogCategory.DATABASE, "base_client", "get_papers", "Retrieving papers. 
It might take a while...") 73 | papers = self._plan_requests(query, syntactic_filters, synonyms, fields, types, dates, start_date, end_date) 74 | 75 | if len(papers) > 0: 76 | # Step 2: Filter papers 77 | papers = self._filter_papers(papers, dates, start_date, end_date) 78 | 79 | if len(papers) > 0: 80 | # Step 3: Clean papers 81 | papers = self._clean_papers(papers) 82 | 83 | if self.database_name == 'scopus': 84 | # If the database is Scopus, get abstracts 85 | papers = self._get_abstracts(papers) 86 | 87 | if len(papers) > 0: 88 | # Step 4: Save papers 89 | util.save(file_name, papers, 'utf-8', 'a') 90 | 91 | self.logger.info(LogCategory.DATABASE, "base_client", "get_papers", f"Retrieved papers after filters and cleaning: {len(papers)}") 92 | return file_name 93 | 94 | except (ValueError, TypeError) as e: 95 | # User-friendly message explaining what's happening 96 | context = create_error_context( 97 | "base_client", "get_papers", 98 | ErrorSeverity.WARNING, 99 | ErrorCategory.DATA, 100 | f"Data type error in paper retrieval workflow: {type(e).__name__}: {str(e)}" 101 | ) 102 | error_info = get_standard_error_info("data_validation_failed") 103 | ErrorHandler.handle_error(e, context, error_info, self.logger) 104 | except Exception as ex: 105 | # User-friendly message explaining what's happening 106 | context = create_error_context( 107 | "base_client", "get_papers", 108 | ErrorSeverity.ERROR, 109 | ErrorCategory.SYSTEM, 110 | f"Unexpected error in paper retrieval workflow: {type(ex).__name__}: {str(ex)}" 111 | ) 112 | error_info = get_standard_error_info("unexpected_error") 113 | ErrorHandler.handle_error(ex, context, error_info, self.logger) 114 | 115 | def _generate_file_name(self, folder_name, search_date, query_name): 116 | """Generate the file name for saving papers.""" 117 | return f'./papers/{folder_name}/{str(search_date).replace("-", "_")}/raw_papers/{query_name.lower().replace(" ", "_")}_{self.database_name}.csv' 118 | 119 | @abstractmethod 120 | def _has_api_access(self) -> bool: 121 | """Check if API access is available for this database.""" 122 | pass 123 | 124 | @abstractmethod 125 | def _plan_requests(self, query, syntactic_filters, synonyms, fields, types, dates, start_date, end_date) -> pd.DataFrame: 126 | """Plan the API requests based on the query and parameters.""" 127 | pass 128 | 129 | @abstractmethod 130 | def _filter_papers(self, papers: pd.DataFrame, dates, start_date, end_date) -> pd.DataFrame: 131 | """Filter papers based on criteria like dates, duplicates, etc.""" 132 | pass 133 | 134 | @abstractmethod 135 | def _clean_papers(self, papers: pd.DataFrame) -> pd.DataFrame: 136 | """Clean and standardize paper data.""" 137 | pass 138 | 139 | @abstractmethod 140 | def _get_abstracts(self, papers: pd.DataFrame) -> pd.DataFrame: 141 | """Get abstracts for papers.""" 142 | pass 143 | 144 | def _retry_request(self, request_func, *args, **kwargs): 145 | """Common retry mechanism for API requests.""" 146 | retry = 0 147 | while retry < self.max_retries: 148 | try: 149 | result = request_func(*args, **kwargs) 150 | if self._is_successful_response(result): 151 | return result 152 | except (ValueError, TypeError) as e: 153 | self.logger.debug(LogCategory.DATABASE, "base_client", "_retry_request", f"Request failed due to data type error (attempt {retry + 1}): {type(e).__name__}: {str(e)}") 154 | except Exception as ex: 155 | self.logger.debug(LogCategory.DATABASE, "base_client", "_retry_request", f"Request failed due to unexpected error (attempt {retry + 1}): 
{type(ex).__name__}: {str(ex)}") 156 | 157 | retry += 1 158 | if retry < self.max_retries: 159 | delay = util.exponential_backoff(retry, self.waiting_time, 64) 160 | time.sleep(delay) 161 | if result is not None and result.status_code == 404: 162 | return result 163 | if result is not None and result.status_code == 429: 164 | result = self._retry_request(request_func, *args, **kwargs) 165 | return result 166 | if result is None: 167 | result = { 168 | "status": "error", 169 | "status_code": 999, 170 | "message": "There was an error processing your request. Please try again later or contact support if the issue persists.", 171 | "attempts": retry, 172 | "max_retries": self.max_retries, 173 | "database": self.database_name 174 | } 175 | return result 176 | return result 177 | 178 | def _is_successful_response(self, response) -> bool: 179 | """Check if the API response is successful.""" 180 | if hasattr(response, 'status_code'): 181 | return response.status_code == 200 182 | return True # Default to True for responses without status codes 183 | 184 | def _log_api_error(self, response, request_info=""): 185 | """Log API errors consistently across all clients.""" 186 | self.logger.info(LogCategory.DATABASE, "base_client", "_log_api_error", f"Error requesting the API. Skipping to next request. Please see the log file for details: {self.file_handler}") 187 | if hasattr(response, 'text'): 188 | self.logger.debug(LogCategory.DATABASE, "base_client", "_log_api_error", f"API response: {response.text}") 189 | if hasattr(response, 'request') and response.request is not None: 190 | self.logger.debug(LogCategory.DATABASE, "base_client", "_log_api_error", f"Request: {request_info}") 191 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SaLS: Semi-automatic Literature Survey 2 | 3 | This project implements SaLS: a semi-automatic tool to survey research papers based on the systematic methodology proposed by Kitchenham et al.[1, 2]. The goal of this project is to semi-automate the research paper survey process while providing a framework to enable survey reproducibility and evolution. Two SaLS use cases are: 4 | 5 | - Cabrera, Christian, et al. *The Systems Engineering Approach in Times of Large Language Models.* Proceedings of the 58th Hawaii International Conference on System Sciences (2025) (To Appear). [Paper.](https://arxiv.org/abs/2411.09050v1) [Code.](https://github.com/cabrerac/semi-automatic-literature-survey/tree/sys-llms-survey) 6 | - Cabrera, Christian, et al. *Machine Learning Systems: A Survey from a Data-Oriented Perspective.* ACM Computing Surveys (2025) [Paper.](https://dl.acm.org/doi/10.1145/3769292) [Code.](https://github.com/cabrerac/semi-automatic-literature-survey/tree/doa-survey) 7 | 8 | SaLS automatically retrieves paper metadata based on queries that users provide. These queries are used to consume the search APIs exposed by the most popular research paper repositories in different domains.
Currently, SaLS retrieves paper information from the following repositories: 9 | 10 | - [IEEE Xplore](https://ieeexplore.ieee.org/Xplore/home.jsp) 11 | - [Springer Nature](https://www.springernature.com/gp) 12 | - [Scopus](https://www.elsevier.com/en-gb/solutions/scopus) 13 | - [Semantic Scholar](https://www.semanticscholar.org) 14 | - [CORE](https://core.ac.uk) 15 | - [arXiv](https://arxiv.org) 16 | 17 | The retrieved metadata includes the paper identifier (e.g., DOI), publisher, publication date, title, URL, and abstract. 18 | 19 | SaLS merges paper information from different repositories, and then applies customised syntactic and semantic filters (i.e., semantic search)[3] to reduce the search space of papers according to users' interests. 20 | 21 | Once the automatic filters are applied, the tool presents the title and abstract of each paper in a centralised interface where users decide whether the paper should be included in the review (i.e., papers filtered by abstract). The URLs of the papers that pass the abstract filter are then presented in the last filter, which requires the user to skim the full paper and decide whether it is included. 22 | 23 | Then, the tool applies the snowballing step by retrieving the metadata of the works that cited the selected papers in the last step (i.e., papers filtered by skimming the full text), and applies the automatic and semi-automatic filters to the citing papers. 24 | 25 | The final list of papers is composed of the cited papers that passed the first round of filters, and the citing papers that passed the second round of filters (i.e., snowballing). 26 | 27 | # Requirements 28 | 29 | Some of the APIs provided by the repositories require an access key to be consumed. You should request a key from each repository you want to include in your search. Each repository has its own steps to apply for a key, as follows: 30 | 31 | - [IEEE Xplore](https://developer.ieee.org/getting_started) 32 | - [Springer Nature](https://dev.springernature.com/docs) 33 | - [Scopus](https://dev.elsevier.com/) 34 | - [CORE](https://core.ac.uk/services/api) 35 | - [Semantic Scholar](https://www.semanticscholar.org/product/api/tutorial) 36 | 37 | Alternatively, you can use the tool to request papers from arXiv or Semantic Scholar, which are open and do not need an access key. SaLS does not have control over the maintenance of the APIs. If an API produces an error, you can see the details in the log files. We recommend that you stop using an API that produces errors for a while. 38 | 39 | # How to run it? 40 | 41 | The following instructions were tested on: 42 | - A Windows machine (i.e., Windows PowerShell) with Python 3.10.11. 43 | - Windows Subsystem for Linux ([WSL](https://docs.microsoft.com/en-us/windows/wsl/install)) with Python 3.8. 44 | - An Ubuntu machine with Python 3.8. 45 | 46 | 1. Clone this repository 47 | 48 | ``` 49 | git clone https://github.com/cabrerac/semi-automatic-literature-survey.git 50 | ``` 51 | ``` 52 | cd semi-automatic-literature-survey/ 53 | ``` 54 | 55 | 2. Create and activate a virtual environment 56 | 57 | For Linux distributions 58 | ``` 59 | python -m venv venv 60 | ``` 61 | ``` 62 | source venv/bin/activate 63 | ``` 64 | 65 | For Windows 66 | ``` 67 | python -m venv ./venv 68 | ``` 69 | ``` 70 | ./venv/Scripts/activate 71 | ``` 72 | 73 | 3. Install requirements 74 | 75 | ``` 76 | pip install -r requirements.txt 77 | ``` 78 | 79 | 4. 
Install language package for spacy 80 | 81 | ``` 82 | python -m spacy download en_core_web_sm 83 | ``` 84 | 85 | 5. Create a file `./config.json` that will store the API access keys for the repositories you want to use. The file should have the following format: 86 | 87 | ``` 88 | { 89 | "api_access_core": "CORE_API_ACCESS_KEY", 90 | "api_access_ieee": "IEEE_API_ACCESS_KEY", 91 | "api_access_springer": "SPRINGER_API_ACCESS_KEY", 92 | "api_access_elsevier": "ELSEVIER_API_ACCESS_KEY" 93 | } 94 | ``` 95 | Ignore this step if you are using the tool with arXiv and Semantic Scholar. Also, you should only add the access keys of the repositories you want to use. 96 | 97 | 6. Run the main passing the search parameters file. For example: 98 | 99 | ``` 100 | python main.py parameters_ar.yaml 101 | ``` 102 | 103 | A simple self-explanatory example of a search parameters file can be found in `./parameters_ar.yaml`. Alternatively, a parameters file including syntactic and semantic filters can be found in `./parameters_sys.yaml` 104 | 105 | ## Configuration and Documentation 106 | 107 | ### Quick Start 108 | - **Quick Start Guide**: `docs/quick_start_guide.md` - Get running in 5 minutes 109 | - **Configuration Guide**: `docs/configuration_guide.md` - Comprehensive configuration reference 110 | - **Configuration Templates**: `templates/` directory - Ready-to-use configuration examples 111 | 112 | ### Configuration Templates 113 | - **Basic Template**: `templates/basic_search_template.yaml` - Simple searches 114 | - **Advanced Template**: `templates/advanced_research_template.yaml` - Complex research projects 115 | - **Machine Learning Template**: `templates/machine_learning_template.yaml` - ML/AI research 116 | 117 | ### Getting Help 118 | - Start with the **Quick Start Guide** for your first search 119 | - Use **Configuration Templates** as starting points 120 | - Refer to the **Configuration Guide** for advanced features 121 | - **Error Recovery**: SaLS automatically detects configuration issues and provides recovery suggestions 122 | - Check error messages for specific guidance and automatic fallbacks 123 | 124 | A description of the semi-automatic methodology applied in a survey can be found in the paper ["Machine Learning Systems: A Survey from a Data-Oriented Perspective"](https://dl.acm.org/doi/10.1145/3769292) [4]. Another paper using this tool is [The Systems Engineering Approach in Times of Large Language Models](https://scholarspace.manoa.hawaii.edu/items/ccd98c8b-bb61-4a86-9cd4-4719078d028f)[5]. 125 | 126 | # Query syntax 127 | 128 | SaLS uses a simple, user‑friendly boolean expression to describe search queries. The syntax you write is normalized and validated, then translated per‑API to each provider’s expected format. 129 | 130 | Supported features 131 | - Operators: AND and OR 132 | - Accepted forms: `AND`, `and`, `&&`, `&`, `OR`, `or`, `||`, `|`, and the legacy `¦` for OR 133 | - Only AND/OR are supported (no NOT) 134 | - Parentheses: use `(` and `)` to group expressions and control precedence 135 | - Phrases: wrap multi‑word terms in single or double quotes, e.g., `'systems engineering'` or "large language model" 136 | - Identifiers without quotes: letters, digits, and common symbols are allowed (e.g., `- _ . 
152 | # References
153 | 
154 | [1] Barbara Kitchenham and Pearl Brereton. 2013. A systematic review of systematic review process research in software engineering. Information and Software Technology 55, 12 (2013), 2049–2075. https://doi.org/10.1016/j.infsof.2013.07.010
155 | 
156 | [2] Barbara Kitchenham and Stuart Charters. 2007. Guidelines for performing Systematic Literature Reviews in Software Engineering. Technical Report EBSE 2007-001. Keele University and Durham University Joint Report. https://www.elsevier.com/__data/promis_misc/525444systematicreviewsguide.pdf
157 | 
158 | [3] SBERT.net Sentence Transformers. 2024. Semantic Search. [Available online](https://www.sbert.net/examples/applications/semantic-search/README.html)
159 | 
160 | [4] Christian Cabrera, Andrei Paleyes, Pierre Thodoroff, and Neil D. Lawrence. 2025. Machine Learning Systems: A Survey from a Data-Oriented Perspective. ACM Computing Surveys. [Available online](https://dl.acm.org/doi/10.1145/3769292)
161 | 
162 | [5] Christian Cabrera, Viviana Bastidas, Jennifer Schooling, and Neil D. Lawrence. 2025. The Systems Engineering Approach in Times of Large Language Models. Proceedings of the 58th Hawaii International Conference on System Sciences. [Available online](https://scholarspace.manoa.hawaii.edu/items/ccd98c8b-bb61-4a86-9cd4-4719078d028f)
163 | 
164 | 
165 | 
--------------------------------------------------------------------------------
/tests/test_util.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """
3 | Unit tests for SaLS utility functions.
4 | 
5 | This module tests the core utility functions in util/util.py, including
6 | configuration validation, file operations, and data processing functions.
7 | """
8 | 
9 | import pytest
10 | import pandas as pd
11 | import tempfile
12 | import os
13 | import sys
14 | from datetime import datetime
15 | from unittest.mock import patch, MagicMock
16 | 
17 | # Add the project root to the path for imports
18 | sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
19 | 
20 | from util import util
21 | 
22 | 
23 | class TestConfigurationValidation:
24 |     """Test configuration validation functions."""
25 | 
26 |     @pytest.mark.unit
27 |     @pytest.mark.config
28 |     def test_validate_configuration_valid_config(self):
29 |         """Test validation with a completely valid configuration."""
30 |         config = {
31 |             'queries': [{'test': 'test'}],
32 |             'databases': ['arxiv', 'semantic_scholar'],
33 |             'search_date': '2024-12-15',
34 |             'folder_name': 'test_search'
35 |         }
36 | 
37 |         is_valid, message, suggestions = util._validate_configuration(config, 'test.yaml')
38 | 
39 |         assert is_valid is True
40 |         assert "Configuration validation passed successfully!"
in message 41 | assert len(suggestions) == 0 42 | 43 | @pytest.mark.unit 44 | @pytest.mark.config 45 | def test_validate_configuration_missing_queries(self): 46 | """Test validation with missing queries (critical error).""" 47 | config = { 48 | 'databases': ['arxiv'], 49 | 'search_date': '2024-12-15' 50 | } 51 | 52 | is_valid, message, suggestions = util._validate_configuration(config, 'test.yaml') 53 | 54 | assert is_valid is False 55 | assert "CRITICAL ERRORS" in message 56 | assert "Missing queries section" in message 57 | assert len(suggestions) > 0 58 | assert any(s['severity'] == 'critical' for s in suggestions) 59 | 60 | @pytest.mark.unit 61 | @pytest.mark.config 62 | def test_validate_configuration_missing_optional_fields(self): 63 | """Test validation with missing optional fields (warnings only).""" 64 | config = { 65 | 'queries': [{'test': 'test'}] 66 | # Missing databases, search_date, folder_name 67 | } 68 | 69 | is_valid, message, suggestions = util._validate_configuration(config, 'test.yaml') 70 | 71 | assert is_valid is True # Can continue with warnings 72 | assert "WARNINGS" in message 73 | assert len(suggestions) > 0 74 | assert all(s['severity'] == 'warning' for s in suggestions) 75 | 76 | @pytest.mark.unit 77 | @pytest.mark.config 78 | def test_validate_configuration_invalid_database(self): 79 | """Test validation with invalid database names.""" 80 | config = { 81 | 'queries': [{'test': 'test'}], 82 | 'databases': ['invalid_db', 'arxiv'] 83 | } 84 | 85 | is_valid, message, suggestions = util._validate_configuration(config, 'test.yaml') 86 | 87 | assert is_valid is True # Can continue with warnings 88 | assert "invalid database" in message.lower() 89 | assert len(suggestions) > 0 90 | 91 | @pytest.mark.unit 92 | @pytest.mark.config 93 | def test_validate_configuration_invalid_date_format(self): 94 | """Test validation with invalid date formats.""" 95 | config = { 96 | 'queries': [{'test': 'test'}], 97 | 'start_date': 'invalid-date' 98 | } 99 | 100 | is_valid, message, suggestions = util._validate_configuration(config, 'test.yaml') 101 | 102 | assert is_valid is True # Can continue with warnings 103 | assert "Invalid 'start_date' format" in message 104 | assert len(suggestions) > 0 105 | 106 | 107 | class TestConfigurationFallbacks: 108 | """Test configuration fallback application.""" 109 | 110 | @pytest.mark.unit 111 | @pytest.mark.config 112 | def test_apply_configuration_fallbacks_missing_databases(self): 113 | """Test applying fallbacks for missing databases.""" 114 | config = {'queries': [{'test': 'test'}]} 115 | suggestions = [{ 116 | 'issue': 'Missing databases section', 117 | 'severity': 'warning', 118 | 'default': ['arxiv', 'semantic_scholar'] 119 | }] 120 | 121 | updated_config = util._apply_configuration_fallbacks(config, 'test.yaml', suggestions) 122 | 123 | assert 'databases' in updated_config 124 | assert updated_config['databases'] == ['arxiv', 'semantic_scholar'] 125 | 126 | @pytest.mark.unit 127 | @pytest.mark.config 128 | def test_apply_configuration_fallbacks_missing_search_date(self): 129 | """Test applying fallbacks for missing search_date.""" 130 | config = {'queries': [{'test': 'test'}]} 131 | suggestions = [{ 132 | 'issue': 'Missing search_date', 133 | 'severity': 'warning', 134 | 'default': 'current date' 135 | }] 136 | 137 | with patch('util.util.datetime') as mock_datetime: 138 | mock_datetime.today.return_value = datetime(2024, 12, 15) 139 | mock_datetime.strftime.return_value = '2024-12-15' 140 | 141 | updated_config = 
util._apply_configuration_fallbacks(config, 'test.yaml', suggestions) 142 | 143 | assert 'search_date' in updated_config 144 | assert updated_config['search_date'] == '2024-12-15' 145 | 146 | @pytest.mark.unit 147 | @pytest.mark.config 148 | def test_apply_configuration_fallbacks_missing_folder_name(self): 149 | """Test applying fallbacks for missing folder_name.""" 150 | config = {'queries': [{'test': 'test'}]} 151 | suggestions = [{ 152 | 'issue': 'Missing folder_name', 153 | 'severity': 'warning', 154 | 'default': 'filename-based' 155 | }] 156 | 157 | updated_config = util._apply_configuration_fallbacks(config, 'test.yaml', suggestions) 158 | 159 | assert 'folder_name' in updated_config 160 | assert updated_config['folder_name'] == 'test' 161 | 162 | 163 | class TestQueryProcessing: 164 | """Test query processing and normalization functions.""" 165 | 166 | @pytest.mark.unit 167 | def test_normalize_query_expression_basic(self): 168 | """Test basic query expression normalization.""" 169 | expression = "'machine learning' & 'edge computing'" 170 | normalized = util.normalize_query_expression(expression) 171 | 172 | assert '' in normalized 173 | assert "'machine learning'" in normalized 174 | assert "'edge computing'" in normalized 175 | 176 | @pytest.mark.unit 177 | def test_normalize_query_expression_text_operators(self): 178 | """Test normalization of text-based operators.""" 179 | expression = "'ml' AND 'edge' OR 'fog'" 180 | normalized = util.normalize_query_expression(expression) 181 | 182 | assert '' in normalized 183 | assert '' in normalized 184 | assert "'ml'" in normalized 185 | assert "'edge'" in normalized 186 | assert "'fog'" in normalized 187 | 188 | @pytest.mark.unit 189 | def test_normalize_query_expression_symbolic_operators(self): 190 | """Test normalization of symbolic operators.""" 191 | expression = "'ml' && 'edge' || 'fog'" 192 | normalized = util.normalize_query_expression(expression) 193 | 194 | assert '' in normalized 195 | assert '' in normalized 196 | assert "'ml'" in normalized 197 | assert "'edge'" in normalized 198 | assert "'fog'" in normalized 199 | 200 | @pytest.mark.unit 201 | def test_normalize_query_expression_preserves_quotes(self): 202 | """Test that operators inside quotes are preserved.""" 203 | expression = "'a & b' AND 'c | d'" 204 | normalized = util.normalize_query_expression(expression) 205 | 206 | assert "'a & b'" in normalized # Preserved 207 | assert "'c | d'" in normalized # Preserved 208 | assert '' in normalized # Normalized 209 | 210 | @pytest.mark.unit 211 | def test_normalize_query_expression_encoding_artifacts(self): 212 | """Test removal of encoding artifacts.""" 213 | expression = "Â'machine learning' & 'edge computing'" 214 | normalized = util.normalize_query_expression(expression) 215 | 216 | assert 'Â' not in normalized 217 | assert "'machine learning'" in normalized 218 | assert '' in normalized 219 | 220 | 221 | class TestExponentialBackoff: 222 | """Test exponential backoff function.""" 223 | 224 | @pytest.mark.unit 225 | def test_exponential_backoff_basic(self): 226 | """Test basic exponential backoff calculation.""" 227 | delays = [] 228 | for attempt in range(5): 229 | delay = util.exponential_backoff(attempt) 230 | delays.append(delay) 231 | 232 | # Should be increasing (with jitter) 233 | assert len(delays) == 5 234 | assert all(d > 0 for d in delays) 235 | 236 | @pytest.mark.unit 237 | def test_exponential_backoff_max_delay(self): 238 | """Test that max delay is respected.""" 239 | delay = util.exponential_backoff(10, 
base_delay=1, max_delay=5) 240 | assert delay <= 5 241 | 242 | @pytest.mark.unit 243 | def test_exponential_backoff_custom_base(self): 244 | """Test custom base delay.""" 245 | delay = util.exponential_backoff(2, base_delay=0.5) 246 | assert delay > 0.5 # Should be greater than base due to exponential growth 247 | 248 | 249 | class TestDataProcessing: 250 | """Test data processing functions.""" 251 | 252 | @pytest.mark.unit 253 | def test_remove_repeated_function_exists(self): 254 | """Test that the main remove_repeated function exists.""" 255 | assert hasattr(util, 'remove_repeated') 256 | assert callable(util.remove_repeated) 257 | 258 | 259 | class TestFileOperations: 260 | """Test file operation functions.""" 261 | 262 | @pytest.mark.unit 263 | def test_save_function_creates_directory(self): 264 | """Test that save function creates directories if they don't exist.""" 265 | with tempfile.TemporaryDirectory() as temp_dir: 266 | file_path = os.path.join(temp_dir, 'subdir', 'test.csv') 267 | df = pd.DataFrame({'col1': [1, 2, 3], 'col2': ['a', 'b', 'c']}) 268 | 269 | util.save(file_path, df, 'utf-8', 'w') 270 | 271 | assert os.path.exists(file_path) 272 | assert os.path.exists(os.path.dirname(file_path)) 273 | 274 | @pytest.mark.unit 275 | def test_save_function_overwrites_existing(self): 276 | """Test that save function can overwrite existing files.""" 277 | with tempfile.TemporaryDirectory() as temp_dir: 278 | file_path = os.path.join(temp_dir, 'test.csv') 279 | df1 = pd.DataFrame({'col1': [1, 2, 3]}) 280 | df2 = pd.DataFrame({'col1': [4, 5, 6]}) 281 | 282 | # Save first DataFrame 283 | util.save(file_path, df1, 'utf-8', 'w') 284 | assert os.path.exists(file_path) 285 | 286 | # Overwrite with second DataFrame 287 | util.save(file_path, df2, 'utf-8', 'w') 288 | 289 | # Verify content was overwritten 290 | loaded_df = pd.read_csv(file_path) 291 | assert len(loaded_df) == 3 292 | assert loaded_df['col1'].iloc[0] == 4 293 | 294 | 295 | class TestErrorHandling: 296 | """Test error handling and recovery.""" 297 | 298 | @pytest.mark.unit 299 | @pytest.mark.error_handling 300 | def test_validate_configuration_exception_handling(self): 301 | """Test that validation handles unexpected exceptions gracefully.""" 302 | with patch('util.util.datetime') as mock_datetime: 303 | mock_datetime.strptime.side_effect = Exception("Unexpected error") 304 | 305 | config = { 306 | 'queries': [{'test': 'test'}], 307 | 'start_date': '2024-01-01' 308 | } 309 | 310 | is_valid, message, suggestions = util._validate_configuration(config, 'test.yaml') 311 | 312 | assert is_valid is True # Should continue with warnings 313 | assert "warning" in message.lower() 314 | 315 | @pytest.mark.unit 316 | @pytest.mark.error_handling 317 | def test_fallback_application_exception_handling(self): 318 | """Test that fallback application handles exceptions gracefully.""" 319 | config = {'queries': [{'test': 'test'}]} 320 | suggestions = [{ 321 | 'issue': 'Test issue', 322 | 'severity': 'warning', 323 | 'default': 'test_default' 324 | }] 325 | 326 | # Mock a function that raises an exception 327 | with patch('util.util.logger.info', side_effect=Exception("Test exception")): 328 | result = util._apply_configuration_fallbacks(config, 'test.yaml', suggestions) 329 | 330 | # Should return original config even if fallback fails 331 | assert result == config 332 | 333 | 334 | if __name__ == "__main__": 335 | # Run tests if executed directly 336 | pytest.main([__file__, "-v"]) 337 | 
-------------------------------------------------------------------------------- /clients/apis/generic.py: -------------------------------------------------------------------------------- 1 | import urllib.request 2 | import urllib.parse 3 | import urllib.error 4 | import urllib 5 | import time 6 | import json 7 | import requests 8 | from requests.models import Response 9 | import re 10 | import logging 11 | 12 | 13 | file_handler = '' 14 | logger = logging.getLogger('sals_pipeline') 15 | 16 | 17 | class Generic: 18 | def request(self, query, method, data, headers): 19 | global file_handler 20 | # Safely resolve the log file path from any file-based handler 21 | file_handler = '' 22 | try: 23 | for handler in logger.handlers: 24 | if hasattr(handler, 'baseFilename'): 25 | file_handler = handler.baseFilename 26 | break 27 | except Exception: 28 | file_handler = '' 29 | request_result = None 30 | time.sleep(1) 31 | headers['Content-type'] = 'application/json' 32 | headers['Accept'] = 'application/json' 33 | if method == 'post': 34 | try: 35 | data = json.dumps(data) 36 | request_result = requests.post(query, data=data, headers=headers) 37 | except urllib.error.HTTPError as ex: 38 | logger.info("Error parsing the API response in generic client. Please see the log file for " 39 | "details: " + file_handler) 40 | logger.debug("Exception: " + str(type(ex)) + ' - ' + str(ex)) 41 | logger.debug("Request: " + str(data)) 42 | except UnicodeEncodeError as ex: 43 | logger.info("Error parsing the API response in generic client. Please see the log file for " 44 | "details: " + file_handler) 45 | logger.debug("Exception: " + str(type(ex)) + ' - ' + str(ex)) 46 | logger.debug("Request: " + str(data)) 47 | except urllib.error.URLError as ex: 48 | logger.info("Error parsing the API response in generic client. Please see the log file for " 49 | "details: " + file_handler) 50 | logger.debug("Exception: " + str(type(ex)) + ' - ' + str(ex)) 51 | logger.debug("Request: " + str(data)) 52 | except Exception as ex: 53 | logger.info("Error parsing the API response in generic client. Please see the log file for " 54 | "details: " + file_handler) 55 | logger.debug("Exception: " + str(type(ex)) + ' - ' + str(ex)) 56 | logger.debug("Request: " + str(data)) 57 | if method == 'get': 58 | try: 59 | request_result = requests.get(query, headers=headers) 60 | except urllib.error.HTTPError as ex: 61 | logger.info("Error parsing the API response in generic client. Please see the log file for " 62 | "details: " + file_handler) 63 | logger.debug("Exception: " + str(type(ex)) + ' - ' + str(ex)) 64 | logger.debug("Request: " + query) 65 | except UnicodeEncodeError as ex: 66 | logger.info("Error parsing the API response in generic client. Please see the log file for " 67 | "details: " + file_handler) 68 | logger.debug("Exception: " + str(type(ex)) + ' - ' + str(ex)) 69 | logger.debug("Request: " + query) 70 | except urllib.error.URLError as ex: 71 | logger.info("Error parsing the API response in generic client. Please see the log file for " 72 | "details: " + file_handler) 73 | logger.debug("Exception: " + str(type(ex)) + ' - ' + str(ex)) 74 | logger.debug("Request: " + query) 75 | except Exception as ex: 76 | logger.info("Error parsing the API response in generic client. 
Please see the log file for " 77 | "details: " + file_handler) 78 | logger.debug("Exception: " + str(type(ex)) + ' - ' + str(ex)) 79 | logger.debug("Request: " + query) 80 | if method == 'retrieve': 81 | try: 82 | req = urllib.request.Request(query, headers={'User-Agent': 'Mozilla/5.0'}) 83 | request_result = urllib.request.urlopen(req).read() 84 | except urllib.error.HTTPError as ex: 85 | logger.info("Error parsing the API response in generic client. Please see the log file for " 86 | "details: " + file_handler) 87 | logger.debug("Exception: " + str(type(ex)) + ' - ' + str(ex)) 88 | logger.debug("Request: " + query) 89 | except UnicodeEncodeError as ex: 90 | logger.info("Error parsing the API response in generic client. Please see the log file for " 91 | "details: " + file_handler) 92 | logger.debug("Exception: " + str(type(ex)) + ' - ' + str(ex)) 93 | logger.debug("Request: " + query) 94 | except urllib.error.URLError as ex: 95 | logger.info("Error parsing the API response in generic client. Please see the log file for " 96 | "details: " + file_handler) 97 | logger.debug("Exception: " + str(type(ex)) + ' - ' + str(ex)) 98 | logger.debug("Request: " + query) 99 | except Exception as ex: 100 | logger.info("Error parsing the API response in generic client. Please see the log file for " 101 | "details: " + file_handler) 102 | logger.debug("Exception: " + str(type(ex)) + ' - ' + str(ex)) 103 | logger.debug("Request: " + query) 104 | if request_result is None: 105 | logger.info("The API response is None. Please see the log file for " 106 | "details: " + file_handler) 107 | logger.debug("Request: " + str(query)) 108 | request_result = Response() 109 | request_result.status_code = 404 110 | request_result._content = "The API response is None for query: " + str(query) 111 | request_result.headers = {'Content-Type': 'text/plain'} 112 | request_result._text = "The API response is None for query: " + str(query) 113 | return request_result 114 | 115 | def default_query(self, parameters): 116 | query = parameters['query'].replace('(', '%28').replace(')', '%29').replace("'", "") 117 | words = re.split(' | ', query) 118 | for word in words: 119 | word = word.replace('%29', '').replace('%28', '') 120 | synonyms = parameters['synonyms'] 121 | query_parameter = '' 122 | if word in synonyms.keys(): 123 | word_synonyms = synonyms[word] 124 | query_parameter = query_parameter + ':%22' + word + '%22' 125 | for synonym in word_synonyms: 126 | query_parameter = query_parameter + '+OR+:%22' + synonym + '%22' 127 | query_parameter = '%28' + query_parameter + '%29' 128 | query = query.replace(word, query_parameter) 129 | else: 130 | query_parameter = query_parameter + ':%22' + word + '%22' 131 | query = query.replace(word, query_parameter) 132 | query = query.replace(' ', '+AND+').replace(' ', '+OR+').replace(' ', '+') 133 | query = '%28' + query + '%29' 134 | 135 | if 'fields' in parameters: 136 | qf = '' 137 | fields = parameters['fields'] 138 | for field in fields: 139 | qf = qf + query.replace('', field) + '+OR+' 140 | query = qf[:-4] 141 | return query 142 | 143 | def ieeexplore_query(self, parameters): 144 | query = parameters['query'].replace("\'", '') 145 | words = re.split(' | ', query) 146 | for word in words: 147 | word = word.replace(')', '').replace('(', '') 148 | synonyms = parameters['synonyms'] 149 | query_parameter = '' 150 | if word in synonyms.keys(): 151 | word_synonyms = synonyms[word] 152 | query_parameter = query_parameter + '"' + word + '"' 153 | for synonym in word_synonyms: 154 | 
query_parameter = query_parameter + 'OR"' + synonym + '"' 155 | query_parameter = '(' + query_parameter + ')' 156 | query = query.replace(word, query_parameter) 157 | else: 158 | query_parameter = query_parameter + '"' + word + '"' 159 | query = query.replace(word, query_parameter) 160 | query = query.replace(' ', 'AND').replace(' ', 'OR') 161 | first_term = query.split('AND')[0] 162 | first_term = first_term.replace('(', '').replace(')', '') 163 | words_first_term = first_term.split('OR') 164 | queries = [] 165 | for word in words_first_term: 166 | q = query.replace('(' + first_term + ')', word) 167 | queries.append(q) 168 | return queries 169 | 170 | def elsevier_query(self, parameters): 171 | domains = [] 172 | for domain in parameters['domains']: 173 | domains.append(domain) 174 | synonyms = parameters['synonyms'][domain] 175 | for synonym in synonyms: 176 | domains.append(synonym) 177 | query_domains = 'ALL(' 178 | for domain in domains: 179 | query_domains = query_domains + domain + ' OR ' 180 | query_domains = query_domains + ')' 181 | query_domains = query_domains.replace(' OR )', ')') 182 | 183 | interests = [] 184 | for interest in parameters['interests']: 185 | interests.append(interest) 186 | synonyms = parameters['synonyms'][interest] 187 | for synonym in synonyms: 188 | interests.append(synonym) 189 | query_interests = 'ALL(' 190 | for interest in interests: 191 | query_interests = query_interests + interest + ' OR ' 192 | query_interests = query_interests + ')' 193 | query_interests = query_interests.replace(' OR )', ')') 194 | query = query_domains + ' AND ' + query_interests 195 | return query 196 | 197 | def core_query(self, parameters): 198 | query = parameters['query'].replace("'", "") 199 | words = re.split(' | ', query) 200 | for word in words: 201 | word = word.replace('(', '').replace(')', '') 202 | synonyms = parameters['synonyms'] 203 | query_parameter = '' 204 | if word in synonyms.keys(): 205 | word_synonyms = synonyms[word] 206 | query_parameter = query_parameter + ' ' + word + ' ' 207 | for synonym in word_synonyms: 208 | query_parameter = query_parameter + ' OR ' + synonym + ' ' 209 | query_parameter = '(' + query_parameter + ')' 210 | query = query.replace(word, query_parameter) 211 | else: 212 | query_parameter = query_parameter + ' ' + word + ' ' 213 | query_parameter = '(' + query_parameter + ')' 214 | query = query.replace(word, query_parameter) 215 | query = query.replace(' ', ' AND ').replace(' ', ' OR ') 216 | query = ':(' + query + ')' 217 | 218 | if 'fields' in parameters: 219 | qf = '' 220 | fields = parameters['fields'] 221 | for field in fields: 222 | qf = qf + query.replace('', field) + ' OR ' 223 | query = qf[:-4] 224 | query = 'language.code:en AND abstract:(NOT thesis AND NOT tesis) AND title:(NOT survey AND NOT review) ' \ 225 | 'AND (' + query + ')' 226 | return query 227 | 228 | def transform_query(self, parameters, api): 229 | queries = [] 230 | query = parameters['query'] 231 | # Define API-specific transformations 232 | if api == 'arxiv' or api == 'springer' or api == 'scopus': 233 | # Replace single quotes with double quotes 234 | query = re.sub(r"'", '"', query) 235 | # Add field specifications and URL encoding for AND and OR operators 236 | query = re.sub(r'(\w+)', r':"\1"', query) 237 | query = re.sub(r'', '+AND+', query) 238 | query = re.sub(r'', '+OR+', query) 239 | 240 | # Wrap the whole expression in parentheses 241 | query = f'({query})' 242 | 243 | # URL-encode the resulting string 244 | query = query.replace('(', 
'%28').replace(')', '%29') 245 | 246 | if 'fields' in parameters: 247 | qf = '' 248 | fields = parameters['fields'] 249 | for field in fields: 250 | qf = qf + query.replace('', field) + '+OR+' 251 | query = qf[:-4] 252 | queries.append(query) 253 | 254 | elif api == 'core': 255 | # Replace single quotes with double quotes 256 | query = re.sub(r"'", '"', query) 257 | # Add parentheses for grouping 258 | query = re.sub(r'', ' AND ', query) 259 | query = re.sub(r'', ' OR ', query) 260 | if 'fields' in parameters: 261 | qf = '' 262 | fields = parameters['fields'] 263 | for field in fields: 264 | qf = qf + query.replace('', field) + ' OR ' 265 | query = qf[:-4] 266 | query = '(subjects:(*article* OR *Article* OR *journal* OR *Journal* OR *ART* OR ' \ 267 | '*conference* OR *CONFERENCE*)) AND (description:(NOT *thes* AND NOT *Thes* ' \ 268 | 'AND NOT *tesis* AND NOT *Tesis* AND NOT *Master* AND NOT *master*)) AND (' + query + ')' 269 | queries.append(query) 270 | 271 | elif api == 'ieeexplore' or api == 'semantic_scholar': 272 | # Preserve whitespace, especially inside quoted phrases 273 | # Replace single quotes with double quotes 274 | query = re.sub(r"'", '"', query) 275 | # Add parentheses for grouping 276 | query = re.sub(r'', 'AND', query) 277 | query = re.sub(r'', 'OR', query) 278 | first_term = query.split('AND')[0] 279 | first_term = first_term.replace('(', '').replace(')', '') 280 | words_first_term = first_term.split('OR') 281 | for word in words_first_term: 282 | q = query.replace('(' + first_term + ')', word) 283 | queries.append(q) 284 | return queries 285 | -------------------------------------------------------------------------------- /clients/apis/xploreapi.py: -------------------------------------------------------------------------------- 1 | import math 2 | import urllib 3 | from urllib.request import urlopen 4 | import xml.etree.ElementTree as ET 5 | import json 6 | import requests 7 | import time 8 | 9 | 10 | class XPLORE: 11 | 12 | # API endpoint (all non-Open Access) 13 | endPoint = "http://ieeexploreapi.ieee.org/api/v1/search/articles" 14 | 15 | # Open Access Document endpoint 16 | openAccessEndPoint = "http://ieeexploreapi.ieee.org/api/v1/search/document/" 17 | 18 | def __init__(self, apiKey): 19 | 20 | # API key 21 | self.apiKey = apiKey 22 | 23 | # flag that some search criteria has been provided 24 | self.queryProvided = False 25 | 26 | # flag for Open Access, which changes endpoint in use and limits results to just Open Access 27 | self.usingOpenAccess = False 28 | 29 | # flag that article number has been provided, which overrides all other search criteria 30 | self.usingArticleNumber = False 31 | 32 | # flag that a boolean method is in use 33 | self.usingBoolean = False 34 | 35 | # flag that a facet is in use 36 | self.usingFacet = False 37 | 38 | # flag that a facet has been applied, in the event that multiple facets are passed 39 | self.facetApplied = False 40 | 41 | # data type for results; default is json (other option is xml) 42 | self.outputType = 'json' 43 | 44 | # data format for results; default is raw (returned string); other option is object 45 | self.outputDataFormat = 'raw' 46 | 47 | # default of 25 results returned 48 | self.resultSetMax = 25 49 | 50 | # maximum of 200 results returned 51 | self.resultSetMaxCap = 200 52 | 53 | # records returned default to position 1 in result set 54 | self.startRecord = 1 55 | 56 | # default sort order is ascending; could also be 'desc' for descending 57 | self.sortOrder = 'asc' 58 | 59 | # field name that is being 
used for sorting 60 | self.sortField = 'article_title' 61 | 62 | # array of permitted search fields for searchField() method 63 | self.allowedSearchFields = ['abstract', 'affiliation', 'article_number', 'article_title', 'author', 'boolean_text', 'content_type', 'd-au', 'd-pubtype', 'd-publisher', 'd-year', 'doi', 'end_year', 'facet', 'index_terms', 'isbn', 'issn', 'is_number', 'meta_data', 'open_access', 'publication_number', 'publication_title', 'publication_year', 'publisher', 'querytext', 'start_year', 'thesaurus_terms'] 64 | 65 | # dictionary of all search parameters in use and their values 66 | self.parameters = {} 67 | 68 | # dictionary of all filters in use and their values 69 | self.filters = {} 70 | 71 | 72 | # ensuring == can be used reliably 73 | def __eq__(self, other): 74 | if isinstance(other, self.__class__): 75 | return self.__dict__ == other.__dict__ 76 | else: 77 | return False 78 | 79 | 80 | # ensuring != can be used reliably 81 | def __ne__(self, other): 82 | return not self.__eq__(other) 83 | 84 | 85 | # set the data type for the API output 86 | # string outputType Format for the returned result (JSON, XML) 87 | # return void 88 | def dataType(self, outputType): 89 | 90 | outputType = outputType.strip().lower() 91 | self.outputType = outputType 92 | 93 | 94 | # set the data format for the API output 95 | # string outputDataFormat Data structure for the returned result (raw string or object) 96 | # return void 97 | def dataFormat(self, outputDataFormat): 98 | 99 | outputDataFormat = outputDataFormat.strip().lower() 100 | self.outputDataFormat = outputDataFormat 101 | 102 | 103 | # set the start position in the 104 | # string start Start position in the returned data 105 | # return void 106 | def startingResult(self, start): 107 | 108 | self.startRecord = math.ceil(start) if (start > 0) else 1 109 | 110 | 111 | # set the maximum number of results 112 | # string maximum Max number of results to return 113 | # return void 114 | def maximumResults(self, maximum): 115 | 116 | self.resultSetMax = math.ceil(maximum) if (maximum > 0) else 25 117 | if self.resultSetMax > self.resultSetMaxCap: 118 | self.resultSetMax = self.resultSetMaxCap 119 | 120 | 121 | # setting a filter on results 122 | # string filterParam Field used for filtering 123 | # string value Text to filter on 124 | # return void 125 | def resultsFilter(self, filterParam, value): 126 | 127 | filterParam = filterParam.strip().lower() 128 | value = value.strip() 129 | 130 | if len(value) > 0: 131 | self.filters[filterParam] = value 132 | self.queryProvided = True 133 | 134 | # Standards do not have article titles, so switch to sorting by article number 135 | if (filterParam == 'content_type' and value == 'Standards'): 136 | self.resultsSorting('publication_year', 'asc') 137 | 138 | 139 | # setting sort order for results 140 | # string field Data field used for sorting 141 | # string order Sort order for results (ascending or descending) 142 | # return void 143 | def resultsSorting(self, field, order): 144 | 145 | field = field.strip().lower() 146 | order = order.strip() 147 | self.sortField = field 148 | self.sortOrder = order 149 | 150 | 151 | # shortcut method for assigning search parameters and values 152 | # string field Field used for searching 153 | # string value Text to query 154 | # return void 155 | def searchField(self, field, value): 156 | 157 | field = field.strip().lower() 158 | if field in self.allowedSearchFields: 159 | self.addParameter(field, value) 160 | else: 161 | print("Searches against field " 
+ field + " are not supported") 162 | 163 | 164 | # string value Abstract text to query 165 | # return void 166 | def abstractText(self, value): 167 | 168 | self.addParameter('abstract', value) 169 | 170 | 171 | # string value Affiliation text to query 172 | # return void 173 | def affiliationText(self, value): 174 | 175 | self.addParameter('affiliation', value) 176 | 177 | 178 | # string value Article number to query 179 | # return void 180 | def articleNumber(self, value): 181 | 182 | self.addParameter('article_number', value) 183 | 184 | 185 | # string value Article title to query 186 | # return void 187 | def articleTitle(self, value): 188 | 189 | self.addParameter('article_title', value) 190 | 191 | 192 | # string value Author to query 193 | # return void 194 | def authorText(self, value): 195 | 196 | self.addParameter('author', value) 197 | 198 | 199 | # string value Author Facet text to query 200 | # return void 201 | def authorFacetText(self, value): 202 | 203 | self.addParameter('d-au', value) 204 | 205 | 206 | # string value Value(s) to use in the boolean query 207 | # return void 208 | def booleanText(self, value): 209 | 210 | self.addParameter('boolean_text', value) 211 | 212 | 213 | # string value Content Type Facet text to query 214 | # return void 215 | def contentTypeFacetText(self, value): 216 | 217 | self.addParameter('d-pubtype', value) 218 | 219 | 220 | # string value DOI (Digital Object Identifier) to query 221 | # return void 222 | def doi(self, value): 223 | 224 | self.addParameter('doi', value) 225 | 226 | 227 | # string value Facet text to query 228 | # return void 229 | def facetText(self, value): 230 | 231 | self.addParameter('facet', value) 232 | 233 | 234 | # string value Author Keywords, IEEE Terms, and Mesh Terms to query 235 | # return void 236 | def indexTerms(self, value): 237 | 238 | self.addParameter('index_terms', value) 239 | 240 | 241 | # string value ISBN (International Standard Book Number) to query 242 | # return void 243 | def isbn(self, value): 244 | 245 | self.addParameter('isbn', value) 246 | 247 | 248 | # string value ISSN (International Standard Serial number) to query 249 | # return void 250 | def issn(self, value): 251 | 252 | self.addParameter('issn', value) 253 | 254 | 255 | # string value Issue number to query 256 | # return void 257 | def issueNumber(self, value): 258 | 259 | self.addParameter('is_number', value) 260 | 261 | 262 | # string value Text to query across metadata fields and the abstract 263 | # return void 264 | def metaDataText(self, value): 265 | 266 | self.addParameter('meta_data', value) 267 | 268 | 269 | # string value Publication Facet text to query 270 | # return void 271 | def publicationFacetText(self, value): 272 | 273 | self.addParameter('d-year', value) 274 | 275 | 276 | # string value Publisher Facet text to query 277 | # return void 278 | def publisherFacetText(self, value): 279 | 280 | self.addParameter('d-publisher', value) 281 | 282 | 283 | # string value Publication title to query 284 | # return void 285 | def publicationTitle(self, value): 286 | 287 | self.addParameter('publication_title', value) 288 | 289 | 290 | # string or number value Publication year to query 291 | # return void 292 | def publicationYear(self, value): 293 | 294 | self.addParameter('publication_year', value) 295 | 296 | 297 | # string value Text to query across metadata fields, abstract and document text 298 | # return void 299 | def queryText(self, value): 300 | 301 | self.addParameter('querytext', value) 302 | 303 | 304 | # string 
value Thesaurus terms (IEEE Terms) to query 305 | # return void 306 | def thesaurusTerms(self, value): 307 | 308 | self.addParameter('thesaurus_terms', value) 309 | 310 | 311 | # add query parameter 312 | # string parameter Data field to query 313 | # string value Text to use in query 314 | # return void 315 | def addParameter(self, parameter, value): 316 | 317 | value = value.strip() 318 | 319 | if (len(value) > 0): 320 | 321 | self.parameters[parameter]= value 322 | 323 | # viable query criteria provided 324 | self.queryProvided = True 325 | 326 | # set flags based on parameter 327 | if (parameter == 'article_number'): 328 | 329 | self.usingArticleNumber = True 330 | 331 | if (parameter == 'boolean_text'): 332 | 333 | self.usingBoolean = True 334 | 335 | if (parameter == 'facet' or parameter == 'd-au' or parameter == 'd-year' or parameter == 'd-pubtype' or parameter == 'd-publisher'): 336 | 337 | self.usingFacet = True 338 | 339 | 340 | # Open Access document 341 | # string article Article number to query 342 | # return void 343 | def openAccess(self, article): 344 | 345 | self.usingOpenAccess = True 346 | self.queryProvided = True 347 | self.articleNumber(article) 348 | 349 | 350 | # calls the API 351 | # string debugMode If this mode is on (True) then output query and not data 352 | # return either raw result string, XML or JSON object, or array 353 | def callAPI(self, debugModeOff=True): 354 | 355 | if self.usingOpenAccess is True: 356 | 357 | str1 = self.buildOpenAccessQuery() 358 | 359 | else: 360 | 361 | str1 = self.buildQuery() 362 | 363 | if debugModeOff is False: 364 | 365 | return str1 366 | 367 | else: 368 | 369 | if self.queryProvided is False: 370 | print("No search criteria provided") 371 | data = {} 372 | try: 373 | data = self.queryAPI(str1) 374 | except urllib.error.HTTPError as ex: 375 | return data 376 | except Exception as ex: 377 | return data 378 | return data 379 | 380 | 381 | # creates the URL for the Open Access Document API call 382 | # return string: full URL for querying the API 383 | def buildOpenAccessQuery(self): 384 | 385 | url = self.openAccessEndPoint; 386 | url += str(self.parameters['article_number']) + '/fulltext' 387 | url += '?apikey=' + str(self.apiKey) 388 | url += '&format=' + str(self.outputType) 389 | 390 | return url 391 | 392 | 393 | # creates the URL for the non-Open Access Document API call 394 | # return string: full URL for querying the API 395 | def buildQuery(self): 396 | 397 | url = self.endPoint; 398 | 399 | url += '?apikey=' + str(self.apiKey) 400 | url += '&format=' + str(self.outputType) 401 | url += '&max_records=' + str(self.resultSetMax) 402 | url += '&start_record=' + str(self.startRecord) 403 | url += '&sort_order=' + str(self.sortOrder) 404 | url += '&sort_field=' + str(self.sortField) 405 | 406 | # add in search criteria 407 | # article number query takes priority over all others 408 | if (self.usingArticleNumber): 409 | 410 | url += '&article_number=' + str(self.parameters['article_number']) 411 | 412 | # boolean query 413 | elif (self.usingBoolean): 414 | 415 | url += '&querytext=(' + urllib.parse.quote_plus(self.parameters['boolean_text']) + ')' 416 | 417 | else: 418 | 419 | for key in self.parameters: 420 | 421 | if (self.usingFacet and self.facetApplied is False): 422 | 423 | url += '&querytext=' + urllib.parse.quote_plus(self.parameters[key]) + '&facet=' + key 424 | self.facetApplied = True 425 | 426 | else: 427 | 428 | url += '&' + key + '=' + urllib.parse.quote_plus(self.parameters[key]) 429 | 430 | 431 | # add in 
filters 432 | for key in self.filters: 433 | 434 | url += '&' + key + '=' + str(self.filters[key]) 435 | 436 | return url 437 | 438 | 439 | # creates the URL for the API call 440 | # string url Full URL to pass to API 441 | # return string: Results from API 442 | def queryAPI(self, url): 443 | try: 444 | headers = {'Content-type': 'application/json', 'Accept': 'application/json'} 445 | content = requests.get(url, headers=headers) 446 | except urllib.error.HTTPError as ex: 447 | return content 448 | except UnicodeEncodeError as ex: 449 | return content 450 | except urllib.error.URLError as ex: 451 | return content 452 | except Exception as ex: 453 | return content 454 | return content 455 | 456 | 457 | # formats the data returned by the API 458 | # string data Result string from API 459 | def formatData(self, data): 460 | 461 | if self.outputDataFormat == 'raw': 462 | return data 463 | 464 | elif self.outputDataFormat == 'object': 465 | 466 | if self.outputType == 'xml': 467 | obj = ET.ElementTree(ET.fromstring(data)) 468 | return obj 469 | 470 | else: 471 | obj = json.loads(data) 472 | return obj 473 | 474 | else: 475 | return data 476 | -------------------------------------------------------------------------------- /docs/configuration_guide.md: -------------------------------------------------------------------------------- 1 | # SaLS Configuration Guide 2 | 3 | ## Overview 4 | 5 | This guide provides comprehensive information about configuring SaLS (Semi-automatic Literature Survey) for your research needs. SaLS uses YAML configuration files to define search parameters, filters, and database selections. 6 | 7 | ## Table of Contents 8 | 9 | 1. [Basic Configuration Structure](#basic-configuration-structure) 10 | 2. [Required Parameters](#required-parameters) 11 | 3. [Optional Parameters](#optional-parameters) 12 | 4. [Query Syntax](#query-syntax) 13 | 5. [Database Configuration](#database-configuration) 14 | 6. [Filtering Options](#filtering-options) 15 | 7. [Best Practices](#best-practices) 16 | 8. [Troubleshooting](#troubleshooting) 17 | 9. [Configuration Templates](#configuration-templates) 18 | 19 | ## Basic Configuration Structure 20 | 21 | A SaLS configuration file is a YAML document with the following structure: 22 | 23 | ```yaml 24 | # Required parameters 25 | queries: [...] 26 | databases: [...] 27 | search_date: "YYYY-MM-DD" 28 | folder_name: "your_search_name" 29 | 30 | # Optional parameters 31 | start_date: "YYYY-MM-DD" 32 | end_date: "YYYY-MM-DD" 33 | syntactic_filters: [...] 34 | semantic_filters: [...] 
35 | synonyms: {...} 36 | ``` 37 | 38 | ## Required Parameters 39 | 40 | ### queries 41 | **Type**: List of dictionaries 42 | **Description**: Defines your search queries using boolean expressions 43 | **Format**: `[{query_name: "boolean_expression"}]` 44 | 45 | **Example**: 46 | ```yaml 47 | queries: 48 | - machine learning: "'machine learning' & 'edge computing'" 49 | - systems engineering: "'systems engineering' | 'SE'" 50 | ``` 51 | 52 | **Best Practices**: 53 | - Use descriptive names for your queries 54 | - Use quotes around multi-word terms 55 | - Combine related concepts with OR operators 56 | - Use AND operators to narrow down results 57 | 58 | ### databases 59 | **Type**: List of strings 60 | **Description**: Specifies which databases to search 61 | **Available Options**: `arxiv`, `semantic_scholar`, `springer`, `ieeexplore`, `scopus`, `core`, `crossref`, `europe_pmc`, `pubmed`, `openalex` 62 | 63 | **Example**: 64 | ```yaml 65 | databases: 66 | - arxiv # Open access, no API key needed 67 | - semantic_scholar # Open access, no API key needed 68 | - springer # Commercial, requires API key 69 | ``` 70 | 71 | **Note**: Some databases require API keys in `config.json`. See [Database Configuration](#database-configuration) for details. 72 | 73 | ### search_date 74 | **Type**: String (YYYY-MM-DD format) 75 | **Description**: Date when the search was performed (for organization purposes) 76 | **Example**: `search_date: 2024-12-15` 77 | 78 | ### folder_name 79 | **Type**: String 80 | **Description**: Name of the folder where results will be stored 81 | **Example**: `folder_name: my_literature_search` 82 | 83 | ## Optional Parameters 84 | 85 | ### start_date and end_date 86 | **Type**: String (YYYY-MM-DD format) 87 | **Description**: Date range to limit search results 88 | **Example**: 89 | ```yaml 90 | start_date: 2020-01-01 # Papers from 2020 onwards 91 | end_date: 2024-12-31 # Papers until end of 2024 92 | ``` 93 | 94 | **Benefits**: 95 | - Reduces search time 96 | - Focuses on recent research 97 | - Improves relevance for time-sensitive topics 98 | 99 | ### synonyms 100 | **Type**: Dictionary 101 | **Description**: Defines synonyms for query expansion to increase search coverage 102 | **Format**: `{term: [synonym1, synonym2, ...]}` 103 | 104 | **Example**: 105 | ```yaml 106 | machine learning: 107 | - ml 108 | - deep learning 109 | - neural networks 110 | - supervised learning 111 | ``` 112 | 113 | **Best Practices**: 114 | - Include abbreviations and alternative names 115 | - Add related concepts and terminology 116 | - Use domain-specific synonyms 117 | 118 | ### syntactic_filters 119 | **Type**: List of strings 120 | **Description**: Terms that must appear in paper content (AND logic) 121 | **Example**: 122 | ```yaml 123 | syntactic_filters: 124 | - edge computing 125 | - distributed systems 126 | - performance 127 | ``` 128 | 129 | **Use Cases**: 130 | - Filtering out irrelevant papers early 131 | - Ensuring specific concepts are covered 132 | - Improving result relevance 133 | 134 | ### semantic_filters 135 | **Type**: List of dictionaries 136 | **Description**: AI-powered similarity matching using detailed descriptions 137 | **Format**: `[{filter_name: "detailed_description"}]` 138 | 139 | **Example**: 140 | ```yaml 141 | semantic_filters: 142 | - edge computing: "Research on edge computing, fog computing, and distributed edge systems including resource management, placement strategies, and performance optimization" 143 | - ml systems: "Papers about machine learning systems in 
production environments including deployment, monitoring, scaling, and operational challenges" 144 | ``` 145 | 146 | **Best Practices**: 147 | - Be specific and descriptive 148 | - Include key concepts and requirements 149 | - Focus on what you're looking for, not what you want to exclude 150 | 151 | ## Query Syntax 152 | 153 | SaLS supports a flexible boolean query syntax with the following operators: 154 | 155 | ### Basic Operators 156 | - `&` or `AND` - AND operator (both terms must be present) 157 | - `|` or `OR` - OR operator (either term can be present) 158 | - `&&` or `||` - Alternative syntax for AND/OR 159 | 160 | ### Advanced Features 161 | - **Parentheses**: Group expressions for complex logic 162 | - **Quotes**: Preserve multi-word terms as phrases 163 | - **Legacy Support**: `¦` character for OR operations 164 | 165 | ### Examples 166 | 167 | **Simple AND**: 168 | ```yaml 169 | queries: 170 | - basic: "'machine learning' & 'edge computing'" 171 | ``` 172 | 173 | **Complex Boolean Expression**: 174 | ```yaml 175 | queries: 176 | - complex: "'machine learning' & ('edge computing' | 'fog computing') & ('performance' | 'optimization')" 177 | ``` 178 | 179 | **Grouped Logic**: 180 | ```yaml 181 | queries: 182 | - grouped: "('deep learning' | 'neural networks') & ('computer vision' | 'image processing')" 183 | ``` 184 | 185 | ## Database Configuration 186 | 187 | ### Open Access Databases (No API Key Required) 188 | - **arXiv**: Excellent for recent preprints and open access papers 189 | - **Semantic Scholar**: Good for citation analysis and impact assessment 190 | 191 | ### Commercial Databases (API Key Required) 192 | - **Springer Nature**: High-quality journals and books 193 | - **IEEE Xplore**: Excellent for engineering and computer science 194 | - **Scopus**: Comprehensive coverage across all disciplines 195 | - **CORE**: Open access repository aggregator 196 | 197 | ### API Key Setup 198 | 1. Create a `config.json` file in the project root 199 | 2. Add your API keys: 200 | ```json 201 | { 202 | "api_access_springer": "YOUR_SPRINGER_API_KEY", 203 | "api_access_ieee": "YOUR_IEEE_API_KEY", 204 | "api_access_elsevier": "YOUR_SCOPUS_API_KEY", 205 | "api_access_core": "YOUR_CORE_API_KEY" 206 | } 207 | ``` 208 | 209 | **Note**: Only add keys for databases you plan to use. 210 | 211 | ## Filtering Options 212 | 213 | ### Two-Stage Filtering Process 214 | 215 | 1. **Syntactic Filtering**: Basic text matching using your specified terms 216 | 2. **Semantic Filtering**: AI-powered similarity matching using BERT models 217 | 218 | ### Filtering Strategy 219 | 220 | **For High Precision (Fewer, More Relevant Results)**: 221 | - Use more specific queries 222 | - Add more syntactic filters 223 | - Use date ranges to focus on recent work 224 | 225 | **For High Recall (More Results, May Include Less Relevant)**: 226 | - Use broader queries with OR operators 227 | - Fewer syntactic filters 228 | - No date restrictions 229 | 230 | ## Best Practices 231 | 232 | ### 1. Start Simple 233 | - Begin with basic queries 234 | - Add complexity gradually 235 | - Test with open databases first 236 | 237 | ### 2. Query Design 238 | - Use specific terminology from your field 239 | - Include synonyms and abbreviations 240 | - Balance between precision and recall 241 | 242 | ### 3. Database Selection 243 | - Start with open databases (arxiv, semantic_scholar) 244 | - Add commercial databases for comprehensive coverage 245 | - Consider field-specific database strengths 246 | 247 | ### 4. 
Filtering Strategy 248 | - Use syntactic filters for precision 249 | - Use semantic filters for recall 250 | - Iterate based on initial results 251 | 252 | ### 5. Date Management 253 | - Set reasonable date ranges for your research area 254 | - Consider field evolution speed 255 | - Balance between recency and comprehensiveness 256 | 257 | ## Troubleshooting and Error Recovery 258 | 259 | ### Configuration Error Recovery 260 | 261 | SaLS now provides intelligent error recovery that helps you fix configuration issues quickly and continue with your research. 262 | 263 | #### Error Severity Levels 264 | 265 | **🔴 Critical Errors** - Pipeline cannot continue 266 | - Missing or invalid queries (required for search) 267 | - Malformed query syntax 268 | - These must be fixed before the pipeline can run 269 | 270 | **🟡 Warnings** - Pipeline can continue with defaults 271 | - Missing databases (defaults to open databases) 272 | - Missing search_date (defaults to current date) 273 | - Missing folder_name (defaults to filename-based) 274 | - Invalid date formats (defaults to reasonable values) 275 | - Missing filters (defaults to empty lists) 276 | 277 | #### Automatic Fallbacks 278 | 279 | When warnings are detected, SaLS automatically applies sensible defaults: 280 | 281 | ```yaml 282 | # If databases are missing, SaLS uses: 283 | databases: [arxiv, semantic_scholar] 284 | 285 | # If search_date is missing, SaLS uses: 286 | search_date: [current date] 287 | 288 | # If folder_name is missing, SaLS uses: 289 | folder_name: [filename without .yaml extension] 290 | 291 | # If filters are missing, SaLS uses: 292 | syntactic_filters: [] 293 | semantic_filters: [] 294 | ``` 295 | 296 | #### Recovery Suggestions 297 | 298 | For each issue, SaLS provides: 299 | - **Clear description** of what's wrong 300 | - **Specific fix** instructions 301 | - **Working examples** to copy-paste 302 | - **Default values** that will be used 303 | 304 | ### Common Issues and Solutions 305 | 306 | #### Configuration Validation Errors 307 | **Problem**: Configuration validation fails with specific error messages 308 | **Solution**: Follow the error message guidance and check: 309 | - YAML syntax (proper indentation) 310 | - Required field formats 311 | - Date format (YYYY-MM-DD) 312 | - Database name spelling 313 | 314 | **Recovery**: SaLS will show exactly what's wrong and how to fix it 315 | 316 | #### Missing Required Fields 317 | **Problem**: Critical fields like queries are missing 318 | **Solution**: Add the missing sections following the provided examples 319 | 320 | **Recovery**: SaLS prevents pipeline execution and guides you to add required fields 321 | 322 | #### Missing Optional Fields 323 | **Problem**: Optional fields like databases or search_date are missing 324 | **Solution**: Either add them or let SaLS use sensible defaults 325 | 326 | **Recovery**: SaLS continues with defaults and shows what was applied 327 | 328 | #### Invalid Date Formats 329 | **Problem**: Dates are in wrong format (e.g., 2020/01/01) 330 | **Solution**: Use YYYY-MM-DD format (e.g., 2020-01-01) 331 | 332 | **Recovery**: SaLS suggests the correct format and provides examples 333 | 334 | #### Invalid Database Names 335 | **Problem**: Unknown database specified 336 | **Solution**: Use only valid database names from the supported list 337 | 338 | **Recovery**: SaLS shows all valid databases and continues with valid ones 339 | 340 | #### Too Many Results 341 | **Problem**: Search returns too many papers 342 | **Solutions**: 343 | - Add more 
specific terms to queries 344 | - Use syntactic filters 345 | - Set date ranges 346 | - Use more specific semantic filter descriptions 347 | 348 | #### Too Few Results 349 | **Problem**: Search returns too few papers 350 | **Solutions**: 351 | - Broaden queries with OR operators 352 | - Add synonyms 353 | - Remove overly restrictive filters 354 | - Check date ranges 355 | 356 | #### API Errors 357 | **Problem**: Commercial database searches fail 358 | **Solutions**: 359 | - Verify API keys in `config.json` 360 | - Check API key validity 361 | - Use open databases as fallback 362 | - Check rate limiting 363 | 364 | #### Semantic Filtering Issues 365 | **Problem**: Semantic filters don't work as expected 366 | **Solutions**: 367 | - Make descriptions more specific and detailed 368 | - Include key concepts and requirements 369 | - Focus on what you want, not what you want to exclude 370 | 371 | ### Error Message Examples 372 | 373 | #### Critical Error (Pipeline Stops) 374 | ``` 375 | Configuration error: 'queries' section is missing in config.yaml 376 | 377 | 🔴 CRITICAL ERRORS - Pipeline cannot continue: 378 | 379 | ❌ Missing queries section 380 | Fix: Add a queries section with your search terms 381 | Example: 382 | queries: 383 | - augmented reality: "'augmented reality' & 'edge'" 384 | - machine learning: "'machine learning' & 'systems'" 385 | ``` 386 | 387 | #### Warnings (Pipeline Continues with Defaults) 388 | ``` 389 | Configuration validation completed with warnings: 390 | Configuration warning: 'databases' section is missing 391 | Configuration warning: 'search_date' is missing 392 | 393 | 🟡 WARNINGS - Pipeline will continue with defaults where possible: 394 | 395 | ⚠️ Missing databases section 396 | Fix: Add databases section or use default open databases 397 | Default: ['arxiv', 'semantic_scholar'] 398 | Example: 399 | databases: 400 | - arxiv # Open access, no API key needed 401 | - semantic_scholar # Open access, no API key needed 402 | 403 | ⚠️ Missing search_date 404 | Fix: Add search_date or use current date 405 | Default: current date 406 | Example: 407 | search_date: 2024-12-15 408 | ``` 409 | 410 | ### Best Practices for Error Recovery 411 | 412 | 1. **Start with the error messages** - they provide specific guidance 413 | 2. **Fix critical errors first** - these prevent the pipeline from running 414 | 3. **Review warnings** - understand what defaults will be applied 415 | 4. **Use the provided examples** - copy-paste working configurations 416 | 5. **Test incrementally** - fix one issue at a time 417 | 6. **Let SaLS help** - use the automatic fallbacks when appropriate 418 | 419 | ### Getting Help with Configuration Issues 420 | 421 | If you encounter persistent issues: 422 | 423 | 1. **Check the error messages** - they provide specific guidance 424 | 2. **Review the configuration guide** - covers common scenarios 425 | 3. **Use the templates** - working examples to build upon 426 | 4. **Start simple** - add complexity gradually 427 | 5. 
**Test with open databases** - no API key requirements 428 | 429 | ## Configuration Templates 430 | 431 | SaLS provides several configuration templates to get you started: 432 | 433 | ### Basic Template 434 | - **File**: `templates/basic_search_template.yaml` 435 | - **Use Case**: Simple literature searches 436 | - **Features**: Basic queries, synonyms, open databases 437 | 438 | ### Advanced Template 439 | - **File**: `templates/advanced_research_template.yaml` 440 | - **Use Case**: Complex research projects, systematic reviews 441 | - **Features**: All SaLS features, comprehensive examples 442 | 443 | ### Machine Learning Template 444 | - **File**: `templates/machine_learning_template.yaml` 445 | - **Use Case**: ML/AI research 446 | - **Features**: ML-specific terminology, subfield examples 447 | 448 | ### Using Templates 449 | 1. Copy the appropriate template file 450 | 2. Rename it to your project 451 | 3. Modify the values according to your research needs 452 | 4. Update the `search_date` and `folder_name` 453 | 5. Test with a small search first 454 | 455 | ## Getting Help 456 | 457 | If you encounter issues: 458 | 459 | 1. **Check the error messages** - they provide specific guidance 460 | 2. **Review the configuration guide** - covers common scenarios 461 | 3. **Use the templates** - working examples to build upon 462 | 4. **Start simple** - add complexity gradually 463 | 5. **Test with open databases** - no API key requirements 464 | 465 | ## Advanced Configuration 466 | 467 | ### Custom Fields and Types 468 | ```yaml 469 | # Advanced users can customize search fields and types 470 | fields: ['title', 'abstract', 'keywords', 'full_text'] 471 | types: ['conferences', 'journals', 'preprints', 'reports'] 472 | ``` 473 | 474 | ### Performance Optimization 475 | - Use date ranges to limit search scope 476 | - Start with fewer databases and add more as needed 477 | - Use syntactic filters to reduce processing time 478 | - Test queries with small date ranges first 479 | 480 | --- 481 | 482 | *This guide covers the essential configuration options for SaLS. For more advanced usage, refer to the code documentation and examples in the templates directory.* 483 | -------------------------------------------------------------------------------- /util/error_standards.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Error Handling Standards for SaLS Project. 4 | 5 | This module defines consistent error handling patterns, logging standards, 6 | and user experience guidelines across the entire SaLS codebase. 7 | 8 | Standards ensure: 9 | 1. Consistent error categorization (CRITICAL, ERROR, WARNING, INFO) 10 | 2. Uniform logging message formats 11 | 3. Standardized user-facing error messages 12 | 4. Consistent exception handling patterns 13 | 5. 
Unified error recovery suggestions 14 | """ 15 | 16 | import logging 17 | import traceback 18 | from typing import Dict, List, Optional, Tuple, Union 19 | from enum import Enum 20 | 21 | 22 | class ErrorSeverity(Enum): 23 | """Standard error severity levels for consistent categorization.""" 24 | CRITICAL = "CRITICAL" # Pipeline cannot continue 25 | ERROR = "ERROR" # Operation failed, but pipeline can continue 26 | WARNING = "WARNING" # Issue detected, operation continues with defaults 27 | INFO = "INFO" # Informational message 28 | DEBUG = "DEBUG" # Debug information 29 | 30 | 31 | class ErrorCategory(Enum): 32 | """Standard error categories for consistent classification.""" 33 | CONFIGURATION = "CONFIGURATION" # Configuration file issues 34 | API = "API" # External API failures 35 | NETWORK = "NETWORK" # Network/connection issues 36 | DATA = "DATA" # Data processing issues 37 | FILE = "FILE" # File I/O operations 38 | VALIDATION = "VALIDATION" # Data validation failures 39 | SYSTEM = "SYSTEM" # System-level issues 40 | USER_INPUT = "USER_INPUT" # User input validation 41 | RESOURCE = "RESOURCE" # Resource limitations (quotas, etc.) 42 | PIPELINE = "PIPELINE" # Pipeline orchestration and step execution 43 | 44 | 45 | class ErrorContext: 46 | """Standard error context information for consistent error reporting.""" 47 | 48 | def __init__(self, 49 | module: str, 50 | function: str, 51 | operation: str, 52 | severity: ErrorSeverity, 53 | category: ErrorCategory, 54 | user_facing: bool = True): 55 | self.module = module 56 | self.function = function 57 | self.operation = operation 58 | self.severity = severity 59 | self.category = category 60 | self.user_facing = user_facing 61 | self.timestamp = None # Will be set by error handler 62 | self.additional_context: Dict = {} 63 | 64 | def add_context(self, key: str, value: str) -> None: 65 | """Add additional context information.""" 66 | self.additional_context[key] = value 67 | 68 | def get_formatted_context(self) -> str: 69 | """Get formatted context string for logging.""" 70 | context_parts = [ 71 | f"Module: {self.module}", 72 | f"Function: {self.function}", 73 | f"Operation: {self.operation}", 74 | f"Category: {self.category.value}", 75 | f"Severity: {self.severity.value}" 76 | ] 77 | 78 | for key, value in self.additional_context.items(): 79 | context_parts.append(f"{key}: {value}") 80 | 81 | return " | ".join(context_parts) 82 | 83 | 84 | class ErrorMessage: 85 | """Standard error message format for consistent error reporting.""" 86 | 87 | def __init__(self, 88 | context: ErrorContext, 89 | error_type: str, 90 | error_description: str, 91 | recovery_suggestion: Optional[str] = None, 92 | next_steps: Optional[List[str]] = None, 93 | exception_type: Optional[str] = None, 94 | exception_message: Optional[str] = None, 95 | short_traceback: Optional[str] = None): 96 | self.context = context 97 | self.error_type = error_type 98 | self.error_description = error_description 99 | self.recovery_suggestion = recovery_suggestion 100 | self.next_steps = next_steps or [] 101 | self.exception_type = exception_type 102 | self.exception_message = exception_message 103 | self.short_traceback = short_traceback 104 | 105 | def get_log_message(self) -> str: 106 | """Get formatted message for logging.""" 107 | message_parts = [ 108 | f"[{self.context.severity.value}] {self.error_type}", 109 | f"Description: {self.error_description}", 110 | f"Context: {self.context.get_formatted_context()}" 111 | ] 112 | 113 | if self.recovery_suggestion: 114 | 
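# Illustrative comment (added; not in the original source). With the assumed
# values from the example usage at the end of this module (module="my_module",
# function="my_function", operation="file_reading", FILE category, ERROR
# severity), the pipe-joined log line produced by this method looks roughly like:
#   [ERROR] FileNotFoundError | Description: File not found | Context:
#   Module: my_module | Function: my_function | Operation: file_reading |
#   Category: FILE | Severity: ERROR | Recovery: Check file path and ensure file exists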
message_parts.append(f"Recovery: {self.recovery_suggestion}") 115 | 116 | return " | ".join(message_parts) 117 | 118 | def get_user_message(self) -> str: 119 | """Get user-friendly error message.""" 120 | if not self.context.user_facing: 121 | return f"An error occurred: {self.error_description}" 122 | 123 | message_parts = [] 124 | 125 | # Add severity indicator 126 | if self.context.severity == ErrorSeverity.CRITICAL: 127 | message_parts.append("🔴 CRITICAL ERROR") 128 | elif self.context.severity == ErrorSeverity.ERROR: 129 | message_parts.append("❌ ERROR") 130 | elif self.context.severity == ErrorSeverity.WARNING: 131 | message_parts.append("⚠️ WARNING") 132 | else: 133 | message_parts.append("ℹ️ INFO") 134 | 135 | # Add main message 136 | message_parts.append(self.error_description) 137 | 138 | # Add concise details 139 | message_parts.append(f"Details: {self.error_type} | Where: {self.context.module}.{self.context.function} ({self.context.operation})") 140 | if self.exception_type: 141 | message_parts.append(f"Error: {self.exception_type}: {self.exception_message}") 142 | 143 | # Add recovery suggestion 144 | if self.recovery_suggestion: 145 | message_parts.append(f"\n💡 {self.recovery_suggestion}") 146 | 147 | # Add next steps 148 | if self.next_steps: 149 | message_parts.append("\n📋 Next steps:") 150 | for i, step in enumerate(self.next_steps, 1): 151 | message_parts.append(f" {i}. {step}") 152 | 153 | return "\n".join(message_parts) 154 | 155 | 156 | class ErrorHandler: 157 | """Standard error handler for consistent error processing.""" 158 | 159 | def __init__(self, logger: logging.Logger): 160 | # Accept either a standard logging.Logger or a wrapper with `.logger` 161 | try: 162 | if not isinstance(logger, logging.Logger) and hasattr(logger, 'logger') and isinstance(logger.logger, logging.Logger): 163 | self.logger = logger.logger 164 | else: 165 | self.logger = logger 166 | except Exception: 167 | self.logger = logging.getLogger('sals_pipeline') 168 | 169 | def handle_error(self, 170 | error: Exception, 171 | context: ErrorContext, 172 | error_type: str, 173 | error_description: str, 174 | recovery_suggestion: Optional[str] = None, 175 | next_steps: Optional[List[str]] = None) -> ErrorMessage: 176 | """Handle an error according to SaLS standards.""" 177 | 178 | # Create error message 179 | exc_type_name = type(error).__name__ if error else None 180 | exc_message = str(error) if error else None 181 | short_tb = None 182 | try: 183 | if error and getattr(error, "__traceback__", None): 184 | tb_frames = traceback.extract_tb(error.__traceback__) 185 | last_frames = tb_frames[-3:] if len(tb_frames) > 3 else tb_frames 186 | formatted_frames = traceback.format_list(last_frames) 187 | short_tb = "".join(formatted_frames).rstrip() 188 | if exc_type_name: 189 | short_tb = f"{short_tb}\n{exc_type_name}: {exc_message}" 190 | except Exception: 191 | short_tb = None 192 | error_msg = ErrorMessage( 193 | context=context, 194 | error_type=error_type, 195 | error_description=error_description, 196 | recovery_suggestion=recovery_suggestion, 197 | next_steps=next_steps, 198 | exception_type=exc_type_name, 199 | exception_message=exc_message, 200 | short_traceback=short_tb 201 | ) 202 | 203 | # Log according to severity (include traceback for ERROR/CRITICAL) 204 | if context.severity == ErrorSeverity.CRITICAL: 205 | if error: 206 | self.logger.critical(error_msg.get_log_message(), exc_info=True) 207 | else: 208 | self.logger.critical(error_msg.get_log_message()) 209 | elif context.severity == 
ErrorSeverity.ERROR: 210 | if error: 211 | self.logger.error(error_msg.get_log_message(), exc_info=True) 212 | else: 213 | self.logger.error(error_msg.get_log_message()) 214 | elif context.severity == ErrorSeverity.WARNING: 215 | self.logger.warning(error_msg.get_log_message()) 216 | elif context.severity == ErrorSeverity.INFO: 217 | self.logger.info(error_msg.get_log_message()) 218 | else: # DEBUG 219 | self.logger.debug(error_msg.get_log_message()) 220 | 221 | return error_msg 222 | 223 | def log_and_print(self, 224 | error_msg: ErrorMessage, 225 | print_to_console: bool = True) -> None: 226 | """Log error and optionally print to console for user-facing errors.""" 227 | 228 | # Always log 229 | if error_msg.context.severity == ErrorSeverity.CRITICAL: 230 | self.logger.critical(error_msg.get_log_message()) 231 | elif error_msg.context.severity == ErrorSeverity.ERROR: 232 | self.logger.error(error_msg.get_log_message()) 233 | elif error_msg.context.severity == ErrorSeverity.WARNING: 234 | self.logger.warning(error_msg.get_log_message()) 235 | elif error_msg.context.severity == ErrorSeverity.INFO: 236 | self.logger.info(error_msg.get_log_message()) 237 | else: # DEBUG 238 | self.logger.debug(error_msg.get_log_message()) 239 | 240 | # Print to console for user-facing errors 241 | if print_to_console and error_msg.context.user_facing: 242 | # Try to locate log file path from logger handlers 243 | log_file_path = "" 244 | try: 245 | for h in getattr(self.logger, 'handlers', []): 246 | if hasattr(h, 'baseFilename'): 247 | log_file_path = h.baseFilename 248 | break 249 | except Exception: 250 | log_file_path = "" 251 | user_message = error_msg.get_user_message() 252 | # Append short traceback snippet if available 253 | if error_msg.short_traceback: 254 | indented = "\n".join(" " + line.rstrip() for line in error_msg.short_traceback.splitlines()) 255 | user_message += f"\n🧵 Traceback (last 3 frames):\n{indented}" 256 | if log_file_path: 257 | user_message += f"\n📄 See logs for full traceback: {log_file_path}" 258 | print(user_message) 259 | 260 | 261 | # Standard error messages for common scenarios 262 | STANDARD_ERROR_MESSAGES = { 263 | "file_not_found": { 264 | "description": "File not found", 265 | "recovery": "Check file path and ensure file exists", 266 | "next_steps": [ 267 | "Verify the file path is correct", 268 | "Check file permissions", 269 | "Ensure the file exists in the specified location" 270 | ] 271 | }, 272 | "invalid_configuration": { 273 | "description": "Invalid configuration detected", 274 | "recovery": "Review configuration file and fix validation errors", 275 | "next_steps": [ 276 | "Check the configuration file format", 277 | "Verify all required fields are present", 278 | "Ensure field values are in correct format" 279 | ] 280 | }, 281 | "api_quota_exceeded": { 282 | "description": "API quota exceeded", 283 | "recovery": "Wait for quota reset or use alternative search strategies", 284 | "next_steps": [ 285 | "Wait for daily quota reset", 286 | "Reduce search scope using filters", 287 | "Use date ranges to limit results" 288 | ] 289 | }, 290 | "network_timeout": { 291 | "description": "Network request timed out", 292 | "recovery": "Check network connection and retry", 293 | "next_steps": [ 294 | "Verify internet connection", 295 | "Check firewall settings", 296 | "Retry the operation" 297 | ] 298 | }, 299 | "data_validation_failed": { 300 | "description": "Data validation failed", 301 | "recovery": "Review input data and fix validation issues", 302 | "next_steps": [ 303 | 
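# Clarifying comment (added; not in the original source): the database clients
# in clients/core.py, clients/arxiv.py and clients/springer.py look this entry
# up via get_standard_error_info("data_validation_failed") whenever an API
# response cannot be parsed or retrieved papers cannot be filtered or cleaned.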
"Check data format and content", 304 | "Verify required fields are present", 305 | "Ensure data meets validation criteria" 306 | ] 307 | }, 308 | "pipeline_step_failed": { 309 | "description": "Pipeline step execution failed", 310 | "recovery": "Review the step that failed and check for configuration or data issues", 311 | "next_steps": [ 312 | "Check the logs for detailed error information", 313 | "Verify input data for the failed step", 314 | "Review step configuration parameters", 315 | "Consider running the pipeline from the failed step" 316 | ] 317 | }, 318 | "pipeline_execution_failed": { 319 | "description": "Pipeline execution failed", 320 | "recovery": "Review the pipeline execution and check for critical errors", 321 | "next_steps": [ 322 | "Check the logs for detailed error information", 323 | "Verify all configuration parameters", 324 | "Check system resources and permissions", 325 | "Review input data quality and format" 326 | ] 327 | }, 328 | "configuration_fallback_failed": { 329 | "description": "Configuration fallback application failed", 330 | "recovery": "Review configuration parameters and apply fallbacks manually", 331 | "next_steps": [ 332 | "Check configuration file format and content", 333 | "Verify parameter types and values", 334 | "Apply missing parameters manually", 335 | "Restart the pipeline with corrected configuration" 336 | ] 337 | } 338 | } 339 | 340 | 341 | def create_error_context(module: str, 342 | function: str, 343 | operation: str, 344 | severity: ErrorSeverity, 345 | category: ErrorCategory, 346 | user_facing: bool = True) -> ErrorContext: 347 | """Create a standardized error context.""" 348 | return ErrorContext( 349 | module=module, 350 | function=function, 351 | operation=operation, 352 | severity=severity, 353 | category=category, 354 | user_facing=user_facing 355 | ) 356 | 357 | 358 | def get_standard_error_info(error_key: str) -> Dict: 359 | """Get standard error information for common error types.""" 360 | return STANDARD_ERROR_MESSAGES.get(error_key, { 361 | "description": "An error occurred", 362 | "recovery": "Review the error details and take appropriate action", 363 | "next_steps": ["Check the logs for detailed information", "Review the operation that failed"] 364 | }) 365 | 366 | 367 | # Example usage: 368 | """ 369 | # In your module: 370 | from util.error_standards import ( 371 | ErrorHandler, create_error_context, ErrorSeverity, ErrorCategory, 372 | get_standard_error_info 373 | ) 374 | 375 | # Create error handler 376 | error_handler = ErrorHandler(logger) 377 | 378 | # Handle an error 379 | try: 380 | # Your operation here 381 | pass 382 | except FileNotFoundError as e: 383 | context = create_error_context( 384 | module="my_module", 385 | function="my_function", 386 | operation="file_reading", 387 | severity=ErrorSeverity.ERROR, 388 | category=ErrorCategory.FILE 389 | ) 390 | 391 | error_info = get_standard_error_info("file_not_found") 392 | error_msg = error_handler.handle_error( 393 | error=e, 394 | context=context, 395 | error_type="FileNotFoundError", 396 | error_description=error_info["description"], 397 | recovery_suggestion=error_info["recovery"], 398 | next_steps=error_info["next_steps"] 399 | ) 400 | 401 | # Log and print to console 402 | error_handler.log_and_print(error_msg, print_to_console=True) 403 | """ 404 | -------------------------------------------------------------------------------- /clients/core.py: -------------------------------------------------------------------------------- 1 | import time 2 | import 
pandas as pd 3 | import json 4 | from .apis.generic import Generic 5 | from .base_client import DatabaseClient 6 | from os.path import exists 7 | from util import util 8 | from tqdm import tqdm 9 | import logging 10 | from util.error_standards import ( 11 | ErrorHandler, create_error_context, ErrorSeverity, ErrorCategory, 12 | get_standard_error_info 13 | ) 14 | from util.logging_standards import LogCategory 15 | 16 | 17 | class CoreClient(DatabaseClient): 18 | """ 19 | Refactored CORE client using the Template Method pattern. 20 | """ 21 | 22 | def __init__(self): 23 | super().__init__( 24 | database_name='core', 25 | max_papers=1000, 26 | waiting_time=2, 27 | max_retries=3, 28 | quota=1000 29 | ) 30 | self.api_url = 'https://api.core.ac.uk/v3/search/works' 31 | self.client_fields = {'title': 'title', 'abstract': 'abstract'} 32 | self.client = Generic() 33 | 34 | # Load API access from config 35 | if exists('./config.json'): 36 | with open("./config.json", "r") as file: 37 | config = json.load(file) 38 | if 'api_access_core' in config: 39 | self.api_access = config['api_access_core'] 40 | else: 41 | self.api_access = '' 42 | else: 43 | self.api_access = '' 44 | 45 | def _has_api_access(self) -> bool: 46 | """Check if CORE API access is available.""" 47 | return self.api_access != '' 48 | 49 | def _plan_requests(self, query, syntactic_filters, synonyms, fields, types, dates, start_date, end_date) -> pd.DataFrame: 50 | """Plan the API requests for CORE.""" 51 | # Extract query value from the query dictionary 52 | query_name = list(query.keys())[0] 53 | query_value = query[query_name] 54 | 55 | # Build query parameters 56 | c_fields = [] 57 | for field in fields: 58 | if field in self.client_fields: 59 | c_fields.append(self.client_fields[field]) 60 | 61 | parameters = { 62 | 'query': query_value, 63 | 'syntactic_filters': syntactic_filters, 64 | 'synonyms': synonyms, 65 | 'fields': c_fields, 66 | 'types': types 67 | } 68 | 69 | # Create initial request to get total count 70 | request = self._create_request(parameters, dates, start_date, end_date) 71 | headers = {'Authorization': 'Bearer ' + self.api_access} 72 | raw_papers = self._retry_request(self.client.request, self.api_url, 'post', request, headers) 73 | expected_papers = self._get_expected_papers(raw_papers) 74 | 75 | self.logger.info(LogCategory.DATABASE, "core", "_plan_requests", f"Expected papers from {self.database_name}: {expected_papers}...") 76 | 77 | # Calculate number of requests needed 78 | times = int(expected_papers / self.max_papers) - 1 79 | mod = int(expected_papers) % self.max_papers 80 | if mod > 0: 81 | times = times + 1 82 | 83 | # Check quota constraints 84 | if times >= self.quota: 85 | self.logger.info(LogCategory.DATABASE, "core", "_plan_requests", f"The number of expected papers requires {times} requests which exceeds the {self.database_name} quota of {self.quota} requests per day.") 86 | if len(syntactic_filters) > 0: 87 | self.logger.info(LogCategory.DATABASE, "core", "_plan_requests", "Trying to reduce the number of requests using syntactic filters.") 88 | que = '' 89 | for word in syntactic_filters: 90 | que = que.replace('last', ' ') 91 | que = que + "'" + word + "' last" 92 | que = que.replace(' last', '') 93 | parameters['query'] = que 94 | request = self._create_request(parameters, dates, start_date, end_date) 95 | headers = {'Authorization': 'Bearer ' + self.api_access} 96 | raw_papers = self._retry_request(self.client.request, self.api_url, 'post', request, headers) 97 | expected_papers = 
self._get_expected_papers(raw_papers) 98 | self.logger.info(LogCategory.DATABASE, "core", "_plan_requests", f"Expected papers from {self.database_name} using syntactic filters: {expected_papers}...") 99 | times = int(expected_papers / self.max_papers) - 1 100 | mod = int(expected_papers) % self.max_papers 101 | if mod > 0: 102 | times = times + 1 103 | if times >= self.quota: 104 | self.logger.info(LogCategory.DATABASE, "core", "_plan_requests", f"The number of expected papers requires {times} requests which exceeds the {self.database_name} quota of {self.quota} requests per day.") 105 | self.logger.info(LogCategory.DATABASE, "core", "_plan_requests", "Skipping to next repository. Try to redefine your search queries and syntactic filters. Using dates to limit your search can help in case you are not.") 106 | return pd.DataFrame() 107 | else: 108 | self.logger.info(LogCategory.DATABASE, "core", "_plan_requests", "Skipping to next repository. Please use syntactic filters to avoid this problem. Using dates to limit your search can help in case you are not.") 109 | return pd.DataFrame() 110 | 111 | # Execute requests 112 | parameters['expected_papers'] = expected_papers 113 | papers = self._execute_requests(query, parameters, dates, start_date, end_date) 114 | return papers 115 | 116 | def _execute_requests(self, query, parameters, dates, start_date, end_date): 117 | """Execute the planned requests to retrieve papers.""" 118 | papers = pd.DataFrame() 119 | times = int(parameters.get('expected_papers', 0) / self.max_papers) - 1 120 | mod = int(parameters.get('expected_papers', 0) % self.max_papers) 121 | if mod > 0: 122 | times = times + 1 123 | 124 | for t in tqdm(range(0, times + 1)): 125 | time.sleep(self.waiting_time) 126 | start = self.max_papers * t 127 | request = self._create_request(parameters, dates, start_date, end_date) 128 | request['from'] = start 129 | headers = {'Authorization': 'Bearer ' + self.api_access} 130 | 131 | raw_papers = self._retry_request(self.client.request, self.api_url, 'post', request, headers) 132 | 133 | if raw_papers is None: 134 | continue 135 | 136 | papers_request = self._process_raw_papers(query, raw_papers) 137 | if len(papers) == 0: 138 | papers = papers_request 139 | else: 140 | papers = pd.concat([papers, papers_request]) 141 | 142 | return papers 143 | 144 | def _create_request(self, parameters, dates, start_date, end_date): 145 | """Create the API request for CORE.""" 146 | start_year = start_date.year 147 | end_year = end_date.year 148 | query = self.client.core_query(parameters) 149 | if dates: 150 | query = '(yearPublished>=' + str(start_year) + ' AND yearPublished<=' + str(end_year) + ') AND ' + query 151 | request = { 152 | 'q': query, 153 | 'limit': self.max_papers, 154 | 'offset': 0 155 | } 156 | 157 | return request 158 | 159 | def _get_expected_papers(self, raw_papers): 160 | """Get the expected number of papers from the API response.""" 161 | total = 0 162 | if raw_papers.status_code == 200: 163 | try: 164 | json_results = json.loads(raw_papers.text) 165 | total = int(json_results['totalHits']) 166 | except (json.JSONDecodeError, KeyError) as e: 167 | # User-friendly message explaining what's happening 168 | context = create_error_context( 169 | "core", "_get_expected_papers", 170 | ErrorSeverity.WARNING, 171 | ErrorCategory.DATA, 172 | f"Data parsing error in CORE response: {type(e).__name__}: {str(e)}" 173 | ) 174 | error_info = get_standard_error_info("data_validation_failed") 175 | ErrorHandler.handle_error(e, context, error_info, 
self.logger) 176 | except (ValueError, TypeError) as e: 177 | # User-friendly message explaining what's happening 178 | context = create_error_context( 179 | "core", "_get_expected_papers", 180 | ErrorSeverity.WARNING, 181 | ErrorCategory.DATA, 182 | f"Data type error in CORE response: {type(e).__name__}: {str(e)}" 183 | ) 184 | error_info = get_standard_error_info("data_validation_failed") 185 | ErrorHandler.handle_error(e, context, error_info, self.logger) 186 | except Exception as ex: 187 | # User-friendly message explaining what's happening 188 | context = create_error_context( 189 | "core", "_get_expected_papers", 190 | ErrorSeverity.ERROR, 191 | ErrorCategory.DATA, 192 | f"Unexpected error parsing CORE response: {type(ex).__name__}: {str(ex)}" 193 | ) 194 | error_info = get_standard_error_info("unexpected_error") 195 | ErrorHandler.handle_error(ex, context, error_info, self.logger) 196 | else: 197 | self._log_api_error(raw_papers, self.api_url) 198 | return total 199 | 200 | def _process_raw_papers(self, query, raw_papers): 201 | """Process the raw API response into a DataFrame.""" 202 | query_name = list(query.keys())[0] 203 | query_value = query[query_name] 204 | papers_request = pd.DataFrame() 205 | 206 | if raw_papers.status_code == 200: 207 | try: 208 | json_results = json.loads(raw_papers.text) 209 | raw_papers = pd.json_normalize(json_results['results']) 210 | papers_request['id'] = raw_papers['id'] 211 | papers_request['title'] = raw_papers['title'] 212 | papers_request['abstract'] = raw_papers['abstract'] 213 | papers_request['url'] = raw_papers['downloadUrl'] 214 | papers_request['publication'] = raw_papers['publisher'] 215 | papers_request['publisher'] = self.database_name 216 | papers_request['publication_date'] = raw_papers['publishedDate'] 217 | papers_request['database'] = self.database_name 218 | papers_request['query_name'] = query_name 219 | papers_request['query_value'] = query_value.replace('', 'AND').replace('', 'OR') 220 | except (json.JSONDecodeError, KeyError) as e: 221 | # User-friendly message explaining what's happening 222 | context = create_error_context( 223 | "core", "_process_raw_papers", 224 | ErrorSeverity.WARNING, 225 | ErrorCategory.DATA, 226 | f"Data parsing error in CORE response: {type(e).__name__}: {str(e)}" 227 | ) 228 | error_info = get_standard_error_info("data_validation_failed") 229 | ErrorHandler.handle_error(e, context, error_info, self.logger) 230 | except Exception as ex: 231 | # User-friendly message explaining what's happening 232 | context = create_error_context( 233 | "core", "_process_raw_papers", 234 | ErrorSeverity.ERROR, 235 | ErrorCategory.DATA, 236 | f"Unexpected error parsing CORE response: {type(ex).__name__}: {str(ex)}" 237 | ) 238 | error_info = get_standard_error_info("unexpected_error") 239 | ErrorHandler.handle_error(ex, context, error_info, self.logger) 240 | else: 241 | self._log_api_error(raw_papers, self.api_url) 242 | 243 | return papers_request 244 | 245 | def _filter_papers(self, papers: pd.DataFrame, dates, start_date, end_date) -> pd.DataFrame: 246 | """Filter papers based on criteria.""" 247 | self.logger.info(LogCategory.DATA, "core", "_filter_papers", "Filtering papers...") 248 | try: 249 | # Filter by title 250 | papers.loc[:, 'title'] = papers['title'].replace('', float("NaN")) 251 | papers = papers.dropna(subset=['title']) 252 | papers.loc[:, 'title'] = papers['title'].str.lower() 253 | papers = papers.drop_duplicates('title') 254 | 255 | # Filter by abstract 256 | papers.loc[:, 'abstract'] = 
papers['abstract'].replace('', float("NaN")) 257 | papers = papers.dropna(subset=['abstract']) 258 | 259 | except (ValueError, TypeError) as e: 260 | # User-friendly message explaining what's happening 261 | context = create_error_context( 262 | "core", "_filter_papers", 263 | ErrorSeverity.WARNING, 264 | ErrorCategory.DATA, 265 | f"Data type error during CORE paper filtering: {type(e).__name__}: {str(e)}" 266 | ) 267 | error_info = get_standard_error_info("data_validation_failed") 268 | ErrorHandler.handle_error(e, context, error_info, self.logger) 269 | # Continue with unfiltered papers rather than failing completely 270 | except KeyError as e: 271 | # User-friendly message explaining what's happening 272 | context = create_error_context( 273 | "core", "_filter_papers", 274 | ErrorSeverity.WARNING, 275 | ErrorCategory.DATA, 276 | f"Missing required column during CORE paper filtering: {type(e).__name__}: {str(e)}" 277 | ) 278 | error_info = get_standard_error_info("data_validation_failed") 279 | ErrorHandler.handle_error(e, context, error_info, self.logger) 280 | # Return papers as-is to prevent complete failure 281 | except Exception as ex: 282 | # User-friendly message explaining what's happening 283 | context = create_error_context( 284 | "core", "_filter_papers", 285 | ErrorSeverity.ERROR, 286 | ErrorCategory.DATA, 287 | f"Unexpected error during CORE paper filtering: {type(ex).__name__}: {str(ex)}" 288 | ) 289 | error_info = get_standard_error_info("unexpected_error") 290 | ErrorHandler.handle_error(ex, context, error_info, self.logger) 291 | # Return papers as-is to prevent complete failure 292 | 293 | return papers 294 | 295 | def _clean_papers(self, papers: pd.DataFrame) -> pd.DataFrame: 296 | """Clean and standardize paper data.""" 297 | self.logger.info(LogCategory.DATA, "core", "_clean_papers", "Cleaning papers...") 298 | try: 299 | papers.replace('', float("NaN"), inplace=True) 300 | papers.dropna(how='all', axis=1, inplace=True) 301 | except (ValueError, TypeError) as e: 302 | # User-friendly message explaining what's happening 303 | context = create_error_context( 304 | "core", "_clean_papers", 305 | ErrorSeverity.WARNING, 306 | ErrorCategory.DATA, 307 | f"Data type error during CORE paper cleaning: {type(e).__name__}: {str(e)}" 308 | ) 309 | error_info = get_standard_error_info("data_validation_failed") 310 | ErrorHandler.handle_error(e, context, error_info, self.logger) 311 | # Continue with uncleaned papers rather than failing completely 312 | except KeyError as e: 313 | # User-friendly message explaining what's happening 314 | context = create_error_context( 315 | "core", "_clean_papers", 316 | ErrorSeverity.WARNING, 317 | ErrorCategory.DATA, 318 | f"Missing required column during CORE paper cleaning: {type(e).__name__}: {str(e)}" 319 | ) 320 | error_info = get_standard_error_info("data_validation_failed") 321 | ErrorHandler.handle_error(e, context, error_info, self.logger) 322 | # Return papers as-is to prevent complete failure 323 | except Exception as ex: 324 | # User-friendly message explaining what's happening 325 | context = create_error_context( 326 | "core", "_clean_papers", 327 | ErrorSeverity.ERROR, 328 | ErrorCategory.DATA, 329 | f"Unexpected error during CORE paper cleaning: {type(ex).__name__}: {str(ex)}" 330 | ) 331 | error_info = get_standard_error_info("unexpected_error") 332 | ErrorHandler.handle_error(ex, context, error_info, self.logger) 333 | # Return papers as-is to prevent complete failure 334 | 335 | return papers 336 | 337 | def 
_get_abstracts(self, papers: pd.DataFrame) -> pd.DataFrame: 338 | """Get abstracts for papers.""" 339 | pass 340 | -------------------------------------------------------------------------------- /util/logging_standards.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Logging Standards for SaLS Project. 4 | 5 | This module defines consistent logging patterns, message formats, 6 | and logging configuration across the entire SaLS codebase. 7 | 8 | Standards ensure: 9 | 1. Consistent log levels and their usage 10 | 2. Uniform message formatting 11 | 3. Standardized logging configuration 12 | 4. Consistent progress reporting 13 | 5. Unified debug information 14 | """ 15 | 16 | import logging 17 | import logging.handlers 18 | import os 19 | import sys 20 | from typing import Optional, Dict, Any 21 | from datetime import datetime 22 | from enum import Enum 23 | 24 | 25 | class LogLevel(Enum): 26 | """Standard log levels for consistent usage across SaLS.""" 27 | CRITICAL = logging.CRITICAL # Pipeline cannot continue 28 | ERROR = logging.ERROR # Operation failed, but pipeline can continue 29 | WARNING = logging.WARNING # Issue detected, operation continues with defaults 30 | INFO = logging.INFO # Informational message, user-facing 31 | DEBUG = logging.DEBUG # Debug information, developer-facing 32 | NOTSET = logging.NOTSET # Not set 33 | 34 | 35 | class LogCategory(Enum): 36 | """Standard log categories for consistent classification.""" 37 | PIPELINE = "PIPELINE" # Main pipeline operations 38 | CONFIGURATION = "CONFIG" # Configuration operations 39 | DATABASE = "DATABASE" # Database operations 40 | API = "API" # External API calls 41 | DATA = "DATA" # Data processing operations 42 | FILE = "FILE" # File I/O operations 43 | VALIDATION = "VALIDATION" # Data validation operations 44 | USER = "USER" # User interaction operations 45 | SYSTEM = "SYSTEM" # System-level operations 46 | 47 | 48 | class LogFormatter: 49 | """Standard log formatter for consistent message formatting.""" 50 | 51 | # Standard format for different log levels 52 | STANDARD_FORMATS = { 53 | LogLevel.CRITICAL: "[CRITICAL] {asctime} | {category} | {module}.{function} | {message}", 54 | LogLevel.ERROR: "[ERROR] {asctime} | {category} | {module}.{function} | {message}", 55 | LogLevel.WARNING: "[WARNING] {asctime} | {category} | {module}.{function} | {message}", 56 | LogLevel.INFO: "[INFO] {asctime} | {category} | {module}.{function} | {message}", 57 | LogLevel.DEBUG: "[DEBUG] {asctime} | {category} | {module}.{function} | {message}" 58 | } 59 | 60 | @staticmethod 61 | def format_message(level: LogLevel, 62 | category: LogCategory, 63 | module: str, 64 | function: str, 65 | message: str, 66 | extra_info: Optional[Dict[str, Any]] = None) -> str: 67 | """Format a log message according to SaLS standards.""" 68 | 69 | # Get base format 70 | base_format = LogFormatter.STANDARD_FORMATS.get(level, LogFormatter.STANDARD_FORMATS[LogLevel.INFO]) 71 | 72 | # Format extra info if provided 73 | extra_str = "" 74 | if extra_info: 75 | extra_parts = [] 76 | for key, value in extra_info.items(): 77 | if isinstance(value, (dict, list)): 78 | extra_parts.append(f"{key}: {str(value)[:100]}...") 79 | else: 80 | extra_parts.append(f"{key}: {value}") 81 | extra_str = " | " + " | ".join(extra_parts) 82 | 83 | # Format the message 84 | formatted = base_format.format( 85 | asctime=datetime.now().strftime("%Y-%m-%d %H:%M:%S"), 86 | category=category.value, 87 | module=module, 88 | 
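# Note (added; not in the original source): str.format() ignores keyword
# arguments that a template does not reference, so the extra_info value
# passed below is only rendered when a STANDARD_FORMATS template actually
# contains an "{extra_info}" placeholder.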
function=function, 89 | message=message, 90 | extra_info=extra_str 91 | ) 92 | 93 | return formatted 94 | 95 | 96 | class SaLSLogger: 97 | """Standard SaLS logger with consistent configuration and methods.""" 98 | 99 | def __init__(self, 100 | name: str, 101 | log_file: Optional[str] = None, 102 | console_level: LogLevel = LogLevel.INFO, 103 | file_level: LogLevel = LogLevel.DEBUG, 104 | max_file_size: int = 10 * 1024 * 1024, # 10MB 105 | backup_count: int = 5): 106 | self.name = name 107 | self.log_file = log_file 108 | self.console_level = console_level 109 | self.file_level = file_level 110 | self.max_file_size = max_file_size 111 | self.backup_count = backup_count 112 | 113 | # Create logger 114 | self.logger = logging.getLogger(name) 115 | self.logger.setLevel(logging.DEBUG) # Set to lowest level, handlers will filter 116 | 117 | # Clear existing handlers 118 | self.logger.handlers.clear() 119 | 120 | # Setup handlers 121 | self._setup_console_handler() 122 | if log_file: 123 | self._setup_file_handler() 124 | 125 | def _setup_console_handler(self) -> None: 126 | """Setup console handler with user-friendly formatting.""" 127 | console_handler = logging.StreamHandler(sys.stdout) 128 | console_handler.setLevel(self.console_level.value) 129 | 130 | # Create formatter for console - use a simpler format since we handle custom fields in our log method 131 | console_formatter = logging.Formatter("[%(levelname)s] %(message)s") 132 | console_handler.setFormatter(console_formatter) 133 | 134 | self.logger.addHandler(console_handler) 135 | 136 | def _setup_file_handler(self) -> None: 137 | """Setup file handler with detailed formatting.""" 138 | # Ensure log directory exists 139 | log_dir = os.path.dirname(self.log_file) 140 | if log_dir and not os.path.exists(log_dir): 141 | os.makedirs(log_dir, exist_ok=True) 142 | 143 | # Create rotating file handler 144 | file_handler = logging.handlers.RotatingFileHandler( 145 | self.log_file, 146 | maxBytes=self.max_file_size, 147 | backupCount=self.backup_count, 148 | encoding='utf-8' 149 | ) 150 | file_handler.setLevel(self.file_level.value) 151 | 152 | # Create formatter for file - use a simpler format since we handle custom fields in our log method 153 | file_formatter = logging.Formatter("[%(levelname)s] %(asctime)s | %(message)s") 154 | file_handler.setFormatter(file_formatter) 155 | 156 | self.logger.addHandler(file_handler) 157 | 158 | def log(self, 159 | level: LogLevel, 160 | category: LogCategory, 161 | module: str, 162 | function: str, 163 | message: str, 164 | extra_info: Optional[Dict[str, Any]] = None, 165 | print_to_console: bool = False) -> None: 166 | """Log a message with consistent formatting.""" 167 | 168 | # Format the message with our custom formatter for console output 169 | formatted_message = LogFormatter.format_message( 170 | level=level, 171 | category=category, 172 | module=module, 173 | function=function, 174 | message=message, 175 | extra_info=extra_info 176 | ) 177 | 178 | # Log according to level - pass the original message to Python's logger 179 | # Python's logger will format it with its own formatter 180 | if level == LogLevel.CRITICAL: 181 | self.logger.critical(message) 182 | elif level == LogLevel.ERROR: 183 | self.logger.error(message) 184 | elif level == LogLevel.WARNING: 185 | self.logger.warning(message) 186 | elif level == LogLevel.INFO: 187 | self.logger.info(message) 188 | else: # DEBUG 189 | self.logger.debug(message) 190 | 191 | # Optionally print to console for user-facing messages 192 | if print_to_console 
and level in [LogLevel.CRITICAL, LogLevel.ERROR, LogLevel.WARNING, LogLevel.INFO]: 193 | print(formatted_message) 194 | 195 | def critical(self, category: LogCategory, module: str, function: str, message: str, 196 | extra_info: Optional[Dict[str, Any]] = None, print_to_console: bool = True) -> None: 197 | """Log a critical message.""" 198 | self.log(LogLevel.CRITICAL, category, module, function, message, extra_info, print_to_console) 199 | 200 | def error(self, category: LogCategory, module: str, function: str, message: str, 201 | extra_info: Optional[Dict[str, Any]] = None, print_to_console: bool = True) -> None: 202 | """Log an error message.""" 203 | self.log(LogLevel.ERROR, category, module, function, message, extra_info, print_to_console) 204 | 205 | def warning(self, category: LogCategory, module: str, function: str, message: str, 206 | extra_info: Optional[Dict[str, Any]] = None, print_to_console: bool = False) -> None: 207 | """Log a warning message.""" 208 | self.log(LogLevel.WARNING, category, module, function, message, extra_info, print_to_console) 209 | 210 | def info(self, category: LogCategory, module: str, function: str, message: str, 211 | extra_info: Optional[Dict[str, Any]] = None, print_to_console: bool = False) -> None: 212 | """Log an info message.""" 213 | self.log(LogLevel.INFO, category, module, function, message, extra_info, print_to_console) 214 | 215 | def debug(self, category: LogCategory, module: str, function: str, message: str, 216 | extra_info: Optional[Dict[str, Any]] = None, print_to_console: bool = False) -> None: 217 | """Log a debug message.""" 218 | self.log(LogLevel.DEBUG, category, module, function, message, extra_info, print_to_console) 219 | 220 | def progress(self, current: int, total: int, operation: str, 221 | extra_info: Optional[Dict[str, Any]] = None) -> None: 222 | """Log progress information with consistent formatting.""" 223 | percentage = (current / total) * 100 if total > 0 else 0 224 | progress_message = f"Progress: {current}/{total} ({percentage:.1f}%) - {operation}" 225 | 226 | self.info( 227 | category=LogCategory.PIPELINE, 228 | module=self.name, 229 | function="progress", 230 | message=progress_message, 231 | extra_info=extra_info, 232 | print_to_console=False 233 | ) 234 | 235 | def operation_start(self, operation: str, extra_info: Optional[Dict[str, Any]] = None) -> None: 236 | """Log the start of an operation.""" 237 | self.info( 238 | category=LogCategory.PIPELINE, 239 | module=self.name, 240 | function="operation_start", 241 | message=f"Starting: {operation}", 242 | extra_info=extra_info, 243 | print_to_console=False 244 | ) 245 | 246 | def operation_complete(self, operation: str, result: str, 247 | extra_info: Optional[Dict[str, Any]] = None) -> None: 248 | """Log the completion of an operation.""" 249 | self.info( 250 | category=LogCategory.PIPELINE, 251 | module=self.name, 252 | function="operation_complete", 253 | message=f"Completed: {operation} - Result: {result}", 254 | extra_info=extra_info, 255 | print_to_console=False 256 | ) 257 | 258 | def operation_failed(self, operation: str, error: str, 259 | extra_info: Optional[Dict[str, Any]] = None) -> None: 260 | """Log the failure of an operation.""" 261 | self.error( 262 | category=LogCategory.PIPELINE, 263 | module=self.name, 264 | function="operation_failed", 265 | message=f"Failed: {operation} - Error: {error}", 266 | extra_info=extra_info, 267 | print_to_console=True 268 | ) 269 | 270 | 271 | def setup_sals_logger(name: str, 272 | log_file: Optional[str] = None, 273 | 
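# Note (added; not in the original source): the two level parameters below keep
# console output at INFO for user-facing progress, while the rotating log file
# captures DEBUG-level detail by default.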
console_level: LogLevel = LogLevel.INFO, 274 | file_level: LogLevel = LogLevel.DEBUG) -> SaLSLogger: 275 | """Setup a standardized SaLS logger.""" 276 | new_logger = SaLSLogger( 277 | name=name, 278 | log_file=log_file, 279 | console_level=console_level, 280 | file_level=file_level 281 | ) 282 | 283 | # Attach the same handlers to legacy 'logger' so existing modules print to console and files 284 | try: 285 | legacy_logger = logging.getLogger('logger') 286 | legacy_logger.setLevel(logging.DEBUG) 287 | legacy_logger.handlers.clear() 288 | for handler in new_logger.logger.handlers: 289 | legacy_logger.addHandler(handler) 290 | legacy_logger.propagate = False 291 | except Exception: 292 | pass 293 | 294 | # Keep a reference to the current SaLS logger 295 | global _CURRENT_SALS_LOGGER 296 | _CURRENT_SALS_LOGGER = new_logger 297 | return new_logger 298 | 299 | 300 | # Standard logging configuration 301 | def get_standard_logging_config() -> Dict[str, Any]: 302 | """Get standard logging configuration for SaLS.""" 303 | return { 304 | "console_level": LogLevel.INFO, 305 | "file_level": LogLevel.DEBUG, 306 | "max_file_size": 10 * 1024 * 1024, # 10MB 307 | "backup_count": 5, 308 | "log_format": "detailed", 309 | "encoding": "utf-8" 310 | } 311 | 312 | 313 | # Keep reference to the most recently configured SaLS logger 314 | _CURRENT_SALS_LOGGER: Optional[SaLSLogger] = None 315 | 316 | def get_current_sals_logger() -> Optional[SaLSLogger]: 317 | """Return the last configured SaLSLogger, if any.""" 318 | return _CURRENT_SALS_LOGGER 319 | 320 | # Compatibility logger that accepts both SaLSLogger-style and std logging-style calls 321 | class CompatLogger: 322 | def __init__(self, sals_logger: Optional[SaLSLogger], std_logger: logging.Logger): 323 | self._sals = sals_logger 324 | self._std = std_logger 325 | try: 326 | self._std.setLevel(logging.INFO) 327 | except Exception: 328 | pass 329 | 330 | def _route(self, level: LogLevel, *args, **kwargs): 331 | # SaLS-style: (category, module, function, message, extra_info=None, print_to_console=False) 332 | if len(args) >= 4 and isinstance(args[0], LogCategory): 333 | category, module, function, message = args[:4] 334 | extra_info = args[4] if len(args) >= 5 else kwargs.get('extra_info') 335 | print_to_console = args[5] if len(args) >= 6 else kwargs.get('print_to_console', False) 336 | if self._sals: 337 | self._sals.log(level, category, module, function, message, extra_info, print_to_console) 338 | else: 339 | rendered = LogFormatter.format_message(level, category, module, function, message, extra_info) 340 | if level == LogLevel.CRITICAL: 341 | self._std.critical(rendered) 342 | elif level == LogLevel.ERROR: 343 | self._std.error(rendered) 344 | elif level == LogLevel.WARNING: 345 | self._std.warning(rendered) 346 | elif level == LogLevel.INFO: 347 | self._std.info(rendered) 348 | else: 349 | self._std.debug(rendered) 350 | if print_to_console and level in [LogLevel.CRITICAL, LogLevel.ERROR, LogLevel.WARNING, LogLevel.INFO]: 351 | try: 352 | print(rendered) 353 | except Exception: 354 | pass 355 | return 356 | 357 | # Std-style: (msg, *fmt_args) 358 | if len(args) >= 1: 359 | msg = args[0] 360 | fmt_args = args[1:] if len(args) > 1 else () 361 | try: 362 | if level == LogLevel.CRITICAL: 363 | self._std.critical(msg, *fmt_args) 364 | elif level == LogLevel.ERROR: 365 | self._std.error(msg, *fmt_args) 366 | elif level == LogLevel.WARNING: 367 | self._std.warning(msg, *fmt_args) 368 | elif level == LogLevel.INFO: 369 | self._std.info(msg, *fmt_args) 370 | else: 
371 | self._std.debug(msg, *fmt_args) 372 | except Exception: 373 | safe_msg = str(msg) 374 | if level == LogLevel.CRITICAL: 375 | self._std.critical(safe_msg) 376 | elif level == LogLevel.ERROR: 377 | self._std.error(safe_msg) 378 | elif level == LogLevel.WARNING: 379 | self._std.warning(safe_msg) 380 | elif level == LogLevel.INFO: 381 | self._std.info(safe_msg) 382 | else: 383 | self._std.debug(safe_msg) 384 | 385 | def info(self, *args, **kwargs): 386 | self._route(LogLevel.INFO, *args, **kwargs) 387 | 388 | def warning(self, *args, **kwargs): 389 | self._route(LogLevel.WARNING, *args, **kwargs) 390 | 391 | def error(self, *args, **kwargs): 392 | self._route(LogLevel.ERROR, *args, **kwargs) 393 | 394 | def debug(self, *args, **kwargs): 395 | self._route(LogLevel.DEBUG, *args, **kwargs) 396 | 397 | def critical(self, *args, **kwargs): 398 | self._route(LogLevel.CRITICAL, *args, **kwargs) 399 | 400 | 401 | def get_compat_logger() -> CompatLogger: 402 | """Return a logger that accepts both SaLSLogger-style and std logging-style calls.""" 403 | return CompatLogger(_CURRENT_SALS_LOGGER, logging.getLogger('sals_pipeline')) 404 | 405 | # Example usage: 406 | """ 407 | # In your module: 408 | from util.logging_standards import ( 409 | setup_sals_logger, LogCategory, LogLevel 410 | ) 411 | 412 | # Setup logger 413 | logger = setup_sals_logger( 414 | name="my_module", 415 | log_file="logs/my_module.log" 416 | ) 417 | 418 | # Log operations 419 | logger.operation_start("data_processing") 420 | logger.info(LogCategory.DATA, "my_module", "process_data", "Processing 100 records") 421 | logger.progress(50, 100, "data_processing") 422 | logger.operation_complete("data_processing", "100 records processed") 423 | """ 424 | -------------------------------------------------------------------------------- /clients/arxiv.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import json 3 | from .base_client import DatabaseClient 4 | from .apis.generic import Generic 5 | from os.path import exists 6 | import logging 7 | import time 8 | from tqdm import tqdm 9 | from util.error_standards import ( 10 | ErrorHandler, create_error_context, ErrorSeverity, ErrorCategory, 11 | get_standard_error_info 12 | ) 13 | from util.logging_standards import LogCategory, get_compat_logger, get_current_sals_logger 14 | 15 | 16 | class ArxivClient(DatabaseClient): 17 | """ 18 | Refactored arXiv client using the Template Method pattern. 
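A minimal usage sketch (added for illustration; the public entry point lives in
the DatabaseClient base class, which is not shown here, so this calls the
planning step directly and every argument value is an assumption):

    client = ArxivClient()
    papers = client._plan_requests(
        query={'augmented reality': "'augmented reality' & 'edge'"},
        syntactic_filters=[], synonyms={},
        fields=['title', 'abstract'], types=['preprints'],
        dates=False, start_date=None, end_date=None)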
19 | """ 20 | 21 | def __init__(self): 22 | super().__init__( 23 | database_name='arxiv', 24 | max_papers=5000, 25 | waiting_time=2, 26 | max_retries=3, 27 | client_fields={'title': 'ti', 'abstract': 'abs'} 28 | ) 29 | self.api_url = 'http://export.arxiv.org/api/query?search_query=' 30 | self.client = Generic() 31 | # Normalize logger to a compat logger to accept both styles 32 | try: 33 | sals = get_current_sals_logger() 34 | self.logger = get_compat_logger() if sals is None else get_compat_logger() 35 | except Exception: 36 | pass 37 | 38 | def _has_api_access(self) -> bool: 39 | """ArXiv is open access, so no API key is needed.""" 40 | return True 41 | 42 | def _plan_requests(self, query, syntactic_filters, synonyms, fields, types, dates, start_date, end_date) -> pd.DataFrame: 43 | """Plan the API requests for arXiv.""" 44 | # Extract query value from the query dictionary (same as original arxiv.py) 45 | query_name = list(query.keys())[0] 46 | query_value = query[query_name] 47 | 48 | # Build query parameters 49 | c_fields = [] 50 | for field in fields: 51 | if field in self.client_fields: 52 | c_fields.append(self.client_fields[field]) 53 | 54 | parameters = { 55 | 'query': query_value, # Use query_value string, not the entire query dict 56 | 'syntactic_filters': syntactic_filters, 57 | 'synonyms': synonyms, 58 | 'fields': c_fields, 59 | 'types': types 60 | } 61 | 62 | # Create initial request to get total count 63 | request = self._create_request(parameters) 64 | raw_papers = self._retry_request(self.client.request, request, 'get', {}, {}) 65 | expected_papers = self._get_expected_papers(raw_papers) 66 | 67 | self.logger.info(f"Expected papers from arxiv: {expected_papers}...") 68 | 69 | # Calculate number of requests needed 70 | times = int(expected_papers / self.max_papers) - 1 71 | mod = int(expected_papers) % self.max_papers 72 | if mod > 0: 73 | times = times + 1 74 | 75 | # Execute requests 76 | papers = self._execute_requests(query, parameters, times, expected_papers, mod) 77 | return papers 78 | 79 | def _execute_requests(self, query, parameters, times, expected_papers, mod): 80 | """Execute the planned requests to retrieve papers.""" 81 | papers = pd.DataFrame() 82 | 83 | for t in tqdm(range(0, times + 1)): 84 | time.sleep(self.waiting_time) 85 | start = t * self.max_papers 86 | 87 | request = self._create_request(parameters, start) 88 | raw_papers = self._retry_request(self.client.request, request, 'get', {}, {}) 89 | 90 | if raw_papers is None: 91 | continue 92 | 93 | papers_request = self._process_raw_papers(query, raw_papers) 94 | 95 | # Sometimes arXiv API doesn't respond with all papers, so retry 96 | expected_per_request = expected_papers 97 | if expected_papers > self.max_papers: 98 | expected_per_request = self.max_papers 99 | if t == times and mod > 0: 100 | expected_per_request = mod 101 | 102 | while len(papers_request) < expected_per_request: 103 | time.sleep(self.waiting_time) 104 | raw_papers = self._retry_request(self.client.request, request, 'get', {}, {}) 105 | papers_request = self._process_raw_papers(query, raw_papers) 106 | 107 | if len(papers) == 0: 108 | papers = papers_request 109 | else: 110 | papers = pd.concat([papers, papers_request]) 111 | 112 | return papers 113 | 114 | def _create_request(self, parameters, start=0): 115 | """Create the API request URL for arXiv.""" 116 | req = self.api_url 117 | req = req + self.client.default_query(parameters) 118 | req = req + '&start=' + str(start) 119 | req = req + '&max_results=' + str(self.max_papers) 120 
| req = req + '&sortBy=submittedDate&sortOrder=descending' 121 | return req 122 | 123 | def _get_expected_papers(self, raw_papers): 124 | """Get the expected number of papers from the API response.""" 125 | total = 0 126 | if raw_papers.status_code == 200: 127 | try: 128 | total_text = raw_papers.text.split('opensearch:totalResults')[1] 129 | total = int(total_text.split('>')[1].replace('', 'AND').replace('', 'OR') 185 | except (ValueError, TypeError) as e: 186 | context = create_error_context( 187 | module="arxiv", 188 | function="_process_raw_papers", 189 | operation="api_response_parsing", 190 | severity=ErrorSeverity.WARNING, 191 | category=ErrorCategory.API 192 | ) 193 | 194 | error_info = get_standard_error_info("data_validation_failed") 195 | error_handler = ErrorHandler(self.logger) 196 | error_msg = error_handler.handle_error( 197 | error=e, 198 | context=context, 199 | error_type="APIResponseParsingError", 200 | error_description=f"Error parsing the API response: {type(e).__name__}: {str(e)}", 201 | recovery_suggestion=error_info["recovery"], 202 | next_steps=error_info["next_steps"] 203 | ) 204 | except Exception as ex: 205 | context = create_error_context( 206 | module="arxiv", 207 | function="_process_raw_papers", 208 | operation="api_response_parsing", 209 | severity=ErrorSeverity.WARNING, 210 | category=ErrorCategory.API 211 | ) 212 | 213 | error_info = get_standard_error_info("data_validation_failed") 214 | error_handler = ErrorHandler(self.logger) 215 | error_msg = error_handler.handle_error( 216 | error=ex, 217 | context=context, 218 | error_type="APIResponseParsingError", 219 | error_description=f"Unexpected error parsing the API response: {type(ex).__name__}: {str(ex)}", 220 | recovery_suggestion=error_info["recovery"], 221 | next_steps=error_info["next_steps"] 222 | ) 223 | else: 224 | self._log_api_error(raw_papers, raw_papers.request.url if raw_papers.request else "") 225 | 226 | return papers_request 227 | 228 | def _filter_papers(self, papers: pd.DataFrame, dates, start_date, end_date) -> pd.DataFrame: 229 | """Filter papers based on criteria.""" 230 | self.logger.info("Filtering papers...") 231 | try: 232 | # Filter by title 233 | papers.loc[:, 'title'] = papers['title'].replace('', float("NaN")) 234 | papers.dropna(subset=['title'], inplace=True) 235 | papers.loc[:, 'title'] = papers['title'].str.lower() 236 | papers = papers.drop_duplicates('title') 237 | 238 | # Filter by abstract 239 | papers.loc[:, 'summary'] = papers['summary'].replace('', float("NaN")) 240 | papers.dropna(subset=['summary'], inplace=True) 241 | 242 | # Filter by dates if specified 243 | if dates is True: 244 | papers['published'] = pd.to_datetime(papers['published']).dt.date 245 | papers = papers[(papers['published'] >= start_date) & (papers['published'] <= end_date)] 246 | 247 | except (ValueError, TypeError) as e: 248 | context = create_error_context( 249 | module="arxiv", 250 | function="_filter_papers", 251 | operation="paper_filtering", 252 | severity=ErrorSeverity.WARNING, 253 | category=ErrorCategory.DATA 254 | ) 255 | 256 | error_info = get_standard_error_info("data_validation_failed") 257 | error_handler = ErrorHandler(self.logger) 258 | error_msg = error_handler.handle_error( 259 | error=e, 260 | context=context, 261 | error_type="PaperFilteringError", 262 | error_description=f"Error filtering papers: {type(e).__name__}: {str(e)}", 263 | recovery_suggestion=error_info["recovery"], 264 | next_steps=error_info["next_steps"] 265 | ) 266 | # Continue with unfiltered papers rather than 
failing completely 267 | except KeyError as e: 268 | context = create_error_context( 269 | module="arxiv", 270 | function="_filter_papers", 271 | operation="paper_filtering", 272 | severity=ErrorSeverity.WARNING, 273 | category=ErrorCategory.DATA 274 | ) 275 | 276 | error_info = get_standard_error_info("data_validation_failed") 277 | error_handler = ErrorHandler(self.logger) 278 | error_msg = error_handler.handle_error( 279 | error=e, 280 | context=context, 281 | error_type="PaperFilteringError", 282 | error_description=f"Missing required column during paper filtering: {type(e).__name__}: {str(e)}", 283 | recovery_suggestion=error_info["recovery"], 284 | next_steps=error_info["next_steps"] 285 | ) 286 | # Return papers as-is to prevent complete failure 287 | except Exception as ex: 288 | context = create_error_context( 289 | module="arxiv", 290 | function="_filter_papers", 291 | operation="paper_filtering", 292 | severity=ErrorSeverity.WARNING, 293 | category=ErrorCategory.DATA 294 | ) 295 | 296 | error_info = get_standard_error_info("data_validation_failed") 297 | error_handler = ErrorHandler(self.logger) 298 | error_msg = error_handler.handle_error( 299 | error=ex, 300 | context=context, 301 | error_type="PaperFilteringError", 302 | error_description=f"Unexpected error during paper filtering: {type(ex).__name__}: {str(ex)}", 303 | recovery_suggestion=error_info["recovery"], 304 | next_steps=error_info["next_steps"] 305 | ) 306 | # Return papers as-is to prevent complete failure 307 | 308 | return papers 309 | 310 | def _clean_papers(self, papers: pd.DataFrame) -> pd.DataFrame: 311 | """Clean and standardize paper data.""" 312 | self.logger.info("Cleaning papers...") 313 | try: 314 | # Remove unnecessary columns 315 | papers = papers.drop(columns=[ 316 | 'author', 'comment', 'link', 'primary_category', 'category', 317 | 'doi', 'journal_ref' 318 | ], errors='ignore') 319 | 320 | # Clean empty values 321 | papers.replace('', float("NaN"), inplace=True) 322 | papers.dropna(how='all', axis=1, inplace=True) 323 | 324 | except (ValueError, TypeError) as e: 325 | context = create_error_context( 326 | module="arxiv", 327 | function="_clean_papers", 328 | operation="paper_cleaning", 329 | severity=ErrorSeverity.WARNING, 330 | category=ErrorCategory.DATA 331 | ) 332 | 333 | error_info = get_standard_error_info("data_validation_failed") 334 | error_handler = ErrorHandler(self.logger) 335 | error_msg = error_handler.handle_error( 336 | error=e, 337 | context=context, 338 | error_type="PaperCleaningError", 339 | error_description=f"Error cleaning papers: {type(e).__name__}: {str(e)}", 340 | recovery_suggestion=error_info["recovery"], 341 | next_steps=error_info["next_steps"] 342 | ) 343 | # Continue with uncleaned papers rather than failing completely 344 | except KeyError as e: 345 | context = create_error_context( 346 | module="arxiv", 347 | function="_clean_papers", 348 | operation="paper_cleaning", 349 | severity=ErrorSeverity.WARNING, 350 | category=ErrorCategory.DATA 351 | ) 352 | 353 | error_info = get_standard_error_info("data_validation_failed") 354 | error_handler = ErrorHandler(self.logger) 355 | error_msg = error_handler.handle_error( 356 | error=e, 357 | context=context, 358 | error_type="PaperCleaningError", 359 | error_description=f"Missing required column during paper cleaning: {type(e).__name__}: {str(e)}", 360 | recovery_suggestion=error_info["recovery"], 361 | next_steps=error_info["next_steps"] 362 | ) 363 | # Return papers as-is to prevent complete failure 364 | except Exception 
as ex: 365 | context = create_error_context( 366 | module="arxiv", 367 | function="_clean_papers", 368 | operation="paper_cleaning", 369 | severity=ErrorSeverity.WARNING, 370 | category=ErrorCategory.DATA 371 | ) 372 | 373 | error_info = get_standard_error_info("data_validation_failed") 374 | error_handler = ErrorHandler(self.logger) 375 | error_msg = error_handler.handle_error( 376 | error=ex, 377 | context=context, 378 | error_type="PaperCleaningError", 379 | error_description=f"Unexpected error during paper cleaning: {type(ex).__name__}: {str(ex)}", 380 | recovery_suggestion=error_info["recovery"], 381 | next_steps=error_info["next_steps"] 382 | ) 383 | # Return papers as-is to prevent complete failure 384 | 385 | return papers 386 | 387 | def _get_abstracts(self, papers: pd.DataFrame) -> pd.DataFrame: 388 | """Get abstracts for papers.""" 389 | pass -------------------------------------------------------------------------------- /clients/springer.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import json 3 | from .base_client import DatabaseClient 4 | from .apis.generic import Generic 5 | from os.path import exists 6 | import logging 7 | import time 8 | from tqdm import tqdm 9 | from util.error_standards import ( 10 | ErrorHandler, create_error_context, ErrorSeverity, ErrorCategory, 11 | get_standard_error_info 12 | ) 13 | from util.logging_standards import LogCategory 14 | 15 | 16 | class SpringerClient(DatabaseClient): 17 | """Springer client implementation using the DatabaseClient base class.""" 18 | 19 | def __init__(self): 20 | super().__init__( 21 | database_name='springer', 22 | max_papers=25, 23 | waiting_time=2, 24 | max_retries=3, 25 | client_fields={'title': 'title'}, 26 | quota=500 27 | ) 28 | 29 | # Load API access from config 30 | if exists('./config.json'): 31 | with open("./config.json", "r") as file: 32 | config = json.load(file) 33 | if 'api_access_springer' in config: 34 | self.api_access = config['api_access_springer'] 35 | else: 36 | self.api_access = '' 37 | else: 38 | self.api_access = '' 39 | 40 | # Define API URL after API access is loaded 41 | self.api_url = 'http://api.springernature.com/metadata/json?q=type:Journal' 42 | self.start = 0 43 | self.client = Generic() 44 | 45 | def _has_api_access(self) -> bool: 46 | """Check if Springer API access is available.""" 47 | return self.api_access != '' 48 | 49 | def _plan_requests(self, query, syntactic_filters, synonyms, fields, types, dates, start_date, end_date) -> pd.DataFrame: 50 | """Plan the API requests for Springer.""" 51 | # Extract query value from the query dictionary 52 | query_name = list(query.keys())[0] 53 | query_value = query[query_name] 54 | 55 | # Build query parameters 56 | c_fields = [] 57 | for field in fields: 58 | if field in self.client_fields: 59 | c_fields.append(self.client_fields[field]) 60 | 61 | parameters = { 62 | 'query': query_value, 63 | 'syntactic_filters': syntactic_filters, 64 | 'synonyms': synonyms, 65 | 'fields': c_fields, 66 | 'types': types 67 | } 68 | 69 | # Create initial request to get total count 70 | request = self._create_request(parameters, dates, start_date, end_date, False) 71 | raw_papers = self._retry_request(self.client.request, request, 'get', {}, {}) 72 | expected_papers = self._get_expected_papers(raw_papers) 73 | 74 | self.logger.info(LogCategory.DATABASE, "springer", "_plan_requests", f"Expected papers from springer: {expected_papers}...") 75 | 76 | # Calculate number of requests needed 77 | 
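# Worked example (added; not in the original source): with Springer's
# max_papers = 25, an expected_papers value of 60 gives
# int(60 / 25) - 1 = 1 and 60 % 25 = 10, so times becomes 2 and
# _execute_requests below issues three requests (t = 0, 1, 2) at offsets 0, 25 and 50.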
times = int(expected_papers / self.max_papers) - 1 78 | mod = int(expected_papers) % self.max_papers 79 | if mod > 0: 80 | times = times + 1 81 | 82 | # Check quota constraints 83 | if times >= self.quota: 84 | self.logger.info(LogCategory.DATABASE, "springer", "_plan_requests", f"The number of expected papers requires {times + 1} requests, which exceeds the {self.database_name} quota of {self.quota} requests per day.") 85 | if len(syntactic_filters) > 0: 86 | self.logger.info(LogCategory.DATABASE, "springer", "_plan_requests", "Trying to reduce the number of requests using syntactic filters.") 87 | request = self._create_request(parameters, dates, start_date, end_date, True) 88 | raw_papers = self._retry_request(self.client.request, request, 'get', {}, {}) 89 | expected_papers = self._get_expected_papers(raw_papers) 90 | self.logger.info(LogCategory.DATABASE, "springer", "_plan_requests", f"Expected papers from {self.database_name} using syntactic filters: {expected_papers}...") 91 | times = int(expected_papers / self.max_papers) - 1 92 | mod = int(expected_papers) % self.max_papers 93 | if mod > 0: 94 | times = times + 1 95 | if times >= self.quota: 96 | self.logger.info(LogCategory.DATABASE, "springer", "_plan_requests", f"The number of expected papers requires {times + 1} requests, which exceeds the {self.database_name} quota of {self.quota} requests per day.") 97 | self.logger.info(LogCategory.DATABASE, "springer", "_plan_requests", "Skipping to next repository. Try to redefine your search queries and syntactic filters. Using dates to limit your search can also help if you are not already doing so.") 98 | return pd.DataFrame() 99 | else: 100 | self.logger.info(LogCategory.DATABASE, "springer", "_plan_requests", "Skipping to next repository. Please use syntactic filters to avoid this problem. 
Using dates to limit your search can help in case you are not.") 101 | return pd.DataFrame() 102 | 103 | # Execute requests 104 | papers = self._execute_requests(query, parameters, times, dates, start_date, end_date, False) 105 | return papers 106 | 107 | def _execute_requests(self, query, parameters, times, dates, start_date, end_date, syntactic_filter): 108 | """Execute the planned requests to retrieve papers.""" 109 | papers = pd.DataFrame() 110 | 111 | for t in tqdm(range(0, times + 1)): 112 | time.sleep(self.waiting_time) 113 | self.start = t * self.max_papers 114 | 115 | request = self._create_request(parameters, dates, start_date, end_date, syntactic_filter) 116 | raw_papers = self._retry_request(self.client.request, request, 'get', {}, {}) 117 | 118 | if raw_papers is None: 119 | continue 120 | 121 | papers_request = self._process_raw_papers(query, raw_papers) 122 | 123 | if len(papers) == 0: 124 | papers = papers_request 125 | else: 126 | papers = pd.concat([papers, papers_request]) 127 | 128 | return papers 129 | 130 | def _create_request(self, parameters, dates, start_date, end_date, syntactic_filter): 131 | """Create the API request URL for Springer.""" 132 | req = self.api_url 133 | if dates is True: 134 | req = req.replace('', '%20onlinedatefrom:' + str(start_date) +'%20onlinedateto:' + str(end_date) + '%20') 135 | else: 136 | req = req.replace('', '') 137 | 138 | if not syntactic_filter: 139 | req = req + self.client.default_query(parameters) 140 | req = req.replace('%28', '(').replace('%29', ')').replace('+', '%20') 141 | req = req.replace('title:', '') 142 | else: 143 | query = parameters['query'] 144 | syntactic_filters = parameters['syntactic_filters'] 145 | for word in syntactic_filters: 146 | query = query.replace('last', ' ') 147 | query = query + "'" + word + "' last" 148 | query = query.replace(' last', '') 149 | parameters['query'] = query 150 | req = req + self.client.default_query(parameters) 151 | req = req.replace('%28', '(').replace('%29', ')').replace('+', '%20') 152 | req = req.replace('title:', '') 153 | 154 | req = req + '&s='+str(self.start)+'&p='+str(self.max_papers)+'&api_key=' + self.api_access 155 | return req 156 | 157 | def _get_expected_papers(self, raw_papers): 158 | """Get the expected number of papers from the API response.""" 159 | total = 0 160 | if raw_papers.status_code == 200: 161 | try: 162 | json_results = json.loads(raw_papers.text) 163 | total = int(json_results['result'][0]['total']) 164 | except (json.JSONDecodeError, KeyError, IndexError) as e: 165 | # User-friendly message explaining what's happening 166 | context = create_error_context( 167 | "springer", "_get_expected_papers", 168 | ErrorSeverity.WARNING, 169 | ErrorCategory.DATA, 170 | f"Data parsing error in Springer response: {type(e).__name__}: {str(e)}" 171 | ) 172 | error_info = get_standard_error_info("data_validation_failed") 173 | ErrorHandler.handle_error(e, context, error_info, self.logger) 174 | except (ValueError, TypeError) as e: 175 | # User-friendly message explaining what's happening 176 | context = create_error_context( 177 | "springer", "_get_expected_papers", 178 | ErrorSeverity.WARNING, 179 | ErrorCategory.DATA, 180 | f"Data type error in Springer response: {type(e).__name__}: {str(e)}" 181 | ) 182 | error_info = get_standard_error_info("data_validation_failed") 183 | ErrorHandler.handle_error(e, context, error_info, self.logger) 184 | except Exception as ex: 185 | # User-friendly message explaining what's happening 186 | context = create_error_context( 187 | 
"springer", "_get_expected_papers", 188 | ErrorSeverity.ERROR, 189 | ErrorCategory.DATA, 190 | f"Unexpected error parsing Springer response: {type(ex).__name__}: {str(ex)}" 191 | ) 192 | error_info = get_standard_error_info("unexpected_error") 193 | ErrorHandler.handle_error(ex, context, error_info, self.logger) 194 | else: 195 | self._log_api_error(raw_papers, raw_papers.request.url if raw_papers.request else "") 196 | return total 197 | 198 | def _process_raw_papers(self, query, raw_papers): 199 | """Process the raw API response into a DataFrame.""" 200 | query_name = list(query.keys())[0] 201 | query_value = query[query_name] 202 | papers_request = pd.DataFrame() 203 | 204 | if raw_papers.status_code == 200: 205 | try: 206 | json_results = json.loads(raw_papers.text) 207 | papers_request = pd.json_normalize(json_results['records']) 208 | papers_request.loc[:, 'database'] = self.database_name 209 | papers_request.loc[:, 'query_name'] = query_name 210 | papers_request.loc[:, 'query_value'] = query_value.replace('', 'AND').replace('', 'OR') 211 | except (json.JSONDecodeError, KeyError) as e: 212 | # User-friendly message explaining what's happening 213 | context = create_error_context( 214 | "springer", "_process_raw_papers", 215 | ErrorSeverity.WARNING, 216 | ErrorCategory.DATA, 217 | f"Data parsing error in Springer response: {type(e).__name__}: {str(e)}" 218 | ) 219 | error_info = get_standard_error_info("data_validation_failed") 220 | ErrorHandler.handle_error(e, context, error_info, self.logger) 221 | except Exception as ex: 222 | # User-friendly message explaining what's happening 223 | context = create_error_context( 224 | "springer", "_process_raw_papers", 225 | ErrorSeverity.ERROR, 226 | ErrorCategory.DATA, 227 | f"Unexpected error parsing Springer response: {type(ex).__name__}: {str(ex)}" 228 | ) 229 | error_info = get_standard_error_info("unexpected_error") 230 | ErrorHandler.handle_error(ex, context, error_info, self.logger) 231 | else: 232 | self._log_api_error(raw_papers, raw_papers.request.url if raw_papers.request else "") 233 | 234 | return papers_request 235 | 236 | def _filter_papers(self, papers: pd.DataFrame, dates, start_date, end_date) -> pd.DataFrame: 237 | """Filter papers based on criteria.""" 238 | self.logger.info(LogCategory.DATA, "springer", "_filter_papers", "Filtering papers...") 239 | try: 240 | # Filter by title 241 | papers.loc[:, 'title'] = papers['title'].replace('', float("NaN")) 242 | papers = papers.dropna(subset=['title']) 243 | papers.loc[:, 'title'] = papers['title'].str.lower() 244 | papers = papers.drop_duplicates('title') 245 | 246 | # Filter by abstract 247 | papers.loc[:, 'abstract'] = papers['abstract'].replace('', float("NaN")) 248 | papers = papers.dropna(subset=['abstract']) 249 | papers = papers.drop_duplicates(subset=['doi']) 250 | 251 | # Filter by language 252 | if 'language' in papers: 253 | papers = papers[papers['language'].str.contains('en')] 254 | 255 | except (ValueError, TypeError) as e: 256 | # User-friendly message explaining what's happening 257 | context = create_error_context( 258 | "springer", "_filter_papers", 259 | ErrorSeverity.WARNING, 260 | ErrorCategory.DATA, 261 | f"Data type error during Springer paper filtering: {type(e).__name__}: {str(e)}" 262 | ) 263 | error_info = get_standard_error_info("data_validation_failed") 264 | ErrorHandler.handle_error(e, context, error_info, self.logger) 265 | # Continue with unfiltered papers rather than failing completely 266 | except KeyError as e: 267 | # User-friendly message 
explaining what's happening 268 | context = create_error_context( 269 | "springer", "_filter_papers", 270 | ErrorSeverity.WARNING, 271 | ErrorCategory.DATA, 272 | f"Missing required column during Springer paper filtering: {type(e).__name__}: {str(e)}" 273 | ) 274 | error_info = get_standard_error_info("data_validation_failed") 275 | ErrorHandler.handle_error(e, context, error_info, self.logger) 276 | # Return papers as-is to prevent complete failure 277 | except Exception as ex: 278 | # User-friendly message explaining what's happening 279 | context = create_error_context( 280 | "springer", "_filter_papers", 281 | ErrorSeverity.ERROR, 282 | ErrorCategory.DATA, 283 | f"Unexpected error during Springer paper filtering: {type(ex).__name__}: {str(ex)}" 284 | ) 285 | error_info = get_standard_error_info("unexpected_error") 286 | ErrorHandler.handle_error(ex, context, error_info, self.logger) 287 | # Return papers as-is to prevent complete failure 288 | 289 | return papers 290 | 291 | def _clean_papers(self, papers: pd.DataFrame) -> pd.DataFrame: 292 | """Clean and standardize paper data.""" 293 | self.logger.info(LogCategory.DATA, "springer", "_clean_papers", "Cleaning papers...") 294 | try: 295 | # Extract URLs 296 | urls = [] 297 | if 'url' in papers: 298 | for paper in papers['url']: 299 | url = paper[0]['value'] 300 | urls.append(url) 301 | 302 | # Remove unnecessary columns 303 | papers = papers.drop(columns=['url', 'creators', 'bookEditors', 'openaccess', 'printIsbn', 'electronicIsbn', 304 | 'isbn', 'genre', 'copyright', 'conferenceInfo', 'issn', 'eIssn', 'volume', 305 | 'publicationType', 'number', 'issueType', 'topicalCollection', 'startingPage', 306 | 'endingPage', 'language', 'journalId', 'printDate', 'response', 'onlineDate', 307 | 'coverDate', 'keyword'], 308 | errors='ignore') 309 | 310 | # Add cleaned URLs 311 | if len(urls) > 0: 312 | papers.loc[:, 'url'] = urls 313 | else: 314 | papers['url'] = '' 315 | 316 | # Clean empty values 317 | papers.replace('', float("NaN"), inplace=True) 318 | papers.dropna(how='all', axis=1, inplace=True) 319 | 320 | except (ValueError, TypeError) as e: 321 | # User-friendly message explaining what's happening 322 | context = create_error_context( 323 | "springer", "_clean_papers", 324 | ErrorSeverity.WARNING, 325 | ErrorCategory.DATA, 326 | f"Data type error during Springer paper cleaning: {type(e).__name__}: {str(e)}" 327 | ) 328 | error_info = get_standard_error_info("data_validation_failed") 329 | ErrorHandler.handle_error(e, context, error_info, self.logger) 330 | # Continue with uncleaned papers rather than failing completely 331 | except KeyError as e: 332 | # User-friendly message explaining what's happening 333 | context = create_error_context( 334 | "springer", "_clean_papers", 335 | ErrorSeverity.WARNING, 336 | ErrorCategory.DATA, 337 | f"Missing required column during Springer paper cleaning: {type(e).__name__}: {str(e)}" 338 | ) 339 | error_info = get_standard_error_info("data_validation_failed") 340 | ErrorHandler.handle_error(e, context, error_info, self.logger) 341 | # Return papers as-is to prevent complete failure 342 | except (IndexError, AttributeError) as e: 343 | # User-friendly message explaining what's happening 344 | context = create_error_context( 345 | "springer", "_clean_papers", 346 | ErrorSeverity.WARNING, 347 | ErrorCategory.DATA, 348 | f"URL extraction error during Springer paper cleaning: {type(e).__name__}: {str(e)}" 349 | ) 350 | error_info = get_standard_error_info("data_validation_failed") 351 | 
ErrorHandler.handle_error(e, context, error_info, self.logger) 352 | # Continue with empty URLs rather than failing completely 353 | except Exception as ex: 354 | # User-friendly message explaining what's happening 355 | context = create_error_context( 356 | "springer", "_clean_papers", 357 | ErrorSeverity.ERROR, 358 | ErrorCategory.DATA, 359 | f"Unexpected error during Springer paper cleaning: {type(ex).__name__}: {str(ex)}" 360 | ) 361 | error_info = get_standard_error_info("unexpected_error") 362 | ErrorHandler.handle_error(ex, context, error_info, self.logger) 363 | # Return papers as-is to prevent complete failure 364 | 365 | return papers 366 | 367 | def _get_abstracts(self, papers: pd.DataFrame) -> pd.DataFrame: 368 | """Get abstracts for papers.""" 369 | return papers  # abstracts already arrive with the Springer metadata records (see _filter_papers), so no extra retrieval is needed --------------------------------------------------------------------------------
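For reference, the sketch below is a minimal, hypothetical example (not a file in the repository) showing the config.json shape that SpringerClient.__init__ looks for and a direct call to its request-planning step. The query, fields, types, and date arguments are illustrative placeholders only, and calling _plan_requests directly bypasses the public workflow that the DatabaseClient base class (base_client.py, not shown above) normally drives from the parameters_*.yaml files.

    # Hypothetical usage sketch -- assumes it is run from the project root
    # so that SpringerClient.__init__ can find ./config.json.
    #
    # A matching config.json would look like:
    #     {
    #         "api_access_springer": "YOUR_SPRINGER_API_KEY"
    #     }
    from clients.springer import SpringerClient

    client = SpringerClient()
    print(client._has_api_access())  # False if ./config.json or the key is missing

    # Illustrative arguments only: in the normal workflow the DatabaseClient
    # base class supplies these values from the parameters_*.yaml files, so
    # this direct call just exercises the planning and quota logic in isolation.
    papers = client._plan_requests(
        query={"augmented reality": "'augmented reality' & 'edge'"},
        syntactic_filters=[],
        synonyms={},
        fields=["title"],
        types=["journal"],
        dates=False,
        start_date=None,
        end_date=None,
    )
    print(f"{len(papers)} papers retrieved")

If the planned number of requests exceeds the client's daily quota (500 for Springer), _plan_requests logs a warning and returns an empty DataFrame, so an empty result here does not necessarily mean the query matched nothing.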