├── tests
│   ├── __init__.py
│   └── test_util.py
├── requirements.txt
├── LICENSE
├── pytest.ini
├── parameters_ar.yaml
├── util
│   ├── parser.py
│   ├── error_standards.py
│   └── logging_standards.py
├── .gitignore
├── parameters_sys.yaml
├── clients
│   ├── client_factory.py
│   ├── base_client.py
│   ├── apis
│   │   ├── generic.py
│   │   └── xploreapi.py
│   ├── core.py
│   ├── arxiv.py
│   └── springer.py
├── templates
│   ├── basic_search_template.yaml
│   ├── advanced_research_template.yaml
│   └── machine_learning_template.yaml
├── docs
│   ├── quick_start_guide.md
│   └── configuration_guide.md
└── README.md
/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # Tests package for SaLS project 2 | # This package contains all test modules for the project 3 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | PyYAML==6.0 2 | pandas==1.4.3 3 | pydantic==1.7.4 4 | spacy-langdetect==0.1.2 5 | spacy==3.6 6 | scipy==1.11.2 7 | gensim==4.2.0 8 | nltk==3.7 9 | lxml==4.9.1 10 | rich==12.6.0 11 | numpy~=1.23.1 12 | requests~=2.31.0 13 | beautifulsoup4==4.12.2 14 | sentence-transformers==2.5.1 15 | tqdm>=4.66.3 16 | pyparsing==3.1.2 17 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Christian Cabrera 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE.
22 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [tool:pytest] 2 | # Pytest configuration for SaLS project 3 | # This file configures pytest behavior and test discovery 4 | 5 | # Test discovery patterns 6 | testpaths = tests 7 | python_files = test_*.py 8 | python_classes = Test* 9 | python_functions = test_* 10 | 11 | # Output and reporting 12 | addopts = 13 | -v # Verbose output 14 | --tb=short # Short traceback format 15 | --strict-markers # Strict marker validation 16 | --disable-warnings # Disable warning display during tests 17 | --color=yes # Colored output 18 | 19 | # Markers for test categorization 20 | markers = 21 | unit: Unit tests for individual functions 22 | integration: Integration tests for workflows 23 | slow: Tests that take longer to run 24 | api: Tests that require external API access 25 | config: Configuration and validation tests 26 | error_handling: Error handling and recovery tests 27 | 28 | # Minimum version requirements 29 | minversion = 6.0 30 | 31 | # Test timeout (seconds) 32 | timeout = 300 33 | 34 | # Coverage configuration (if pytest-cov is installed) 35 | # addopts = --cov=util --cov=analysis --cov=clients --cov-report=html --cov-report=term-missing 36 | -------------------------------------------------------------------------------- /parameters_ar.yaml: -------------------------------------------------------------------------------- 1 | # List of queries in the format : "". & (and operator) ¦ (or operator). 2 | queries: 3 | - augmented reality: "'augmented reality' & 'edge' & 'orchestration' & 'placement'" 4 | #Synonyms of the keywords to expand the queries. 5 | augmented reality: 6 | - ar 7 | - virtual reality 8 | - vr 9 | - game 10 | orchestration: 11 | - service orchestration 12 | - service composition 13 | - composition 14 | - service choreography 15 | - choreography 16 | placement: 17 | - service placement 18 | - service offloading 19 | - offloading 20 | - resource allocation 21 | edge: 22 | - edge computing 23 | - fog computing 24 | - fog 25 | - iot 26 | # Databases where to search for papers. arXiv and Semantic Scholar by default as they are open. You can use other 27 | # repositories by uncommenting the respective lines. You should add the API access keys to the ./config.json file 28 | # in order to use them. (See step 7 in the How to run it? instructions) 29 | databases: 30 | - arxiv 31 | - semantic_scholar 32 | - springer 33 | - ieeexplore 34 | - scopus 35 | - core 36 | # Search time interval YYYY-mm-dd. If you do not want to include search dates comment start_date and 37 | # end_date parameters. 
38 | #start_date: 2010-01-01 39 | #end_date: 2022-08-01 40 | # Date of the search and folder name where the outputs will be stored 41 | search_date: 2025-09-01 42 | folder_name: ar_search 43 | -------------------------------------------------------------------------------- /util/parser.py: -------------------------------------------------------------------------------- 1 | from pyparsing import Word, alphanums, quotedString, oneOf, infixNotation, opAssoc, ParseException 2 | import logging 3 | 4 | logger = logging.getLogger('logger') 5 | # Define grammar elements 6 | # Allow common identifier characters without requiring quotes (e.g., hyphens, underscores, slashes, colons, plus, dots) 7 | identifier = Word(alphanums + "-_./:+*") 8 | string_literal = quotedString.setParseAction(lambda t: t[0][1:-1]) 9 | and_operator = oneOf("& && AND and")  # AND operator forms, following the README query syntax 10 | or_operator = oneOf("| || ¦ OR or")  # OR operator forms, following the README query syntax 11 | 12 | # Define expression grammar 13 | expression = infixNotation( 14 | identifier | string_literal, 15 | [ 16 | (and_operator, 2, opAssoc.LEFT), 17 | (or_operator, 2, opAssoc.LEFT), 18 | ] 19 | ) 20 | 21 | 22 | # Parse boolean expression function 23 | def parse_boolean_expression(expression_str): 24 | # Quick parentheses balance check ignoring text inside quotes 25 | def _balanced_parentheses(s: str) -> bool: 26 | depth = 0 27 | in_quote = False 28 | q = '' 29 | for ch in s: 30 | if in_quote: 31 | if ch == q: 32 | in_quote = False 33 | continue 34 | if ch in ('"', '\''): 35 | in_quote = True 36 | q = ch 37 | elif ch == '(': 38 | depth += 1 39 | elif ch == ')': 40 | depth -= 1 41 | if depth < 0: 42 | return False 43 | return depth == 0 and not in_quote 44 | 45 | if not _balanced_parentheses(expression_str): 46 | logger.info('Error parsing expression: unbalanced parentheses or unterminated quotes') 47 | return [], False 48 | 49 | try: 50 | parsed = expression.parseString(expression_str, parseAll=True) 51 | return parsed[0], True 52 | except ParseException as e: 53 | # Build a caret marker to indicate where parsing failed 54 | try: 55 | line, col = e.line, e.column 56 | caret = ' ' * (col - 1) + '^' 57 | logger.info('Error parsing expression at column ' + str(col) + ':') 58 | logger.info(line) 59 | logger.info(caret) 60 | logger.info(str(e)) 61 | except Exception: 62 | logger.info('Error parsing expression: ' + str(e)) 63 | return [], False 64 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | *.pyc 132 | 133 | # Pycharm 134 | .idea/ 135 | dump.json 136 | logs/ 137 | data/ 138 | .vscode/ 139 | 140 | # API Keys File 141 | config.json 142 | 143 | # Papers 144 | papers/ 145 | 146 | # Local parameter files 147 | local/ 148 | -------------------------------------------------------------------------------- /parameters_sys.yaml: -------------------------------------------------------------------------------- 1 | # List of queries in the format : "". & (and operator) ¦ (or operator). 2 | queries: 3 | - systems engineering: "'systems engineering' & ('generative ai' ¦ 'artificial intelligence')" 4 | # Syntactic filters are used to restrict the number of papers that massive repositories (e.g., springer, scopus) return 5 | # The exact terms and their synonyms must appear in the paper title or abstract to be retrieved. 6 | # If the number of returned papers is too big and syntactic filters are not provided, 7 | # the platform will skip the repository to avoid quota errors or overloading the APIs. 8 | # We advise including syntactic filters to make the retrieval process more feasible. 9 | syntactic_filters: 10 | - 'systems engineering' 11 | - 'generative ai' 12 | # Semantic filters uses LLMs to match abstracts with the provided description. 13 | # The type parameter corresponds to the embedding the semantic search uses. Bert is the open option it currently uses. 14 | # The description parameter is the text that describes the papers you are looking for. A way to create such description is to think of the ideal abstract a selected paper should have. 15 | # The score parameter is the similarity degree between the queries and the included papers. 16 | # Papers with greater or equal score are included. 
17 | semantic_filters: 18 | - type: "bert" 19 | - description: "This paper proposes a systems engineering approach to analyse, design, implement, evaluate, and deploy systems based on artificial intelligence. AI-based systems are complex, dependable, data-driven, and critical systems containing one or more components based on AI or machine learning. The systems engineering approach helps to address the challenges that AI-based components generate in the systems (e.g., lack of explainability, security issues, unreliable behaviour, lack of alignment, etc.). This help relies on systems engineering practices and principles that define models, methodologies, techniques, architectural patterns, to facilitate the integration of AI into systems." 20 | - score: 0.8 21 | #Synonyms of the keywords to expand the queries. 22 | systems engineering: 23 | - systems thinking 24 | - dependable systems 25 | - engineering ai 26 | generative ai: 27 | - llm 28 | - large language model 29 | artificial intelligence: 30 | - ai 31 | - machine learning 32 | - ml 33 | - deep learning 34 | # Databases where to search for papers. arXiv and Semantic Scholar by default as they are open. You can use other 35 | # repositories by uncommenting the respective lines. You should add the API access keys to the ./config.json file 36 | # in order to use them. (See step 7 in the How to run it? instructions) 37 | databases: 38 | - arxiv 39 | - semantic_scholar 40 | - springer 41 | - ieeexplore 42 | - scopus 43 | - core 44 | # Search time interval YYYY-mm-dd. If you do not want to include search dates comment start_date and 45 | # end_date parameters. 46 | start_date: 2017-01-01 47 | end_date: 2025-08-18 48 | # Date of the search and folder name where the outputs will be stored 49 | search_date: 2025-08-18 50 | folder_name: sys_search 51 | -------------------------------------------------------------------------------- /clients/client_factory.py: -------------------------------------------------------------------------------- 1 | from .base_client import DatabaseClient 2 | from .arxiv import ArxivClient 3 | from .springer import SpringerClient 4 | from .ieeexplore import IeeeXploreClient 5 | from .core import CoreClient 6 | from .elsevier import ElsevierClient 7 | from .semantic_scholar import SemanticScholarClient 8 | from util.error_standards import ErrorHandler, create_error_context, ErrorSeverity, ErrorCategory, get_standard_error_info 9 | from util.logging_standards import LogCategory 10 | 11 | class DatabaseClientFactory: 12 | """ 13 | Factory class for creating database clients. 14 | 15 | This provides a clean interface for creating the right client 16 | based on the database name, and makes it easy to add new databases. 17 | """ 18 | 19 | def __init__(self): 20 | self._clients = { 21 | 'arxiv': ArxivClient, 22 | 'springer': SpringerClient, 23 | 'ieeexplore': IeeeXploreClient, 24 | 'core': CoreClient, 25 | 'elsevier': ElsevierClient, 26 | 'semantic_scholar': SemanticScholarClient, 27 | } 28 | 29 | def create_client(self, database_name: str) -> DatabaseClient: 30 | """ 31 | Create a client for the specified database. 
32 | 33 | Args: 34 | database_name: Name of the database (e.g., 'arxiv', 'springer') 35 | 36 | Returns: 37 | DatabaseClient instance or None if database not supported 38 | """ 39 | if database_name not in self._clients: 40 | return None 41 | 42 | try: 43 | return self._clients[database_name]() 44 | except (ValueError, TypeError) as e: 45 | # Log the error but don't crash the system 46 | import logging 47 | logger = logging.getLogger('logger') 48 | context = create_error_context( 49 | "client_factory", "create_client", 50 | ErrorSeverity.WARNING, 51 | ErrorCategory.DATA, 52 | f"Failed to create client for {database_name} due to data type error: {type(e).__name__}: {str(e)}" 53 | ) 54 | error_info = get_standard_error_info("data_validation_failed") 55 | ErrorHandler.handle_error(e, context, error_info, logger) 56 | return None 57 | except Exception as ex: 58 | # Log the error but don't crash the system 59 | import logging 60 | logger = logging.getLogger('logger') 61 | context = create_error_context( 62 | "client_factory", "create_client", 63 | ErrorSeverity.ERROR, 64 | ErrorCategory.SYSTEM, 65 | f"Failed to create client for {database_name} due to unexpected error: {type(ex).__name__}: {str(ex)}" 66 | ) 67 | error_info = get_standard_error_info("unexpected_error") 68 | ErrorHandler.handle_error(ex, context, error_info, logger) 69 | return None 70 | 71 | def get_supported_databases(self) -> list: 72 | """Get list of supported database names.""" 73 | return list(self._clients.keys()) 74 | 75 | def is_supported(self, database_name: str) -> bool: 76 | """Check if a database is supported.""" 77 | return database_name in self._clients 78 | 79 | def register_client(self, database_name: str, client_class): 80 | """ 81 | Register a new client class for a database. 82 | 83 | This allows for dynamic registration of new clients 84 | without modifying the factory code. 
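        Example (a minimal sketch of dynamic registration; ``CrossrefClient`` is a
        hypothetical DatabaseClient subclass used only for illustration, not part
        of the codebase):

            factory = DatabaseClientFactory()
            factory.register_client('crossref', CrossrefClient)
            client = factory.create_client('crossref')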
85 | """ 86 | if not issubclass(client_class, DatabaseClient): 87 | raise ValueError(f"Client class must inherit from DatabaseClient") 88 | 89 | self._clients[database_name] = client_class 90 | -------------------------------------------------------------------------------- /templates/basic_search_template.yaml: -------------------------------------------------------------------------------- 1 | # ============================================================================= 2 | # SaLS Basic Search Configuration Template 3 | # ============================================================================= 4 | # This template provides a minimal configuration for basic literature searches 5 | # Copy this file and modify the values according to your research needs 6 | # ============================================================================= 7 | 8 | # REQUIRED: Search queries in the format : "" 9 | # Supported operators: & (AND), | (OR), and parentheses for grouping 10 | # Use quotes around multi-word terms: 'machine learning' & 'edge computing' 11 | queries: 12 | - machine learning: "'machine learning' & 'edge computing'" 13 | - artificial intelligence: "'artificial intelligence' | 'AI'" 14 | 15 | # OPTIONAL: Synonyms to expand your search queries 16 | # These will be automatically added to your searches to increase coverage 17 | machine learning: 18 | - ml 19 | - deep learning 20 | - neural networks 21 | - supervised learning 22 | - unsupervised learning 23 | 24 | artificial intelligence: 25 | - ai 26 | - machine intelligence 27 | - cognitive computing 28 | 29 | # REQUIRED: Databases to search for papers 30 | # Open databases (no API key required): arxiv, semantic_scholar 31 | # Commercial databases (API key required): springer, ieeexplore, scopus, core 32 | # See README.md for API key setup instructions 33 | databases: 34 | - arxiv # Open access, no API key needed 35 | - semantic_scholar # Open access, no API key needed 36 | # - springer # Uncomment and add API key to config.json 37 | # - ieeexplore # Uncomment and add API key to config.json 38 | # - scopus # Uncomment and add API key to config.json 39 | # - core # Uncomment and add API key to config.json 40 | 41 | # OPTIONAL: Date range for your search (YYYY-MM-DD format) 42 | # Comment out both lines to search all available papers 43 | # Including dates can significantly reduce search time and improve relevance 44 | # start_date: 2020-01-01 # Papers published from this date 45 | # end_date: 2024-12-31 # Papers published until this date 46 | 47 | # REQUIRED: Search metadata 48 | search_date: 2024-12-15 # Date when this search was performed 49 | folder_name: basic_search # Output folder name (will be created automatically) 50 | 51 | # ============================================================================= 52 | # ADVANCED FEATURES (Optional - uncomment and configure as needed) 53 | # ============================================================================= 54 | 55 | # OPTIONAL: Syntactic filters for more precise control 56 | # These terms will be required in the paper content 57 | # syntactic_filters: 58 | # - edge computing 59 | # - distributed systems 60 | 61 | # OPTIONAL: Semantic filters using AI-powered similarity matching 62 | # Format: filter_name: "detailed description of what you're looking for" 63 | # semantic_filters: 64 | # - edge computing: "Papers about edge computing, fog computing, and distributed edge systems" 65 | # - machine learning: "Research on machine learning algorithms and applications in edge 
environments" 66 | 67 | # ============================================================================= 68 | # BEST PRACTICES 69 | # ============================================================================= 70 | # 1. Start with simple queries and refine based on results 71 | # 2. Use specific terms rather than broad concepts 72 | # 3. Include synonyms to catch related research 73 | # 4. Set reasonable date ranges to focus on recent work 74 | # 5. Test with open databases first before adding commercial ones 75 | # 6. Use semantic filters for complex research areas 76 | # ============================================================================= 77 | -------------------------------------------------------------------------------- /docs/quick_start_guide.md: -------------------------------------------------------------------------------- 1 | # SaLS Quick Start Guide 2 | 3 | ## Get Started in 5 Minutes 4 | 5 | This guide will help you run your first literature search with SaLS in just a few minutes. 6 | 7 | ## Prerequisites 8 | 9 | - Python 3.8 or higher 10 | - Internet connection 11 | - Basic understanding of YAML files 12 | 13 | ## Step 1: Setup (2 minutes) 14 | 15 | ### 1.1 Clone and Install 16 | ```bash 17 | git clone https://github.com/cabrerac/semi-automatic-literature-survey.git 18 | cd semi-automatic-literature-survey 19 | python -m venv venv 20 | source venv/bin/activate # On Windows: venv\Scripts\activate 21 | pip install -r requirements.txt 22 | python -m spacy download en_core_web_sm 23 | ``` 24 | 25 | ### 1.2 Test Installation 26 | ```bash 27 | python -c "from util import util; print('✅ SaLS installed successfully!')" 28 | ``` 29 | 30 | ## Step 2: Create Your First Configuration (2 minutes) 31 | 32 | ### 2.1 Copy a Template 33 | ```bash 34 | cp templates/basic_search_template.yaml my_first_search.yaml 35 | ``` 36 | 37 | ### 2.2 Edit the Configuration 38 | Open `my_first_search.yaml` and modify these lines: 39 | 40 | ```yaml 41 | queries: 42 | - your_topic: "'your research topic' & 'key concept'" 43 | 44 | search_date: 2024-12-15 # Today's date 45 | folder_name: my_first_search # Your project name 46 | ``` 47 | 48 | **Example for machine learning research**: 49 | ```yaml 50 | queries: 51 | - deep learning: "'deep learning' & 'computer vision'" 52 | 53 | search_date: 2024-12-15 54 | folder_name: deep_learning_cv 55 | ``` 56 | 57 | ## Step 3: Run Your First Search (1 minute) 58 | 59 | ```bash 60 | python main.py my_first_search.yaml 61 | ``` 62 | 63 | ## What Happens Next? 64 | 65 | 1. **Paper Retrieval**: SaLS searches selected databases 66 | 2. **Preprocessing**: Papers are cleaned and deduplicated 67 | 3. **Semantic Filtering**: AI-powered relevance scoring (if configured) 68 | 4. **Manual Review**: You review abstracts and full papers 69 | 5. **Results**: Final paper list saved to `./papers/` folder 70 | 71 | ## Expected Output 72 | 73 | ``` 74 | 0. Retrieving papers from the databases... 75 | ✅ Retrieved 150 papers from arxiv 76 | ✅ Retrieved 89 papers from semantic_scholar 77 | 78 | 1. Preprocessing papers... 79 | ✅ Preprocessing results can be found at: 1_preprocessed_papers.csv 80 | 81 | 2. Manual filtering by abstract... 
82 | [Interactive review process starts] 83 | ``` 84 | 85 | ## Common First-Time Issues 86 | 87 | ### Issue: "Configuration validation failed" 88 | **Solution**: SaLS will show exactly what's wrong and how to fix it 89 | - **Critical errors** (🔴) must be fixed before continuing 90 | - **Warnings** (🟡) allow the pipeline to continue with defaults 91 | - Follow the provided examples to fix issues quickly 92 | 93 | ### Issue: "No papers found" 94 | **Solution**: Try broader queries or different databases 95 | 96 | ### Issue: "API key required" 97 | **Solution**: Use only `arxiv` and `semantic_scholar` (no API key needed) 98 | 99 | ### Issue: Missing optional fields 100 | **Solution**: SaLS automatically provides sensible defaults: 101 | - Missing databases → defaults to open databases 102 | - Missing search_date → defaults to current date 103 | - Missing folder_name → defaults to filename-based 104 | 105 | ## Next Steps 106 | 107 | 1. **Review Results**: Check the generated CSV files 108 | 2. **Refine Queries**: Adjust based on initial results 109 | 3. **Add Filters**: Use syntactic and semantic filters 110 | 4. **Expand Databases**: Add commercial databases with API keys 111 | 112 | ## Need Help? 113 | 114 | - **Configuration Guide**: `docs/configuration_guide.md` 115 | - **Templates**: `templates/` directory 116 | - **Examples**: `parameters_ar.yaml` (working example) 117 | 118 | ## Quick Configuration Examples 119 | 120 | ### Simple Search 121 | ```yaml 122 | queries: 123 | - ai: "'artificial intelligence'" 124 | databases: 125 | - arxiv 126 | - semantic_scholar 127 | search_date: 2024-12-15 128 | folder_name: ai_search 129 | ``` 130 | 131 | ### Focused Search 132 | ```yaml 133 | queries: 134 | - ml_edge: "'machine learning' & 'edge computing'" 135 | databases: 136 | - arxiv 137 | - semantic_scholar 138 | start_date: 2020-01-01 139 | end_date: 2024-12-31 140 | search_date: 2024-12-15 141 | folder_name: ml_edge_search 142 | ``` 143 | 144 | ### Advanced Search 145 | ```yaml 146 | queries: 147 | - systems: "'systems engineering' & ('AI' | 'machine learning')" 148 | databases: 149 | - arxiv 150 | - semantic_scholar 151 | - springer 152 | syntactic_filters: 153 | - systems 154 | - engineering 155 | semantic_filters: 156 | - ai_systems: "Research on AI and machine learning in systems engineering contexts" 157 | search_date: 2024-12-15 158 | folder_name: ai_systems_search 159 | ``` 160 | 161 | ## Success Checklist 162 | 163 | - [ ] SaLS runs without errors 164 | - [ ] Papers are retrieved from databases 165 | - [ ] Results are saved to CSV files 166 | - [ ] You can review and filter papers 167 | - [ ] Final paper list is generated 168 | 169 | ## Congratulations! 170 | 171 | You've successfully completed your first literature search with SaLS. The system is now ready for your research needs. 172 | 173 | **Tip**: Start with simple searches and gradually add complexity as you become familiar with the system. 
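## Adding Commercial Databases Later

When you are ready to enable commercial databases (e.g., `springer`), SaLS reads API keys from a `./config.json` file in the project root (see the README for the full list of key names). A minimal sketch with only the Springer key, following the README's config.json format:

```json
{
  "api_access_springer": "SPRINGER_API_ACCESS_KEY"
}
```

After adding the key, list the corresponding database under `databases:` in your parameters file.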
174 | -------------------------------------------------------------------------------- /templates/advanced_research_template.yaml: -------------------------------------------------------------------------------- 1 | # ============================================================================= 2 | # SaLS Advanced Research Configuration Template 3 | # ============================================================================= 4 | # This template demonstrates all SaLS features for comprehensive literature reviews 5 | # Use this for systematic literature reviews, PhD research, or complex research projects 6 | # ============================================================================= 7 | 8 | # REQUIRED: Primary research queries 9 | # Use complex boolean expressions with parentheses for precise control 10 | # Supported operators: & (AND), | (OR), and parentheses for grouping 11 | queries: 12 | - systems engineering: "'systems engineering' & ('large language models' | 'LLM' | 'generative AI')" 13 | - edge orchestration: "'edge computing' & ('orchestration' | 'composition' | 'choreography') & 'placement'" 14 | - machine learning systems: "'machine learning systems' & ('production' | 'deployment' | 'operations')" 15 | 16 | # REQUIRED: Comprehensive synonyms for query expansion 17 | # These synonyms will be automatically added to increase search coverage 18 | # Group related terms under meaningful categories 19 | systems engineering: 20 | - systems thinking 21 | - system architecture 22 | - system design 23 | - system integration 24 | - system lifecycle 25 | - requirements engineering 26 | - verification and validation 27 | 28 | large language models: 29 | - llm 30 | - transformer models 31 | - language models 32 | - neural language models 33 | - foundation models 34 | - generative models 35 | 36 | edge computing: 37 | - edge 38 | - fog computing 39 | - mobile edge computing 40 | - multi-access edge computing 41 | - edge intelligence 42 | - edge AI 43 | 44 | orchestration: 45 | - service orchestration 46 | - service composition 47 | - service choreography 48 | - resource orchestration 49 | - workflow orchestration 50 | - microservice orchestration 51 | 52 | placement: 53 | - service placement 54 | - resource placement 55 | - workload placement 56 | - task placement 57 | - service offloading 58 | - resource allocation 59 | 60 | machine learning systems: 61 | - ml systems 62 | - production ml 63 | - mlops 64 | - machine learning operations 65 | - ml infrastructure 66 | - ml deployment 67 | 68 | # REQUIRED: Database selection for comprehensive coverage 69 | # Mix open and commercial databases for best results 70 | # Open databases: arxiv, semantic_scholar (no API key needed) 71 | # Commercial databases: springer, ieeexplore, scopus, core (API key required) 72 | databases: 73 | - arxiv # Open access, excellent for recent research 74 | - semantic_scholar # Open access, good citation analysis 75 | - springer # Commercial, high-quality journals 76 | - ieeexplore # Commercial, excellent for engineering 77 | - scopus # Commercial, comprehensive coverage 78 | - core # Commercial, open access repository 79 | # Additional databases available: crossref, europe_pmc, pubmed, openalex 80 | 81 | # OPTIONAL: Date range for focused research (YYYY-MM-DD format) 82 | # Use date ranges to focus on recent developments or specific time periods 83 | # Comment out both lines to search all available papers 84 | start_date: 2018-01-01 # Focus on recent research (last 6 years) 85 | end_date: 2024-12-31 # Include 
current year 86 | 87 | # REQUIRED: Search metadata 88 | search_date: 2024-12-15 # Date when this search was performed 89 | folder_name: advanced_research # Output folder name (will be created automatically) 90 | 91 | # OPTIONAL: Syntactic filters for precise control 92 | # These terms must appear in the paper content 93 | # Use for filtering out irrelevant papers early in the process 94 | syntactic_filters: 95 | - systems 96 | - engineering 97 | - architecture 98 | - design 99 | - implementation 100 | - evaluation 101 | - analysis 102 | 103 | # OPTIONAL: Semantic filters using AI-powered similarity matching 104 | # These use BERT-based models to find semantically similar papers 105 | # Format: filter_name: "detailed description of what you're looking for" 106 | # Be specific and descriptive for best results 107 | semantic_filters: 108 | - systems engineering: "Research on systems engineering methodologies, frameworks, and approaches applied to complex systems, including requirements engineering, system architecture, design patterns, and lifecycle management" 109 | 110 | - edge orchestration: "Papers about orchestration, composition, and choreography of edge computing resources and services, including placement strategies, resource allocation, and workload distribution in edge environments" 111 | 112 | - production ml: "Research on machine learning systems in production environments, including MLOps, deployment strategies, monitoring, scaling, and operational challenges of ML systems" 113 | 114 | # ============================================================================= 115 | # ADVANCED CONFIGURATION OPTIONS 116 | # ============================================================================= 117 | 118 | # OPTIONAL: Custom fields and types (advanced users) 119 | # fields: ['title', 'abstract', 'keywords', 'full_text'] 120 | # types: ['conferences', 'journals', 'preprints', 'reports'] 121 | 122 | # OPTIONAL: Custom search parameters (advanced users) 123 | # max_papers_per_database: 1000 124 | # search_timeout: 300 125 | # retry_attempts: 3 126 | 127 | # ============================================================================= 128 | # BEST PRACTICES FOR ADVANCED RESEARCH 129 | # ============================================================================= 130 | # 1. QUERIES: Start broad, then narrow down based on initial results 131 | # 2. SYNONYMS: Include variations, abbreviations, and related terminology 132 | # 3. DATABASES: Use a mix of open and commercial for comprehensive coverage 133 | # 4. DATES: Set reasonable ranges to focus on relevant time periods 134 | # 5. FILTERS: Use syntactic filters for precision, semantic filters for recall 135 | # 6. ITERATION: Run multiple searches with refined parameters 136 | # 7. VALIDATION: Check results manually to ensure quality 137 | # 8. 
DOCUMENTATION: Keep track of search parameters and results 138 | # ============================================================================= 139 | 140 | # ============================================================================= 141 | # TROUBLESHOOTING TIPS 142 | # ============================================================================= 143 | # - If you get too many results: Add more specific terms or use date ranges 144 | # - If you get too few results: Broaden queries or add synonyms 145 | # - If semantic filtering fails: Check that descriptions are detailed and specific 146 | # - If API errors occur: Verify API keys in config.json or use open databases 147 | # - If validation fails: Check YAML syntax and required field formats 148 | # ============================================================================= 149 | -------------------------------------------------------------------------------- /templates/machine_learning_template.yaml: -------------------------------------------------------------------------------- 1 | # ============================================================================= 2 | # SaLS Machine Learning Research Configuration Template 3 | # ============================================================================= 4 | # Specialized template for machine learning, AI, and data science research 5 | # Optimized for finding papers on ML algorithms, systems, and applications 6 | # ============================================================================= 7 | 8 | # REQUIRED: ML-focused research queries 9 | # Use specific ML terminology and combine with application domains 10 | queries: 11 | - deep learning: "'deep learning' & ('neural networks' | 'CNN' | 'RNN' | 'transformer')" 12 | - reinforcement learning: "'reinforcement learning' & ('Q-learning' | 'policy gradient' | 'actor-critic')" 13 | - ml systems: "'machine learning systems' & ('production' | 'deployment' | 'scaling' | 'mlops')" 14 | - computer vision: "'computer vision' & ('image recognition' | 'object detection' | 'segmentation')" 15 | - natural language processing: "'natural language processing' | 'NLP' & ('text analysis' | 'language models')" 16 | 17 | # REQUIRED: ML-specific synonyms and terminology 18 | # Include abbreviations, alternative names, and related concepts 19 | deep learning: 20 | - deep neural networks 21 | - deep learning models 22 | - deep architectures 23 | - deep learning algorithms 24 | - deep learning frameworks 25 | - deep learning applications 26 | 27 | neural networks: 28 | - artificial neural networks 29 | - neural network models 30 | - neural architectures 31 | - neural network training 32 | - neural network optimization 33 | 34 | reinforcement learning: 35 | - rl 36 | - reinforcement learning algorithms 37 | - rl methods 38 | - rl frameworks 39 | - rl applications 40 | - rl optimization 41 | 42 | machine learning systems: 43 | - ml systems 44 | - production ml 45 | - ml infrastructure 46 | - ml deployment 47 | - ml operations 48 | - mlops 49 | - ml engineering 50 | 51 | computer vision: 52 | - cv 53 | - image processing 54 | - visual recognition 55 | - image understanding 56 | - visual analysis 57 | - computer vision systems 58 | 59 | natural language processing: 60 | - nlp 61 | - text processing 62 | - language understanding 63 | - text analysis 64 | - language models 65 | - nlp systems 66 | 67 | # REQUIRED: Databases optimized for ML research 68 | # ML research is well-covered across all major databases 69 | databases: 70 | - arxiv # Excellent for recent ML 
preprints 71 | - semantic_scholar # Good for ML citations and impact 72 | - springer # High-quality ML journals 73 | - ieeexplore # Excellent for ML conferences and journals 74 | - scopus # Comprehensive ML coverage 75 | - core # Open access ML papers 76 | 77 | # OPTIONAL: Date range for ML research (YYYY-MM-DD format) 78 | # ML field evolves rapidly - consider focusing on recent years 79 | # Comment out both lines to search all available papers 80 | start_date: 2019-01-01 # Focus on recent ML developments 81 | end_date: 2024-12-31 # Include current year 82 | 83 | # REQUIRED: Search metadata 84 | search_date: 2024-12-15 # Date when this search was performed 85 | folder_name: ml_research # Output folder name (will be created automatically) 86 | 87 | # OPTIONAL: ML-specific syntactic filters 88 | # These terms must appear in the paper content 89 | syntactic_filters: 90 | - machine learning 91 | - artificial intelligence 92 | - deep learning 93 | - neural networks 94 | - algorithm 95 | - model 96 | - dataset 97 | - evaluation 98 | - performance 99 | - accuracy 100 | 101 | # OPTIONAL: ML-specific semantic filters using AI-powered similarity matching 102 | # Be specific about ML subfields and applications 103 | semantic_filters: 104 | - deep learning systems: "Research on deep learning systems, architectures, and frameworks including neural network design, training methodologies, optimization techniques, and deployment strategies for deep learning models" 105 | 106 | - reinforcement learning applications: "Papers about reinforcement learning applications in robotics, game playing, autonomous systems, recommendation systems, and other real-world domains with focus on practical implementation and performance evaluation" 107 | 108 | - production ml systems: "Research on machine learning systems in production environments including MLOps, model deployment, monitoring, scaling, A/B testing, and operational challenges of ML systems in industry settings" 109 | 110 | - computer vision applications: "Papers about computer vision applications in autonomous vehicles, medical imaging, surveillance, augmented reality, robotics, and other domains with focus on real-world deployment and performance" 111 | 112 | # ============================================================================= 113 | # ML-SPECIFIC BEST PRACTICES 114 | # ============================================================================= 115 | # 1. QUERIES: Use specific ML terminology (e.g., 'CNN' not just 'neural networks') 116 | # 2. SYNONYMS: Include abbreviations and alternative names commonly used in ML 117 | # 3. DATES: ML field evolves rapidly - focus on recent years for cutting-edge research 118 | # 4. FILTERS: Use ML-specific terms to filter out non-ML papers 119 | # 5. ITERATION: ML research has many subfields - refine queries based on results 120 | # 6. VALIDATION: Check that results are actually ML research, not just mentions 121 | # 7. COVERAGE: ML papers appear in many venues - use multiple databases 122 | # 8. 
TERMINOLOGY: Stay current with ML terminology and naming conventions 123 | # ============================================================================= 124 | 125 | # ============================================================================= 126 | # ML RESEARCH SUBFIELDS TO CONSIDER 127 | # ============================================================================= 128 | # - Supervised Learning: classification, regression, structured prediction 129 | # - Unsupervised Learning: clustering, dimensionality reduction, generative models 130 | # - Reinforcement Learning: Q-learning, policy methods, multi-agent systems 131 | # - Deep Learning: CNNs, RNNs, transformers, attention mechanisms 132 | # - Computer Vision: image recognition, object detection, segmentation 133 | # - Natural Language Processing: text analysis, language models, translation 134 | # - ML Systems: MLOps, production deployment, scaling, monitoring 135 | # - ML Applications: healthcare, finance, autonomous systems, recommendation 136 | # ============================================================================= 137 | 138 | # ============================================================================= 139 | # COMMON ML CONFERENCES AND JOURNALS 140 | # ============================================================================= 141 | # Conferences: NeurIPS, ICML, ICLR, CVPR, ICCV, ACL, EMNLP, KDD, AAAI, IJCAI 142 | # Journals: JMLR, TPAMI, TMLR, AIJ, MLJ, JAIR, TACL, Computational Linguistics 143 | # ============================================================================= 144 | 145 | # ============================================================================= 146 | # TROUBLESHOOTING FOR ML RESEARCH 147 | # ============================================================================= 148 | # - Too many results: Add specific ML subfield terms or application domains 149 | # - Too few results: Broaden ML terminology or remove overly specific filters 150 | # - Irrelevant results: Use more specific ML terms and syntactic filters 151 | # - Missing recent papers: Check date ranges and ensure recent databases are included 152 | # - API errors: Use open databases (arxiv, semantic_scholar) if commercial ones fail 153 | # ============================================================================= 154 | -------------------------------------------------------------------------------- /clients/base_client.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | import pandas as pd 3 | import logging 4 | import time 5 | from tqdm import tqdm 6 | from os.path import exists 7 | from util import util 8 | from util.error_standards import ErrorHandler, create_error_context, ErrorSeverity, ErrorCategory, get_standard_error_info 9 | from util.logging_standards import LogCategory, get_current_sals_logger, get_compat_logger 10 | 11 | 12 | class DatabaseClient(ABC): 13 | """ 14 | Abstract base class for database clients using the Template Method pattern. 15 | 16 | This class defines the workflow for retrieving papers from any database: 17 | 1. Check if file exists 18 | 2. Plan requests 19 | 3. Execute requests 20 | 4. Filter papers 21 | 5. Clean papers 22 | 6. 
Save results 23 | """ 24 | 25 | def __init__(self, database_name: str, max_papers: int = 1000, waiting_time: int = 2, max_retries: int = 3, 26 | client_fields: dict = None, offset_limit: int = None, quota: int = None): 27 | self.database_name = database_name 28 | self.max_papers = max_papers 29 | self.waiting_time = waiting_time 30 | self.max_retries = max_retries 31 | self.client_fields = client_fields or {} 32 | self.offset_limit = offset_limit 33 | self.quota = quota 34 | # Use the standardized SaLS logger if available; fallback to std logger 35 | sals = get_current_sals_logger() 36 | self.logger = get_compat_logger() 37 | self.file_handler = '' 38 | 39 | def get_papers(self, query, syntactic_filters, synonyms, fields, types, dates, start_date, end_date, folder_name, search_date): 40 | """ 41 | Template method that defines the paper retrieval workflow. 42 | """ 43 | # Set up file handler for logging 44 | # Resolve attached file handler, if any 45 | try: 46 | for h in getattr(self.logger, 'handlers', []): 47 | if hasattr(h, 'baseFilename'): 48 | self.file_handler = h.baseFilename 49 | break 50 | except Exception: 51 | self.file_handler = '' 52 | 53 | query_name = list(query.keys())[0] 54 | query_value = query[query_name] 55 | 56 | # Generate file name for this query and database 57 | file_name = self._generate_file_name(folder_name, search_date, query_name) 58 | 59 | # Check if file already exists 60 | if exists(file_name): 61 | self.logger.info(LogCategory.FILE, "base_client", "get_papers", "File already exists.") 62 | return 63 | 64 | # Check if API access is available 65 | if not self._has_api_access(): 66 | self.logger.info(LogCategory.DATABASE, "base_client", "get_papers", "API key access not provided. Skipping this client...") 67 | return 68 | 69 | # Execute the paper retrieval workflow 70 | try: 71 | # Step 1: Plan requests 72 | self.logger.info(LogCategory.DATABASE, "base_client", "get_papers", "Retrieving papers. 
It might take a while...") 73 | papers = self._plan_requests(query, syntactic_filters, synonyms, fields, types, dates, start_date, end_date) 74 | 75 | if len(papers) > 0: 76 | # Step 2: Filter papers 77 | papers = self._filter_papers(papers, dates, start_date, end_date) 78 | 79 | if len(papers) > 0: 80 | # Step 3: Clean papers 81 | papers = self._clean_papers(papers) 82 | 83 | if self.database_name == 'scopus': 84 | # If the database is Scopus, get abstracts 85 | papers = self._get_abstracts(papers) 86 | 87 | if len(papers) > 0: 88 | # Step 4: Save papers 89 | util.save(file_name, papers, 'utf-8', 'a') 90 | 91 | self.logger.info(LogCategory.DATABASE, "base_client", "get_papers", f"Retrieved papers after filters and cleaning: {len(papers)}") 92 | return file_name 93 | 94 | except (ValueError, TypeError) as e: 95 | # User-friendly message explaining what's happening 96 | context = create_error_context( 97 | "base_client", "get_papers", 98 | ErrorSeverity.WARNING, 99 | ErrorCategory.DATA, 100 | f"Data type error in paper retrieval workflow: {type(e).__name__}: {str(e)}" 101 | ) 102 | error_info = get_standard_error_info("data_validation_failed") 103 | ErrorHandler.handle_error(e, context, error_info, self.logger) 104 | except Exception as ex: 105 | # User-friendly message explaining what's happening 106 | context = create_error_context( 107 | "base_client", "get_papers", 108 | ErrorSeverity.ERROR, 109 | ErrorCategory.SYSTEM, 110 | f"Unexpected error in paper retrieval workflow: {type(ex).__name__}: {str(ex)}" 111 | ) 112 | error_info = get_standard_error_info("unexpected_error") 113 | ErrorHandler.handle_error(ex, context, error_info, self.logger) 114 | 115 | def _generate_file_name(self, folder_name, search_date, query_name): 116 | """Generate the file name for saving papers.""" 117 | return f'./papers/{folder_name}/{str(search_date).replace("-", "_")}/raw_papers/{query_name.lower().replace(" ", "_")}_{self.database_name}.csv' 118 | 119 | @abstractmethod 120 | def _has_api_access(self) -> bool: 121 | """Check if API access is available for this database.""" 122 | pass 123 | 124 | @abstractmethod 125 | def _plan_requests(self, query, syntactic_filters, synonyms, fields, types, dates, start_date, end_date) -> pd.DataFrame: 126 | """Plan the API requests based on the query and parameters.""" 127 | pass 128 | 129 | @abstractmethod 130 | def _filter_papers(self, papers: pd.DataFrame, dates, start_date, end_date) -> pd.DataFrame: 131 | """Filter papers based on criteria like dates, duplicates, etc.""" 132 | pass 133 | 134 | @abstractmethod 135 | def _clean_papers(self, papers: pd.DataFrame) -> pd.DataFrame: 136 | """Clean and standardize paper data.""" 137 | pass 138 | 139 | @abstractmethod 140 | def _get_abstracts(self, papers: pd.DataFrame) -> pd.DataFrame: 141 | """Get abstracts for papers.""" 142 | pass 143 | 144 | def _retry_request(self, request_func, *args, **kwargs): 145 | """Common retry mechanism for API requests.""" 146 | retry = 0 147 | while retry < self.max_retries: 148 | try: 149 | result = request_func(*args, **kwargs) 150 | if self._is_successful_response(result): 151 | return result 152 | except (ValueError, TypeError) as e: 153 | self.logger.debug(LogCategory.DATABASE, "base_client", "_retry_request", f"Request failed due to data type error (attempt {retry + 1}): {type(e).__name__}: {str(e)}") 154 | except Exception as ex: 155 | self.logger.debug(LogCategory.DATABASE, "base_client", "_retry_request", f"Request failed due to unexpected error (attempt {retry + 1}): 
{type(ex).__name__}: {str(ex)}") 156 | 157 | retry += 1 158 | if retry < self.max_retries: 159 | delay = util.exponential_backoff(retry, self.waiting_time, 64) 160 | time.sleep(delay) 161 | if result is not None and result.status_code == 404: 162 | return result 163 | if result is not None and result.status_code == 429: 164 | result = self._retry_request(request_func, *args, **kwargs) 165 | return result 166 | if result is None: 167 | result = { 168 | "status": "error", 169 | "status_code": 999, 170 | "message": "There was an error processing your request. Please try again later or contact support if the issue persists.", 171 | "attempts": retry, 172 | "max_retries": self.max_retries, 173 | "database": self.database_name 174 | } 175 | return result 176 | return result 177 | 178 | def _is_successful_response(self, response) -> bool: 179 | """Check if the API response is successful.""" 180 | if hasattr(response, 'status_code'): 181 | return response.status_code == 200 182 | return True # Default to True for responses without status codes 183 | 184 | def _log_api_error(self, response, request_info=""): 185 | """Log API errors consistently across all clients.""" 186 | self.logger.info(LogCategory.DATABASE, "base_client", "_log_api_error", f"Error requesting the API. Skipping to next request. Please see the log file for details: {self.file_handler}") 187 | if hasattr(response, 'text'): 188 | self.logger.debug(LogCategory.DATABASE, "base_client", "_log_api_error", f"API response: {response.text}") 189 | if hasattr(response, 'request') and response.request is not None: 190 | self.logger.debug(LogCategory.DATABASE, "base_client", "_log_api_error", f"Request: {request_info}") 191 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SaLS: Semi-automatic Literature Survey 2 | 3 | This project implements SaLS: a semi-automatic tool to survey research papers based on the systematic methodology proposed by Kitchenham et al.[1, 2]. The goal of this project is to semi-automate the research paper survey process while providing a framework to enable survey reproducibility and evolution. Two SaLS use cases are: 4 | 5 | - Cabrera, Christian, et al. *The Systems Engineering Approach in Times of Large Language Models.* Proceedings of the 58th Hawaii International Conference on System Sciences (2025) (To Appear). [Paper.](https://arxiv.org/abs/2411.09050v1) [Code.](https://github.com/cabrerac/semi-automatic-literature-survey/tree/sys-llms-survey) 6 | - Cabrera, Christian, et al. *Machine Learning Systems: A Survey from a Data-Oriented Perspective.* ACM Computing Surveys (2025) [Paper.](https://dl.acm.org/doi/10.1145/3769292) [Code.](https://github.com/cabrerac/semi-automatic-literature-survey/tree/doa-survey) 7 | 8 | SaLS automatically retrieves paper metadata based on queries that users provide. These queries are used to consume the search APIs exposed by the most popular research paper repositories in different domains.
Currently, SaLS retrieves paper information from the following repositories: 9 | 10 | - [IEEE Xplore](https://ieeexplore.ieee.org/Xplore/home.jsp) 11 | - [Springer Nature](https://www.springernature.com/gp) 12 | - [Scopus](https://www.elsevier.com/en-gb/solutions/scopus) 13 | - [Semantic Scholar](https://www.semanticscholar.org) 14 | - [CORE](https://core.ac.uk) 15 | - [arXiv](https://arxiv.org) 16 | 17 | The retrieved metadata includes the paper identifier (e.g., DOI), publisher, publication date, title, URL, and abstract. 18 | 19 | SaLS merges paper information from different repositories, and then applies customised syntactic and semantic filters (i.e., semantic search)[3] to reduce the search space of papers according to users' interests. 20 | 21 | Once the automatic filters are applied, the tool presents the title and abstract of each paper in a centralised interface where users decide whether the paper should be included in the review (i.e., papers filtered by abstract). The URLs of the papers that pass the abstract filter are then presented in the last filter, which requires the user to skim the full paper and decide whether it is included. 22 | 23 | Then, the tool applies the snowballing step by retrieving the metadata of the works that cited the selected papers in the last step (i.e., papers filtered by skimming the full text), and applies the automatic and semi-automatic filters to the citing papers. 24 | 25 | The final list of papers is composed of the cited papers that passed the first round of filters, and the citing papers that passed the second round of filters (i.e., snowballing). 26 | 27 | # Requirements 28 | 29 | Some of the APIs provided by the repositories require an access key to be consumed. You should request a key from each repository you want to include in your search. Each repository has its own steps to apply for a key, as follows: 30 | 31 | - [IEEE Xplore](https://developer.ieee.org/getting_started) 32 | - [Springer Nature](https://dev.springernature.com/docs) 33 | - [Scopus](https://dev.elsevier.com/) 34 | - [CORE](https://core.ac.uk/services/api) 35 | - [Semantic Scholar](https://www.semanticscholar.org/product/api/tutorial) 36 | 37 | Alternatively, you can use the tool to request papers from arXiv or Semantic Scholar, which are open and do not need an access key. SaLS does not have control over the maintenance of the APIs. If an API produces an error, you can see the details in the log files. We recommend that you stop using an API that produces errors for a while. 38 | 39 | # How to run it? 40 | 41 | The following instructions were tested on: 42 | - A Windows machine (i.e., Windows PowerShell) with Python 3.10.11. 43 | - Windows Subsystem for Linux ([WSL](https://docs.microsoft.com/en-us/windows/wsl/install)) with Python 3.8. 44 | - An Ubuntu machine with Python 3.8. 45 | 46 | 1. Clone this repository 47 | 48 | ``` 49 | git clone https://github.com/cabrerac/semi-automatic-literature-survey.git 50 | ``` 51 | ``` 52 | cd semi-automatic-literature-survey/ 53 | ``` 54 | 55 | 2. Create and activate a virtual environment 56 | 57 | For Linux distributions 58 | ``` 59 | python -m venv venv 60 | ``` 61 | ``` 62 | source venv/bin/activate 63 | ``` 64 | 65 | For Windows 66 | ``` 67 | python -m venv ./venv 68 | ``` 69 | ``` 70 | ./venv/Scripts/activate 71 | ``` 72 | 73 | 3. Install requirements 74 | 75 | ``` 76 | pip install -r requirements.txt 77 | ``` 78 | 79 | 4. 
Install language package for spacy 80 | 81 | ``` 82 | python -m spacy download en_core_web_sm 83 | ``` 84 | 85 | 5. Create a file `./config.json` that will store the API access keys for the repositories you want to use. The file should have the following format: 86 | 87 | ``` 88 | { 89 | "api_access_core": "CORE_API_ACCESS_KEY", 90 | "api_access_ieee": "IEEE_API_ACCESS_KEY", 91 | "api_access_springer": "SPRINGER_API_ACCESS_KEY", 92 | "api_access_elsevier": "ELSEVIER_API_ACCESS_KEY" 93 | } 94 | ``` 95 | Ignore this step if you are using the tool with arXiv and Semantic Scholar. Also, you should only add the access keys of the repositories you want to use. 96 | 97 | 6. Run the main passing the search parameters file. For example: 98 | 99 | ``` 100 | python main.py parameters_ar.yaml 101 | ``` 102 | 103 | A simple self-explanatory example of a search parameters file can be found in `./parameters_ar.yaml`. Alternatively, a parameters file including syntactic and semantic filters can be found in `./parameters_sys.yaml` 104 | 105 | ## Configuration and Documentation 106 | 107 | ### Quick Start 108 | - **Quick Start Guide**: `docs/quick_start_guide.md` - Get running in 5 minutes 109 | - **Configuration Guide**: `docs/configuration_guide.md` - Comprehensive configuration reference 110 | - **Configuration Templates**: `templates/` directory - Ready-to-use configuration examples 111 | 112 | ### Configuration Templates 113 | - **Basic Template**: `templates/basic_search_template.yaml` - Simple searches 114 | - **Advanced Template**: `templates/advanced_research_template.yaml` - Complex research projects 115 | - **Machine Learning Template**: `templates/machine_learning_template.yaml` - ML/AI research 116 | 117 | ### Getting Help 118 | - Start with the **Quick Start Guide** for your first search 119 | - Use **Configuration Templates** as starting points 120 | - Refer to the **Configuration Guide** for advanced features 121 | - **Error Recovery**: SaLS automatically detects configuration issues and provides recovery suggestions 122 | - Check error messages for specific guidance and automatic fallbacks 123 | 124 | A description of the semi-automatic methodology applied in a survey can be found in the paper ["Machine Learning Systems: A Survey from a Data-Oriented Perspective"](https://dl.acm.org/doi/10.1145/3769292) [4]. Another paper using this tool is [The Systems Engineering Approach in Times of Large Language Models](https://scholarspace.manoa.hawaii.edu/items/ccd98c8b-bb61-4a86-9cd4-4719078d028f)[5]. 125 | 126 | # Query syntax 127 | 128 | SaLS uses a simple, user‑friendly boolean expression to describe search queries. The syntax you write is normalized and validated, then translated per‑API to each provider’s expected format. 129 | 130 | Supported features 131 | - Operators: AND and OR 132 | - Accepted forms: `AND`, `and`, `&&`, `&`, `OR`, `or`, `||`, `|`, and the legacy `¦` for OR 133 | - Only AND/OR are supported (no NOT) 134 | - Parentheses: use `(` and `)` to group expressions and control precedence 135 | - Phrases: wrap multi‑word terms in single or double quotes, e.g., `'systems engineering'` or "large language model" 136 | - Identifiers without quotes: letters, digits, and common symbols are allowed (e.g., `- _ . 
152 | # References
153 | 
154 | [1] Barbara Kitchenham and Pearl Brereton. 2013. A systematic review of systematic review process research in software engineering. Information and Software Technology 55, 12 (2013), 2049–2075. https://doi.org/10.1016/j.infsof.2013.07.010
155 | 
156 | [2] Barbara Kitchenham and Stuart Charters. 2007. Guidelines for performing Systematic Literature Reviews in Software Engineering. Technical Report EBSE 2007-001. Keele University and Durham University Joint Report. https://www.elsevier.com/__data/promis_misc/525444systematicreviewsguide.pdf
157 | 
158 | [3] SBERT.net Sentence Transformers. 2024. Semantic Search. [Available online](https://www.sbert.net/examples/applications/semantic-search/README.html)
159 | 
160 | [4] Christian Cabrera, Andrei Paleyes, Pierre Thodoroff, and Neil D. Lawrence. 2025. Machine Learning Systems: A Survey from a Data-Oriented Perspective. ACM Computing Surveys. [Available online](https://dl.acm.org/doi/10.1145/3769292)
161 | 
162 | [5] Christian Cabrera, Viviana Bastidas, Jennifer Schooling, and Neil D. Lawrence. 2025. The Systems Engineering Approach in Times of Large Language Models. Proceedings of the 58th Hawaii International Conference on System Sciences. [Available online](https://scholarspace.manoa.hawaii.edu/items/ccd98c8b-bb61-4a86-9cd4-4719078d028f)
163 | 
164 | 
165 | 
--------------------------------------------------------------------------------
/tests/test_util.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """
3 | Unit tests for SaLS utility functions.
4 | 
5 | This module tests the core utility functions in util/util.py, including
6 | configuration validation, file operations, and data processing functions.
7 | """
8 | 
9 | import pytest
10 | import pandas as pd
11 | import tempfile
12 | import os
13 | import sys
14 | from datetime import datetime
15 | from unittest.mock import patch, MagicMock
16 | 
17 | # Add the project root to the path for imports
18 | sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
19 | 
20 | from util import util
21 | 
22 | 
23 | class TestConfigurationValidation:
24 |     """Test configuration validation functions."""
25 | 
26 |     @pytest.mark.unit
27 |     @pytest.mark.config
28 |     def test_validate_configuration_valid_config(self):
29 |         """Test validation with a completely valid configuration."""
30 |         config = {
31 |             'queries': [{'test': 'test'}],
32 |             'databases': ['arxiv', 'semantic_scholar'],
33 |             'search_date': '2024-12-15',
34 |             'folder_name': 'test_search'
35 |         }
36 | 
37 |         is_valid, message, suggestions = util._validate_configuration(config, 'test.yaml')
38 | 
39 |         assert is_valid is True
40 |         assert "Configuration validation passed successfully!"
in message 41 | assert len(suggestions) == 0 42 | 43 | @pytest.mark.unit 44 | @pytest.mark.config 45 | def test_validate_configuration_missing_queries(self): 46 | """Test validation with missing queries (critical error).""" 47 | config = { 48 | 'databases': ['arxiv'], 49 | 'search_date': '2024-12-15' 50 | } 51 | 52 | is_valid, message, suggestions = util._validate_configuration(config, 'test.yaml') 53 | 54 | assert is_valid is False 55 | assert "CRITICAL ERRORS" in message 56 | assert "Missing queries section" in message 57 | assert len(suggestions) > 0 58 | assert any(s['severity'] == 'critical' for s in suggestions) 59 | 60 | @pytest.mark.unit 61 | @pytest.mark.config 62 | def test_validate_configuration_missing_optional_fields(self): 63 | """Test validation with missing optional fields (warnings only).""" 64 | config = { 65 | 'queries': [{'test': 'test'}] 66 | # Missing databases, search_date, folder_name 67 | } 68 | 69 | is_valid, message, suggestions = util._validate_configuration(config, 'test.yaml') 70 | 71 | assert is_valid is True # Can continue with warnings 72 | assert "WARNINGS" in message 73 | assert len(suggestions) > 0 74 | assert all(s['severity'] == 'warning' for s in suggestions) 75 | 76 | @pytest.mark.unit 77 | @pytest.mark.config 78 | def test_validate_configuration_invalid_database(self): 79 | """Test validation with invalid database names.""" 80 | config = { 81 | 'queries': [{'test': 'test'}], 82 | 'databases': ['invalid_db', 'arxiv'] 83 | } 84 | 85 | is_valid, message, suggestions = util._validate_configuration(config, 'test.yaml') 86 | 87 | assert is_valid is True # Can continue with warnings 88 | assert "invalid database" in message.lower() 89 | assert len(suggestions) > 0 90 | 91 | @pytest.mark.unit 92 | @pytest.mark.config 93 | def test_validate_configuration_invalid_date_format(self): 94 | """Test validation with invalid date formats.""" 95 | config = { 96 | 'queries': [{'test': 'test'}], 97 | 'start_date': 'invalid-date' 98 | } 99 | 100 | is_valid, message, suggestions = util._validate_configuration(config, 'test.yaml') 101 | 102 | assert is_valid is True # Can continue with warnings 103 | assert "Invalid 'start_date' format" in message 104 | assert len(suggestions) > 0 105 | 106 | 107 | class TestConfigurationFallbacks: 108 | """Test configuration fallback application.""" 109 | 110 | @pytest.mark.unit 111 | @pytest.mark.config 112 | def test_apply_configuration_fallbacks_missing_databases(self): 113 | """Test applying fallbacks for missing databases.""" 114 | config = {'queries': [{'test': 'test'}]} 115 | suggestions = [{ 116 | 'issue': 'Missing databases section', 117 | 'severity': 'warning', 118 | 'default': ['arxiv', 'semantic_scholar'] 119 | }] 120 | 121 | updated_config = util._apply_configuration_fallbacks(config, 'test.yaml', suggestions) 122 | 123 | assert 'databases' in updated_config 124 | assert updated_config['databases'] == ['arxiv', 'semantic_scholar'] 125 | 126 | @pytest.mark.unit 127 | @pytest.mark.config 128 | def test_apply_configuration_fallbacks_missing_search_date(self): 129 | """Test applying fallbacks for missing search_date.""" 130 | config = {'queries': [{'test': 'test'}]} 131 | suggestions = [{ 132 | 'issue': 'Missing search_date', 133 | 'severity': 'warning', 134 | 'default': 'current date' 135 | }] 136 | 137 | with patch('util.util.datetime') as mock_datetime: 138 | mock_datetime.today.return_value = datetime(2024, 12, 15) 139 | mock_datetime.strftime.return_value = '2024-12-15' 140 | 141 | updated_config = 
util._apply_configuration_fallbacks(config, 'test.yaml', suggestions) 142 | 143 | assert 'search_date' in updated_config 144 | assert updated_config['search_date'] == '2024-12-15' 145 | 146 | @pytest.mark.unit 147 | @pytest.mark.config 148 | def test_apply_configuration_fallbacks_missing_folder_name(self): 149 | """Test applying fallbacks for missing folder_name.""" 150 | config = {'queries': [{'test': 'test'}]} 151 | suggestions = [{ 152 | 'issue': 'Missing folder_name', 153 | 'severity': 'warning', 154 | 'default': 'filename-based' 155 | }] 156 | 157 | updated_config = util._apply_configuration_fallbacks(config, 'test.yaml', suggestions) 158 | 159 | assert 'folder_name' in updated_config 160 | assert updated_config['folder_name'] == 'test' 161 | 162 | 163 | class TestQueryProcessing: 164 | """Test query processing and normalization functions.""" 165 | 166 | @pytest.mark.unit 167 | def test_normalize_query_expression_basic(self): 168 | """Test basic query expression normalization.""" 169 | expression = "'machine learning' & 'edge computing'" 170 | normalized = util.normalize_query_expression(expression) 171 | 172 | assert '' in normalized 173 | assert "'machine learning'" in normalized 174 | assert "'edge computing'" in normalized 175 | 176 | @pytest.mark.unit 177 | def test_normalize_query_expression_text_operators(self): 178 | """Test normalization of text-based operators.""" 179 | expression = "'ml' AND 'edge' OR 'fog'" 180 | normalized = util.normalize_query_expression(expression) 181 | 182 | assert '' in normalized 183 | assert '' in normalized 184 | assert "'ml'" in normalized 185 | assert "'edge'" in normalized 186 | assert "'fog'" in normalized 187 | 188 | @pytest.mark.unit 189 | def test_normalize_query_expression_symbolic_operators(self): 190 | """Test normalization of symbolic operators.""" 191 | expression = "'ml' && 'edge' || 'fog'" 192 | normalized = util.normalize_query_expression(expression) 193 | 194 | assert '' in normalized 195 | assert '' in normalized 196 | assert "'ml'" in normalized 197 | assert "'edge'" in normalized 198 | assert "'fog'" in normalized 199 | 200 | @pytest.mark.unit 201 | def test_normalize_query_expression_preserves_quotes(self): 202 | """Test that operators inside quotes are preserved.""" 203 | expression = "'a & b' AND 'c | d'" 204 | normalized = util.normalize_query_expression(expression) 205 | 206 | assert "'a & b'" in normalized # Preserved 207 | assert "'c | d'" in normalized # Preserved 208 | assert '' in normalized # Normalized 209 | 210 | @pytest.mark.unit 211 | def test_normalize_query_expression_encoding_artifacts(self): 212 | """Test removal of encoding artifacts.""" 213 | expression = "Â'machine learning' & 'edge computing'" 214 | normalized = util.normalize_query_expression(expression) 215 | 216 | assert 'Â' not in normalized 217 | assert "'machine learning'" in normalized 218 | assert '' in normalized 219 | 220 | 221 | class TestExponentialBackoff: 222 | """Test exponential backoff function.""" 223 | 224 | @pytest.mark.unit 225 | def test_exponential_backoff_basic(self): 226 | """Test basic exponential backoff calculation.""" 227 | delays = [] 228 | for attempt in range(5): 229 | delay = util.exponential_backoff(attempt) 230 | delays.append(delay) 231 | 232 | # Should be increasing (with jitter) 233 | assert len(delays) == 5 234 | assert all(d > 0 for d in delays) 235 | 236 | @pytest.mark.unit 237 | def test_exponential_backoff_max_delay(self): 238 | """Test that max delay is respected.""" 239 | delay = util.exponential_backoff(10, 
base_delay=1, max_delay=5) 240 | assert delay <= 5 241 | 242 | @pytest.mark.unit 243 | def test_exponential_backoff_custom_base(self): 244 | """Test custom base delay.""" 245 | delay = util.exponential_backoff(2, base_delay=0.5) 246 | assert delay > 0.5 # Should be greater than base due to exponential growth 247 | 248 | 249 | class TestDataProcessing: 250 | """Test data processing functions.""" 251 | 252 | @pytest.mark.unit 253 | def test_remove_repeated_function_exists(self): 254 | """Test that the main remove_repeated function exists.""" 255 | assert hasattr(util, 'remove_repeated') 256 | assert callable(util.remove_repeated) 257 | 258 | 259 | class TestFileOperations: 260 | """Test file operation functions.""" 261 | 262 | @pytest.mark.unit 263 | def test_save_function_creates_directory(self): 264 | """Test that save function creates directories if they don't exist.""" 265 | with tempfile.TemporaryDirectory() as temp_dir: 266 | file_path = os.path.join(temp_dir, 'subdir', 'test.csv') 267 | df = pd.DataFrame({'col1': [1, 2, 3], 'col2': ['a', 'b', 'c']}) 268 | 269 | util.save(file_path, df, 'utf-8', 'w') 270 | 271 | assert os.path.exists(file_path) 272 | assert os.path.exists(os.path.dirname(file_path)) 273 | 274 | @pytest.mark.unit 275 | def test_save_function_overwrites_existing(self): 276 | """Test that save function can overwrite existing files.""" 277 | with tempfile.TemporaryDirectory() as temp_dir: 278 | file_path = os.path.join(temp_dir, 'test.csv') 279 | df1 = pd.DataFrame({'col1': [1, 2, 3]}) 280 | df2 = pd.DataFrame({'col1': [4, 5, 6]}) 281 | 282 | # Save first DataFrame 283 | util.save(file_path, df1, 'utf-8', 'w') 284 | assert os.path.exists(file_path) 285 | 286 | # Overwrite with second DataFrame 287 | util.save(file_path, df2, 'utf-8', 'w') 288 | 289 | # Verify content was overwritten 290 | loaded_df = pd.read_csv(file_path) 291 | assert len(loaded_df) == 3 292 | assert loaded_df['col1'].iloc[0] == 4 293 | 294 | 295 | class TestErrorHandling: 296 | """Test error handling and recovery.""" 297 | 298 | @pytest.mark.unit 299 | @pytest.mark.error_handling 300 | def test_validate_configuration_exception_handling(self): 301 | """Test that validation handles unexpected exceptions gracefully.""" 302 | with patch('util.util.datetime') as mock_datetime: 303 | mock_datetime.strptime.side_effect = Exception("Unexpected error") 304 | 305 | config = { 306 | 'queries': [{'test': 'test'}], 307 | 'start_date': '2024-01-01' 308 | } 309 | 310 | is_valid, message, suggestions = util._validate_configuration(config, 'test.yaml') 311 | 312 | assert is_valid is True # Should continue with warnings 313 | assert "warning" in message.lower() 314 | 315 | @pytest.mark.unit 316 | @pytest.mark.error_handling 317 | def test_fallback_application_exception_handling(self): 318 | """Test that fallback application handles exceptions gracefully.""" 319 | config = {'queries': [{'test': 'test'}]} 320 | suggestions = [{ 321 | 'issue': 'Test issue', 322 | 'severity': 'warning', 323 | 'default': 'test_default' 324 | }] 325 | 326 | # Mock a function that raises an exception 327 | with patch('util.util.logger.info', side_effect=Exception("Test exception")): 328 | result = util._apply_configuration_fallbacks(config, 'test.yaml', suggestions) 329 | 330 | # Should return original config even if fallback fails 331 | assert result == config 332 | 333 | 334 | if __name__ == "__main__": 335 | # Run tests if executed directly 336 | pytest.main([__file__, "-v"]) 337 | 
-------------------------------------------------------------------------------- /clients/apis/generic.py: -------------------------------------------------------------------------------- 1 | import urllib.request 2 | import urllib.parse 3 | import urllib.error 4 | import urllib 5 | import time 6 | import json 7 | import requests 8 | from requests.models import Response 9 | import re 10 | import logging 11 | 12 | 13 | file_handler = '' 14 | logger = logging.getLogger('sals_pipeline') 15 | 16 | 17 | class Generic: 18 | def request(self, query, method, data, headers): 19 | global file_handler 20 | # Safely resolve the log file path from any file-based handler 21 | file_handler = '' 22 | try: 23 | for handler in logger.handlers: 24 | if hasattr(handler, 'baseFilename'): 25 | file_handler = handler.baseFilename 26 | break 27 | except Exception: 28 | file_handler = '' 29 | request_result = None 30 | time.sleep(1) 31 | headers['Content-type'] = 'application/json' 32 | headers['Accept'] = 'application/json' 33 | if method == 'post': 34 | try: 35 | data = json.dumps(data) 36 | request_result = requests.post(query, data=data, headers=headers) 37 | except urllib.error.HTTPError as ex: 38 | logger.info("Error parsing the API response in generic client. Please see the log file for " 39 | "details: " + file_handler) 40 | logger.debug("Exception: " + str(type(ex)) + ' - ' + str(ex)) 41 | logger.debug("Request: " + str(data)) 42 | except UnicodeEncodeError as ex: 43 | logger.info("Error parsing the API response in generic client. Please see the log file for " 44 | "details: " + file_handler) 45 | logger.debug("Exception: " + str(type(ex)) + ' - ' + str(ex)) 46 | logger.debug("Request: " + str(data)) 47 | except urllib.error.URLError as ex: 48 | logger.info("Error parsing the API response in generic client. Please see the log file for " 49 | "details: " + file_handler) 50 | logger.debug("Exception: " + str(type(ex)) + ' - ' + str(ex)) 51 | logger.debug("Request: " + str(data)) 52 | except Exception as ex: 53 | logger.info("Error parsing the API response in generic client. Please see the log file for " 54 | "details: " + file_handler) 55 | logger.debug("Exception: " + str(type(ex)) + ' - ' + str(ex)) 56 | logger.debug("Request: " + str(data)) 57 | if method == 'get': 58 | try: 59 | request_result = requests.get(query, headers=headers) 60 | except urllib.error.HTTPError as ex: 61 | logger.info("Error parsing the API response in generic client. Please see the log file for " 62 | "details: " + file_handler) 63 | logger.debug("Exception: " + str(type(ex)) + ' - ' + str(ex)) 64 | logger.debug("Request: " + query) 65 | except UnicodeEncodeError as ex: 66 | logger.info("Error parsing the API response in generic client. Please see the log file for " 67 | "details: " + file_handler) 68 | logger.debug("Exception: " + str(type(ex)) + ' - ' + str(ex)) 69 | logger.debug("Request: " + query) 70 | except urllib.error.URLError as ex: 71 | logger.info("Error parsing the API response in generic client. Please see the log file for " 72 | "details: " + file_handler) 73 | logger.debug("Exception: " + str(type(ex)) + ' - ' + str(ex)) 74 | logger.debug("Request: " + query) 75 | except Exception as ex: 76 | logger.info("Error parsing the API response in generic client. 
Please see the log file for " 77 | "details: " + file_handler) 78 | logger.debug("Exception: " + str(type(ex)) + ' - ' + str(ex)) 79 | logger.debug("Request: " + query) 80 | if method == 'retrieve': 81 | try: 82 | req = urllib.request.Request(query, headers={'User-Agent': 'Mozilla/5.0'}) 83 | request_result = urllib.request.urlopen(req).read() 84 | except urllib.error.HTTPError as ex: 85 | logger.info("Error parsing the API response in generic client. Please see the log file for " 86 | "details: " + file_handler) 87 | logger.debug("Exception: " + str(type(ex)) + ' - ' + str(ex)) 88 | logger.debug("Request: " + query) 89 | except UnicodeEncodeError as ex: 90 | logger.info("Error parsing the API response in generic client. Please see the log file for " 91 | "details: " + file_handler) 92 | logger.debug("Exception: " + str(type(ex)) + ' - ' + str(ex)) 93 | logger.debug("Request: " + query) 94 | except urllib.error.URLError as ex: 95 | logger.info("Error parsing the API response in generic client. Please see the log file for " 96 | "details: " + file_handler) 97 | logger.debug("Exception: " + str(type(ex)) + ' - ' + str(ex)) 98 | logger.debug("Request: " + query) 99 | except Exception as ex: 100 | logger.info("Error parsing the API response in generic client. Please see the log file for " 101 | "details: " + file_handler) 102 | logger.debug("Exception: " + str(type(ex)) + ' - ' + str(ex)) 103 | logger.debug("Request: " + query) 104 | if request_result is None: 105 | logger.info("The API response is None. Please see the log file for " 106 | "details: " + file_handler) 107 | logger.debug("Request: " + str(query)) 108 | request_result = Response() 109 | request_result.status_code = 404 110 | request_result._content = "The API response is None for query: " + str(query) 111 | request_result.headers = {'Content-Type': 'text/plain'} 112 | request_result._text = "The API response is None for query: " + str(query) 113 | return request_result 114 | 115 | def default_query(self, parameters): 116 | query = parameters['query'].replace('(', '%28').replace(')', '%29').replace("'", "") 117 | words = re.split(' | ', query) 118 | for word in words: 119 | word = word.replace('%29', '').replace('%28', '') 120 | synonyms = parameters['synonyms'] 121 | query_parameter = '' 122 | if word in synonyms.keys(): 123 | word_synonyms = synonyms[word] 124 | query_parameter = query_parameter + ':%22' + word + '%22' 125 | for synonym in word_synonyms: 126 | query_parameter = query_parameter + '+OR+:%22' + synonym + '%22' 127 | query_parameter = '%28' + query_parameter + '%29' 128 | query = query.replace(word, query_parameter) 129 | else: 130 | query_parameter = query_parameter + ':%22' + word + '%22' 131 | query = query.replace(word, query_parameter) 132 | query = query.replace(' ', '+AND+').replace(' ', '+OR+').replace(' ', '+') 133 | query = '%28' + query + '%29' 134 | 135 | if 'fields' in parameters: 136 | qf = '' 137 | fields = parameters['fields'] 138 | for field in fields: 139 | qf = qf + query.replace('', field) + '+OR+' 140 | query = qf[:-4] 141 | return query 142 | 143 | def ieeexplore_query(self, parameters): 144 | query = parameters['query'].replace("\'", '') 145 | words = re.split(' | ', query) 146 | for word in words: 147 | word = word.replace(')', '').replace('(', '') 148 | synonyms = parameters['synonyms'] 149 | query_parameter = '' 150 | if word in synonyms.keys(): 151 | word_synonyms = synonyms[word] 152 | query_parameter = query_parameter + '"' + word + '"' 153 | for synonym in word_synonyms: 154 | 
query_parameter = query_parameter + 'OR"' + synonym + '"' 155 | query_parameter = '(' + query_parameter + ')' 156 | query = query.replace(word, query_parameter) 157 | else: 158 | query_parameter = query_parameter + '"' + word + '"' 159 | query = query.replace(word, query_parameter) 160 | query = query.replace(' ', 'AND').replace(' ', 'OR') 161 | first_term = query.split('AND')[0] 162 | first_term = first_term.replace('(', '').replace(')', '') 163 | words_first_term = first_term.split('OR') 164 | queries = [] 165 | for word in words_first_term: 166 | q = query.replace('(' + first_term + ')', word) 167 | queries.append(q) 168 | return queries 169 | 170 | def elsevier_query(self, parameters): 171 | domains = [] 172 | for domain in parameters['domains']: 173 | domains.append(domain) 174 | synonyms = parameters['synonyms'][domain] 175 | for synonym in synonyms: 176 | domains.append(synonym) 177 | query_domains = 'ALL(' 178 | for domain in domains: 179 | query_domains = query_domains + domain + ' OR ' 180 | query_domains = query_domains + ')' 181 | query_domains = query_domains.replace(' OR )', ')') 182 | 183 | interests = [] 184 | for interest in parameters['interests']: 185 | interests.append(interest) 186 | synonyms = parameters['synonyms'][interest] 187 | for synonym in synonyms: 188 | interests.append(synonym) 189 | query_interests = 'ALL(' 190 | for interest in interests: 191 | query_interests = query_interests + interest + ' OR ' 192 | query_interests = query_interests + ')' 193 | query_interests = query_interests.replace(' OR )', ')') 194 | query = query_domains + ' AND ' + query_interests 195 | return query 196 | 197 | def core_query(self, parameters): 198 | query = parameters['query'].replace("'", "") 199 | words = re.split(' | ', query) 200 | for word in words: 201 | word = word.replace('(', '').replace(')', '') 202 | synonyms = parameters['synonyms'] 203 | query_parameter = '' 204 | if word in synonyms.keys(): 205 | word_synonyms = synonyms[word] 206 | query_parameter = query_parameter + ' ' + word + ' ' 207 | for synonym in word_synonyms: 208 | query_parameter = query_parameter + ' OR ' + synonym + ' ' 209 | query_parameter = '(' + query_parameter + ')' 210 | query = query.replace(word, query_parameter) 211 | else: 212 | query_parameter = query_parameter + ' ' + word + ' ' 213 | query_parameter = '(' + query_parameter + ')' 214 | query = query.replace(word, query_parameter) 215 | query = query.replace(' ', ' AND ').replace(' ', ' OR ') 216 | query = ':(' + query + ')' 217 | 218 | if 'fields' in parameters: 219 | qf = '' 220 | fields = parameters['fields'] 221 | for field in fields: 222 | qf = qf + query.replace('', field) + ' OR ' 223 | query = qf[:-4] 224 | query = 'language.code:en AND abstract:(NOT thesis AND NOT tesis) AND title:(NOT survey AND NOT review) ' \ 225 | 'AND (' + query + ')' 226 | return query 227 | 228 | def transform_query(self, parameters, api): 229 | queries = [] 230 | query = parameters['query'] 231 | # Define API-specific transformations 232 | if api == 'arxiv' or api == 'springer' or api == 'scopus': 233 | # Replace single quotes with double quotes 234 | query = re.sub(r"'", '"', query) 235 | # Add field specifications and URL encoding for AND and OR operators 236 | query = re.sub(r'(\w+)', r':"\1"', query) 237 | query = re.sub(r'', '+AND+', query) 238 | query = re.sub(r'', '+OR+', query) 239 | 240 | # Wrap the whole expression in parentheses 241 | query = f'({query})' 242 | 243 | # URL-encode the resulting string 244 | query = query.replace('(', 
'%28').replace(')', '%29') 245 | 246 | if 'fields' in parameters: 247 | qf = '' 248 | fields = parameters['fields'] 249 | for field in fields: 250 | qf = qf + query.replace('', field) + '+OR+' 251 | query = qf[:-4] 252 | queries.append(query) 253 | 254 | elif api == 'core': 255 | # Replace single quotes with double quotes 256 | query = re.sub(r"'", '"', query) 257 | # Add parentheses for grouping 258 | query = re.sub(r'', ' AND ', query) 259 | query = re.sub(r'', ' OR ', query) 260 | if 'fields' in parameters: 261 | qf = '' 262 | fields = parameters['fields'] 263 | for field in fields: 264 | qf = qf + query.replace('', field) + ' OR ' 265 | query = qf[:-4] 266 | query = '(subjects:(*article* OR *Article* OR *journal* OR *Journal* OR *ART* OR ' \ 267 | '*conference* OR *CONFERENCE*)) AND (description:(NOT *thes* AND NOT *Thes* ' \ 268 | 'AND NOT *tesis* AND NOT *Tesis* AND NOT *Master* AND NOT *master*)) AND (' + query + ')' 269 | queries.append(query) 270 | 271 | elif api == 'ieeexplore' or api == 'semantic_scholar': 272 | # Preserve whitespace, especially inside quoted phrases 273 | # Replace single quotes with double quotes 274 | query = re.sub(r"'", '"', query) 275 | # Add parentheses for grouping 276 | query = re.sub(r'', 'AND', query) 277 | query = re.sub(r'', 'OR', query) 278 | first_term = query.split('AND')[0] 279 | first_term = first_term.replace('(', '').replace(')', '') 280 | words_first_term = first_term.split('OR') 281 | for word in words_first_term: 282 | q = query.replace('(' + first_term + ')', word) 283 | queries.append(q) 284 | return queries 285 | -------------------------------------------------------------------------------- /clients/apis/xploreapi.py: -------------------------------------------------------------------------------- 1 | import math 2 | import urllib 3 | from urllib.request import urlopen 4 | import xml.etree.ElementTree as ET 5 | import json 6 | import requests 7 | import time 8 | 9 | 10 | class XPLORE: 11 | 12 | # API endpoint (all non-Open Access) 13 | endPoint = "http://ieeexploreapi.ieee.org/api/v1/search/articles" 14 | 15 | # Open Access Document endpoint 16 | openAccessEndPoint = "http://ieeexploreapi.ieee.org/api/v1/search/document/" 17 | 18 | def __init__(self, apiKey): 19 | 20 | # API key 21 | self.apiKey = apiKey 22 | 23 | # flag that some search criteria has been provided 24 | self.queryProvided = False 25 | 26 | # flag for Open Access, which changes endpoint in use and limits results to just Open Access 27 | self.usingOpenAccess = False 28 | 29 | # flag that article number has been provided, which overrides all other search criteria 30 | self.usingArticleNumber = False 31 | 32 | # flag that a boolean method is in use 33 | self.usingBoolean = False 34 | 35 | # flag that a facet is in use 36 | self.usingFacet = False 37 | 38 | # flag that a facet has been applied, in the event that multiple facets are passed 39 | self.facetApplied = False 40 | 41 | # data type for results; default is json (other option is xml) 42 | self.outputType = 'json' 43 | 44 | # data format for results; default is raw (returned string); other option is object 45 | self.outputDataFormat = 'raw' 46 | 47 | # default of 25 results returned 48 | self.resultSetMax = 25 49 | 50 | # maximum of 200 results returned 51 | self.resultSetMaxCap = 200 52 | 53 | # records returned default to position 1 in result set 54 | self.startRecord = 1 55 | 56 | # default sort order is ascending; could also be 'desc' for descending 57 | self.sortOrder = 'asc' 58 | 59 | # field name that is being 
used for sorting 60 | self.sortField = 'article_title' 61 | 62 | # array of permitted search fields for searchField() method 63 | self.allowedSearchFields = ['abstract', 'affiliation', 'article_number', 'article_title', 'author', 'boolean_text', 'content_type', 'd-au', 'd-pubtype', 'd-publisher', 'd-year', 'doi', 'end_year', 'facet', 'index_terms', 'isbn', 'issn', 'is_number', 'meta_data', 'open_access', 'publication_number', 'publication_title', 'publication_year', 'publisher', 'querytext', 'start_year', 'thesaurus_terms'] 64 | 65 | # dictionary of all search parameters in use and their values 66 | self.parameters = {} 67 | 68 | # dictionary of all filters in use and their values 69 | self.filters = {} 70 | 71 | 72 | # ensuring == can be used reliably 73 | def __eq__(self, other): 74 | if isinstance(other, self.__class__): 75 | return self.__dict__ == other.__dict__ 76 | else: 77 | return False 78 | 79 | 80 | # ensuring != can be used reliably 81 | def __ne__(self, other): 82 | return not self.__eq__(other) 83 | 84 | 85 | # set the data type for the API output 86 | # string outputType Format for the returned result (JSON, XML) 87 | # return void 88 | def dataType(self, outputType): 89 | 90 | outputType = outputType.strip().lower() 91 | self.outputType = outputType 92 | 93 | 94 | # set the data format for the API output 95 | # string outputDataFormat Data structure for the returned result (raw string or object) 96 | # return void 97 | def dataFormat(self, outputDataFormat): 98 | 99 | outputDataFormat = outputDataFormat.strip().lower() 100 | self.outputDataFormat = outputDataFormat 101 | 102 | 103 | # set the start position in the 104 | # string start Start position in the returned data 105 | # return void 106 | def startingResult(self, start): 107 | 108 | self.startRecord = math.ceil(start) if (start > 0) else 1 109 | 110 | 111 | # set the maximum number of results 112 | # string maximum Max number of results to return 113 | # return void 114 | def maximumResults(self, maximum): 115 | 116 | self.resultSetMax = math.ceil(maximum) if (maximum > 0) else 25 117 | if self.resultSetMax > self.resultSetMaxCap: 118 | self.resultSetMax = self.resultSetMaxCap 119 | 120 | 121 | # setting a filter on results 122 | # string filterParam Field used for filtering 123 | # string value Text to filter on 124 | # return void 125 | def resultsFilter(self, filterParam, value): 126 | 127 | filterParam = filterParam.strip().lower() 128 | value = value.strip() 129 | 130 | if len(value) > 0: 131 | self.filters[filterParam] = value 132 | self.queryProvided = True 133 | 134 | # Standards do not have article titles, so switch to sorting by article number 135 | if (filterParam == 'content_type' and value == 'Standards'): 136 | self.resultsSorting('publication_year', 'asc') 137 | 138 | 139 | # setting sort order for results 140 | # string field Data field used for sorting 141 | # string order Sort order for results (ascending or descending) 142 | # return void 143 | def resultsSorting(self, field, order): 144 | 145 | field = field.strip().lower() 146 | order = order.strip() 147 | self.sortField = field 148 | self.sortOrder = order 149 | 150 | 151 | # shortcut method for assigning search parameters and values 152 | # string field Field used for searching 153 | # string value Text to query 154 | # return void 155 | def searchField(self, field, value): 156 | 157 | field = field.strip().lower() 158 | if field in self.allowedSearchFields: 159 | self.addParameter(field, value) 160 | else: 161 | print("Searches against field " 
+ field + " are not supported") 162 | 163 | 164 | # string value Abstract text to query 165 | # return void 166 | def abstractText(self, value): 167 | 168 | self.addParameter('abstract', value) 169 | 170 | 171 | # string value Affiliation text to query 172 | # return void 173 | def affiliationText(self, value): 174 | 175 | self.addParameter('affiliation', value) 176 | 177 | 178 | # string value Article number to query 179 | # return void 180 | def articleNumber(self, value): 181 | 182 | self.addParameter('article_number', value) 183 | 184 | 185 | # string value Article title to query 186 | # return void 187 | def articleTitle(self, value): 188 | 189 | self.addParameter('article_title', value) 190 | 191 | 192 | # string value Author to query 193 | # return void 194 | def authorText(self, value): 195 | 196 | self.addParameter('author', value) 197 | 198 | 199 | # string value Author Facet text to query 200 | # return void 201 | def authorFacetText(self, value): 202 | 203 | self.addParameter('d-au', value) 204 | 205 | 206 | # string value Value(s) to use in the boolean query 207 | # return void 208 | def booleanText(self, value): 209 | 210 | self.addParameter('boolean_text', value) 211 | 212 | 213 | # string value Content Type Facet text to query 214 | # return void 215 | def contentTypeFacetText(self, value): 216 | 217 | self.addParameter('d-pubtype', value) 218 | 219 | 220 | # string value DOI (Digital Object Identifier) to query 221 | # return void 222 | def doi(self, value): 223 | 224 | self.addParameter('doi', value) 225 | 226 | 227 | # string value Facet text to query 228 | # return void 229 | def facetText(self, value): 230 | 231 | self.addParameter('facet', value) 232 | 233 | 234 | # string value Author Keywords, IEEE Terms, and Mesh Terms to query 235 | # return void 236 | def indexTerms(self, value): 237 | 238 | self.addParameter('index_terms', value) 239 | 240 | 241 | # string value ISBN (International Standard Book Number) to query 242 | # return void 243 | def isbn(self, value): 244 | 245 | self.addParameter('isbn', value) 246 | 247 | 248 | # string value ISSN (International Standard Serial number) to query 249 | # return void 250 | def issn(self, value): 251 | 252 | self.addParameter('issn', value) 253 | 254 | 255 | # string value Issue number to query 256 | # return void 257 | def issueNumber(self, value): 258 | 259 | self.addParameter('is_number', value) 260 | 261 | 262 | # string value Text to query across metadata fields and the abstract 263 | # return void 264 | def metaDataText(self, value): 265 | 266 | self.addParameter('meta_data', value) 267 | 268 | 269 | # string value Publication Facet text to query 270 | # return void 271 | def publicationFacetText(self, value): 272 | 273 | self.addParameter('d-year', value) 274 | 275 | 276 | # string value Publisher Facet text to query 277 | # return void 278 | def publisherFacetText(self, value): 279 | 280 | self.addParameter('d-publisher', value) 281 | 282 | 283 | # string value Publication title to query 284 | # return void 285 | def publicationTitle(self, value): 286 | 287 | self.addParameter('publication_title', value) 288 | 289 | 290 | # string or number value Publication year to query 291 | # return void 292 | def publicationYear(self, value): 293 | 294 | self.addParameter('publication_year', value) 295 | 296 | 297 | # string value Text to query across metadata fields, abstract and document text 298 | # return void 299 | def queryText(self, value): 300 | 301 | self.addParameter('querytext', value) 302 | 303 | 304 | # string 
value Thesaurus terms (IEEE Terms) to query 305 | # return void 306 | def thesaurusTerms(self, value): 307 | 308 | self.addParameter('thesaurus_terms', value) 309 | 310 | 311 | # add query parameter 312 | # string parameter Data field to query 313 | # string value Text to use in query 314 | # return void 315 | def addParameter(self, parameter, value): 316 | 317 | value = value.strip() 318 | 319 | if (len(value) > 0): 320 | 321 | self.parameters[parameter]= value 322 | 323 | # viable query criteria provided 324 | self.queryProvided = True 325 | 326 | # set flags based on parameter 327 | if (parameter == 'article_number'): 328 | 329 | self.usingArticleNumber = True 330 | 331 | if (parameter == 'boolean_text'): 332 | 333 | self.usingBoolean = True 334 | 335 | if (parameter == 'facet' or parameter == 'd-au' or parameter == 'd-year' or parameter == 'd-pubtype' or parameter == 'd-publisher'): 336 | 337 | self.usingFacet = True 338 | 339 | 340 | # Open Access document 341 | # string article Article number to query 342 | # return void 343 | def openAccess(self, article): 344 | 345 | self.usingOpenAccess = True 346 | self.queryProvided = True 347 | self.articleNumber(article) 348 | 349 | 350 | # calls the API 351 | # string debugMode If this mode is on (True) then output query and not data 352 | # return either raw result string, XML or JSON object, or array 353 | def callAPI(self, debugModeOff=True): 354 | 355 | if self.usingOpenAccess is True: 356 | 357 | str1 = self.buildOpenAccessQuery() 358 | 359 | else: 360 | 361 | str1 = self.buildQuery() 362 | 363 | if debugModeOff is False: 364 | 365 | return str1 366 | 367 | else: 368 | 369 | if self.queryProvided is False: 370 | print("No search criteria provided") 371 | data = {} 372 | try: 373 | data = self.queryAPI(str1) 374 | except urllib.error.HTTPError as ex: 375 | return data 376 | except Exception as ex: 377 | return data 378 | return data 379 | 380 | 381 | # creates the URL for the Open Access Document API call 382 | # return string: full URL for querying the API 383 | def buildOpenAccessQuery(self): 384 | 385 | url = self.openAccessEndPoint; 386 | url += str(self.parameters['article_number']) + '/fulltext' 387 | url += '?apikey=' + str(self.apiKey) 388 | url += '&format=' + str(self.outputType) 389 | 390 | return url 391 | 392 | 393 | # creates the URL for the non-Open Access Document API call 394 | # return string: full URL for querying the API 395 | def buildQuery(self): 396 | 397 | url = self.endPoint; 398 | 399 | url += '?apikey=' + str(self.apiKey) 400 | url += '&format=' + str(self.outputType) 401 | url += '&max_records=' + str(self.resultSetMax) 402 | url += '&start_record=' + str(self.startRecord) 403 | url += '&sort_order=' + str(self.sortOrder) 404 | url += '&sort_field=' + str(self.sortField) 405 | 406 | # add in search criteria 407 | # article number query takes priority over all others 408 | if (self.usingArticleNumber): 409 | 410 | url += '&article_number=' + str(self.parameters['article_number']) 411 | 412 | # boolean query 413 | elif (self.usingBoolean): 414 | 415 | url += '&querytext=(' + urllib.parse.quote_plus(self.parameters['boolean_text']) + ')' 416 | 417 | else: 418 | 419 | for key in self.parameters: 420 | 421 | if (self.usingFacet and self.facetApplied is False): 422 | 423 | url += '&querytext=' + urllib.parse.quote_plus(self.parameters[key]) + '&facet=' + key 424 | self.facetApplied = True 425 | 426 | else: 427 | 428 | url += '&' + key + '=' + urllib.parse.quote_plus(self.parameters[key]) 429 | 430 | 431 | # add in 
filters 432 | for key in self.filters: 433 | 434 | url += '&' + key + '=' + str(self.filters[key]) 435 | 436 | return url 437 | 438 | 439 | # creates the URL for the API call 440 | # string url Full URL to pass to API 441 | # return string: Results from API 442 | def queryAPI(self, url): 443 | try: 444 | headers = {'Content-type': 'application/json', 'Accept': 'application/json'} 445 | content = requests.get(url, headers=headers) 446 | except urllib.error.HTTPError as ex: 447 | return content 448 | except UnicodeEncodeError as ex: 449 | return content 450 | except urllib.error.URLError as ex: 451 | return content 452 | except Exception as ex: 453 | return content 454 | return content 455 | 456 | 457 | # formats the data returned by the API 458 | # string data Result string from API 459 | def formatData(self, data): 460 | 461 | if self.outputDataFormat == 'raw': 462 | return data 463 | 464 | elif self.outputDataFormat == 'object': 465 | 466 | if self.outputType == 'xml': 467 | obj = ET.ElementTree(ET.fromstring(data)) 468 | return obj 469 | 470 | else: 471 | obj = json.loads(data) 472 | return obj 473 | 474 | else: 475 | return data 476 | -------------------------------------------------------------------------------- /docs/configuration_guide.md: -------------------------------------------------------------------------------- 1 | # SaLS Configuration Guide 2 | 3 | ## Overview 4 | 5 | This guide provides comprehensive information about configuring SaLS (Semi-automatic Literature Survey) for your research needs. SaLS uses YAML configuration files to define search parameters, filters, and database selections. 6 | 7 | ## Table of Contents 8 | 9 | 1. [Basic Configuration Structure](#basic-configuration-structure) 10 | 2. [Required Parameters](#required-parameters) 11 | 3. [Optional Parameters](#optional-parameters) 12 | 4. [Query Syntax](#query-syntax) 13 | 5. [Database Configuration](#database-configuration) 14 | 6. [Filtering Options](#filtering-options) 15 | 7. [Best Practices](#best-practices) 16 | 8. [Troubleshooting](#troubleshooting) 17 | 9. [Configuration Templates](#configuration-templates) 18 | 19 | ## Basic Configuration Structure 20 | 21 | A SaLS configuration file is a YAML document with the following structure: 22 | 23 | ```yaml 24 | # Required parameters 25 | queries: [...] 26 | databases: [...] 27 | search_date: "YYYY-MM-DD" 28 | folder_name: "your_search_name" 29 | 30 | # Optional parameters 31 | start_date: "YYYY-MM-DD" 32 | end_date: "YYYY-MM-DD" 33 | syntactic_filters: [...] 34 | semantic_filters: [...] 
35 | synonyms: {...} 36 | ``` 37 | 38 | ## Required Parameters 39 | 40 | ### queries 41 | **Type**: List of dictionaries 42 | **Description**: Defines your search queries using boolean expressions 43 | **Format**: `[{query_name: "boolean_expression"}]` 44 | 45 | **Example**: 46 | ```yaml 47 | queries: 48 | - machine learning: "'machine learning' & 'edge computing'" 49 | - systems engineering: "'systems engineering' | 'SE'" 50 | ``` 51 | 52 | **Best Practices**: 53 | - Use descriptive names for your queries 54 | - Use quotes around multi-word terms 55 | - Combine related concepts with OR operators 56 | - Use AND operators to narrow down results 57 | 58 | ### databases 59 | **Type**: List of strings 60 | **Description**: Specifies which databases to search 61 | **Available Options**: `arxiv`, `semantic_scholar`, `springer`, `ieeexplore`, `scopus`, `core`, `crossref`, `europe_pmc`, `pubmed`, `openalex` 62 | 63 | **Example**: 64 | ```yaml 65 | databases: 66 | - arxiv # Open access, no API key needed 67 | - semantic_scholar # Open access, no API key needed 68 | - springer # Commercial, requires API key 69 | ``` 70 | 71 | **Note**: Some databases require API keys in `config.json`. See [Database Configuration](#database-configuration) for details. 72 | 73 | ### search_date 74 | **Type**: String (YYYY-MM-DD format) 75 | **Description**: Date when the search was performed (for organization purposes) 76 | **Example**: `search_date: 2024-12-15` 77 | 78 | ### folder_name 79 | **Type**: String 80 | **Description**: Name of the folder where results will be stored 81 | **Example**: `folder_name: my_literature_search` 82 | 83 | ## Optional Parameters 84 | 85 | ### start_date and end_date 86 | **Type**: String (YYYY-MM-DD format) 87 | **Description**: Date range to limit search results 88 | **Example**: 89 | ```yaml 90 | start_date: 2020-01-01 # Papers from 2020 onwards 91 | end_date: 2024-12-31 # Papers until end of 2024 92 | ``` 93 | 94 | **Benefits**: 95 | - Reduces search time 96 | - Focuses on recent research 97 | - Improves relevance for time-sensitive topics 98 | 99 | ### synonyms 100 | **Type**: Dictionary 101 | **Description**: Defines synonyms for query expansion to increase search coverage 102 | **Format**: `{term: [synonym1, synonym2, ...]}` 103 | 104 | **Example**: 105 | ```yaml 106 | machine learning: 107 | - ml 108 | - deep learning 109 | - neural networks 110 | - supervised learning 111 | ``` 112 | 113 | **Best Practices**: 114 | - Include abbreviations and alternative names 115 | - Add related concepts and terminology 116 | - Use domain-specific synonyms 117 | 118 | ### syntactic_filters 119 | **Type**: List of strings 120 | **Description**: Terms that must appear in paper content (AND logic) 121 | **Example**: 122 | ```yaml 123 | syntactic_filters: 124 | - edge computing 125 | - distributed systems 126 | - performance 127 | ``` 128 | 129 | **Use Cases**: 130 | - Filtering out irrelevant papers early 131 | - Ensuring specific concepts are covered 132 | - Improving result relevance 133 | 134 | ### semantic_filters 135 | **Type**: List of dictionaries 136 | **Description**: AI-powered similarity matching using detailed descriptions 137 | **Format**: `[{filter_name: "detailed_description"}]` 138 | 139 | **Example**: 140 | ```yaml 141 | semantic_filters: 142 | - edge computing: "Research on edge computing, fog computing, and distributed edge systems including resource management, placement strategies, and performance optimization" 143 | - ml systems: "Papers about machine learning systems in 
production environments including deployment, monitoring, scaling, and operational challenges" 144 | ``` 145 | 146 | **Best Practices**: 147 | - Be specific and descriptive 148 | - Include key concepts and requirements 149 | - Focus on what you're looking for, not what you want to exclude 150 | 151 | ## Query Syntax 152 | 153 | SaLS supports a flexible boolean query syntax with the following operators: 154 | 155 | ### Basic Operators 156 | - `&` or `AND` - AND operator (both terms must be present) 157 | - `|` or `OR` - OR operator (either term can be present) 158 | - `&&` or `||` - Alternative syntax for AND/OR 159 | 160 | ### Advanced Features 161 | - **Parentheses**: Group expressions for complex logic 162 | - **Quotes**: Preserve multi-word terms as phrases 163 | - **Legacy Support**: `¦` character for OR operations 164 | 165 | ### Examples 166 | 167 | **Simple AND**: 168 | ```yaml 169 | queries: 170 | - basic: "'machine learning' & 'edge computing'" 171 | ``` 172 | 173 | **Complex Boolean Expression**: 174 | ```yaml 175 | queries: 176 | - complex: "'machine learning' & ('edge computing' | 'fog computing') & ('performance' | 'optimization')" 177 | ``` 178 | 179 | **Grouped Logic**: 180 | ```yaml 181 | queries: 182 | - grouped: "('deep learning' | 'neural networks') & ('computer vision' | 'image processing')" 183 | ``` 184 | 185 | ## Database Configuration 186 | 187 | ### Open Access Databases (No API Key Required) 188 | - **arXiv**: Excellent for recent preprints and open access papers 189 | - **Semantic Scholar**: Good for citation analysis and impact assessment 190 | 191 | ### Commercial Databases (API Key Required) 192 | - **Springer Nature**: High-quality journals and books 193 | - **IEEE Xplore**: Excellent for engineering and computer science 194 | - **Scopus**: Comprehensive coverage across all disciplines 195 | - **CORE**: Open access repository aggregator 196 | 197 | ### API Key Setup 198 | 1. Create a `config.json` file in the project root 199 | 2. Add your API keys: 200 | ```json 201 | { 202 | "api_access_springer": "YOUR_SPRINGER_API_KEY", 203 | "api_access_ieee": "YOUR_IEEE_API_KEY", 204 | "api_access_elsevier": "YOUR_SCOPUS_API_KEY", 205 | "api_access_core": "YOUR_CORE_API_KEY" 206 | } 207 | ``` 208 | 209 | **Note**: Only add keys for databases you plan to use. 210 | 211 | ## Filtering Options 212 | 213 | ### Two-Stage Filtering Process 214 | 215 | 1. **Syntactic Filtering**: Basic text matching using your specified terms 216 | 2. **Semantic Filtering**: AI-powered similarity matching using BERT models 217 | 218 | ### Filtering Strategy 219 | 220 | **For High Precision (Fewer, More Relevant Results)**: 221 | - Use more specific queries 222 | - Add more syntactic filters 223 | - Use date ranges to focus on recent work 224 | 225 | **For High Recall (More Results, May Include Less Relevant)**: 226 | - Use broader queries with OR operators 227 | - Fewer syntactic filters 228 | - No date restrictions 229 | 230 | ## Best Practices 231 | 232 | ### 1. Start Simple 233 | - Begin with basic queries 234 | - Add complexity gradually 235 | - Test with open databases first 236 | 237 | ### 2. Query Design 238 | - Use specific terminology from your field 239 | - Include synonyms and abbreviations 240 | - Balance between precision and recall 241 | 242 | ### 3. Database Selection 243 | - Start with open databases (arxiv, semantic_scholar) 244 | - Add commercial databases for comprehensive coverage 245 | - Consider field-specific database strengths 246 | 247 | ### 4. 
Filtering Strategy 248 | - Use syntactic filters for precision 249 | - Use semantic filters for recall 250 | - Iterate based on initial results 251 | 252 | ### 5. Date Management 253 | - Set reasonable date ranges for your research area 254 | - Consider field evolution speed 255 | - Balance between recency and comprehensiveness 256 | 257 | ## Troubleshooting and Error Recovery 258 | 259 | ### Configuration Error Recovery 260 | 261 | SaLS now provides intelligent error recovery that helps you fix configuration issues quickly and continue with your research. 262 | 263 | #### Error Severity Levels 264 | 265 | **🔴 Critical Errors** - Pipeline cannot continue 266 | - Missing or invalid queries (required for search) 267 | - Malformed query syntax 268 | - These must be fixed before the pipeline can run 269 | 270 | **🟡 Warnings** - Pipeline can continue with defaults 271 | - Missing databases (defaults to open databases) 272 | - Missing search_date (defaults to current date) 273 | - Missing folder_name (defaults to filename-based) 274 | - Invalid date formats (defaults to reasonable values) 275 | - Missing filters (defaults to empty lists) 276 | 277 | #### Automatic Fallbacks 278 | 279 | When warnings are detected, SaLS automatically applies sensible defaults: 280 | 281 | ```yaml 282 | # If databases are missing, SaLS uses: 283 | databases: [arxiv, semantic_scholar] 284 | 285 | # If search_date is missing, SaLS uses: 286 | search_date: [current date] 287 | 288 | # If folder_name is missing, SaLS uses: 289 | folder_name: [filename without .yaml extension] 290 | 291 | # If filters are missing, SaLS uses: 292 | syntactic_filters: [] 293 | semantic_filters: [] 294 | ``` 295 | 296 | #### Recovery Suggestions 297 | 298 | For each issue, SaLS provides: 299 | - **Clear description** of what's wrong 300 | - **Specific fix** instructions 301 | - **Working examples** to copy-paste 302 | - **Default values** that will be used 303 | 304 | ### Common Issues and Solutions 305 | 306 | #### Configuration Validation Errors 307 | **Problem**: Configuration validation fails with specific error messages 308 | **Solution**: Follow the error message guidance and check: 309 | - YAML syntax (proper indentation) 310 | - Required field formats 311 | - Date format (YYYY-MM-DD) 312 | - Database name spelling 313 | 314 | **Recovery**: SaLS will show exactly what's wrong and how to fix it 315 | 316 | #### Missing Required Fields 317 | **Problem**: Critical fields like queries are missing 318 | **Solution**: Add the missing sections following the provided examples 319 | 320 | **Recovery**: SaLS prevents pipeline execution and guides you to add required fields 321 | 322 | #### Missing Optional Fields 323 | **Problem**: Optional fields like databases or search_date are missing 324 | **Solution**: Either add them or let SaLS use sensible defaults 325 | 326 | **Recovery**: SaLS continues with defaults and shows what was applied 327 | 328 | #### Invalid Date Formats 329 | **Problem**: Dates are in wrong format (e.g., 2020/01/01) 330 | **Solution**: Use YYYY-MM-DD format (e.g., 2020-01-01) 331 | 332 | **Recovery**: SaLS suggests the correct format and provides examples 333 | 334 | #### Invalid Database Names 335 | **Problem**: Unknown database specified 336 | **Solution**: Use only valid database names from the supported list 337 | 338 | **Recovery**: SaLS shows all valid databases and continues with valid ones 339 | 340 | #### Too Many Results 341 | **Problem**: Search returns too many papers 342 | **Solutions**: 343 | - Add more 
specific terms to queries 344 | - Use syntactic filters 345 | - Set date ranges 346 | - Use more specific semantic filter descriptions 347 | 348 | #### Too Few Results 349 | **Problem**: Search returns too few papers 350 | **Solutions**: 351 | - Broaden queries with OR operators 352 | - Add synonyms 353 | - Remove overly restrictive filters 354 | - Check date ranges 355 | 356 | #### API Errors 357 | **Problem**: Commercial database searches fail 358 | **Solutions**: 359 | - Verify API keys in `config.json` 360 | - Check API key validity 361 | - Use open databases as fallback 362 | - Check rate limiting 363 | 364 | #### Semantic Filtering Issues 365 | **Problem**: Semantic filters don't work as expected 366 | **Solutions**: 367 | - Make descriptions more specific and detailed 368 | - Include key concepts and requirements 369 | - Focus on what you want, not what you want to exclude 370 | 371 | ### Error Message Examples 372 | 373 | #### Critical Error (Pipeline Stops) 374 | ``` 375 | Configuration error: 'queries' section is missing in config.yaml 376 | 377 | 🔴 CRITICAL ERRORS - Pipeline cannot continue: 378 | 379 | ❌ Missing queries section 380 | Fix: Add a queries section with your search terms 381 | Example: 382 | queries: 383 | - augmented reality: "'augmented reality' & 'edge'" 384 | - machine learning: "'machine learning' & 'systems'" 385 | ``` 386 | 387 | #### Warnings (Pipeline Continues with Defaults) 388 | ``` 389 | Configuration validation completed with warnings: 390 | Configuration warning: 'databases' section is missing 391 | Configuration warning: 'search_date' is missing 392 | 393 | 🟡 WARNINGS - Pipeline will continue with defaults where possible: 394 | 395 | ⚠️ Missing databases section 396 | Fix: Add databases section or use default open databases 397 | Default: ['arxiv', 'semantic_scholar'] 398 | Example: 399 | databases: 400 | - arxiv # Open access, no API key needed 401 | - semantic_scholar # Open access, no API key needed 402 | 403 | ⚠️ Missing search_date 404 | Fix: Add search_date or use current date 405 | Default: current date 406 | Example: 407 | search_date: 2024-12-15 408 | ``` 409 | 410 | ### Best Practices for Error Recovery 411 | 412 | 1. **Start with the error messages** - they provide specific guidance 413 | 2. **Fix critical errors first** - these prevent the pipeline from running 414 | 3. **Review warnings** - understand what defaults will be applied 415 | 4. **Use the provided examples** - copy-paste working configurations 416 | 5. **Test incrementally** - fix one issue at a time 417 | 6. **Let SaLS help** - use the automatic fallbacks when appropriate 418 | 419 | ### Getting Help with Configuration Issues 420 | 421 | If you encounter persistent issues: 422 | 423 | 1. **Check the error messages** - they provide specific guidance 424 | 2. **Review the configuration guide** - covers common scenarios 425 | 3. **Use the templates** - working examples to build upon 426 | 4. **Start simple** - add complexity gradually 427 | 5. 
**Test with open databases** - no API key requirements 428 | 429 | ## Configuration Templates 430 | 431 | SaLS provides several configuration templates to get you started: 432 | 433 | ### Basic Template 434 | - **File**: `templates/basic_search_template.yaml` 435 | - **Use Case**: Simple literature searches 436 | - **Features**: Basic queries, synonyms, open databases 437 | 438 | ### Advanced Template 439 | - **File**: `templates/advanced_research_template.yaml` 440 | - **Use Case**: Complex research projects, systematic reviews 441 | - **Features**: All SaLS features, comprehensive examples 442 | 443 | ### Machine Learning Template 444 | - **File**: `templates/machine_learning_template.yaml` 445 | - **Use Case**: ML/AI research 446 | - **Features**: ML-specific terminology, subfield examples 447 | 448 | ### Using Templates 449 | 1. Copy the appropriate template file 450 | 2. Rename it to your project 451 | 3. Modify the values according to your research needs 452 | 4. Update the `search_date` and `folder_name` 453 | 5. Test with a small search first 454 | 455 | ## Getting Help 456 | 457 | If you encounter issues: 458 | 459 | 1. **Check the error messages** - they provide specific guidance 460 | 2. **Review the configuration guide** - covers common scenarios 461 | 3. **Use the templates** - working examples to build upon 462 | 4. **Start simple** - add complexity gradually 463 | 5. **Test with open databases** - no API key requirements 464 | 465 | ## Advanced Configuration 466 | 467 | ### Custom Fields and Types 468 | ```yaml 469 | # Advanced users can customize search fields and types 470 | fields: ['title', 'abstract', 'keywords', 'full_text'] 471 | types: ['conferences', 'journals', 'preprints', 'reports'] 472 | ``` 473 | 474 | ### Performance Optimization 475 | - Use date ranges to limit search scope 476 | - Start with fewer databases and add more as needed 477 | - Use syntactic filters to reduce processing time 478 | - Test queries with small date ranges first 479 | 480 | --- 481 | 482 | *This guide covers the essential configuration options for SaLS. For more advanced usage, refer to the code documentation and examples in the templates directory.* 483 | -------------------------------------------------------------------------------- /util/error_standards.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Error Handling Standards for SaLS Project. 4 | 5 | This module defines consistent error handling patterns, logging standards, 6 | and user experience guidelines across the entire SaLS codebase. 7 | 8 | Standards ensure: 9 | 1. Consistent error categorization (CRITICAL, ERROR, WARNING, INFO) 10 | 2. Uniform logging message formats 11 | 3. Standardized user-facing error messages 12 | 4. Consistent exception handling patterns 13 | 5. 
Unified error recovery suggestions 14 | """ 15 | 16 | import logging 17 | import traceback 18 | from typing import Dict, List, Optional, Tuple, Union 19 | from enum import Enum 20 | 21 | 22 | class ErrorSeverity(Enum): 23 | """Standard error severity levels for consistent categorization.""" 24 | CRITICAL = "CRITICAL" # Pipeline cannot continue 25 | ERROR = "ERROR" # Operation failed, but pipeline can continue 26 | WARNING = "WARNING" # Issue detected, operation continues with defaults 27 | INFO = "INFO" # Informational message 28 | DEBUG = "DEBUG" # Debug information 29 | 30 | 31 | class ErrorCategory(Enum): 32 | """Standard error categories for consistent classification.""" 33 | CONFIGURATION = "CONFIGURATION" # Configuration file issues 34 | API = "API" # External API failures 35 | NETWORK = "NETWORK" # Network/connection issues 36 | DATA = "DATA" # Data processing issues 37 | FILE = "FILE" # File I/O operations 38 | VALIDATION = "VALIDATION" # Data validation failures 39 | SYSTEM = "SYSTEM" # System-level issues 40 | USER_INPUT = "USER_INPUT" # User input validation 41 | RESOURCE = "RESOURCE" # Resource limitations (quotas, etc.) 42 | PIPELINE = "PIPELINE" # Pipeline orchestration and step execution 43 | 44 | 45 | class ErrorContext: 46 | """Standard error context information for consistent error reporting.""" 47 | 48 | def __init__(self, 49 | module: str, 50 | function: str, 51 | operation: str, 52 | severity: ErrorSeverity, 53 | category: ErrorCategory, 54 | user_facing: bool = True): 55 | self.module = module 56 | self.function = function 57 | self.operation = operation 58 | self.severity = severity 59 | self.category = category 60 | self.user_facing = user_facing 61 | self.timestamp = None # Will be set by error handler 62 | self.additional_context: Dict = {} 63 | 64 | def add_context(self, key: str, value: str) -> None: 65 | """Add additional context information.""" 66 | self.additional_context[key] = value 67 | 68 | def get_formatted_context(self) -> str: 69 | """Get formatted context string for logging.""" 70 | context_parts = [ 71 | f"Module: {self.module}", 72 | f"Function: {self.function}", 73 | f"Operation: {self.operation}", 74 | f"Category: {self.category.value}", 75 | f"Severity: {self.severity.value}" 76 | ] 77 | 78 | for key, value in self.additional_context.items(): 79 | context_parts.append(f"{key}: {value}") 80 | 81 | return " | ".join(context_parts) 82 | 83 | 84 | class ErrorMessage: 85 | """Standard error message format for consistent error reporting.""" 86 | 87 | def __init__(self, 88 | context: ErrorContext, 89 | error_type: str, 90 | error_description: str, 91 | recovery_suggestion: Optional[str] = None, 92 | next_steps: Optional[List[str]] = None, 93 | exception_type: Optional[str] = None, 94 | exception_message: Optional[str] = None, 95 | short_traceback: Optional[str] = None): 96 | self.context = context 97 | self.error_type = error_type 98 | self.error_description = error_description 99 | self.recovery_suggestion = recovery_suggestion 100 | self.next_steps = next_steps or [] 101 | self.exception_type = exception_type 102 | self.exception_message = exception_message 103 | self.short_traceback = short_traceback 104 | 105 | def get_log_message(self) -> str: 106 | """Get formatted message for logging.""" 107 | message_parts = [ 108 | f"[{self.context.severity.value}] {self.error_type}", 109 | f"Description: {self.error_description}", 110 | f"Context: {self.context.get_formatted_context()}" 111 | ] 112 | 113 | if self.recovery_suggestion: 114 | 
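# Illustrative comment (added; not in the original source). With the assumed
# values from the example usage at the end of this module (module="my_module",
# function="my_function", operation="file_reading", FILE category, ERROR
# severity), the pipe-joined log line produced by this method looks roughly like:
#   [ERROR] FileNotFoundError | Description: File not found | Context:
#   Module: my_module | Function: my_function | Operation: file_reading |
#   Category: FILE | Severity: ERROR | Recovery: Check file path and ensure file exists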
message_parts.append(f"Recovery: {self.recovery_suggestion}") 115 | 116 | return " | ".join(message_parts) 117 | 118 | def get_user_message(self) -> str: 119 | """Get user-friendly error message.""" 120 | if not self.context.user_facing: 121 | return f"An error occurred: {self.error_description}" 122 | 123 | message_parts = [] 124 | 125 | # Add severity indicator 126 | if self.context.severity == ErrorSeverity.CRITICAL: 127 | message_parts.append("🔴 CRITICAL ERROR") 128 | elif self.context.severity == ErrorSeverity.ERROR: 129 | message_parts.append("❌ ERROR") 130 | elif self.context.severity == ErrorSeverity.WARNING: 131 | message_parts.append("⚠️ WARNING") 132 | else: 133 | message_parts.append("ℹ️ INFO") 134 | 135 | # Add main message 136 | message_parts.append(self.error_description) 137 | 138 | # Add concise details 139 | message_parts.append(f"Details: {self.error_type} | Where: {self.context.module}.{self.context.function} ({self.context.operation})") 140 | if self.exception_type: 141 | message_parts.append(f"Error: {self.exception_type}: {self.exception_message}") 142 | 143 | # Add recovery suggestion 144 | if self.recovery_suggestion: 145 | message_parts.append(f"\n💡 {self.recovery_suggestion}") 146 | 147 | # Add next steps 148 | if self.next_steps: 149 | message_parts.append("\n📋 Next steps:") 150 | for i, step in enumerate(self.next_steps, 1): 151 | message_parts.append(f" {i}. {step}") 152 | 153 | return "\n".join(message_parts) 154 | 155 | 156 | class ErrorHandler: 157 | """Standard error handler for consistent error processing.""" 158 | 159 | def __init__(self, logger: logging.Logger): 160 | # Accept either a standard logging.Logger or a wrapper with `.logger` 161 | try: 162 | if not isinstance(logger, logging.Logger) and hasattr(logger, 'logger') and isinstance(logger.logger, logging.Logger): 163 | self.logger = logger.logger 164 | else: 165 | self.logger = logger 166 | except Exception: 167 | self.logger = logging.getLogger('sals_pipeline') 168 | 169 | def handle_error(self, 170 | error: Exception, 171 | context: ErrorContext, 172 | error_type: str, 173 | error_description: str, 174 | recovery_suggestion: Optional[str] = None, 175 | next_steps: Optional[List[str]] = None) -> ErrorMessage: 176 | """Handle an error according to SaLS standards.""" 177 | 178 | # Create error message 179 | exc_type_name = type(error).__name__ if error else None 180 | exc_message = str(error) if error else None 181 | short_tb = None 182 | try: 183 | if error and getattr(error, "__traceback__", None): 184 | tb_frames = traceback.extract_tb(error.__traceback__) 185 | last_frames = tb_frames[-3:] if len(tb_frames) > 3 else tb_frames 186 | formatted_frames = traceback.format_list(last_frames) 187 | short_tb = "".join(formatted_frames).rstrip() 188 | if exc_type_name: 189 | short_tb = f"{short_tb}\n{exc_type_name}: {exc_message}" 190 | except Exception: 191 | short_tb = None 192 | error_msg = ErrorMessage( 193 | context=context, 194 | error_type=error_type, 195 | error_description=error_description, 196 | recovery_suggestion=recovery_suggestion, 197 | next_steps=next_steps, 198 | exception_type=exc_type_name, 199 | exception_message=exc_message, 200 | short_traceback=short_tb 201 | ) 202 | 203 | # Log according to severity (include traceback for ERROR/CRITICAL) 204 | if context.severity == ErrorSeverity.CRITICAL: 205 | if error: 206 | self.logger.critical(error_msg.get_log_message(), exc_info=True) 207 | else: 208 | self.logger.critical(error_msg.get_log_message()) 209 | elif context.severity == 
ErrorSeverity.ERROR: 210 | if error: 211 | self.logger.error(error_msg.get_log_message(), exc_info=True) 212 | else: 213 | self.logger.error(error_msg.get_log_message()) 214 | elif context.severity == ErrorSeverity.WARNING: 215 | self.logger.warning(error_msg.get_log_message()) 216 | elif context.severity == ErrorSeverity.INFO: 217 | self.logger.info(error_msg.get_log_message()) 218 | else: # DEBUG 219 | self.logger.debug(error_msg.get_log_message()) 220 | 221 | return error_msg 222 | 223 | def log_and_print(self, 224 | error_msg: ErrorMessage, 225 | print_to_console: bool = True) -> None: 226 | """Log error and optionally print to console for user-facing errors.""" 227 | 228 | # Always log 229 | if error_msg.context.severity == ErrorSeverity.CRITICAL: 230 | self.logger.critical(error_msg.get_log_message()) 231 | elif error_msg.context.severity == ErrorSeverity.ERROR: 232 | self.logger.error(error_msg.get_log_message()) 233 | elif error_msg.context.severity == ErrorSeverity.WARNING: 234 | self.logger.warning(error_msg.get_log_message()) 235 | elif error_msg.context.severity == ErrorSeverity.INFO: 236 | self.logger.info(error_msg.get_log_message()) 237 | else: # DEBUG 238 | self.logger.debug(error_msg.get_log_message()) 239 | 240 | # Print to console for user-facing errors 241 | if print_to_console and error_msg.context.user_facing: 242 | # Try to locate log file path from logger handlers 243 | log_file_path = "" 244 | try: 245 | for h in getattr(self.logger, 'handlers', []): 246 | if hasattr(h, 'baseFilename'): 247 | log_file_path = h.baseFilename 248 | break 249 | except Exception: 250 | log_file_path = "" 251 | user_message = error_msg.get_user_message() 252 | # Append short traceback snippet if available 253 | if error_msg.short_traceback: 254 | indented = "\n".join(" " + line.rstrip() for line in error_msg.short_traceback.splitlines()) 255 | user_message += f"\n🧵 Traceback (last 3 frames):\n{indented}" 256 | if log_file_path: 257 | user_message += f"\n📄 See logs for full traceback: {log_file_path}" 258 | print(user_message) 259 | 260 | 261 | # Standard error messages for common scenarios 262 | STANDARD_ERROR_MESSAGES = { 263 | "file_not_found": { 264 | "description": "File not found", 265 | "recovery": "Check file path and ensure file exists", 266 | "next_steps": [ 267 | "Verify the file path is correct", 268 | "Check file permissions", 269 | "Ensure the file exists in the specified location" 270 | ] 271 | }, 272 | "invalid_configuration": { 273 | "description": "Invalid configuration detected", 274 | "recovery": "Review configuration file and fix validation errors", 275 | "next_steps": [ 276 | "Check the configuration file format", 277 | "Verify all required fields are present", 278 | "Ensure field values are in correct format" 279 | ] 280 | }, 281 | "api_quota_exceeded": { 282 | "description": "API quota exceeded", 283 | "recovery": "Wait for quota reset or use alternative search strategies", 284 | "next_steps": [ 285 | "Wait for daily quota reset", 286 | "Reduce search scope using filters", 287 | "Use date ranges to limit results" 288 | ] 289 | }, 290 | "network_timeout": { 291 | "description": "Network request timed out", 292 | "recovery": "Check network connection and retry", 293 | "next_steps": [ 294 | "Verify internet connection", 295 | "Check firewall settings", 296 | "Retry the operation" 297 | ] 298 | }, 299 | "data_validation_failed": { 300 | "description": "Data validation failed", 301 | "recovery": "Review input data and fix validation issues", 302 | "next_steps": [ 303 | 
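# Clarifying comment (added; not in the original source): the database clients
# in clients/core.py, clients/arxiv.py and clients/springer.py look this entry
# up via get_standard_error_info("data_validation_failed") whenever an API
# response cannot be parsed or retrieved papers cannot be filtered or cleaned.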
"Check data format and content", 304 | "Verify required fields are present", 305 | "Ensure data meets validation criteria" 306 | ] 307 | }, 308 | "pipeline_step_failed": { 309 | "description": "Pipeline step execution failed", 310 | "recovery": "Review the step that failed and check for configuration or data issues", 311 | "next_steps": [ 312 | "Check the logs for detailed error information", 313 | "Verify input data for the failed step", 314 | "Review step configuration parameters", 315 | "Consider running the pipeline from the failed step" 316 | ] 317 | }, 318 | "pipeline_execution_failed": { 319 | "description": "Pipeline execution failed", 320 | "recovery": "Review the pipeline execution and check for critical errors", 321 | "next_steps": [ 322 | "Check the logs for detailed error information", 323 | "Verify all configuration parameters", 324 | "Check system resources and permissions", 325 | "Review input data quality and format" 326 | ] 327 | }, 328 | "configuration_fallback_failed": { 329 | "description": "Configuration fallback application failed", 330 | "recovery": "Review configuration parameters and apply fallbacks manually", 331 | "next_steps": [ 332 | "Check configuration file format and content", 333 | "Verify parameter types and values", 334 | "Apply missing parameters manually", 335 | "Restart the pipeline with corrected configuration" 336 | ] 337 | } 338 | } 339 | 340 | 341 | def create_error_context(module: str, 342 | function: str, 343 | operation: str, 344 | severity: ErrorSeverity, 345 | category: ErrorCategory, 346 | user_facing: bool = True) -> ErrorContext: 347 | """Create a standardized error context.""" 348 | return ErrorContext( 349 | module=module, 350 | function=function, 351 | operation=operation, 352 | severity=severity, 353 | category=category, 354 | user_facing=user_facing 355 | ) 356 | 357 | 358 | def get_standard_error_info(error_key: str) -> Dict: 359 | """Get standard error information for common error types.""" 360 | return STANDARD_ERROR_MESSAGES.get(error_key, { 361 | "description": "An error occurred", 362 | "recovery": "Review the error details and take appropriate action", 363 | "next_steps": ["Check the logs for detailed information", "Review the operation that failed"] 364 | }) 365 | 366 | 367 | # Example usage: 368 | """ 369 | # In your module: 370 | from util.error_standards import ( 371 | ErrorHandler, create_error_context, ErrorSeverity, ErrorCategory, 372 | get_standard_error_info 373 | ) 374 | 375 | # Create error handler 376 | error_handler = ErrorHandler(logger) 377 | 378 | # Handle an error 379 | try: 380 | # Your operation here 381 | pass 382 | except FileNotFoundError as e: 383 | context = create_error_context( 384 | module="my_module", 385 | function="my_function", 386 | operation="file_reading", 387 | severity=ErrorSeverity.ERROR, 388 | category=ErrorCategory.FILE 389 | ) 390 | 391 | error_info = get_standard_error_info("file_not_found") 392 | error_msg = error_handler.handle_error( 393 | error=e, 394 | context=context, 395 | error_type="FileNotFoundError", 396 | error_description=error_info["description"], 397 | recovery_suggestion=error_info["recovery"], 398 | next_steps=error_info["next_steps"] 399 | ) 400 | 401 | # Log and print to console 402 | error_handler.log_and_print(error_msg, print_to_console=True) 403 | """ 404 | -------------------------------------------------------------------------------- /clients/core.py: -------------------------------------------------------------------------------- 1 | import time 2 | import 
pandas as pd 3 | import json 4 | from .apis.generic import Generic 5 | from .base_client import DatabaseClient 6 | from os.path import exists 7 | from util import util 8 | from tqdm import tqdm 9 | import logging 10 | from util.error_standards import ( 11 | ErrorHandler, create_error_context, ErrorSeverity, ErrorCategory, 12 | get_standard_error_info 13 | ) 14 | from util.logging_standards import LogCategory 15 | 16 | 17 | class CoreClient(DatabaseClient): 18 | """ 19 | Refactored CORE client using the Template Method pattern. 20 | """ 21 | 22 | def __init__(self): 23 | super().__init__( 24 | database_name='core', 25 | max_papers=1000, 26 | waiting_time=2, 27 | max_retries=3, 28 | quota=1000 29 | ) 30 | self.api_url = 'https://api.core.ac.uk/v3/search/works' 31 | self.client_fields = {'title': 'title', 'abstract': 'abstract'} 32 | self.client = Generic() 33 | 34 | # Load API access from config 35 | if exists('./config.json'): 36 | with open("./config.json", "r") as file: 37 | config = json.load(file) 38 | if 'api_access_core' in config: 39 | self.api_access = config['api_access_core'] 40 | else: 41 | self.api_access = '' 42 | else: 43 | self.api_access = '' 44 | 45 | def _has_api_access(self) -> bool: 46 | """Check if CORE API access is available.""" 47 | return self.api_access != '' 48 | 49 | def _plan_requests(self, query, syntactic_filters, synonyms, fields, types, dates, start_date, end_date) -> pd.DataFrame: 50 | """Plan the API requests for CORE.""" 51 | # Extract query value from the query dictionary 52 | query_name = list(query.keys())[0] 53 | query_value = query[query_name] 54 | 55 | # Build query parameters 56 | c_fields = [] 57 | for field in fields: 58 | if field in self.client_fields: 59 | c_fields.append(self.client_fields[field]) 60 | 61 | parameters = { 62 | 'query': query_value, 63 | 'syntactic_filters': syntactic_filters, 64 | 'synonyms': synonyms, 65 | 'fields': c_fields, 66 | 'types': types 67 | } 68 | 69 | # Create initial request to get total count 70 | request = self._create_request(parameters, dates, start_date, end_date) 71 | headers = {'Authorization': 'Bearer ' + self.api_access} 72 | raw_papers = self._retry_request(self.client.request, self.api_url, 'post', request, headers) 73 | expected_papers = self._get_expected_papers(raw_papers) 74 | 75 | self.logger.info(LogCategory.DATABASE, "core", "_plan_requests", f"Expected papers from {self.database_name}: {expected_papers}...") 76 | 77 | # Calculate number of requests needed 78 | times = int(expected_papers / self.max_papers) - 1 79 | mod = int(expected_papers) % self.max_papers 80 | if mod > 0: 81 | times = times + 1 82 | 83 | # Check quota constraints 84 | if times >= self.quota: 85 | self.logger.info(LogCategory.DATABASE, "core", "_plan_requests", f"The number of expected papers requires {times} requests which exceeds the {self.database_name} quota of {self.quota} requests per day.") 86 | if len(syntactic_filters) > 0: 87 | self.logger.info(LogCategory.DATABASE, "core", "_plan_requests", "Trying to reduce the number of requests using syntactic filters.") 88 | que = '' 89 | for word in syntactic_filters: 90 | que = que.replace('last', ' ') 91 | que = que + "'" + word + "' last" 92 | que = que.replace(' last', '') 93 | parameters['query'] = que 94 | request = self._create_request(parameters, dates, start_date, end_date) 95 | headers = {'Authorization': 'Bearer ' + self.api_access} 96 | raw_papers = self._retry_request(self.client.request, self.api_url, 'post', request, headers) 97 | expected_papers = 
self._get_expected_papers(raw_papers) 98 | self.logger.info(LogCategory.DATABASE, "core", "_plan_requests", f"Expected papers from {self.database_name} using syntactic filters: {expected_papers}...") 99 | times = int(expected_papers / self.max_papers) - 1 100 | mod = int(expected_papers) % self.max_papers 101 | if mod > 0: 102 | times = times + 1 103 | if times >= self.quota: 104 | self.logger.info(LogCategory.DATABASE, "core", "_plan_requests", f"The number of expected papers requires {times} requests which exceeds the {self.database_name} quota of {self.quota} requests per day.") 105 | self.logger.info(LogCategory.DATABASE, "core", "_plan_requests", "Skipping to next repository. Try to redefine your search queries and syntactic filters. Using dates to limit your search can help in case you are not.") 106 | return pd.DataFrame() 107 | else: 108 | self.logger.info(LogCategory.DATABASE, "core", "_plan_requests", "Skipping to next repository. Please use syntactic filters to avoid this problem. Using dates to limit your search can help in case you are not.") 109 | return pd.DataFrame() 110 | 111 | # Execute requests 112 | parameters['expected_papers'] = expected_papers 113 | papers = self._execute_requests(query, parameters, dates, start_date, end_date) 114 | return papers 115 | 116 | def _execute_requests(self, query, parameters, dates, start_date, end_date): 117 | """Execute the planned requests to retrieve papers.""" 118 | papers = pd.DataFrame() 119 | times = int(parameters.get('expected_papers', 0) / self.max_papers) - 1 120 | mod = int(parameters.get('expected_papers', 0) % self.max_papers) 121 | if mod > 0: 122 | times = times + 1 123 | 124 | for t in tqdm(range(0, times + 1)): 125 | time.sleep(self.waiting_time) 126 | start = self.max_papers * t 127 | request = self._create_request(parameters, dates, start_date, end_date) 128 | request['from'] = start 129 | headers = {'Authorization': 'Bearer ' + self.api_access} 130 | 131 | raw_papers = self._retry_request(self.client.request, self.api_url, 'post', request, headers) 132 | 133 | if raw_papers is None: 134 | continue 135 | 136 | papers_request = self._process_raw_papers(query, raw_papers) 137 | if len(papers) == 0: 138 | papers = papers_request 139 | else: 140 | papers = pd.concat([papers, papers_request]) 141 | 142 | return papers 143 | 144 | def _create_request(self, parameters, dates, start_date, end_date): 145 | """Create the API request for CORE.""" 146 | start_year = start_date.year 147 | end_year = end_date.year 148 | query = self.client.core_query(parameters) 149 | if dates: 150 | query = '(yearPublished>=' + str(start_year) + ' AND yearPublished<=' + str(end_year) + ') AND ' + query 151 | request = { 152 | 'q': query, 153 | 'limit': self.max_papers, 154 | 'offset': 0 155 | } 156 | 157 | return request 158 | 159 | def _get_expected_papers(self, raw_papers): 160 | """Get the expected number of papers from the API response.""" 161 | total = 0 162 | if raw_papers.status_code == 200: 163 | try: 164 | json_results = json.loads(raw_papers.text) 165 | total = int(json_results['totalHits']) 166 | except (json.JSONDecodeError, KeyError) as e: 167 | # User-friendly message explaining what's happening 168 | context = create_error_context( 169 | "core", "_get_expected_papers", 170 | ErrorSeverity.WARNING, 171 | ErrorCategory.DATA, 172 | f"Data parsing error in CORE response: {type(e).__name__}: {str(e)}" 173 | ) 174 | error_info = get_standard_error_info("data_validation_failed") 175 | ErrorHandler.handle_error(e, context, error_info, 
self.logger) 176 | except (ValueError, TypeError) as e: 177 | # User-friendly message explaining what's happening 178 | context = create_error_context( 179 | "core", "_get_expected_papers", 180 | ErrorSeverity.WARNING, 181 | ErrorCategory.DATA, 182 | f"Data type error in CORE response: {type(e).__name__}: {str(e)}" 183 | ) 184 | error_info = get_standard_error_info("data_validation_failed") 185 | ErrorHandler.handle_error(e, context, error_info, self.logger) 186 | except Exception as ex: 187 | # User-friendly message explaining what's happening 188 | context = create_error_context( 189 | "core", "_get_expected_papers", 190 | ErrorSeverity.ERROR, 191 | ErrorCategory.DATA, 192 | f"Unexpected error parsing CORE response: {type(ex).__name__}: {str(ex)}" 193 | ) 194 | error_info = get_standard_error_info("unexpected_error") 195 | ErrorHandler.handle_error(ex, context, error_info, self.logger) 196 | else: 197 | self._log_api_error(raw_papers, self.api_url) 198 | return total 199 | 200 | def _process_raw_papers(self, query, raw_papers): 201 | """Process the raw API response into a DataFrame.""" 202 | query_name = list(query.keys())[0] 203 | query_value = query[query_name] 204 | papers_request = pd.DataFrame() 205 | 206 | if raw_papers.status_code == 200: 207 | try: 208 | json_results = json.loads(raw_papers.text) 209 | raw_papers = pd.json_normalize(json_results['results']) 210 | papers_request['id'] = raw_papers['id'] 211 | papers_request['title'] = raw_papers['title'] 212 | papers_request['abstract'] = raw_papers['abstract'] 213 | papers_request['url'] = raw_papers['downloadUrl'] 214 | papers_request['publication'] = raw_papers['publisher'] 215 | papers_request['publisher'] = self.database_name 216 | papers_request['publication_date'] = raw_papers['publishedDate'] 217 | papers_request['database'] = self.database_name 218 | papers_request['query_name'] = query_name 219 | papers_request['query_value'] = query_value.replace('', 'AND').replace('', 'OR') 220 | except (json.JSONDecodeError, KeyError) as e: 221 | # User-friendly message explaining what's happening 222 | context = create_error_context( 223 | "core", "_process_raw_papers", 224 | ErrorSeverity.WARNING, 225 | ErrorCategory.DATA, 226 | f"Data parsing error in CORE response: {type(e).__name__}: {str(e)}" 227 | ) 228 | error_info = get_standard_error_info("data_validation_failed") 229 | ErrorHandler.handle_error(e, context, error_info, self.logger) 230 | except Exception as ex: 231 | # User-friendly message explaining what's happening 232 | context = create_error_context( 233 | "core", "_process_raw_papers", 234 | ErrorSeverity.ERROR, 235 | ErrorCategory.DATA, 236 | f"Unexpected error parsing CORE response: {type(ex).__name__}: {str(ex)}" 237 | ) 238 | error_info = get_standard_error_info("unexpected_error") 239 | ErrorHandler.handle_error(ex, context, error_info, self.logger) 240 | else: 241 | self._log_api_error(raw_papers, self.api_url) 242 | 243 | return papers_request 244 | 245 | def _filter_papers(self, papers: pd.DataFrame, dates, start_date, end_date) -> pd.DataFrame: 246 | """Filter papers based on criteria.""" 247 | self.logger.info(LogCategory.DATA, "core", "_filter_papers", "Filtering papers...") 248 | try: 249 | # Filter by title 250 | papers.loc[:, 'title'] = papers['title'].replace('', float("NaN")) 251 | papers = papers.dropna(subset=['title']) 252 | papers.loc[:, 'title'] = papers['title'].str.lower() 253 | papers = papers.drop_duplicates('title') 254 | 255 | # Filter by abstract 256 | papers.loc[:, 'abstract'] = 
papers['abstract'].replace('', float("NaN")) 257 | papers = papers.dropna(subset=['abstract']) 258 | 259 | except (ValueError, TypeError) as e: 260 | # User-friendly message explaining what's happening 261 | context = create_error_context( 262 | "core", "_filter_papers", 263 | ErrorSeverity.WARNING, 264 | ErrorCategory.DATA, 265 | f"Data type error during CORE paper filtering: {type(e).__name__}: {str(e)}" 266 | ) 267 | error_info = get_standard_error_info("data_validation_failed") 268 | ErrorHandler.handle_error(e, context, error_info, self.logger) 269 | # Continue with unfiltered papers rather than failing completely 270 | except KeyError as e: 271 | # User-friendly message explaining what's happening 272 | context = create_error_context( 273 | "core", "_filter_papers", 274 | ErrorSeverity.WARNING, 275 | ErrorCategory.DATA, 276 | f"Missing required column during CORE paper filtering: {type(e).__name__}: {str(e)}" 277 | ) 278 | error_info = get_standard_error_info("data_validation_failed") 279 | ErrorHandler.handle_error(e, context, error_info, self.logger) 280 | # Return papers as-is to prevent complete failure 281 | except Exception as ex: 282 | # User-friendly message explaining what's happening 283 | context = create_error_context( 284 | "core", "_filter_papers", 285 | ErrorSeverity.ERROR, 286 | ErrorCategory.DATA, 287 | f"Unexpected error during CORE paper filtering: {type(ex).__name__}: {str(ex)}" 288 | ) 289 | error_info = get_standard_error_info("unexpected_error") 290 | ErrorHandler.handle_error(ex, context, error_info, self.logger) 291 | # Return papers as-is to prevent complete failure 292 | 293 | return papers 294 | 295 | def _clean_papers(self, papers: pd.DataFrame) -> pd.DataFrame: 296 | """Clean and standardize paper data.""" 297 | self.logger.info(LogCategory.DATA, "core", "_clean_papers", "Cleaning papers...") 298 | try: 299 | papers.replace('', float("NaN"), inplace=True) 300 | papers.dropna(how='all', axis=1, inplace=True) 301 | except (ValueError, TypeError) as e: 302 | # User-friendly message explaining what's happening 303 | context = create_error_context( 304 | "core", "_clean_papers", 305 | ErrorSeverity.WARNING, 306 | ErrorCategory.DATA, 307 | f"Data type error during CORE paper cleaning: {type(e).__name__}: {str(e)}" 308 | ) 309 | error_info = get_standard_error_info("data_validation_failed") 310 | ErrorHandler.handle_error(e, context, error_info, self.logger) 311 | # Continue with uncleaned papers rather than failing completely 312 | except KeyError as e: 313 | # User-friendly message explaining what's happening 314 | context = create_error_context( 315 | "core", "_clean_papers", 316 | ErrorSeverity.WARNING, 317 | ErrorCategory.DATA, 318 | f"Missing required column during CORE paper cleaning: {type(e).__name__}: {str(e)}" 319 | ) 320 | error_info = get_standard_error_info("data_validation_failed") 321 | ErrorHandler.handle_error(e, context, error_info, self.logger) 322 | # Return papers as-is to prevent complete failure 323 | except Exception as ex: 324 | # User-friendly message explaining what's happening 325 | context = create_error_context( 326 | "core", "_clean_papers", 327 | ErrorSeverity.ERROR, 328 | ErrorCategory.DATA, 329 | f"Unexpected error during CORE paper cleaning: {type(ex).__name__}: {str(ex)}" 330 | ) 331 | error_info = get_standard_error_info("unexpected_error") 332 | ErrorHandler.handle_error(ex, context, error_info, self.logger) 333 | # Return papers as-is to prevent complete failure 334 | 335 | return papers 336 | 337 | def 
_get_abstracts(self, papers: pd.DataFrame) -> pd.DataFrame: 338 | """Get abstracts for papers.""" 339 | pass 340 | -------------------------------------------------------------------------------- /util/logging_standards.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Logging Standards for SaLS Project. 4 | 5 | This module defines consistent logging patterns, message formats, 6 | and logging configuration across the entire SaLS codebase. 7 | 8 | Standards ensure: 9 | 1. Consistent log levels and their usage 10 | 2. Uniform message formatting 11 | 3. Standardized logging configuration 12 | 4. Consistent progress reporting 13 | 5. Unified debug information 14 | """ 15 | 16 | import logging 17 | import logging.handlers 18 | import os 19 | import sys 20 | from typing import Optional, Dict, Any 21 | from datetime import datetime 22 | from enum import Enum 23 | 24 | 25 | class LogLevel(Enum): 26 | """Standard log levels for consistent usage across SaLS.""" 27 | CRITICAL = logging.CRITICAL # Pipeline cannot continue 28 | ERROR = logging.ERROR # Operation failed, but pipeline can continue 29 | WARNING = logging.WARNING # Issue detected, operation continues with defaults 30 | INFO = logging.INFO # Informational message, user-facing 31 | DEBUG = logging.DEBUG # Debug information, developer-facing 32 | NOTSET = logging.NOTSET # Not set 33 | 34 | 35 | class LogCategory(Enum): 36 | """Standard log categories for consistent classification.""" 37 | PIPELINE = "PIPELINE" # Main pipeline operations 38 | CONFIGURATION = "CONFIG" # Configuration operations 39 | DATABASE = "DATABASE" # Database operations 40 | API = "API" # External API calls 41 | DATA = "DATA" # Data processing operations 42 | FILE = "FILE" # File I/O operations 43 | VALIDATION = "VALIDATION" # Data validation operations 44 | USER = "USER" # User interaction operations 45 | SYSTEM = "SYSTEM" # System-level operations 46 | 47 | 48 | class LogFormatter: 49 | """Standard log formatter for consistent message formatting.""" 50 | 51 | # Standard format for different log levels 52 | STANDARD_FORMATS = { 53 | LogLevel.CRITICAL: "[CRITICAL] {asctime} | {category} | {module}.{function} | {message}", 54 | LogLevel.ERROR: "[ERROR] {asctime} | {category} | {module}.{function} | {message}", 55 | LogLevel.WARNING: "[WARNING] {asctime} | {category} | {module}.{function} | {message}", 56 | LogLevel.INFO: "[INFO] {asctime} | {category} | {module}.{function} | {message}", 57 | LogLevel.DEBUG: "[DEBUG] {asctime} | {category} | {module}.{function} | {message}" 58 | } 59 | 60 | @staticmethod 61 | def format_message(level: LogLevel, 62 | category: LogCategory, 63 | module: str, 64 | function: str, 65 | message: str, 66 | extra_info: Optional[Dict[str, Any]] = None) -> str: 67 | """Format a log message according to SaLS standards.""" 68 | 69 | # Get base format 70 | base_format = LogFormatter.STANDARD_FORMATS.get(level, LogFormatter.STANDARD_FORMATS[LogLevel.INFO]) 71 | 72 | # Format extra info if provided 73 | extra_str = "" 74 | if extra_info: 75 | extra_parts = [] 76 | for key, value in extra_info.items(): 77 | if isinstance(value, (dict, list)): 78 | extra_parts.append(f"{key}: {str(value)[:100]}...") 79 | else: 80 | extra_parts.append(f"{key}: {value}") 81 | extra_str = " | " + " | ".join(extra_parts) 82 | 83 | # Format the message 84 | formatted = base_format.format( 85 | asctime=datetime.now().strftime("%Y-%m-%d %H:%M:%S"), 86 | category=category.value, 87 | module=module, 88 | 
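# Note (added; not in the original source): str.format() ignores keyword
# arguments that a template does not reference, so the extra_info value
# passed below is only rendered when a STANDARD_FORMATS template actually
# contains an "{extra_info}" placeholder.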
function=function, 89 | message=message, 90 | extra_info=extra_str 91 | ) 92 | 93 | return formatted 94 | 95 | 96 | class SaLSLogger: 97 | """Standard SaLS logger with consistent configuration and methods.""" 98 | 99 | def __init__(self, 100 | name: str, 101 | log_file: Optional[str] = None, 102 | console_level: LogLevel = LogLevel.INFO, 103 | file_level: LogLevel = LogLevel.DEBUG, 104 | max_file_size: int = 10 * 1024 * 1024, # 10MB 105 | backup_count: int = 5): 106 | self.name = name 107 | self.log_file = log_file 108 | self.console_level = console_level 109 | self.file_level = file_level 110 | self.max_file_size = max_file_size 111 | self.backup_count = backup_count 112 | 113 | # Create logger 114 | self.logger = logging.getLogger(name) 115 | self.logger.setLevel(logging.DEBUG) # Set to lowest level, handlers will filter 116 | 117 | # Clear existing handlers 118 | self.logger.handlers.clear() 119 | 120 | # Setup handlers 121 | self._setup_console_handler() 122 | if log_file: 123 | self._setup_file_handler() 124 | 125 | def _setup_console_handler(self) -> None: 126 | """Setup console handler with user-friendly formatting.""" 127 | console_handler = logging.StreamHandler(sys.stdout) 128 | console_handler.setLevel(self.console_level.value) 129 | 130 | # Create formatter for console - use a simpler format since we handle custom fields in our log method 131 | console_formatter = logging.Formatter("[%(levelname)s] %(message)s") 132 | console_handler.setFormatter(console_formatter) 133 | 134 | self.logger.addHandler(console_handler) 135 | 136 | def _setup_file_handler(self) -> None: 137 | """Setup file handler with detailed formatting.""" 138 | # Ensure log directory exists 139 | log_dir = os.path.dirname(self.log_file) 140 | if log_dir and not os.path.exists(log_dir): 141 | os.makedirs(log_dir, exist_ok=True) 142 | 143 | # Create rotating file handler 144 | file_handler = logging.handlers.RotatingFileHandler( 145 | self.log_file, 146 | maxBytes=self.max_file_size, 147 | backupCount=self.backup_count, 148 | encoding='utf-8' 149 | ) 150 | file_handler.setLevel(self.file_level.value) 151 | 152 | # Create formatter for file - use a simpler format since we handle custom fields in our log method 153 | file_formatter = logging.Formatter("[%(levelname)s] %(asctime)s | %(message)s") 154 | file_handler.setFormatter(file_formatter) 155 | 156 | self.logger.addHandler(file_handler) 157 | 158 | def log(self, 159 | level: LogLevel, 160 | category: LogCategory, 161 | module: str, 162 | function: str, 163 | message: str, 164 | extra_info: Optional[Dict[str, Any]] = None, 165 | print_to_console: bool = False) -> None: 166 | """Log a message with consistent formatting.""" 167 | 168 | # Format the message with our custom formatter for console output 169 | formatted_message = LogFormatter.format_message( 170 | level=level, 171 | category=category, 172 | module=module, 173 | function=function, 174 | message=message, 175 | extra_info=extra_info 176 | ) 177 | 178 | # Log according to level - pass the original message to Python's logger 179 | # Python's logger will format it with its own formatter 180 | if level == LogLevel.CRITICAL: 181 | self.logger.critical(message) 182 | elif level == LogLevel.ERROR: 183 | self.logger.error(message) 184 | elif level == LogLevel.WARNING: 185 | self.logger.warning(message) 186 | elif level == LogLevel.INFO: 187 | self.logger.info(message) 188 | else: # DEBUG 189 | self.logger.debug(message) 190 | 191 | # Optionally print to console for user-facing messages 192 | if print_to_console 
and level in [LogLevel.CRITICAL, LogLevel.ERROR, LogLevel.WARNING, LogLevel.INFO]: 193 | print(formatted_message) 194 | 195 | def critical(self, category: LogCategory, module: str, function: str, message: str, 196 | extra_info: Optional[Dict[str, Any]] = None, print_to_console: bool = True) -> None: 197 | """Log a critical message.""" 198 | self.log(LogLevel.CRITICAL, category, module, function, message, extra_info, print_to_console) 199 | 200 | def error(self, category: LogCategory, module: str, function: str, message: str, 201 | extra_info: Optional[Dict[str, Any]] = None, print_to_console: bool = True) -> None: 202 | """Log an error message.""" 203 | self.log(LogLevel.ERROR, category, module, function, message, extra_info, print_to_console) 204 | 205 | def warning(self, category: LogCategory, module: str, function: str, message: str, 206 | extra_info: Optional[Dict[str, Any]] = None, print_to_console: bool = False) -> None: 207 | """Log a warning message.""" 208 | self.log(LogLevel.WARNING, category, module, function, message, extra_info, print_to_console) 209 | 210 | def info(self, category: LogCategory, module: str, function: str, message: str, 211 | extra_info: Optional[Dict[str, Any]] = None, print_to_console: bool = False) -> None: 212 | """Log an info message.""" 213 | self.log(LogLevel.INFO, category, module, function, message, extra_info, print_to_console) 214 | 215 | def debug(self, category: LogCategory, module: str, function: str, message: str, 216 | extra_info: Optional[Dict[str, Any]] = None, print_to_console: bool = False) -> None: 217 | """Log a debug message.""" 218 | self.log(LogLevel.DEBUG, category, module, function, message, extra_info, print_to_console) 219 | 220 | def progress(self, current: int, total: int, operation: str, 221 | extra_info: Optional[Dict[str, Any]] = None) -> None: 222 | """Log progress information with consistent formatting.""" 223 | percentage = (current / total) * 100 if total > 0 else 0 224 | progress_message = f"Progress: {current}/{total} ({percentage:.1f}%) - {operation}" 225 | 226 | self.info( 227 | category=LogCategory.PIPELINE, 228 | module=self.name, 229 | function="progress", 230 | message=progress_message, 231 | extra_info=extra_info, 232 | print_to_console=False 233 | ) 234 | 235 | def operation_start(self, operation: str, extra_info: Optional[Dict[str, Any]] = None) -> None: 236 | """Log the start of an operation.""" 237 | self.info( 238 | category=LogCategory.PIPELINE, 239 | module=self.name, 240 | function="operation_start", 241 | message=f"Starting: {operation}", 242 | extra_info=extra_info, 243 | print_to_console=False 244 | ) 245 | 246 | def operation_complete(self, operation: str, result: str, 247 | extra_info: Optional[Dict[str, Any]] = None) -> None: 248 | """Log the completion of an operation.""" 249 | self.info( 250 | category=LogCategory.PIPELINE, 251 | module=self.name, 252 | function="operation_complete", 253 | message=f"Completed: {operation} - Result: {result}", 254 | extra_info=extra_info, 255 | print_to_console=False 256 | ) 257 | 258 | def operation_failed(self, operation: str, error: str, 259 | extra_info: Optional[Dict[str, Any]] = None) -> None: 260 | """Log the failure of an operation.""" 261 | self.error( 262 | category=LogCategory.PIPELINE, 263 | module=self.name, 264 | function="operation_failed", 265 | message=f"Failed: {operation} - Error: {error}", 266 | extra_info=extra_info, 267 | print_to_console=True 268 | ) 269 | 270 | 271 | def setup_sals_logger(name: str, 272 | log_file: Optional[str] = None, 273 | 
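# Note (added; not in the original source): the two level parameters below keep
# console output at INFO for user-facing progress, while the rotating log file
# captures DEBUG-level detail by default.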
console_level: LogLevel = LogLevel.INFO, 274 | file_level: LogLevel = LogLevel.DEBUG) -> SaLSLogger: 275 | """Setup a standardized SaLS logger.""" 276 | new_logger = SaLSLogger( 277 | name=name, 278 | log_file=log_file, 279 | console_level=console_level, 280 | file_level=file_level 281 | ) 282 | 283 | # Attach the same handlers to legacy 'logger' so existing modules print to console and files 284 | try: 285 | legacy_logger = logging.getLogger('logger') 286 | legacy_logger.setLevel(logging.DEBUG) 287 | legacy_logger.handlers.clear() 288 | for handler in new_logger.logger.handlers: 289 | legacy_logger.addHandler(handler) 290 | legacy_logger.propagate = False 291 | except Exception: 292 | pass 293 | 294 | # Keep a reference to the current SaLS logger 295 | global _CURRENT_SALS_LOGGER 296 | _CURRENT_SALS_LOGGER = new_logger 297 | return new_logger 298 | 299 | 300 | # Standard logging configuration 301 | def get_standard_logging_config() -> Dict[str, Any]: 302 | """Get standard logging configuration for SaLS.""" 303 | return { 304 | "console_level": LogLevel.INFO, 305 | "file_level": LogLevel.DEBUG, 306 | "max_file_size": 10 * 1024 * 1024, # 10MB 307 | "backup_count": 5, 308 | "log_format": "detailed", 309 | "encoding": "utf-8" 310 | } 311 | 312 | 313 | # Keep reference to the most recently configured SaLS logger 314 | _CURRENT_SALS_LOGGER: Optional[SaLSLogger] = None 315 | 316 | def get_current_sals_logger() -> Optional[SaLSLogger]: 317 | """Return the last configured SaLSLogger, if any.""" 318 | return _CURRENT_SALS_LOGGER 319 | 320 | # Compatibility logger that accepts both SaLSLogger-style and std logging-style calls 321 | class CompatLogger: 322 | def __init__(self, sals_logger: Optional[SaLSLogger], std_logger: logging.Logger): 323 | self._sals = sals_logger 324 | self._std = std_logger 325 | try: 326 | self._std.setLevel(logging.INFO) 327 | except Exception: 328 | pass 329 | 330 | def _route(self, level: LogLevel, *args, **kwargs): 331 | # SaLS-style: (category, module, function, message, extra_info=None, print_to_console=False) 332 | if len(args) >= 4 and isinstance(args[0], LogCategory): 333 | category, module, function, message = args[:4] 334 | extra_info = args[4] if len(args) >= 5 else kwargs.get('extra_info') 335 | print_to_console = args[5] if len(args) >= 6 else kwargs.get('print_to_console', False) 336 | if self._sals: 337 | self._sals.log(level, category, module, function, message, extra_info, print_to_console) 338 | else: 339 | rendered = LogFormatter.format_message(level, category, module, function, message, extra_info) 340 | if level == LogLevel.CRITICAL: 341 | self._std.critical(rendered) 342 | elif level == LogLevel.ERROR: 343 | self._std.error(rendered) 344 | elif level == LogLevel.WARNING: 345 | self._std.warning(rendered) 346 | elif level == LogLevel.INFO: 347 | self._std.info(rendered) 348 | else: 349 | self._std.debug(rendered) 350 | if print_to_console and level in [LogLevel.CRITICAL, LogLevel.ERROR, LogLevel.WARNING, LogLevel.INFO]: 351 | try: 352 | print(rendered) 353 | except Exception: 354 | pass 355 | return 356 | 357 | # Std-style: (msg, *fmt_args) 358 | if len(args) >= 1: 359 | msg = args[0] 360 | fmt_args = args[1:] if len(args) > 1 else () 361 | try: 362 | if level == LogLevel.CRITICAL: 363 | self._std.critical(msg, *fmt_args) 364 | elif level == LogLevel.ERROR: 365 | self._std.error(msg, *fmt_args) 366 | elif level == LogLevel.WARNING: 367 | self._std.warning(msg, *fmt_args) 368 | elif level == LogLevel.INFO: 369 | self._std.info(msg, *fmt_args) 370 | else: 
371 | self._std.debug(msg, *fmt_args) 372 | except Exception: 373 | safe_msg = str(msg) 374 | if level == LogLevel.CRITICAL: 375 | self._std.critical(safe_msg) 376 | elif level == LogLevel.ERROR: 377 | self._std.error(safe_msg) 378 | elif level == LogLevel.WARNING: 379 | self._std.warning(safe_msg) 380 | elif level == LogLevel.INFO: 381 | self._std.info(safe_msg) 382 | else: 383 | self._std.debug(safe_msg) 384 | 385 | def info(self, *args, **kwargs): 386 | self._route(LogLevel.INFO, *args, **kwargs) 387 | 388 | def warning(self, *args, **kwargs): 389 | self._route(LogLevel.WARNING, *args, **kwargs) 390 | 391 | def error(self, *args, **kwargs): 392 | self._route(LogLevel.ERROR, *args, **kwargs) 393 | 394 | def debug(self, *args, **kwargs): 395 | self._route(LogLevel.DEBUG, *args, **kwargs) 396 | 397 | def critical(self, *args, **kwargs): 398 | self._route(LogLevel.CRITICAL, *args, **kwargs) 399 | 400 | 401 | def get_compat_logger() -> CompatLogger: 402 | """Return a logger that accepts both SaLSLogger-style and std logging-style calls.""" 403 | return CompatLogger(_CURRENT_SALS_LOGGER, logging.getLogger('sals_pipeline')) 404 | 405 | # Example usage: 406 | """ 407 | # In your module: 408 | from util.logging_standards import ( 409 | setup_sals_logger, LogCategory, LogLevel 410 | ) 411 | 412 | # Setup logger 413 | logger = setup_sals_logger( 414 | name="my_module", 415 | log_file="logs/my_module.log" 416 | ) 417 | 418 | # Log operations 419 | logger.operation_start("data_processing") 420 | logger.info(LogCategory.DATA, "my_module", "process_data", "Processing 100 records") 421 | logger.progress(50, 100, "data_processing") 422 | logger.operation_complete("data_processing", "100 records processed") 423 | """ 424 | -------------------------------------------------------------------------------- /clients/arxiv.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import json 3 | from .base_client import DatabaseClient 4 | from .apis.generic import Generic 5 | from os.path import exists 6 | import logging 7 | import time 8 | from tqdm import tqdm 9 | from util.error_standards import ( 10 | ErrorHandler, create_error_context, ErrorSeverity, ErrorCategory, 11 | get_standard_error_info 12 | ) 13 | from util.logging_standards import LogCategory, get_compat_logger, get_current_sals_logger 14 | 15 | 16 | class ArxivClient(DatabaseClient): 17 | """ 18 | Refactored arXiv client using the Template Method pattern. 
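A minimal usage sketch (added for illustration; the public entry point lives in
the DatabaseClient base class, which is not shown here, so this calls the
planning step directly and every argument value is an assumption):

    client = ArxivClient()
    papers = client._plan_requests(
        query={'augmented reality': "'augmented reality' & 'edge'"},
        syntactic_filters=[], synonyms={},
        fields=['title', 'abstract'], types=['preprints'],
        dates=False, start_date=None, end_date=None)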
19 | """ 20 | 21 | def __init__(self): 22 | super().__init__( 23 | database_name='arxiv', 24 | max_papers=5000, 25 | waiting_time=2, 26 | max_retries=3, 27 | client_fields={'title': 'ti', 'abstract': 'abs'} 28 | ) 29 | self.api_url = 'http://export.arxiv.org/api/query?search_query=' 30 | self.client = Generic() 31 | # Normalize logger to a compat logger to accept both styles 32 | try: 33 | sals = get_current_sals_logger() 34 | self.logger = get_compat_logger() if sals is None else get_compat_logger() 35 | except Exception: 36 | pass 37 | 38 | def _has_api_access(self) -> bool: 39 | """ArXiv is open access, so no API key is needed.""" 40 | return True 41 | 42 | def _plan_requests(self, query, syntactic_filters, synonyms, fields, types, dates, start_date, end_date) -> pd.DataFrame: 43 | """Plan the API requests for arXiv.""" 44 | # Extract query value from the query dictionary (same as original arxiv.py) 45 | query_name = list(query.keys())[0] 46 | query_value = query[query_name] 47 | 48 | # Build query parameters 49 | c_fields = [] 50 | for field in fields: 51 | if field in self.client_fields: 52 | c_fields.append(self.client_fields[field]) 53 | 54 | parameters = { 55 | 'query': query_value, # Use query_value string, not the entire query dict 56 | 'syntactic_filters': syntactic_filters, 57 | 'synonyms': synonyms, 58 | 'fields': c_fields, 59 | 'types': types 60 | } 61 | 62 | # Create initial request to get total count 63 | request = self._create_request(parameters) 64 | raw_papers = self._retry_request(self.client.request, request, 'get', {}, {}) 65 | expected_papers = self._get_expected_papers(raw_papers) 66 | 67 | self.logger.info(f"Expected papers from arxiv: {expected_papers}...") 68 | 69 | # Calculate number of requests needed 70 | times = int(expected_papers / self.max_papers) - 1 71 | mod = int(expected_papers) % self.max_papers 72 | if mod > 0: 73 | times = times + 1 74 | 75 | # Execute requests 76 | papers = self._execute_requests(query, parameters, times, expected_papers, mod) 77 | return papers 78 | 79 | def _execute_requests(self, query, parameters, times, expected_papers, mod): 80 | """Execute the planned requests to retrieve papers.""" 81 | papers = pd.DataFrame() 82 | 83 | for t in tqdm(range(0, times + 1)): 84 | time.sleep(self.waiting_time) 85 | start = t * self.max_papers 86 | 87 | request = self._create_request(parameters, start) 88 | raw_papers = self._retry_request(self.client.request, request, 'get', {}, {}) 89 | 90 | if raw_papers is None: 91 | continue 92 | 93 | papers_request = self._process_raw_papers(query, raw_papers) 94 | 95 | # Sometimes arXiv API doesn't respond with all papers, so retry 96 | expected_per_request = expected_papers 97 | if expected_papers > self.max_papers: 98 | expected_per_request = self.max_papers 99 | if t == times and mod > 0: 100 | expected_per_request = mod 101 | 102 | while len(papers_request) < expected_per_request: 103 | time.sleep(self.waiting_time) 104 | raw_papers = self._retry_request(self.client.request, request, 'get', {}, {}) 105 | papers_request = self._process_raw_papers(query, raw_papers) 106 | 107 | if len(papers) == 0: 108 | papers = papers_request 109 | else: 110 | papers = pd.concat([papers, papers_request]) 111 | 112 | return papers 113 | 114 | def _create_request(self, parameters, start=0): 115 | """Create the API request URL for arXiv.""" 116 | req = self.api_url 117 | req = req + self.client.default_query(parameters) 118 | req = req + '&start=' + str(start) 119 | req = req + '&max_results=' + str(self.max_papers) 120 
| req = req + '&sortBy=submittedDate&sortOrder=descending' 121 | return req 122 | 123 | def _get_expected_papers(self, raw_papers): 124 | """Get the expected number of papers from the API response.""" 125 | total = 0 126 | if raw_papers.status_code == 200: 127 | try: 128 | total_text = raw_papers.text.split('opensearch:totalResults')[1] 129 | total = int(total_text.split('>')[1].replace('', 'AND').replace('', 'OR') 185 | except (ValueError, TypeError) as e: 186 | context = create_error_context( 187 | module="arxiv", 188 | function="_process_raw_papers", 189 | operation="api_response_parsing", 190 | severity=ErrorSeverity.WARNING, 191 | category=ErrorCategory.API 192 | ) 193 | 194 | error_info = get_standard_error_info("data_validation_failed") 195 | error_handler = ErrorHandler(self.logger) 196 | error_msg = error_handler.handle_error( 197 | error=e, 198 | context=context, 199 | error_type="APIResponseParsingError", 200 | error_description=f"Error parsing the API response: {type(e).__name__}: {str(e)}", 201 | recovery_suggestion=error_info["recovery"], 202 | next_steps=error_info["next_steps"] 203 | ) 204 | except Exception as ex: 205 | context = create_error_context( 206 | module="arxiv", 207 | function="_process_raw_papers", 208 | operation="api_response_parsing", 209 | severity=ErrorSeverity.WARNING, 210 | category=ErrorCategory.API 211 | ) 212 | 213 | error_info = get_standard_error_info("data_validation_failed") 214 | error_handler = ErrorHandler(self.logger) 215 | error_msg = error_handler.handle_error( 216 | error=ex, 217 | context=context, 218 | error_type="APIResponseParsingError", 219 | error_description=f"Unexpected error parsing the API response: {type(ex).__name__}: {str(ex)}", 220 | recovery_suggestion=error_info["recovery"], 221 | next_steps=error_info["next_steps"] 222 | ) 223 | else: 224 | self._log_api_error(raw_papers, raw_papers.request.url if raw_papers.request else "") 225 | 226 | return papers_request 227 | 228 | def _filter_papers(self, papers: pd.DataFrame, dates, start_date, end_date) -> pd.DataFrame: 229 | """Filter papers based on criteria.""" 230 | self.logger.info("Filtering papers...") 231 | try: 232 | # Filter by title 233 | papers.loc[:, 'title'] = papers['title'].replace('', float("NaN")) 234 | papers.dropna(subset=['title'], inplace=True) 235 | papers.loc[:, 'title'] = papers['title'].str.lower() 236 | papers = papers.drop_duplicates('title') 237 | 238 | # Filter by abstract 239 | papers.loc[:, 'summary'] = papers['summary'].replace('', float("NaN")) 240 | papers.dropna(subset=['summary'], inplace=True) 241 | 242 | # Filter by dates if specified 243 | if dates is True: 244 | papers['published'] = pd.to_datetime(papers['published']).dt.date 245 | papers = papers[(papers['published'] >= start_date) & (papers['published'] <= end_date)] 246 | 247 | except (ValueError, TypeError) as e: 248 | context = create_error_context( 249 | module="arxiv", 250 | function="_filter_papers", 251 | operation="paper_filtering", 252 | severity=ErrorSeverity.WARNING, 253 | category=ErrorCategory.DATA 254 | ) 255 | 256 | error_info = get_standard_error_info("data_validation_failed") 257 | error_handler = ErrorHandler(self.logger) 258 | error_msg = error_handler.handle_error( 259 | error=e, 260 | context=context, 261 | error_type="PaperFilteringError", 262 | error_description=f"Error filtering papers: {type(e).__name__}: {str(e)}", 263 | recovery_suggestion=error_info["recovery"], 264 | next_steps=error_info["next_steps"] 265 | ) 266 | # Continue with unfiltered papers rather than 
failing completely 267 | except KeyError as e: 268 | context = create_error_context( 269 | module="arxiv", 270 | function="_filter_papers", 271 | operation="paper_filtering", 272 | severity=ErrorSeverity.WARNING, 273 | category=ErrorCategory.DATA 274 | ) 275 | 276 | error_info = get_standard_error_info("data_validation_failed") 277 | error_handler = ErrorHandler(self.logger) 278 | error_msg = error_handler.handle_error( 279 | error=e, 280 | context=context, 281 | error_type="PaperFilteringError", 282 | error_description=f"Missing required column during paper filtering: {type(e).__name__}: {str(e)}", 283 | recovery_suggestion=error_info["recovery"], 284 | next_steps=error_info["next_steps"] 285 | ) 286 | # Return papers as-is to prevent complete failure 287 | except Exception as ex: 288 | context = create_error_context( 289 | module="arxiv", 290 | function="_filter_papers", 291 | operation="paper_filtering", 292 | severity=ErrorSeverity.WARNING, 293 | category=ErrorCategory.DATA 294 | ) 295 | 296 | error_info = get_standard_error_info("data_validation_failed") 297 | error_handler = ErrorHandler(self.logger) 298 | error_msg = error_handler.handle_error( 299 | error=ex, 300 | context=context, 301 | error_type="PaperFilteringError", 302 | error_description=f"Unexpected error during paper filtering: {type(ex).__name__}: {str(ex)}", 303 | recovery_suggestion=error_info["recovery"], 304 | next_steps=error_info["next_steps"] 305 | ) 306 | # Return papers as-is to prevent complete failure 307 | 308 | return papers 309 | 310 | def _clean_papers(self, papers: pd.DataFrame) -> pd.DataFrame: 311 | """Clean and standardize paper data.""" 312 | self.logger.info("Cleaning papers...") 313 | try: 314 | # Remove unnecessary columns 315 | papers = papers.drop(columns=[ 316 | 'author', 'comment', 'link', 'primary_category', 'category', 317 | 'doi', 'journal_ref' 318 | ], errors='ignore') 319 | 320 | # Clean empty values 321 | papers.replace('', float("NaN"), inplace=True) 322 | papers.dropna(how='all', axis=1, inplace=True) 323 | 324 | except (ValueError, TypeError) as e: 325 | context = create_error_context( 326 | module="arxiv", 327 | function="_clean_papers", 328 | operation="paper_cleaning", 329 | severity=ErrorSeverity.WARNING, 330 | category=ErrorCategory.DATA 331 | ) 332 | 333 | error_info = get_standard_error_info("data_validation_failed") 334 | error_handler = ErrorHandler(self.logger) 335 | error_msg = error_handler.handle_error( 336 | error=e, 337 | context=context, 338 | error_type="PaperCleaningError", 339 | error_description=f"Error cleaning papers: {type(e).__name__}: {str(e)}", 340 | recovery_suggestion=error_info["recovery"], 341 | next_steps=error_info["next_steps"] 342 | ) 343 | # Continue with uncleaned papers rather than failing completely 344 | except KeyError as e: 345 | context = create_error_context( 346 | module="arxiv", 347 | function="_clean_papers", 348 | operation="paper_cleaning", 349 | severity=ErrorSeverity.WARNING, 350 | category=ErrorCategory.DATA 351 | ) 352 | 353 | error_info = get_standard_error_info("data_validation_failed") 354 | error_handler = ErrorHandler(self.logger) 355 | error_msg = error_handler.handle_error( 356 | error=e, 357 | context=context, 358 | error_type="PaperCleaningError", 359 | error_description=f"Missing required column during paper cleaning: {type(e).__name__}: {str(e)}", 360 | recovery_suggestion=error_info["recovery"], 361 | next_steps=error_info["next_steps"] 362 | ) 363 | # Return papers as-is to prevent complete failure 364 | except Exception 
as ex: 365 | context = create_error_context( 366 | module="arxiv", 367 | function="_clean_papers", 368 | operation="paper_cleaning", 369 | severity=ErrorSeverity.WARNING, 370 | category=ErrorCategory.DATA 371 | ) 372 | 373 | error_info = get_standard_error_info("data_validation_failed") 374 | error_handler = ErrorHandler(self.logger) 375 | error_msg = error_handler.handle_error( 376 | error=ex, 377 | context=context, 378 | error_type="PaperCleaningError", 379 | error_description=f"Unexpected error during paper cleaning: {type(ex).__name__}: {str(ex)}", 380 | recovery_suggestion=error_info["recovery"], 381 | next_steps=error_info["next_steps"] 382 | ) 383 | # Return papers as-is to prevent complete failure 384 | 385 | return papers 386 | 387 | def _get_abstracts(self, papers: pd.DataFrame) -> pd.DataFrame: 388 | """Get abstracts for papers.""" 389 | pass -------------------------------------------------------------------------------- /clients/springer.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import json 3 | from .base_client import DatabaseClient 4 | from .apis.generic import Generic 5 | from os.path import exists 6 | import logging 7 | import time 8 | from tqdm import tqdm 9 | from util.error_standards import ( 10 | ErrorHandler, create_error_context, ErrorSeverity, ErrorCategory, 11 | get_standard_error_info 12 | ) 13 | from util.logging_standards import LogCategory 14 | 15 | 16 | class SpringerClient(DatabaseClient): 17 | """Springer client implementation using the DatabaseClient base class.""" 18 | 19 | def __init__(self): 20 | super().__init__( 21 | database_name='springer', 22 | max_papers=25, 23 | waiting_time=2, 24 | max_retries=3, 25 | client_fields={'title': 'title'}, 26 | quota=500 27 | ) 28 | 29 | # Load API access from config 30 | if exists('./config.json'): 31 | with open("./config.json", "r") as file: 32 | config = json.load(file) 33 | if 'api_access_springer' in config: 34 | self.api_access = config['api_access_springer'] 35 | else: 36 | self.api_access = '' 37 | else: 38 | self.api_access = '' 39 | 40 | # Define API URL after API access is loaded 41 | self.api_url = 'http://api.springernature.com/metadata/json?q=type:Journal' 42 | self.start = 0 43 | self.client = Generic() 44 | 45 | def _has_api_access(self) -> bool: 46 | """Check if Springer API access is available.""" 47 | return self.api_access != '' 48 | 49 | def _plan_requests(self, query, syntactic_filters, synonyms, fields, types, dates, start_date, end_date) -> pd.DataFrame: 50 | """Plan the API requests for Springer.""" 51 | # Extract query value from the query dictionary 52 | query_name = list(query.keys())[0] 53 | query_value = query[query_name] 54 | 55 | # Build query parameters 56 | c_fields = [] 57 | for field in fields: 58 | if field in self.client_fields: 59 | c_fields.append(self.client_fields[field]) 60 | 61 | parameters = { 62 | 'query': query_value, 63 | 'syntactic_filters': syntactic_filters, 64 | 'synonyms': synonyms, 65 | 'fields': c_fields, 66 | 'types': types 67 | } 68 | 69 | # Create initial request to get total count 70 | request = self._create_request(parameters, dates, start_date, end_date, False) 71 | raw_papers = self._retry_request(self.client.request, request, 'get', {}, {}) 72 | expected_papers = self._get_expected_papers(raw_papers) 73 | 74 | self.logger.info(LogCategory.DATABASE, "springer", "_plan_requests", f"Expected papers from springer: {expected_papers}...") 75 | 76 | # Calculate number of requests needed 77 | 
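# Worked example (added; not in the original source): with Springer's
# max_papers = 25, an expected_papers value of 60 gives
# int(60 / 25) - 1 = 1 and 60 % 25 = 10, so times becomes 2 and
# _execute_requests below issues three requests (t = 0, 1, 2) at offsets 0, 25 and 50.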
times = int(expected_papers / self.max_papers) - 1 78 | mod = int(expected_papers) % self.max_papers 79 | if mod > 0: 80 | times = times + 1 81 | 82 | # Check quota constraints 83 | if times >= self.quota: 84 | self.logger.info(LogCategory.DATABASE, "springer", "_plan_requests", f"The number of expected papers requires {times + 1} requests, which exceeds the {self.database_name} quota of {self.quota} requests per day.") 85 | if len(syntactic_filters) > 0: 86 | self.logger.info(LogCategory.DATABASE, "springer", "_plan_requests", "Trying to reduce the number of requests using syntactic filters.") 87 | request = self._create_request(parameters, dates, start_date, end_date, True) 88 | raw_papers = self._retry_request(self.client.request, request, 'get', {}, {}) 89 | expected_papers = self._get_expected_papers(raw_papers) 90 | self.logger.info(LogCategory.DATABASE, "springer", "_plan_requests", f"Expected papers from {self.database_name} using syntactic filters: {expected_papers}...") 91 | times = int(expected_papers / self.max_papers) - 1 92 | mod = int(expected_papers) % self.max_papers 93 | if mod > 0: 94 | times = times + 1 95 | if times >= self.quota: 96 | self.logger.info(LogCategory.DATABASE, "springer", "_plan_requests", f"The number of expected papers requires {times + 1} requests, which exceeds the {self.database_name} quota of {self.quota} requests per day.") 97 | self.logger.info(LogCategory.DATABASE, "springer", "_plan_requests", "Skipping to next repository. Try to redefine your search queries and syntactic filters. Using dates to limit your search can also help if you are not already doing so.") 98 | return pd.DataFrame() 99 | else: 100 | self.logger.info(LogCategory.DATABASE, "springer", "_plan_requests", "Skipping to next repository. Please use syntactic filters to avoid this problem. 
Using dates to limit your search can help in case you are not.") 101 | return pd.DataFrame() 102 | 103 | # Execute requests 104 | papers = self._execute_requests(query, parameters, times, dates, start_date, end_date, False) 105 | return papers 106 | 107 | def _execute_requests(self, query, parameters, times, dates, start_date, end_date, syntactic_filter): 108 | """Execute the planned requests to retrieve papers.""" 109 | papers = pd.DataFrame() 110 | 111 | for t in tqdm(range(0, times + 1)): 112 | time.sleep(self.waiting_time) 113 | self.start = t * self.max_papers 114 | 115 | request = self._create_request(parameters, dates, start_date, end_date, syntactic_filter) 116 | raw_papers = self._retry_request(self.client.request, request, 'get', {}, {}) 117 | 118 | if raw_papers is None: 119 | continue 120 | 121 | papers_request = self._process_raw_papers(query, raw_papers) 122 | 123 | if len(papers) == 0: 124 | papers = papers_request 125 | else: 126 | papers = pd.concat([papers, papers_request]) 127 | 128 | return papers 129 | 130 | def _create_request(self, parameters, dates, start_date, end_date, syntactic_filter): 131 | """Create the API request URL for Springer.""" 132 | req = self.api_url 133 | if dates is True: 134 | req = req.replace('', '%20onlinedatefrom:' + str(start_date) +'%20onlinedateto:' + str(end_date) + '%20') 135 | else: 136 | req = req.replace('', '') 137 | 138 | if not syntactic_filter: 139 | req = req + self.client.default_query(parameters) 140 | req = req.replace('%28', '(').replace('%29', ')').replace('+', '%20') 141 | req = req.replace('title:', '') 142 | else: 143 | query = parameters['query'] 144 | syntactic_filters = parameters['syntactic_filters'] 145 | for word in syntactic_filters: 146 | query = query.replace('last', ' ') 147 | query = query + "'" + word + "' last" 148 | query = query.replace(' last', '') 149 | parameters['query'] = query 150 | req = req + self.client.default_query(parameters) 151 | req = req.replace('%28', '(').replace('%29', ')').replace('+', '%20') 152 | req = req.replace('title:', '') 153 | 154 | req = req + '&s='+str(self.start)+'&p='+str(self.max_papers)+'&api_key=' + self.api_access 155 | return req 156 | 157 | def _get_expected_papers(self, raw_papers): 158 | """Get the expected number of papers from the API response.""" 159 | total = 0 160 | if raw_papers.status_code == 200: 161 | try: 162 | json_results = json.loads(raw_papers.text) 163 | total = int(json_results['result'][0]['total']) 164 | except (json.JSONDecodeError, KeyError, IndexError) as e: 165 | # User-friendly message explaining what's happening 166 | context = create_error_context( 167 | "springer", "_get_expected_papers", 168 | ErrorSeverity.WARNING, 169 | ErrorCategory.DATA, 170 | f"Data parsing error in Springer response: {type(e).__name__}: {str(e)}" 171 | ) 172 | error_info = get_standard_error_info("data_validation_failed") 173 | ErrorHandler.handle_error(e, context, error_info, self.logger) 174 | except (ValueError, TypeError) as e: 175 | # User-friendly message explaining what's happening 176 | context = create_error_context( 177 | "springer", "_get_expected_papers", 178 | ErrorSeverity.WARNING, 179 | ErrorCategory.DATA, 180 | f"Data type error in Springer response: {type(e).__name__}: {str(e)}" 181 | ) 182 | error_info = get_standard_error_info("data_validation_failed") 183 | ErrorHandler.handle_error(e, context, error_info, self.logger) 184 | except Exception as ex: 185 | # User-friendly message explaining what's happening 186 | context = create_error_context( 187 | 
"springer", "_get_expected_papers", 188 | ErrorSeverity.ERROR, 189 | ErrorCategory.DATA, 190 | f"Unexpected error parsing Springer response: {type(ex).__name__}: {str(ex)}" 191 | ) 192 | error_info = get_standard_error_info("unexpected_error") 193 | ErrorHandler.handle_error(ex, context, error_info, self.logger) 194 | else: 195 | self._log_api_error(raw_papers, raw_papers.request.url if raw_papers.request else "") 196 | return total 197 | 198 | def _process_raw_papers(self, query, raw_papers): 199 | """Process the raw API response into a DataFrame.""" 200 | query_name = list(query.keys())[0] 201 | query_value = query[query_name] 202 | papers_request = pd.DataFrame() 203 | 204 | if raw_papers.status_code == 200: 205 | try: 206 | json_results = json.loads(raw_papers.text) 207 | papers_request = pd.json_normalize(json_results['records']) 208 | papers_request.loc[:, 'database'] = self.database_name 209 | papers_request.loc[:, 'query_name'] = query_name 210 | papers_request.loc[:, 'query_value'] = query_value.replace('', 'AND').replace('', 'OR') 211 | except (json.JSONDecodeError, KeyError) as e: 212 | # User-friendly message explaining what's happening 213 | context = create_error_context( 214 | "springer", "_process_raw_papers", 215 | ErrorSeverity.WARNING, 216 | ErrorCategory.DATA, 217 | f"Data parsing error in Springer response: {type(e).__name__}: {str(e)}" 218 | ) 219 | error_info = get_standard_error_info("data_validation_failed") 220 | ErrorHandler.handle_error(e, context, error_info, self.logger) 221 | except Exception as ex: 222 | # User-friendly message explaining what's happening 223 | context = create_error_context( 224 | "springer", "_process_raw_papers", 225 | ErrorSeverity.ERROR, 226 | ErrorCategory.DATA, 227 | f"Unexpected error parsing Springer response: {type(ex).__name__}: {str(ex)}" 228 | ) 229 | error_info = get_standard_error_info("unexpected_error") 230 | ErrorHandler.handle_error(ex, context, error_info, self.logger) 231 | else: 232 | self._log_api_error(raw_papers, raw_papers.request.url if raw_papers.request else "") 233 | 234 | return papers_request 235 | 236 | def _filter_papers(self, papers: pd.DataFrame, dates, start_date, end_date) -> pd.DataFrame: 237 | """Filter papers based on criteria.""" 238 | self.logger.info(LogCategory.DATA, "springer", "_filter_papers", "Filtering papers...") 239 | try: 240 | # Filter by title 241 | papers.loc[:, 'title'] = papers['title'].replace('', float("NaN")) 242 | papers = papers.dropna(subset=['title']) 243 | papers.loc[:, 'title'] = papers['title'].str.lower() 244 | papers = papers.drop_duplicates('title') 245 | 246 | # Filter by abstract 247 | papers.loc[:, 'abstract'] = papers['abstract'].replace('', float("NaN")) 248 | papers = papers.dropna(subset=['abstract']) 249 | papers = papers.drop_duplicates(subset=['doi']) 250 | 251 | # Filter by language 252 | if 'language' in papers: 253 | papers = papers[papers['language'].str.contains('en')] 254 | 255 | except (ValueError, TypeError) as e: 256 | # User-friendly message explaining what's happening 257 | context = create_error_context( 258 | "springer", "_filter_papers", 259 | ErrorSeverity.WARNING, 260 | ErrorCategory.DATA, 261 | f"Data type error during Springer paper filtering: {type(e).__name__}: {str(e)}" 262 | ) 263 | error_info = get_standard_error_info("data_validation_failed") 264 | ErrorHandler.handle_error(e, context, error_info, self.logger) 265 | # Continue with unfiltered papers rather than failing completely 266 | except KeyError as e: 267 | # User-friendly message 
explaining what's happening 268 | context = create_error_context( 269 | "springer", "_filter_papers", 270 | ErrorSeverity.WARNING, 271 | ErrorCategory.DATA, 272 | f"Missing required column during Springer paper filtering: {type(e).__name__}: {str(e)}" 273 | ) 274 | error_info = get_standard_error_info("data_validation_failed") 275 | ErrorHandler.handle_error(e, context, error_info, self.logger) 276 | # Return papers as-is to prevent complete failure 277 | except Exception as ex: 278 | # User-friendly message explaining what's happening 279 | context = create_error_context( 280 | "springer", "_filter_papers", 281 | ErrorSeverity.ERROR, 282 | ErrorCategory.DATA, 283 | f"Unexpected error during Springer paper filtering: {type(ex).__name__}: {str(ex)}" 284 | ) 285 | error_info = get_standard_error_info("unexpected_error") 286 | ErrorHandler.handle_error(ex, context, error_info, self.logger) 287 | # Return papers as-is to prevent complete failure 288 | 289 | return papers 290 | 291 | def _clean_papers(self, papers: pd.DataFrame) -> pd.DataFrame: 292 | """Clean and standardize paper data.""" 293 | self.logger.info(LogCategory.DATA, "springer", "_clean_papers", "Cleaning papers...") 294 | try: 295 | # Extract URLs 296 | urls = [] 297 | if 'url' in papers: 298 | for paper in papers['url']: 299 | url = paper[0]['value'] 300 | urls.append(url) 301 | 302 | # Remove unnecessary columns 303 | papers = papers.drop(columns=['url', 'creators', 'bookEditors', 'openaccess', 'printIsbn', 'electronicIsbn', 304 | 'isbn', 'genre', 'copyright', 'conferenceInfo', 'issn', 'eIssn', 'volume', 305 | 'publicationType', 'number', 'issueType', 'topicalCollection', 'startingPage', 306 | 'endingPage', 'language', 'journalId', 'printDate', 'response', 'onlineDate', 307 | 'coverDate', 'keyword'], 308 | errors='ignore') 309 | 310 | # Add cleaned URLs 311 | if len(urls) > 0: 312 | papers.loc[:, 'url'] = urls 313 | else: 314 | papers['url'] = '' 315 | 316 | # Clean empty values 317 | papers.replace('', float("NaN"), inplace=True) 318 | papers.dropna(how='all', axis=1, inplace=True) 319 | 320 | except (ValueError, TypeError) as e: 321 | # User-friendly message explaining what's happening 322 | context = create_error_context( 323 | "springer", "_clean_papers", 324 | ErrorSeverity.WARNING, 325 | ErrorCategory.DATA, 326 | f"Data type error during Springer paper cleaning: {type(e).__name__}: {str(e)}" 327 | ) 328 | error_info = get_standard_error_info("data_validation_failed") 329 | ErrorHandler.handle_error(e, context, error_info, self.logger) 330 | # Continue with uncleaned papers rather than failing completely 331 | except KeyError as e: 332 | # User-friendly message explaining what's happening 333 | context = create_error_context( 334 | "springer", "_clean_papers", 335 | ErrorSeverity.WARNING, 336 | ErrorCategory.DATA, 337 | f"Missing required column during Springer paper cleaning: {type(e).__name__}: {str(e)}" 338 | ) 339 | error_info = get_standard_error_info("data_validation_failed") 340 | ErrorHandler.handle_error(e, context, error_info, self.logger) 341 | # Return papers as-is to prevent complete failure 342 | except (IndexError, AttributeError) as e: 343 | # User-friendly message explaining what's happening 344 | context = create_error_context( 345 | "springer", "_clean_papers", 346 | ErrorSeverity.WARNING, 347 | ErrorCategory.DATA, 348 | f"URL extraction error during Springer paper cleaning: {type(e).__name__}: {str(e)}" 349 | ) 350 | error_info = get_standard_error_info("data_validation_failed") 351 | 
ErrorHandler.handle_error(e, context, error_info, self.logger) 352 | # Continue with empty URLs rather than failing completely 353 | except Exception as ex: 354 | # User-friendly message explaining what's happening 355 | context = create_error_context( 356 | "springer", "_clean_papers", 357 | ErrorSeverity.ERROR, 358 | ErrorCategory.DATA, 359 | f"Unexpected error during Springer paper cleaning: {type(ex).__name__}: {str(ex)}" 360 | ) 361 | error_info = get_standard_error_info("unexpected_error") 362 | ErrorHandler.handle_error(ex, context, error_info, self.logger) 363 | # Return papers as-is to prevent complete failure 364 | 365 | return papers 366 | 367 | def _get_abstracts(self, papers: pd.DataFrame) -> pd.DataFrame: 368 | """Get abstracts for papers.""" 369 | return papers  # abstracts already arrive with the Springer metadata records (see _filter_papers), so no extra retrieval is needed --------------------------------------------------------------------------------
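For reference, the sketch below is a minimal, hypothetical example (not a file in the repository) showing the config.json shape that SpringerClient.__init__ looks for and a direct call to its request-planning step. The query, fields, types, and date arguments are illustrative placeholders only, and calling _plan_requests directly bypasses the public workflow that the DatabaseClient base class (base_client.py, not shown above) normally drives from the parameters_*.yaml files.

    # Hypothetical usage sketch -- assumes it is run from the project root
    # so that SpringerClient.__init__ can find ./config.json.
    #
    # A matching config.json would look like:
    #     {
    #         "api_access_springer": "YOUR_SPRINGER_API_KEY"
    #     }
    from clients.springer import SpringerClient

    client = SpringerClient()
    print(client._has_api_access())  # False if ./config.json or the key is missing

    # Illustrative arguments only: in the normal workflow the DatabaseClient
    # base class supplies these values from the parameters_*.yaml files, so
    # this direct call just exercises the planning and quota logic in isolation.
    papers = client._plan_requests(
        query={"augmented reality": "'augmented reality' & 'edge'"},
        syntactic_filters=[],
        synonyms={},
        fields=["title"],
        types=["journal"],
        dates=False,
        start_date=None,
        end_date=None,
    )
    print(f"{len(papers)} papers retrieved")

If the planned number of requests exceeds the client's daily quota (500 for Springer), _plan_requests logs a warning and returns an empty DataFrame, so an empty result here does not necessarily mean the query matched nothing.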