├── tests ├── __init__.py └── test_newscatcherapi_client.py ├── requirements.txt ├── newscatcherapi ├── __init__.py ├── newscatcherapi_auth.py ├── newscatcherapi_exception.py ├── const.py ├── utils.py └── newscatcherapi_client.py ├── pyproject.toml ├── setup.py ├── LICENSE └── README.md /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | requests>=2.24.0 2 | dateparser==1.1.2 -------------------------------------------------------------------------------- /newscatcherapi/__init__.py: -------------------------------------------------------------------------------- 1 | from newscatcherapi.newscatcherapi_client import NewsCatcherApiClient -------------------------------------------------------------------------------- /newscatcherapi/newscatcherapi_auth.py: -------------------------------------------------------------------------------- 1 | from requests.auth import AuthBase 2 | 3 | 4 | class NewsCatcherApiAuth(AuthBase): 5 | # Provided by NewsCatcher: https://docs.newscatcherapi.com/api-docs/authentication 6 | def __init__(self, x_api_key): 7 | self.x_api_key = x_api_key 8 | 9 | def __call__(self, request): 10 | request.headers.update(get_auth_headers(self.x_api_key)) 11 | return request 12 | 13 | 14 | def get_auth_headers(x_api_key): 15 | return {"Content-Type": "Application/JSON", "x-api-key": x_api_key} 16 | -------------------------------------------------------------------------------- /newscatcherapi/newscatcherapi_exception.py: -------------------------------------------------------------------------------- 1 | class NewsCatcherApiException(Exception): 2 | """Represents an ``error`` response status value from NewsCatcher News API.""" 3 | 4 | def __init__(self, exception): 5 | self.exception = exception 6 | 7 | def get_exception(self): 8 | return self.exception 9 | 10 | def get_status(self): 11 | if self.exception["status"]: 12 | return self.exception["status"] 13 | 14 | def get_code(self): 15 | if self.exception["error_code"]: 16 | return self.exception["error_code"] 17 | 18 | def get_message(self): 19 | if self.exception["message"]: 20 | return self.exception["message"] 21 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "newscatcherapi" 3 | version = "0.7.3" 4 | description = "NewsCatcher News API V2 SDK for Python" 5 | authors = ["Maksym Sugonyaka ", 6 | "Artem Bugara "] 7 | readme = "README.md" 8 | 9 | homepage = "https://newscatcherapi.com/" 10 | license = "MIT" 11 | keywords = ["News", "RSS", "Scraping", "Data Mining", "News Extraction"] 12 | 13 | [tool.poetry.dependencies] 14 | python = ">=3.6.0" 15 | requests = ">=2.24.0" 16 | dateparser= ">=0.7.6" 17 | 18 | [tool.poetry.dev-dependencies] 19 | pytest = "^5.2" 20 | requests="^2.24.0" 21 | dateparser="^1.1.1" 22 | 23 | [build-system] 24 | requires = ["poetry-core>=1.0.0"] 25 | build-backend = "poetry.core.masonry.api" 26 | -------------------------------------------------------------------------------- /newscatcherapi/const.py: -------------------------------------------------------------------------------- 1 | """Constants and allowed parameter values specified in the NewsCatcher News API.""" 2 | 3 | LATEST_HEADLINES_URL = 
"/v2/latest_headlines" 4 | SEARCH_URL = "/v2/search" 5 | SOURCES_URL = "/v2/sources" 6 | 7 | #: The 2-letter ISO-639-1 code of the language you want to get articles for. 8 | allowed_languages = 'af,ar,bg,bn,ca,cs,cy,cn,da,de,el,en,es,et,fa,fi,fr,gu,he,hi,hr,hu,id,it,ja,kn,ko,lt,lv,mk,ml,mr,ne,nl,no,pa,pl,pt,ro,ru,sk,sl,so,sq,sv,sw,ta,te,th,tl,tr,tw,uk,ur,vi'.split(',') 9 | 10 | #: The topic you want to get articles for. 11 | allowed_topics = 'news,sport,tech,world,finance,politics,business,economics,entertainment,beauty,travel,music,food,science,gaming,energy'.split(',') 12 | 13 | # Date precisions 14 | allowed_precisions = 'timezone unknown,full,date'.split(',') 15 | 16 | # Search In 17 | allowed_search_ins = ['title', 'summary', 'title,summary'] 18 | 19 | #: The order to sort article results in. If not specified, the default is ``"relevancy"``. 20 | allowed_sorts = ['relevancy', 'date', 'rank'] 21 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # PyPI upload: 4 | # 5 | # $ python -m pip install --upgrade twine wheel 6 | # $ python setup.py sdist bdist_wheel --universal 7 | # $ twine upload dist/* 8 | # 9 | # Install in development: 10 | # 11 | # $ python3 -m pip install -e . 12 | 13 | from setuptools import setup, find_packages 14 | 15 | VERSION = "0.7.3" 16 | INSTALL_REQUIRES = ["requests>=2.24.0", "dateparser"] 17 | TESTS_REQUIRE = ["pytest"] 18 | 19 | if __name__ == "__main__": 20 | setup( 21 | name="newscatcherapi", 22 | version=VERSION, 23 | author="Maksym Sugonyaka", 24 | author_email="maksym@newscatcherapi.com", 25 | url="https://github.com/NewscatcherAPI/newscatcherapi-sdk-python", 26 | packages=find_packages(), 27 | install_requires=INSTALL_REQUIRES, 28 | tests_require=TESTS_REQUIRE, 29 | description="An official Python client for the NewsCatcher News API", 30 | download_url="", 31 | keywords=["newscatcherapi", "news"], 32 | ) 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 newscatcherapi.com 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /newscatcherapi/utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from newscatcherapi import const 3 | 4 | import sys 5 | 6 | def validate_language(language): 7 | if is_valid_list(language): 8 | for each_lang in language: 9 | if each_lang.strip().lower() not in const.allowed_languages: 10 | raise ValueError(f"{each_lang} - is an invalid language. Language should be one of this list => {str(const.allowed_languages)}") 11 | return ','.join([i.strip().lower() for i in language]) 12 | elif is_valid_string(language): 13 | language_clean = [i.strip().lower() for i in language.split(',')] 14 | for each_lang in language_clean: 15 | if each_lang not in const.allowed_languages: 16 | raise ValueError(f"{each_lang} - is an invalid language. Language should be one of this list => {str(const.allowed_languages)}") 17 | return ','.join(language_clean) 18 | else: 19 | raise TypeError("lang parameter should be of type str or list") 20 | 21 | 22 | def validate_countries(list_countries, name_parameter): 23 | if is_valid_list(list_countries): 24 | valid_countries = [i.strip().upper() for i in list_countries] 25 | return ','.join(valid_countries) 26 | elif is_valid_string(list_countries): 27 | valid_countries = [i.strip().upper() for i in list_countries.split(',')] 28 | return ','.join(valid_countries) 29 | else: 30 | raise TypeError(f"{name_parameter} parameter should be of type str or list") 31 | 32 | def validate_topic(topic): 33 | if is_valid_string(topic): 34 | if topic in const.allowed_topics: 35 | return topic 36 | else: 37 | raise ValueError( 38 | f"{topic} - is an unsupported topic. Topic should be one of this list => {str(const.allowed_topics)}") 39 | else: 40 | raise TypeError(f"topic parameter should be of type str") 41 | 42 | 43 | def validate_sources(list_sources, name_parameter): 44 | if is_valid_list(list_sources): 45 | valid_sources = [i.strip().lower() for i in list_sources] 46 | return ','.join(valid_sources) 47 | elif is_valid_string(list_sources): 48 | valid_sources = [i.strip().lower() for i in list_sources.split(',')] 49 | return ','.join(valid_sources) 50 | else: 51 | raise TypeError(f"{name_parameter} parameter should be of type str or list") 52 | 53 | def validate_when(when, name_parameter): 54 | if is_valid_string(when): 55 | if when[len(when)-1] in ['d', 'h']: 56 | return when 57 | else: 58 | raise TypeError(f"{name_parameter} parameter should be the next form: 30d or 24h ") 59 | else: 60 | raise TypeError(f"{name_parameter} parameter should be of type str") 61 | 62 | 63 | PY2 = sys.version_info[0] == 2 64 | PY3 = sys.version_info[0] == 3 65 | 66 | if PY3: 67 | 68 | def is_valid_string(var): 69 | return isinstance(var, str) 70 | 71 | def is_valid_num(var): 72 | return isinstance(var, (int, float)) 73 | 74 | def is_valid_list(var): 75 | return isinstance(var, list) 76 | 77 | def is_valid_boolean(var): 78 | return isinstance(var, bool) 79 | 80 | elif PY2: 81 | 82 | def is_valid_string(var): 83 | return isinstance(var, basestring) 84 | 85 | def is_valid_num(var): 86 | return isinstance(var, (int, float, long)) 87 | 88 | 89 | else: 90 | 91 | def is_valid_string(var): 92 | raise SystemError("unsupported version of python detected (supported versions: 2, 3)") 93 | 94 | 95 | # function for updating the response dict/object 96 | def update_final_res(results, payload): 97 | if 'articles' not in results.keys(): 98 | 
return True 99 | 100 | if 'articles' not in payload.keys(): 101 | payload['articles'] = results['articles'] 102 | else: 103 | payload['articles'].extend(results['articles']) 104 | 105 | payload['total_hits'] += results['total_hits'] 106 | payload['total_pages'] += results['total_pages'] 107 | payload['page'] += results['page'] 108 | return False 109 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # NewsCatcher News API V2 SDK for Python 2 | 3 | The official Python client library to manipulate [NewsCatcher News API V2](https://newscatcherapi.com/news-api) from your Python application. 4 | 5 | The documentation is identical to the API documentation: the same parameters and filters are available, 6 | and the response structure is the same. You can have a look at [docs.newscatcherapi.com](https://docs.newscatcherapi.com). 7 | 8 | ## Authentication 9 | 10 | Authentication is done via the `x_api_key` variable. 11 | 12 | Receive your API key by registering at [app.newscatcherapi.com](https://app.newscatcherapi.com). 13 | 14 | ## Installation 15 | ```pip install newscatcherapi``` 16 | 17 | ## Quick Start 18 | Import the installed package. 19 | 20 | `````from newscatcherapi import NewsCatcherApiClient````` 21 | 22 | Initialize the instance with the API key you received after registration. 23 | 24 | ````newscatcherapi = NewsCatcherApiClient(x_api_key='YOUR_API_KEY') ```` 25 | 26 | ## Endpoints 27 | An instance of `NewsCatcherApiClient` has three main methods that correspond to the three endpoints available in the NewsCatcher News API. 28 | 29 | ### Get News (/v2/search) 30 | The main method, which allows you to find news articles by keyword, date, language, country, etc. 31 | 32 | ``` 33 | all_articles = newscatcherapi.get_search(q='Elon Musk', 34 | lang='en', 35 | countries='CA', 36 | page_size=100) 37 | ``` 38 | 39 | ### Get News Extracting All Pages (/v2/search) 40 | It is the same method as *get_search*, but it extracts all pages for you, so you do not have to change the `page` param manually. 41 | 42 | For example: a given search matches 1000 articles. *get_search* makes one API call and returns up to 100 articles. 43 | *get_search_all_pages* will make 10 API calls and will return all 1000 articles. 44 | 45 | Two new parameters: 46 | - `max_page` - The last page number to extract. Use it when you want to limit the number of extracted pages. 47 | - `seconds_pause` - Number of seconds to wait before each call. This parameter helps you deal with the rate limit on your subscription plan. By default, it is set to 1 second. 48 | 49 | ``` 50 | all_articles = newscatcherapi.get_search_all_pages(q='Elon Musk', 51 | lang='en', 52 | countries='CA', 53 | page_size=100, 54 | max_page=10, 55 | seconds_pause=1.0 56 | ) 57 | ``` 58 | 
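The merged result is a regular Python dict in the same shape as a single */v2/search* response, with the articles from every extracted page gathered under the `articles` key. A minimal sketch of reading it (`total_hits` and `articles` are keys of the API response dict):

```
print(all_articles['total_hits'], 'articles matched the search')
print(len(all_articles['articles']), 'articles were extracted')
```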
59 | 60 | ### Get News Extracting All Articles (/v2/search) 61 | It is the same method as *get_search*, but it fetches all articles for you, so you do not have to change the `page`, `from_`, and `to_` params manually. 62 | 63 | For example: a given search matches more than 10000 articles. *get_search* makes one API call and returns up to 100 articles. 64 | *get_search_all_pages* will make 100 API calls and will return 10000 articles. The *get_search_all_articles* method will return all of them. 65 | 66 | 67 | One new parameter: 68 | - `by` - How to divide the time interval between `to_` and `from_` in order to extract all articles for the given search query. By default it is set to `week`. Accepted values: `month`, `week`, `day`, `hour`. 69 | 70 | ``` 71 | all_articles = newscatcherapi.get_search_all_articles(q='Elon Musk', 72 | lang='en', 73 | countries='CA', 74 | page_size=100, 75 | by='day' 76 | ) 77 | ``` 78 | 79 | ### Get Latest Headlines (/v2/latest_headlines) 80 | Get the latest headlines given any topic, country, sources, or language. 81 | 82 | ``` 83 | top_headlines = newscatcherapi.get_latest_headlines(lang='en', 84 | countries='us', 85 | topic='business') 86 | ``` 87 | 88 | ### Get Latest Headlines Extracting All Pages (/v2/latest_headlines) 89 | It is the same function as *get_latest_headlines*, but it extracts all pages for you, so you do not have to change the `page` param manually. 90 | 91 | For example: a given search matches 1000 articles. *get_latest_headlines* makes one API call and returns up to 100 articles. 92 | *get_latest_headlines_all_pages* will make 10 API calls and will return all 1000 articles. 93 | 94 | Two new parameters: 95 | - `max_page` - The last page number to extract. Use it when you want to limit the number of extracted pages. 96 | - `seconds_pause` - Number of seconds to wait before each call. This parameter helps you deal with the rate limit on your subscription plan. By default, it is set to 1 second. 97 | 98 | ``` 99 | top_headlines = newscatcherapi.get_latest_headlines_all_pages(lang='en', 100 | countries='us', 101 | topic='business', 102 | max_page=10, 103 | seconds_pause=1.0 104 | ) 105 | ``` 106 | 107 | ### Get Sources (/v2/sources) 108 | Returns a list of the top 100 supported news websites. Overall, we support over 60,000 websites. Using this method, you can find the top 100 for your specific language, country, and topic combination. 109 | 110 | ``` 111 | sources = newscatcherapi.get_sources(topic='business', 112 | lang='en', 113 | countries='US') 114 | ``` 115 | 116 | ### Every endpoint supports the _proxies_ parameter 117 | If you want to use proxies, you can pass this parameter to any of the endpoints. 118 | Here is an example of a valid `proxies` parameter and how to use it with one of the endpoints. 119 | 120 | ``` 121 | proxies = { 122 | 'http': 'http://proxy.example.com:8080', 123 | 'https': 'http://secureproxy.example.com:8090', 124 | } 125 | 126 | all_articles = newscatcherapi.get_search(q='Elon Musk', 127 | lang='en', 128 | countries='CA', 129 | page_size=100, 130 | proxies=proxies) 131 | ``` 132 | 133 | 
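### Catching errors with *NewsCatcherApiException*
When the API responds with a non-OK status, every method raises a `NewsCatcherApiException` that wraps the JSON error payload returned by the API (see `newscatcherapi_exception.py`). A minimal sketch of handling it:

```
from newscatcherapi.newscatcherapi_exception import NewsCatcherApiException

try:
    all_articles = newscatcherapi.get_search(q='Elon Musk', lang='en')
except NewsCatcherApiException as e:
    # The exception carries the parsed error body from the API
    print(e.get_status(), e.get_code(), e.get_message())
```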
134 | ### Use *from_* and *to_* instead of *from* and *to* like in NewsCatcher News API 135 | In Python, *from* is a reserved keyword, so it cannot be used as a parameter name (*to* is renamed as well, for consistency). If you try to use *from* as a keyword argument, you will get a syntax error: 136 | 137 | ```SyntaxError: invalid syntax``` 138 | 139 | So, here is an example of how to use the time variables *from_* and *to_* in the *get_search* method. 140 | 141 | ``` 142 | all_articles = newscatcherapi.get_search(q='Elon Musk', 143 | lang='en', 144 | countries='CA,US', 145 | from_='2021/08/20', 146 | to_='2021/08/31') 147 | ``` 148 | 149 | ## Feedback 150 | 151 | Feel free to contact us at maksym`[at]`newscatcherapi.com if you spot a bug or have any suggestions. 152 | -------------------------------------------------------------------------------- /tests/test_newscatcherapi_client.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | 4 | from newscatcherapi.newscatcherapi_client import NewsCatcherApiClient 5 | 6 | 7 | class NewsCatcherApiTest(unittest.TestCase): 8 | def setUp(self): 9 | key = os.environ.get("newscatcher_api_secret") 10 | self.api = NewsCatcherApiClient(key) 11 | 12 | def test_api_latest_headlines(self): 13 | # Raise TypeError if lang is not of type str 14 | lang = 1 15 | with self.assertRaises(TypeError): 16 | self.api.get_latest_headlines(lang=lang) 17 | 18 | # Raise ValueError if lang is not in list 19 | lang = 'aer' 20 | with self.assertRaises(ValueError): 21 | self.api.get_latest_headlines(lang=lang) 22 | 23 | # Raise TypeError if not_lang is not of type str 24 | not_lang = 1 25 | with self.assertRaises(TypeError): 26 | self.api.get_latest_headlines(not_lang=not_lang) 27 | 28 | # Raise ValueError if not_lang is not in list 29 | not_lang = 'aer' 30 | with self.assertRaises(ValueError): 31 | self.api.get_latest_headlines(not_lang=not_lang) 32 | 33 | # Raise TypeError if sources param is not of type str 34 | sources = 0 35 | with self.assertRaises(TypeError): 36 | self.api.get_latest_headlines(sources=sources) 37 | 38 | # Raise TypeError if countries param is not of type str 39 | countries = 0 40 | with self.assertRaises(TypeError): 41 | self.api.get_latest_headlines(countries=countries) 42 | 43 | # Raises TypeError if topic param is not of type str 44 | topic = 0 45 | with self.assertRaises(TypeError): 46 | self.api.get_latest_headlines(topic=topic) 47 | 48 | # Raises ValueError if topic param is invalid 49 | topic = "dogcoin" 50 | with self.assertRaises(ValueError): 51 | self.api.get_latest_headlines(topic=topic) 52 | 53 | # Raises TypeError if page_size param is not an int 54 | page_size = "1" 55 | with self.assertRaises(TypeError): 56 | self.api.get_latest_headlines(page_size=page_size) 57 | 58 | # Raises ValueError if page_size param is less than zero(0) or greater than 100 59 | page_size = -1 60 | with self.assertRaises(ValueError): 61 | self.api.get_latest_headlines(page_size=page_size) 62 | 63 | page_size = 1000 64 | with self.assertRaises(ValueError): 65 | self.api.get_latest_headlines(page_size=page_size) 66 | 67 | # Raises a TypeError if page param is not an int 68 | page = "1" 69 | with self.assertRaises(TypeError): 70 | self.api.get_latest_headlines(page=page) 71 | 72 | # Raises a ValueError if page param is less than zero(0) 73 | page = -1 74 | with self.assertRaises(ValueError): 75 | self.api.get_latest_headlines(page=page) 76 | 77 | def test_api_get_search(self): 78 | # Raise TypeError if q param is not of type str 79 | q = 0 80 | with self.assertRaises(TypeError): 81 | self.api.get_search(q=q) 82 | 83 | # Raise TypeError if lang is not of type str 84 | lang = 1 85 | with self.assertRaises(TypeError): 86 | self.api.get_search(lang=lang) 87 | 88 | # Raise ValueError if lang is not in list 89 | lang = 'aer' 90 | with self.assertRaises(ValueError): 91 | self.api.get_search(lang=lang) 92 | 93 | # Raise TypeError if not_lang is not of type str 94 | 
not_lang = 1 95 | with self.assertRaises(TypeError): 96 | self.api.get_search(not_lang=not_lang) 97 | 98 | # Raise ValueError if lang is not in list 99 | not_lang = 'aer' 100 | with self.assertRaises(ValueError): 101 | self.api.get_search(not_lang=not_lang) 102 | 103 | # Raise TypeError if sources param is not of type str 104 | sources = 0 105 | with self.assertRaises(TypeError): 106 | self.api.get_search(sources=sources) 107 | 108 | # Raise TypeError if country param is not of type str 109 | countries = 0 110 | with self.assertRaises(TypeError): 111 | self.api.get_search(countries=countries) 112 | 113 | # Raise TypeError if not_countries param is not of type str 114 | not_countries = 0 115 | with self.assertRaises(TypeError): 116 | self.api.get_search(not_countries=not_countries) 117 | 118 | # Raises TypeError if topic param is not of type str 119 | topic = 0 120 | with self.assertRaises(TypeError): 121 | self.api.get_search(topic=topic) 122 | 123 | # Raises ValueError if category param is invalid 124 | topic = "dogcoin" 125 | with self.assertRaises(ValueError): 126 | self.api.get_search(topic=topic) 127 | 128 | # Raises TypeError if page_size param is not an int 129 | page_size = "1" 130 | with self.assertRaises(TypeError): 131 | self.api.get_search(page_size=page_size) 132 | 133 | # Raises ValueError if page_size param is less than zero(0) or greater than 100 134 | page_size = -1 135 | with self.assertRaises(ValueError): 136 | self.api.get_search(page_size=page_size) 137 | 138 | page_size = 1000 139 | with self.assertRaises(ValueError): 140 | self.api.get_search(page_size=page_size) 141 | 142 | # Raises a TypeError is page param is not an int 143 | page = "1" 144 | with self.assertRaises(TypeError): 145 | self.api.get_search(page=page) 146 | 147 | # Raises a ValueError if page param is less than zero(0) 148 | page = -1 149 | with self.assertRaises(ValueError): 150 | self.api.get_search(page=page) 151 | 152 | # Raise TypeError is sort_by param is not of type str 153 | sort_by = 1 154 | with self.assertRaises(TypeError): 155 | self.api.get_search(sort_by=sort_by) 156 | 157 | # Raise ValueError if soft_by param is invalid 158 | sort_by = "sort" 159 | with self.assertRaises(ValueError): 160 | self.api.get_search(sort_by=sort_by) 161 | 162 | 163 | # Raise ValueError if soft_by param is invalid 164 | published_date_precision = "score" 165 | with self.assertRaises(ValueError): 166 | self.api.get_search(published_date_precision=published_date_precision) 167 | 168 | # Raise ValueError if soft_by param is invalid 169 | search_in = "published_date" 170 | with self.assertRaises(ValueError): 171 | self.api.get_search(search_in=search_in) 172 | 173 | 174 | # Raises a TypeError is from_rank param is not an int 175 | from_rank = "1" 176 | with self.assertRaises(TypeError): 177 | self.api.get_search(from_rank=from_rank) 178 | 179 | # Raises a TypeError is from_rank param is not an int 180 | to_rank = "1" 181 | with self.assertRaises(TypeError): 182 | self.api.get_search(to_rank=to_rank) 183 | 184 | def test_api_get_sources(self): 185 | # Raise TypeError if not_countries param is not of type str 186 | not_countries = 0 187 | with self.assertRaises(TypeError): 188 | self.api.get_search(not_countries=not_countries) 189 | 190 | # Raises TypeError if topic param is not of type str 191 | topic = 0 192 | with self.assertRaises(TypeError): 193 | self.api.get_search(topic=topic) 194 | 195 | # Raises ValueError if category param is invalid 196 | topic = "dogcoin" 197 | with self.assertRaises(ValueError): 198 | 
self.api.get_search(topic=topic) 199 | -------------------------------------------------------------------------------- /newscatcherapi/newscatcherapi_client.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | 3 | import requests 4 | import os 5 | import sys 6 | import time 7 | from datetime import date, datetime, timedelta 8 | from dateparser import parse as parse_date 9 | 10 | sys.path.append(os.getcwd()) 11 | 12 | 13 | from newscatcherapi import const, utils 14 | from newscatcherapi.newscatcherapi_auth import NewsCatcherApiAuth 15 | from newscatcherapi.newscatcherapi_exception import NewsCatcherApiException 16 | 17 | 18 | class NewsCatcherApiClient(object): 19 | """The core client object used to fetch data from NewsCatcher News API endpoints. 20 | 21 | :param api_key: Your API key, a length-32 UUID string provided for your NewsCatcher News API account. 22 | You must `register `_ for a NewsCatcher News API key. 23 | :type api_key: str 24 | 25 | :param session: An optional :class:`requests.Session` instance from which to execute requests. 26 | **Note**: If you provide a ``session`` instance, :class:`NewsCatcherApiClient` will *not* close the session 27 | for you. Remember to call ``session.close()``, or use the session as a context manager, to close 28 | the socket and free up resources. 29 | :type session: `requests.Session `_ or None 30 | """ 31 | 32 | def __init__(self, x_api_key, base_url='https://api.newscatcherapi.com', session=None): 33 | self.auth = NewsCatcherApiAuth(x_api_key=x_api_key) 34 | self.base_url = base_url 35 | if session is None: 36 | self.request_method = requests 37 | else: 38 | self.request_method = session 39 | 40 | def get_latest_headlines( 41 | self, 42 | lang=None, 43 | not_lang=None, 44 | countries=None, 45 | not_countries=None, 46 | topic=None, 47 | sources=None, 48 | not_sources=None, 49 | when=None, 50 | ranked_only=None, 51 | page_size=None, 52 | page=None, 53 | proxies=None 54 | ): 55 | """Call the `/latest_headlines` endpoint. 56 | 57 | Fetch live top and breaking headlines. 58 | 59 | Get the latest headlines given any topic, country, or language. Articles are sorted by the earliest 60 | date published first. 61 | 62 | :param lang: Specifies the languages of the search. For example: `en`. The only accepted format is [ISO 639-1 — 2](https://en.wikipedia.org/wiki/ISO_639-1) letter code. 63 | :type lang: list or str or None 64 | 65 | :param not_lang: Inverse to the `lang` parameter 66 | :type not_lang: list or str or None 67 | 68 | :param countries: Countries where the news publisher is located. **Important**: This parameter is not responsible for the countries mentioned in the news article. One or multiple countries can be used in the search. The only acceptable format is [ISO 3166-1 alpha-2](https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2) For example, `US,CA,MX` or just `US` 69 | :type countries: list or str or None 70 | 71 | :param not_countries: The inverse of the `countries` parameter. 72 | :type not_countries: list or str or None 73 | 74 | :param topic: Accepted values: `news`, `sport`, `tech`, `world`, `finance`, `politics`, `business`, `economics`, `entertainment`, `beauty`, `travel`, `music`, `food`, `science`, `gaming`, `energy`. The topic to which you want to restrict the articles of your choice. Not all news articles are assigned with a topic, therefore, we cannot guarantee that 100% of topics talking about technology will be assigned a tech label. 
75 | :type topic: str or None 76 | 77 | :param sources: One or more news resources to filter your search. It should be the normal form of the URL, For example: `nytimes.com,theguardian.com` 78 | :type sources: list or str or None 79 | 80 | :param not_sources: One or more sources to be excluded from the search. Comma-separated list. For example: `nytimes.com,cnn.com,wsj.com` 81 | :type not_sources: list or str or None 82 | 83 | :param when: The time period you want to get the latest headlines for. Accepted forms: 7d => Dailly Form (last 7 days time period), 30d (last 30 days time period) | 1h => Hourly Form (last hour), 24h (last 24 hours) 84 | :type when: str or None 85 | 86 | :param ranked_only: Default: `True` Limit the search only for the sources which are in the top 1 million online websites. Unranked sources are assigned a rank that equals `999999` 87 | :type ranked_only: bool or None 88 | 89 | :param page_size: `[1:100]` How many articles to return per page. 90 | :type page_size: int or None 91 | 92 | :param page: The number of the page. Use it to scroll through the results. This parameter is used to paginate: scroll through results because one API response cannot return more than 100 articles. 93 | :type page: int or None 94 | 95 | :param proxies: Dict of proxies if needed 96 | :type proxies: dict or None 97 | 98 | :return: JSON response as nested Python dictionary. 99 | :rtype: dict 100 | :raises NewsCatcherApiException: If the ``"status"`` value of the response is ``"error"`` rather than ``"ok"``. 101 | """ 102 | 103 | payload = {} 104 | 105 | 106 | # Language 107 | if lang is not None: 108 | payload["lang"] = utils.validate_language(lang) 109 | 110 | if not_lang is not None: 111 | payload["not_lang"] = utils.validate_language(not_lang) 112 | 113 | # Countries 114 | if countries is not None: 115 | payload["countries"] = utils.validate_countries(countries, 'countries') 116 | 117 | if not_countries is not None: 118 | payload["not_countries"] = utils.validate_countries(not_countries, 'not_countries') 119 | 120 | # Topic 121 | if topic is not None: 122 | payload['topic'] = utils.validate_topic(topic) 123 | 124 | # Sources 125 | if sources is not None: 126 | payload["sources"] = utils.validate_sources(sources, 'sources') 127 | 128 | if not_sources is not None: 129 | payload["not_sources"] = utils.validate_sources(not_sources, 'not_sources') 130 | 131 | # When 132 | if when is not None: 133 | payload["when"] = utils.validate_when(when, 'when') 134 | 135 | # Ranks 136 | if ranked_only is not None: 137 | if utils.is_valid_boolean(ranked_only): 138 | payload['ranked_only'] = ranked_only 139 | else: 140 | raise TypeError("ranked_only parameter should be of type boolean") 141 | 142 | # Page and page sizes 143 | # Page Size 144 | if page_size is not None: 145 | if type(page_size) == int: 146 | payload["page_size"] = page_size 147 | else: 148 | raise TypeError("page_size param should be an int") 149 | 150 | # Page 151 | if page is not None: 152 | if type(page) == int: 153 | if page > 0: 154 | payload["page"] = page 155 | else: 156 | raise ValueError("page param should be an int greater than 0") 157 | else: 158 | raise TypeError("page param should be an int") 159 | 160 | # Send Request 161 | r = self.request_method.get(self.base_url + const.LATEST_HEADLINES_URL, auth=self.auth, timeout=60, params=payload, proxies=proxies) 162 | 163 | # Check Status of Request 164 | if r.status_code != requests.codes.ok: 165 | raise NewsCatcherApiException(r.json()) 166 | 167 | return r.json() 168 | 169 | def 
get_search( 170 | self, 171 | q=None, 172 | lang=None, 173 | not_lang=None, 174 | from_=None, 175 | to_=None, 176 | published_date_precision=None, 177 | search_in=None, 178 | countries=None, 179 | not_countries=None, 180 | topic=None, 181 | sources=None, 182 | not_sources=None, 183 | ranked_only=None, 184 | from_rank=None, 185 | to_rank=None, 186 | sort_by=None, 187 | page_size=None, 188 | page=None, 189 | proxies=None 190 | ): 191 | """Call the `/search` endpoint. 192 | 193 | Main endpoint that allows you to find news article by keyword, date, language, country, etc. 194 | 195 | :param q: Keyword/keywords you're searching for. This is the most important part of your query. Please, refer to the **Advanced Query Parameter** section below for more examples and explanations. (required) 196 | :type q: str or None 197 | 198 | :param lang: Specifies the languages of the search. For example: `en`. The only accepted format is [ISO 639-1 — 2](https://en.wikipedia.org/wiki/ISO_639-1) letter code. 199 | :type lang: list or str or None 200 | 201 | :param not_lang: Inverse to the `lang` parameter 202 | :type not_lang: list or str or None 203 | 204 | :param from_: `YYYY/mm/dd` From which point in time to start the search. The default timezone is UTC. Defaults to the past week. 205 | :type from_: str or None 206 | 207 | :param to_: `YYYY/mm/dd` Until which point in time to search for. The default timezone is UTC. 208 | :type to_: str or None 209 | 210 | :param published_date_precision: There are 3 types of date precision we define: `full` — day and time of an article is correctly identified with the appropriate timezone `timezone unknown` — day and time of an article is correctly identified without timezone `date` — only the day is identified without an exact time 211 | :type published_date_precision: str or None 212 | 213 | :param search_in: By default, we search what you specified in the `q` parameter in both `title` and `summary` of the article. However, you can limit this to either `title` or `summary` 214 | :type search_in: str or None 215 | 216 | :param countries: Countries where the news publisher is located. **Important**: This parameter is not responsible for the countries mentioned in the news article. One or multiple countries can be used in the search. The only acceptable format is [ISO 3166-1 alpha-2](https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2) For example, `US,CA,MX` or just `US` 217 | :type countries: list or str or None 218 | 219 | :param not_countries: The inverse of the `countries` parameter. 220 | :type not_countries: list or str or None 221 | 222 | :param topic: Accepted values: `news`, `sport`, `tech`, `world`, `finance`, `politics`, `business`, `economics`, `entertainment`, `beauty`, `travel`, `music`, `food`, `science`, `gaming`, `energy`. The topic to which you want to restrict the articles of your choice. Not all news articles are assigned with a topic, therefore, we cannot guarantee that 100% of topics talking about technology will be assigned a tech label. 223 | :type topic: str or None 224 | 225 | :param sources: One or more news resources to filter your search. It should be the normal form of the URL, For example: `nytimes.com,theguardian.com` 226 | :type sources: list or str or None 227 | 228 | :param not_sources: One or more sources to be excluded from the search. Comma-separated list. 
For example: `nytimes.com,cnn.com,wsj.com` 229 | :type not_sources: list or str or None 230 | 231 | :param ranked_only: Default: `True` Limit the search only for the sources which are in the top 1 million online websites. Unranked sources are assigned a rank that equals `999999` 232 | :type ranked_only: bool or None 233 | 234 | :param from_rank: `[0:999999]` The lowest boundary of the rank of a news website to filter by. Important: lower rank means that a source is more popular 235 | :type from_rank: int or None 236 | 237 | :param to_rank: `[0:999999]` The upper boundary of the rank of a news website to filter by. 238 | :type to_rank: int or None 239 | 240 | :param sort_by: `relevancy` (default value) — the most relevant results first `date` — the most recently published results first `rank` — the results from the highest-ranked sources first 241 | :type sort_by: str or None 242 | 243 | :param page_size: `[1:100]` How many articles to return per page. 244 | :type page_size: int or None 245 | 246 | :param page: The number of the page. Use it to scroll through the results. This parameter is used to paginate: scroll through results because one API response cannot return more than 100 articles. 247 | :type page: int or None 248 | 249 | :param proxies: Dict of proxies if needed 250 | :type proxies: dict or None 251 | 252 | 253 | :return: JSON response as nested Python dictionary. 254 | :rtype: dict 255 | :raises NewsCatcherApiException: If the ``"status"`` value of the response is ``"error"`` rather than ``"ok"``. 256 | """ 257 | 258 | payload = {} 259 | 260 | # Q 261 | if q is not None: 262 | if utils.is_valid_string(q): 263 | payload["q"] = q 264 | else: 265 | raise TypeError("q parameter should be of type str") 266 | 267 | # Language 268 | if lang is not None: 269 | payload["lang"] = utils.validate_language(lang) 270 | 271 | if not_lang is not None: 272 | payload["not_lang"] = utils.validate_language(not_lang) 273 | 274 | # Time variables 275 | if from_ is not None: 276 | if utils.is_valid_string(from_): 277 | payload["from"] = from_ 278 | else: 279 | raise TypeError("from_ parameter should be of type str") 280 | 281 | if to_ is not None: 282 | if utils.is_valid_string(to_): 283 | payload["to"] = to_ 284 | else: 285 | raise TypeError("to_ parameter should be of type str") 286 | 287 | if published_date_precision is not None: 288 | if utils.is_valid_string(published_date_precision): 289 | if published_date_precision in const.allowed_precisions: 290 | payload["published_date_precision"] = published_date_precision 291 | else: 292 | raise ValueError(f'{published_date_precision} is not a valid date precision. ' 293 | f'It should be one of the list: {str(const.allowed_precisions)}') 294 | else: 295 | raise TypeError("published_date_precision parameter should be of type str") 296 | 297 | # Search in 298 | if search_in is not None: 299 | if utils.is_valid_string(search_in): 300 | if search_in in const.allowed_search_ins: 301 | payload["search_in"] = search_in 302 | else: 303 | raise ValueError(f'{search_in} is not a valid place to search for keywords. 
' 304 | f'It should be one of the list: {str(const.allowed_search_ins)}') 305 | else: 306 | raise TypeError("search_in parameter should be of type str") 307 | 308 | # Countries 309 | if countries is not None: 310 | payload["countries"] = utils.validate_countries(countries, 'countries') 311 | 312 | if not_countries is not None: 313 | payload["not_countries"] = utils.validate_countries(not_countries, 'not_countries') 314 | 315 | # Topic 316 | if topic is not None: 317 | payload['topic'] = utils.validate_topic(topic) 318 | 319 | # Sources 320 | if sources is not None: 321 | payload["sources"] = utils.validate_sources(sources, 'sources') 322 | 323 | if not_sources is not None: 324 | payload["not_sources"] = utils.validate_sources(not_sources, 'not_sources') 325 | 326 | 327 | # Ranks 328 | if ranked_only is not None: 329 | if utils.is_valid_boolean(ranked_only): 330 | payload['ranked_only'] = ranked_only 331 | else: 332 | raise TypeError("ranked_only parameter should be of type boolean") 333 | 334 | if from_rank is not None: 335 | if utils.is_valid_num(from_rank): 336 | payload['from_rank'] = from_rank 337 | else: 338 | raise TypeError("from_rank parameter should be of type int") 339 | 340 | if to_rank is not None: 341 | if utils.is_valid_num(to_rank): 342 | payload['to_rank'] = to_rank 343 | else: 344 | raise TypeError("to_rank parameter should be of type int") 345 | 346 | # Sort by 347 | if sort_by is not None: 348 | if utils.is_valid_string(sort_by): 349 | if sort_by in const.allowed_sorts: 350 | payload["sort_by"] = sort_by 351 | else: 352 | raise ValueError(f'{sort_by} is not a valid sort by type. ' 353 | f'It should be one of the list: {str(const.allowed_sorts)}') 354 | else: 355 | raise TypeError("sort_by parameter should be of type str") 356 | 357 | # Page and page sizes 358 | # Page Size 359 | if page_size is not None: 360 | if type(page_size) == int: 361 | payload["page_size"] = page_size 362 | else: 363 | raise TypeError("page_size param should be an int") 364 | 365 | # Page 366 | if page is not None: 367 | if type(page) == int: 368 | if page > 0: 369 | payload["page"] = page 370 | else: 371 | raise ValueError("page param should be an int greater than 0") 372 | else: 373 | raise TypeError("page param should be an int") 374 | 375 | # Send Request 376 | r = self.request_method.get(self.base_url + const.SEARCH_URL, auth=self.auth, timeout=60, params=payload, proxies=proxies) 377 | 378 | # Check Status of Request 379 | if r.status_code != requests.codes.ok: 380 | raise NewsCatcherApiException(r.json()) 381 | 382 | return r.json() 383 | 384 | def get_sources(self, 385 | lang=None, 386 | countries=None, 387 | topic=None, 388 | proxies=None): 389 | """Call the `/sources` endpoint. 390 | 391 | Returns a list of the top 100 supported news websites. Overall, we support over 60,000 websites. Using this endpoint, you may find the top 100 for your specific language, country, topic combination. 392 | 393 | :param lang: Specifies the languages of the search. For example: `en`. The only accepted format is [ISO 639-1 — 2](https://en.wikipedia.org/wiki/ISO_639-1) letter code. 394 | :type lang: list or str or None 395 | 396 | :param countries: Countries where the news publisher is located. **Important**: This parameter is not responsible for the countries mentioned in the news article. One or multiple countries can be used in the search. 
The only acceptable format is [ISO 3166-1 alpha-2](https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2) For example, `US,CA,MX` or just `US` 397 | :type countries: list or str or None 398 | 399 | :param topic: Accepted values: `news`, `sport`, `tech`, `world`, `finance`, `politics`, `business`, `economics`, `entertainment`, `beauty`, `travel`, `music`, `food`, `science`, `gaming`, `energy`. The topic to which you want to restrict the articles of your choice. Not all news articles are assigned with a topic, therefore, we cannot guarantee that 100% of topics talking about technology will be assigned a tech label. 400 | :type topic: str or None 401 | 402 | :param proxies: Dict of proxies if needed 403 | :type proxies: dict or None 404 | 405 | :return: JSON response as nested Python dictionary. 406 | :rtype: dict 407 | :raises NewsCatcherApiException: If the ``"status"`` value of the response is ``"error"`` rather than ``"ok"``. 408 | 409 | """ 410 | 411 | payload = {} 412 | 413 | # Language 414 | if lang is not None: 415 | payload["lang"] = utils.validate_language(lang) 416 | 417 | # Countries 418 | if countries is not None: 419 | payload["countries"] = utils.validate_countries(countries, 'countries') 420 | 421 | # Topic 422 | if topic is not None: 423 | payload['topic'] = utils.validate_topic(topic) 424 | 425 | # Send Request 426 | r = self.request_method.get(self.base_url + const.SOURCES_URL, auth=self.auth, timeout=60, params=payload, proxies=proxies) 427 | 428 | # Check Status of Request 429 | if r.status_code != requests.codes.ok: 430 | raise NewsCatcherApiException(r.json()) 431 | 432 | return r.json() 433 | 434 | def get_latest_headlines_all_pages( 435 | self, 436 | lang=None, 437 | not_lang=None, 438 | countries=None, 439 | not_countries=None, 440 | topic=None, 441 | sources=None, 442 | not_sources=None, 443 | when=None, 444 | ranked_only=None, 445 | page_size=100, 446 | page=1, 447 | max_page=None, 448 | seconds_pause=1.0, 449 | proxies=None 450 | ): 451 | 452 | """Call the `/latest_headlines` endpoint the number of time sufficient to get all latest articles for a given search. 453 | 454 | Fetch live top and breaking headlines. 455 | 456 | Get the latest headlines given any topic, country, or language. Articles are sorted by the earliest 457 | date published first. All found articles will be extracted. 458 | 459 | :param lang: Specifies the languages of the search. For example: `en`. The only accepted format is [ISO 639-1 — 2](https://en.wikipedia.org/wiki/ISO_639-1) letter code. 460 | :type lang: list or str or None 461 | 462 | :param not_lang: Inverse to the `lang` parameter 463 | :type not_lang: list or str or None 464 | 465 | :param countries: Countries where the news publisher is located. **Important**: This parameter is not responsible for the countries mentioned in the news article. One or multiple countries can be used in the search. The only acceptable format is [ISO 3166-1 alpha-2](https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2) For example, `US,CA,MX` or just `US` 466 | :type countries: list or str or None 467 | 468 | :param not_countries: The inverse of the `countries` parameter. 469 | :type not_countries: list or str or None 470 | 471 | :param topic: Accepted values: `news`, `sport`, `tech`, `world`, `finance`, `politics`, `business`, `economics`, `entertainment`, `beauty`, `travel`, `music`, `food`, `science`, `gaming`, `energy`. The topic to which you want to restrict the articles of your choice. 
Not all news articles are assigned with a topic, therefore, we cannot guarantee that 100% of topics talking about technology will be assigned a tech label. 472 | :type topic: str or None 473 | 474 | :param sources: One or more news resources to filter your search. It should be the normal form of the URL, For example: `nytimes.com,theguardian.com` 475 | :type sources: list or str or None 476 | 477 | :param not_sources: One or more sources to be excluded from the search. Comma-separated list. For example: `nytimes.com,cnn.com,wsj.com` 478 | :type not_sources: list or str or None 479 | 480 | :param when: The time period you want to get the latest headlines for. Accepted forms: 7d => Dailly Form (last 7 days time period), 30d (last 30 days time period) | 1h => Hourly Form (last hour), 24h (last 24 hours) 481 | :type topic: str or None 482 | 483 | :param ranked_only: Default: `True` Limit the search only for the sources which are in the top 1 million online websites. Unranked sources are assigned a rank that equals `999999` 484 | :type ranked_only: bool or None 485 | 486 | :param page_size: `[1:100]` How many articles to return per page. 487 | :type page_size: int 488 | 489 | :param page: The number of the page. Use it to scroll through the results. This parameter is used to paginate: scroll through results because one API response cannot return more than 100 articles. 490 | :type page: int 491 | 492 | :param max_page: The last page number to extract. Use it to manage number of API calls and articles you are going to extract. For example, if you make a broad search with page_size=100 you will extract up to 10 000 articles and make 100 calls to do so. 493 | :type max_page: int or None 494 | 495 | :param seconds_pause: The number of seconds delay between each API call. For your subscription, you can have a rate limit on number of calls per second. 496 | :type seconds_pause: float 497 | 498 | :param proxies: Dict of proxies if needed 499 | :type proxies: dict or None 500 | 501 | :return: JSON response as nested Python dictionary. 502 | :rtype: dict 503 | :raises NewsCatcherApiException: If the ``"status"`` value of the response is ``"error"`` rather than ``"ok"``. 
504 | """ 505 | nb_pages = None 506 | if max_page is not None: 507 | if type(max_page) == int: 508 | if max_page >= page: 509 | nb_pages = max_page 510 | else: 511 | raise ValueError("max_page param should be greater than page param") 512 | else: 513 | raise TypeError("max_page param should be an int") 514 | 515 | all_articles = [] 516 | print(f'{str(page)} page is going to be extracted') 517 | first_result = self.get_latest_headlines( 518 | lang=lang, 519 | not_lang=not_lang, 520 | countries=countries, 521 | not_countries=not_countries, 522 | topic=topic, 523 | sources=sources, 524 | not_sources=not_sources, 525 | when=when, 526 | ranked_only=ranked_only, 527 | page_size=page_size, 528 | page=page, 529 | proxies=proxies 530 | ) 531 | 532 | time.sleep(seconds_pause) 533 | 534 | if 'articles' not in first_result.keys(): 535 | return first_result 536 | 537 | all_articles.extend(first_result['articles']) 538 | 539 | print(f'Total number of found articles => {first_result["total_hits"]}.\n' 540 | f'Total number of pages {first_result["total_pages"]}.') 541 | 542 | current_page = page 543 | 544 | if not nb_pages or (max_page and max_page > first_result["total_pages"]): 545 | nb_pages = first_result["total_pages"] 546 | 547 | while current_page < nb_pages: 548 | 549 | current_page += 1 550 | 551 | print(f'{str(current_page)}/{str(nb_pages)} page is going to be extracted') 552 | 553 | try: 554 | one_call_results = self.get_latest_headlines( 555 | lang=lang, 556 | not_lang=not_lang, 557 | countries=countries, 558 | not_countries=not_countries, 559 | topic=topic, 560 | sources=sources, 561 | not_sources=not_sources, 562 | when=when, 563 | ranked_only=ranked_only, 564 | page_size=page_size, 565 | page=current_page, 566 | proxies=proxies 567 | ) 568 | all_articles.extend(one_call_results['articles']) 569 | except NewsCatcherApiException as e: 570 | print(f'{current_page} has not been extracted due to an error') 571 | print(str(e)) 572 | pass 573 | 574 | time.sleep(seconds_pause) 575 | 576 | 577 | final_results = first_result 578 | final_results['articles'] = all_articles 579 | 580 | return final_results 581 | 582 | def get_search_all_pages( 583 | self, 584 | q=None, 585 | lang=None, 586 | not_lang=None, 587 | from_=None, 588 | to_=None, 589 | published_date_precision=None, 590 | search_in=None, 591 | countries=None, 592 | not_countries=None, 593 | topic=None, 594 | sources=None, 595 | not_sources=None, 596 | ranked_only=None, 597 | from_rank=None, 598 | to_rank=None, 599 | sort_by=None, 600 | page_size=100, 601 | page=1, 602 | max_page=None, 603 | seconds_pause=1.0, 604 | proxies=None 605 | ): 606 | """Call the `/search` endpoint the number of time sufficient to get all latest articles for a given search. 607 | 608 | Main endpoint that allows you to find news article by keyword, date, language, country, etc. 609 | 610 | :param q: Keyword/keywords you're searching for. This is the most important part of your query. Please, refer to the **Advanced Query Parameter** section below for more examples and explanations. (required) 611 | :type q: str or None 612 | 613 | :param lang: Specifies the languages of the search. For example: `en`. The only accepted format is [ISO 639-1 — 2](https://en.wikipedia.org/wiki/ISO_639-1) letter code. 614 | :type lang: list or str or None 615 | 616 | :param not_lang: Inverse to the `lang` parameter 617 | :type not_lang: list or str or None 618 | 619 | :param from_: `YYYY/mm/dd` From which point in time to start the search. The default timezone is UTC. 
Defaults to the past week. 620 | :type from_: str or None 621 | 622 | :param to_: `YYYY/mm/dd` Until which point in time to search for. The default timezone is UTC. 623 | :type to_: str or None 624 | 625 | :param published_date_precision: There are 3 types of date precision we define: `full` — day and time of an article is correctly identified with the appropriate timezone `timezone unknown` — day and time of an article is correctly identified without timezone `date` — only the day is identified without an exact time 626 | :type published_date_precision: str or None 627 | 628 | :param search_in: By default, we search what you specified in the `q` parameter in both `title` and `summary` of the article. However, you can limit this to either `title` or `summary` 629 | :type search_in: str or None 630 | 631 | :param countries: Countries where the news publisher is located. **Important**: This parameter is not responsible for the countries mentioned in the news article. One or multiple countries can be used in the search. The only acceptable format is [ISO 3166-1 alpha-2](https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2) For example, `US,CA,MX` or just `US` 632 | :type countries: list or str or None 633 | 634 | :param not_countries: The inverse of the `countries` parameter. 635 | :type not_countries: list or str or None 636 | 637 | :param topic: Accepted values: `news`, `sport`, `tech`, `world`, `finance`, `politics`, `business`, `economics`, `entertainment`, `beauty`, `travel`, `music`, `food`, `science`, `gaming`, `energy`. The topic to which you want to restrict the articles of your choice. Not all news articles are assigned with a topic, therefore, we cannot guarantee that 100% of topics talking about technology will be assigned a tech label. 638 | :type topic: str or None 639 | 640 | :param sources: One or more news resources to filter your search. It should be the normal form of the URL, For example: `nytimes.com,theguardian.com` 641 | :type sources: list or str or None 642 | 643 | :param not_sources: One or more sources to be excluded from the search. Comma-separated list. For example: `nytimes.com,cnn.com,wsj.com` 644 | :type not_sources: list or str or None 645 | 646 | :param ranked_only: Default: `True` Limit the search only for the sources which are in the top 1 million online websites. Unranked sources are assigned a rank that equals `999999` 647 | :type ranked_only: bool or None 648 | 649 | :param from_rank: `[0:999999]` The lowest boundary of the rank of a news website to filter by. Important: lower rank means that a source is more popular 650 | :type from_rank: int or None 651 | 652 | :param to_rank: `[0:999999]` The upper boundary of the rank of a news website to filter by. 653 | :type to_rank: int or None 654 | 655 | :param sort_by: `relevancy` (default value) — the most relevant results first `date` — the most recently published results first `rank` — the results from the highest-ranked sources first 656 | :type sort_by: str or None 657 | 658 | :param page_size: `[1:100]` How many articles to return per page. 659 | :type page_size: int or None 660 | 661 | :param page: The number of the page. Use it to scroll through the results. This parameter is used to paginate: scroll through results because one API response cannot return more than 100 articles. 662 | :type page: int or None 663 | 664 | :param max_page: The last page number to extract. Use it to manage number of API calls and articles you are going to extract. 
For example, if you make a broad search with page_size=100 you will extract up to 10 000 articles and make 100 calls to do so. 665 | :type max_page: int or None 666 | 667 | :param seconds_pause: The number of seconds delay between each API call. For your subscription, you can have a rate limit on number of calls per second. 668 | :type seconds_pause: float 669 | 670 | :param proxies: Dict of proxies if needed 671 | :type proxies: dict or None 672 | 673 | :return: JSON response as nested Python dictionary. 674 | :rtype: dict 675 | :raises NewsCatcherApiException: If the ``"status"`` value of the response is ``"error"`` rather than ``"ok"``. 676 | """ 677 | 678 | nb_pages = None 679 | if max_page is not None: 680 | if type(max_page) == int: 681 | if max_page >= page: 682 | nb_pages = max_page 683 | else: 684 | raise ValueError("max_page param should be greater than page param") 685 | else: 686 | raise TypeError("max_page param should be an int") 687 | 688 | all_articles = [] 689 | print(f'{str(page)} page is going to be extracted') 690 | first_result = self.get_search( 691 | q=q, 692 | lang=lang, 693 | not_lang=not_lang, 694 | from_=from_, 695 | to_=to_, 696 | published_date_precision=published_date_precision, 697 | search_in=search_in, 698 | countries=countries, 699 | not_countries=not_countries, 700 | topic=topic, 701 | sources=sources, 702 | not_sources=not_sources, 703 | ranked_only=ranked_only, 704 | from_rank=from_rank, 705 | to_rank=to_rank, 706 | sort_by=sort_by, 707 | page_size=page_size, 708 | page=page, 709 | proxies=proxies 710 | ) 711 | 712 | time.sleep(seconds_pause) 713 | 714 | if 'articles' not in first_result.keys(): 715 | return first_result 716 | 717 | all_articles.extend(first_result['articles']) 718 | 719 | print(f'Total number of found articles => {first_result["total_hits"]}.\n' 720 | f'Total number of pages {first_result["total_pages"]}.') 721 | 722 | current_page = page 723 | 724 | if not nb_pages or (max_page and max_page > first_result["total_pages"]): 725 | nb_pages = first_result["total_pages"] 726 | 727 | while current_page < nb_pages: 728 | 729 | current_page += 1 730 | 731 | print(f'{str(current_page)}/{str(nb_pages)} page is going to be extracted') 732 | 733 | try: 734 | one_call_results = self.get_search( 735 | q=q, 736 | lang=lang, 737 | not_lang=not_lang, 738 | from_=from_, 739 | to_=to_, 740 | published_date_precision=published_date_precision, 741 | search_in=search_in, 742 | countries=countries, 743 | not_countries=not_countries, 744 | topic=topic, 745 | sources=sources, 746 | not_sources=not_sources, 747 | ranked_only=ranked_only, 748 | from_rank=from_rank, 749 | to_rank=to_rank, 750 | sort_by=sort_by, 751 | page_size=page_size, 752 | page=current_page, 753 | proxies=proxies 754 | ) 755 | all_articles.extend(one_call_results['articles']) 756 | except NewsCatcherApiException as e: 757 | print(f'{current_page} has not been extracted due to an error') 758 | print(str(e)) 759 | pass 760 | 761 | time.sleep(seconds_pause) 762 | 763 | final_results = first_result 764 | final_results['articles'] = all_articles 765 | 766 | return final_results 767 | 768 | def get_search_all_articles( 769 | self, 770 | q=None, 771 | lang=None, 772 | not_lang=None, 773 | from_=None, 774 | to_=None, 775 | published_date_precision=None, 776 | search_in=None, 777 | countries=None, 778 | not_countries=None, 779 | topic=None, 780 | by='week', 781 | sources=None, 782 | not_sources=None, 783 | ranked_only=None, 784 | from_rank=None, 785 | to_rank=None, 786 | sort_by=None, 787 | 
788 |             page=1,
789 |             max_page=None,
790 |             seconds_pause=1.0,
791 |             proxies=None):
792 |
793 |         """Call the `/search` endpoint as many times as needed to get all the articles matching a given search.
794 |
795 |         Main endpoint that allows you to find news articles by keyword, date, language, country, etc.
796 |
797 |         :param q: Keyword/keywords you're searching for. This is the most important part of your query. Please refer to the **Advanced Query Parameter** section below for more examples and explanations. (required)
798 |         :type q: str or None
799 |
800 |         :param lang: Specifies the languages of the search. For example: `en`. The only accepted format is the 2-letter [ISO 639-1](https://en.wikipedia.org/wiki/ISO_639-1) code.
801 |         :type lang: list or str or None
802 |
803 |         :param not_lang: The inverse of the `lang` parameter.
804 |         :type not_lang: list or str or None
805 |
806 |         :param from_: `YYYY/mm/dd` From which point in time to start the search. The default timezone is UTC. Defaults to the past week.
807 |         :type from_: str or None
808 |
809 |         :param to_: `YYYY/mm/dd` Until which point in time to search for. The default timezone is UTC.
810 |         :type to_: str or None
811 |
812 |         :param published_date_precision: There are 3 types of date precision we define: `full` — day and time of an article is correctly identified with the appropriate timezone; `timezone unknown` — day and time of an article is correctly identified, without the timezone; `date` — only the day is identified, without an exact time.
813 |         :type published_date_precision: str or None
814 |
815 |         :param search_in: By default, we search what you specified in the `q` parameter in both the `title` and `summary` of the article. However, you can limit this to either `title` or `summary`.
816 |         :type search_in: str or None
817 |
818 |         :param countries: Countries where the news publisher is located. **Important**: This parameter is not responsible for the countries mentioned in the news article. One or multiple countries can be used in the search. The only acceptable format is [ISO 3166-1 alpha-2](https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2). For example, `US,CA,MX` or just `US`.
819 |         :type countries: list or str or None
820 |
821 |         :param not_countries: The inverse of the `countries` parameter.
822 |         :type not_countries: list or str or None
823 |
824 |         :param topic: Accepted values: `news`, `sport`, `tech`, `world`, `finance`, `politics`, `business`, `economics`, `entertainment`, `beauty`, `travel`, `music`, `food`, `science`, `gaming`, `energy`. The topic to which you want to restrict the articles of your choice. Not all news articles are assigned a topic, so we cannot guarantee that 100% of articles about technology will carry the `tech` label.
825 |         :type topic: str or None
826 |
827 |         :param sources: One or more news sources to filter your search by. Use the normal form of the URL. For example: `nytimes.com,theguardian.com`
828 |         :type sources: list or str or None
829 |
830 |         :param not_sources: One or more sources to be excluded from the search. Comma-separated list. For example: `nytimes.com,cnn.com,wsj.com`
831 |         :type not_sources: list or str or None
832 |
833 |         :param ranked_only: Default: `True`. Limit the search to sources ranked in the top 1 million online
834 |             websites. Unranked sources are assigned a rank that equals `999999`.
835 |         :type ranked_only: bool or None
836 |
837 |         :param from_rank: `[0:999999]` The lowest boundary of the rank of a news website to filter by. Important: a lower rank means that a source is more popular.
838 |         :type from_rank: int or None
839 |
840 |         :param to_rank: `[0:999999]` The upper boundary of the rank of a news website to filter by.
841 |         :type to_rank: int or None
842 |
843 |         :param sort_by: `relevancy` (default value) — the most relevant results first; `date` — the most recently published results first; `rank` — the results from the highest-ranked sources first.
844 |         :type sort_by: str or None
845 |
846 |         :param page_size: `[1:100]` How many articles to return per page.
847 |         :type page_size: int or None
848 |
849 |         :param page: The number of the page. Use it to paginate through the results, because one API response cannot return more than 100 articles.
850 |         :type page: int or None
851 |
852 |         :param max_page: The last page number to extract. Use it to manage the number of API calls and the number of articles you are going to extract. For example, if you make a broad search with `page_size=100`, you can extract up to 10 000 articles and make 100 calls to do so.
853 |         :type max_page: int or None
854 |
855 |         :param seconds_pause: The number of seconds to wait between API calls. Depending on your subscription, you may have a rate limit on the number of calls per second.
856 |         :type seconds_pause: float
857 |
858 |         :param proxies: Dict of proxies, if needed.
859 |         :type proxies: dict or None
860 |
861 |         :param by: Accepted values: `month`, `week`, `day`, `hour`. Default: `week`. How to divide the time interval between `from_` and `to_`.
862 |         :type by: str
863 |
864 |         :return: JSON response as nested Python dictionary.
865 |         :rtype: dict
866 |         :raises NewsCatcherApiException: If the ``"status"`` value of the response is ``"error"`` rather than ``"ok"``.
867 |         """
868 |         if not from_:
869 |             from_ = (datetime.utcnow() - timedelta(days=7)).strftime('%Y/%m/%d')
870 |         if not to_:
871 |             to_ = datetime.utcnow().strftime('%Y/%m/%d')
872 |
873 |         # create a timedelta corresponding to the by parameter
874 |         if by == 'month':
875 |             delta = timedelta(days=28)
876 |         elif by == 'week':
877 |             delta = timedelta(days=7)
878 |         elif by == 'day':
879 |             delta = timedelta(days=1)
880 |         elif by == 'hour':
881 |             delta = timedelta(hours=1)
882 |
883 |         # Convert the to_ and from_ parameters to datetime objects.
884 |         # Check if a time is specified and treat it accordingly
885 |         to_datetime = parse_date(to_, settings={'TIMEZONE': 'UTC'})
886 |
887 |         from_datetime = parse_date(from_, settings={'TIMEZONE': 'UTC'})
888 |
889 |         # the by interval can't be larger than `to_ - from_`
890 |         if to_datetime - from_datetime < delta:
891 |             raise ValueError("The 'by' parameter cannot be bigger than the difference between from_ and to_")
892 |
893 |         # initialize response dict/object
894 |         payload = {'status': '', 'total_hits': 0, 'page': 0, 'total_pages': 0, 'page_size': 0, 'articles': [], 'user_input': {}}
895 |
896 |         # flag so we can compensate for our midnight trickery in the from_ parameter
897 |         midnight_flag = False
898 |
899 |         while True:
900 |
901 |             if to_datetime - from_datetime <= delta:
902 |                 print(f'{from_datetime.strftime("%m/%d/%Y %H:%M:%S")} --> {to_datetime.strftime("%m/%d/%Y %H:%M:%S")}')
903 |                 results = self.get_search_all_pages(q=q,
904 |                                                     lang=lang,
905 |                                                     not_lang=not_lang,
906 |                                                     from_=from_datetime.strftime("%m/%d/%Y %H:%M:%S"),
907 |                                                     to_=to_datetime.strftime("%m/%d/%Y %H:%M:%S"),
908 |                                                     published_date_precision=published_date_precision,
909 |                                                     search_in=search_in,
910 |                                                     countries=countries,
911 |                                                     not_countries=not_countries,
912 |                                                     topic=topic,
913 |                                                     sources=sources,
914 |                                                     not_sources=not_sources,
915 |                                                     ranked_only=ranked_only,
916 |                                                     from_rank=from_rank,
917 |                                                     to_rank=to_rank,
918 |                                                     sort_by=sort_by,
919 |                                                     page_size=page_size,
920 |                                                     page=page,
921 |                                                     proxies=proxies,
922 |                                                     max_page=max_page,
923 |                                                     seconds_pause=seconds_pause)
924 |
925 |                 utils.update_final_res(results, payload)
926 |                 payload['page_size'] = page_size
927 |                 payload['user_input'] = results['user_input']
928 |                 payload['user_input']['by'] = by
929 |                 if len(payload['articles']) > 0:
930 |                     payload['status'] = 'ok'
931 |                 else:
932 |                     payload['status'] = 'No matches for your search.'
933 |
934 |                 return payload
935 |
936 |             # move the to_ parameter forward by one `by` interval
937 |             else:
938 |                 temp_to_ = from_datetime + delta
939 |
940 |                 # subtract 1 sec if the to_ parameter is exactly midnight
941 |                 if temp_to_.hour == 0 and temp_to_.minute == 0 and \
942 |                         temp_to_.second == 0:
943 |                     temp_to_ -= timedelta(seconds=1)
944 |                     midnight_flag = True
945 |
946 |                 print(f'{from_datetime.strftime("%m/%d/%Y %H:%M:%S")} --> {temp_to_.strftime("%m/%d/%Y %H:%M:%S")}')
947 |
948 |                 results = self.get_search_all_pages(q=q,
949 |                                                     lang=lang,
950 |                                                     not_lang=not_lang,
951 |                                                     from_=from_datetime.strftime("%m/%d/%Y %H:%M:%S"),
952 |                                                     to_=temp_to_.strftime("%m/%d/%Y %H:%M:%S"),
953 |                                                     published_date_precision=published_date_precision,
954 |                                                     search_in=search_in,
955 |                                                     countries=countries,
956 |                                                     not_countries=not_countries,
957 |                                                     topic=topic,
958 |                                                     sources=sources,
959 |                                                     not_sources=not_sources,
960 |                                                     ranked_only=ranked_only,
961 |                                                     from_rank=from_rank,
962 |                                                     to_rank=to_rank,
963 |                                                     sort_by=sort_by,
964 |                                                     page_size=page_size,
965 |                                                     page=page,
966 |                                                     proxies=proxies,
967 |                                                     max_page=max_page,
968 |                                                     seconds_pause=seconds_pause)
969 |
970 |                 utils.update_final_res(results, payload)
971 |
972 |                 # move the from_ parameter forward; if we subtracted a second
973 |                 # above, add it back and reset the flag
974 |                 from_datetime = temp_to_
975 |                 if midnight_flag:
976 |                     from_datetime += timedelta(seconds=1)
977 |                     midnight_flag = False
978 |
--------------------------------------------------------------------------------
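A minimal usage sketch of the pagination helper above, assuming the client is constructed with the same `x_api_key` that `NewsCatcherApiAuth` sends as the `x-api-key` header; the key and query below are placeholders, not values taken from the SDK:

from newscatcherapi import NewsCatcherApiClient

# Placeholder API key; replace with your own.
newscatcherapi = NewsCatcherApiClient(x_api_key='YOUR_API_KEY')

# Extract up to 3 pages of 100 articles each, sleeping 1 second between
# calls so a per-second rate limit is respected.
results = newscatcherapi.get_search_all_pages(
    q='Tesla',
    lang='en',
    page_size=100,
    page=1,
    max_page=3,
    seconds_pause=1.0,
)

# The helper returns the first response with its 'articles' list replaced
# by the articles accumulated across all requested pages.
print(results['total_hits'], len(results['articles']))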
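And a similar sketch for the time-sliced variant, which splits the `from_`/`to_` range into `by`-sized windows and aggregates every page of every window; the dates and key are again illustrative placeholders:

from newscatcherapi import NewsCatcherApiClient

newscatcherapi = NewsCatcherApiClient(x_api_key='YOUR_API_KEY')  # placeholder key

# Two days of coverage, fetched one day at a time; each daily window is
# paginated internally by get_search_all_pages.
all_articles = newscatcherapi.get_search_all_articles(
    q='Bitcoin',
    lang='en',
    from_='2022/01/01',
    to_='2022/01/03',
    by='day',
    page_size=100,
    seconds_pause=1.0,
)

print(all_articles['status'], len(all_articles['articles']))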