├── tests ├── __init__.py └── test_newscatcherapi_client.py ├── requirements.txt ├── newscatcherapi ├── __init__.py ├── newscatcherapi_auth.py ├── newscatcherapi_exception.py ├── const.py ├── utils.py └── newscatcherapi_client.py ├── pyproject.toml ├── setup.py ├── LICENSE └── README.md /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | requests>=2.24.0 2 | dateparser==1.1.2 -------------------------------------------------------------------------------- /newscatcherapi/__init__.py: -------------------------------------------------------------------------------- 1 | from newscatcherapi.newscatcherapi_client import NewsCatcherApiClient -------------------------------------------------------------------------------- /newscatcherapi/newscatcherapi_auth.py: -------------------------------------------------------------------------------- 1 | from requests.auth import AuthBase 2 | 3 | 4 | class NewsCatcherApiAuth(AuthBase): 5 | # Provided by NewsCatcher: https://docs.newscatcherapi.com/api-docs/authentication 6 | def __init__(self, x_api_key): 7 | self.x_api_key = x_api_key 8 | 9 | def __call__(self, request): 10 | request.headers.update(get_auth_headers(self.x_api_key)) 11 | return request 12 | 13 | 14 | def get_auth_headers(x_api_key): 15 | return {"Content-Type": "Application/JSON", "x-api-key": x_api_key} 16 | -------------------------------------------------------------------------------- /newscatcherapi/newscatcherapi_exception.py: -------------------------------------------------------------------------------- 1 | class NewsCatcherApiException(Exception): 2 | """Represents an ``error`` response status value from NewsCatcher News API.""" 3 | 4 | def __init__(self, exception): 5 | self.exception = exception 6 | 7 | def get_exception(self): 8 | return self.exception 9 | 10 | def get_status(self): 11 | if self.exception["status"]: 12 | return self.exception["status"] 13 | 14 | def get_code(self): 15 | if self.exception["error_code"]: 16 | return self.exception["error_code"] 17 | 18 | def get_message(self): 19 | if self.exception["message"]: 20 | return self.exception["message"] 21 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "newscatcherapi" 3 | version = "0.7.3" 4 | description = "NewsCatcher News API V2 SDK for Python" 5 | authors = ["Maksym Sugonyaka ", 6 | "Artem Bugara "] 7 | readme = "README.md" 8 | 9 | homepage = "https://newscatcherapi.com/" 10 | license = "MIT" 11 | keywords = ["News", "RSS", "Scraping", "Data Mining", "News Extraction"] 12 | 13 | [tool.poetry.dependencies] 14 | python = ">=3.6.0" 15 | requests = ">=2.24.0" 16 | dateparser= ">=0.7.6" 17 | 18 | [tool.poetry.dev-dependencies] 19 | pytest = "^5.2" 20 | requests="^2.24.0" 21 | dateparser="^1.1.1" 22 | 23 | [build-system] 24 | requires = ["poetry-core>=1.0.0"] 25 | build-backend = "poetry.core.masonry.api" 26 | -------------------------------------------------------------------------------- /newscatcherapi/const.py: -------------------------------------------------------------------------------- 1 | """Constants and allowed parameter values specified in the NewsCatcher News API.""" 2 | 3 | LATEST_HEADLINES_URL = 
"/v2/latest_headlines" 4 | SEARCH_URL = "/v2/search" 5 | SOURCES_URL = "/v2/sources" 6 | 7 | #: The 2-letter ISO-639-1 code of the language you want to get articles for. 8 | allowed_languages = 'af,ar,bg,bn,ca,cs,cy,cn,da,de,el,en,es,et,fa,fi,fr,gu,he,hi,hr,hu,id,it,ja,kn,ko,lt,lv,mk,ml,mr,ne,nl,no,pa,pl,pt,ro,ru,sk,sl,so,sq,sv,sw,ta,te,th,tl,tr,tw,uk,ur,vi'.split(',') 9 | 10 | #: The topic you want to get articles for. 11 | allowed_topics = 'news,sport,tech,world,finance,politics,business,economics,entertainment,beauty,travel,music,food,science,gaming,energy'.split(',') 12 | 13 | # Date precisions 14 | allowed_precisions = 'timezone unknown,full,date'.split(',') 15 | 16 | # Search In 17 | allowed_search_ins = ['title', 'summary', 'title,summary'] 18 | 19 | #: The order to sort article results in. If not specified, the default is ``"relevancy"``. 20 | allowed_sorts = ['relevancy', 'date', 'rank'] 21 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # PyPI upload: 4 | # 5 | # $ python -m pip install --upgrade twine wheel 6 | # $ python setup.py sdist bdist_wheel --universal 7 | # $ twine upload dist/* 8 | # 9 | # Install in development: 10 | # 11 | # $ python3 -m pip install -e . 12 | 13 | from setuptools import setup, find_packages 14 | 15 | VERSION = "0.7.3" 16 | INSTALL_REQUIRES = ["requests>=2.24.0", "dateparser"] 17 | TESTS_REQUIRE = ["pytest"] 18 | 19 | if __name__ == "__main__": 20 | setup( 21 | name="newscatcherapi", 22 | version=VERSION, 23 | author="Maksym Sugonyaka", 24 | author_email="maksym@newscatcherapi.com", 25 | url="https://github.com/NewscatcherAPI/newscatcherapi-sdk-python", 26 | packages=find_packages(), 27 | install_requires=INSTALL_REQUIRES, 28 | tests_require=TESTS_REQUIRE, 29 | description="An official Python client for the NewsCatcher News API", 30 | download_url="", 31 | keywords=["newscatcherapi", "news"], 32 | ) 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 newscatcherapi.com 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /newscatcherapi/utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from newscatcherapi import const 3 | 4 | import sys 5 | 6 | def validate_language(language): 7 | if is_valid_list(language): 8 | for each_lang in language: 9 | if each_lang.strip().lower() not in const.allowed_languages: 10 | raise ValueError(f"{each_lang} - is an invalid language. Language should be one of this list => {str(const.allowed_languages)}") 11 | return ','.join([i.strip().lower() for i in language]) 12 | elif is_valid_string(language): 13 | language_clean = [i.strip().lower() for i in language.split(',')] 14 | for each_lang in language_clean: 15 | if each_lang not in const.allowed_languages: 16 | raise ValueError(f"{each_lang} - is an invalid language. Language should be one of this list => {str(const.allowed_languages)}") 17 | return ','.join(language_clean) 18 | else: 19 | raise TypeError("lang parameter should be of type str or list") 20 | 21 | 22 | def validate_countries(list_countries, name_parameter): 23 | if is_valid_list(list_countries): 24 | valid_countries = [i.strip().upper() for i in list_countries] 25 | return ','.join(valid_countries) 26 | elif is_valid_string(list_countries): 27 | valid_countries = [i.strip().upper() for i in list_countries.split(',')] 28 | return ','.join(valid_countries) 29 | else: 30 | raise TypeError(f"{name_parameter} parameter should be of type str or list") 31 | 32 | def validate_topic(topic): 33 | if is_valid_string(topic): 34 | if topic in const.allowed_topics: 35 | return topic 36 | else: 37 | raise ValueError( 38 | f"{topic} - is an unsupported topic. Topic should be one of this list => {str(const.allowed_topics)}") 39 | else: 40 | raise TypeError(f"topic parameter should be of type str") 41 | 42 | 43 | def validate_sources(list_sources, name_parameter): 44 | if is_valid_list(list_sources): 45 | valid_sources = [i.strip().lower() for i in list_sources] 46 | return ','.join(valid_sources) 47 | elif is_valid_string(list_sources): 48 | valid_sources = [i.strip().lower() for i in list_sources.split(',')] 49 | return ','.join(valid_sources) 50 | else: 51 | raise TypeError(f"{name_parameter} parameter should be of type str or list") 52 | 53 | def validate_when(when, name_parameter): 54 | if is_valid_string(when): 55 | if when[len(when)-1] in ['d', 'h']: 56 | return when 57 | else: 58 | raise TypeError(f"{name_parameter} parameter should be the next form: 30d or 24h ") 59 | else: 60 | raise TypeError(f"{name_parameter} parameter should be of type str") 61 | 62 | 63 | PY2 = sys.version_info[0] == 2 64 | PY3 = sys.version_info[0] == 3 65 | 66 | if PY3: 67 | 68 | def is_valid_string(var): 69 | return isinstance(var, str) 70 | 71 | def is_valid_num(var): 72 | return isinstance(var, (int, float)) 73 | 74 | def is_valid_list(var): 75 | return isinstance(var, list) 76 | 77 | def is_valid_boolean(var): 78 | return isinstance(var, bool) 79 | 80 | elif PY2: 81 | 82 | def is_valid_string(var): 83 | return isinstance(var, basestring) 84 | 85 | def is_valid_num(var): 86 | return isinstance(var, (int, float, long)) 87 | 88 | 89 | else: 90 | 91 | def is_valid_string(var): 92 | raise SystemError("unsupported version of python detected (supported versions: 2, 3)") 93 | 94 | 95 | # function for updating the response dict/object 96 | def update_final_res(results, payload): 97 | if 'articles' not in results.keys(): 98 | 
return True 99 | 100 | if 'articles' not in payload.keys(): 101 | payload['articles'] = results['articles'] 102 | else: 103 | payload['articles'].extend(results['articles']) 104 | 105 | payload['total_hits'] += results['total_hits'] 106 | payload['total_pages'] += results['total_pages'] 107 | payload['page'] += results['page'] 108 | return False 109 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # NewsCatcher News API V2 SDK for Python 2 | 3 | The official Python client library to manipulate [NewsCatcher News API V2](https://newscatcherapi.com/news-api) from your Python application. 4 | 5 | The documentation is identical to the API documentation: the same parameters and filters are available, 6 | and the response structure is the same. You can have a look at [docs.newscatcherapi.com](https://docs.newscatcherapi.com). 7 | 8 | ## Authentication 9 | 10 | Authentication is done via the `x_api_key` variable. 11 | 12 | Receive your API key by registering at [app.newscatcherapi.com](https://app.newscatcherapi.com). 13 | 14 | ## Installation 15 | ```pip install newscatcherapi``` 16 | 17 | ## Quick Start 18 | Import the installed package. 19 | 20 | `````from newscatcherapi import NewsCatcherApiClient````` 21 | 22 | Initialize the instance with the API key you received after registration. 23 | 24 | ````newscatcherapi = NewsCatcherApiClient(x_api_key='YOUR_API_KEY') ```` 25 | 26 | ## Endpoints 27 | An instance of `NewsCatcherApiClient` has three main methods that correspond to the three endpoints available in the NewsCatcher News API. 28 | 29 | ### Get News (/v2/search) 30 | The main method, which allows you to find news articles by keyword, date, language, country, etc. 31 | 32 | ``` 33 | all_articles = newscatcherapi.get_search(q='Elon Musk', 34 | lang='en', 35 | countries='CA', 36 | page_size=100) 37 | ``` 38 | 39 | ### Get News Extracting All Pages (/v2/search) 40 | It is the same method as *get_search*, but it extracts all pages for you, so you do not have to change the `page` param manually. 41 | 42 | For example: a given search matches 1000 articles. *get_search* makes one API call and returns up to 100 articles. 43 | *get_search_all_pages* will make 10 API calls and will return all 1000 articles. 44 | 45 | Two new parameters: 46 | - `max_page` - The last page number to extract. Use it when you want to limit the number of extracted pages. 47 | - `seconds_pause` - Number of seconds to wait before each call. This parameter helps you deal with the rate limit on your subscription plan. By default, it is set to 1 second. 48 | 49 | ``` 50 | all_articles = newscatcherapi.get_search_all_pages(q='Elon Musk', 51 | lang='en', 52 | countries='CA', 53 | page_size=100, 54 | max_page=10, 55 | seconds_pause=1.0 56 | ) 57 | ``` 58 | 
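The merged result is a regular Python dict in the same shape as a single */v2/search* response, with the articles from every extracted page gathered under the `articles` key. A minimal sketch of reading it (`total_hits` and `articles` are keys of the API response dict):

```
print(all_articles['total_hits'], 'articles matched the search')
print(len(all_articles['articles']), 'articles were extracted')
```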
59 | 60 | ### Get News Extracting All Articles (/v2/search) 61 | It is the same method as *get_search*, but it fetches all articles for you, so you do not have to change the `page`, `from_`, and `to_` params manually. 62 | 63 | For example: a given search matches more than 10000 articles. *get_search* makes one API call and returns up to 100 articles. 64 | *get_search_all_pages* will make 100 API calls and will return 10000 articles. The *get_search_all_articles* method will return all of them. 65 | 66 | 67 | One new parameter: 68 | - `by` - How to divide the time interval between `to_` and `from_` in order to extract all articles for the given search query. By default it is set to `week`. Accepted values: `month`, `week`, `day`, `hour`. 69 | 70 | ``` 71 | all_articles = newscatcherapi.get_search_all_articles(q='Elon Musk', 72 | lang='en', 73 | countries='CA', 74 | page_size=100, 75 | by='day' 76 | ) 77 | ``` 78 | 79 | ### Get Latest Headlines (/v2/latest_headlines) 80 | Get the latest headlines given any topic, country, sources, or language. 81 | 82 | ``` 83 | top_headlines = newscatcherapi.get_latest_headlines(lang='en', 84 | countries='us', 85 | topic='business') 86 | ``` 87 | 88 | ### Get Latest Headlines Extracting All Pages (/v2/latest_headlines) 89 | It is the same function as *get_latest_headlines*, but it extracts all pages for you, so you do not have to change the `page` param manually. 90 | 91 | For example: a given search matches 1000 articles. *get_latest_headlines* makes one API call and returns up to 100 articles. 92 | *get_latest_headlines_all_pages* will make 10 API calls and will return all 1000 articles. 93 | 94 | Two new parameters: 95 | - `max_page` - The last page number to extract. Use it when you want to limit the number of extracted pages. 96 | - `seconds_pause` - Number of seconds to wait before each call. This parameter helps you deal with the rate limit on your subscription plan. By default, it is set to 1 second. 97 | 98 | ``` 99 | top_headlines = newscatcherapi.get_latest_headlines_all_pages(lang='en', 100 | countries='us', 101 | topic='business', 102 | max_page=10, 103 | seconds_pause=1.0 104 | ) 105 | ``` 106 | 107 | ### Get Sources (/v2/sources) 108 | Returns a list of the top 100 supported news websites. Overall, we support over 60,000 websites. Using this method, you can find the top 100 for your specific language, country, and topic combination. 109 | 110 | ``` 111 | sources = newscatcherapi.get_sources(topic='business', 112 | lang='en', 113 | countries='US') 114 | ``` 115 | 116 | ### Every endpoint supports the _proxies_ parameter 117 | If you want to use proxies, you can pass this parameter to any of the endpoints. 118 | Here is an example of a valid `proxies` parameter and how to use it with one of the endpoints. 119 | 120 | ``` 121 | proxies = { 122 | 'http': 'http://proxy.example.com:8080', 123 | 'https': 'http://secureproxy.example.com:8090', 124 | } 125 | 126 | all_articles = newscatcherapi.get_search(q='Elon Musk', 127 | lang='en', 128 | countries='CA', 129 | page_size=100, 130 | proxies=proxies) 131 | ``` 132 | 133 | 
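### Catching errors with *NewsCatcherApiException*
When the API responds with a non-OK status, every method raises a `NewsCatcherApiException` that wraps the JSON error payload returned by the API (see `newscatcherapi_exception.py`). A minimal sketch of handling it:

```
from newscatcherapi.newscatcherapi_exception import NewsCatcherApiException

try:
    all_articles = newscatcherapi.get_search(q='Elon Musk', lang='en')
except NewsCatcherApiException as e:
    # The exception carries the parsed error body from the API
    print(e.get_status(), e.get_code(), e.get_message())
```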
134 | ### Use *from_* and *to_* instead of *from* and *to* like in NewsCatcher News API 135 | In Python, *from* is a reserved keyword, so it cannot be used as a parameter name (*to* is renamed as well, for consistency). If you try to use *from* as a keyword argument, you will get a syntax error: 136 | 137 | ```SyntaxError: invalid syntax``` 138 | 139 | So, here is an example of how to use the time variables *from_* and *to_* in the *get_search* method. 140 | 141 | ``` 142 | all_articles = newscatcherapi.get_search(q='Elon Musk', 143 | lang='en', 144 | countries='CA,US', 145 | from_='2021/08/20', 146 | to_='2021/08/31') 147 | ``` 148 | 149 | ## Feedback 150 | 151 | Feel free to contact us at maksym`[at]`newscatcherapi.com if you spot a bug or have any suggestions. 152 | -------------------------------------------------------------------------------- /tests/test_newscatcherapi_client.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | 4 | from newscatcherapi.newscatcherapi_client import NewsCatcherApiClient 5 | 6 | 7 | class NewsCatcherApiTest(unittest.TestCase): 8 | def setUp(self): 9 | key = os.environ.get("newscatcher_api_secret") 10 | self.api = NewsCatcherApiClient(key) 11 | 12 | def test_api_latest_headlines(self): 13 | # Raise TypeError if lang is not of type str 14 | lang = 1 15 | with self.assertRaises(TypeError): 16 | self.api.get_latest_headlines(lang=lang) 17 | 18 | # Raise ValueError if lang is not in list 19 | lang = 'aer' 20 | with self.assertRaises(ValueError): 21 | self.api.get_latest_headlines(lang=lang) 22 | 23 | # Raise TypeError if not_lang is not of type str 24 | not_lang = 1 25 | with self.assertRaises(TypeError): 26 | self.api.get_latest_headlines(not_lang=not_lang) 27 | 28 | # Raise ValueError if not_lang is not in list 29 | not_lang = 'aer' 30 | with self.assertRaises(ValueError): 31 | self.api.get_latest_headlines(not_lang=not_lang) 32 | 33 | # Raise TypeError if sources param is not of type str 34 | sources = 0 35 | with self.assertRaises(TypeError): 36 | self.api.get_latest_headlines(sources=sources) 37 | 38 | # Raise TypeError if countries param is not of type str 39 | countries = 0 40 | with self.assertRaises(TypeError): 41 | self.api.get_latest_headlines(countries=countries) 42 | 43 | # Raises TypeError if topic param is not of type str 44 | topic = 0 45 | with self.assertRaises(TypeError): 46 | self.api.get_latest_headlines(topic=topic) 47 | 48 | # Raises ValueError if topic param is invalid 49 | topic = "dogcoin" 50 | with self.assertRaises(ValueError): 51 | self.api.get_latest_headlines(topic=topic) 52 | 53 | # Raises TypeError if page_size param is not an int 54 | page_size = "1" 55 | with self.assertRaises(TypeError): 56 | self.api.get_latest_headlines(page_size=page_size) 57 | 58 | # Raises ValueError if page_size param is less than zero(0) or greater than 100 59 | page_size = -1 60 | with self.assertRaises(ValueError): 61 | self.api.get_latest_headlines(page_size=page_size) 62 | 63 | page_size = 1000 64 | with self.assertRaises(ValueError): 65 | self.api.get_latest_headlines(page_size=page_size) 66 | 67 | # Raises a TypeError if page param is not an int 68 | page = "1" 69 | with self.assertRaises(TypeError): 70 | self.api.get_latest_headlines(page=page) 71 | 72 | # Raises a ValueError if page param is less than zero(0) 73 | page = -1 74 | with self.assertRaises(ValueError): 75 | self.api.get_latest_headlines(page=page) 76 | 77 | def test_api_get_search(self): 78 | # Raise TypeError if q param is not of type str 79 | q = 0 80 | with self.assertRaises(TypeError): 81 | self.api.get_search(q=q) 82 | 83 | # Raise TypeError if lang is not of type str 84 | lang = 1 85 | with self.assertRaises(TypeError): 86 | self.api.get_search(lang=lang) 87 | 88 | # Raise ValueError if lang is not in list 89 | lang = 'aer' 90 | with self.assertRaises(ValueError): 91 | self.api.get_search(lang=lang) 92 | 93 | # Raise TypeError if not_lang is not of type str 94 | 
not_lang = 1 95 | with self.assertRaises(TypeError): 96 | self.api.get_search(not_lang=not_lang) 97 | 98 | # Raise ValueError if lang is not in list 99 | not_lang = 'aer' 100 | with self.assertRaises(ValueError): 101 | self.api.get_search(not_lang=not_lang) 102 | 103 | # Raise TypeError if sources param is not of type str 104 | sources = 0 105 | with self.assertRaises(TypeError): 106 | self.api.get_search(sources=sources) 107 | 108 | # Raise TypeError if country param is not of type str 109 | countries = 0 110 | with self.assertRaises(TypeError): 111 | self.api.get_search(countries=countries) 112 | 113 | # Raise TypeError if not_countries param is not of type str 114 | not_countries = 0 115 | with self.assertRaises(TypeError): 116 | self.api.get_search(not_countries=not_countries) 117 | 118 | # Raises TypeError if topic param is not of type str 119 | topic = 0 120 | with self.assertRaises(TypeError): 121 | self.api.get_search(topic=topic) 122 | 123 | # Raises ValueError if category param is invalid 124 | topic = "dogcoin" 125 | with self.assertRaises(ValueError): 126 | self.api.get_search(topic=topic) 127 | 128 | # Raises TypeError if page_size param is not an int 129 | page_size = "1" 130 | with self.assertRaises(TypeError): 131 | self.api.get_search(page_size=page_size) 132 | 133 | # Raises ValueError if page_size param is less than zero(0) or greater than 100 134 | page_size = -1 135 | with self.assertRaises(ValueError): 136 | self.api.get_search(page_size=page_size) 137 | 138 | page_size = 1000 139 | with self.assertRaises(ValueError): 140 | self.api.get_search(page_size=page_size) 141 | 142 | # Raises a TypeError is page param is not an int 143 | page = "1" 144 | with self.assertRaises(TypeError): 145 | self.api.get_search(page=page) 146 | 147 | # Raises a ValueError if page param is less than zero(0) 148 | page = -1 149 | with self.assertRaises(ValueError): 150 | self.api.get_search(page=page) 151 | 152 | # Raise TypeError is sort_by param is not of type str 153 | sort_by = 1 154 | with self.assertRaises(TypeError): 155 | self.api.get_search(sort_by=sort_by) 156 | 157 | # Raise ValueError if soft_by param is invalid 158 | sort_by = "sort" 159 | with self.assertRaises(ValueError): 160 | self.api.get_search(sort_by=sort_by) 161 | 162 | 163 | # Raise ValueError if soft_by param is invalid 164 | published_date_precision = "score" 165 | with self.assertRaises(ValueError): 166 | self.api.get_search(published_date_precision=published_date_precision) 167 | 168 | # Raise ValueError if soft_by param is invalid 169 | search_in = "published_date" 170 | with self.assertRaises(ValueError): 171 | self.api.get_search(search_in=search_in) 172 | 173 | 174 | # Raises a TypeError is from_rank param is not an int 175 | from_rank = "1" 176 | with self.assertRaises(TypeError): 177 | self.api.get_search(from_rank=from_rank) 178 | 179 | # Raises a TypeError is from_rank param is not an int 180 | to_rank = "1" 181 | with self.assertRaises(TypeError): 182 | self.api.get_search(to_rank=to_rank) 183 | 184 | def test_api_get_sources(self): 185 | # Raise TypeError if not_countries param is not of type str 186 | not_countries = 0 187 | with self.assertRaises(TypeError): 188 | self.api.get_search(not_countries=not_countries) 189 | 190 | # Raises TypeError if topic param is not of type str 191 | topic = 0 192 | with self.assertRaises(TypeError): 193 | self.api.get_search(topic=topic) 194 | 195 | # Raises ValueError if category param is invalid 196 | topic = "dogcoin" 197 | with self.assertRaises(ValueError): 198 | 
self.api.get_search(topic=topic) 199 | -------------------------------------------------------------------------------- /newscatcherapi/newscatcherapi_client.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | 3 | import requests 4 | import os 5 | import sys 6 | import time 7 | from datetime import date, datetime, timedelta 8 | from dateparser import parse as parse_date 9 | 10 | sys.path.append(os.getcwd()) 11 | 12 | 13 | from newscatcherapi import const, utils 14 | from newscatcherapi.newscatcherapi_auth import NewsCatcherApiAuth 15 | from newscatcherapi.newscatcherapi_exception import NewsCatcherApiException 16 | 17 | 18 | class NewsCatcherApiClient(object): 19 | """The core client object used to fetch data from NewsCatcher News API endpoints. 20 | 21 | :param api_key: Your API key, a length-32 UUID string provided for your NewsCatcher News API account. 22 | You must `register `_ for a NewsCatcher News API key. 23 | :type api_key: str 24 | 25 | :param session: An optional :class:`requests.Session` instance from which to execute requests. 26 | **Note**: If you provide a ``session`` instance, :class:`NewsCatcherApiClient` will *not* close the session 27 | for you. Remember to call ``session.close()``, or use the session as a context manager, to close 28 | the socket and free up resources. 29 | :type session: `requests.Session `_ or None 30 | """ 31 | 32 | def __init__(self, x_api_key, base_url='https://api.newscatcherapi.com', session=None): 33 | self.auth = NewsCatcherApiAuth(x_api_key=x_api_key) 34 | self.base_url = base_url 35 | if session is None: 36 | self.request_method = requests 37 | else: 38 | self.request_method = session 39 | 40 | def get_latest_headlines( 41 | self, 42 | lang=None, 43 | not_lang=None, 44 | countries=None, 45 | not_countries=None, 46 | topic=None, 47 | sources=None, 48 | not_sources=None, 49 | when=None, 50 | ranked_only=None, 51 | page_size=None, 52 | page=None, 53 | proxies=None 54 | ): 55 | """Call the `/latest_headlines` endpoint. 56 | 57 | Fetch live top and breaking headlines. 58 | 59 | Get the latest headlines given any topic, country, or language. Articles are sorted by the earliest 60 | date published first. 61 | 62 | :param lang: Specifies the languages of the search. For example: `en`. The only accepted format is [ISO 639-1 — 2](https://en.wikipedia.org/wiki/ISO_639-1) letter code. 63 | :type lang: list or str or None 64 | 65 | :param not_lang: Inverse to the `lang` parameter 66 | :type not_lang: list or str or None 67 | 68 | :param countries: Countries where the news publisher is located. **Important**: This parameter is not responsible for the countries mentioned in the news article. One or multiple countries can be used in the search. The only acceptable format is [ISO 3166-1 alpha-2](https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2) For example, `US,CA,MX` or just `US` 69 | :type countries: list or str or None 70 | 71 | :param not_countries: The inverse of the `countries` parameter. 72 | :type not_countries: list or str or None 73 | 74 | :param topic: Accepted values: `news`, `sport`, `tech`, `world`, `finance`, `politics`, `business`, `economics`, `entertainment`, `beauty`, `travel`, `music`, `food`, `science`, `gaming`, `energy`. The topic to which you want to restrict the articles of your choice. Not all news articles are assigned with a topic, therefore, we cannot guarantee that 100% of topics talking about technology will be assigned a tech label. 
75 | :type topic: str or None 76 | 77 | :param sources: One or more news resources to filter your search. It should be the normal form of the URL, For example: `nytimes.com,theguardian.com` 78 | :type sources: list or str or None 79 | 80 | :param not_sources: One or more sources to be excluded from the search. Comma-separated list. For example: `nytimes.com,cnn.com,wsj.com` 81 | :type not_sources: list or str or None 82 | 83 | :param when: The time period you want to get the latest headlines for. Accepted forms: 7d => Dailly Form (last 7 days time period), 30d (last 30 days time period) | 1h => Hourly Form (last hour), 24h (last 24 hours) 84 | :type when: str or None 85 | 86 | :param ranked_only: Default: `True` Limit the search only for the sources which are in the top 1 million online websites. Unranked sources are assigned a rank that equals `999999` 87 | :type ranked_only: bool or None 88 | 89 | :param page_size: `[1:100]` How many articles to return per page. 90 | :type page_size: int or None 91 | 92 | :param page: The number of the page. Use it to scroll through the results. This parameter is used to paginate: scroll through results because one API response cannot return more than 100 articles. 93 | :type page: int or None 94 | 95 | :param proxies: Dict of proxies if needed 96 | :type proxies: dict or None 97 | 98 | :return: JSON response as nested Python dictionary. 99 | :rtype: dict 100 | :raises NewsCatcherApiException: If the ``"status"`` value of the response is ``"error"`` rather than ``"ok"``. 101 | """ 102 | 103 | payload = {} 104 | 105 | 106 | # Language 107 | if lang is not None: 108 | payload["lang"] = utils.validate_language(lang) 109 | 110 | if not_lang is not None: 111 | payload["not_lang"] = utils.validate_language(not_lang) 112 | 113 | # Countries 114 | if countries is not None: 115 | payload["countries"] = utils.validate_countries(countries, 'countries') 116 | 117 | if not_countries is not None: 118 | payload["not_countries"] = utils.validate_countries(not_countries, 'not_countries') 119 | 120 | # Topic 121 | if topic is not None: 122 | payload['topic'] = utils.validate_topic(topic) 123 | 124 | # Sources 125 | if sources is not None: 126 | payload["sources"] = utils.validate_sources(sources, 'sources') 127 | 128 | if not_sources is not None: 129 | payload["not_sources"] = utils.validate_sources(not_sources, 'not_sources') 130 | 131 | # When 132 | if when is not None: 133 | payload["when"] = utils.validate_when(when, 'when') 134 | 135 | # Ranks 136 | if ranked_only is not None: 137 | if utils.is_valid_boolean(ranked_only): 138 | payload['ranked_only'] = ranked_only 139 | else: 140 | raise TypeError("ranked_only parameter should be of type boolean") 141 | 142 | # Page and page sizes 143 | # Page Size 144 | if page_size is not None: 145 | if type(page_size) == int: 146 | payload["page_size"] = page_size 147 | else: 148 | raise TypeError("page_size param should be an int") 149 | 150 | # Page 151 | if page is not None: 152 | if type(page) == int: 153 | if page > 0: 154 | payload["page"] = page 155 | else: 156 | raise ValueError("page param should be an int greater than 0") 157 | else: 158 | raise TypeError("page param should be an int") 159 | 160 | # Send Request 161 | r = self.request_method.get(self.base_url + const.LATEST_HEADLINES_URL, auth=self.auth, timeout=60, params=payload, proxies=proxies) 162 | 163 | # Check Status of Request 164 | if r.status_code != requests.codes.ok: 165 | raise NewsCatcherApiException(r.json()) 166 | 167 | return r.json() 168 | 169 | def 
get_search( 170 | self, 171 | q=None, 172 | lang=None, 173 | not_lang=None, 174 | from_=None, 175 | to_=None, 176 | published_date_precision=None, 177 | search_in=None, 178 | countries=None, 179 | not_countries=None, 180 | topic=None, 181 | sources=None, 182 | not_sources=None, 183 | ranked_only=None, 184 | from_rank=None, 185 | to_rank=None, 186 | sort_by=None, 187 | page_size=None, 188 | page=None, 189 | proxies=None 190 | ): 191 | """Call the `/search` endpoint. 192 | 193 | Main endpoint that allows you to find news article by keyword, date, language, country, etc. 194 | 195 | :param q: Keyword/keywords you're searching for. This is the most important part of your query. Please, refer to the **Advanced Query Parameter** section below for more examples and explanations. (required) 196 | :type q: str or None 197 | 198 | :param lang: Specifies the languages of the search. For example: `en`. The only accepted format is [ISO 639-1 — 2](https://en.wikipedia.org/wiki/ISO_639-1) letter code. 199 | :type lang: list or str or None 200 | 201 | :param not_lang: Inverse to the `lang` parameter 202 | :type not_lang: list or str or None 203 | 204 | :param from_: `YYYY/mm/dd` From which point in time to start the search. The default timezone is UTC. Defaults to the past week. 205 | :type from_: str or None 206 | 207 | :param to_: `YYYY/mm/dd` Until which point in time to search for. The default timezone is UTC. 208 | :type to_: str or None 209 | 210 | :param published_date_precision: There are 3 types of date precision we define: `full` — day and time of an article is correctly identified with the appropriate timezone `timezone unknown` — day and time of an article is correctly identified without timezone `date` — only the day is identified without an exact time 211 | :type published_date_precision: str or None 212 | 213 | :param search_in: By default, we search what you specified in the `q` parameter in both `title` and `summary` of the article. However, you can limit this to either `title` or `summary` 214 | :type search_in: str or None 215 | 216 | :param countries: Countries where the news publisher is located. **Important**: This parameter is not responsible for the countries mentioned in the news article. One or multiple countries can be used in the search. The only acceptable format is [ISO 3166-1 alpha-2](https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2) For example, `US,CA,MX` or just `US` 217 | :type countries: list or str or None 218 | 219 | :param not_countries: The inverse of the `countries` parameter. 220 | :type not_countries: list or str or None 221 | 222 | :param topic: Accepted values: `news`, `sport`, `tech`, `world`, `finance`, `politics`, `business`, `economics`, `entertainment`, `beauty`, `travel`, `music`, `food`, `science`, `gaming`, `energy`. The topic to which you want to restrict the articles of your choice. Not all news articles are assigned with a topic, therefore, we cannot guarantee that 100% of topics talking about technology will be assigned a tech label. 223 | :type topic: str or None 224 | 225 | :param sources: One or more news resources to filter your search. It should be the normal form of the URL, For example: `nytimes.com,theguardian.com` 226 | :type sources: list or str or None 227 | 228 | :param not_sources: One or more sources to be excluded from the search. Comma-separated list. 
For example: `nytimes.com,cnn.com,wsj.com` 229 | :type not_sources: list or str or None 230 | 231 | :param ranked_only: Default: `True` Limit the search only for the sources which are in the top 1 million online websites. Unranked sources are assigned a rank that equals `999999` 232 | :type ranked_only: bool or None 233 | 234 | :param from_rank: `[0:999999]` The lowest boundary of the rank of a news website to filter by. Important: lower rank means that a source is more popular 235 | :type from_rank: int or None 236 | 237 | :param to_rank: `[0:999999]` The upper boundary of the rank of a news website to filter by. 238 | :type to_rank: int or None 239 | 240 | :param sort_by: `relevancy` (default value) — the most relevant results first `date` — the most recently published results first `rank` — the results from the highest-ranked sources first 241 | :type sort_by: str or None 242 | 243 | :param page_size: `[1:100]` How many articles to return per page. 244 | :type page_size: int or None 245 | 246 | :param page: The number of the page. Use it to scroll through the results. This parameter is used to paginate: scroll through results because one API response cannot return more than 100 articles. 247 | :type page: int or None 248 | 249 | :param proxies: Dict of proxies if needed 250 | :type proxies: dict or None 251 | 252 | 253 | :return: JSON response as nested Python dictionary. 254 | :rtype: dict 255 | :raises NewsCatcherApiException: If the ``"status"`` value of the response is ``"error"`` rather than ``"ok"``. 256 | """ 257 | 258 | payload = {} 259 | 260 | # Q 261 | if q is not None: 262 | if utils.is_valid_string(q): 263 | payload["q"] = q 264 | else: 265 | raise TypeError("q parameter should be of type str") 266 | 267 | # Language 268 | if lang is not None: 269 | payload["lang"] = utils.validate_language(lang) 270 | 271 | if not_lang is not None: 272 | payload["not_lang"] = utils.validate_language(not_lang) 273 | 274 | # Time variables 275 | if from_ is not None: 276 | if utils.is_valid_string(from_): 277 | payload["from"] = from_ 278 | else: 279 | raise TypeError("from_ parameter should be of type str") 280 | 281 | if to_ is not None: 282 | if utils.is_valid_string(to_): 283 | payload["to"] = to_ 284 | else: 285 | raise TypeError("to_ parameter should be of type str") 286 | 287 | if published_date_precision is not None: 288 | if utils.is_valid_string(published_date_precision): 289 | if published_date_precision in const.allowed_precisions: 290 | payload["published_date_precision"] = published_date_precision 291 | else: 292 | raise ValueError(f'{published_date_precision} is not a valid date precision. ' 293 | f'It should be one of the list: {str(const.allowed_precisions)}') 294 | else: 295 | raise TypeError("published_date_precision parameter should be of type str") 296 | 297 | # Search in 298 | if search_in is not None: 299 | if utils.is_valid_string(search_in): 300 | if search_in in const.allowed_search_ins: 301 | payload["search_in"] = search_in 302 | else: 303 | raise ValueError(f'{search_in} is not a valid place to search for keywords. 
' 304 | f'It should be one of the list: {str(const.allowed_search_ins)}') 305 | else: 306 | raise TypeError("search_in parameter should be of type str") 307 | 308 | # Countries 309 | if countries is not None: 310 | payload["countries"] = utils.validate_countries(countries, 'countries') 311 | 312 | if not_countries is not None: 313 | payload["not_countries"] = utils.validate_countries(not_countries, 'not_countries') 314 | 315 | # Topic 316 | if topic is not None: 317 | payload['topic'] = utils.validate_topic(topic) 318 | 319 | # Sources 320 | if sources is not None: 321 | payload["sources"] = utils.validate_sources(sources, 'sources') 322 | 323 | if not_sources is not None: 324 | payload["not_sources"] = utils.validate_sources(not_sources, 'not_sources') 325 | 326 | 327 | # Ranks 328 | if ranked_only is not None: 329 | if utils.is_valid_boolean(ranked_only): 330 | payload['ranked_only'] = ranked_only 331 | else: 332 | raise TypeError("ranked_only parameter should be of type boolean") 333 | 334 | if from_rank is not None: 335 | if utils.is_valid_num(from_rank): 336 | payload['from_rank'] = from_rank 337 | else: 338 | raise TypeError("from_rank parameter should be of type int") 339 | 340 | if to_rank is not None: 341 | if utils.is_valid_num(to_rank): 342 | payload['to_rank'] = to_rank 343 | else: 344 | raise TypeError("to_rank parameter should be of type int") 345 | 346 | # Sort by 347 | if sort_by is not None: 348 | if utils.is_valid_string(sort_by): 349 | if sort_by in const.allowed_sorts: 350 | payload["sort_by"] = sort_by 351 | else: 352 | raise ValueError(f'{sort_by} is not a valid sort by type. ' 353 | f'It should be one of the list: {str(const.allowed_sorts)}') 354 | else: 355 | raise TypeError("sort_by parameter should be of type str") 356 | 357 | # Page and page sizes 358 | # Page Size 359 | if page_size is not None: 360 | if type(page_size) == int: 361 | payload["page_size"] = page_size 362 | else: 363 | raise TypeError("page_size param should be an int") 364 | 365 | # Page 366 | if page is not None: 367 | if type(page) == int: 368 | if page > 0: 369 | payload["page"] = page 370 | else: 371 | raise ValueError("page param should be an int greater than 0") 372 | else: 373 | raise TypeError("page param should be an int") 374 | 375 | # Send Request 376 | r = self.request_method.get(self.base_url + const.SEARCH_URL, auth=self.auth, timeout=60, params=payload, proxies=proxies) 377 | 378 | # Check Status of Request 379 | if r.status_code != requests.codes.ok: 380 | raise NewsCatcherApiException(r.json()) 381 | 382 | return r.json() 383 | 384 | def get_sources(self, 385 | lang=None, 386 | countries=None, 387 | topic=None, 388 | proxies=None): 389 | """Call the `/sources` endpoint. 390 | 391 | Returns a list of the top 100 supported news websites. Overall, we support over 60,000 websites. Using this endpoint, you may find the top 100 for your specific language, country, topic combination. 392 | 393 | :param lang: Specifies the languages of the search. For example: `en`. The only accepted format is [ISO 639-1 — 2](https://en.wikipedia.org/wiki/ISO_639-1) letter code. 394 | :type lang: list or str or None 395 | 396 | :param countries: Countries where the news publisher is located. **Important**: This parameter is not responsible for the countries mentioned in the news article. One or multiple countries can be used in the search. 
The only acceptable format is [ISO 3166-1 alpha-2](https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2) For example, `US,CA,MX` or just `US` 397 | :type countries: list or str or None 398 | 399 | :param topic: Accepted values: `news`, `sport`, `tech`, `world`, `finance`, `politics`, `business`, `economics`, `entertainment`, `beauty`, `travel`, `music`, `food`, `science`, `gaming`, `energy`. The topic to which you want to restrict the articles of your choice. Not all news articles are assigned with a topic, therefore, we cannot guarantee that 100% of topics talking about technology will be assigned a tech label. 400 | :type topic: str or None 401 | 402 | :param proxies: Dict of proxies if needed 403 | :type proxies: dict or None 404 | 405 | :return: JSON response as nested Python dictionary. 406 | :rtype: dict 407 | :raises NewsCatcherApiException: If the ``"status"`` value of the response is ``"error"`` rather than ``"ok"``. 408 | 409 | """ 410 | 411 | payload = {} 412 | 413 | # Language 414 | if lang is not None: 415 | payload["lang"] = utils.validate_language(lang) 416 | 417 | # Countries 418 | if countries is not None: 419 | payload["countries"] = utils.validate_countries(countries, 'countries') 420 | 421 | # Topic 422 | if topic is not None: 423 | payload['topic'] = utils.validate_topic(topic) 424 | 425 | # Send Request 426 | r = self.request_method.get(self.base_url + const.SOURCES_URL, auth=self.auth, timeout=60, params=payload, proxies=proxies) 427 | 428 | # Check Status of Request 429 | if r.status_code != requests.codes.ok: 430 | raise NewsCatcherApiException(r.json()) 431 | 432 | return r.json() 433 | 434 | def get_latest_headlines_all_pages( 435 | self, 436 | lang=None, 437 | not_lang=None, 438 | countries=None, 439 | not_countries=None, 440 | topic=None, 441 | sources=None, 442 | not_sources=None, 443 | when=None, 444 | ranked_only=None, 445 | page_size=100, 446 | page=1, 447 | max_page=None, 448 | seconds_pause=1.0, 449 | proxies=None 450 | ): 451 | 452 | """Call the `/latest_headlines` endpoint the number of time sufficient to get all latest articles for a given search. 453 | 454 | Fetch live top and breaking headlines. 455 | 456 | Get the latest headlines given any topic, country, or language. Articles are sorted by the earliest 457 | date published first. All found articles will be extracted. 458 | 459 | :param lang: Specifies the languages of the search. For example: `en`. The only accepted format is [ISO 639-1 — 2](https://en.wikipedia.org/wiki/ISO_639-1) letter code. 460 | :type lang: list or str or None 461 | 462 | :param not_lang: Inverse to the `lang` parameter 463 | :type not_lang: list or str or None 464 | 465 | :param countries: Countries where the news publisher is located. **Important**: This parameter is not responsible for the countries mentioned in the news article. One or multiple countries can be used in the search. The only acceptable format is [ISO 3166-1 alpha-2](https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2) For example, `US,CA,MX` or just `US` 466 | :type countries: list or str or None 467 | 468 | :param not_countries: The inverse of the `countries` parameter. 469 | :type not_countries: list or str or None 470 | 471 | :param topic: Accepted values: `news`, `sport`, `tech`, `world`, `finance`, `politics`, `business`, `economics`, `entertainment`, `beauty`, `travel`, `music`, `food`, `science`, `gaming`, `energy`. The topic to which you want to restrict the articles of your choice. 
Not all news articles are assigned with a topic, therefore, we cannot guarantee that 100% of topics talking about technology will be assigned a tech label. 472 | :type topic: str or None 473 | 474 | :param sources: One or more news resources to filter your search. It should be the normal form of the URL, For example: `nytimes.com,theguardian.com` 475 | :type sources: list or str or None 476 | 477 | :param not_sources: One or more sources to be excluded from the search. Comma-separated list. For example: `nytimes.com,cnn.com,wsj.com` 478 | :type not_sources: list or str or None 479 | 480 | :param when: The time period you want to get the latest headlines for. Accepted forms: 7d => Dailly Form (last 7 days time period), 30d (last 30 days time period) | 1h => Hourly Form (last hour), 24h (last 24 hours) 481 | :type topic: str or None 482 | 483 | :param ranked_only: Default: `True` Limit the search only for the sources which are in the top 1 million online websites. Unranked sources are assigned a rank that equals `999999` 484 | :type ranked_only: bool or None 485 | 486 | :param page_size: `[1:100]` How many articles to return per page. 487 | :type page_size: int 488 | 489 | :param page: The number of the page. Use it to scroll through the results. This parameter is used to paginate: scroll through results because one API response cannot return more than 100 articles. 490 | :type page: int 491 | 492 | :param max_page: The last page number to extract. Use it to manage number of API calls and articles you are going to extract. For example, if you make a broad search with page_size=100 you will extract up to 10 000 articles and make 100 calls to do so. 493 | :type max_page: int or None 494 | 495 | :param seconds_pause: The number of seconds delay between each API call. For your subscription, you can have a rate limit on number of calls per second. 496 | :type seconds_pause: float 497 | 498 | :param proxies: Dict of proxies if needed 499 | :type proxies: dict or None 500 | 501 | :return: JSON response as nested Python dictionary. 502 | :rtype: dict 503 | :raises NewsCatcherApiException: If the ``"status"`` value of the response is ``"error"`` rather than ``"ok"``. 
504 | """ 505 | nb_pages = None 506 | if max_page is not None: 507 | if type(max_page) == int: 508 | if max_page >= page: 509 | nb_pages = max_page 510 | else: 511 | raise ValueError("max_page param should be greater than page param") 512 | else: 513 | raise TypeError("max_page param should be an int") 514 | 515 | all_articles = [] 516 | print(f'{str(page)} page is going to be extracted') 517 | first_result = self.get_latest_headlines( 518 | lang=lang, 519 | not_lang=not_lang, 520 | countries=countries, 521 | not_countries=not_countries, 522 | topic=topic, 523 | sources=sources, 524 | not_sources=not_sources, 525 | when=when, 526 | ranked_only=ranked_only, 527 | page_size=page_size, 528 | page=page, 529 | proxies=proxies 530 | ) 531 | 532 | time.sleep(seconds_pause) 533 | 534 | if 'articles' not in first_result.keys(): 535 | return first_result 536 | 537 | all_articles.extend(first_result['articles']) 538 | 539 | print(f'Total number of found articles => {first_result["total_hits"]}.\n' 540 | f'Total number of pages {first_result["total_pages"]}.') 541 | 542 | current_page = page 543 | 544 | if not nb_pages or (max_page and max_page > first_result["total_pages"]): 545 | nb_pages = first_result["total_pages"] 546 | 547 | while current_page < nb_pages: 548 | 549 | current_page += 1 550 | 551 | print(f'{str(current_page)}/{str(nb_pages)} page is going to be extracted') 552 | 553 | try: 554 | one_call_results = self.get_latest_headlines( 555 | lang=lang, 556 | not_lang=not_lang, 557 | countries=countries, 558 | not_countries=not_countries, 559 | topic=topic, 560 | sources=sources, 561 | not_sources=not_sources, 562 | when=when, 563 | ranked_only=ranked_only, 564 | page_size=page_size, 565 | page=current_page, 566 | proxies=proxies 567 | ) 568 | all_articles.extend(one_call_results['articles']) 569 | except NewsCatcherApiException as e: 570 | print(f'{current_page} has not been extracted due to an error') 571 | print(str(e)) 572 | pass 573 | 574 | time.sleep(seconds_pause) 575 | 576 | 577 | final_results = first_result 578 | final_results['articles'] = all_articles 579 | 580 | return final_results 581 | 582 | def get_search_all_pages( 583 | self, 584 | q=None, 585 | lang=None, 586 | not_lang=None, 587 | from_=None, 588 | to_=None, 589 | published_date_precision=None, 590 | search_in=None, 591 | countries=None, 592 | not_countries=None, 593 | topic=None, 594 | sources=None, 595 | not_sources=None, 596 | ranked_only=None, 597 | from_rank=None, 598 | to_rank=None, 599 | sort_by=None, 600 | page_size=100, 601 | page=1, 602 | max_page=None, 603 | seconds_pause=1.0, 604 | proxies=None 605 | ): 606 | """Call the `/search` endpoint the number of time sufficient to get all latest articles for a given search. 607 | 608 | Main endpoint that allows you to find news article by keyword, date, language, country, etc. 609 | 610 | :param q: Keyword/keywords you're searching for. This is the most important part of your query. Please, refer to the **Advanced Query Parameter** section below for more examples and explanations. (required) 611 | :type q: str or None 612 | 613 | :param lang: Specifies the languages of the search. For example: `en`. The only accepted format is [ISO 639-1 — 2](https://en.wikipedia.org/wiki/ISO_639-1) letter code. 614 | :type lang: list or str or None 615 | 616 | :param not_lang: Inverse to the `lang` parameter 617 | :type not_lang: list or str or None 618 | 619 | :param from_: `YYYY/mm/dd` From which point in time to start the search. The default timezone is UTC. 
Defaults to the past week. 620 | :type from_: str or None 621 | 622 | :param to_: `YYYY/mm/dd` Until which point in time to search for. The default timezone is UTC. 623 | :type to_: str or None 624 | 625 | :param published_date_precision: There are 3 types of date precision we define: `full` — day and time of an article is correctly identified with the appropriate timezone `timezone unknown` — day and time of an article is correctly identified without timezone `date` — only the day is identified without an exact time 626 | :type published_date_precision: str or None 627 | 628 | :param search_in: By default, we search what you specified in the `q` parameter in both `title` and `summary` of the article. However, you can limit this to either `title` or `summary` 629 | :type search_in: str or None 630 | 631 | :param countries: Countries where the news publisher is located. **Important**: This parameter is not responsible for the countries mentioned in the news article. One or multiple countries can be used in the search. The only acceptable format is [ISO 3166-1 alpha-2](https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2) For example, `US,CA,MX` or just `US` 632 | :type countries: list or str or None 633 | 634 | :param not_countries: The inverse of the `countries` parameter. 635 | :type not_countries: list or str or None 636 | 637 | :param topic: Accepted values: `news`, `sport`, `tech`, `world`, `finance`, `politics`, `business`, `economics`, `entertainment`, `beauty`, `travel`, `music`, `food`, `science`, `gaming`, `energy`. The topic to which you want to restrict the articles of your choice. Not all news articles are assigned with a topic, therefore, we cannot guarantee that 100% of topics talking about technology will be assigned a tech label. 638 | :type topic: str or None 639 | 640 | :param sources: One or more news resources to filter your search. It should be the normal form of the URL, For example: `nytimes.com,theguardian.com` 641 | :type sources: list or str or None 642 | 643 | :param not_sources: One or more sources to be excluded from the search. Comma-separated list. For example: `nytimes.com,cnn.com,wsj.com` 644 | :type not_sources: list or str or None 645 | 646 | :param ranked_only: Default: `True` Limit the search only for the sources which are in the top 1 million online websites. Unranked sources are assigned a rank that equals `999999` 647 | :type ranked_only: bool or None 648 | 649 | :param from_rank: `[0:999999]` The lowest boundary of the rank of a news website to filter by. Important: lower rank means that a source is more popular 650 | :type from_rank: int or None 651 | 652 | :param to_rank: `[0:999999]` The upper boundary of the rank of a news website to filter by. 653 | :type to_rank: int or None 654 | 655 | :param sort_by: `relevancy` (default value) — the most relevant results first `date` — the most recently published results first `rank` — the results from the highest-ranked sources first 656 | :type sort_by: str or None 657 | 658 | :param page_size: `[1:100]` How many articles to return per page. 659 | :type page_size: int or None 660 | 661 | :param page: The number of the page. Use it to scroll through the results. This parameter is used to paginate: scroll through results because one API response cannot return more than 100 articles. 662 | :type page: int or None 663 | 664 | :param max_page: The last page number to extract. Use it to manage number of API calls and articles you are going to extract. 
For example, if you make a broad search with page_size=100 you will extract up to 10 000 articles and make 100 calls to do so. 665 | :type max_page: int or None 666 | 667 | :param seconds_pause: The number of seconds delay between each API call. For your subscription, you can have a rate limit on number of calls per second. 668 | :type seconds_pause: float 669 | 670 | :param proxies: Dict of proxies if needed 671 | :type proxies: dict or None 672 | 673 | :return: JSON response as nested Python dictionary. 674 | :rtype: dict 675 | :raises NewsCatcherApiException: If the ``"status"`` value of the response is ``"error"`` rather than ``"ok"``. 676 | """ 677 | 678 | nb_pages = None 679 | if max_page is not None: 680 | if type(max_page) == int: 681 | if max_page >= page: 682 | nb_pages = max_page 683 | else: 684 | raise ValueError("max_page param should be greater than page param") 685 | else: 686 | raise TypeError("max_page param should be an int") 687 | 688 | all_articles = [] 689 | print(f'{str(page)} page is going to be extracted') 690 | first_result = self.get_search( 691 | q=q, 692 | lang=lang, 693 | not_lang=not_lang, 694 | from_=from_, 695 | to_=to_, 696 | published_date_precision=published_date_precision, 697 | search_in=search_in, 698 | countries=countries, 699 | not_countries=not_countries, 700 | topic=topic, 701 | sources=sources, 702 | not_sources=not_sources, 703 | ranked_only=ranked_only, 704 | from_rank=from_rank, 705 | to_rank=to_rank, 706 | sort_by=sort_by, 707 | page_size=page_size, 708 | page=page, 709 | proxies=proxies 710 | ) 711 | 712 | time.sleep(seconds_pause) 713 | 714 | if 'articles' not in first_result.keys(): 715 | return first_result 716 | 717 | all_articles.extend(first_result['articles']) 718 | 719 | print(f'Total number of found articles => {first_result["total_hits"]}.\n' 720 | f'Total number of pages {first_result["total_pages"]}.') 721 | 722 | current_page = page 723 | 724 | if not nb_pages or (max_page and max_page > first_result["total_pages"]): 725 | nb_pages = first_result["total_pages"] 726 | 727 | while current_page < nb_pages: 728 | 729 | current_page += 1 730 | 731 | print(f'{str(current_page)}/{str(nb_pages)} page is going to be extracted') 732 | 733 | try: 734 | one_call_results = self.get_search( 735 | q=q, 736 | lang=lang, 737 | not_lang=not_lang, 738 | from_=from_, 739 | to_=to_, 740 | published_date_precision=published_date_precision, 741 | search_in=search_in, 742 | countries=countries, 743 | not_countries=not_countries, 744 | topic=topic, 745 | sources=sources, 746 | not_sources=not_sources, 747 | ranked_only=ranked_only, 748 | from_rank=from_rank, 749 | to_rank=to_rank, 750 | sort_by=sort_by, 751 | page_size=page_size, 752 | page=current_page, 753 | proxies=proxies 754 | ) 755 | all_articles.extend(one_call_results['articles']) 756 | except NewsCatcherApiException as e: 757 | print(f'{current_page} has not been extracted due to an error') 758 | print(str(e)) 759 | pass 760 | 761 | time.sleep(seconds_pause) 762 | 763 | final_results = first_result 764 | final_results['articles'] = all_articles 765 | 766 | return final_results 767 | 768 | def get_search_all_articles( 769 | self, 770 | q=None, 771 | lang=None, 772 | not_lang=None, 773 | from_=None, 774 | to_=None, 775 | published_date_precision=None, 776 | search_in=None, 777 | countries=None, 778 | not_countries=None, 779 | topic=None, 780 | by='week', 781 | sources=None, 782 | not_sources=None, 783 | ranked_only=None, 784 | from_rank=None, 785 | to_rank=None, 786 | sort_by=None, 787 | 
788 |             page=1,
789 |             max_page=None,
790 |             seconds_pause=1.0,
791 |             proxies=None):
792 |
793 |         """Call the `/search` endpoint as many times as needed to get all the articles matching a given search.
794 |
795 |         Main endpoint that allows you to find news articles by keyword, date, language, country, etc.
796 |
797 |         :param q: Keyword/keywords you're searching for. This is the most important part of your query. Please refer to the **Advanced Query Parameter** section below for more examples and explanations. (required)
798 |         :type q: str or None
799 |
800 |         :param lang: Specifies the languages of the search. For example: `en`. The only accepted format is the 2-letter [ISO 639-1](https://en.wikipedia.org/wiki/ISO_639-1) code.
801 |         :type lang: list or str or None
802 |
803 |         :param not_lang: The inverse of the `lang` parameter.
804 |         :type not_lang: list or str or None
805 |
806 |         :param from_: `YYYY/mm/dd` From which point in time to start the search. The default timezone is UTC. Defaults to the past week.
807 |         :type from_: str or None
808 |
809 |         :param to_: `YYYY/mm/dd` Until which point in time to search for. The default timezone is UTC.
810 |         :type to_: str or None
811 |
812 |         :param published_date_precision: There are 3 types of date precision we define: `full` — day and time of an article is correctly identified with the appropriate timezone; `timezone unknown` — day and time of an article is correctly identified, without the timezone; `date` — only the day is identified, without an exact time.
813 |         :type published_date_precision: str or None
814 |
815 |         :param search_in: By default, we search what you specified in the `q` parameter in both the `title` and `summary` of the article. However, you can limit this to either `title` or `summary`.
816 |         :type search_in: str or None
817 |
818 |         :param countries: Countries where the news publisher is located. **Important**: This parameter is not responsible for the countries mentioned in the news article. One or multiple countries can be used in the search. The only acceptable format is [ISO 3166-1 alpha-2](https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2). For example, `US,CA,MX` or just `US`.
819 |         :type countries: list or str or None
820 |
821 |         :param not_countries: The inverse of the `countries` parameter.
822 |         :type not_countries: list or str or None
823 |
824 |         :param topic: Accepted values: `news`, `sport`, `tech`, `world`, `finance`, `politics`, `business`, `economics`, `entertainment`, `beauty`, `travel`, `music`, `food`, `science`, `gaming`, `energy`. The topic to which you want to restrict the articles of your choice. Not all news articles are assigned a topic, so we cannot guarantee that 100% of articles about technology will carry the `tech` label.
825 |         :type topic: str or None
826 |
827 |         :param sources: One or more news sources to filter your search by. Use the normal form of the URL. For example: `nytimes.com,theguardian.com`
828 |         :type sources: list or str or None
829 |
830 |         :param not_sources: One or more sources to be excluded from the search. Comma-separated list. For example: `nytimes.com,cnn.com,wsj.com`
831 |         :type not_sources: list or str or None
832 |
833 |         :param ranked_only: Default: `True`. Limit the search to sources ranked in the top 1 million online
834 |             websites. Unranked sources are assigned a rank that equals `999999`.
835 |         :type ranked_only: bool or None
836 |
837 |         :param from_rank: `[0:999999]` The lowest boundary of the rank of a news website to filter by. Important: a lower rank means that a source is more popular.
838 |         :type from_rank: int or None
839 |
840 |         :param to_rank: `[0:999999]` The upper boundary of the rank of a news website to filter by.
841 |         :type to_rank: int or None
842 |
843 |         :param sort_by: `relevancy` (default value) — the most relevant results first; `date` — the most recently published results first; `rank` — the results from the highest-ranked sources first.
844 |         :type sort_by: str or None
845 |
846 |         :param page_size: `[1:100]` How many articles to return per page.
847 |         :type page_size: int or None
848 |
849 |         :param page: The number of the page. Use it to paginate through the results, because one API response cannot return more than 100 articles.
850 |         :type page: int or None
851 |
852 |         :param max_page: The last page number to extract. Use it to manage the number of API calls and the number of articles you are going to extract. For example, if you make a broad search with `page_size=100`, you can extract up to 10 000 articles and make 100 calls to do so.
853 |         :type max_page: int or None
854 |
855 |         :param seconds_pause: The number of seconds to wait between API calls. Depending on your subscription, you may have a rate limit on the number of calls per second.
856 |         :type seconds_pause: float
857 |
858 |         :param proxies: Dict of proxies, if needed.
859 |         :type proxies: dict or None
860 |
861 |         :param by: Accepted values: `month`, `week`, `day`, `hour`. Default: `week`. How to divide the time interval between `from_` and `to_`.
862 |         :type by: str
863 |
864 |         :return: JSON response as nested Python dictionary.
865 |         :rtype: dict
866 |         :raises NewsCatcherApiException: If the ``"status"`` value of the response is ``"error"`` rather than ``"ok"``.
867 |         """
868 |         if not from_:
869 |             from_ = (datetime.utcnow() - timedelta(days=7)).strftime('%Y/%m/%d')
870 |         if not to_:
871 |             to_ = datetime.utcnow().strftime('%Y/%m/%d')
872 |
873 |         # create a timedelta corresponding to the by parameter
874 |         if by == 'month':
875 |             delta = timedelta(days=28)
876 |         elif by == 'week':
877 |             delta = timedelta(days=7)
878 |         elif by == 'day':
879 |             delta = timedelta(days=1)
880 |         elif by == 'hour':
881 |             delta = timedelta(hours=1)
882 |
883 |         # Convert the to_ and from_ parameters to datetime objects.
884 |         # Check if a time is specified and treat it accordingly
885 |         to_datetime = parse_date(to_, settings={'TIMEZONE': 'UTC'})
886 |
887 |         from_datetime = parse_date(from_, settings={'TIMEZONE': 'UTC'})
888 |
889 |         # the by interval can't be larger than `to_ - from_`
890 |         if to_datetime - from_datetime < delta:
891 |             raise ValueError("The 'by' parameter cannot be bigger than the difference between from_ and to_")
892 |
893 |         # initialize response dict/object
894 |         payload = {'status': '', 'total_hits': 0, 'page': 0, 'total_pages': 0, 'page_size': 0, 'articles': [], 'user_input': {}}
895 |
896 |         # flag so we can compensate for our midnight trickery in the from_ parameter
897 |         midnight_flag = False
898 |
899 |         while True:
900 |
901 |             if to_datetime - from_datetime <= delta:
902 |                 print(f'{from_datetime.strftime("%m/%d/%Y %H:%M:%S")} --> {to_datetime.strftime("%m/%d/%Y %H:%M:%S")}')
903 |                 results = self.get_search_all_pages(q=q,
904 |                                                     lang=lang,
905 |                                                     not_lang=not_lang,
906 |                                                     from_=from_datetime.strftime("%m/%d/%Y %H:%M:%S"),
907 |                                                     to_=to_datetime.strftime("%m/%d/%Y %H:%M:%S"),
908 |                                                     published_date_precision=published_date_precision,
909 |                                                     search_in=search_in,
910 |                                                     countries=countries,
911 |                                                     not_countries=not_countries,
912 |                                                     topic=topic,
913 |                                                     sources=sources,
914 |                                                     not_sources=not_sources,
915 |                                                     ranked_only=ranked_only,
916 |                                                     from_rank=from_rank,
917 |                                                     to_rank=to_rank,
918 |                                                     sort_by=sort_by,
919 |                                                     page_size=page_size,
920 |                                                     page=page,
921 |                                                     proxies=proxies,
922 |                                                     max_page=max_page,
923 |                                                     seconds_pause=seconds_pause)
924 |
925 |                 utils.update_final_res(results, payload)
926 |                 payload['page_size'] = page_size
927 |                 payload['user_input'] = results['user_input']
928 |                 payload['user_input']['by'] = by
929 |                 if len(payload['articles']) > 0:
930 |                     payload['status'] = 'ok'
931 |                 else:
932 |                     payload['status'] = 'No matches for your search.'
933 |
934 |                 return payload
935 |
936 |             # move the to_ parameter forward by one `by` interval
937 |             else:
938 |                 temp_to_ = from_datetime + delta
939 |
940 |                 # subtract 1 sec if the to_ parameter is exactly midnight
941 |                 if temp_to_.hour == 0 and temp_to_.minute == 0 and \
942 |                         temp_to_.second == 0:
943 |                     temp_to_ -= timedelta(seconds=1)
944 |                     midnight_flag = True
945 |
946 |                 print(f'{from_datetime.strftime("%m/%d/%Y %H:%M:%S")} --> {temp_to_.strftime("%m/%d/%Y %H:%M:%S")}')
947 |
948 |                 results = self.get_search_all_pages(q=q,
949 |                                                     lang=lang,
950 |                                                     not_lang=not_lang,
951 |                                                     from_=from_datetime.strftime("%m/%d/%Y %H:%M:%S"),
952 |                                                     to_=temp_to_.strftime("%m/%d/%Y %H:%M:%S"),
953 |                                                     published_date_precision=published_date_precision,
954 |                                                     search_in=search_in,
955 |                                                     countries=countries,
956 |                                                     not_countries=not_countries,
957 |                                                     topic=topic,
958 |                                                     sources=sources,
959 |                                                     not_sources=not_sources,
960 |                                                     ranked_only=ranked_only,
961 |                                                     from_rank=from_rank,
962 |                                                     to_rank=to_rank,
963 |                                                     sort_by=sort_by,
964 |                                                     page_size=page_size,
965 |                                                     page=page,
966 |                                                     proxies=proxies,
967 |                                                     max_page=max_page,
968 |                                                     seconds_pause=seconds_pause)
969 |
970 |                 utils.update_final_res(results, payload)
971 |
972 |                 # move the from_ parameter forward; if we subtracted a second
973 |                 # above, add it back and reset the flag
974 |                 from_datetime = temp_to_
975 |                 if midnight_flag:
976 |                     from_datetime += timedelta(seconds=1)
977 |                     midnight_flag = False
978 |
--------------------------------------------------------------------------------
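A minimal usage sketch of the pagination helper above, assuming the client is constructed with the same `x_api_key` that `NewsCatcherApiAuth` sends as the `x-api-key` header; the key and query below are placeholders, not values taken from the SDK:

from newscatcherapi import NewsCatcherApiClient

# Placeholder API key; replace with your own.
newscatcherapi = NewsCatcherApiClient(x_api_key='YOUR_API_KEY')

# Extract up to 3 pages of 100 articles each, sleeping 1 second between
# calls so a per-second rate limit is respected.
results = newscatcherapi.get_search_all_pages(
    q='Tesla',
    lang='en',
    page_size=100,
    page=1,
    max_page=3,
    seconds_pause=1.0,
)

# The helper returns the first response with its 'articles' list replaced
# by the articles accumulated across all requested pages.
print(results['total_hits'], len(results['articles']))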
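And a similar sketch for the time-sliced variant, which splits the `from_`/`to_` range into `by`-sized windows and aggregates every page of every window; the dates and key are again illustrative placeholders:

from newscatcherapi import NewsCatcherApiClient

newscatcherapi = NewsCatcherApiClient(x_api_key='YOUR_API_KEY')  # placeholder key

# Two days of coverage, fetched one day at a time; each daily window is
# paginated internally by get_search_all_pages.
all_articles = newscatcherapi.get_search_all_articles(
    q='Bitcoin',
    lang='en',
    from_='2022/01/01',
    to_='2022/01/03',
    by='day',
    page_size=100,
    seconds_pause=1.0,
)

print(all_articles['status'], len(all_articles['articles']))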