├── tests ├── __init__.py └── test_GoogleNewsFeed.py ├── requirements.txt ├── .vscode └── settings.json ├── pyproject.toml ├── LICENSE ├── .github └── workflows │ ├── pip-publish.yml │ └── python-package.yml ├── README.md ├── .gitignore └── src └── google_news_feed └── __init__.py /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | dateparser>=1.1.1 2 | lxml>=4.9.0 3 | beautifulsoup4>=4.12.2 4 | requests>=2.31.0 5 | aiohttp>=3.8.4 6 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.testing.pytestArgs": [ 3 | "tests" 4 | ], 5 | "python.testing.unittestEnabled": false, 6 | "python.testing.pytestEnabled": true 7 | } -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["hatchling"] 3 | build-backend = "hatchling.build" 4 | 5 | [project] 6 | name = "google_news_feed" 7 | version = "1.1.0" 8 | authors = [ 9 | { name="Lukas Kreussel"}, 10 | ] 11 | description = "A simple python library to consume the google news rss feed" 12 | readme = "README.md" 13 | license = { file="LICENSE" } 14 | requires-python = ">=3.10" 15 | classifiers = [ 16 | "Programming Language :: Python :: 3", 17 | "License :: OSI Approved :: MIT License", 18 | "Operating System :: OS Independent", 19 | ] 20 | dependencies=["aiohttp>=3.8.4","dateparser>=1.1.1","lxml>=4.9.0","requests>=2.31.0","beautifulsoup4>=4.12.2"] 21 | 22 | [project.urls] 23 | "Homepage" = "https://github.com/LLukas22/Google-News-RSS" 24 | "Bug Tracker" = "https://github.com/LLukas22/Google-News-RSS/issues" -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 LLukas22 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /.github/workflows/pip-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | # This workflow uses actions that are not certified by GitHub. 5 | # They are provided by a third-party and are governed by 6 | # separate terms of service, privacy policy, and support 7 | # documentation. 8 | 9 | name: Upload Python Package 10 | 11 | on: 12 | release: 13 | types: [published] 14 | 15 | permissions: 16 | contents: read 17 | 18 | jobs: 19 | deploy: 20 | 21 | runs-on: ubuntu-latest 22 | 23 | steps: 24 | - uses: actions/checkout@v3 25 | - name: Set up Python 26 | uses: actions/setup-python@v3 27 | with: 28 | python-version: '3.10' 29 | - name: Install dependencies 30 | run: | 31 | python -m pip install --upgrade pip 32 | pip install build 33 | - name: Build package 34 | run: python -m build 35 | - name: Publish package 36 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 37 | with: 38 | user: __token__ 39 | password: ${{ secrets.PIP_TOKEN }} 40 | -------------------------------------------------------------------------------- /.github/workflows/python-package.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Python package 5 | 6 | on: 7 | push: 8 | branches: [ "main" ] 9 | pull_request: 10 | branches: [ "main" ] 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | strategy: 17 | fail-fast: false 18 | matrix: 19 | python-version: ["3.10"] 20 | 21 | steps: 22 | - uses: actions/checkout@v3 23 | - name: Set up Python ${{ matrix.python-version }} 24 | uses: actions/setup-python@v3 25 | with: 26 | python-version: ${{ matrix.python-version }} 27 | - name: Install dependencies 28 | run: | 29 | python -m pip install --upgrade pip 30 | python -m pip install flake8 pytest 31 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 32 | - name: Lint with flake8 33 | run: | 34 | # stop the build if there are Python syntax errors or undefined names 35 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 36 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 37 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 38 | - name: Test with pytest 39 | run: | 40 | pytest 41 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Google-News-Feed 2 | A simple python library to consume the google news rss feed. 3 | 4 | Inspired by [pygooglenews](https://github.com/kotartemiy/pygooglenews) and implemented using [aiohttp](https://pypi.org/project/aiohttp/) and [lxml](https://pypi.org/project/lxml/). 5 | 6 | 7 | ## Installation 8 | Via pip: pip install google-news-feed 9 | 10 | ## How to use 11 | ```python 12 | from google_news_feed import GoogleNewsFeed 13 | 14 | gnf = GoogleNewsFeed(language='en',country='US') 15 | results = gnf.query("python") 16 | print(results) 17 | ``` 18 | For more information about the query parameters see [here](https://newscatcherapi.com/blog/google-news-rss-search-parameters-the-missing-documentaiton). 19 | 20 | ### Get Top Headlines 21 | ```python 22 | gnf = GoogleNewsFeed(language='en',country='US') 23 | results = gnf.top_headlines() 24 | ``` 25 | 26 | ### Query a specific topic 27 | ```python 28 | gnf = GoogleNewsFeed(language='en',country='US') 29 | results = gnf.query_topic("business") 30 | ``` 31 | For more topics see [here](https://newscatcherapi.com/blog/google-news-rss-search-parameters-the-missing-documentaiton). 32 | ### Accessing the results 33 | The results are a list of NewsItems. 34 | ```python 35 | result = gnf.query("python")[0] 36 | print(result.title) 37 | print(result.link) 38 | print(result.pubDate) 39 | print(result.description) 40 | print(result.source) 41 | ``` 42 | 43 | ## Handling internal links 44 | Some links are internal to google news. To access the actual link to the news site the internal link has to be accessed and the redirect url is returned. To simplify this process the `resolve_internal_links` property can be set to True. 45 | ```python 46 | gnf = GoogleNewsFeed(language='en',country='US',resolve_internal_links=True) 47 | print(gnf.top_headlines()[0].link) 48 | ``` 49 | The resolution is handled asynchronously by default, but can be forced to be done synchronously via the `run_async` parameter. 50 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /tests/test_GoogleNewsFeed.py: -------------------------------------------------------------------------------- 1 | import src 2 | from src.google_news_feed import GoogleNewsFeed,NewsItem 3 | from datetime import datetime,date 4 | from lxml import etree 5 | from lxml.etree import _Element 6 | 7 | def test_build_ceid_should_build_ceid(): 8 | assert 'hl=en-US&gl=US&ceid=US:en' == GoogleNewsFeed()._build_ceid('US', 'en') 9 | 10 | def test_build_query_should_escape_query(): 11 | assert 'ELON%20MUSK' == GoogleNewsFeed()._build_query("ELON MUSK") 12 | 13 | def test_parse_item_shoudl_parse_item(): 14 | item = etree.Element("p") 15 | title = etree.SubElement(item, 'title') 16 | title.text = "Title" 17 | link = etree.SubElement(item, 'link') 18 | link.tail='http://www.foobar.com' 19 | pubDate = etree.SubElement(item, 'pubdate') 20 | pubDate.text = "2020-01-01" 21 | description = etree.SubElement(item, 'description') 22 | description.text = 'Foobar' 23 | source = etree.SubElement(item, 'source') 24 | source.text = 'source' 25 | 26 | news_item = GoogleNewsFeed()._parse_item(item) 27 | assert isinstance(news_item, NewsItem) 28 | assert news_item.description == 'Foobar' 29 | assert news_item.title == 'Title' 30 | assert news_item.link == 'http://www.foobar.com' 31 | assert news_item.pubDate == datetime(2020,1,1) 32 | assert news_item.source == 'source' 33 | 34 | def test_query_should_return_news_items(): 35 | news_items = GoogleNewsFeed(resolve_internal_links=False).query("Python") 36 | assert len(news_items) > 0 37 | assert isinstance(news_items[0], NewsItem) 38 | 39 | def test_query_works_with_start_date(): 40 | news_items = GoogleNewsFeed(resolve_internal_links=False).query("Python",after=date(2020,1,1)) 41 | assert len(news_items) > 0 42 | assert isinstance(news_items[0], NewsItem) 43 | 44 | 45 | def test_query_works_with_end_date(): 46 | news_items = GoogleNewsFeed(resolve_internal_links=False).query("Python",before=date(2020,1,1)) 47 | assert len(news_items) > 0 48 | assert isinstance(news_items[0], NewsItem) 49 | 50 | def test_query_works_with_start_and_end_date(): 51 | news_items = GoogleNewsFeed(resolve_internal_links=False).query("Python",after=date(2020,1,1),before=date(2021,1,1)) 52 | assert len(news_items) > 0 53 | assert isinstance(news_items[0], NewsItem) 54 | 55 | def test_query_works_with_interval(): 56 | news_items = GoogleNewsFeed(resolve_internal_links=False).query("Python",when="3m") 57 | assert len(news_items) > 0 58 | assert isinstance(news_items[0], NewsItem) 59 | 60 | def test_query_topic_should_return_news_items(): 61 | news_items = GoogleNewsFeed(resolve_internal_links=False).query_topic("world") 62 | assert len(news_items) > 0 63 | assert isinstance(news_items[0], NewsItem) 64 | 65 | def test_top_headlines_should_return_news_items(): 66 | news_items = GoogleNewsFeed(resolve_internal_links=False).top_headlines() 67 | assert len(news_items) > 0 68 | assert isinstance(news_items[0], NewsItem) 69 | 70 | def test_internal_link_are_resolved(): 71 | news_items = GoogleNewsFeed(run_async=False).top_headlines() 72 | assert len(news_items) > 0 73 | for news_item in news_items: 74 | assert not news_item.is_internal_google_link 75 | 76 | def test_internal_link_are_resolved_async(): 77 | news_items = GoogleNewsFeed().top_headlines() 78 | assert len(news_items) > 0 79 | for news_item in news_items: 80 | assert not news_item.is_internal_google_link -------------------------------------------------------------------------------- /src/google_news_feed/__init__.py: -------------------------------------------------------------------------------- 1 | import urllib.parse 2 | from lxml import etree 3 | from lxml.etree import _Element 4 | from datetime import datetime,date 5 | from dateparser import parse 6 | import asyncio 7 | import logging 8 | from typing import Optional 9 | from dataclasses import dataclass 10 | import requests 11 | from bs4 import BeautifulSoup 12 | import aiohttp 13 | 14 | 15 | GOOGLE_INTERNAL_URL = set(["https://news.google.com/__i/rss","https://news.google.com/rss"]) 16 | BASE_URL = 'https://news.google.com/rss' 17 | PARSER = etree.HTMLParser(recover=True) 18 | 19 | HEADERS = { 20 | 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36' 21 | } 22 | 23 | COOKIES = { 24 | 'CONSENT': 'YES+cb.20220419-08-p0.cs+FX+111' 25 | } 26 | 27 | KNOWN_TOPICS={ 28 | "BUSINESS":"CAAqKggKIiRDQkFTRlFvSUwyMHZNRGx6TVdZU0JXVnVMVlZUR2dKVlV5Z0FQAQ", 29 | "NATION":"CAAqKggKIiRDQkFTRlFvSUwyMHZNRGx6TVdZU0JXVnVMVlZUR2dKVlV5Z0FQAQ", 30 | "WORLD":"CAAqJggKIiBDQkFTRWdvSkwyMHZNRGxqTjNjd0VnVmxiaTFWVXlnQVAB", 31 | "TECHNOLOGY":"CAAqKggKIiRDQkFTRlFvSUwyMHZNRGx1YlY4U0JXVnVMVlZUR2dKVlV5Z0FQAQ", 32 | "ENTERTAINMENT":"CAAqKggKIiRDQkFTRlFvSUwyMHZNREpxYW5RU0JXVnVMVlZUR2dKVlV5Z0FQAQ", 33 | "SCIENCE":"CAAqKggKIiRDQkFTRlFvSUwyMHZNREpxYW5RU0JXVnVMVlZUR2dKVlV5Z0FQAQ", 34 | "SPORTS":"CAAqKggKIiRDQkFTRlFvSUwyMHZNRFp1ZEdvU0JXVnVMVlZUR2dKVlV5Z0FQAQ", 35 | "HEALTH":"CAAqJQgKIh9DQkFTRVFvSUwyMHZNR3QwTlRFU0JXVnVMVlZUS0FBUAE" 36 | } 37 | 38 | @dataclass 39 | class NewsItem(object): 40 | title:Optional[str]=None 41 | link:Optional[str]=None 42 | pubDate:Optional[datetime]=None 43 | description:Optional[str]=None 44 | source:Optional[str]=None 45 | 46 | def __repr__(self) -> str: 47 | return f'{self.title}' 48 | 49 | @property 50 | def is_internal_google_link(self)->bool: 51 | for internal_link in GOOGLE_INTERNAL_URL: 52 | if self.link.startswith(internal_link): 53 | return True 54 | return False 55 | 56 | class GoogleNewsFeed: 57 | def __init__(self,language:str='en',country:str='US',client:Optional[requests.Session]=None,resolve_internal_links:bool=True,run_async:bool=True)->None: 58 | self.language = language.lower() 59 | self.country = country.upper() 60 | if client: 61 | self.client = client 62 | else: 63 | self.client = requests.Session() 64 | self.client.headers.update(HEADERS) 65 | self.client.cookies.update(COOKIES) 66 | 67 | self.resolve_internal_links = resolve_internal_links 68 | self.run_async = run_async 69 | 70 | 71 | @staticmethod 72 | def _build_ceid(country:str,language:str)->str: 73 | return f'hl={language}-{country}&gl={country}&ceid={country}:{language}' 74 | 75 | @staticmethod 76 | def _build_query(query:str)->str: 77 | return urllib.parse.quote(query, safe='') 78 | 79 | @staticmethod 80 | def _build_query_url(query:str,country:str,language:str,before:date=None,after:date=None,when:str=None)->str: 81 | base_ure = f"{BASE_URL}/search?q=" 82 | 83 | query = GoogleNewsFeed._build_query(query) 84 | 85 | time_restrictions = [] 86 | if when: 87 | time_restrictions.append(f"when:{when}") 88 | else: 89 | if before: 90 | time_restrictions.append(f"before:{before.isoformat()}") 91 | if after: 92 | time_restrictions.append(f"after:{after.isoformat()}") 93 | 94 | if len(time_restrictions) > 0: 95 | return f"{base_ure}{query}+{'+'.join(time_restrictions)}&{GoogleNewsFeed._build_ceid(country,language)}" 96 | else: 97 | return f"{base_ure}{query}&{GoogleNewsFeed._build_ceid(country,language)}" 98 | 99 | 100 | @staticmethod 101 | def _parse_item(item:_Element)->NewsItem: 102 | parsed_item = NewsItem() 103 | for element in item.getchildren(): 104 | match element.tag: 105 | case 'title': 106 | parsed_item.title = element.text 107 | case 'link': 108 | parsed_item.link = element.tail 109 | case 'pubdate': 110 | parsed_item.pubDate = parse(element.text) 111 | case 'description': 112 | parsed_item.description = list(etree.fromstring(element.text,parser=PARSER).iter('a'))[0].text 113 | case 'source': 114 | parsed_item.source = element.text 115 | 116 | return parsed_item 117 | 118 | 119 | 120 | @staticmethod 121 | def _parse_feed(content:str)->list[NewsItem]: 122 | root = etree.fromstring(content,parser=PARSER) 123 | 124 | parsed_items = [] 125 | for item in root.iter('item'): 126 | try: 127 | parsed_items.append(GoogleNewsFeed._parse_item(item)) 128 | except Exception as e: 129 | logging.debug(f"Failed to parse item: {item}! Exception: {e}") 130 | 131 | return parsed_items 132 | 133 | async def _async_resolve_internal_links(self,items:list[NewsItem])->list[NewsItem]: 134 | async with aiohttp.ClientSession() as session: 135 | session.headers.update(HEADERS) 136 | session.cookie_jar.update_cookies(COOKIES) 137 | for item in items: 138 | try: 139 | if item.is_internal_google_link: 140 | async with session.get(item.link) as response: 141 | content = await response.text() 142 | if content: 143 | soup = BeautifulSoup(content, 'html.parser') 144 | item.link = soup.a['href'] 145 | del soup 146 | except: 147 | logging.debug(f"Failed to resolve internal link: {item.link}") 148 | return items 149 | 150 | def _resolve_internal_links(self,items:list[NewsItem])->list[NewsItem]: 151 | for item in items: 152 | try: 153 | if item.is_internal_google_link: 154 | response = self.client.get(item.link) 155 | if response.text: 156 | soup = BeautifulSoup(response.text, 'html.parser') 157 | item.link = soup.a['href'] 158 | del soup 159 | except: 160 | logging.debug(f"Failed to resolve internal link: {item.link}") 161 | return items 162 | 163 | def _get_feed(self,url:str)->list[NewsItem]: 164 | result = self.client.get(url) 165 | if result.status_code == 200: 166 | items = GoogleNewsFeed._parse_feed(result.content) 167 | if self.resolve_internal_links: 168 | if self.run_async: 169 | items = asyncio.run(self._async_resolve_internal_links(items)) 170 | else: 171 | items = self._resolve_internal_links(items) 172 | return items 173 | else: 174 | raise Exception(f"Error fetching feed: {url}") 175 | 176 | def query_topic(self,topic:str)->list[NewsItem]: 177 | if topic.upper() in KNOWN_TOPICS: 178 | topic = KNOWN_TOPICS[topic.upper()] 179 | 180 | url = f"{BASE_URL}/topics/{topic}?{GoogleNewsFeed._build_ceid(self.country,self.language)}" 181 | return self._get_feed(url) 182 | 183 | def top_headlines(self)->list[NewsItem]: 184 | url = f"{BASE_URL}?{GoogleNewsFeed._build_ceid(self.country,self.language)}" 185 | return self._get_feed(url) 186 | 187 | def query(self,query:str,before:date=None,after:date=None,when:str=None)->list[NewsItem]: 188 | """ 189 | For more information on the parameters, see: 190 | https://newscatcherapi.com/blog/google-news-rss-search-parameters-the-missing-documentaiton 191 | """ 192 | url = GoogleNewsFeed._build_query_url(query,self.country,self.language,before,after,when) 193 | return self._get_feed(url) 194 | 195 | @classmethod 196 | def known_topics()->list[str]: 197 | return KNOWN_TOPICS.keys() 198 | 199 | @classmethod 200 | def get_topic_hash(topic:str)->str|None: 201 | if topic.upper() in KNOWN_TOPICS: 202 | return KNOWN_TOPICS[topic.upper()] 203 | return None 204 | 205 | if __name__ == "__main__": 206 | gnf = GoogleNewsFeed() 207 | news = gnf.query("apple") 208 | print(news) 209 | 210 | 211 | --------------------------------------------------------------------------------