├── tests
├── __init__.py
└── test_GoogleNewsFeed.py
├── requirements.txt
├── .vscode
└── settings.json
├── pyproject.toml
├── LICENSE
├── .github
└── workflows
│ ├── pip-publish.yml
│ └── python-package.yml
├── README.md
├── .gitignore
└── src
└── google_news_feed
└── __init__.py
/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | dateparser>=1.1.1
2 | lxml>=4.9.0
3 | beautifulsoup4>=4.12.2
4 | requests>=2.31.0
5 | aiohttp>=3.8.4
6 |
--------------------------------------------------------------------------------
/.vscode/settings.json:
--------------------------------------------------------------------------------
1 | {
2 | "python.testing.pytestArgs": [
3 | "tests"
4 | ],
5 | "python.testing.unittestEnabled": false,
6 | "python.testing.pytestEnabled": true
7 | }
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["hatchling"]
3 | build-backend = "hatchling.build"
4 |
5 | [project]
6 | name = "google_news_feed"
7 | version = "1.1.0"
8 | authors = [
9 | { name="Lukas Kreussel"},
10 | ]
11 | description = "A simple python library to consume the google news rss feed"
12 | readme = "README.md"
13 | license = { file="LICENSE" }
14 | requires-python = ">=3.10"
15 | classifiers = [
16 | "Programming Language :: Python :: 3",
17 | "License :: OSI Approved :: MIT License",
18 | "Operating System :: OS Independent",
19 | ]
20 | dependencies=["aiohttp>=3.8.4","dateparser>=1.1.1","lxml>=4.9.0","requests>=2.31.0","beautifulsoup4>=4.12.2"]
21 |
22 | [project.urls]
23 | "Homepage" = "https://github.com/LLukas22/Google-News-RSS"
24 | "Bug Tracker" = "https://github.com/LLukas22/Google-News-RSS/issues"
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2022 LLukas22
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/.github/workflows/pip-publish.yml:
--------------------------------------------------------------------------------
1 | # This workflow will upload a Python Package using Twine when a release is created
2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
3 |
4 | # This workflow uses actions that are not certified by GitHub.
5 | # They are provided by a third-party and are governed by
6 | # separate terms of service, privacy policy, and support
7 | # documentation.
8 |
9 | name: Upload Python Package
10 |
11 | on:
12 | release:
13 | types: [published]
14 |
15 | permissions:
16 | contents: read
17 |
18 | jobs:
19 | deploy:
20 |
21 | runs-on: ubuntu-latest
22 |
23 | steps:
24 | - uses: actions/checkout@v3
25 | - name: Set up Python
26 | uses: actions/setup-python@v3
27 | with:
28 | python-version: '3.10'
29 | - name: Install dependencies
30 | run: |
31 | python -m pip install --upgrade pip
32 | pip install build
33 | - name: Build package
34 | run: python -m build
35 | - name: Publish package
36 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29
37 | with:
38 | user: __token__
39 | password: ${{ secrets.PIP_TOKEN }}
40 |
--------------------------------------------------------------------------------
/.github/workflows/python-package.yml:
--------------------------------------------------------------------------------
1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions
2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
3 |
4 | name: Python package
5 |
6 | on:
7 | push:
8 | branches: [ "main" ]
9 | pull_request:
10 | branches: [ "main" ]
11 |
12 | jobs:
13 | build:
14 |
15 | runs-on: ubuntu-latest
16 | strategy:
17 | fail-fast: false
18 | matrix:
19 | python-version: ["3.10"]
20 |
21 | steps:
22 | - uses: actions/checkout@v3
23 | - name: Set up Python ${{ matrix.python-version }}
24 | uses: actions/setup-python@v3
25 | with:
26 | python-version: ${{ matrix.python-version }}
27 | - name: Install dependencies
28 | run: |
29 | python -m pip install --upgrade pip
30 | python -m pip install flake8 pytest
31 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
32 | - name: Lint with flake8
33 | run: |
34 | # stop the build if there are Python syntax errors or undefined names
35 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
36 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
37 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
38 | - name: Test with pytest
39 | run: |
40 | pytest
41 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Google-News-Feed
2 | A simple python library to consume the google news rss feed.
3 |
4 | Inspired by [pygooglenews](https://github.com/kotartemiy/pygooglenews) and implemented using [aiohttp](https://pypi.org/project/aiohttp/) and [lxml](https://pypi.org/project/lxml/).
5 |
6 |
7 | ## Installation
Via pip: `pip install google-news-feed`
9 |
10 | ## How to use
11 | ```python
12 | from google_news_feed import GoogleNewsFeed
13 |
14 | gnf = GoogleNewsFeed(language='en',country='US')
15 | results = gnf.query("python")
16 | print(results)
17 | ```
18 | For more information about the query parameters see [here](https://newscatcherapi.com/blog/google-news-rss-search-parameters-the-missing-documentaiton).
19 |
20 | ### Get Top Headlines
21 | ```python
22 | gnf = GoogleNewsFeed(language='en',country='US')
23 | results = gnf.top_headlines()
24 | ```
25 |
26 | ### Query a specific topic
27 | ```python
28 | gnf = GoogleNewsFeed(language='en',country='US')
29 | results = gnf.query_topic("business")
30 | ```
31 | For more topics see [here](https://newscatcherapi.com/blog/google-news-rss-search-parameters-the-missing-documentaiton).
32 | ### Accessing the results
33 | The results are a list of NewsItems.
34 | ```python
35 | result = gnf.query("python")[0]
36 | print(result.title)
37 | print(result.link)
38 | print(result.pubDate)
39 | print(result.description)
40 | print(result.source)
41 | ```
42 |
43 | ## Handling internal links
44 | Some links are internal to google news. To access the actual link to the news site the internal link has to be accessed and the redirect url is returned. To simplify this process the `resolve_internal_links` property can be set to True.
45 | ```python
46 | gnf = GoogleNewsFeed(language='en',country='US',resolve_internal_links=True)
47 | print(gnf.top_headlines()[0].link)
48 | ```
The resolution is handled asynchronously by default, but can be forced to run synchronously by passing `run_async=False`.
50 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 |
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 |
101 | # SageMath parsed files
102 | *.sage.py
103 |
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 |
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 |
117 | # Rope project settings
118 | .ropeproject
119 |
120 | # mkdocs documentation
121 | /site
122 |
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 |
128 | # Pyre type checker
129 | .pyre/
130 |
--------------------------------------------------------------------------------
/tests/test_GoogleNewsFeed.py:
--------------------------------------------------------------------------------
1 | import src
2 | from src.google_news_feed import GoogleNewsFeed,NewsItem
3 | from datetime import datetime,date
4 | from lxml import etree
5 | from lxml.etree import _Element
6 |
def test_build_ceid_should_build_ceid():
    # The ceid suffix encodes country/language for feed localisation.
    result = GoogleNewsFeed()._build_ceid('US', 'en')
    assert result == 'hl=en-US&gl=US&ceid=US:en'
9 |
def test_build_query_should_escape_query():
    # Spaces must be percent-encoded as %20, not '+'.
    escaped = GoogleNewsFeed()._build_query("ELON MUSK")
    assert escaped == 'ELON%20MUSK'
12 |
def test_parse_item_should_parse_item():
    """_parse_item maps RSS child tags onto NewsItem fields.

    Fixed: function-name typo ('shoudl'), and the description fixture now
    contains an anchor — _parse_item extracts the text of the description's
    first <a>, so a bare string made it raise IndexError. Note the link URL
    is set via ``tail`` because lxml's HTML parser treats <link> as a void
    element.
    """
    item = etree.Element("item")
    title = etree.SubElement(item, 'title')
    title.text = "Title"
    link = etree.SubElement(item, 'link')
    link.tail = 'http://www.foobar.com'
    pub_date = etree.SubElement(item, 'pubdate')
    pub_date.text = "2020-01-01"
    description = etree.SubElement(item, 'description')
    description.text = '<a href="http://www.foobar.com">Foobar</a>'
    source = etree.SubElement(item, 'source')
    source.text = 'source'

    news_item = GoogleNewsFeed()._parse_item(item)
    assert isinstance(news_item, NewsItem)
    assert news_item.description == 'Foobar'
    assert news_item.title == 'Title'
    assert news_item.link == 'http://www.foobar.com'
    assert news_item.pubDate == datetime(2020, 1, 1)
    assert news_item.source == 'source'
33 |
def test_query_should_return_news_items():
    # Live request against the Google News RSS search endpoint.
    feed = GoogleNewsFeed(resolve_internal_links=False)
    items = feed.query("Python")
    assert items
    assert isinstance(items[0], NewsItem)
38 |
def test_query_works_with_start_date():
    # Live request with an 'after' lower bound on the publication date.
    feed = GoogleNewsFeed(resolve_internal_links=False)
    items = feed.query("Python", after=date(2020, 1, 1))
    assert items
    assert isinstance(items[0], NewsItem)
43 |
44 |
def test_query_works_with_end_date():
    # Live request with a 'before' upper bound on the publication date.
    feed = GoogleNewsFeed(resolve_internal_links=False)
    items = feed.query("Python", before=date(2020, 1, 1))
    assert items
    assert isinstance(items[0], NewsItem)
49 |
def test_query_works_with_start_and_end_date():
    # Live request with both date bounds forming a window.
    feed = GoogleNewsFeed(resolve_internal_links=False)
    items = feed.query("Python", after=date(2020, 1, 1), before=date(2021, 1, 1))
    assert items
    assert isinstance(items[0], NewsItem)
54 |
def test_query_works_with_interval():
    # Live request using a relative 'when' interval (last three months).
    feed = GoogleNewsFeed(resolve_internal_links=False)
    items = feed.query("Python", when="3m")
    assert items
    assert isinstance(items[0], NewsItem)
59 |
def test_query_topic_should_return_news_items():
    # Live request for a named topic feed ('world' maps to a known hash).
    feed = GoogleNewsFeed(resolve_internal_links=False)
    items = feed.query_topic("world")
    assert items
    assert isinstance(items[0], NewsItem)
64 |
def test_top_headlines_should_return_news_items():
    # Live request for the localized top-headlines feed.
    feed = GoogleNewsFeed(resolve_internal_links=False)
    items = feed.top_headlines()
    assert items
    assert isinstance(items[0], NewsItem)
69 |
def test_internal_link_are_resolved():
    # run_async=False exercises the synchronous requests-based resolver.
    items = GoogleNewsFeed(run_async=False).top_headlines()
    assert items
    assert all(not entry.is_internal_google_link for entry in items)
75 |
def test_internal_link_are_resolved_async():
    # Default constructor resolves links via the asyncio/aiohttp path.
    items = GoogleNewsFeed().top_headlines()
    assert items
    assert all(not entry.is_internal_google_link for entry in items)
--------------------------------------------------------------------------------
/src/google_news_feed/__init__.py:
--------------------------------------------------------------------------------
1 | import urllib.parse
2 | from lxml import etree
3 | from lxml.etree import _Element
4 | from datetime import datetime,date
5 | from dateparser import parse
6 | import asyncio
7 | import logging
8 | from typing import Optional
9 | from dataclasses import dataclass
10 | import requests
11 | from bs4 import BeautifulSoup
12 | import aiohttp
13 |
14 |
# URL prefixes identifying a Google-internal redirect link rather than a
# direct article URL (consumed by NewsItem.is_internal_google_link).
GOOGLE_INTERNAL_URL = set(["https://news.google.com/__i/rss","https://news.google.com/rss"])
# Root of the Google News RSS service; every feed URL is built from this.
BASE_URL = 'https://news.google.com/rss'
# Shared lxml parser; recover=True keeps parsing through malformed markup.
PARSER = etree.HTMLParser(recover=True)

# Browser-like User-Agent sent with every request.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
}

# Pre-set consent cookie — presumably to skip Google's consent interstitial
# for EU requests; verify the value is still accepted.
COOKIES = {
    'CONSENT': 'YES+cb.20220419-08-p0.cs+FX+111'
}

# Topic name -> opaque Google News topic hash used in /topics/<hash> URLs.
# NOTE(review): BUSINESS and NATION share one hash, as do ENTERTAINMENT and
# SCIENCE — looks like a copy/paste slip; confirm against the live feeds.
KNOWN_TOPICS={
    "BUSINESS":"CAAqKggKIiRDQkFTRlFvSUwyMHZNRGx6TVdZU0JXVnVMVlZUR2dKVlV5Z0FQAQ",
    "NATION":"CAAqKggKIiRDQkFTRlFvSUwyMHZNRGx6TVdZU0JXVnVMVlZUR2dKVlV5Z0FQAQ",
    "WORLD":"CAAqJggKIiBDQkFTRWdvSkwyMHZNRGxqTjNjd0VnVmxiaTFWVXlnQVAB",
    "TECHNOLOGY":"CAAqKggKIiRDQkFTRlFvSUwyMHZNRGx1YlY4U0JXVnVMVlZUR2dKVlV5Z0FQAQ",
    "ENTERTAINMENT":"CAAqKggKIiRDQkFTRlFvSUwyMHZNREpxYW5RU0JXVnVMVlZUR2dKVlV5Z0FQAQ",
    "SCIENCE":"CAAqKggKIiRDQkFTRlFvSUwyMHZNREpxYW5RU0JXVnVMVlZUR2dKVlV5Z0FQAQ",
    "SPORTS":"CAAqKggKIiRDQkFTRlFvSUwyMHZNRFp1ZEdvU0JXVnVMVlZUR2dKVlV5Z0FQAQ",
    "HEALTH":"CAAqJQgKIh9DQkFTRVFvSUwyMHZNR3QwTlRFU0JXVnVMVlZUS0FBUAE"
}
37 |
@dataclass
class NewsItem(object):
    """A single entry parsed from a Google News RSS feed.

    All fields are optional because the parser only fills in the tags
    actually present on the feed item.
    """
    title:Optional[str]=None
    link:Optional[str]=None          # may be a news.google.com redirect URL
    pubDate:Optional[datetime]=None
    description:Optional[str]=None
    source:Optional[str]=None

    def __repr__(self) -> str:
        return f'{self.title}'

    @property
    def is_internal_google_link(self)->bool:
        """True if ``link`` points at a Google-internal redirect URL.

        Returns False for items without a link (the previous version
        raised AttributeError on ``None.startswith``).
        """
        if not self.link:
            return False
        return any(self.link.startswith(prefix) for prefix in GOOGLE_INTERNAL_URL)
55 |
56 | class GoogleNewsFeed:
57 | def __init__(self,language:str='en',country:str='US',client:Optional[requests.Session]=None,resolve_internal_links:bool=True,run_async:bool=True)->None:
58 | self.language = language.lower()
59 | self.country = country.upper()
60 | if client:
61 | self.client = client
62 | else:
63 | self.client = requests.Session()
64 | self.client.headers.update(HEADERS)
65 | self.client.cookies.update(COOKIES)
66 |
67 | self.resolve_internal_links = resolve_internal_links
68 | self.run_async = run_async
69 |
70 |
71 | @staticmethod
72 | def _build_ceid(country:str,language:str)->str:
73 | return f'hl={language}-{country}&gl={country}&ceid={country}:{language}'
74 |
75 | @staticmethod
76 | def _build_query(query:str)->str:
77 | return urllib.parse.quote(query, safe='')
78 |
79 | @staticmethod
80 | def _build_query_url(query:str,country:str,language:str,before:date=None,after:date=None,when:str=None)->str:
81 | base_ure = f"{BASE_URL}/search?q="
82 |
83 | query = GoogleNewsFeed._build_query(query)
84 |
85 | time_restrictions = []
86 | if when:
87 | time_restrictions.append(f"when:{when}")
88 | else:
89 | if before:
90 | time_restrictions.append(f"before:{before.isoformat()}")
91 | if after:
92 | time_restrictions.append(f"after:{after.isoformat()}")
93 |
94 | if len(time_restrictions) > 0:
95 | return f"{base_ure}{query}+{'+'.join(time_restrictions)}&{GoogleNewsFeed._build_ceid(country,language)}"
96 | else:
97 | return f"{base_ure}{query}&{GoogleNewsFeed._build_ceid(country,language)}"
98 |
99 |
100 | @staticmethod
101 | def _parse_item(item:_Element)->NewsItem:
102 | parsed_item = NewsItem()
103 | for element in item.getchildren():
104 | match element.tag:
105 | case 'title':
106 | parsed_item.title = element.text
107 | case 'link':
108 | parsed_item.link = element.tail
109 | case 'pubdate':
110 | parsed_item.pubDate = parse(element.text)
111 | case 'description':
112 | parsed_item.description = list(etree.fromstring(element.text,parser=PARSER).iter('a'))[0].text
113 | case 'source':
114 | parsed_item.source = element.text
115 |
116 | return parsed_item
117 |
118 |
119 |
120 | @staticmethod
121 | def _parse_feed(content:str)->list[NewsItem]:
122 | root = etree.fromstring(content,parser=PARSER)
123 |
124 | parsed_items = []
125 | for item in root.iter('item'):
126 | try:
127 | parsed_items.append(GoogleNewsFeed._parse_item(item))
128 | except Exception as e:
129 | logging.debug(f"Failed to parse item: {item}! Exception: {e}")
130 |
131 | return parsed_items
132 |
133 | async def _async_resolve_internal_links(self,items:list[NewsItem])->list[NewsItem]:
134 | async with aiohttp.ClientSession() as session:
135 | session.headers.update(HEADERS)
136 | session.cookie_jar.update_cookies(COOKIES)
137 | for item in items:
138 | try:
139 | if item.is_internal_google_link:
140 | async with session.get(item.link) as response:
141 | content = await response.text()
142 | if content:
143 | soup = BeautifulSoup(content, 'html.parser')
144 | item.link = soup.a['href']
145 | del soup
146 | except:
147 | logging.debug(f"Failed to resolve internal link: {item.link}")
148 | return items
149 |
150 | def _resolve_internal_links(self,items:list[NewsItem])->list[NewsItem]:
151 | for item in items:
152 | try:
153 | if item.is_internal_google_link:
154 | response = self.client.get(item.link)
155 | if response.text:
156 | soup = BeautifulSoup(response.text, 'html.parser')
157 | item.link = soup.a['href']
158 | del soup
159 | except:
160 | logging.debug(f"Failed to resolve internal link: {item.link}")
161 | return items
162 |
163 | def _get_feed(self,url:str)->list[NewsItem]:
164 | result = self.client.get(url)
165 | if result.status_code == 200:
166 | items = GoogleNewsFeed._parse_feed(result.content)
167 | if self.resolve_internal_links:
168 | if self.run_async:
169 | items = asyncio.run(self._async_resolve_internal_links(items))
170 | else:
171 | items = self._resolve_internal_links(items)
172 | return items
173 | else:
174 | raise Exception(f"Error fetching feed: {url}")
175 |
176 | def query_topic(self,topic:str)->list[NewsItem]:
177 | if topic.upper() in KNOWN_TOPICS:
178 | topic = KNOWN_TOPICS[topic.upper()]
179 |
180 | url = f"{BASE_URL}/topics/{topic}?{GoogleNewsFeed._build_ceid(self.country,self.language)}"
181 | return self._get_feed(url)
182 |
183 | def top_headlines(self)->list[NewsItem]:
184 | url = f"{BASE_URL}?{GoogleNewsFeed._build_ceid(self.country,self.language)}"
185 | return self._get_feed(url)
186 |
187 | def query(self,query:str,before:date=None,after:date=None,when:str=None)->list[NewsItem]:
188 | """
189 | For more information on the parameters, see:
190 | https://newscatcherapi.com/blog/google-news-rss-search-parameters-the-missing-documentaiton
191 | """
192 | url = GoogleNewsFeed._build_query_url(query,self.country,self.language,before,after,when)
193 | return self._get_feed(url)
194 |
195 | @classmethod
196 | def known_topics()->list[str]:
197 | return KNOWN_TOPICS.keys()
198 |
199 | @classmethod
200 | def get_topic_hash(topic:str)->str|None:
201 | if topic.upper() in KNOWN_TOPICS:
202 | return KNOWN_TOPICS[topic.upper()]
203 | return None
204 |
if __name__ == "__main__":
    # Ad-hoc smoke test: run a sample query and print the parsed items.
    feed = GoogleNewsFeed()
    print(feed.query("apple"))
209 |
210 |
211 |
--------------------------------------------------------------------------------