├── app_store_scraper ├── tests │ ├── __init__.py │ └── test_all.py ├── __init__.py ├── __version__.py ├── podcast.py ├── app_store.py └── base.py ├── requirements.txt ├── .gitignore ├── .github ├── PULL_REQUEST_TEMPLATE.md └── workflows │ ├── publish.yml │ └── ci.yml ├── LICENCE ├── setup.py └── README.md /app_store_scraper/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | requests==2.23.0 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | venv 2 | __pycache__ 3 | .pytest_cache 4 | *.egg-info 5 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | This PR is about ... 2 | 3 | Changes: 4 | - Added ... 5 | - Removed ... 6 | - Changed ... 7 | - Fixed ... 8 | -------------------------------------------------------------------------------- /app_store_scraper/__init__.py: -------------------------------------------------------------------------------- 1 | from .app_store import AppStore 2 | from .podcast import Podcast 3 | from .__version__ import ( # noqa: F401 4 | __title__, 5 | __version__, 6 | __description__, 7 | __author__, 8 | __url__, 9 | __license__, 10 | ) 11 | 12 | __all__ = ["AppStore", "Podcast"] 13 | -------------------------------------------------------------------------------- /app_store_scraper/__version__.py: -------------------------------------------------------------------------------- 1 | __title__ = "app-store-scraper" 2 | __version__ = "0.3.5" 3 | __description__ = "Single API ☝ App Store Review Scraper 🧹" 4 | __author__ = "Eric Lim" 5 | __url__ = "https://github.com/cowboy-bebug/app-store-scraper" 6 | __license__ = "MIT" 7 | __keywords__ = [ 8 | "app store", 9 | "ios", 10 | "ios apps", 11 | "podcasts", 12 | "review", 13 | "scraping", 14 | "scraper", 15 | ] 16 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish 2 | 3 | on: 4 | release: 5 | types: [created] 6 | 7 | jobs: 8 | deploy: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v2 12 | - name: Set up Python 13 | uses: actions/setup-python@v2 14 | with: 15 | python-version: '3.x' 16 | - name: Install dependencies 17 | run: | 18 | python -m pip install --upgrade pip 19 | pip install setuptools wheel twine 20 | - name: Build and publish 🐍 📦 to PyPi 21 | env: 22 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 23 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 24 | run: | 25 | python setup.py sdist bdist_wheel 26 | twine upload dist/* 27 | -------------------------------------------------------------------------------- /app_store_scraper/podcast.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from .base import Base 3 | 4 | logger = logging.getLogger("Podcast") 5 | 6 | 7 | class Podcast(Base): 8 | _landing_host = "podcasts.apple.com" 9 | _request_host = "amp-api.podcasts.apple.com" 10 | 11 | _landing_path = "{country}/podcast/{app_name}/id{app_id}" 12 | _request_path = "v1/catalog/{country}/podcasts/{app_id}/reviews" 13 | 14 | def __init__( 15 | self, 16 | country, 17 | app_name, 18 | app_id=None, 19 | log_format="%(asctime)s [%(levelname)s] %(name)s - %(message)s", 20 | log_level="INFO", 21 | log_interval=5, 22 | ): 23 | super().__init__( 24 | country=country, 25 | app_name=app_name, 26 | app_id=app_id, 27 | log_format=log_format, 28 | log_level=log_level, 29 | log_interval=log_interval, 30 | ) 31 | 32 | # override 33 | self._request_params = { 34 | "l": "en-GB", 35 | "offset": self._request_offset, 36 | "limit": 20, 37 | } 38 | -------------------------------------------------------------------------------- /LICENCE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Eric Lim 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /app_store_scraper/app_store.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from .base import Base 3 | 4 | logger = logging.getLogger("AppStore") 5 | 6 | 7 | class AppStore(Base): 8 | _landing_host = "apps.apple.com" 9 | _request_host = "amp-api.apps.apple.com" 10 | 11 | _landing_path = "{country}/app/{app_name}/id{app_id}" 12 | _request_path = "v1/catalog/{country}/apps/{app_id}/reviews" 13 | 14 | def __init__( 15 | self, 16 | country, 17 | app_name, 18 | app_id=None, 19 | log_format="%(asctime)s [%(levelname)s] %(name)s - %(message)s", 20 | log_level="INFO", 21 | log_interval=5, 22 | ): 23 | super().__init__( 24 | country=country, 25 | app_name=app_name, 26 | app_id=app_id, 27 | log_format=log_format, 28 | log_level=log_level, 29 | log_interval=log_interval, 30 | ) 31 | 32 | # override 33 | self._request_params = { 34 | "l": "en-GB", 35 | "offset": self._request_offset, 36 | "limit": 20, 37 | "platform": "web", 38 | "additionalPlatforms": "appletv,ipad,iphone,mac", 39 | } 40 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | about = dict() 4 | 5 | with open("app_store_scraper/__version__.py", "r") as f: 6 | exec(f.read(), about) 7 | 8 | with open("README.md", "r") as f: 9 | long_description = f.read() 10 | 11 | with open("requirements.txt", "r") as f: 12 | install_requires = f.readlines() 13 | 14 | setuptools.setup( 15 | name=about["__title__"], 16 | version=about["__version__"], 17 | description=about["__description__"], 18 | long_description=long_description, 19 | long_description_content_type="text/markdown", 20 | author=about["__author__"], 21 | url=about["__url__"], 22 | license=about["__license__"], 23 | keywords=about["__keywords__"], 24 | packages=setuptools.find_packages(".", exclude=["*.tests"]), 25 | install_requires=install_requires, 26 | classifiers=[ 27 | "Intended Audience :: Developers", 28 | "License :: OSI Approved :: MIT License", 29 | "Natural Language :: English", 30 | "Operating System :: OS Independent", 31 | "Programming Language :: Python :: 3", 32 | "Programming Language :: Python :: 3.6", 33 | "Programming Language :: Python :: 3.7", 34 | "Programming Language :: Python :: 3.8", 35 | "Topic :: Internet :: WWW/HTTP", 36 | "Topic :: Scientific/Engineering :: Information Analysis", 37 | ], 38 | python_requires=">=3.6", 39 | project_urls={"Source": about["__url__"]}, 40 | ) 41 | -------------------------------------------------------------------------------- /app_store_scraper/tests/test_all.py: -------------------------------------------------------------------------------- 1 | from app_store_scraper import AppStore, Podcast 2 | from datetime import datetime, timedelta 3 | 4 | 5 | class TestEmptyApp: 6 | country = "Nz" 7 | app_name = "Cool App" 8 | app_id = 7357 9 | app = AppStore(country=country, app_name=app_name, app_id=app_id) 10 | 11 | def test_init_attributes(self): 12 | assert self.app.country == self.country.lower() 13 | assert self.app.app_name == self.app_name.lower().replace(" ", "-") 14 | assert self.app.app_id == self.app_id 15 | assert self.app.reviews == [] 16 | assert self.app.reviews_count == 0 17 | 18 | def test_init_url(self): 19 | base_landing_url = "https://apps.apple.com" 20 | landing_path = f"{self.app.country}/app/{self.app.app_name}/id{self.app.app_id}" 21 | landing_url = f"{base_landing_url}/{landing_path}" 22 | assert self.app.url == landing_url 23 | 24 | def test_repr(self): 25 | assert self.app.__repr__() == ( 26 | f"AppStore(country='{self.app.country}', " 27 | f"app_name='{self.app.app_name}', " 28 | f"app_id={self.app.app_id})" 29 | ) 30 | 31 | def test_str(self, capsys): 32 | print(self.app) 33 | captured = capsys.readouterr() 34 | assert captured.out == ( 35 | f" Country | {self.app.country}\n" 36 | f" Name | {self.app.app_name}\n" 37 | f" ID | {self.app.app_id}\n" 38 | f" URL | {self.app.url}\n" 39 | f"Review count | {self.app.reviews_count}\n" 40 | ) 41 | 42 | 43 | class TestAppStore: 44 | app = AppStore(country="us", app_name="minecraft") 45 | 46 | def test_search_id(self): 47 | self.app.search_id() 48 | assert self.app.app_id == 479516143 49 | 50 | def test_review(self): 51 | self.app.review(how_many=3) 52 | assert len(self.app.reviews) == 20 53 | assert len(self.app.reviews) == self.app.reviews_count 54 | 55 | def test_review_continuation(self): 56 | assert len(self.app.reviews) == 20 57 | self.app.review(how_many=7) 58 | assert len(self.app.reviews) == 40 59 | 60 | def test_reviews_for_duplicates(self): 61 | for i in range(len(self.app.reviews) - 1): 62 | assert self.app.reviews[i] != self.app.reviews[i + 1] 63 | 64 | def test_reviews_for_after(self): 65 | t1 = datetime.now() 66 | t0 = t1 - timedelta(weeks=26) 67 | self.app.reviews = [] 68 | self.app.review(how_many=3, after=t0) 69 | for review in self.app.reviews: 70 | assert review["date"] >= t0 and review["date"] < t1 71 | 72 | def test_reviews_for_sleep(self): 73 | t_start = datetime.now() 74 | self.app.review(how_many=40, sleep=2) 75 | t_diff = datetime.now() - t_start 76 | assert t_diff.seconds >= 2 77 | 78 | 79 | class TestPodcast: 80 | podcast = Podcast(country="us", app_name="stuff you should know") 81 | 82 | def test_search_id(self): 83 | self.podcast.search_id() 84 | assert self.podcast.app_id == 278981407 85 | 86 | def test_review(self): 87 | self.podcast.review(how_many=3) 88 | assert len(self.podcast.reviews) == 20 89 | assert len(self.podcast.reviews) == self.podcast.reviews_count 90 | 91 | def test_review_continuation(self): 92 | assert len(self.podcast.reviews) == 20 93 | self.podcast.review(how_many=7) 94 | assert len(self.podcast.reviews) == 40 95 | 96 | def test_reviews_for_duplicates(self): 97 | for i in range(len(self.podcast.reviews) - 1): 98 | assert self.podcast.reviews[i] != self.podcast.reviews[i + 1] 99 | 100 | def test_reviews_for_after(self): 101 | t1 = datetime.now() 102 | t0 = t1 - timedelta(weeks=26) 103 | self.podcast.reviews = [] 104 | self.podcast.review(how_many=3, after=t0) 105 | for review in self.podcast.reviews: 106 | assert review["date"] >= t0 and review["date"] < t1 107 | 108 | def test_reviews_for_sleep(self): 109 | t_start = datetime.now() 110 | self.podcast.review(how_many=40, sleep=2) 111 | t_diff = datetime.now() - t_start 112 | assert t_diff.seconds >= 2 113 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![build](https://img.shields.io/github/workflow/status/cowboy-bebug/app-store-scraper/Build) 2 | [![PRs Welcome](https://img.shields.io/badge/PRs-welcome-brightgreen.svg)](https://github.com/cowboy-bebug/app-store-scraper/pulls) 3 | [![PyPI](https://img.shields.io/pypi/v/app-store-scraper)](https://pypi.org/project/app-store-scraper/) 4 | ![downloads](https://img.shields.io/pypi/dm/app-store-scraper) 5 | ![license](https://img.shields.io/pypi/l/app-store-scraper) 6 | ![code style](https://img.shields.io/badge/code%20style-black-black) 7 | 8 | ``` 9 | ___ _____ _ _____ 10 | / _ \ / ___| | / ___| 11 | / /_\ \_ __ _ __ \ `--.| |_ ___ _ __ ___ \ `--. ___ _ __ __ _ _ __ ___ _ __ 12 | | _ | '_ \| '_ \ `--. \ __/ _ \| '__/ _ \ `--. \/ __| '__/ _` | '_ \ / _ \ '__| 13 | | | | | |_) | |_) | /\__/ / || (_) | | | __/ /\__/ / (__| | | (_| | |_) | __/ | 14 | \_| |_/ .__/| .__/ \____/ \__\___/|_| \___| \____/ \___|_| \__,_| .__/ \___|_| 15 | | | | | | | 16 | |_| |_| |_| 17 | ``` 18 | 19 | # Quickstart 20 | 21 | Install: 22 | ```console 23 | pip3 install app-store-scraper 24 | ``` 25 | 26 | Scrape reviews for an app: 27 | ```python 28 | from app_store_scraper import AppStore 29 | from pprint import pprint 30 | 31 | minecraft = AppStore(country="nz", app_name="minecraft") 32 | minecraft.review(how_many=20) 33 | 34 | pprint(minecraft.reviews) 35 | pprint(minecraft.reviews_count) 36 | ``` 37 | 38 | Scrape reviews for a podcast: 39 | ```python 40 | from app_store_scraper import Podcast 41 | from pprint import pprint 42 | 43 | sysk = Podcast(country="nz", app_name="stuff you should know") 44 | sysk.review(how_many=20) 45 | 46 | pprint(sysk.reviews) 47 | pprint(sysk.reviews_count) 48 | ``` 49 | 50 | # Extra Details 51 | 52 | Let's continue from the code example used in [Quickstart](#quickstart). 53 | 54 | 55 | ## Instantiation 56 | 57 | There are two required and one positional parameters: 58 | 59 | - `country` (required) 60 | - two-letter country code of [ISO 3166-1 alpha-2](https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2) standard 61 | - `app_name` (required) 62 | - name of an iOS application to fetch reviews for 63 | - also used by `search_id()` method to search for `app_id` internally 64 | - `app_id` (positional) 65 | - can be passed directly 66 | - or ignored to be obtained by `search_id` method internally 67 | 68 | Once instantiated, the object can be examined: 69 | ```pycon 70 | >>> minecraft 71 | AppStore(country='nz', app_name='minecraft', app_id=479516143) 72 | ``` 73 | ```pycon 74 | >>> print(app) 75 | Country | nz 76 | Name | minecraft 77 | ID | 479516143 78 | URL | https://apps.apple.com/nz/app/minecraft/id479516143 79 | Review count | 0 80 | ``` 81 | 82 | Other optional parameters are: 83 | 84 | - `log_format` 85 | - passed directly to `logging.basicConfig(format=log_format)` 86 | - default is `"%(asctime)s [%(levelname)s] %(name)s - %(message)s"` 87 | - `log_level` 88 | - passed directly to `logging.basicConfig(level=log_level)` 89 | - default is `"INFO"` 90 | - `log_interval` 91 | - log is produced every 5 seconds (by default) as a "heartbeat" (useful for a long scraping session) 92 | - default is `5` 93 | 94 | 95 | ## Fetching Review 96 | 97 | The maximum number of reviews fetched per request is 20. To minimise the number of calls, the limit of 20 is hardcoded. This means the `review()` method will always grab more than the `how_many` argument supplied with an increment of 20. 98 | 99 | ```pycon 100 | >>> minecraft.review(how_many=33) 101 | >>> minecraft.reviews_count 102 | 40 103 | ``` 104 | 105 | If `how_many` is not provided, `review()` will terminate after *all* reviews are fetched. 106 | 107 | **NOTE** the review count seen on the landing page differs from the actual number of reviews fetched. This is simply because only *some* users who rated the app also leave reviews. 108 | 109 | ### Optional Parameters 110 | 111 | - `after` 112 | - a `datetime` object to filter older reviews 113 | - `sleep` 114 | - an `int` to specify seconds to sleep between each call 115 | 116 | ## Review Data 117 | 118 | The fetched review data are loaded in memory and live inside `reviews` attribute as a list of dict. 119 | ```pycon 120 | >>> minecraft.reviews 121 | [{'userName': 'someone', 'rating': 5, 'date': datetime.datetime(... 122 | ``` 123 | 124 | Each review dictionary has the following schema: 125 | ```python 126 | { 127 | "date": datetime.datetime, 128 | "isEdited": bool, 129 | "rating": int, 130 | "review": str, 131 | "title": str, 132 | "userName": str 133 | } 134 | ``` 135 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: Build 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | paths-ignore: 7 | - '**.md' 8 | pull_request: 9 | branches: [ master ] 10 | paths-ignore: 11 | - '**.md' 12 | 13 | jobs: 14 | build: 15 | runs-on: ubuntu-latest 16 | strategy: 17 | matrix: 18 | os: [ubuntu-latest, macos-latest, windows-latest] 19 | python-version: [3.6, 3.7, 3.8] 20 | steps: 21 | - name: Checkout 22 | uses: actions/checkout@v2 23 | - name: Set up Python ${{ matrix.python-version }} 24 | uses: actions/setup-python@v2 25 | with: 26 | python-version: ${{ matrix.python-version }} 27 | - name: Install dependencies 28 | run: | 29 | python -m pip install --upgrade pip 30 | pip install black flake8 pytest 31 | pip install -r requirements.txt 32 | - name: Format with black 33 | run: | 34 | black . --check 35 | - name: Lint with flake8 36 | run: | 37 | flake8 . --ignore=E203 --count --show-source --statistics --max-line-length=90 38 | - name: Test with pytest 39 | run: | 40 | pytest 41 | release: 42 | runs-on: ubuntu-latest 43 | steps: 44 | - name: Checkout 45 | uses: actions/checkout@v2 46 | with: 47 | fetch-depth: 0 48 | - name: Check version bump and set versions as envvar 49 | if: github.ref != 'refs/heads/master' 50 | run: | 51 | FILES_CHANGED=$(echo -n $(git diff origin/${{ github.base_ref }} --name-only)) 52 | if [[ $FILES_CHANGED != *.py ]]; then 53 | echo "No Python scripts are modified" 54 | echo "::set-env name=FILES_CHANGED::false" 55 | exit 0 56 | fi 57 | 58 | SEMVER_PATTERN="([0-9]+)\.([0-9]+)\.([0-9]+)" 59 | VERSION=$(echo -n $(git diff origin/${{ github.base_ref }} -G '__version__' app_store_scraper/__version__.py)) 60 | 61 | if [ "$VERSION" = "" ]; then 62 | echo "Version must be bumped for every PR" 63 | exit 1 64 | fi 65 | 66 | VERSION_OLD=$(echo "$VERSION" | sed -E "s/.*\-__version__ = \"($SEMVER_PATTERN).+/\1/") 67 | VERSION_OLD_MAJOR=$(echo "$VERSION_OLD" | sed -E "s/$SEMVER_PATTERN/\1/") 68 | VERSION_OLD_MINOR=$(echo "$VERSION_OLD" | sed -E "s/$SEMVER_PATTERN/\2/") 69 | VERSION_OLD_PATCH=$(echo "$VERSION_OLD" | sed -E "s/$SEMVER_PATTERN/\3/") 70 | 71 | VERSION_NEW=$(echo "$VERSION" | sed -E "s/.*\+__version__ = \"($SEMVER_PATTERN).+/\1/") 72 | VERSION_NEW_MAJOR=$(echo "$VERSION_NEW" | sed -E "s/$SEMVER_PATTERN/\1/") 73 | VERSION_NEW_MINOR=$(echo "$VERSION_NEW" | sed -E "s/$SEMVER_PATTERN/\2/") 74 | VERSION_NEW_PATCH=$(echo "$VERSION_NEW" | sed -E "s/$SEMVER_PATTERN/\3/") 75 | 76 | echo "::set-env name=VERSION_OLD::$VERSION_OLD" 77 | echo "::set-env name=VERSION_OLD_MAJOR::$VERSION_OLD_MAJOR" 78 | echo "::set-env name=VERSION_OLD_MINOR::$VERSION_OLD_MINOR" 79 | echo "::set-env name=VERSION_OLD_PATCH::$VERSION_OLD_PATCH" 80 | 81 | echo "::set-env name=VERSION_NEW::$VERSION_NEW" 82 | echo "::set-env name=VERSION_NEW_MAJOR::$VERSION_NEW_MAJOR" 83 | echo "::set-env name=VERSION_NEW_MINOR::$VERSION_NEW_MINOR" 84 | echo "::set-env name=VERSION_NEW_PATCH::$VERSION_NEW_PATCH" 85 | 86 | echo "Old version: $VERSION_OLD" 87 | echo "New version: $VERSION_NEW" 88 | - name: Check for patch version bump 89 | if: ${{ !startsWith(github.head_ref, 'release') && github.ref != 'refs/heads/master' }} 90 | run: | 91 | if [ "$FILES_CHANGED" = false ]; then 92 | echo "No Python scripts are modified" 93 | exit 0 94 | fi 95 | 96 | if [ "$VERSION_OLD_MAJOR" = "$VERSION_NEW_MAJOR" ] && 97 | [ "$VERSION_OLD_MINOR" = "$VERSION_NEW_MINOR" ]; then 98 | if (($VERSION_OLD_PATCH < $VERSION_NEW_PATCH)); then 99 | echo "Bumped patch version $VERSION_OLD -> $VERSION_NEW" 100 | exit 0 101 | else 102 | echo "Bump patch version in __version__" 103 | exit 1 104 | fi 105 | else 106 | echo "Major / minor version must be bumped in a release branch" 107 | exit 1 108 | fi 109 | - name: Check for major / minor version bump 110 | if: ${{ startsWith(github.head_ref, 'release') && github.ref != 'refs/heads/master' }} 111 | run: | 112 | if (($VERSION_OLD_MAJOR < $VERSION_NEW_MAJOR)); then 113 | echo "Bumped major version $VERSION_OLD -> $VERSION_NEW" 114 | exit 0 115 | elif (($VERSION_OLD_MINOR < $VERSION_NEW_MINOR)); then 116 | echo "Bumped minor version $VERSION_OLD -> $VERSION_NEW" 117 | exit 0 118 | else 119 | echo "Major / minor version must be bumped for release" 120 | exit 1 121 | fi 122 | - name: Create and push tag 123 | if: github.ref == 'refs/heads/master' 124 | run: | 125 | version=$(echo -n $(git diff HEAD^1 -G '__version__' app_store_scraper/__version__.py)) 126 | version=$(echo "$version" | sed -E "s/.*\+__version__.*([0-9]+\.[0-9]+\.[0-9]+).+/\1/") 127 | if [ "$version" != "" ]; then 128 | echo "Create and push v$version tag" 129 | git tag -f v"$version" 130 | git push origin v"$version" 131 | fi 132 | -------------------------------------------------------------------------------- /app_store_scraper/base.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import random 3 | import re 4 | import requests 5 | import sys 6 | import time 7 | from datetime import datetime 8 | from requests.adapters import HTTPAdapter 9 | from requests.packages.urllib3.util.retry import Retry 10 | 11 | logger = logging.getLogger("Base") 12 | 13 | 14 | class Base: 15 | _scheme = "https" 16 | 17 | _landing_host = "" 18 | _request_host = "" 19 | 20 | _landing_path = "" 21 | _request_path = "" 22 | 23 | _user_agents = [ 24 | # NOTE: grab from https://bit.ly/2zu0cmU 25 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)", 26 | "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko", 27 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)", 28 | ] 29 | 30 | def __init__( 31 | self, 32 | country, 33 | app_name, 34 | app_id=None, 35 | log_format="%(asctime)s [%(levelname)s] %(name)s - %(message)s", 36 | log_level="INFO", 37 | log_interval=5, 38 | ): 39 | logging.basicConfig(format=log_format, level=log_level.upper()) 40 | self._base_landing_url = f"{self._scheme}://{self._landing_host}" 41 | self._base_request_url = f"{self._scheme}://{self._request_host}" 42 | 43 | self.country = str(country).lower() 44 | self.app_name = re.sub(r"[\W_]+", "-", str(app_name).lower()) 45 | if app_id is None: 46 | logger.info("Searching for app id") 47 | app_id = self.search_id() 48 | self.app_id = int(app_id) 49 | 50 | self.url = self._landing_url() 51 | 52 | self.reviews = list() 53 | self.reviews_count = int() 54 | 55 | self._log_interval = float(log_interval) 56 | self._log_timer = float() 57 | 58 | self._fetched_count = int() 59 | 60 | self._request_url = self._request_url() 61 | self._request_offset = 0 62 | self._request_headers = { 63 | "Accept": "application/json", 64 | "Authorization": self._token(), 65 | "Connection": "keep-alive", 66 | "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8", 67 | "Origin": self._base_landing_url, 68 | "Referer": self.url, 69 | "User-Agent": random.choice(self._user_agents), 70 | } 71 | self._request_params = {} 72 | self._response = requests.Response() 73 | 74 | logger.info( 75 | f"Initialised: {self.__class__.__name__}" 76 | f"('{self.country}', '{self.app_name}', {self.app_id})" 77 | ) 78 | logger.info(f"Ready to fetch reviews from: {self.url}") 79 | 80 | def __repr__(self): 81 | return "{}(country='{}', app_name='{}', app_id={})".format( 82 | self.__class__.__name__, 83 | self.country, 84 | self.app_name, 85 | self.app_id, 86 | ) 87 | 88 | def __str__(self): 89 | width = 12 90 | return ( 91 | f"{'Country'.rjust(width, ' ')} | {self.country}\n" 92 | f"{'Name'.rjust(width, ' ')} | {self.app_name}\n" 93 | f"{'ID'.rjust(width, ' ')} | {self.app_id}\n" 94 | f"{'URL'.rjust(width, ' ')} | {self.url}\n" 95 | f"{'Review count'.rjust(width, ' ')} | {self.reviews_count}" 96 | ) 97 | 98 | def _landing_url(self): 99 | landing_url = f"{self._base_landing_url}/{self._landing_path}" 100 | return landing_url.format( 101 | country=self.country, app_name=self.app_name, app_id=self.app_id 102 | ) 103 | 104 | def _request_url(self): 105 | request_url = f"{self._base_request_url}/{self._request_path}" 106 | return request_url.format(country=self.country, app_id=self.app_id) 107 | 108 | def _get( 109 | self, 110 | url, 111 | headers=None, 112 | params=None, 113 | total=3, 114 | backoff_factor=3, 115 | status_forcelist=[404, 429], 116 | ) -> requests.Response: 117 | retries = Retry( 118 | total=total, 119 | backoff_factor=backoff_factor, 120 | status_forcelist=status_forcelist, 121 | ) 122 | with requests.Session() as s: 123 | s.mount(self._base_request_url, HTTPAdapter(max_retries=retries)) 124 | logger.debug(f"Making a GET request: {url}") 125 | self._response = s.get(url, headers=headers, params=params) 126 | 127 | def _token(self): 128 | self._get(self.url) 129 | tags = self._response.text.splitlines() 130 | for tag in tags: 131 | if re.match(r" interval: 168 | self._log_status() 169 | self._log_timer = 0 170 | 171 | def search_id(self): 172 | search_url = "https://www.google.com/search" 173 | self._get(search_url, params={"q": f"app store {self.app_name}"}) 174 | pattern = fr"{self._base_landing_url}/[a-z]{{2}}/.+?/id([0-9]+)" 175 | app_id = re.search(pattern, self._response.text).group(1) 176 | return app_id 177 | 178 | def review(self, how_many=sys.maxsize, after=None, sleep=None): 179 | self._log_timer = 0 180 | if after and not isinstance(after, datetime): 181 | raise SystemExit("`after` must be a datetime object.") 182 | 183 | try: 184 | while True: 185 | self._heartbeat() 186 | self._get( 187 | self._request_url, 188 | headers=self._request_headers, 189 | params=self._request_params, 190 | ) 191 | self._parse_data(after) 192 | self._parse_next() 193 | if self._request_offset is None or self._fetched_count >= how_many: 194 | break 195 | if sleep and type(sleep) is int: 196 | time.sleep(sleep) 197 | except KeyboardInterrupt: 198 | logger.error("Keyboard interrupted") 199 | except Exception as e: 200 | logger.error(f"Something went wrong: {e}") 201 | finally: 202 | self._log_status() 203 | self._fetched_count = 0 204 | --------------------------------------------------------------------------------