├── src └── oxylabs │ ├── sources │ ├── __init__.py │ ├── bing │ │ ├── __init__.py │ │ └── bing.py │ ├── amazon │ │ └── __init__.py │ ├── google │ │ └── __init__.py │ ├── kroger │ │ ├── __init__.py │ │ └── kroger.py │ ├── wayfair │ │ ├── __init__.py │ │ └── wayfair.py │ ├── universal │ │ ├── __init__.py │ │ └── universal.py │ ├── google_shopping │ │ ├── __init__.py │ │ └── google_shopping.py │ ├── youtube_transcript │ │ ├── __init__.py │ │ └── youtube_transcript.py │ └── response.py │ ├── utils │ ├── types │ │ ├── __init__.py │ │ ├── render.py │ │ ├── locale.py │ │ ├── user_agent_type.py │ │ ├── fn_name.py │ │ ├── source.py │ │ └── domain.py │ ├── __init__.py │ ├── defaults.py │ └── utils.py │ ├── _version.py │ ├── proxy │ ├── __init__.py │ └── proxy.py │ ├── internal │ ├── __init__.py │ ├── client.py │ └── api.py │ └── __init__.py ├── pyproject.toml ├── scripts ├── publish.sh ├── fmt.sh └── tests.sh ├── .gitignore ├── requirements.txt ├── LICENSE ├── setup.py ├── CHANGELOG.md ├── tests ├── proxy │ └── test_proxy.py └── sources │ ├── wayfair │ └── test_wayfair.py │ └── bing │ └── test_bing.py ├── CONTRIBUTING.md └── README.md /src/oxylabs/sources/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/oxylabs/utils/types/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/oxylabs/_version.py: -------------------------------------------------------------------------------- 1 | __version__ = "2.0.0" 2 | -------------------------------------------------------------------------------- /src/oxylabs/proxy/__init__.py: -------------------------------------------------------------------------------- 1 | from .proxy import ProxyClient 2 | -------------------------------------------------------------------------------- /src/oxylabs/sources/bing/__init__.py: -------------------------------------------------------------------------------- 1 | from .bing import Bing, BingAsync -------------------------------------------------------------------------------- /src/oxylabs/utils/types/render.py: -------------------------------------------------------------------------------- 1 | PNG = "png" 2 | HTML = "html" 3 | -------------------------------------------------------------------------------- /src/oxylabs/sources/amazon/__init__.py: -------------------------------------------------------------------------------- 1 | from .amazon import Amazon, AmazonAsync -------------------------------------------------------------------------------- /src/oxylabs/sources/google/__init__.py: -------------------------------------------------------------------------------- 1 | from .google import Google, GoogleAsync -------------------------------------------------------------------------------- /src/oxylabs/sources/kroger/__init__.py: -------------------------------------------------------------------------------- 1 | from .kroger import Kroger, KrogerAsync -------------------------------------------------------------------------------- /src/oxylabs/sources/wayfair/__init__.py: -------------------------------------------------------------------------------- 1 | from .wayfair import Wayfair, WayfairAsync -------------------------------------------------------------------------------- /src/oxylabs/internal/__init__.py: 
-------------------------------------------------------------------------------- 1 | from .client import AsyncClient, RealtimeClient 2 | -------------------------------------------------------------------------------- /src/oxylabs/sources/universal/__init__.py: -------------------------------------------------------------------------------- 1 | from .universal import Universal, UniversalAsync -------------------------------------------------------------------------------- /src/oxylabs/sources/google_shopping/__init__.py: -------------------------------------------------------------------------------- 1 | from .google_shopping import GoogleShopping, GoogleShoppingAsync -------------------------------------------------------------------------------- /src/oxylabs/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .types import domain, fn_name, locale, render, source, user_agent_type 2 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools", "wheel"] 3 | build-backend = "setuptools.build_meta" 4 | -------------------------------------------------------------------------------- /src/oxylabs/__init__.py: -------------------------------------------------------------------------------- 1 | from .internal import AsyncClient, RealtimeClient 2 | from .proxy.proxy import ProxyClient 3 | -------------------------------------------------------------------------------- /src/oxylabs/sources/youtube_transcript/__init__.py: -------------------------------------------------------------------------------- 1 | from .youtube_transcript import YoutubeTranscript, YoutubeTranscriptAsync -------------------------------------------------------------------------------- /scripts/publish.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | rm -rf dist/ build/ oxylabs.egg-info/ 4 | python setup.py sdist bdist_wheel 5 | twine upload dist/* 6 | -------------------------------------------------------------------------------- /scripts/fmt.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Run isort on the src directory 4 | isort src 5 | 6 | # Run black on the src directory 7 | black --line-length 79 src 8 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Python 2 | *.pyc 3 | *.pyo 4 | *.pyd 5 | __pycache__/ 6 | *.egg-info/ 7 | dist/ 8 | build/ 9 | .venv/ 10 | *.json 11 | *.html 12 | .env 13 | .idea 14 | virtual-env -------------------------------------------------------------------------------- /src/oxylabs/utils/types/locale.py: -------------------------------------------------------------------------------- 1 | EN = "en" 2 | RU = "ru" 3 | BY = "by" 4 | DE = "de" 5 | FR = "fr" 6 | ID = "id" 7 | KK = "kk" 8 | TT = "tt" 9 | TR = "tr" 10 | UK = "uk" 11 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp==3.9.4 2 | aiosignal==1.3.1 3 | async-timeout==4.0.3 4 | asyncio==3.4.3; python_version < '3.7' 5 | attrs==23.2.0 6 | certifi==2024.7.4 7 | charset-normalizer==3.3.2 8 | frozenlist==1.4.1 9 | idna==3.7 10 | 
multidict==6.0.5 11 | requests==2.32.2 12 | urllib3==2.2.2 13 | yarl==1.9.4 14 | -------------------------------------------------------------------------------- /src/oxylabs/utils/types/user_agent_type.py: -------------------------------------------------------------------------------- 1 | MOBILE = "mobile" 2 | TABLET = "tablet" 3 | DESKTOP = "desktop" 4 | MOBILE_IOS = "mobile_ios" 5 | TABLET_IOS = "tablet_ios" 6 | DESKTOP_EDGE = "desktop_edge" 7 | DESKTOP_OPERA = "desktop_opera" 8 | DESKTOP_SAFARI = "desktop_safari" 9 | MOBILE_ANDROID = "mobile_android" 10 | DESKTOP_CHROME = "desktop_chrome" 11 | TABLET_ANDROID = "tablet_android" 12 | DESKTOP_FIREFOX = "desktop_firefox" 13 | -------------------------------------------------------------------------------- /src/oxylabs/utils/defaults.py: -------------------------------------------------------------------------------- 1 | SYNC_BASE_URL = "https://realtime.oxylabs.io/v1/queries" 2 | ASYNC_BASE_URL = "https://data.oxylabs.io/v1/queries" 3 | 4 | PROXY_BASE_URL = "realtime.oxylabs.io" 5 | PROXY_PORT = 60000 6 | NON_UNIVERSAL_DOMAINS = {"google", "bing", "amazon", "wayfair"} 7 | 8 | 9 | DEFAULT_REQUEST_TIMEOUT = 165 10 | DEFAULT_POLL_INTERVAL = 5 11 | DEFAULT_REQUEST_TIMEOUT_ASYNC = 105 12 | DEFAULT_JOB_COMPLETION_TIMEOUT = 50 13 | -------------------------------------------------------------------------------- /src/oxylabs/utils/types/fn_name.py: -------------------------------------------------------------------------------- 1 | ELEMENT_TEXT = "element_text" 2 | XPATH = "xpath" 3 | XPATH_ONE = "xpath_one" 4 | CSS = "css" 5 | CSS_ONE = "css_one" 6 | AMOUNT_FROM_STRING = "amount_from_string" 7 | AMOUNT_RANGE_FROM_STRING = "amount_range_from_string" 8 | JOIN = "join" 9 | REGEX_FIND_ALL = "regex_find_all" 10 | REGEX_SEARCH = "regex_search" 11 | REGEX_SUBSTRING = "regex_substring" 12 | LENGTH = "length" 13 | SELECT_NTH = "select_nth" 14 | CONVERT_TO_FLOAT = "convert_to_float" 15 | CONVERT_TO_INT = "convert_to_int" 16 | CONVERT_TO_STR = "convert_to_str" 17 | AVERAGE = "average" 18 | MAX = "max" 19 | MIN = "min" 20 | PRODUCT = "product" 21 | -------------------------------------------------------------------------------- /scripts/tests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Run source tests 4 | python -m unittest tests.sources.bing.test_bing.TestBingSearchSync 5 | python -m unittest tests.sources.bing.test_bing.TestBingSearchAsync 6 | python -m unittest tests.sources.bing.test_bing.TestBingUrlSync 7 | python -m unittest tests.sources.bing.test_bing.TestBingUrlAsync 8 | 9 | python -m unittest tests.sources.wayfair.test_wayfair.TestWayfairSearchSync 10 | python -m unittest tests.sources.wayfair.test_wayfair.TestWayfairSearchAsync 11 | python -m unittest tests.sources.wayfair.test_wayfair.TestWayfairUrlSync 12 | python -m unittest tests.sources.wayfair.test_wayfair.TestWayfairUrlAsync 13 | 14 | # Run proxy tests 15 | python -m unittest tests.proxy.test_proxy.TestProxyGet 16 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024-now Oxylabs 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, 
distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | 3 | 4 | def get_version(rel_path): 5 | with open(rel_path, "r") as file: 6 | for line in file.read().splitlines(): 7 | if line.startswith("__version__"): 8 | delim = '"' if '"' in line else "'" 9 | return line.split(delim)[1] 10 | else: 11 | raise RuntimeError("Unable to find version string.") 12 | 13 | 14 | long_description = """ 15 | The official Python library for the Oxylabs Scraper APIs. 16 | 17 | Collect public data at scale with industry-leading web scraping solutions and the world's largest ethical proxy network. 18 | Documentation can be found at https://github.com/oxylabs/oxylabs-sdk-python. 19 | """ 20 | 21 | setup( 22 | name="oxylabs", 23 | version=get_version("src/oxylabs/_version.py"), 24 | description="Official Python library for Oxylabs Scraper APIs", 25 | long_description=long_description, 26 | url="https://oxylabs.io/", 27 | author="Oxylabs", 28 | author_email="support@oxylabs.io", 29 | license="MIT", 30 | package_dir={"": "src"}, 31 | packages=find_packages(where="src"), 32 | install_requires=["aiohttp", "requests"], 33 | ) 34 | -------------------------------------------------------------------------------- /src/oxylabs/utils/types/source.py: -------------------------------------------------------------------------------- 1 | GOOGLE_URL = "google" 2 | GOOGLE_ADS = "google_ads" 3 | GOOGLE_SEARCH = "google_search" 4 | GOOGLE_IMAGES = "google_images" 5 | GOOGLE_SUGGESTIONS = "google_suggest" 6 | GOOGLE_TRAVEL_HOTELS = "google_travel_hotels" 7 | GOOGLE_TRENDS_EXPLORE = "google_trends_explore" 8 | GOOGLE_MAPS = "google_maps" 9 | GOOGLE_LENS = "google_lens" 10 | 11 | BING_URL = "bing" 12 | BING_SEARCH = "bing_search" 13 | 14 | YANDEX_URL = "yandex" 15 | YANDEX_SEARCH = "yandex_search" 16 | 17 | BAIDU_URL = "baidu" 18 | BAIDU_SEARCH = "baidu_search" 19 | 20 | GOOGLE_SHOPPING_URL = "google_shopping" 21 | GOOGLE_SHOPPING_SEARCH = "google_shopping_search" 22 | GOOGLE_SHOPPING_PRODUCT = "google_shopping_product" 23 | GOOGLE_SHOPPING_PRICING = "google_shopping_pricing" 24 | 25 | WAYFAIR = "wayfair" 26 | WAYFAIR_SEARCH = "wayfair_search" 27 | 28 | UNIVERSAL = "universal_ecommerce" 29 | 30 | AMAZON_URL = "amazon" 31 | AMAZON_SEARCH = "amazon_search" 32 | AMAZON_PRODUCT = "amazon_product" 33 | AMAZON_PRICING = "amazon_pricing" 34 | AMAZON_REVIEWS = "amazon_reviews" 35 | AMAZON_QUESTIONS = "amazon_questions" 36 | AMAZON_BEST_SELLERS = "amazon_bestsellers" 37 | AMAZON_SELLERS = "amazon_sellers" 38 | 39 | KROGER = "kroger" 40 | KROGER_PRODUCT = "kroger_product" 
41 | KROGER_SEARCH = "kroger_search" 42 | 43 | YOUTUBE_TRANSCRIPT = "youtube_transcript" 44 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## 2.0.0 4 | - SERP and Ecommerce source split has been removed. New common modules have been created: API, Client and Response. 5 | - Support for new sources has been added: google_maps, google_lens, kroger, kroger_product, kroger_search, youtube_transcript. 6 | - Added browser_instructions parameter for universal source. 7 | - Docs update. 8 | 9 | ## 1.0.7 10 | - Add an SDK version identifier to all requests. 11 | 12 | ## 1.0.6 13 | 14 | - Security updates in 3rd party libraries. 15 | - Docs update. 16 | 17 | ## 1.0.5 18 | 19 | - Cleaned up tests from obsolete credentials. 20 | 21 | ## 1.0.4 22 | 23 | - Security updates in 3rd party libraries. 24 | 25 | ## 1.0.3 26 | 27 | - Updated import paths to resolve module not found errors. 28 | - Scraper methods now accept direct parameters and include a response object for easier access to results and metadata. 29 | - Replaced print statements with the logging module for better error handling. 30 | - Introduced AsyncClient, RealtimeClient, and ProxyClient to support all sources, providing a more organized structure. 31 | 32 | ## 1.0.2 33 | 34 | - Fixed function and class naming. 35 | - Added request timeout in proxy. 36 | - Removed Yandex and Baidu sources from SERP. 37 | 38 | ## 1.0.1 39 | 40 | - Fixed issue with uploaded package. 41 | 42 | ## 1.0.0 43 | 44 | - Initial release of Oxylabs SDK. 45 | - Scraper APIs: 46 | - SERP 47 | - Ecommerce 48 | - Integration methods: 49 | - Proxy 50 | - Push-Pull 51 | - Realtime 52 | -------------------------------------------------------------------------------- /src/oxylabs/internal/client.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from oxylabs.internal.api import APICredentials, RealtimeAPI, AsyncAPI 3 | from oxylabs.sources.amazon import Amazon, AmazonAsync 4 | from oxylabs.sources.bing import Bing, BingAsync 5 | from oxylabs.sources.google import Google, GoogleAsync 6 | from oxylabs.sources.google_shopping import GoogleShopping, GoogleShoppingAsync 7 | from oxylabs.sources.kroger import Kroger, KrogerAsync 8 | from oxylabs.sources.universal import Universal, UniversalAsync 9 | from oxylabs.sources.wayfair import Wayfair, WayfairAsync 10 | from oxylabs.sources.youtube_transcript import YoutubeTranscript, YoutubeTranscriptAsync 11 | 12 | # Configure logging 13 | logging.basicConfig(level=logging.INFO) 14 | logger = logging.getLogger(__name__) 15 | 16 | class RealtimeClient: 17 | def __init__(self, username: str, password: str, **kwargs) -> None: 18 | """ 19 | Initializes an instance of the RealtimeClient class. 20 | 21 | Args: 22 | username (str): The username for API authentication. 23 | password (str): The password for API authentication. 
24 | """ 25 | api = RealtimeAPI(APICredentials(username, password), **kwargs) 26 | self.amazon = Amazon(api) 27 | self.bing = Bing(api) 28 | self.google = Google(api) 29 | self.google_shopping = GoogleShopping(api) 30 | self.kroger = Kroger(api) 31 | self.universal = Universal(api) 32 | self.wayfair = Wayfair(api) 33 | self.youtube_transcript = YoutubeTranscript(api) 34 | 35 | class AsyncClient: 36 | def __init__(self, username: str, password: str, **kwargs) -> None: 37 | """ 38 | Initializes an instance of the AsyncClient class. 39 | 40 | Args: 41 | username (str): The username for API authentication. 42 | password (str): The password for API authentication. 43 | """ 44 | api = AsyncAPI(APICredentials(username, password), **kwargs) 45 | self.amazon = AmazonAsync(api) 46 | self.bing = BingAsync(api) 47 | self.google = GoogleAsync(api) 48 | self.google_shopping = GoogleShoppingAsync(api) 49 | self.kroger = KrogerAsync(api) 50 | self.universal = UniversalAsync(api) 51 | self.wayfair = WayfairAsync(api) 52 | self.youtube_transcript = YoutubeTranscriptAsync(api) 53 | 54 | 55 | -------------------------------------------------------------------------------- /tests/proxy/test_proxy.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from unittest.mock import patch, Mock 3 | from oxylabs.proxy import ProxyClient 4 | 5 | class TestProxyGet(unittest.TestCase): 6 | @patch('requests.Session') 7 | def test_proxy_get_with_timeout(self, MockSession): 8 | """ 9 | Tests the Proxy.get method for correct timeout handling and header 10 | setup. 11 | 12 | This test uses a mocked requests.Session to simulate HTTP responses and 13 | validate 14 | the interaction, ensuring the Proxy class constructs requests with the 15 | correct headers and timeout. 16 | 17 | Args: 18 | MockSession (MagicMock): A mock of the requests.Session to verify 19 | request execution. 20 | 21 | Steps: 22 | 1. Set up a mock response to simulate an HTTP response. 23 | 2. Configure Proxy instance with headers. 24 | 3. Make a request using Proxy.get with a timeout and verify the method 25 | call and response. 26 | 27 | Assertions: 28 | - Verify correct URL and timeout parameters are passed to the session's 29 | get method. 30 | - Check the response text matches expected content. 
31 | """ 32 | 33 | # Setup the mock response object with desired properties (like .text) 34 | mock_response = Mock() 35 | mock_response.status_code = 200 36 | mock_response.text = "Mock response content" 37 | 38 | # Setup the mock session to return the mock response on .get 39 | session_instance = MockSession.return_value 40 | session_instance.get.return_value = mock_response 41 | 42 | # Initialize the Proxy with credentials 43 | proxy = ProxyClient("CHANGEME", "CHANGEME") 44 | 45 | # Customize headers (optional) 46 | proxy.add_user_agent_header("desktop_chrome") 47 | proxy.add_geo_location_header("Germany") 48 | proxy.add_render_header("html") 49 | 50 | # Make the request using the proxy to the test URL 51 | result = proxy.get("https://www.example.com", request_timeout=10) 52 | 53 | # Assertions to ensure the request was made correctly 54 | session_instance.get.assert_called_with( 55 | "https://www.example.com", timeout=10 56 | ) 57 | self.assertEqual(result.text, "Mock response content") 58 | -------------------------------------------------------------------------------- /src/oxylabs/utils/types/domain.py: -------------------------------------------------------------------------------- 1 | RU = "ru" 2 | UA = "ua" 3 | TR = "tr" 4 | CN = "cn" 5 | COM_AI = "com.ai" 6 | COM_PR = "com.pr" 7 | SR = "sr" 8 | ML = "ml" 9 | COM_LB = "com.lb" 10 | BF = "bf" 11 | FM = "fm" 12 | COM_MX = "com.mx" 13 | BJ = "bj" 14 | EE = "ee" 15 | MV = "mv" 16 | NE = "ne" 17 | AT = "at" 18 | GG = "gg" 19 | AE = "ae" 20 | CO_UZ = "co.uz" 21 | AM = "am" 22 | COM_SA = "com.sa" 23 | TL = "tl" 24 | COM_NA = "com.na" 25 | COM_BH = "com.bh" 26 | DK = "dk" 27 | COM_SB = "com.sb" 28 | RO = "ro" 29 | BY = "by" 30 | COM_CO = "com.co" 31 | COM_GI = "com.gi" 32 | CO_ID = "co.id" 33 | MS = "ms" 34 | COM_NG = "com.ng" 35 | IS = "is" 36 | COM_EG = "com.eg" 37 | COM_ET = "com.et" 38 | COM_AF = "com.af" 39 | CH = "ch" 40 | CO_AO = "co.ao" 41 | CL = "cl" 42 | CO_ZA = "co.za" 43 | COM_NF = "com.nf" 44 | DK_RO = "ro" 45 | MD = "md" 46 | ES = "es" 47 | BJ_YO = "bj" 48 | HU = "hu" 49 | DJ = "dj" 50 | COM_MT = "com.mt" 51 | COM_EC = "com.ec" 52 | CO_IN = "co.in" 53 | LK = "lk" 54 | CO_KE = "co.ke" 55 | GY = "gy" 56 | BE = "be" 57 | VG = "vg" 58 | CO_BW = "co.bw" 59 | COM_VN = "com.vn" 60 | CO_TZ = "co.tz" 61 | NE_HA = "ne" 62 | CO_ZW = "co.zw" 63 | TO = "to" 64 | KZ = "kz" 65 | COM_UY = "com.uy" 66 | IQ = "iq" 67 | COM_TW = "com.tw" 68 | RW = "rw" 69 | AD = "ad" 70 | COM_LY = "com.ly" 71 | AL = "al" 72 | CO_IL = "co.il" 73 | KI = "ki" 74 | COM = "com" 75 | MU = "mu" 76 | SC = "sc" 77 | COM_HK = "com.hk" 78 | COM_PA = "com.pa" 79 | CA = "ca" 80 | GE = "ge" 81 | COM_GT = "com.gt" 82 | LI = "li" 83 | COM_KH = "com.kh" 84 | CO_CR = "co.cr" 85 | COM_BO = "com.bo" 86 | CO_VE = "co.ve" 87 | COM_NI = "com.ni" 88 | TD = "td" 89 | CF = "cf" 90 | TK = "tk" 91 | BI = "bi" 92 | MG = "mg" 93 | COM_BD = "com.bd" 94 | COM_BZ = "com.bz" 95 | GM = "gm" 96 | LA = "la" 97 | COM_KW = "com.kw" 98 | CM = "cm" 99 | HT = "ht" 100 | NO = "no" 101 | COM_FJ = "com.fj" 102 | TM = "tm" 103 | COM_SL = "com.sl" 104 | COM_MM = "com.mm" 105 | IM = "im" 106 | SI = "si" 107 | COM_QA = "com.qa" 108 | COM_PE = "com.pe" 109 | CD = "cd" 110 | TT = "tt" 111 | COM_TR = "com.tr" 112 | TG = "tg" 113 | CO_LS = "co.ls" 114 | GR = "gr" 115 | GL = "gl" 116 | MK = "mk" 117 | CO_ZM = "co.zm" 118 | COM_PH = "com.ph" 119 | IT = "it" 120 | CO_JP = "co.jp" 121 | WS = "ws" 122 | COM_AR = "com.ar" 123 | CO_MZ = "co.mz" 124 | AZ = "az" 125 | CO_CK = "co.ck" 126 | FI = "fi" 127 | COM_BN = 
"com.bn" 128 | PT = "pt" 129 | COM_TJ = "com.tj" 130 | COM_CY = "com.cy" 131 | CV = "cv" 132 | COM_MY = "com.my" 133 | IE = "ie" 134 | COM_SG = "com.sg" 135 | DE = "de" 136 | BA = "ba" 137 | LU = "lu" 138 | BG = "bg" 139 | CO_VI = "co.vi" 140 | COM_OM = "com.om" 141 | AS = "as" 142 | DZ = "dz" 143 | FR = "fr" 144 | LV = "lv" 145 | LT = "lt" 146 | PS = "ps" 147 | SE = "se" 148 | CG = "cg" 149 | NR = "nr" 150 | CO_UG = "co.ug" 151 | COM_VC = "com.vc" 152 | JO = "jo" 153 | CO_TH = "co.th" 154 | RS = "rs" 155 | BS = "bs" 156 | COM_PK = "com.pk" 157 | CO_UK = "co.uk" 158 | SO = "so" 159 | GA = "ga" 160 | COM_UA = "com.ua" 161 | HR = "hr" 162 | COM_CU = "com.cu" 163 | SK = "sk" 164 | COM_NP = "com.np" 165 | NU = "nu" 166 | MN = "mn" 167 | VU = "vu" 168 | NL = "nl" 169 | PT_ST = "st" 170 | COM_BR = "com.br" 171 | TH = "co.th" 172 | MW = "mw" 173 | COM_PG = "com.pg" 174 | PL = "pl" 175 | CO_NZ = "co.nz" 176 | KG = "kg" 177 | CI = "ci" 178 | SH = "sh" 179 | COM_DO = "com.do" 180 | SN = "sn" 181 | COM_JM = "com.jm" 182 | CO_MA = "co.ma" 183 | COM_TN = "com.tn" 184 | DM = "dm" 185 | COM_SV = "com.sv" 186 | COM_SG_2 = "com.sg" 187 | GP = "gp" 188 | ME = "me" 189 | COM_AG = "com.ag" 190 | CZ = "cz" 191 | COM_PY = "com.py" 192 | MR_IN = "co.in" 193 | COM_GH = "com.gh" 194 | ST_LS = "co.ls" 195 | BT = "bt" 196 | RU_KZ = "kz" 197 | IT_SM = "sm" 198 | JE = "je" 199 | TN = "tn" 200 | COM_AU = "com.au" 201 | ME_ME = "me" 202 | PN = "pn" 203 | HN = "hn" 204 | CO_KR = "co.kr" 205 | AR = "com.ar" 206 | BO = "com.bo" 207 | BZ = "com.bz" 208 | UY = "com.uy" 209 | COM_VE = "com.ve" 210 | ID_TL = "tl" 211 | -------------------------------------------------------------------------------- /tests/sources/wayfair/test_wayfair.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from oxylabs.internal.api import RealtimeAPI, AsyncAPI, APICredentials 3 | from oxylabs.sources.wayfair import Wayfair, WayfairAsync 4 | from oxylabs.utils.types import user_agent_type 5 | 6 | class TestWayfairSearchSync(unittest.TestCase): 7 | def test_wayfair_search_sync(self): 8 | """ 9 | Tests synchronous search functionality for Wayfair to ensure 10 | it returns expected results. 11 | 12 | This test mocks the get_response method to simulate the API responses 13 | and checks that the method handles the search query 14 | correctly and returns the correct mock response. 15 | """ 16 | api = RealtimeAPI(APICredentials('user', 'pass')) 17 | api.get_response = lambda payload, config: {"mocked_response": "search_results"} 18 | wayfair = Wayfair(api) 19 | query = "furniture" 20 | opts = {"start_page": 1, "pages": 1, "limit": 24} 21 | 22 | result = wayfair.scrape_search(query, opts) 23 | self.assertIn("mocked_response", result.raw) 24 | self.assertEqual(result.raw["mocked_response"], "search_results") 25 | 26 | class TestWayfairUrlSync(unittest.TestCase): 27 | def test_wayfair_url_sync(self): 28 | """ 29 | Tests the Wayfair URL scraping functionality in a 30 | synchronous manner. 31 | 32 | This test mocks the get_response method to return controlled responses, 33 | ensuring that the method correctly processes the URL and user agent 34 | type, returning the expected data. 
35 | """ 36 | api = RealtimeAPI(APICredentials('user', 'pass')) 37 | api.get_response = lambda payload, config: {"mocked_response": "url_results"} 38 | wayfair = Wayfair(api) 39 | url = "https://www.wayfair.com/furniture/sb0/sofas-c413892.html" 40 | opts = {"user_agent_type": user_agent_type.DESKTOP} 41 | 42 | result = wayfair.scrape_url(url, opts) 43 | self.assertIn("mocked_response", result.raw) 44 | self.assertEqual(result.raw["mocked_response"], "url_results") 45 | 46 | class TestWayfairSearchAsync(unittest.IsolatedAsyncioTestCase): 47 | async def test_wayfair_search_async(self): 48 | """ 49 | Asynchronously tests Wayfair search to validate the async 50 | API handling. 51 | 52 | Uses a mocked asynchronous response to verify that the search query 53 | processing is handled correctly and that the async functionality 54 | returns expected results. 55 | """ 56 | api = AsyncAPI(APICredentials('user', 'pass')) 57 | async def mock_get_resp(payload, config): 58 | return {"mocked_response": "async_search_results"} 59 | api.get_response = mock_get_resp 60 | wayfair = WayfairAsync(api) 61 | query = "furniture" 62 | opts = {"start_page": 1, "pages": 1, "limit": 24} 63 | 64 | result = await wayfair.scrape_search(query, opts) 65 | self.assertIn("mocked_response", result.raw) 66 | self.assertEqual(result.raw["mocked_response"], "async_search_results") 67 | 68 | class TestWayfairUrlAsync(unittest.IsolatedAsyncioTestCase): 69 | async def test_wayfair_url_async(self): 70 | """ 71 | Asynchronously tests Wayfair URL scraping functionality. 72 | 73 | This test mocks the get_response method to provide controlled async 74 | responses, verifying that the URL and user agent options are processed 75 | correctly and yield expected outcomes. 76 | """ 77 | api = AsyncAPI(APICredentials('user', 'pass')) 78 | async def mock_get_resp(payload, config): 79 | return {"mocked_response": "async_url_results"} 80 | api.get_response = mock_get_resp 81 | wayfair = WayfairAsync(api) 82 | url = "https://www.wayfair.com/furniture/sb0/sofas-c413892.html" 83 | opts = {"user_agent_type": user_agent_type.DESKTOP} 84 | 85 | result = await wayfair.scrape_url(url, opts) 86 | self.assertIn("mocked_response", result.raw) 87 | self.assertEqual(result.raw["mocked_response"], "async_url_results") 88 | -------------------------------------------------------------------------------- /src/oxylabs/sources/youtube_transcript/youtube_transcript.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from oxylabs.internal.api import RealtimeAPI, AsyncAPI 4 | from oxylabs.sources.response import Response 5 | from oxylabs.utils.types import source 6 | from oxylabs.utils.utils import prepare_config 7 | 8 | 9 | class YoutubeTranscript: 10 | def __init__(self, api_instance:RealtimeAPI) -> None: 11 | """ 12 | Initializes an instance of the YoutubeTranscript class. 13 | 14 | Args: 15 | api_instance: An instance of the RealtimeAPI class used for making requests. 16 | """ 17 | self._api_instance = api_instance 18 | 19 | def scrape_transcript( 20 | self, 21 | query: str, 22 | context: Optional[list] = None, 23 | callback_url: Optional[str] = None, 24 | request_timeout: Optional[int] = 165, 25 | **kwargs 26 | ) -> Response: 27 | """ 28 | Scrapes a YouTube video transcript for a given query. 29 | 30 | Args: 31 | query (str): A YouTube video ID 32 | context: Optional[list], 33 | callback_url (Optional[str]): URL to your callback endpoint. 
34 | request_timeout (int | 165, optional): The interval in seconds for 35 | the request to time out if no response is returned. 36 | Defaults to 165. 37 | 38 | Returns: 39 | Response: The response from the server after the job is completed. 40 | """ 41 | 42 | config = prepare_config(request_timeout=request_timeout) 43 | payload = { 44 | "source": source.YOUTUBE_TRANSCRIPT, 45 | "query": query, 46 | "context": context, 47 | "callback_url": callback_url, 48 | **kwargs, 49 | } 50 | api_response = self._api_instance.get_response(payload, config) 51 | return Response(api_response) 52 | 53 | class YoutubeTranscriptAsync: 54 | def __init__(self, api_instance:AsyncAPI) -> None: 55 | """ 56 | Initializes an instance of the YoutubeTranscriptAsync class. 57 | 58 | Args: 59 | api_instance: An instance of the AsyncAPI class used for making requests. 60 | """ 61 | self._api_instance = api_instance 62 | 63 | async def scrape_transcript( 64 | self, 65 | query: str, 66 | context: Optional[list] = None, 67 | callback_url: Optional[str] = None, 68 | request_timeout: Optional[int] = 165, 69 | job_completion_timeout: Optional[int] = None, 70 | poll_interval: Optional[int] = None, 71 | **kwargs 72 | ) -> Response: 73 | """ 74 | Asynchronously scrapes a YouTube video transcript for a given query. 75 | 76 | Args: 77 | query (str): A YouTube video ID 78 | context: Optional[list], 79 | callback_url (Optional[str]): URL to your callback endpoint. 80 | request_timeout (int | 165, optional): The interval in seconds for 81 | the request to time out if no response is returned. 82 | Defaults to 165. 83 | poll_interval (Optional[int]): The interval in seconds to poll 84 | the server for a response. 85 | job_completion_timeout (Optional[int]): The interval in 86 | seconds for the job to time out if no response is returned. 87 | 88 | Returns: 89 | Response: The response from the server after the job is completed. 90 | """ 91 | 92 | config = prepare_config( 93 | request_timeout=request_timeout, 94 | poll_interval=poll_interval, 95 | job_completion_timeout=job_completion_timeout, 96 | async_integration=True, 97 | ) 98 | payload = { 99 | "source": source.YOUTUBE_TRANSCRIPT, 100 | "query": query, 101 | "context": context, 102 | "callback_url": callback_url, 103 | **kwargs, 104 | } 105 | api_response = await self._api_instance.get_response(payload, config) 106 | return Response(api_response) 107 | -------------------------------------------------------------------------------- /tests/sources/bing/test_bing.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from oxylabs.utils.types import user_agent_type 3 | from oxylabs.internal import AsyncClient, RealtimeClient 4 | 5 | class TestBingSearchSync(unittest.TestCase): 6 | """ 7 | Test case for synchronous Bing search. 8 | 9 | This test case tests the functionality of the synchronous Bing search 10 | in the RealtimeClient class. It uses a mock response to simulate the 11 | behavior of the Bing search. 12 | """ 13 | 14 | def test_bing_search_sync(self): 15 | """ 16 | Test the synchronous Bing search. 17 | 18 | This test creates a RealtimeClient, finds an api instance that is used for requests and 19 | sets its get_response method to a lambda function that returns a mock response. 20 | It then calls the scrape_search method with a query and checks that the returned result 21 | contains the mock response. 
22 | """ 23 | client = RealtimeClient('user', 'pass') 24 | api = client.bing._api_instance 25 | api._get_http_response = lambda payload, method, config: {"mocked_response": "search_results"} 26 | query = "nike" 27 | 28 | result = client.bing.scrape_search(query, domain="com", limit=10) 29 | self.assertIn("mocked_response", result.raw) 30 | self.assertEqual(result.raw["mocked_response"], "search_results") 31 | 32 | class TestBingUrlSync(unittest.TestCase): 33 | """ 34 | Test case for synchronous Bing URL scraping. 35 | 36 | This test case tests the functionality of the synchronous Bing URL scraping 37 | in the RealtimeClient class. It uses a mock response to simulate the 38 | behavior of the Bing URL scraping. 39 | """ 40 | 41 | def test_bing_url_sync(self): 42 | """ 43 | Test the synchronous Bing URL scraping. 44 | 45 | This test creates a RealtimeClient, finds an api instance that is used for requests and 46 | sets its get_response method to a lambda function that returns a mock response. 47 | It then calls the scrape_url method with a URL and checks that the returned result 48 | contains the mock response. 49 | """ 50 | client = RealtimeClient('user', 'pass') 51 | api = client.bing._api_instance 52 | api._get_http_response = lambda payload, method, config: {"mocked_response": "url_results"} 53 | url = "https://www.bing.com/search?q=nike" 54 | opts = {"user_agent_type": user_agent_type.DESKTOP} 55 | 56 | result = client.bing.scrape_url(url, opts) 57 | self.assertIn("mocked_response", result.raw) 58 | self.assertEqual(result.raw["mocked_response"], "url_results") 59 | 60 | 61 | class TestBingSearchAsync(unittest.IsolatedAsyncioTestCase): 62 | """ 63 | Test case for asynchronous Bing search. 64 | 65 | This test case tests the functionality of the asynchronous Bing search 66 | in the AsyncClient class. It uses a mock response to simulate the 67 | behavior of the Bing search. 68 | """ 69 | 70 | async def test_bing_search_async(self): 71 | """ 72 | Test the asynchronous Bing search. 73 | 74 | This test creates an AsyncClient, finds an api instance that is used for requests and 75 | sets its get_response method to a mock function that returns a mock response. 76 | It then calls the scrape_search method with a query and checks that the returned result 77 | contains the mock response. 78 | """ 79 | client = AsyncClient('user', 'pass') 80 | api = client.bing._api_instance 81 | async def mock_get_resp(payload, config): 82 | return {"mocked_response": "async_search_results"} 83 | api.get_response = mock_get_resp 84 | query = "nike" 85 | opts = {"domain": "com", "limit": 10} 86 | 87 | result = await client.bing.scrape_search(query, opts) 88 | self.assertIn("mocked_response", result.raw) 89 | self.assertEqual(result.raw["mocked_response"], "async_search_results") 90 | 91 | class TestBingUrlAsync(unittest.IsolatedAsyncioTestCase): 92 | """ 93 | Test case for asynchronous Bing URL scraping. 94 | 95 | This test case tests the functionality of the asynchronous Bing URL scraping 96 | in the AsyncClient class. It uses a mock response to simulate the 97 | behavior of the Bing URL scraping. 98 | """ 99 | 100 | async def test_bing_url_async(self): 101 | """ 102 | Test the asynchronous Bing URL scraping. 103 | 104 | This test creates an AsyncClient, finds an api instance that is used for requests and 105 | sets its get_response method to a mock function that returns a mock response. 106 | It then calls the scrape_url method with a URL and checks that the returned result 107 | contains the mock response. 
108 | """ 109 | client = AsyncClient('user', 'pass') 110 | api = client.bing._api_instance 111 | async def mock_get_resp(payload, config): 112 | return {"mocked_response": "async_url_results"} 113 | api.get_response = mock_get_resp 114 | url = "https://www.bing.com/search?q=nike" 115 | opts = {"user_agent_type": user_agent_type.DESKTOP} 116 | 117 | result = await client.bing.scrape_url(url, opts) 118 | self.assertIn("mocked_response", result.raw) 119 | self.assertEqual(result.raw["mocked_response"], "async_url_results") 120 | -------------------------------------------------------------------------------- /src/oxylabs/proxy/proxy.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from platform import python_version, architecture 3 | from typing import Optional 4 | from urllib.parse import quote, urlparse 5 | 6 | import requests 7 | 8 | from oxylabs.utils.defaults import ( 9 | NON_UNIVERSAL_DOMAINS, 10 | PROXY_BASE_URL, 11 | PROXY_PORT, 12 | ) 13 | from oxylabs.utils.utils import prepare_config 14 | from oxylabs._version import __version__ 15 | 16 | # Configure logging 17 | logging.basicConfig(level=logging.INFO) 18 | logger = logging.getLogger(__name__) 19 | 20 | 21 | class ProxyClient: 22 | def __init__(self, username: str, password: str) -> None: 23 | """ 24 | Initializes a ProxyClient object with the provided username and password. 25 | 26 | Args: 27 | username (str): The username for the proxy authentication. 28 | password (str): The password for the proxy authentication. 29 | """ 30 | self._username = quote(username) 31 | self._password = quote(password) 32 | self._proxy_url = self._build_proxy_url() 33 | self._session = requests.Session() 34 | self._session.proxies = { 35 | "http": self._proxy_url, 36 | "https": self._proxy_url, 37 | } 38 | self._session.verify = False 39 | self._url_to_scrape = None 40 | bits, _ = architecture() 41 | self._session.headers["x-oxylabs-sdk"] = f"oxylabs-sdk-python/{__version__} ({python_version()}; {bits})" 42 | 43 | 44 | def _build_proxy_url(self) -> str: 45 | """ 46 | Build the proxy URL using configured constants. 47 | 48 | Returns: 49 | str: The constructed proxy URL. 50 | """ 51 | return f"http://{self._username}:{self._password}@{PROXY_BASE_URL}:{PROXY_PORT}" 52 | 53 | def get( 54 | self, url: str, request_timeout: Optional[int] = None 55 | ) -> Optional[requests.Response]: 56 | """ 57 | Sends a GET request to the specified URL using the session object. 58 | 59 | Args: 60 | url (str): The URL to send the GET request to. 61 | request_timeout (Optional[int]): The request timeout in seconds. 62 | Defaults to None (no timeout). 63 | 64 | Returns: 65 | Optional[requests.Response]: The response object returned by the 66 | GET request, or None if an error occurred. 67 | """ 68 | try: 69 | config = prepare_config(request_timeout=request_timeout) 70 | self._url_to_scrape = url 71 | response = self._session.get( 72 | url, timeout=config["request_timeout"] 73 | ) 74 | response.raise_for_status() 75 | return response 76 | except requests.exceptions.Timeout: 77 | logger.error( 78 | f"Timeout error. The request to {url} has timed out after {request_timeout} seconds." 79 | ) 80 | return None 81 | except requests.exceptions.RequestException as e: 82 | logger.error(f"Request failed: {e}") 83 | return None 84 | 85 | def add_user_agent_header(self, user_agent_type: str) -> None: 86 | """ 87 | Adds a user agent header to the session headers. 
88 | There is no way to indicate a specific User-Agent, but you can let us 89 | know which user-agent type you would like us to use. 90 | 91 | Args: 92 | user_agent_type (str): The user agent to add. Must be one of the 93 | following: 94 | - "desktop" - A User-Agent of a desktop browser. 95 | - "desktop_chrome" - A User-Agent of one of the latest versions of 96 | a desktop Chrome browser. 97 | - "desktop_edge" - A User-Agent of one of the latest versions of 98 | a desktop Edge browser. 99 | - "desktop_firefox" - A User-Agent of one of the latest versions of 100 | a desktop Firefox browser. 101 | - "desktop_opera" - A User-Agent of one of the latest versions of 102 | a desktop Opera browser. 103 | - "desktop_safari" - A User-Agent of one of the latest versions of 104 | a desktop Safari browser. 105 | - "mobile" - A User-Agent of a mobile browser. 106 | - "mobile_android" - A User-Agent of one of the latest versions of 107 | an Android mobile browser. 108 | - "mobile_ios" - A User-Agent of one of the latest versions of 109 | an iOS mobile browser. 110 | - "tablet" - A User-Agent of a tablet browser. 111 | - "tablet_android" - A User-Agent of one of the latest versions of 112 | an Android tablet browser. 113 | - "tablet_ios" - A User-Agent of one of the latest versions of 114 | an iOS tablet browser. 115 | 116 | Returns: 117 | None 118 | """ 119 | self._session.headers["x-oxylabs-user-agent-type"] = user_agent_type 120 | 121 | def add_render_header(self, render: str) -> None: 122 | """ 123 | Adds a render header to the session headers. 124 | 125 | Args: 126 | render (str): The render type to add. Must be one of the following: 127 | - "html" - The output will include an HTML result. 128 | - "png" - The output will include a PNG screenshot of the result. 129 | 130 | Returns: 131 | None 132 | """ 133 | self._session.headers["x-oxylabs-render"] = render 134 | 135 | def add_parse_header( 136 | self, parse: bool = False, parsing_instructions: Optional[dict] = None 137 | ) -> None: 138 | """ 139 | Adds a parse header to the session headers. 140 | 141 | Args: 142 | parse (bool, optional): Whether to enable parsing. Defaults to 143 | False. 144 | parsing_instructions (dict, optional): Instructions for parsing. 145 | Defaults to None. 146 | 147 | Returns: 148 | None 149 | """ 150 | 151 | if parse or parsing_instructions: 152 | self._session.headers["x-oxylabs-parse"] = "1" 153 | if self._is_universal_source(): 154 | self._session.headers["x-oxylabs-parser-type"] = ( 155 | "universal_ecommerce" 156 | ) 157 | else: 158 | self._session.headers.pop("x-oxylabs-parser-type", None) 159 | else: 160 | self._session.headers.pop("x-oxylabs-parse", None) 161 | 162 | def _is_universal_source(self) -> bool: 163 | """ 164 | Checks if the URL to scrape belongs to a universal source. 165 | 166 | Returns: 167 | bool: True if the URL belongs to a universal source, False 168 | otherwise. 169 | """ 170 | parsed_url = urlparse(self._url_to_scrape) 171 | if any( 172 | domain in parsed_url.netloc 173 | for domain in NON_UNIVERSAL_DOMAINS 174 | ): 175 | return False 176 | 177 | return True 178 | 179 | def add_geo_location_header(self, geo_location: str) -> None: 180 | """ 181 | Adds a geo location header to the session headers. 182 | In some cases, you may need to indicate the geographical location that 183 | the result should be adapted for. 184 | 185 | Args: 186 | geo_location (str): The geo location to add. Accepted values depend 187 | on the URL you would like us to scrape.
188 | 189 | Returns: 190 | None 191 | """ 192 | self._session.headers["x-oxylabs-geo-location"] = geo_location 193 | -------------------------------------------------------------------------------- /src/oxylabs/sources/universal/universal.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from oxylabs.internal.api import RealtimeAPI, AsyncAPI 4 | from oxylabs.sources.response import Response 5 | from oxylabs.utils.types import source 6 | from oxylabs.utils.utils import ( 7 | check_parsing_instructions_validity, 8 | prepare_config, 9 | ) 10 | 11 | 12 | class Universal: 13 | def __init__(self, api_instance:RealtimeAPI) -> None: 14 | """ 15 | Initializes an instance of the Universal class. 16 | 17 | Args: 18 | api_instance: An instance of the RealtimeAPI class used for making requests. 19 | """ 20 | self._api_instance = api_instance 21 | 22 | def scrape_url( 23 | self, 24 | url: str, 25 | user_agent_type: Optional[str] = None, 26 | geo_location: Optional[str] = None, 27 | locale: Optional[str] = None, 28 | render: Optional[str] = None, 29 | content_encoding: Optional[str] = None, 30 | context: Optional[list] = None, 31 | callback_url: Optional[str] = None, 32 | parse: Optional[bool] = None, 33 | parser_type: Optional[str] = None, 34 | parsing_instructions: Optional[dict] = None, 35 | browser_instructions: Optional[dict] = None, 36 | request_timeout: Optional[int] = 165, 37 | **kwargs 38 | ) -> Response: 39 | """ 40 | Scrapes Universal search results for a given URL. 41 | 42 | Args: 43 | url (str): The URL to be scraped. 44 | user_agent_type (Optional[str]): Device type and browser. 45 | geo_location (Optional[str]): None, 46 | locale (Optional[str]): Accept-Language header value which changes page web interface language. 47 | render (Optional[str]): Enables JavaScript rendering. 48 | content_encoding: Add this parameter if you are downloading images. 49 | context: Optional[list], 50 | callback_url (Optional[str]): URL to your callback endpoint. 51 | parse (Optional[bool]): true will return structured data. 52 | parser_type: Set the value to ecommerce_product to access our AI-powered Adaptive Parser. 53 | parsing_instructions (Optional[dict]): Instructions for parsing the results. 54 | browser_instructions (Optional[dict]): Browser instructions that are executed when rendering JavaScript. 55 | request_timeout (int | 165, optional): The interval in seconds for 56 | the request to time out if no response is returned. 57 | Defaults to 165. 58 | 59 | Returns: 60 | Response: The response from the server after the job is completed. 61 | """ 62 | 63 | config = prepare_config(request_timeout=request_timeout) 64 | payload = { 65 | "source": source.UNIVERSAL, 66 | "url": url, 67 | "user_agent_type": user_agent_type, 68 | "geo_location": geo_location, 69 | "locale": locale, 70 | "render": render, 71 | "content_encoding": content_encoding, 72 | "context": context, 73 | "callback_url": callback_url, 74 | "parse": parse, 75 | "parser_type": parser_type, 76 | "parsing_instructions": parsing_instructions, 77 | "browser_instructions": browser_instructions, 78 | **kwargs, 79 | } 80 | check_parsing_instructions_validity(parsing_instructions) 81 | api_response = self._api_instance.get_response(payload, config) 82 | return Response(api_response) 83 | 84 | class UniversalAsync: 85 | def __init__(self, api_instance:AsyncAPI) -> None: 86 | """ 87 | Initializes an instance of the Universal class. 
88 | 89 | Args: 90 | api_instance: An instance of the AsyncAPI class used for making requests. 91 | """ 92 | self._api_instance = api_instance 93 | 94 | async def scrape_url( 95 | self, 96 | url: str, 97 | user_agent_type: Optional[str] = None, 98 | geo_location: Optional[str] = None, 99 | locale: Optional[str] = None, 100 | render: Optional[str] = None, 101 | content_encoding: Optional[str] = None, 102 | context: Optional[list] = None, 103 | callback_url: Optional[str] = None, 104 | parse: Optional[bool] = None, 105 | parser_type: Optional[str] = None, 106 | parsing_instructions: Optional[dict] = None, 107 | browser_instructions: Optional[dict] = None, 108 | request_timeout: Optional[int] = 165, 109 | job_completion_timeout: Optional[int] = None, 110 | poll_interval: Optional[int] = None, 111 | **kwargs 112 | ) -> Response: 113 | """ 114 | Asynchronously scrapes Universal search results for a given URL. 115 | 116 | Args: 117 | url (str): The URL to be scraped. 118 | user_agent_type (Optional[str]): Device type and browser. 119 | geo_location (Optional[str]): None, 120 | locale (Optional[str]): Accept-Language header value which changes page web interface language. 121 | render (Optional[str]): Enables JavaScript rendering. 122 | content_encoding: Add this parameter if you are downloading images. 123 | context: Optional[list], 124 | callback_url (Optional[str]): URL to your callback endpoint. 125 | parse (Optional[bool]): true will return structured data. 126 | parser_type: Set the value to ecommerce_product to access our AI-powered Adaptive Parser. 127 | parsing_instructions (Optional[dict]): Instructions for parsing the results. 128 | browser_instructions (Optional[dict]): Browser instructions that are executed when rendering JavaScript. 129 | request_timeout (int | 165, optional): The interval in seconds for 130 | the request to time out if no response is returned. 131 | Defaults to 165. 132 | poll_interval (Optional[int]): The interval in seconds to poll 133 | the server for a response. 134 | job_completion_timeout (Optional[int]): The interval in 135 | seconds for the job to time out if no response is returned. 136 | 137 | Returns: 138 | Response: The response from the server after the job is completed. 139 | """ 140 | 141 | config = prepare_config( 142 | request_timeout=request_timeout, 143 | poll_interval=poll_interval, 144 | job_completion_timeout=job_completion_timeout, 145 | async_integration=True, 146 | ) 147 | payload = { 148 | "source": source.UNIVERSAL, 149 | "url": url, 150 | "user_agent_type": user_agent_type, 151 | "geo_location": geo_location, 152 | "locale": locale, 153 | "render": render, 154 | "content_encoding": content_encoding, 155 | "context": context, 156 | "callback_url": callback_url, 157 | "parse": parse, 158 | "parser_type": parser_type, 159 | "parsing_instructions": parsing_instructions, 160 | "browser_instructions": browser_instructions, 161 | **kwargs, 162 | } 163 | check_parsing_instructions_validity(parsing_instructions) 164 | api_response = await self._api_instance.get_response(payload, config) 165 | return Response(api_response) 166 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | Thanks for taking the time to contribute! 4 | 5 | All types of contributions are encouraged and valued. Please try to read the 6 | relevant sections in this document before making your contribution. 
It will 7 | make it a lot easier for us maintainers and smooth out the experience for all 8 | involved. The community looks forward to your contributions. 9 | 10 | > And if you like the project, but just don't have time to contribute, that's 11 | > fine. There are other easy ways to support the project and show your 12 | > appreciation, which we would also be very happy about: 13 | > - Star the project 14 | > - Post about it on social media 15 | > - Refer this project in your own project's README 16 | > - Mention the project at local meetups and tell your friends/colleagues 17 | 18 | ## I Have a Question 19 | 20 | > Please ensure you've already read the available 21 | > [Documentation](https://developers.oxylabs.io/), which may have 22 | > answered your question. 23 | 24 | Before you ask a question, it is best to search for existing 25 | [Issues](https://github.com/oxylabs/oxylabs-sdk-python/issues) that might help 26 | you. In case you have found a suitable issue and still need clarification, you 27 | can write your question in this issue. It is also advisable to search the 28 | internet for answers first. 29 | 30 | If you then still feel the need to ask a question and need clarification, we 31 | recommend the following: 32 | 33 | - Open an [Issue](https://github.com/oxylabs/oxylabs-sdk-python/issues/new). 34 | - Provide as much context as you can about what you're running into. 35 | - Provide project and platform versions (nodejs, npm, etc), depending on what 36 | seems relevant. 37 | 38 | We will then take care of the issue as soon as possible. 39 | 40 | ## I Want To Contribute 41 | 42 | > ### Legal Notice 43 | > 44 | > When contributing to this project, you must agree that you have authored 100% 45 | > of the content, that you have the necessary rights to the content and that 46 | > the content you contribute may be provided under the project licence. 47 | 48 | ## Code Formatting 49 | 50 | This project uses [Black](https://black.readthedocs.io/en/stable/) and 51 | [isort](https://pycqa.github.io/isort/) for code formatting. Before you submit 52 | your contribution, please make sure your code is formatted according to these 53 | style guides. 54 | 55 | First, you will need to install the required tools if you haven't already: 56 | 57 | ```bash 58 | pip install black isort 59 | ``` 60 | 61 | You can use the provided `fmt.sh` script to automatically format your code. 62 | This script runs `isort` and `black` on the `src` directory. Here's how you can 63 | run it: 64 | 65 | ```bash 66 | scripts/fmt.sh 67 | ``` 68 | 69 | ## Running Tests 70 | 71 | To ensure the quality of the code, we encourage you to run tests after making 72 | any changes and before submitting a contribution. We have a script that 73 | facilitates running the unit tests for the project. 74 | 75 | To run the tests, use the `tests.sh` script located in the `scripts` directory. 76 | This will execute all the unit tests and report any failures. 77 | 78 | Here's how you can run it: 79 | 80 | ```bash 81 | scripts/tests.sh 82 | ``` 83 | 84 | ### Reporting Bugs 85 | 86 | #### Before Submitting a Bug Report 87 | 88 | A good bug report shouldn't leave others needing to chase you up for more 89 | information. Therefore, we ask you to investigate carefully, collect 90 | information and describe the issue in detail in your report. Please complete 91 | the following steps in advance to help us fix any potential bug as fast as 92 | possible. 93 | 94 | - Make sure that you are using the latest version. 
95 | - Determine if your bug is really a bug and not an error on your side e.g. 96 | using incompatible environment components/versions (Make sure that you have 97 | read the 98 | [documentation](https://developers.oxylabs.io/). If you 99 | are looking for support, you might want to check [this 100 | section](#i-have-a-question)). 101 | - To see if other users have experienced (and potentially already solved) the 102 | same issue you are having, check if there is not already a bug report 103 | existing for your bug or error in the [bug 104 | tracker](https://github.com/oxylabs/oxylabs-sdk-python/issues?q=label%3Abug). 105 | - Also make sure to search the internet (including Stack Overflow) to see if 106 | users outside of the GitHub community have discussed the issue. 107 | - Collect information about the bug: 108 | - Stack trace (Traceback) 109 | - OS, Platform and Version (Windows, Linux, macOS, x86, ARM) 110 | - Version of the interpreter, compiler, SDK, runtime environment, package 111 | manager, depending on what seems relevant. 112 | - Possibly your input and the output 113 | - Can you reliably reproduce the issue? And can you also reproduce it with 114 | older versions? 115 | 116 | #### How Do I Submit a Good Bug Report? 117 | 118 | > You must never report security related issues, vulnerabilities or bugs 119 | > including sensitive information to the issue tracker, or elsewhere in public. 120 | > Instead sensitive bugs must be sent by email to . 121 | 122 | We use GitHub issues to track bugs and errors. If you run into an issue with the 123 | project: 124 | 125 | - Open an [Issue](https://github.com/oxylabs/oxylabs-sdk-python/issues/new). 126 | (Since we can't be sure at this point whether it is a bug or not, we ask you 127 | not to talk about a bug yet and not to label the issue.) 128 | - Explain the behavior you would expect and the actual behavior. 129 | - Please provide as much context as possible and describe the *reproduction 130 | steps* that someone else can follow to recreate the issue on their own. This 131 | usually includes your code. For good bug reports you should isolate the 132 | problem and create a reduced test case. 133 | - Provide the information you collected in the previous section. 134 | 135 | Once it's filed: 136 | 137 | - The project team will label the issue accordingly. 138 | - A team member will try to reproduce the issue with your provided steps. If 139 | there are no reproduction steps or no obvious way to reproduce the issue, the 140 | team will ask you for those steps and mark the issue as `needs-repro`. Bugs 141 | with the `needs-repro` tag will not be addressed until they are reproduced. 142 | - If the team is able to reproduce the issue, it will be marked `needs-fix`, as 143 | well as possibly other tags (such as `critical`), and the issue will be left 144 | to be [implemented by someone](#your-first-code-contribution). 145 | 146 | ### Suggesting Enhancements 147 | 148 | This section guides you through submitting an enhancement suggestion, 149 | **including completely new features and minor improvements to existing 150 | functionality**. Following these guidelines will help maintainers and the 151 | community to understand your suggestion and find related suggestions. 152 | 153 | #### Before Submitting an Enhancement 154 | 155 | - Make sure that you are using the latest version. 
156 | - Read the [documentation](https://developers.oxylabs.io/) carefully and 157 | find out if the functionality is already covered, maybe by an individual 158 | configuration. 159 | - Perform a [search](https://github.com/oxylabs/oxylabs-sdk-python/issues) to see 160 | if the enhancement has already been suggested. If it has, add a comment to the 161 | existing issue instead of opening a new one. 162 | - Find out whether your idea fits with the scope and aims of the project. It's 163 | up to you to make a strong case to convince the project's developers of the 164 | merits of this feature. Keep in mind that we want features that will be 165 | useful to the majority of our users and not just a small subset. If you're 166 | just targeting a minority of users, consider writing an add-on/plugin 167 | library. 168 | 169 | #### How Do I Submit a Good Enhancement Suggestion? 170 | 171 | Enhancement suggestions are tracked as 172 | [GitHub issues](https://github.com/oxylabs/oxylabs-sdk-python/issues). 173 | 174 | - Use a **clear and descriptive title** for the issue to identify the 175 | suggestion. 176 | - Provide a **step-by-step description of the suggested enhancement** in as 177 | many details as possible. 178 | - **Describe the current behavior** and **explain which behavior you expected 179 | to see instead** and why. At this point you can also tell which alternatives 180 | do not work for you. 181 | - You may want to **include screenshots or screen recordings** which help you 182 | demonstrate the steps or point out the part which the suggestion is related 183 | to. 184 | - **Explain why this enhancement would be useful** to most users. You may also 185 | want to point out the other projects that solved it better and which could 186 | serve as inspiration. 187 | 188 | ## Security Issue Notifications 189 | 190 | Please see Oxylabs' [Vulnerability Disclosure 191 | Policy](https://oxylabs.io/legal/vulnerability-disclosure-policy) for details. 192 | -------------------------------------------------------------------------------- /src/oxylabs/sources/wayfair/wayfair.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from oxylabs.internal.api import RealtimeAPI, AsyncAPI 4 | from oxylabs.sources.response import Response 5 | from oxylabs.utils.types import source 6 | from oxylabs.utils.utils import prepare_config 7 | 8 | 9 | class Wayfair: 10 | def __init__(self, api_instance:RealtimeAPI) -> None: 11 | """ 12 | Initializes an instance of the Wayfair class. 13 | 14 | Args: 15 | api_instance: An instance of the RealtimeAPI class used for making requests. 16 | """ 17 | self._api_instance = api_instance 18 | 19 | def scrape_search( 20 | self, 21 | query: str, 22 | start_page: Optional[int] = None, 23 | pages: Optional[int] = None, 24 | limit: Optional[int] = None, 25 | user_agent_type: Optional[str] = None, 26 | callback_url: Optional[str] = None, 27 | request_timeout: Optional[int] = 165, 28 | **kwargs 29 | ) -> Response: 30 | """ 31 | Scrapes Wayfair search results for a given query. 32 | 33 | Args: 34 | query (str): The search query. 35 | start_page (Optional[int]): The starting page number. 36 | pages (Optional[int]): The number of pages to scrape. 37 | limit (Optional[int]): Number of results to retrieve in each page. 38 | user_agent_type (Optional[str]): Device type and browser. 39 | callback_url (Optional[str]): URL to your callback endpoint. 
40 | parsing_instructions (Optional[dict]): Instructions for parsing the results. 41 | request_timeout (int | 165, optional): The interval in seconds for 42 | the request to time out if no response is returned. 43 | Defaults to 165. 44 | 45 | Returns: 46 | Response: The response from the server after the job is completed. 47 | """ 48 | 49 | config = prepare_config(request_timeout=request_timeout) 50 | payload = { 51 | "source": source.WAYFAIR_SEARCH, 52 | "query": query, 53 | "start_page": start_page, 54 | "pages": pages, 55 | "limit": limit, 56 | "user_agent_type": user_agent_type, 57 | "callback_url": callback_url, 58 | **kwargs, 59 | } 60 | api_response = self._api_instance.get_response(payload, config) 61 | return Response(api_response) 62 | 63 | def scrape_url( 64 | self, 65 | url: str, 66 | user_agent_type: Optional[str] = None, 67 | callback_url: Optional[str] = None, 68 | request_timeout: Optional[int] = 165, 69 | **kwargs 70 | ) -> Response: 71 | """ 72 | Scrapes Wayfair search results for a given URL. 73 | 74 | Args: 75 | url (str): The URL to be scraped. 76 | user_agent_type (Optional[str]): Device type and browser. 77 | callback_url (Optional[str]): URL to your callback endpoint. 78 | parsing_instructions (Optional[dict]): Instructions for parsing the results. 79 | request_timeout (int | 165, optional): The interval in seconds for 80 | the request to time out if no response is returned. 81 | Defaults to 165. 82 | 83 | Returns: 84 | Response: The response from the server after the job is completed. 85 | """ 86 | 87 | config = prepare_config(request_timeout=request_timeout) 88 | payload = { 89 | "source": source.WAYFAIR, 90 | "url": url, 91 | "user_agent_type": user_agent_type, 92 | "callback_url": callback_url, 93 | **kwargs, 94 | } 95 | api_response = self._api_instance.get_response(payload, config) 96 | return Response(api_response) 97 | 98 | 99 | class WayfairAsync: 100 | def __init__(self, api_instance:AsyncAPI) -> None: 101 | """ 102 | Initializes an instance of the Wayfair class. 103 | 104 | Args: 105 | api_instance: An instance of the AsyncAPI class used for making requests. 106 | """ 107 | self._api_instance = api_instance 108 | 109 | async def scrape_search( 110 | self, 111 | query: str, 112 | start_page: Optional[int] = None, 113 | pages: Optional[int] = None, 114 | limit: Optional[int] = None, 115 | user_agent_type: Optional[str] = None, 116 | callback_url: Optional[str] = None, 117 | request_timeout: Optional[int] = 165, 118 | job_completion_timeout: Optional[int] = None, 119 | poll_interval: Optional[int] = None, 120 | **kwargs 121 | ) -> Response: 122 | """ 123 | Asynchronously scrapes Wayfair search results for a given query. 124 | 125 | Args: 126 | query (str): The search query. 127 | start_page (Optional[int]): The starting page number. 128 | pages (Optional[int]): The number of pages to scrape. 129 | limit (Optional[int]): Number of results to retrieve in each page 130 | user_agent_type (Optional[str]): Device type and browser. 131 | callback_url (Optional[str]): URL to your callback endpoint. 132 | parsing_instructions (Optional[dict]): Instructions for parsing the results. 133 | request_timeout (int | 165, optional): The interval in seconds for 134 | the request to time out if no response is returned. 135 | Defaults to 165. 136 | poll_interval (Optional[int]): The interval in seconds to poll 137 | the server for a response. 138 | job_completion_timeout (Optional[int]): The interval in 139 | seconds for the job to time out if no response is returned. 
140 | 141 | Returns: 142 | Response: The response from the server after the job is completed. 143 | """ 144 | 145 | config = prepare_config( 146 | request_timeout=request_timeout, 147 | poll_interval=poll_interval, 148 | job_completion_timeout=job_completion_timeout, 149 | async_integration=True, 150 | ) 151 | payload = { 152 | "source": source.WAYFAIR_SEARCH, 153 | "query": query, 154 | "start_page": start_page, 155 | "pages": pages, 156 | "limit": limit, 157 | "user_agent_type": user_agent_type, 158 | "callback_url": callback_url, 159 | **kwargs, 160 | } 161 | api_response = await self._api_instance.get_response(payload, config) 162 | return Response(api_response) 163 | 164 | async def scrape_url( 165 | self, 166 | url: str, 167 | user_agent_type: Optional[str] = None, 168 | callback_url: Optional[str] = None, 169 | request_timeout: Optional[int] = 165, 170 | job_completion_timeout: Optional[int] = None, 171 | poll_interval: Optional[int] = None, 172 | **kwargs 173 | ) -> Response: 174 | """ 175 | Asynchronously scrapes Wayfair search results for a given URL. 176 | 177 | Args: 178 | url (str): The URL to be scraped. 179 | user_agent_type (Optional[str]): Device type and browser. 180 | callback_url (Optional[str]): URL to your callback endpoint. 181 | parsing_instructions (Optional[dict]): Instructions for parsing the results. 182 | request_timeout (int | 165, optional): The interval in seconds for 183 | the request to time out if no response is returned. 184 | Defaults to 165. 185 | poll_interval (Optional[int]): The interval in seconds to poll 186 | the server for a response. 187 | job_completion_timeout (Optional[int]): The interval in 188 | seconds for the job to time out if no response is returned. 189 | 190 | Returns: 191 | Response: The response from the server after the job is completed. 192 | """ 193 | 194 | config = prepare_config( 195 | request_timeout=request_timeout, 196 | poll_interval=poll_interval, 197 | job_completion_timeout=job_completion_timeout, 198 | async_integration=True, 199 | ) 200 | payload = { 201 | "source": source.WAYFAIR, 202 | "url": url, 203 | "user_agent_type": user_agent_type, 204 | "callback_url": callback_url, 205 | **kwargs, 206 | } 207 | api_response = await self._api_instance.get_response(payload, config) 208 | return Response(api_response) 209 | -------------------------------------------------------------------------------- /src/oxylabs/internal/api.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import logging 3 | import requests 4 | import aiohttp 5 | import asyncio 6 | from platform import python_version, architecture 7 | from oxylabs._version import __version__ 8 | from oxylabs.utils.defaults import ASYNC_BASE_URL, SYNC_BASE_URL 9 | from oxylabs.utils.utils import ensure_session, close_session 10 | 11 | # Configure logging 12 | logging.basicConfig(level=logging.INFO) 13 | logger = logging.getLogger(__name__) 14 | 15 | class APICredentials: 16 | def __init__(self, username: str, password: str) -> None: 17 | """ 18 | Initializes an instance of the ApiCredentials class. 19 | 20 | Args: 21 | username (str): The username for API authentication. 22 | password (str): The password for API authentication. 
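        Note:
            Illustrative sketch only (the values below are placeholders, not
            real credentials): the username and password are Base64-encoded so
            they can be sent as an HTTP Basic Authorization header, which is
            how BaseAPI uses them when building request headers.

                creds = APICredentials("YOUR_USERNAME", "YOUR_PASSWORD")
                auth_header = f"Basic {creds.encoded_credentials}"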
23 | """ 24 | credentials = f"{username}:{password}" 25 | self.encoded_credentials = base64.b64encode(credentials.encode()).decode() 26 | 27 | class BaseAPI: 28 | def __init__(self, base_url: str, api_credentials: APICredentials, **kwargs) -> None: 29 | """ 30 | Initializes an instance of the BaseAPI class. 31 | 32 | Args: 33 | base_url (str): The URL of the API. 34 | api_credentials (APICredentials): An instance of APICredentials used for authentication. 35 | """ 36 | self._base_url = base_url 37 | bits, _ = architecture() 38 | sdk_type = kwargs.get("sdk_type", f"oxylabs-sdk-python/{__version__} ({python_version()}; {bits})") 39 | self._headers = { 40 | "Content-Type": "application/json", 41 | "Authorization": f"Basic {api_credentials.encoded_credentials}", 42 | "x-oxylabs-sdk": sdk_type, 43 | } 44 | 45 | class RealtimeAPI(BaseAPI): 46 | def __init__(self, api_credentials: APICredentials, **kwargs) -> None: 47 | """ 48 | Initializes an instance of the RealtimeAPI class. 49 | 50 | Args: 51 | api_credentials (APICredentials): An instance of APICredentials used for authentication. 52 | """ 53 | super().__init__(SYNC_BASE_URL, api_credentials, **kwargs) 54 | 55 | def get_response(self, payload:dict, config:dict) -> dict: 56 | """ 57 | Sends the payload synchronously and fetches the response. 58 | 59 | Args: 60 | payload (dict): The payload for the request. 61 | config (dict): The configuration for the request. 62 | 63 | Returns: 64 | dict: The response from the server after the job is completed. 65 | """ 66 | # Remove empty or null values from the payload 67 | payload = {k: v for k, v in payload.items() if v is not None} 68 | 69 | return self._get_http_response(payload, "POST", config) 70 | 71 | def _get_http_response(self, payload: dict, method: str, config: dict) -> dict | None: 72 | """ 73 | Sends an HTTP request to the specified URL with the given payload 74 | and method. 75 | 76 | Args: 77 | payload (dict): The payload to be sent with the request. 78 | method (str): The HTTP method to be used for the request 79 | (e.g., "POST", "GET"). 80 | config (dict): Additional configuration options for the 81 | request. 82 | 83 | Returns: 84 | dict: The JSON response from the server, if the request is 85 | successful. 86 | None, if an error occurs during the request. 87 | 88 | Raises: 89 | requests.exceptions.Timeout: If the request times out. 90 | requests.exceptions.HTTPError: If an HTTP error occurs. 91 | requests.exceptions.RequestException: If a general request 92 | error occurs. 93 | """ 94 | try: 95 | if method == "POST": 96 | response = requests.post( 97 | self._base_url, 98 | headers=self._headers, 99 | json=payload, 100 | timeout=config["request_timeout"], 101 | ) 102 | else: 103 | logger.error(f"Unsupported method: {method}") 104 | return None 105 | 106 | response.raise_for_status() 107 | 108 | if response.status_code == 200: 109 | return response.json() 110 | else: 111 | logger.error(f"Error occurred: {response.status_code}") 112 | return None 113 | 114 | except requests.exceptions.Timeout: 115 | logger.error( 116 | f"Timeout error. The request to {self._base_url} with method {method} has timed out." 
117 | ) 118 | return None 119 | except requests.exceptions.HTTPError as err: 120 | logger.error(f"HTTP error occurred: {err}") 121 | logger.error(response.text) 122 | return None 123 | except requests.exceptions.RequestException as err: 124 | logger.error(f"Error occurred: {err}") 125 | return None 126 | 127 | class AsyncAPI(BaseAPI): 128 | def __init__(self, api_credentials: APICredentials, **kwargs) -> None: 129 | """ 130 | Initializes an instance of the AsyncAPI class. 131 | 132 | Args: 133 | api_credentials (APICredentials): An instance of APICredentials used for authentication. 134 | """ 135 | super().__init__(ASYNC_BASE_URL, api_credentials, **kwargs) 136 | self._session = None 137 | self._requests = 0 138 | 139 | async def get_response(self, payload: dict, config: dict) -> dict | None: 140 | """ 141 | Processes the payload asynchronously and fetches the response. 142 | 143 | Args: 144 | payload (dict): The payload for the request. 145 | config (dict): The configuration for the request. 146 | 147 | Returns: 148 | dict: The response from the server after the job is completed. 149 | """ 150 | # Remove empty or null values from the payload 151 | payload = {k: v for k, v in payload.items() if v is not None} 152 | 153 | result = None 154 | self._requests += 1 155 | 156 | try: 157 | self._session = await ensure_session(self._session) 158 | 159 | result = await self._execute_with_timeout( 160 | payload, config, self._session 161 | ) 162 | return result 163 | 164 | except Exception as e: 165 | logger.error(f"An error occurred: {e}") 166 | 167 | finally: 168 | self._requests -= 1 169 | if self._requests == 0: 170 | await close_session(self._session) 171 | return None 172 | 173 | async def _get_job_id( 174 | self, 175 | payload: dict, 176 | user_session: aiohttp.ClientSession, 177 | request_timeout: int, 178 | ) -> str | None: 179 | try: 180 | async with user_session.post( 181 | self._base_url, 182 | headers=self._headers, 183 | json=payload, 184 | timeout=request_timeout, 185 | ) as response: 186 | data = await response.json() 187 | response.raise_for_status() 188 | return data["id"] 189 | except aiohttp.ClientResponseError as e: 190 | logger.error( 191 | f"HTTP error occurred: {e.status} - {e.message} - {data['message']}" 192 | ) 193 | except aiohttp.ClientConnectionError as e: 194 | logger.error(f"Connection error occurred: {e}") 195 | except asyncio.TimeoutError: 196 | logger.error( 197 | f"Timeout error. The request to {self._base_url} has timed out." 
198 | ) 199 | except Exception as e: 200 | logger.error(f"Error occurred: {str(e)}") 201 | return None 202 | 203 | async def _poll_job_status( 204 | self, 205 | job_id: str, 206 | poll_interval: int, 207 | user_session: aiohttp.ClientSession, 208 | timeout: int, 209 | ) -> bool: 210 | job_status_url = f"{self._base_url}/{job_id}" 211 | end_time = asyncio.get_event_loop().time() + timeout 212 | while asyncio.get_event_loop().time() < end_time: 213 | try: 214 | async with user_session.get( 215 | job_status_url, 216 | headers=self._headers, 217 | timeout=poll_interval, 218 | ) as response: 219 | data = await response.json() 220 | response.raise_for_status() 221 | if data["status"] == "done": 222 | return True 223 | elif data["status"] == "faulted": 224 | raise Exception("Job faulted") 225 | except Exception as e: 226 | logger.error(f"Error occurred: {str(e)}") 227 | return False 228 | await asyncio.sleep(poll_interval) 229 | 230 | logger.info("Job completion timeout exceeded") 231 | return False 232 | 233 | async def _get_http_response( 234 | self, job_id: str, user_session: aiohttp.ClientSession 235 | ) -> dict | None: 236 | """ 237 | Retrieves the HTTP response for a given job ID. 238 | 239 | Args: 240 | job_id (str): The ID of the job. 241 | user_session (aiohttp.ClientSession): The client session used for 242 | making the request. 243 | 244 | Returns: 245 | dict: The JSON response data. 246 | 247 | Raises: 248 | aiohttp.ClientResponseError: If a client response error occurs. 249 | aiohttp.ClientConnectionError: If a client connection error occurs. 250 | asyncio.TimeoutError: If the request times out. 251 | Exception: If any other error occurs. 252 | """ 253 | result_url = f"{self._base_url}/{job_id}/results" 254 | try: 255 | async with user_session.get( 256 | result_url, headers=self._headers 257 | ) as response: 258 | data = await response.json() 259 | response.raise_for_status() 260 | return data 261 | except aiohttp.ClientResponseError as e: 262 | logger.error( 263 | f"HTTP error occurred: {e.status} - {e.message} - {data['message']}" 264 | ) 265 | except aiohttp.ClientConnectionError as e: 266 | logger.error(f"Connection error occurred: {e}") 267 | except asyncio.TimeoutError: 268 | logger.error( 269 | f"Timeout error. The request to {result_url} has timed out." 
270 | ) 271 | except Exception as e: 272 | logger.error(f"An error occurred: {e} - {data['message']}") 273 | return None 274 | 275 | async def _execute_with_timeout( 276 | self, payload: dict, config: dict, user_session: aiohttp.ClientSession 277 | ) -> dict: 278 | 279 | request_timeout = config["request_timeout"] 280 | job_completion_timeout = config["job_completion_timeout"] 281 | poll_interval = config["poll_interval"] 282 | 283 | job_id = await self._get_job_id(payload, user_session, request_timeout) 284 | if not job_id: 285 | logger.error("Failed to get job ID") 286 | 287 | job_completed = await self._poll_job_status( 288 | job_id, poll_interval, user_session, job_completion_timeout 289 | ) 290 | if not job_completed: 291 | logger.error("Job did not complete successfully") 292 | 293 | result = await self._get_http_response(job_id, user_session) 294 | return result -------------------------------------------------------------------------------- /src/oxylabs/utils/utils.py: -------------------------------------------------------------------------------- 1 | from typing import Any, List 2 | from urllib.parse import urlparse 3 | 4 | import aiohttp 5 | 6 | from .defaults import ( 7 | DEFAULT_JOB_COMPLETION_TIMEOUT, 8 | DEFAULT_POLL_INTERVAL, 9 | DEFAULT_REQUEST_TIMEOUT, 10 | DEFAULT_REQUEST_TIMEOUT_ASYNC, 11 | ) 12 | from .types import fn_name 13 | 14 | 15 | def get_valid_values(module: object) -> list: 16 | """ 17 | Returns a list of valid values from the given module. 18 | 19 | Args: 20 | module (object): The module to retrieve valid values from. 21 | 22 | Returns: 23 | list: A list of valid values from the module. 24 | """ 25 | return [ 26 | getattr(module, name) 27 | for name in dir(module) 28 | if not name.startswith("__") 29 | ] 30 | 31 | 32 | VALID_FN_NAMES = get_valid_values(fn_name) 33 | 34 | 35 | def prepare_config(**kwargs): 36 | """ 37 | Prepare a configuration dictionary based on the provided keyword arguments. 38 | 39 | Args: 40 | request_timeout (int, optional): The timeout value in seconds. Defaults 41 | to None. 42 | poll_interval (int, optional): The poll interval value in seconds. 43 | Defaults to None. 44 | job_completion_timeout (int, optional): The job completion timeout 45 | value in seconds. Defaults to None. 46 | 47 | Returns: 48 | dict: The prepared configuration dictionary. 49 | 50 | """ 51 | config = {} 52 | config["request_timeout"] = ( 53 | kwargs["request_timeout"] 54 | if kwargs.get("request_timeout") is not None 55 | else ( 56 | DEFAULT_REQUEST_TIMEOUT_ASYNC 57 | if kwargs.get("async_integration") is not None 58 | else DEFAULT_REQUEST_TIMEOUT 59 | ) 60 | ) 61 | config["poll_interval"] = ( 62 | kwargs["poll_interval"] 63 | if kwargs.get("poll_interval") is not None 64 | else DEFAULT_POLL_INTERVAL 65 | ) 66 | config["job_completion_timeout"] = ( 67 | kwargs["job_completion_timeout"] 68 | if kwargs.get("job_completion_timeout") is not None 69 | else DEFAULT_JOB_COMPLETION_TIMEOUT 70 | ) 71 | 72 | return config 73 | 74 | 75 | def validate_url(input_url: str, host: str) -> None: 76 | """ 77 | Validates if the given URL is valid and belongs to the specified host. 78 | 79 | Args: 80 | input_url (str): The URL to be validated. 81 | host (str): The expected domain or host. 82 | 83 | Raises: 84 | ValueError: If the URL parameter is empty, missing scheme, missing 85 | host, or does not belong to the specified host. 
86 | 87 | Returns: 88 | None 89 | """ 90 | # Check if the URL is empty 91 | if not input_url: 92 | raise ValueError("URL parameter is empty") 93 | 94 | # Parse the URL 95 | parsed_url = urlparse(input_url) 96 | 97 | # Check if the scheme (protocol) is present and not empty 98 | if not parsed_url.scheme: 99 | raise ValueError("URL is missing scheme") 100 | 101 | # Check if the host is present and not empty 102 | if not parsed_url.netloc: 103 | raise ValueError("URL is missing a host") 104 | 105 | # Check if the host matches the expected domain or host 106 | if host not in parsed_url.netloc: 107 | raise ValueError(f"URL does not belong to {host}") 108 | 109 | return None 110 | 111 | 112 | async def ensure_session(session) -> aiohttp.ClientSession: 113 | """ 114 | Ensure the provided session is valid and return a valid session. 115 | 116 | Args: 117 | session: The session to ensure. 118 | 119 | Returns: 120 | A valid aiohttp.ClientSession object. 121 | 122 | """ 123 | if session is None or session.closed: 124 | session = aiohttp.ClientSession() 125 | return session 126 | 127 | 128 | async def close_session(user_session: aiohttp.ClientSession) -> None: 129 | """ 130 | Closes the user session. 131 | 132 | Args: 133 | user_session: The user session to be closed. 134 | 135 | Returns: 136 | None 137 | """ 138 | 139 | if user_session: 140 | await user_session.close() 141 | 142 | 143 | def check_parsing_instructions_validity(instructions: dict) -> None: 144 | """ 145 | Check the validity of parsing instructions. 146 | 147 | Args: 148 | instructions (dict): The parsing instructions to be validated. 149 | 150 | Raises: 151 | Exception: If the parsing instructions have an invalid structure. 152 | 153 | Returns: 154 | None 155 | """ 156 | if instructions is None: 157 | return 158 | 159 | if "_fns" in instructions: 160 | validate_fns(instructions["_fns"]) 161 | else: 162 | for key, value in instructions.items(): 163 | if isinstance(value, dict): 164 | check_parsing_instructions_validity(value) 165 | else: 166 | raise Exception(f"Invalid structure for key: {key}") 167 | 168 | 169 | def validate_fns(fns: List[Any]) -> None: 170 | """ 171 | Validates a list of functions. 172 | 173 | Args: 174 | fns (list): A list of functions to validate. 175 | 176 | Raises: 177 | Exception: If `fns` is None or not a list. 178 | """ 179 | if fns is None: 180 | raise Exception("_fns cannot be nil") 181 | if not isinstance(fns, list): 182 | raise Exception("_fns must be a list") 183 | 184 | for fn in fns: 185 | validate_fn(fn) 186 | 187 | 188 | def validate_fn(fn: dict) -> None: 189 | """ 190 | Validates the given function dictionary. 191 | 192 | Args: 193 | fn: A dictionary representing the function. 194 | 195 | Raises: 196 | ValueError: If the function dictionary is not valid. 197 | """ 198 | if not isinstance(fn, dict): 199 | raise ValueError("Each item in _fns must be a dictionary") 200 | if "_fn" not in fn: 201 | raise ValueError("_fn must be set in each function") 202 | if fn["_fn"] not in VALID_FN_NAMES: 203 | raise ValueError(f"_fn must be a valid function name, got {fn['_fn']}") 204 | 205 | # Delegate to specific argument validators 206 | validate_fn_args(fn["_fn"], fn.get("_args")) 207 | 208 | 209 | def validate_fn_args(function: str, args: Any) -> None: 210 | """ 211 | Validate the arguments for a given function. 212 | 213 | Args: 214 | function (str): The name of the function to validate. 215 | args (Any): The arguments to validate. 
216 | 217 | Raises: 218 | ValueError: If there is no validator for the given function name. 219 | """ 220 | # Map function name to validator function 221 | validators = { 222 | fn_name.ELEMENT_TEXT: validate_empty, 223 | fn_name.LENGTH: validate_empty, 224 | fn_name.CONVERT_TO_FLOAT: validate_empty, 225 | fn_name.CONVERT_TO_INT: validate_empty, 226 | fn_name.CONVERT_TO_STR: validate_empty, 227 | fn_name.MAX: validate_empty, 228 | fn_name.MIN: validate_empty, 229 | fn_name.PRODUCT: validate_empty, 230 | fn_name.XPATH: validate_string_array, 231 | fn_name.XPATH_ONE: validate_string_array, 232 | fn_name.CSS: validate_string_array, 233 | fn_name.CSS_ONE: validate_string_array, 234 | fn_name.AMOUNT_FROM_STRING: validate_string, 235 | fn_name.AMOUNT_RANGE_FROM_STRING: validate_string, 236 | fn_name.REGEX_FIND_ALL: validate_string, 237 | fn_name.JOIN: validate_optional_string, 238 | fn_name.REGEX_SEARCH: validate_list_string_optional_int, 239 | fn_name.REGEX_SUBSTRING: validate_list_string_optional_int, 240 | fn_name.SELECT_NTH: validate_non_zero_int, 241 | fn_name.AVERAGE: validate_optional_int, 242 | } 243 | 244 | if function not in validators: 245 | raise ValueError(f"No validator for function name: {function}") 246 | 247 | # Call the appropriate validator 248 | validator = validators[function] 249 | validator(args) 250 | 251 | 252 | def validate_empty(args: Any) -> None: 253 | """ 254 | Validates if the given argument is empty. 255 | 256 | Args: 257 | args: The argument to be validated. 258 | 259 | Raises: 260 | ValueError: If the argument is not empty. 261 | """ 262 | if args: 263 | raise ValueError("_args must be empty") 264 | 265 | 266 | def validate_string_array(args: List[str]) -> None: 267 | """ 268 | Validates a list of non-empty strings. 269 | 270 | Args: 271 | args: A list of strings to be validated. 272 | 273 | Raises: 274 | ValueError: If `args` is not a list of non-empty strings. 275 | """ 276 | if not isinstance(args, list) or not all( 277 | isinstance(elem, str) and elem for elem in args 278 | ): 279 | raise ValueError("_args must be a list of non-empty strings") 280 | 281 | 282 | def validate_string(args: str) -> None: 283 | """ 284 | Validates if the given argument is a non-empty string. 285 | 286 | Args: 287 | args: The argument to be validated. 288 | 289 | Raises: 290 | ValueError: If the argument is not a non-empty string. 291 | """ 292 | if not isinstance(args, str) or not args: 293 | raise ValueError("_args must be a non-empty string") 294 | 295 | 296 | def validate_optional_string(args: str) -> None: 297 | """ 298 | Validates if the given argument is a non-empty string or None. 299 | 300 | Args: 301 | args (str): The argument to be validated. 302 | 303 | Raises: 304 | ValueError: If the argument is not a non-empty string or None. 305 | """ 306 | if args is not None and (not isinstance(args, str) or not args): 307 | raise ValueError("_args must be a non-empty string or None") 308 | 309 | 310 | def validate_non_zero_int(args: int) -> None: 311 | """ 312 | Validates if the given argument is a non-zero integer. 313 | 314 | Args: 315 | args (int): The argument to be validated. 316 | 317 | Raises: 318 | ValueError: If the argument is not a non-zero integer. 319 | 320 | Returns: 321 | None 322 | """ 323 | if not isinstance(args, int) or args == 0: 324 | raise ValueError("_args must be a non-zero integer") 325 | 326 | 327 | def validate_optional_int(args: int) -> None: 328 | """ 329 | Validates if the given argument is a non-zero integer or None. 
330 | 331 | Args: 332 | args (int): The argument to be validated. 333 | 334 | Raises: 335 | ValueError: If the argument is not a non-zero integer or None. 336 | """ 337 | if args is not None and (not isinstance(args, int) or args == 0): 338 | raise ValueError("_args must be a non-zero integer or None") 339 | 340 | 341 | def validate_list_string_optional_int(args: list) -> None: 342 | """ 343 | Validates the input arguments. 344 | 345 | Args: 346 | args: A list containing the arguments to be validated. 347 | 348 | Raises: 349 | ValueError: If the first argument is not a non-empty string or if the 350 | second argument is not a non-zero integer when present. 351 | """ 352 | if ( 353 | not isinstance(args, list) 354 | or len(args) < 1 355 | or not isinstance(args[0], str) 356 | or not args[0] 357 | ): 358 | raise ValueError("_args first argument must be a non-empty string") 359 | if len(args) > 1 and (not isinstance(args[1], int) or args[1] == 0): 360 | raise ValueError( 361 | "_args second argument must be a non-zero integer when present" 362 | ) 363 | -------------------------------------------------------------------------------- /src/oxylabs/sources/bing/bing.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from oxylabs.internal.api import RealtimeAPI, AsyncAPI 4 | from oxylabs.sources.response import Response 5 | from oxylabs.utils.types import source 6 | from oxylabs.utils.utils import ( 7 | check_parsing_instructions_validity, 8 | prepare_config, 9 | ) 10 | 11 | 12 | class Bing: 13 | def __init__(self, api_instance:RealtimeAPI) -> None: 14 | """ 15 | Initializes an instance of the Bing class. 16 | 17 | Args: 18 | api_instance: An instance of the RealtimeAPI class used for making requests. 19 | """ 20 | self._api_instance = api_instance 21 | 22 | def scrape_search( 23 | self, 24 | query: str, 25 | domain: Optional[str] = None, 26 | start_page: Optional[int] = None, 27 | pages: Optional[int] = None, 28 | limit: Optional[int] = None, 29 | user_agent_type: Optional[str] = None, 30 | callback_url: Optional[str] = None, 31 | locale: Optional[str] = None, 32 | geo_location: Optional[str] = None, 33 | render: Optional[str] = None, 34 | parse: Optional[bool] = None, 35 | parsing_instructions: Optional[dict] = None, 36 | request_timeout: Optional[int] = 165, 37 | **kwargs, 38 | ) -> Response: 39 | """ 40 | Scrapes search results from Bing. 41 | 42 | Args: 43 | query (str): UTF-encoded keyword. 44 | domain (Optional[str]): The domain to limit the search results to. 45 | start_page (Optional[int]): The starting page number. 46 | pages (Optional[int]): The number of pages to scrape. 47 | limit (Optional[int]): Number of results to retrieve in each page. 48 | user_agent_type (Optional[str]): Device type and browser. 49 | callback_url (Optional[str]): URL to your callback endpoint. 50 | locale (Optional[str]): Accept-Language header value which changes page web interface language. 51 | geo_location (Optional[str]): It goes like this: City,Region,Country. 52 | render (Optional[str]): Enables JavaScript rendering. 53 | parse (Optional[bool]): true will return structured data. 54 | parsing_instructions (Optional[dict]): Instructions for parsing the results. 55 | request_timeout (Optional[int]): The timeout for the request in seconds. 56 | **kwargs: Additional keyword arguments. 57 | Returns: 58 | Response: The response containing the scraped results. 
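        Example:
            A minimal, illustrative sketch (not an official snippet): it
            assumes a RealtimeClient has been created with valid credentials,
            as in the README Quick Start, and uses placeholder arguments.

                client = RealtimeClient("YOUR_USERNAME", "YOUR_PASSWORD")
                result = client.bing.scrape_search("nike", domain="com", limit=10)
                print(result.raw)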
59 | """ 60 | 61 | config = prepare_config(request_timeout=request_timeout) 62 | payload = { 63 | "source": source.BING_SEARCH, 64 | "domain": domain, 65 | "query": query, 66 | "start_page": start_page, 67 | "pages": pages, 68 | "limit": limit, 69 | "locale": locale, 70 | "geo_location": geo_location, 71 | "user_agent_type": user_agent_type, 72 | "callback_url": callback_url, 73 | "render": render, 74 | "parse": parse, 75 | "parsing_instructions": parsing_instructions, 76 | **kwargs, 77 | } 78 | 79 | check_parsing_instructions_validity(parsing_instructions) 80 | api_response = self._api_instance.get_response(payload, config) 81 | return Response(api_response) 82 | 83 | def scrape_url( 84 | self, 85 | url: str, 86 | user_agent_type: Optional[str] = None, 87 | geo_location: Optional[str] = None, 88 | callback_url: Optional[str] = None, 89 | render: Optional[str] = None, 90 | parse: Optional[bool] = None, 91 | parsing_instructions: Optional[dict] = None, 92 | request_timeout: Optional[int] = 165, 93 | **kwargs, 94 | ) -> Response: 95 | """ 96 | Scrapes Bing search results for a given URL. 97 | 98 | Args: 99 | url (str): The URL to be scraped. 100 | user_agent_type (Optional[str]): Device type and browser. 101 | geo_location (Optional[str]): The API uses Canonical Geo Location format to 102 | determine request location. It goes like this: City,Region,Country 103 | callback_url (Optional[str]): URL to your callback endpoint. 104 | render (Optional[str]): Enables JavaScript rendering. 105 | parsing_instructions (Optional[dict]): Instructions for parsing the results. 106 | parse (Optional[bool]): true will return structured data. 107 | request_timeout (int | 165, optional): The interval in seconds for 108 | the request to time out if no response is returned. 109 | Defaults to 165. 110 | 111 | Returns: 112 | Response: The response containing the scraped results. 113 | """ 114 | 115 | config = prepare_config(request_timeout=request_timeout) 116 | payload = { 117 | "source": source.BING_URL, 118 | "url": url, 119 | "user_agent_type": user_agent_type, 120 | "geo_location": geo_location, 121 | "callback_url": callback_url, 122 | "render": render, 123 | "parse": parse, 124 | "parsing_instructions": parsing_instructions, 125 | **kwargs, 126 | } 127 | check_parsing_instructions_validity(parsing_instructions) 128 | api_response = self._api_instance.get_response(payload, config) 129 | return Response(api_response) 130 | 131 | 132 | class BingAsync: 133 | def __init__(self, api_instance:AsyncAPI) -> None: 134 | """ 135 | Initializes an instance of the Bing class. 136 | 137 | Args: 138 | api_instance: An instance of the AsyncAPI class used for making requests. 139 | """ 140 | self._api_instance = api_instance 141 | 142 | async def scrape_search( 143 | self, 144 | query: str, 145 | domain: Optional[str] = None, 146 | start_page: Optional[int] = None, 147 | pages: Optional[int] = None, 148 | limit: Optional[int] = None, 149 | user_agent_type: Optional[str] = None, 150 | callback_url: Optional[str] = None, 151 | locale: Optional[str] = None, 152 | geo_location: Optional[str] = None, 153 | render: Optional[str] = None, 154 | parse: Optional[bool] = None, 155 | parsing_instructions: Optional[dict] = None, 156 | request_timeout: Optional[int] = 165, 157 | job_completion_timeout: Optional[int] = None, 158 | poll_interval: Optional[int] = None, 159 | **kwargs, 160 | ) -> Response: 161 | """ 162 | Asynchronously scrapes Bing search results for a given query. 163 | 164 | Args: 165 | query (str): The search query. 
166 | domain (Optional[str]): The domain to limit the search results to. 167 | start_page (Optional[int]): The starting page number. 168 | pages (Optional[int]): The number of pages to scrape. 169 | limit (Optional[int]): Number of results to retrieve in each page. 170 | user_agent_type (Optional[str]): Device type and browser. 171 | callback_url (Optional[str]): URL to your callback endpoint. 172 | locale (Optional[str]): Accept-Language header value which changes page web interface language. 173 | geo_location (Optional[str]): The API uses Canonical Geo Location format to 174 | determine request location. It goes like this: City,Region,Country 175 | render (Optional[str]): Enables JavaScript rendering. 176 | parse (Optional[bool]): true will return structured data. 177 | request_timeout (int | 165, optional): The interval in seconds for 178 | the request to time out if no response is returned. 179 | Defaults to 165. 180 | poll_interval (Optional[int]): The interval in seconds to poll 181 | the server for a response. 182 | job_completion_timeout (Optional[int]): The interval in 183 | seconds for the job to time out if no response is returned. 184 | 185 | Returns: 186 | Response: The response containing the scraped results. 187 | """ 188 | 189 | config = prepare_config( 190 | request_timeout=request_timeout, 191 | poll_interval=poll_interval, 192 | job_completion_timeout=job_completion_timeout, 193 | async_integration=True, 194 | ) 195 | 196 | payload = { 197 | "source": source.BING_SEARCH, 198 | "domain": domain, 199 | "query": query, 200 | "start_page": start_page, 201 | "pages": pages, 202 | "limit": limit, 203 | "locale": locale, 204 | "geo_location": geo_location, 205 | "user_agent_type": user_agent_type, 206 | "callback_url": callback_url, 207 | "render": render, 208 | "parse": parse, 209 | "parsing_instructions": parsing_instructions, 210 | **kwargs, 211 | } 212 | check_parsing_instructions_validity(parsing_instructions) 213 | api_response = await self._api_instance.get_response(payload, config) 214 | return Response(api_response) 215 | 216 | async def scrape_url( 217 | self, 218 | url: str, 219 | user_agent_type: Optional[str] = None, 220 | geo_location: Optional[str] = None, 221 | callback_url: Optional[str] = None, 222 | render: Optional[str] = None, 223 | parse: Optional[bool] = None, 224 | parsing_instructions: Optional[dict] = None, 225 | request_timeout: Optional[int] = 165, 226 | job_completion_timeout: Optional[int] = None, 227 | poll_interval: Optional[int] = None, 228 | **kwargs, 229 | ) -> Response: 230 | """ 231 | Asynchronously scrapes Bing search results for a given URL. 232 | 233 | Args: 234 | url (str): The URL to be scraped. 235 | 236 | user_agent_type (Optional[str]): Device type and browser. 237 | geo_location (Optional[str]): The API uses Canonical Geo Location format to 238 | determine request location. It goes like this: City,Region,Country 239 | callback_url (Optional[str]): URL to your callback endpoint. 240 | render (Optional[str]): Enables JavaScript rendering. 241 | parsing_instructions (Optional[dict]): Instructions for parsing the results. 242 | parse (Optional[bool]): true will return structured data. 243 | request_timeout (int | 165, optional): The interval in seconds for 244 | the request to time out if no response is returned. 245 | Defaults to 165. 246 | poll_interval (Optional[int]): The interval in seconds to poll 247 | the server for a response. 
248 | job_completion_timeout (Optional[int]): The interval in 249 | seconds for the job to time out if no response is returned. 250 | 251 | Returns: 252 | Response: The response containing the scraped results. 253 | """ 254 | 255 | config = prepare_config( 256 | request_timeout=request_timeout, 257 | poll_interval=poll_interval, 258 | job_completion_timeout=job_completion_timeout, 259 | async_integration=True, 260 | ) 261 | 262 | payload = { 263 | "source": source.BING_URL, 264 | "url": url, 265 | "user_agent_type": user_agent_type, 266 | "geo_location": geo_location, 267 | "callback_url": callback_url, 268 | "render": render, 269 | "parse": parse, 270 | "parsing_instructions": parsing_instructions, 271 | } 272 | check_parsing_instructions_validity(parsing_instructions) 273 | api_response = await self._api_instance.get_response(payload, config) 274 | return Response(api_response) 275 | -------------------------------------------------------------------------------- /src/oxylabs/sources/kroger/kroger.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from oxylabs.internal.api import RealtimeAPI, AsyncAPI 4 | from oxylabs.sources.response import Response 5 | from oxylabs.utils.types import source 6 | from oxylabs.utils.utils import prepare_config 7 | 8 | 9 | class Kroger: 10 | def __init__(self, api_instance:RealtimeAPI) -> None: 11 | """ 12 | Initializes an instance of the Kroger class. 13 | 14 | Args: 15 | api_instance: An instance of the RealtimeAPI class used for making requests. 16 | """ 17 | self._api_instance = api_instance 18 | 19 | def scrape_product( 20 | self, 21 | product_id: str, 22 | render: Optional[str] = None, 23 | callback_url: Optional[str] = None, 24 | user_agent_type: Optional[str] = None, 25 | store_id: Optional[int] = None, 26 | delivery_zip: Optional[str] = None, 27 | fulfillment_type: Optional[str] = None, 28 | request_timeout: Optional[int] = 165, 29 | **kwargs 30 | ) -> Response: 31 | """ 32 | Scrapes Kroger product page for a given query and product ID. 33 | 34 | Args: 35 | product_id (str): The product ID. 36 | render (Optional[str]): Enables JavaScript rendering. 37 | callback_url (Optional[str]): URL to your callback endpoint. 38 | user_agent_type (Optional[str]): Device type and browser. 39 | store_id (Optional[int]): The store ID. 40 | delivery_zip (Optional[str]): The delivery location ZIP code. 41 | fulfillment_type (Optional[str]): The Fulfillment method. 42 | request_timeout (int | 165, optional): The interval in seconds for 43 | the request to time out if no response is returned. 44 | Defaults to 165. 45 | 46 | Returns: 47 | Response: The response from the server after the job is completed. 
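        Example:
            An illustrative sketch only: it assumes the realtime client
            exposes this source as client.kroger (by analogy with client.bing
            in the README) and uses a made-up product ID and ZIP code.

                client = RealtimeClient("YOUR_USERNAME", "YOUR_PASSWORD")
                result = client.kroger.scrape_product("1234567890", delivery_zip="10001")
                print(result.raw)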
48 | """ 49 | 50 | config = prepare_config(request_timeout=request_timeout) 51 | payload = { 52 | "source": source.KROGER_PRODUCT, 53 | "product_id": product_id, 54 | "render": render, 55 | "callback_url": callback_url, 56 | "user_agent_type": user_agent_type, 57 | "store_id": store_id, 58 | "delivery_zip": delivery_zip, 59 | "fulfillment_type": fulfillment_type, 60 | **kwargs, 61 | } 62 | api_response = self._api_instance.get_response(payload, config) 63 | return Response(api_response) 64 | 65 | def scrape_search( 66 | self, 67 | query: str, 68 | render: Optional[str] = None, 69 | callback_url: Optional[str] = None, 70 | user_agent_type: Optional[str] = None, 71 | store_id: Optional[int] = None, 72 | delivery_zip: Optional[str] = None, 73 | fulfillment_type: Optional[str] = None, 74 | request_timeout: Optional[int] = 165, 75 | **kwargs 76 | ) -> Response: 77 | """ 78 | Scrapes Kroger search page for a given query. 79 | 80 | Args: 81 | query (str): The search query. 82 | render (Optional[str]): Enables JavaScript rendering. 83 | callback_url (Optional[str]): URL to your callback endpoint. 84 | user_agent_type (Optional[str]): Device type and browser. 85 | store_id (Optional[int]): The store ID. 86 | delivery_zip (Optional[str]): The delivery location ZIP code. 87 | fulfillment_type (Optional[str]): The Fulfillment method. 88 | request_timeout (int | 165, optional): The interval in seconds for 89 | the request to time out if no response is returned. 90 | Defaults to 165. 91 | 92 | Returns: 93 | Response: The response from the server after the job is completed. 94 | """ 95 | 96 | config = prepare_config(request_timeout=request_timeout) 97 | payload = { 98 | "source": source.KROGER_SEARCH, 99 | "query": query, 100 | "render": render, 101 | "callback_url": callback_url, 102 | "user_agent_type": user_agent_type, 103 | "store_id": store_id, 104 | "delivery_zip": delivery_zip, 105 | "fulfillment_type": fulfillment_type, 106 | **kwargs, 107 | } 108 | api_response = self._api_instance.get_response(payload, config) 109 | return Response(api_response) 110 | 111 | def scrape_url( 112 | self, 113 | url: str, 114 | render: Optional[str] = None, 115 | callback_url: Optional[str] = None, 116 | user_agent_type: Optional[str] = None, 117 | store_id: Optional[int] = None, 118 | delivery_zip: Optional[str] = None, 119 | fulfillment_type: Optional[str] = None, 120 | request_timeout: Optional[int] = 165, 121 | **kwargs 122 | ) -> Response: 123 | """ 124 | Scrapes Kroger page for a given URL. 125 | 126 | Args: 127 | url (str): Direct URL (link) to Kroger page. 128 | render (Optional[str]): Enables JavaScript rendering. 129 | callback_url (Optional[str]): URL to your callback endpoint. 130 | user_agent_type (Optional[str]): Device type and browser. 131 | store_id (Optional[int]): The store ID. 132 | delivery_zip (Optional[str]): The delivery location ZIP code. 133 | fulfillment_type (Optional[str]): The Fulfillment method. 134 | request_timeout (int | 165, optional): The interval in seconds for 135 | the request to time out if no response is returned. 136 | Defaults to 165. 137 | 138 | Returns: 139 | Response: The response from the server after the job is completed. 
140 | """ 141 | 142 | config = prepare_config(request_timeout=request_timeout) 143 | payload = { 144 | "source": source.KROGER, 145 | "url": url, 146 | "render": render, 147 | "callback_url": callback_url, 148 | "user_agent_type": user_agent_type, 149 | "store_id": store_id, 150 | "delivery_zip": delivery_zip, 151 | "fulfillment_type": fulfillment_type, 152 | **kwargs, 153 | } 154 | api_response = self._api_instance.get_response(payload, config) 155 | return Response(api_response) 156 | 157 | 158 | class KrogerAsync: 159 | def __init__(self, api_instance:AsyncAPI) -> None: 160 | """ 161 | Initializes an instance of the Kroger class. 162 | 163 | Args: 164 | api_instance: An instance of the AsyncAPI class used for making requests. 165 | """ 166 | self._api_instance = api_instance 167 | 168 | async def scrape_product( 169 | self, 170 | product_id: str, 171 | render: Optional[str] = None, 172 | callback_url: Optional[str] = None, 173 | user_agent_type: Optional[str] = None, 174 | store_id: Optional[int] = None, 175 | delivery_zip: Optional[str] = None, 176 | fulfillment_type: Optional[str] = None, 177 | request_timeout: Optional[int] = 165, 178 | job_completion_timeout: Optional[int] = None, 179 | poll_interval: Optional[int] = None, 180 | **kwargs 181 | ) -> Response: 182 | """ 183 | Asynchronously scrapes Kroger product page for a given query and product ID. 184 | 185 | Args: 186 | product_id (str): The product ID. 187 | render (Optional[str]): Enables JavaScript rendering. 188 | callback_url (Optional[str]): URL to your callback endpoint. 189 | user_agent_type (Optional[str]): Device type and browser. 190 | store_id (Optional[int]): The store ID. 191 | delivery_zip (Optional[str]): The delivery location ZIP code. 192 | fulfillment_type (Optional[str]): The Fulfillment method. 193 | request_timeout (int | 165, optional): The interval in seconds for 194 | the request to time out if no response is returned. 195 | Defaults to 165. 196 | poll_interval (Optional[int]): The interval in seconds to poll 197 | the server for a response. 198 | job_completion_timeout (Optional[int]): The interval in 199 | seconds for the job to time out if no response is returned. 200 | 201 | Returns: 202 | Response: The response from the server after the job is completed. 203 | """ 204 | 205 | config = prepare_config( 206 | request_timeout=request_timeout, 207 | poll_interval=poll_interval, 208 | job_completion_timeout=job_completion_timeout, 209 | async_integration=True, 210 | ) 211 | payload = { 212 | "source": source.KROGER_PRODUCT, 213 | "product_id": product_id, 214 | "render": render, 215 | "callback_url": callback_url, 216 | "user_agent_type": user_agent_type, 217 | "store_id": store_id, 218 | "delivery_zip": delivery_zip, 219 | "fulfillment_type": fulfillment_type, 220 | **kwargs, 221 | } 222 | api_response = await self._api_instance.get_response(payload, config) 223 | return Response(api_response) 224 | 225 | async def scrape_search( 226 | self, 227 | query: str, 228 | render: Optional[str] = None, 229 | callback_url: Optional[str] = None, 230 | user_agent_type: Optional[str] = None, 231 | store_id: Optional[int] = None, 232 | delivery_zip: Optional[str] = None, 233 | fulfillment_type: Optional[str] = None, 234 | request_timeout: Optional[int] = 165, 235 | job_completion_timeout: Optional[int] = None, 236 | poll_interval: Optional[int] = None, 237 | **kwargs 238 | ) -> Response: 239 | """ 240 | Asynchronously scrapes Kroger search page for a given query. 241 | 242 | Args: 243 | query (str): The search query. 
244 | render (Optional[str]): Enables JavaScript rendering. 245 | callback_url (Optional[str]): URL to your callback endpoint. 246 | user_agent_type (Optional[str]): Device type and browser. 247 | store_id (Optional[int]): The store ID. 248 | delivery_zip (Optional[str]): The delivery location ZIP code. 249 | fulfillment_type (Optional[str]): The Fulfillment method. 250 | request_timeout (int | 165, optional): The interval in seconds for 251 | the request to time out if no response is returned. 252 | Defaults to 165. 253 | poll_interval (Optional[int]): The interval in seconds to poll 254 | the server for a response. 255 | job_completion_timeout (Optional[int]): The interval in 256 | seconds for the job to time out if no response is returned. 257 | 258 | Returns: 259 | Response: The response from the server after the job is completed. 260 | """ 261 | 262 | config = prepare_config( 263 | request_timeout=request_timeout, 264 | poll_interval=poll_interval, 265 | job_completion_timeout=job_completion_timeout, 266 | async_integration=True, 267 | ) 268 | payload = { 269 | "source": source.KROGER_SEARCH, 270 | "query": query, 271 | "render": render, 272 | "callback_url": callback_url, 273 | "user_agent_type": user_agent_type, 274 | "store_id": store_id, 275 | "delivery_zip": delivery_zip, 276 | "fulfillment_type": fulfillment_type, 277 | **kwargs, 278 | } 279 | api_response = await self._api_instance.get_response(payload, config) 280 | return Response(api_response) 281 | 282 | async def scrape_url( 283 | self, 284 | url: str, 285 | render: Optional[str] = None, 286 | callback_url: Optional[str] = None, 287 | user_agent_type: Optional[str] = None, 288 | store_id: Optional[int] = None, 289 | delivery_zip: Optional[str] = None, 290 | fulfillment_type: Optional[str] = None, 291 | request_timeout: Optional[int] = 165, 292 | job_completion_timeout: Optional[int] = None, 293 | poll_interval: Optional[int] = None, 294 | **kwargs 295 | ) -> Response: 296 | """ 297 | Asynchronously scrapes Kroger page for a given URL. 298 | 299 | Args: 300 | url (str): Direct URL (link) to Kroger page. 301 | render (Optional[str]): Enables JavaScript rendering. 302 | callback_url (Optional[str]): URL to your callback endpoint. 303 | user_agent_type (Optional[str]): Device type and browser. 304 | store_id (Optional[int]): The store ID. 305 | delivery_zip (Optional[str]): The delivery location ZIP code. 306 | fulfillment_type (Optional[str]): The Fulfillment method. 307 | request_timeout (int | 165, optional): The interval in seconds for 308 | the request to time out if no response is returned. 309 | Defaults to 165. 310 | poll_interval (Optional[int]): The interval in seconds to poll 311 | the server for a response. 312 | job_completion_timeout (Optional[int]): The interval in 313 | seconds for the job to time out if no response is returned. 314 | 315 | Returns: 316 | Response: The response from the server after the job is completed. 
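        Example:
            An illustrative sketch of the asynchronous (Push-Pull) flow: it
            assumes AsyncClient exposes this source as client.kroger
            (mirroring client.bing in the README) and uses a placeholder URL;
            poll_interval and job_completion_timeout are the parameters
            documented above.

                import asyncio
                from oxylabs import AsyncClient

                async def main():
                    client = AsyncClient("YOUR_USERNAME", "YOUR_PASSWORD")
                    result = await client.kroger.scrape_url(
                        "https://www.kroger.com/p/some-product",
                        job_completion_timeout=300,
                        poll_interval=5,
                    )
                    print(result.raw)

                asyncio.run(main())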
317 | """ 318 | 319 | config = prepare_config( 320 | request_timeout=request_timeout, 321 | poll_interval=poll_interval, 322 | job_completion_timeout=job_completion_timeout, 323 | async_integration=True, 324 | ) 325 | payload = { 326 | "source": source.KROGER, 327 | "url": url, 328 | "render": render, 329 | "callback_url": callback_url, 330 | "user_agent_type": user_agent_type, 331 | "store_id": store_id, 332 | "delivery_zip": delivery_zip, 333 | "fulfillment_type": fulfillment_type, 334 | **kwargs, 335 | } 336 | api_response = await self._api_instance.get_response(payload, config) 337 | return Response(api_response) 338 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Oxylabs Python SDK 2 | 3 | [![Oxylabs promo code](https://raw.githubusercontent.com/oxylabs/product-integrations/refs/heads/master/Affiliate-Universal-1090x275.png)](https://oxylabs.io/pages/gitoxy?utm_source=877&utm_medium=affiliate&groupid=877&utm_content=oxylabs-sdk-python-github&transaction_id=102f49063ab94276ae8f116d224b67) 4 | 5 | [![](https://dcbadge.limes.pink/api/server/Pds3gBmKMH?style=for-the-badge&theme=discord)](https://discord.gg/Pds3gBmKMH) [![YouTube](https://img.shields.io/badge/YouTube-Oxylabs-red?style=for-the-badge&logo=youtube&logoColor=white)](https://www.youtube.com/@oxylabs) 6 | 7 | This is a Python SDK for the [Oxylabs](https://oxylabs.io) 8 | [Scraper APIs](https://developers.oxylabs.io/scraper-apis/web-scraper-api#getting-started). 9 | 10 | This SDK helps integrate with Oxylabs’ all-in-one Web Scraper API. 11 | It can help you retrieve data from e-commerce websites, search engines (SERP), 12 | real estate platforms, and more. 13 | 14 | The Python SDK provides you with several benefits over using the raw APIs 15 | directly: 16 | 17 | - **Simplified Interface**: abstracts away complexities, offering a 18 | straightforward user interface for interacting with the Oxylabs API. 19 | - **Automated Request Management**: streamlines the handling of API requests and 20 | responses for enhanced efficiency and reliability. 21 | - **Error Handling**: provides meaningful error messages and handles common API 22 | errors, simplifying troubleshooting. 23 | - **Result Parsing**: streamlines the process of extracting relevant data from HTML results, 24 | allowing developers to focus on application logic. 25 | 26 | ## Requirements 27 | 28 | - Python 3.5 or above. 29 | 30 | You can check your Python version by running the following command in your 31 | preferred terminal: 32 | 33 | ```sh 34 | python --version 35 | ``` 36 | 37 | Or, for systems with multiple Python versions installed: 38 | 39 | ```sh 40 | python3 --version 41 | ``` 42 | 43 | If you need to install or update python you can do so by following the steps 44 | mentioned [here](https://www.python.org/downloads/). 45 | 46 | ## Authentication 47 | 48 | You will need an Oxylabs API username and password which you can get by signing 49 | up at https://oxylabs.io. You can check things out with a free trial at 50 | https://oxylabs.io/products/scraper-api. 51 | 52 | ## Installation 53 | 54 | ```bash 55 | pip install oxylabs 56 | ``` 57 | 58 | ### Quick Start 59 | 60 | ```python 61 | from oxylabs import RealtimeClient 62 | 63 | # Set your Oxylabs API Credentials. 64 | username = "username" 65 | password = "password" 66 | 67 | # Initialize the Realtime client with your credentials. 
68 | client = RealtimeClient(username, password) 69 | 70 | # Use `bing_search` as a source to scrape Bing with nike as a query. 71 | result = client.bing.scrape_search("nike") 72 | 73 | print(result.raw) 74 | ``` 75 | 76 | ### Integration Methods 77 | 78 | There are three integration methods for the Oxylabs SERP API, each exposed via 79 | different packages: 80 | 81 | - Realtime (Sync) - `RealtimeClient(username, password)` 82 | - Push-Pull (Async) - `AsyncClient(username, password)` 83 | - Proxy Endpoint - `ProxyClient(username, password)` 84 | 85 | Learn more about integration methods [on the official documentation](https://developers.oxylabs.io/scraper-apis/web-scraper-api/integration-methods) 86 | and how this SDK uses them [here](#integration-methods-1). 87 | 88 | ### Sources 89 | 90 | The Oxylabs API scrapes according to the sources provided via the API: 91 | 92 | | Target | Sources 93 | |------------------------| -------------- 94 | | **Amazon** | `amazon`, `amazon_product`, `amazon_search`, `amazon_pricing`, `amazon_sellers`, `amazon_bestsellers`, `amazon_reviews`, `amazon_questions` 95 | | **Google** | `google`, `google_search`, `google_ads`, `google_travel_hotels`, `google_suggest`,`google_trends_explore`,`google_maps`,`google_lens` 96 | | **Google Shopping** | `google_shopping`, `google_shopping_product`, `google_shopping_search`, `google_shopping_pricing` 97 | | **Bing** | `bing`, `bing_search` 98 | | **Kroger** | `kroger`, `kroger_product`, `kroger_search` 99 | | **Wayfair** | `wayfair`, `wayfair_search` 100 | | **Youtube Transcript** | `youtube_transcript` 101 | | **Other Websites** | `universal` 102 | 103 | These are the equivalent targets and methods available for scraping in the Python SDK: 104 | 105 | | Target | Methods 106 | |------------------------| -------------- 107 | | **amazon** | `scrape_search`, `scrape_url`, `scrape_product`, `scrape_pricing`, `scrape_reviews`, `scrape_questions`, `scrape_bestsellers`, `scrape_sellers` 108 | | **bing** | `scrape_search`, `scrape_url` 109 | | **google** | `scrape_search`, `scrape_url`, `scrape_ads`, `scrape_suggestions`, `scrape_travel_hotels`, `scrape_images`, `scrape_trends_explore`, `scrape_maps`, `scrape_lens` 110 | | **google_shopping** | `scrape_shopping_search`, `scrape_shopping_url`, `scrape_shopping_products`, `scrape_product_pricing` 111 | | **kroger** | `scrape_product`, `scrape_search`, `scrape_url` 112 | | **wayfair** | `scrape_search`, `scrape_url` 113 | | **youtube_transcript** | `scrape_transcript` 114 | | **universal** | `scrape_url` 115 | 116 | In the SDK you'll just need to call the relevant method name from the client. 117 | 118 | For example if you wish to scrape Bing search you can do it with the following code: 119 | 120 | ```python 121 | client = RealtimeClient(username, password) 122 | result = client.bing.scrape_search("football") 123 | ``` 124 | 125 | ### Query Parameters 126 | 127 | Each source has different accepted query parameters. For a detailed list of 128 | accepted parameters by each source you can head over to 129 | https://developers.oxylabs.io/scraper-apis/web-scraper-api. 130 | 131 | By default, scrape functions will use default parameters. 
If you need to send 132 | specific query parameters, here is an example of how to do it: 133 | 134 | ```python 135 | client = RealtimeClient(username, password) 136 | result = client.bing.scrape_search( 137 | "football", 138 | start_page=1, 139 | pages=3, 140 | limit=4, 141 | domain="com", 142 | ) 143 | ``` 144 | 145 | ### Configurable Options 146 | 147 | For consistency and ease of use, this SDK provides a list of pre-defined 148 | commonly used parameter values as constants in our library. You can use them by 149 | importing the oxylabs type module. 150 | 151 | ```python 152 | from oxylabs.utils.types import user_agent_type, render, domain 153 | ``` 154 | 155 | For the full list you can check the `types` directory. You can send in these 156 | values as strings too. 157 | 158 | These can be used as follows: 159 | 160 | ```python 161 | from oxylabs import RealtimeClient 162 | from oxylabs.utils.types import user_agent_type, render, domain 163 | 164 | client = RealtimeClient(username, password) 165 | result = client.google.scrape_search( 166 | "adidas", 167 | user_agent_type=user_agent_type.DESKTOP, 168 | render=render.HTML, 169 | domain=domain.COM, 170 | ) 171 | ``` 172 | 173 | ### Context Options for Google sources 174 | 175 | You can send in context options relevant to `google`, `amazon` and `universal` 176 | sources. Here are the [supported context values for google search](https://developers.oxylabs.io/scraper-apis/web-scraper-api/google/search). 177 | Similarly you can find supported context values for other sources in the 178 | documentation. 179 | Here's an example for Google Search scraping: 180 | 181 | ```python 182 | client = RealtimeClient(username, password) 183 | result = client.google.scrape_search( 184 | "adidas", 185 | parse=True, 186 | context=[ 187 | {"key": "results_language", "value": "en"}, 188 | {"key": "filter", "value": 0}, 189 | {"key": "tbm", "value": "isch"}, 190 | { 191 | "key": "limit_per_page", 192 | "value": [ 193 | {"page": 1, "limit": 10}, 194 | {"page": 2, "limit": 10}, 195 | ], 196 | }, 197 | ], 198 | ) 199 | ``` 200 | 201 | ### Parse instructions 202 | 203 | SDK supports [custom parsing](https://developers.oxylabs.io/scraper-apis/custom-parser) which lets 204 | you define your own parsing and data processing logic that is executed on a raw scraping result. 205 | 206 | ```python 207 | # Use `bing_search` as a source to scrape Bing using custom parsing 208 | # instructions. 209 | client = RealtimeClient(username, password) 210 | result = client.bing.scrape_url( 211 | "https://www.bing.com/search?q=nike", 212 | parse=True, 213 | parsing_instructions={ 214 | "number_of_results": { 215 | "_fns": [ 216 | { 217 | "_fn": "xpath_one", 218 | "_args": [".//span[@class='sb_count']/text()"], 219 | } 220 | ] 221 | } 222 | }, 223 | ) 224 | ``` 225 | 226 | ### Browser instructions 227 | 228 | SDK allows you to define your own [browser instructions](https://developers.oxylabs.io/scraper-apis/web-scraper-api/features/browser-instructions) 229 | that are executed when rendering JavaScript. 
230 | 231 | ```python 232 | client = RealtimeClient(username, password) 233 | result = client.universal.scrape_url( 234 | "https://www.ebay.com/", 235 | render="html", 236 | browser_instructions=[ 237 | { 238 | "type": "input", 239 | "value": "pizza boxes", 240 | "selector": { 241 | "type": "xpath", 242 | "value": "//input[@class='gh-tb ui-autocomplete-input']" 243 | } 244 | }, 245 | { 246 | "type": "click", 247 | "selector": { 248 | "type": "xpath", 249 | "value": "//input[@type='submit']" 250 | } 251 | }, 252 | { 253 | "type": "wait", 254 | "wait_time_s": 10 255 | } 256 | ]) 257 | ``` 258 | 259 | ### Dedicated parsers 260 | Oxylabs' Web Scraper API has dedicated parsers for some sources. You can find a list of available 261 | dedicated parsers [here](https://developers.oxylabs.io/scraper-apis/web-scraper-api/features/dedicated-parsers). If you want to use a dedicated parser to get structured data, 262 | add the **parse=True** parameter when calling a scrape method. 263 | 264 | Here is an example of using a dedicated parser: 265 | 266 | ```python 267 | # Scrape Amazon search results for the keyword "headset" 268 | # Then print a list of products including their ASIN and title 269 | client = RealtimeClient(username, password) 270 | response = client.amazon.scrape_search("headset", parse=True) 271 | 272 | for result in response.results: 273 | for item in result.content["results"]["organic"]: 274 | print(f"{item['asin']}: {item['title']}") 275 | ``` 276 | 277 | ## Integration Methods 278 | 279 | ### Realtime Integration 280 | 281 | Realtime is a synchronous integration method. This means that upon sending your 282 | job submission request, **you will have to keep the connection open** until we 283 | successfully finish your job or return an error. 284 | 285 | The **TTL** of Realtime connections is **150 seconds**. There may be rare cases 286 | where your connection times out before you receive a response from us, for 287 | example, if our system is under heavier-than-usual load or the job you submitted 288 | was extremely hard to complete. 289 | 290 | ### Push-Pull (Polling) Integration 291 | 292 | Push-Pull is an asynchronous integration method. This SDK implements this 293 | integration by polling the endpoint for results at a set 294 | interval. 295 | 296 | Using it is as straightforward as using the Realtime integration. The only 297 | difference is that it will return an asyncio Task that will eventually contain 298 | the Response. Below is an example of this integration method: 299 | 300 | ```python 301 | import asyncio 302 | from oxylabs import AsyncClient 303 | 304 | async def main(): 305 | # Set your Oxylabs API Credentials. 306 | username = "username" 307 | password = "password" 308 | 309 | # Initialize the async client with your credentials. 310 | client = AsyncClient(username, password) 311 | 312 | # 'timeout' specifies the maximum time (in seconds) to wait for the scraping 313 | # job to complete. 314 | # It is applicable for both Realtime and Push-Pull integrations. 315 | # 'poll_interval' is used only in Push-Pull integrations to set the delay 316 | # (in seconds) 317 | # between consecutive status checks of the job.
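    # A single job can also be awaited directly (a minimal sketch reusing the
    # client above; the keyword arguments mirror the scrape_url calls below):
    #   single = await client.bing.scrape_search("nike", timeout=35, poll_interval=3)
    #   print(single.raw)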
318 | tasks = [ 319 | client.bing.scrape_url( 320 | "https://www.bing.com/search?q=adidas", 321 | parse=True, 322 | timeout=35, 323 | poll_interval=3, 324 | ), 325 | client.bing.scrape_url( 326 | "https://www.bing.com/search?q=puma", 327 | parse=True, 328 | timeout=45, 329 | poll_interval=5, 330 | ), 331 | ] 332 | 333 | for future in asyncio.as_completed(tasks): 334 | result = await future 335 | 336 | 337 | if __name__ == "__main__": 338 | asyncio.run(main()) 339 | ``` 340 | 341 | ### Proxy Endpoint 342 | 343 | This method is also synchronous (like Realtime), but instead of using our 344 | service via a RESTful interface, you **can use our endpoint like a proxy**. Use 345 | Proxy Endpoint if you've used proxies before and would just like to get 346 | unblocked content from us. 347 | 348 | Since the parameters in this method are sent as headers, there are only a few 349 | parameters that this integration method accepts. You can find those parameters 350 | at 351 | https://developers.oxylabs.io/scraper-apis/web-scraper-api/integration-methods/proxy-endpoint#accepted-parameters. 352 | 353 | The Proxy Endpoint integration is very open-ended, allowing many different use 354 | cases: 355 | 356 | ```python 357 | from oxylabs import ProxyClient 358 | 359 | # Set your Oxylabs API Credentials. 360 | username = "username" 361 | password = "password" 362 | 363 | # Initialize the ProxyClient with your credentials. 364 | proxy = ProxyClient(username, password) 365 | 366 | # Customize headers for specific requirements (optional). 367 | proxy.add_user_agent_header("desktop_chrome") 368 | proxy.add_geo_location_header("Germany") 369 | proxy.add_render_header("html") 370 | 371 | # Use the proxy to make a request. 372 | result = proxy.get("https://www.example.com") 373 | 374 | print(result.text) 375 | ``` 376 | 377 | ## Additional Resources 378 | 379 | See the official [API Documentation](https://developers.oxylabs.io/) for 380 | details on each API's actual interface, which is implemented by this SDK. 381 | 382 | ## Contributing 383 | 384 | See [CONTRIBUTING](CONTRIBUTING.md) for more information. 385 | 386 | ## Security 387 | 388 | See [Security Issue 389 | Notifications](CONTRIBUTING.md#security-issue-notifications) for more 390 | information. 391 | 392 | ## License 393 | 394 | This project is licensed under the [MIT License](LICENSE). 395 | 396 | ## About Oxylabs 397 | 398 | Established in 2015, Oxylabs is a market-leading web intelligence collection 399 | platform, driven by the highest business, ethics, and compliance standards, 400 | enabling companies worldwide to unlock data-driven insights. 401 | 402 | [![image](https://oxylabs.io/images/og-image.png)](https://oxylabs.io/) 403 | -------------------------------------------------------------------------------- /src/oxylabs/sources/google_shopping/google_shopping.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from oxylabs.internal.api import RealtimeAPI, AsyncAPI 4 | from oxylabs.sources.response import Response 5 | from oxylabs.utils.types import source 6 | from oxylabs.utils.utils import ( 7 | check_parsing_instructions_validity, 8 | prepare_config, 9 | ) 10 | 11 | 12 | class GoogleShopping: 13 | def __init__(self, api_instance:RealtimeAPI) -> None: 14 | """ 15 | Initializes an instance of the Google Shopping class. 16 | 17 | Args: 18 | api_instance: An instance of the RealtimeAPI class used for making requests.
19 | """ 20 | self._api_instance = api_instance 21 | 22 | def scrape_shopping_search( 23 | self, 24 | query: str, 25 | domain: Optional[str] = None, 26 | start_page: Optional[int] = None, 27 | pages: Optional[int] = None, 28 | locale: Optional[str] = None, 29 | results_language: Optional[str] = None, 30 | geo_location: Optional[str] = None, 31 | user_agent_type: Optional[str] = None, 32 | callback_url: Optional[str] = None, 33 | render: Optional[str] = None, 34 | parse: Optional[bool] = None, 35 | context: Optional[list] = None, 36 | parsing_instructions: Optional[dict] = None, 37 | request_timeout: Optional[int] = 165, 38 | **kwargs 39 | ) -> Response: 40 | """ 41 | Scrapes Google Shopping search results for a given query. 42 | 43 | Args: 44 | query (str): UTF-encoded keyword 45 | domain (Optional[str]): The domain to limit the search results to. 46 | start_page (Optional[int]): The starting page number. 47 | pages (Optional[int]): The number of pages to scrape. 48 | locale (Optional[str]): Accept-Language header value which changes page web interface language. 49 | results_language (Optional[str]): None, 50 | geo_location (Optional[str]): None, 51 | user_agent_type (Optional[str]): Device type and browser. 52 | callback_url (Optional[str]): URL to your callback endpoint. 53 | render (Optional[str]): Enables JavaScript rendering. 54 | parse (Optional[bool]): true will return structured data. 55 | context: Optional[list], 56 | parsing_instructions (Optional[dict]): Instructions for parsing the results. 57 | request_timeout (int | 165, optional): The interval in seconds for 58 | the request to time out if no response is returned. 59 | Defaults to 165. 60 | 61 | Returns: 62 | Response: The response from the server after the job is completed. 63 | """ 64 | 65 | config = prepare_config(request_timeout=request_timeout) 66 | payload = { 67 | "source": source.GOOGLE_SHOPPING_SEARCH, 68 | "domain": domain, 69 | "query": query, 70 | "start_page": start_page, 71 | "pages": pages, 72 | "locale": locale, 73 | "results_language": results_language, 74 | "geo_location": geo_location, 75 | "user_agent_type": user_agent_type, 76 | "render": render, 77 | "callback_url": callback_url, 78 | "context": context, 79 | "parse": parse, 80 | "parsing_instructions": parsing_instructions, 81 | **kwargs, 82 | } 83 | check_parsing_instructions_validity(parsing_instructions) 84 | api_response = self._api_instance.get_response(payload, config) 85 | return Response(api_response) 86 | 87 | def scrape_shopping_url( 88 | self, 89 | url: str, 90 | user_agent_type: Optional[str] = None, 91 | render: Optional[str] = None, 92 | callback_url: Optional[str] = None, 93 | geo_location: Optional[str] = None, 94 | parse: Optional[bool] = None, 95 | parsing_instructions: Optional[dict] = None, 96 | request_timeout: Optional[int] = 165, 97 | **kwargs 98 | ) -> Response: 99 | """ 100 | Scrapes Google Shopping search results for a given URL. 101 | 102 | Args: 103 | url (str): Direct URL (link) to Google page 104 | the search. 105 | user_agent_type (Optional[str]): Device type and browser. 106 | render (Optional[str]): Enables JavaScript rendering. 107 | callback_url (Optional[str]): URL to your callback endpoint. 108 | geo_location (Optional[str]): None, 109 | parse (Optional[bool]): true will return structured data. 110 | parsing_instructions (Optional[dict]): Instructions for parsing the results. 111 | request_timeout (int | 165, optional): The interval in seconds for 112 | the request to time out if no response is returned. 
113 | Defaults to 165. 114 | 115 | Returns: 116 | Response: The response from the server after the job is completed. 117 | """ 118 | 119 | config = prepare_config(request_timeout=request_timeout) 120 | payload = { 121 | "source": source.GOOGLE_SHOPPING_URL, 122 | "url": url, 123 | "user_agent_type": user_agent_type, 124 | "render": render, 125 | "callback_url": callback_url, 126 | "geo_location": geo_location, 127 | "parse": parse, 128 | "parsing_instructions": parsing_instructions, 129 | **kwargs, 130 | } 131 | check_parsing_instructions_validity(parsing_instructions) 132 | api_response = self._api_instance.get_response(payload, config) 133 | return Response(api_response) 134 | 135 | def scrape_shopping_products( 136 | self, 137 | query: str, 138 | domain: Optional[str] = None, 139 | locale: Optional[str] = None, 140 | results_language: Optional[str] = None, 141 | geo_location: Optional[str] = None, 142 | user_agent_type: Optional[str] = None, 143 | render: Optional[str] = None, 144 | callback_url: Optional[str] = None, 145 | parse: Optional[bool] = None, 146 | parsing_instructions: Optional[dict] = None, 147 | request_timeout: Optional[int] = 165, 148 | **kwargs 149 | ) -> Response: 150 | """ 151 | Scrapes Google Shopping product results for a given query. 152 | 153 | Args: 154 | query (str): UTF-encoded product code. 155 | domain (Optional[str]): The domain to limit the search results to. 156 | locale (Optional[str]): Accept-Language header value which changes page web interface language. 157 | "results_language": None, 158 | geo_location (Optional[str]): None, 159 | user_agent_type (Optional[str]): Device type and browser. 160 | render (Optional[str]): Enables JavaScript rendering. 161 | callback_url (Optional[str]): URL to your callback endpoint. 162 | parse (Optional[bool]): true will return structured data. 163 | parsing_instructions (Optional[dict]): Instructions for parsing the results. 164 | request_timeout (int | 165, optional): The interval in seconds for 165 | the request to time out if no response is returned. 166 | Defaults to 165. 167 | Returns: 168 | Response: The response from the server after the job is completed. 169 | """ 170 | 171 | config = prepare_config(request_timeout=request_timeout) 172 | payload = { 173 | "source": source.GOOGLE_SHOPPING_PRODUCT, 174 | "query": query, 175 | "domain": domain, 176 | "locale": locale, 177 | "results_language": results_language, 178 | "geo_location": geo_location, 179 | "user_agent_type": user_agent_type, 180 | "render": render, 181 | "callback_url": callback_url, 182 | "parse": parse, 183 | "parsing_instructions": parsing_instructions, 184 | **kwargs, 185 | } 186 | check_parsing_instructions_validity(parsing_instructions) 187 | api_response = self._api_instance.get_response(payload, config) 188 | return Response(api_response) 189 | 190 | def scrape_product_pricing( 191 | self, 192 | query: str, 193 | domain: Optional[str] = None, 194 | start_page: Optional[int] = None, 195 | pages: Optional[int] = None, 196 | locale: Optional[str] = None, 197 | results_language: Optional[str] = None, 198 | geo_location: Optional[str] = None, 199 | user_agent_type: Optional[str] = None, 200 | render: Optional[str] = None, 201 | callback_url: Optional[str] = None, 202 | parse: Optional[bool] = None, 203 | parsing_instructions: Optional[dict] = None, 204 | request_timeout: Optional[int] = 165, 205 | **kwargs 206 | ) -> Response: 207 | """ 208 | Scrapes Google Shopping product pricing results for a given product code. 
209 | 210 | Args: 211 | query (str): UTF-encoded product code. 212 | domain (Optional[str]): The domain to limit the search results to. 213 | locale (Optional[str]): Accept-Language header value which changes page web interface language. 214 | start_page (Optional[int]): The starting page number. 215 | pages (Optional[int]): The number of pages to scrape. 216 | "results_language": None, 217 | geo_location (Optional[str]): None, 218 | user_agent_type (Optional[str]): Device type and browser. 219 | render (Optional[str]): Enables JavaScript rendering. 220 | callback_url (Optional[str]): URL to your callback endpoint. 221 | parse (Optional[bool]): true will return structured data. 222 | parsing_instructions (Optional[dict]): Instructions for parsing the results. 223 | request_timeout (int | 165, optional): The interval in seconds for 224 | the request to time out if no response is returned. 225 | Defaults to 165. 226 | Returns: 227 | Response: The response from the server after the job is completed. 228 | """ 229 | 230 | config = prepare_config(request_timeout=request_timeout) 231 | payload = { 232 | "source": source.GOOGLE_SHOPPING_PRICING, 233 | "domain": domain, 234 | "query": query, 235 | "start_page": start_page, 236 | "pages": pages, 237 | "locale": locale, 238 | "results_language": results_language, 239 | "geo_location": geo_location, 240 | "user_agent_type": user_agent_type, 241 | "render": render, 242 | "callback_url": callback_url, 243 | "parse": parse, 244 | "parsing_instructions": parsing_instructions, 245 | **kwargs, 246 | } 247 | check_parsing_instructions_validity(parsing_instructions) 248 | api_response = self._api_instance.get_response(payload, config) 249 | return Response(api_response) 250 | 251 | class GoogleShoppingAsync: 252 | def __init__(self, api_instance:AsyncAPI) -> None: 253 | """ 254 | Initializes an instance of the Google Shopping class. 255 | 256 | Args: 257 | api_instance: An instance of the AsyncAPI class used for making requests. 258 | """ 259 | self._api_instance = api_instance 260 | 261 | async def scrape_shopping_search( 262 | self, 263 | query: str, 264 | domain: Optional[str] = None, 265 | start_page: Optional[int] = None, 266 | pages: Optional[int] = None, 267 | locale: Optional[str] = None, 268 | results_language: Optional[str] = None, 269 | geo_location: Optional[str] = None, 270 | user_agent_type: Optional[str] = None, 271 | callback_url: Optional[str] = None, 272 | render: Optional[str] = None, 273 | parse: Optional[bool] = None, 274 | context: Optional[list] = None, 275 | parsing_instructions: Optional[dict] = None, 276 | request_timeout: Optional[int] = 165, 277 | job_completion_timeout: Optional[int] = None, 278 | poll_interval: Optional[int] = None, 279 | **kwargs 280 | ) -> Response: 281 | """ 282 | Scrapes Google Shopping search results for a given query. 283 | 284 | Args: 285 | query (str): UTF-encoded keyword. 286 | domain (Optional[str]): The domain to limit the search results to. 287 | start_page (Optional[int]): The starting page number. 288 | pages (Optional[int]): The number of pages to scrape. 289 | locale (Optional[str]): Accept-Language header value which changes page web interface language. 290 | "results_language": None, 291 | geo_location (Optional[str]): None, 292 | user_agent_type (Optional[str]): Device type and browser. 293 | callback_url (Optional[str]): URL to your callback endpoint. 294 | render (Optional[str]): Enables JavaScript rendering. 295 | parse (Optional[bool]): true will return structured data. 
296 | context: Optional[list], 297 | parsing_instructions (Optional[dict]): Instructions for parsing the results. 298 | request_timeout (int | 165, optional): The interval in seconds for 299 | the request to time out if no response is returned. 300 | Defaults to 165. 301 | poll_interval (Optional[int]): The interval in seconds to poll 302 | the server for a response. 303 | job_completion_timeout (Optional[int]): The interval in 304 | seconds for the job to time out if no response is returned. 305 | 306 | Returns: 307 | Response: The response from the server after the job is completed. 308 | """ 309 | 310 | config = prepare_config( 311 | request_timeout=request_timeout, 312 | poll_interval=poll_interval, 313 | job_completion_timeout=job_completion_timeout, 314 | async_integration=True, 315 | ) 316 | payload = { 317 | "source": source.GOOGLE_SHOPPING_SEARCH, 318 | "domain": domain, 319 | "query": query, 320 | "start_page": start_page, 321 | "pages": pages, 322 | "locale": locale, 323 | "results_language": results_language, 324 | "geo_location": geo_location, 325 | "user_agent_type": user_agent_type, 326 | "render": render, 327 | "callback_url": callback_url, 328 | "context": context, 329 | "parse": parse, 330 | "parsing_instructions": parsing_instructions, 331 | **kwargs, 332 | } 333 | check_parsing_instructions_validity(parsing_instructions) 334 | api_response = await self._api_instance.get_response(payload, config) 335 | return Response(api_response) 336 | 337 | async def scrape_shopping_url( 338 | self, 339 | url: str, 340 | user_agent_type: Optional[str] = None, 341 | render: Optional[str] = None, 342 | callback_url: Optional[str] = None, 343 | geo_location: Optional[str] = None, 344 | parse: Optional[bool] = None, 345 | parsing_instructions: Optional[dict] = None, 346 | request_timeout: Optional[int] = 165, 347 | job_completion_timeout: Optional[int] = None, 348 | poll_interval: Optional[int] = None, 349 | **kwargs 350 | ) -> Response: 351 | """ 352 | Scrapes Google Shopping search results for a given URL. 353 | 354 | Args: 355 | url (str): Direct URL (link) to Google page. 356 | user_agent_type (Optional[str]): Device type and browser. 357 | render (Optional[str]): Enables JavaScript rendering. 358 | callback_url (Optional[str]): URL to your callback endpoint. 359 | geo_location (Optional[str]): None, 360 | parse (Optional[bool]): true will return structured data. 361 | parsing_instructions (Optional[dict]): Instructions for parsing the results. 362 | request_timeout (int | 165, optional): The interval in seconds for 363 | the request to time out if no response is returned. 364 | Defaults to 165. 365 | poll_interval (Optional[int]): The interval in seconds to poll 366 | the server for a response. 367 | job_completion_timeout (Optional[int]): The interval in 368 | seconds for the job to time out if no response is returned. 369 | Returns: 370 | Response: The response from the server after the job is completed. 
371 | """ 372 | 373 | config = prepare_config( 374 | request_timeout=request_timeout, 375 | poll_interval=poll_interval, 376 | job_completion_timeout=job_completion_timeout, 377 | async_integration=True, 378 | ) 379 | payload = { 380 | "source": source.GOOGLE_SHOPPING_URL, 381 | "url": url, 382 | "user_agent_type": user_agent_type, 383 | "render": render, 384 | "callback_url": callback_url, 385 | "geo_location": geo_location, 386 | "parse": parse, 387 | "parsing_instructions": parsing_instructions, 388 | **kwargs, 389 | } 390 | check_parsing_instructions_validity(parsing_instructions) 391 | api_response = await self._api_instance.get_response(payload, config) 392 | return Response(api_response) 393 | 394 | async def scrape_shopping_products( 395 | self, 396 | query: str, 397 | domain: Optional[str] = None, 398 | locale: Optional[str] = None, 399 | results_language: Optional[str] = None, 400 | geo_location: Optional[str] = None, 401 | user_agent_type: Optional[str] = None, 402 | render: Optional[str] = None, 403 | callback_url: Optional[str] = None, 404 | parse: Optional[bool] = None, 405 | parsing_instructions: Optional[dict] = None, 406 | request_timeout: Optional[int] = 165, 407 | job_completion_timeout: Optional[int] = None, 408 | poll_interval: Optional[int] = None, 409 | **kwargs 410 | ) -> Response: 411 | """ 412 | Scrapes Google Shopping product results for a given query. 413 | 414 | Args: 415 | query (str): UTF-encoded product code. 416 | domain (Optional[str]): The domain to limit the search results to. 417 | locale (Optional[str]): Accept-Language header value which changes page web interface language. 418 | "results_language": None, 419 | geo_location (Optional[str]): None, 420 | user_agent_type (Optional[str]): Device type and browser. 421 | render (Optional[str]): Enables JavaScript rendering. 422 | callback_url (Optional[str]): URL to your callback endpoint. 423 | parse (Optional[bool]): true will return structured data. 424 | parsing_instructions (Optional[dict]): Instructions for parsing the results. 425 | request_timeout (int | 165, optional): The interval in seconds for 426 | the request to time out if no response is returned. 427 | Defaults to 165. 428 | poll_interval (Optional[int]): The interval in seconds to poll 429 | the server for a response. 430 | job_completion_timeout (Optional[int]): The interval in 431 | seconds for the job to time out if no response is returned. 432 | Returns: 433 | Response: The response from the server after the job is completed. 
434 | """ 435 | 436 | config = prepare_config( 437 | request_timeout=request_timeout, 438 | poll_interval=poll_interval, 439 | job_completion_timeout=job_completion_timeout, 440 | async_integration=True, 441 | ) 442 | payload = { 443 | "source": source.GOOGLE_SHOPPING_PRODUCT, 444 | "query": query, 445 | "domain": domain, 446 | "locale": locale, 447 | "results_language": results_language, 448 | "geo_location": geo_location, 449 | "user_agent_type": user_agent_type, 450 | "render": render, 451 | "callback_url": callback_url, 452 | "parse": parse, 453 | "parsing_instructions": parsing_instructions, 454 | **kwargs, 455 | } 456 | check_parsing_instructions_validity(parsing_instructions) 457 | api_response = await self._api_instance.get_response(payload, config) 458 | return Response(api_response) 459 | 460 | async def scrape_product_pricing( 461 | self, 462 | query: str, 463 | domain: Optional[str] = None, 464 | start_page: Optional[int] = None, 465 | pages: Optional[int] = None, 466 | locale: Optional[str] = None, 467 | results_language: Optional[str] = None, 468 | geo_location: Optional[str] = None, 469 | user_agent_type: Optional[str] = None, 470 | render: Optional[str] = None, 471 | callback_url: Optional[str] = None, 472 | parse: Optional[bool] = None, 473 | parsing_instructions: Optional[dict] = None, 474 | request_timeout: Optional[int] = 165, 475 | job_completion_timeout: Optional[int] = None, 476 | poll_interval: Optional[int] = None, 477 | **kwargs 478 | ) -> Response: 479 | """ 480 | Scrapes Google Shopping product pricing results for a given product code. 481 | 482 | Args: 483 | url (str): UTF-encoded product code. 484 | domain (Optional[str]): The domain to limit the search results to. 485 | start_page (Optional[int]): The starting page number. 486 | pages (Optional[int]): The number of pages to scrape. 487 | locale (Optional[str]): Accept-Language header value which changes page web interface language. 488 | "results_language": None, 489 | geo_location (Optional[str]): None, 490 | user_agent_type (Optional[str]): Device type and browser. 491 | render (Optional[str]): Enables JavaScript rendering. 492 | callback_url (Optional[str]): URL to your callback endpoint. 493 | parse (Optional[bool]): true will return structured data. 494 | parsing_instructions (Optional[dict]): Instructions for parsing the results. 495 | request_timeout (int | 165, optional): The interval in seconds for 496 | the request to time out if no response is returned. 497 | Defaults to 165. 498 | poll_interval (Optional[int]): The interval in seconds to poll 499 | the server for a response. 500 | job_completion_timeout (Optional[int]): The interval in 501 | seconds for the job to time out if no response is returned. 502 | Returns: 503 | Response: The response from the server after the job is completed. 
504 | """ 505 | 506 | config = prepare_config( 507 | request_timeout=request_timeout, 508 | poll_interval=poll_interval, 509 | job_completion_timeout=job_completion_timeout, 510 | async_integration=True, 511 | ) 512 | payload = { 513 | "source": source.GOOGLE_SHOPPING_PRICING, 514 | "domain": domain, 515 | "query": query, 516 | "start_page": start_page, 517 | "pages": pages, 518 | "locale": locale, 519 | "results_language": results_language, 520 | "geo_location": geo_location, 521 | "user_agent_type": user_agent_type, 522 | "render": render, 523 | "callback_url": callback_url, 524 | "parse": parse, 525 | "parsing_instructions": parsing_instructions, 526 | **kwargs, 527 | } 528 | check_parsing_instructions_validity(parsing_instructions) 529 | api_response = await self._api_instance.get_response(payload, config) 530 | return Response(api_response) 531 | -------------------------------------------------------------------------------- /src/oxylabs/sources/response.py: -------------------------------------------------------------------------------- 1 | 2 | class Response: 3 | def __init__(self, data): 4 | if data is None: 5 | data = {} 6 | self.raw = data 7 | self.results = [Results(item) for item in data.get("results", [])] 8 | self.job = Job(data.get("job", {})) 9 | 10 | 11 | class Results: 12 | def __init__(self, data): 13 | if data is None: 14 | data = {} 15 | self.custom_content_parsed = data.get("custom_content_parsed", {}) 16 | self.content_parsed = Content(data.get("content_parsed", {})) 17 | self.content = data.get("content") 18 | self.created_at = data.get("created_at") 19 | self.updated_at = data.get("updated_at") 20 | self.page = data.get("page") 21 | self.url = data.get("url") 22 | self.job_id = data.get("job_id") 23 | self.status_code = data.get("status_code") 24 | self.parser_type = data.get("parser_type") 25 | 26 | 27 | class Content: 28 | def __init__(self, data): 29 | if data is None: 30 | data = {} 31 | self.raw = data 32 | self.url = data.get("url") 33 | self.title = data.get("title") 34 | self.pages = data.get("pages") 35 | self.query = data.get("query") 36 | self.images = data.get("images") 37 | self.variants = Variants(data.get("variants", {})) 38 | self.highlights = data.get("highlights", []) 39 | self.description = data.get("description") 40 | self.related_items = RelatedItems(data.get("related_items", {})) 41 | self.specifications = Specifications(data.get("specifications", {})) 42 | self.page = data.get("page") 43 | self.errors = data.get("_errors") 44 | self.results = Result(data.get("results", {})) 45 | self.rating = data.get("rating") 46 | self.pricing = [Pricing(item) for item in data.get("pricing", [])] 47 | self.ads = [AmazonProductAds(item) for item in data.get("ads", [])] 48 | self.asin = data.get("asin") 49 | self.price = data.get("price") 50 | self.stock = data.get("stock") 51 | self.coupon = data.get("coupon") 52 | self.category = [ 53 | AmazonProductCategory(item) for item in data.get("category", []) 54 | ] 55 | self.currency = data.get("currency") 56 | self.delivery = [ 57 | AmazonProductDelivery(item) for item in data.get("delivery", []) 58 | ] 59 | self.warnings = data.get("_warnings", []) 60 | self.deal_type = data.get("deal_type") 61 | self.page_type = data.get("page_type") 62 | self.price_sns = data.get("price_sns") 63 | self.variation = data.get("variation") 64 | self.has_videos = data.get("has_videos") 65 | self.sales_rank = [ 66 | AmazonProductSalesRank(item) for item in data.get("sales_rank", []) 67 | ] 68 | self.top_review = 
data.get("top_review") 69 | self.asin_in_url = data.get("asin_in_url") 70 | self.price_upper = data.get("price_upper") 71 | self.pricing_str = data.get("pricing_str") 72 | self.pricing_url = data.get("pricing_url") 73 | self.discount_end = data.get("discount_end") 74 | self.manufacturer = data.get("manufacturer") 75 | self.max_quantity = data.get("max_quantity") 76 | self.price_buybox = data.get("price_buybox") 77 | self.product_name = data.get("product_name") 78 | self.bullet_points = data.get("bullet_points") 79 | self.is_addon_item = data.get("is_addon_item") 80 | self.price_initial = data.get("price_initial") 81 | self.pricing_count = data.get("pricing_count") 82 | self.reviews_count = data.get("reviews_count") 83 | self.sns_discounts = data.get("sns_discounts", []) 84 | self.developer_info = data.get("developer_info", []) 85 | self.lightning_deal = data.get("lightning_deal") 86 | self.price_shipping = data.get("price_shipping") 87 | self.is_prime_pantry = data.get("is_prime_pantry") 88 | self.product_details = ProductDetails(data.get("product_details", {})) 89 | self.featured_merchant = data.get("featured_merchant", []) 90 | self.is_prime_eligible = data.get("is_prime_eligible") 91 | self.product_dimensions = data.get("product_dimensions") 92 | self.refurbished_product = AmazonRefurbishedProduct( 93 | data.get("refurbished_product", {}) 94 | ) 95 | self.answered_questions_count = data.get("answered_questions_count") 96 | self.rating_star_distribution = [ 97 | AmazonRatingStarDistribution(item) 98 | for item in data.get("rating_star_distribution", []) 99 | ] 100 | self.reviews = [ 101 | AmazonReviews(item) for item in data.get("reviews", []) 102 | ] 103 | self.questions = AmazonQuestions(data.get("questions", {})) 104 | self.questions_total = data.get("questions_total") 105 | self.business_name = data.get("business_name") 106 | self.recent_feedback = [ 107 | RecentFeedback(item) for item in data.get("recent_feedback", []) 108 | ] 109 | self.business_address = data.get("business_address") 110 | self.feedback_summary_table = FeedbackSummaryTable( 111 | data.get("feedback_summary_table", {}) 112 | ) 113 | self.review_count = data.get("review_count") 114 | self.last_visible_page = data.get("last_visible_page") 115 | self.parse_status_code = data.get("parse_status_code") 116 | 117 | 118 | class Result: 119 | def __init__(self, data): 120 | if data is None: 121 | data = {} 122 | self.raw = data 123 | self.paid = [Paid(item) for item in data.get("paid", [])] 124 | self.filters = [Filters(item) for item in data.get("filters", [])] 125 | self.search_information = SearchInformation( 126 | data.get("search_information") 127 | ) 128 | self.suggested = [ 129 | SuggestedAmazonSearch(item) for item in data.get("suggested", []) 130 | ] 131 | self.amazon_choices = [ 132 | AmazonChoices(item) for item in data.get("amazon_choices", []) 133 | ] 134 | self.instant_recommendations = [ 135 | InstantRecommendations(item) 136 | for item in data.get("instant_recommendations", []) 137 | ] 138 | self.pos = data.get("pos") 139 | self.url = data.get("url") 140 | self.asin = data.get("asin") 141 | self.price = data.get("price") 142 | self.title = data.get("title") 143 | self.rating = data.get("rating") 144 | self.currency = data.get("currency") 145 | self.is_prime = data.get("is_prime") 146 | self.price_str = data.get("price_str") 147 | self.price_upper = data.get("price_upper") 148 | self.ratings_count = data.get("ratings_count") 149 | self.pla = Pla(data.get("pla", {})) 150 | self.images = Image(data.get("images", 
{})) 151 | self.twitter = Twitter(data.get("twitter", {})) 152 | self.knowledge = Knowledge(data.get("knowledge", {})) 153 | self.local_pack = LocalPack(data.get("local_pack", {})) 154 | self.top_stories = TopStory(data.get("top_stories", {})) 155 | self.popular_products = [ 156 | PopularProducts(item) for item in data.get("popular_products", []) 157 | ] 158 | self.related_searches = RelatedSearches( 159 | data.get("related_searches", {}) 160 | ) 161 | self.related_questions = RelatedQuestions( 162 | data.get("related_questions", {}) 163 | ) 164 | self.item_carousel = ItemCarousel(data.get("item_carousel", {})) 165 | self.recipes = Recipes(data.get("recipes", {})) 166 | self.videos = Videos(data.get("videos", {})) 167 | self.featured_snippet = [ 168 | FeaturedSnippet(item) for item in data.get("featured_snippet", []) 169 | ] 170 | self.related_searches_categorized = [ 171 | RelatedSearchesCategorized(item) 172 | for item in data.get("related_searches_categorized", []) 173 | ] 174 | self.hotels = Hotels(data.get("hotels", {})) 175 | self.flights = Flights(data.get("flights", {})) 176 | self.video_box = VideoBox(data.get("video_box", {})) 177 | self.local_service_ads = LocalServiceAds( 178 | data.get("local_service_ads", {}) 179 | ) 180 | self.navigation = [ 181 | Navigation(item) for item in data.get("navigation", []) 182 | ] 183 | self.instant_answers = [ 184 | InstantAnswers(item) for item in data.get("instant_answers", []) 185 | ] 186 | self.visually_similar_images = VisuallySimilarImages( 187 | data.get("visually_similar_images", {}) 188 | ) 189 | self.total_results_count = data.get("total_results_count") 190 | 191 | 192 | class Paid: 193 | def __init__(self, data): 194 | if data is None: 195 | data = {} 196 | self.raw = data 197 | self.pos = data.get("pos") 198 | self.url = data.get("url") 199 | self.desc = data.get("desc") 200 | self.title = data.get("title") 201 | self.data_rw = data.get("data_rw") 202 | self.data_pcu = data.get("data_pcu") 203 | self.sitelinks = PaidSitelinks(data.get("sitelinks", {})) 204 | self.url_shown = data.get("url_shown") 205 | self.asin = data.get("asin") 206 | self.price = data.get("price") 207 | self.rating = data.get("rating") 208 | self.rel_pos = data.get("rel_pos") 209 | self.currency = data.get("currency") 210 | self.url_image = data.get("url_image") 211 | self.best_seller = data.get("best_seller") 212 | self.price_upper = data.get("price_upper") 213 | self.is_sponsored = data.get("is_sponsored") 214 | self.manufacturer = data.get("manufacturer") 215 | self.pricing_count = data.get("pricing_count") 216 | self.reviews_count = data.get("reviews_count") 217 | self.is_amazons_choice = data.get("is_amazons_choice") 218 | self.no_price_reason = data.get("no_price_reason") 219 | self.sales_volume = data.get("sales_volume") 220 | self.is_prime = data.get("is_prime") 221 | self.shipping_information = data.get("shipping_information") 222 | self.pos_overall = data.get("pos_overall") 223 | 224 | 225 | class PaidSitelinks: 226 | def __init__(self, data): 227 | if data is None: 228 | data = {} 229 | self.raw = data 230 | self.expanded = [Expanded(item) for item in data.get("expanded", [])] 231 | self.inline = [Inline(item) for item in data.get("inline", [])] 232 | 233 | 234 | class Expanded: 235 | def __init__(self, data): 236 | if data is None: 237 | data = {} 238 | self.raw = data 239 | self.url = data.get("url") 240 | self.desc = data.get("desc") 241 | self.title = data.get("title") 242 | 243 | 244 | class Inline: 245 | def __init__(self, data): 246 | if data is 
None: 247 | data = {} 248 | self.raw = data 249 | self.url = data.get("url") 250 | self.desc = data.get("desc") 251 | self.title = data.get("title") 252 | 253 | 254 | class Filters: 255 | def __init__(self, data): 256 | if data is None: 257 | data = {} 258 | self.raw = data 259 | self.name = data.get("name") 260 | self.values = [FilterValues(item) for item in data.get("values", [])] 261 | 262 | 263 | class FilterValues: 264 | def __init__(self, data): 265 | if data is None: 266 | data = {} 267 | self.raw = data 268 | self.url = data.get("url") 269 | self.value = data.get("value") 270 | 271 | 272 | class Organic: 273 | def __init__(self, data): 274 | if data is None: 275 | data = {} 276 | self.raw = data 277 | self.pos = data.get("pos") 278 | self.url = data.get("url") 279 | self.desc = data.get("desc") 280 | self.type = data.get("type") 281 | self.price = data.get("price") 282 | self.title = data.get("title") 283 | self.currency = data.get("currency") 284 | self.merchant = Merchant(data.get("merchant", {})) 285 | self.price_str = data.get("price_str") 286 | self.product_id = data.get("product_id") 287 | self.asin = data.get("asin") 288 | self.rating = data.get("rating") 289 | self.url_image = data.get("url_image") 290 | self.best_seller = data.get("best_seller") 291 | self.price_upper = data.get("price_upper") 292 | self.is_sponsored = data.get("is_sponsored") 293 | self.manufacturer = data.get("manufacturer") 294 | self.pricing_count = data.get("pricing_count") 295 | self.reviews_count = data.get("reviews_count") 296 | self.is_amazons_choice = data.get("is_amazons_choice") 297 | self.no_price_reason = data.get("no_price_reason") 298 | self.is_prime = data.get("is_prime") 299 | self.sales_volume = data.get("sales_volume") 300 | self.variations = [ 301 | Variations(item) for item in data.get("variations", []) 302 | ] 303 | self.images = [item for item in data.get("images", [])] 304 | self.site_links = OrganicSitelinks(data.get("sitelinks", {})) 305 | self.url_shown = data.get("url_shown") 306 | self.pos_overall = data.get("pos_overall") 307 | 308 | 309 | class Merchant: 310 | def __init__(self, data): 311 | if data is None: 312 | data = {} 313 | self.raw = data 314 | self.url = data.get("url") 315 | self.name = data.get("name") 316 | 317 | 318 | class Variations: 319 | def __init__(self, data): 320 | if data is None: 321 | data = {} 322 | self.raw = data 323 | self.asin = data.get("asin") 324 | self.title = data.get("title") 325 | self.price = data.get("price") 326 | self.price_strikethrough = data.get("price_strikethrough") 327 | self.not_available = data.get("not_available") 328 | 329 | 330 | class SearchInformation: 331 | def __init__(self, data): 332 | if data is None: 333 | data = {} 334 | self.raw = data 335 | self.query = data.get("query") 336 | self.showing_results_for = data.get("showing_results_for") 337 | self.image = SearchInformationImage(data.get("image", {})) 338 | self.total_results_count = data.get("total_results_count") 339 | 340 | 341 | class Variants: 342 | def __init__(self, data): 343 | if data is None: 344 | data = {} 345 | self.raw = data 346 | self.type = data.get("type") 347 | self.items = [VariantItem(item) for item in data.get("items", [])] 348 | 349 | 350 | class VariantItem: 351 | def __init__(self, data): 352 | if data is None: 353 | data = {} 354 | self.raw = data 355 | self.value = data.get("value") 356 | self.selected = data.get("selected") 357 | self.available = data.get("available") 358 | self.product_id = data.get("product_id") 359 | 360 | 361 | class 
RelatedItems: 362 | def __init__(self, data): 363 | if data is None: 364 | data = {} 365 | self.raw = data 366 | self.items = [RelatedItem(item) for item in data.get("items", [])] 367 | 368 | 369 | class RelatedItem: 370 | def __init__(self, data): 371 | if data is None: 372 | data = {} 373 | self.raw = data 374 | self.url = data.get("url") 375 | self.price = data.get("price") 376 | self.title = data.get("title") 377 | self.rating = data.get("rating") 378 | self.currency = data.get("currency") 379 | self.reviews_count = data.get("reviews_count") 380 | 381 | 382 | class Specifications: 383 | def __init__(self, data): 384 | if data is None: 385 | data = {} 386 | self.raw = data 387 | self.items = [ 388 | SpecificationItem(item) for item in data.get("items", []) 389 | ] 390 | self.section_title = data.get("section_title") 391 | 392 | 393 | class SpecificationItem: 394 | def __init__(self, data): 395 | if data is None: 396 | data = {} 397 | self.raw = data 398 | self.title = data.get("title") 399 | self.value = data.get("value") 400 | 401 | 402 | class Pricing: 403 | def __init__(self, data): 404 | if data is None: 405 | data = {} 406 | self.raw = data 407 | self.price = data.get("price") 408 | self.seller = data.get("seller") 409 | self.details = data.get("details") 410 | self.currency = data.get("currency") 411 | self.condition = data.get("condition") 412 | self.price_tax = data.get("price_tax") 413 | self.price_total = data.get("price_total") 414 | self.seller_link = data.get("seller_link") 415 | self.price_shipping = data.get("price_shipping") 416 | self.delivery = data.get("delivery") 417 | self.seller_id = data.get("seller_id") 418 | self.rating_count = data.get("rating_count") 419 | self.delivery_options = data.get("delivery_options") 420 | 421 | 422 | class SuggestedAmazonSearch: 423 | def __init__(self, data): 424 | if data is None: 425 | data = {} 426 | self.raw = data 427 | self.url = data.get("url") 428 | self.asin = data.get("asin") 429 | self.price = data.get("price") 430 | self.title = data.get("title") 431 | self.rating = data.get("rating") 432 | self.currency = data.get("currency") 433 | self.url_image = data.get("url_image") 434 | self.best_seller = data.get("best_seller") 435 | self.price_upper = data.get("price_upper") 436 | self.is_sponsored = data.get("is_sponsored") 437 | self.manufacturer = data.get("manufacturer") 438 | self.pricing_count = data.get("pricing_count") 439 | self.reviews_count = data.get("reviews_count") 440 | self.is_amazons_choice = data.get("is_amazons_choice") 441 | self.pos = data.get("pos") 442 | self.shipping_information = data.get("shipping_information") 443 | self.sales_volume = data.get("sales_volume") 444 | self.no_price_reason = data.get("no_price_reason") 445 | self.suggested_query = data.get("suggested_query") 446 | 447 | 448 | class AmazonChoices: 449 | def __init__(self, data): 450 | if data is None: 451 | data = {} 452 | self.raw = data 453 | self.url = data.get("url") 454 | self.asin = data.get("asin") 455 | self.price = data.get("price") 456 | self.title = data.get("title") 457 | self.rating = data.get("rating") 458 | self.currency = data.get("currency") 459 | self.url_image = data.get("url_image") 460 | self.best_seller = data.get("best_seller") 461 | self.price_upper = data.get("price_upper") 462 | self.is_sponsored = data.get("is_sponsored") 463 | self.manufacturer = data.get("manufacturer") 464 | self.pricing_count = data.get("pricing_count") 465 | self.reviews_count = data.get("reviews_count") 466 | self.is_amazons_choice = 
data.get("is_amazons_choice") 467 | self.pos = data.get("pos") 468 | self.is_prime = data.get("is_prime") 469 | self.shipping_information = data.get("shipping_information") 470 | self.sales_volume = data.get("sales_volume") 471 | self.no_price_reason = data.get("no_price_reason") 472 | self.variations = [ 473 | Variations(item) for item in data.get("variations", []) 474 | ] 475 | 476 | 477 | class InstantRecommendations: 478 | def __init__(self, data): 479 | if data is None: 480 | data = {} 481 | self.raw = data 482 | self.url = data.get("url") 483 | self.asin = data.get("asin") 484 | self.price = data.get("price") 485 | self.title = data.get("title") 486 | self.rating = data.get("rating") 487 | self.currency = data.get("currency") 488 | self.url_image = data.get("url_image") 489 | self.best_seller = data.get("best_seller") 490 | self.price_upper = data.get("price_upper") 491 | self.is_sponsored = data.get("is_sponsored") 492 | self.manufacturer = data.get("manufacturer") 493 | self.pricing_count = data.get("pricing_count") 494 | self.reviews_count = data.get("reviews_count") 495 | self.is_amazons_choice = data.get("is_amazons_choice") 496 | self.pos = data.get("pos") 497 | self.sales_volume = data.get("sales_volume") 498 | self.no_price_reason = data.get("no_price_reason") 499 | 500 | 501 | class AmazonProductAds: 502 | def __init__(self, data): 503 | if data is None: 504 | data = {} 505 | self.raw = data 506 | self.pos = data.get("pos") 507 | self.asin = data.get("asin") 508 | self.type = data.get("type") 509 | self.price = data.get("price") 510 | self.title = data.get("title") 511 | self.images = data.get("images", []) 512 | self.rating = data.get("rating") 513 | self.location = data.get("location") 514 | self.price_upper = data.get("price_upper") 515 | self.reviews_count = data.get("reviews_count") 516 | self.is_prime_eligible = data.get("is_prime_eligible") 517 | 518 | 519 | class AmazonProductCategory: 520 | def __init__(self, data): 521 | if data is None: 522 | data = {} 523 | self.raw = data 524 | self.ladder = [ 525 | {"url": item.get("url"), "name": item.get("name")} 526 | for item in data.get("ladder", []) 527 | ] 528 | 529 | 530 | class AmazonProductDelivery: 531 | def __init__(self, data): 532 | if data is None: 533 | data = {} 534 | self.raw = data 535 | self.date = Date(data.get("date", {})) 536 | self.type = data.get("type") 537 | 538 | 539 | class Date: 540 | def __init__(self, data): 541 | if data is None: 542 | data = {} 543 | self.raw = data 544 | self.by = data.get("by") 545 | self.from_date = data.get("from") 546 | 547 | 548 | class AmazonProductSalesRank: 549 | def __init__(self, data): 550 | if data is None: 551 | data = {} 552 | self.raw = data 553 | self.rank = data.get("rank") 554 | self.ladder = [ 555 | {"url": item.get("url"), "name": item.get("name")} 556 | for item in data.get("ladder", []) 557 | ] 558 | 559 | 560 | class ProductDetails: 561 | def __init__(self, data): 562 | if data is None: 563 | data = {} 564 | self.raw = data 565 | self.asin = data.get("asin") 566 | self.batteries = data.get("batteries") 567 | self.item_weight = data.get("item_weight") 568 | self.manufacturer = data.get("manufacturer") 569 | self.customer_reviews = data.get("customer_reviews") 570 | self.best_sellers_rank = data.get("best_sellers_rank") 571 | self.country_of_origin = data.get("country_of_origin") 572 | self.item_model_number = data.get("item_model_number") 573 | self.product_dimensions = data.get("product_dimensions") 574 | self.date_first_available = 
data.get("date_first_available") 575 | self.is_discontinued_by_manufacturer = data.get( 576 | "is_discontinued_by_manufacturer" 577 | ) 578 | 579 | 580 | class AmazonRefurbishedProduct: 581 | def __init__(self, data): 582 | if data is None: 583 | data = {} 584 | self.raw = data 585 | self.link = Link(data.get("link", {})) 586 | self.condition_title = data.get("condition_title") 587 | 588 | 589 | class Link: 590 | def __init__(self, data): 591 | if data is None: 592 | data = {} 593 | self.raw = data 594 | self.url = data.get("url") 595 | self.title = data.get("title") 596 | 597 | 598 | class AmazonRatingStarDistribution: 599 | def __init__(self, data): 600 | if data is None: 601 | data = {} 602 | self.raw = data 603 | self.rating = data.get("rating") 604 | self.percentage = data.get("percentage") 605 | 606 | 607 | class AmazonReviews: 608 | def __init__(self, data): 609 | if data is None: 610 | data = {} 611 | self.raw = data 612 | self.id = data.get("id") 613 | self.title = data.get("title") 614 | self.author = data.get("author") 615 | self.rating = data.get("rating") 616 | self.content = data.get("content") 617 | self.timestamp = data.get("timestamp") 618 | self.is_verified = data.get("is_verified") 619 | self.product_attributes = data.get("product_attributes") 620 | 621 | 622 | class AmazonQuestions: 623 | def __init__(self, data): 624 | if data is None: 625 | data = {} 626 | self.raw = data 627 | self.title = data.get("title") 628 | self.votes = data.get("votes") 629 | self.answers = [Answer(item) for item in data.get("answers", [])] 630 | 631 | 632 | class Answer: 633 | def __init__(self, data): 634 | if data is None: 635 | data = {} 636 | self.raw = data 637 | self.author = data.get("author") 638 | self.content = data.get("content") 639 | self.timestamp = data.get("timestamp") 640 | 641 | 642 | class RecentFeedback: 643 | def __init__(self, data): 644 | if data is None: 645 | data = {} 646 | self.raw = data 647 | self.feedback = data.get("feedback") 648 | self.rated_by = data.get("rated_by") 649 | self.rating_stars = data.get("rating_stars") 650 | 651 | 652 | class FeedbackSummaryTable: 653 | def __init__(self, data): 654 | if data is None: 655 | data = {} 656 | self.raw = data 657 | self.counts = Counts(data.get("counts", {})) 658 | self.neutral = Counts(data.get("neutral", {})) 659 | self.negative = Counts(data.get("negative", {})) 660 | self.positive = Counts(data.get("positive", {})) 661 | 662 | 663 | class Counts: 664 | def __init__(self, data): 665 | if data is None: 666 | data = {} 667 | self.raw = data 668 | self.thirty_days = data.get("30_days") 669 | self.ninety_days = data.get("90_days") 670 | self.all_time = data.get("all_time") 671 | self.twelve_months = data.get("12_months") 672 | 673 | 674 | class Job: 675 | def __init__(self, data): 676 | if data is None: 677 | data = {} 678 | self.raw = data 679 | self.callback_url = data.get("callback_url") 680 | self.client_id = data.get("client_id") 681 | self.context = [Context(item) for item in data.get("context", [])] 682 | self.created_at = data.get("created_at") 683 | self.domain = data.get("domain") 684 | self.geo_location = data.get("geo_location") 685 | self.id = data.get("id") 686 | self.limit = data.get("limit") 687 | self.locale = data.get("locale") 688 | self.pages = data.get("pages") 689 | self.parse = data.get("parse") 690 | self.parser_type = data.get("parser_type") 691 | self.parsing_instructions = data.get("parsing_instructions") 692 | self.browser_instructions = data.get("browser_instructions") 693 | self.render = 
data.get("render") 694 | self.url = data.get("url") 695 | self.query = data.get("query") 696 | self.source = data.get("source") 697 | self.start_page = data.get("start_page") 698 | self.status = data.get("status") 699 | self.storage_type = data.get("storage_type") 700 | self.storage_url = data.get("storage_url") 701 | self.subdomain = data.get("subdomain") 702 | self.content_encoding = data.get("content_encoding") 703 | self.updated_at = data.get("updated_at") 704 | self.user_agent_type = data.get("user_agent_type") 705 | self.session_info = data.get("session_info") 706 | self.statuses = data.get("statuses") 707 | self.client_notes = data.get("client_notes") 708 | self.links = [JobLink(item) for item in data.get("_links", [])] 709 | 710 | 711 | class Context: 712 | def __init__(self, data): 713 | if data is None: 714 | data = {} 715 | self.raw = data 716 | self.key = data.get("key") 717 | self.value = data.get("value") 718 | 719 | 720 | class JobLink: 721 | def __init__(self, data): 722 | if data is None: 723 | data = {} 724 | self.raw = data 725 | self.rel = data.get("rel") 726 | self.href = data.get("href") 727 | self.method = data.get("method") 728 | 729 | class Pla: 730 | def __init__(self, data): 731 | if data is None: 732 | data = {} 733 | self.raw = data 734 | self.items = [PlaItem(item) for item in data.get("items", [])] 735 | self.pos_overall = data.get("pos_overall") 736 | 737 | class PlaItem: 738 | def __init__(self, data): 739 | if data is None: 740 | data = {} 741 | self.raw = data 742 | self.pos = data.get("pos") 743 | self.url = data.get("url") 744 | self.price = data.get("price") 745 | self.title = data.get("title") 746 | self.seller = data.get("seller") 747 | self.url_image = data.get("url_image") 748 | self.image_data = data.get("image_data") 749 | 750 | class Image: 751 | def __init__(self, data): 752 | if data is None: 753 | data = {} 754 | self.items = [ImageItem(item) for item in data.get("items", [])] 755 | self.pos_overall = data.get("pos_overall") 756 | 757 | 758 | class ImageItem: 759 | def __init__(self, data): 760 | if data is None: 761 | data = {} 762 | self.raw = data 763 | self.alt = data.get("alt") 764 | self.pos = data.get("pos") 765 | self.url = data.get("url") 766 | self.data = data.get("data") 767 | self.source = data.get("source") 768 | 769 | 770 | class OrganicSitelinks: 771 | def __init__(self, data): 772 | if data is None: 773 | data = {} 774 | self.raw = data 775 | self.expanded = [Expanded(item) for item in data.get("expanded", [])] 776 | self.inline = [Inline(item) for item in data.get("inline", [])] 777 | 778 | 779 | class Twitter: 780 | def __init__(self, data): 781 | if data is None: 782 | data = {} 783 | self.raw = data 784 | self.pos = data.get("pos") 785 | self.url = data.get("url") 786 | self.items = [TwitterItem(item) for item in data.get("items", [])] 787 | self.title = data.get("title") 788 | self.pos_overall = data.get("pos_overall") 789 | 790 | 791 | class TwitterItem: 792 | def __init__(self, data): 793 | if data is None: 794 | data = {} 795 | self.raw = data 796 | self.pos = data.get("pos") 797 | self.url = data.get("url") 798 | self.content = data.get("content") 799 | self.time_frame = data.get("time_frame") 800 | 801 | 802 | class Knowledge: 803 | def __init__(self, data): 804 | if data is None: 805 | data = {} 806 | self.raw = data 807 | self.title = data.get("title") 808 | self.images = [item for item in data.get("images", [])] 809 | self.factoids = [Factoid(item) for item in data.get("factoids", [])] 810 | self.profiles = 
[Profile(item) for item in data.get("profiles", [])] 811 | self.subtitle = data.get("subtitle") 812 | self.description = data.get("description") 813 | self.related_searches = [ 814 | RelatedSearches(item) for item in data.get("related_searches", []) 815 | ] 816 | 817 | 818 | class Factoid: 819 | def __init__(self, data): 820 | if data is None: 821 | data = {} 822 | self.raw = data 823 | self.links = [LinkElement(item) for item in data.get("links", [])] 824 | self.title = data.get("title") 825 | self.content = data.get("content") 826 | 827 | 828 | class LinkElement: 829 | def __init__(self, data): 830 | if data is None: 831 | data = {} 832 | self.raw = data 833 | self.href = data.get("href") 834 | self.title = data.get("title") 835 | 836 | 837 | class Profile: 838 | def __init__(self, data): 839 | if data is None: 840 | data = {} 841 | self.raw = data 842 | self.url = data.get("url") 843 | self.title = data.get("title") 844 | 845 | 846 | class RelatedSearches: 847 | def __init__(self, data): 848 | if data is None: 849 | data = {} 850 | self.raw = data 851 | self.url = data.get("url") 852 | self.title = data.get("title") 853 | self.section_title = data.get("section_title") 854 | self.pos_overall = data.get("pos_overall") 855 | self.related_searches = [ 856 | item for item in data.get("related_searches", []) 857 | ] 858 | 859 | 860 | class LocalPack: 861 | def __init__(self, data): 862 | if data is None: 863 | data = {} 864 | self.raw = data 865 | self.items = [LocalPackItem(item) for item in data.get("items", [])] 866 | self.pos_overall = data.get("pos_overall") 867 | 868 | 869 | class LocalPackItem: 870 | def __init__(self, data): 871 | if data is None: 872 | data = {} 873 | self.raw = data 874 | self.cid = data.get("cid") 875 | self.pos = data.get("pos") 876 | self.links = [LocalPackLink(item) for item in data.get("links", [])] 877 | self.phone = data.get("phone") 878 | self.title = data.get("title") 879 | self.rating = data.get("rating") 880 | self.address = data.get("address") 881 | self.subtitle = data.get("subtitle") 882 | self.rating_count = data.get("rating_count") 883 | 884 | 885 | class LocalPackLink: 886 | def __init__(self, data): 887 | if data is None: 888 | data = {} 889 | self.raw = data 890 | self.href = data.get("href") 891 | self.title = data.get("title") 892 | 893 | 894 | class TopStory: 895 | def __init__(self, data): 896 | if data is None: 897 | data = {} 898 | self.raw = data 899 | self.items = [TopStoryItem(item) for item in data.get("items", [])] 900 | self.pos_overall = data.get("pos_overall") 901 | 902 | 903 | class TopStoryItem: 904 | def __init__(self, data): 905 | if data is None: 906 | data = {} 907 | self.pos = data.get("pos") 908 | self.url = data.get("url") 909 | self.title = data.get("title") 910 | self.source = data.get("source") 911 | self.time_frame = data.get("time_frame") 912 | 913 | 914 | class PopularProducts: 915 | def __init__(self, data): 916 | if data is None: 917 | data = {} 918 | self.raw = data 919 | self.pos = data.get("pos") 920 | self.price = data.get("price") 921 | self.rating = data.get("rating") 922 | self.seller = data.get("seller") 923 | self.title = data.get("title") 924 | self.image_data = data.get("image_data") 925 | 926 | class RelatedQuestions: 927 | def __init__(self, data): 928 | if data is None: 929 | data = {} 930 | self.raw = data 931 | self.pos_overall = data.get("pos_overall") 932 | self.related_questions = [ 933 | RelatedQuestionsItem(item) 934 | for item in data.get("related_questions", []) 935 | ] 936 | 937 | 938 | class 
class RelatedQuestionsItem:
    def __init__(self, data):
        if data is None:
            data = {}
        self.raw = data
        self.pos = data.get("pos")
        self.answer = data.get("answer")
        self.source = data.get("source")
        self.question = data.get("question")


class Source:
    def __init__(self, data):
        if data is None:
            data = {}
        self.raw = data
        self.url = data.get("url")
        self.title = data.get("title")
        self.url_shown = data.get("url_shown")


class SearchInformationImage:
    def __init__(self, data):
        if data is None:
            data = {}
        self.raw = data
        self.url = data.get("url")
        self.width = data.get("width")
        self.height = data.get("height")
        self.other_sizes = data.get("other_sizes")


class ItemCarousel:
    def __init__(self, data):
        if data is None:
            data = {}
        self.raw = data
        self.items = [ItemCarouselItem(item) for item in data.get("items", [])]
        self.pos_overall = data.get("pos_overall")
        self.title = data.get("title")


class ItemCarouselItem:
    def __init__(self, data):
        if data is None:
            data = {}
        self.raw = data
        self.pos = data.get("pos")
        self.href = data.get("href")
        self.title = data.get("title")
        self.subtitle = data.get("subtitle")


class Recipes:
    def __init__(self, data):
        if data is None:
            data = {}
        self.raw = data
        self.items = [RecipesItem(item) for item in data.get("items", [])]
        self.pos_overall = data.get("pos_overall")


class RecipesItem:
    def __init__(self, data):
        if data is None:
            data = {}
        self.raw = data
        self.pos = data.get("pos")
        self.url = data.get("url")
        self.title = data.get("title")
        self.rating = data.get("rating")
        self.source = data.get("source")
        self.duration = data.get("duration")


class Videos:
    def __init__(self, data):
        if data is None:
            data = {}
        self.raw = data
        self.items = [VideosItem(item) for item in data.get("items", [])]
        self.pos_overall = data.get("pos_overall")


class VideosItem:
    def __init__(self, data):
        if data is None:
            data = {}
        self.raw = data
        self.pos = data.get("pos")
        self.url = data.get("url")
        self.title = data.get("title")
        self.author = data.get("author")
        self.source = data.get("source")


class FeaturedSnippet:
    def __init__(self, data):
        if data is None:
            data = {}
        self.raw = data
        self.url = data.get("url")
        self.desc = data.get("desc")
        self.title = data.get("title")
        self.url_shown = data.get("url_shown")
        self.pos_overall = data.get("pos_overall")


class RelatedSearchesCategorized:
    def __init__(self, data):
        if data is None:
            data = {}
        self.raw = data
        self.items = [
            RelatedSearchesCategorizedItem(item)
            for item in data.get("items", [])
        ]
        self.category = data.get("category")
        self.pos_overall = data.get("pos_overall")


class RelatedSearchesCategorizedItem:
    def __init__(self, data):
        if data is None:
            data = {}
        self.raw = data
        self.url = data.get("url")
        self.title = data.get("title")


class Category:
    def __init__(self, data):
        if data is None:
            data = {}
        self.raw = data
        self.name = data.get("name")
        self.type = data.get("type")


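# A small usage note with hypothetical values, not taken from a real
# response: HotelsResult and Flights below expose the upstream "from"
# key as .from_location, because "from" is a reserved keyword in Python
# and cannot appear after the attribute dot.
#
#     Flights({"from": "VNO", "to": "LHR"}).from_location   # "VNO"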
class Hotels:
    def __init__(self, data):
        if data is None:
            data = {}
        self.raw = data
        self.date_to = data.get("date_to")
        self.results = [HotelsResult(item) for item in data.get("results", [])]
        self.date_from = data.get("date_from")
        self.pos_overall = data.get("pos_overall")


class HotelsResult:
    def __init__(self, data):
        if data is None:
            data = {}
        self.raw = data
        self.price = data.get("price")
        self.title = data.get("title")
        self.from_location = data.get("from")


class Flights:
    def __init__(self, data):
        if data is None:
            data = {}
        self.raw = data
        self.to = data.get("to")
        self.from_location = data.get("from")
        self.results = [
            FlightsResult(item) for item in data.get("results", [])
        ]
        self.date_from = data.get("date_from")
        self.pos_overall = data.get("pos_overall")


class FlightsResult:
    def __init__(self, data):
        if data is None:
            data = {}
        self.raw = data
        self.url = data.get("url")
        self.type = data.get("type")
        self.price = data.get("price")
        self.airline = data.get("airline")
        self.duration = data.get("duration")


class VideoBox:
    def __init__(self, data):
        if data is None:
            data = {}
        self.raw = data
        self.url = data.get("url")
        self.title = data.get("title")
        self.pos_overall = data.get("pos_overall")


class LocalServiceAds:
    def __init__(self, data):
        if data is None:
            data = {}
        self.raw = data
        self.pos_overall = data.get("pos_overall")
        self.items = [
            LocalServiceAdsItem(item) for item in data.get("items", [])
        ]


class LocalServiceAdsItem:
    def __init__(self, data):
        if data is None:
            data = {}
        self.raw = data
        self.pos = data.get("pos")
        self.url = data.get("url")
        self.title = data.get("title")
        self.rating = data.get("rating")
        self.reviews_count = data.get("reviews_count")
        self.google_guaranteed = data.get("google_guaranteed")


class Navigation:
    def __init__(self, data):
        if data is None:
            data = {}
        self.raw = data
        self.url = data.get("url")
        self.title = data.get("title")
        self.pos = data.get("pos")


class InstantAnswers:
    def __init__(self, data):
        if data is None:
            data = {}
        self.raw = data
        self.type = data.get("type")
        self.parsed = data.get("_parsed")
        self.pos_overall = data.get("pos_overall")


class VisuallySimilarImages:
    def __init__(self, data):
        if data is None:
            data = {}
        self.raw = data
        self.all_images_url = data.get("all_images_url")
        self.featured_images = data.get("featured_images")
--------------------------------------------------------------------------------