├── il_supermarket_scarper ├── scrappers │ ├── tests │ │ ├── __init__.py │ │ ├── test_all.py │ │ └── test_cases.py │ ├── bareket.py │ ├── keshet.py │ ├── good_pharm.py │ ├── king_store.py │ ├── osherad.py │ ├── polizer.py │ ├── shuk_ahir.py │ ├── tivtaam.py │ ├── het_cohen.py │ ├── maayan2000.py │ ├── super_sapir.py │ ├── yohananof.py │ ├── doralon.py │ ├── victory.py │ ├── zolvebegadol.py │ ├── quik.py │ ├── mega.py │ ├── ramilevy.py │ ├── machsani_ashuk.py │ ├── shefa_barcart_ashem.py │ ├── bitan.py │ ├── salachdabach.py │ ├── superdosh.py │ ├── yellow.py │ ├── super_yuda.py │ ├── nativ_hashed.py │ ├── stop_market.py │ ├── cofix.py │ ├── __init__.py │ ├── shufersal.py │ ├── meshnat_yosef.py │ ├── wolt.py │ ├── super_pharm.py │ ├── hazihinam.py │ └── city_market.py ├── utils │ ├── databases │ │ ├── __init__.py │ │ ├── base.py │ │ ├── mongo.py │ │ └── json_file.py │ ├── exceptions.py │ ├── tests │ │ ├── PriceFull7290876100000-003-202410070010.gz │ │ ├── test_connection.py │ │ ├── test_gzip_utils.py │ │ ├── test_file_type.py │ │ └── test_status.py │ ├── lock_utils.py │ ├── __init__.py │ ├── folders_name.py │ ├── loop.py │ ├── gzip_utils.py │ ├── logger.py │ ├── validation.py │ ├── file_cache.py │ ├── file_types.py │ ├── scraper_status.py │ ├── retry.py │ └── status.py ├── engines │ ├── __init__.py │ ├── apsx.py │ ├── publishprice.py │ ├── bina.py │ ├── matrix.py │ ├── web.py │ ├── cerberus.py │ └── multipage_web.py ├── __init__.py ├── tests │ └── test_scrappers_factory.py ├── main.py ├── scrapper_runner.py ├── scrappers_factory.py └── scraper_stability.py ├── pytest.ini ├── MANIFEST.in ├── setup.cfg ├── requirements-dev.txt ├── .pylintrc ├── .gitignore ├── requirements.txt ├── .devcontainer └── devcontainer.json ├── example.py ├── .vscode └── launch.json ├── .github └── workflows │ ├── pylint.yml │ ├── python-publish.yml │ ├── user-validation.yml │ ├── docker-publish.yml │ ├── test-suite.yml │ └── codeql.yml ├── tests ├── test_integration.py └── test_main.py ├── Dockerfile ├── setup.py ├── main.py ├── stress_test.py ├── LICENSE.txt └── README.md /il_supermarket_scarper/scrappers/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | filterwarnings = 3 | ignore::UserWarning -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include requirements.txt 2 | include requirements-dev.txt 3 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | # Inside of setup.cfg 2 | [metadata] 3 | description-file = README.md 4 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | pytest==7.1 2 | zipp==3.19.1 # patch pytest vulnerability 3 | black==24.3.0 4 | pylint==3.0.1 -------------------------------------------------------------------------------- /il_supermarket_scarper/utils/databases/__init__.py: -------------------------------------------------------------------------------- 1 | from .json_file import JsonDataBase 2 | from .mongo import MongoDataBase 3 | 
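# Illustrative sketch, not a repository file: JsonDataBase and MongoDataBase are
# interchangeable because both implement the AbstractDataBase contract defined in
# il_supermarket_scarper/utils/databases/base.py further below. A minimal in-memory
# backend written against only that contract could look like this; the
# InMemoryDataBase name is hypothetical.
from il_supermarket_scarper.utils.databases.base import AbstractDataBase


class InMemoryDataBase(AbstractDataBase):
    """toy backend that keeps documents in a per-collection list, handy for tests"""

    def __init__(self, database_name):
        super().__init__(database_name)
        self.collections = {}

    def insert_document(self, collection_name, document):
        # mirror the real backends: write only when collection is enabled
        if self.is_collection_enabled():
            self.collections.setdefault(collection_name, []).append(document)

    def find_document(self, collection_name, query):
        # return the first document whose fields match the query
        for document in self.collections.get(collection_name, []):
            if all(document.get(key) == value for key, value in query.items()):
                return document
        return None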
-------------------------------------------------------------------------------- /.pylintrc: -------------------------------------------------------------------------------- 1 | [MAIN] 2 | disable= 3 | C0114, # missing-module-docstring 4 | R0913, # too-many-arguments 5 | extension-pkg-allow-list=lxml.etree -------------------------------------------------------------------------------- /il_supermarket_scarper/utils/exceptions.py: -------------------------------------------------------------------------------- 1 | class RestartSessionError(Exception): 2 | """This error will be raised if we would like to retry to downalod after a session restart""" 3 | -------------------------------------------------------------------------------- /il_supermarket_scarper/engines/__init__.py: -------------------------------------------------------------------------------- 1 | from .cerberus import Cerberus 2 | from .multipage_web import MultiPageWeb 3 | from .matrix import Matrix 4 | from .bina import Bina 5 | from .publishprice import PublishPrice 6 | -------------------------------------------------------------------------------- /il_supermarket_scarper/utils/tests/PriceFull7290876100000-003-202410070010.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenIsraeliSupermarkets/israeli-supermarket-scarpers/HEAD/il_supermarket_scarper/utils/tests/PriceFull7290876100000-003-202410070010.gz -------------------------------------------------------------------------------- /il_supermarket_scarper/__init__.py: -------------------------------------------------------------------------------- 1 | from .main import ScarpingTask 2 | from .scrappers_factory import ScraperFactory 3 | from .scraper_stability import ScraperStability 4 | from .utils import FileTypesFilters, DumpFolderNames, datetime_in_tlv 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | env/ 3 | *_cookies.txt 4 | dist/ 5 | il_supermarket_scraper.egg-info/ 6 | build/ 7 | database/* 8 | dumps/* 9 | logging.log 10 | temp*/ 11 | .vscode/settings.json 12 | .DS_Store 13 | test_dump 14 | status/ 15 | .cache/ 16 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | retry==0.9.2 2 | mock==4.0.3 3 | requests==2.32.2 4 | lxml==5.2.1 5 | beautifulsoup4==4.10.0 6 | pymongo==4.6.3 7 | dnspython==2.6.1 # patch pymongo vulnerability 8 | pytz==2022.4 9 | holidays==0.45 10 | cachetools==5.2.0 11 | pytest-playwright==0.7.0 -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/bareket.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.engines import Bina 2 | from il_supermarket_scarper.utils import DumpFolderNames 3 | 4 | 5 | class Bareket(Bina): 6 | """scarper for bareket""" 7 | 8 | def __init__(self, folder_name=None): 9 | super().__init__( 10 | chain=DumpFolderNames.BAREKET, 11 | chain_id="7290875100001", 12 | url_perfix="superbareket", 13 | folder_name=folder_name, 14 | ) 15 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/keshet.py: -------------------------------------------------------------------------------- 1 | from 
il_supermarket_scarper.engines import Cerberus 2 | from il_supermarket_scarper.utils import DumpFolderNames 3 | 4 | 5 | class Keshet(Cerberus): 6 | """scraper for keshet tamim""" 7 | 8 | def __init__(self, folder_name=None): 9 | super().__init__( 10 | chain=DumpFolderNames.KESHET, 11 | chain_id="7290785400000", 12 | folder_name=folder_name, 13 | ftp_username="Keshet", 14 | ) 15 | -------------------------------------------------------------------------------- /.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | { 2 | "build": { 3 | "dockerfile": "../Dockerfile", 4 | "target":"test", 5 | "args": { 6 | "PY_VERSION":"3.11.0" 7 | } 8 | }, 9 | "customizations": { 10 | "vscode": { 11 | "extensions": [ 12 | "ms-python.python", 13 | "ms-python.vscode-pylance", 14 | "ms-toolsai.jupyter", 15 | "LittleFoxTeam.vscode-python-test-adapter" 16 | ] 17 | } 18 | }, 19 | 20 | "forwardPorts": [3000] 21 | } 22 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/good_pharm.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.engines import Bina 2 | from il_supermarket_scarper.utils import DumpFolderNames 3 | 4 | 5 | class GoodPharm(Bina): 6 | """scraper for good pharm""" 7 | 8 | def __init__(self, folder_name=None): 9 | super().__init__( 10 | chain=DumpFolderNames.GOOD_PHARM, 11 | chain_id="7290058197699", 12 | url_perfix="goodpharm", 13 | folder_name=folder_name, 14 | ) 15 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/king_store.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.engines import Bina 2 | from il_supermarket_scarper.utils import DumpFolderNames 3 | 4 | 5 | class KingStore(Bina): 6 | """scraper for king store""" 7 | 8 | def __init__(self, folder_name=None): 9 | super().__init__( 10 | chain=DumpFolderNames.KING_STORE, 11 | chain_id="7290058108879", 12 | url_perfix="kingstore", 13 | folder_name=folder_name, 14 | ) 15 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/osherad.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.engines import Cerberus 2 | from il_supermarket_scarper.utils import DumpFolderNames 3 | 4 | 5 | class Osherad(Cerberus): 6 | """scraper for osher ad""" 7 | 8 | def __init__(self, folder_name=None): 9 | super().__init__( 10 | chain=DumpFolderNames.OSHER_AD, 11 | chain_id="7290103152017", 12 | folder_name=folder_name, 13 | ftp_username="osherad", 14 | ) 15 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/polizer.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.engines import Cerberus 2 | from il_supermarket_scarper.utils import DumpFolderNames 3 | 4 | 5 | class Polizer(Cerberus): 6 | """scraper for polizer""" 7 | 8 | def __init__(self, folder_name=None): 9 | super().__init__( 10 | chain=DumpFolderNames.POLIZER, 11 | chain_id="7291059100008", 12 | folder_name=folder_name, 13 | ftp_username="politzer", 14 | ) 15 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/shuk_ahir.py:
-------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.engines import Bina 2 | from il_supermarket_scarper.utils import DumpFolderNames 3 | 4 | 5 | class ShukAhir(Bina): 6 | """scraper for shuk a hir""" 7 | 8 | def __init__(self, folder_name=None): 9 | super().__init__( 10 | chain=DumpFolderNames.SHUK_AHIR, 11 | chain_id="7290058148776", 12 | url_perfix="shuk-hayir", 13 | folder_name=folder_name, 14 | ) 15 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/tivtaam.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.engines import Cerberus 2 | from il_supermarket_scarper.utils import DumpFolderNames 3 | 4 | 5 | class TivTaam(Cerberus): 6 | """scraper for tiv taam""" 7 | 8 | def __init__(self, folder_name=None): 9 | super().__init__( 10 | chain=DumpFolderNames.TIV_TAAM, 11 | chain_id="7290873255550", 12 | folder_name=folder_name, 13 | ftp_username="TivTaam", 14 | ) 15 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/het_cohen.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.engines import Matrix 2 | from il_supermarket_scarper.utils import DumpFolderNames 3 | 4 | 5 | class HetCohen(Matrix): 6 | """scraper for ChetCohen""" 7 | 8 | def __init__(self, folder_name=None): 9 | super().__init__( 10 | chain=DumpFolderNames.HET_COHEN, 11 | chain_id=["7290455000004"], 12 | folder_name=folder_name, 13 | chain_hebrew_name="ח. כהן", 14 | ) 15 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/maayan2000.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.engines import Bina 2 | from il_supermarket_scarper.utils import DumpFolderNames 3 | 4 | 5 | class Maayan2000(Bina): 6 | """scaper for maayan 2000""" 7 | 8 | def __init__(self, folder_name=None): 9 | super().__init__( 10 | chain=DumpFolderNames.MAAYAN_2000, 11 | chain_id="7290058159628", 12 | url_perfix="maayan2000", 13 | folder_name=folder_name, 14 | ) 15 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/super_sapir.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.engines import Bina 2 | from il_supermarket_scarper.utils import DumpFolderNames 3 | 4 | 5 | class SuperSapir(Bina): 6 | """scaper for super sapir""" 7 | 8 | def __init__(self, folder_name=None): 9 | super().__init__( 10 | chain=DumpFolderNames.SUPER_SAPIR, 11 | chain_id="7290058156016", 12 | url_perfix="supersapir", 13 | folder_name=folder_name, 14 | ) 15 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/yohananof.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.engines import Cerberus 2 | from il_supermarket_scarper.utils import DumpFolderNames 3 | 4 | 5 | class Yohananof(Cerberus): 6 | """scraper for yohananof""" 7 | 8 | def __init__(self, folder_name=None): 9 | super().__init__( 10 | chain=DumpFolderNames.YOHANANOF, 11 | chain_id="7290803800003", 12 | folder_name=folder_name, 13 | ftp_username="yohananof", 14 | ) 15 | 
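# Illustrative sketch, not a repository file: every chain scraper above follows the
# same pattern -- subclass the engine that matches the chain's hosting provider
# (Cerberus, Bina, Matrix, ...) and pass the chain's identifiers. "MyChain",
# DumpFolderNames.MY_CHAIN and the chain_id below are hypothetical placeholders;
# a real addition would also need a matching DumpFolderNames entry.
from il_supermarket_scarper.engines import Cerberus
from il_supermarket_scarper.utils import DumpFolderNames


class MyChain(Cerberus):
    """scraper for a hypothetical chain hosted on the Cerberus provider"""

    def __init__(self, folder_name=None):
        super().__init__(
            chain=DumpFolderNames.MY_CHAIN,
            chain_id="7290000000000",
            folder_name=folder_name,
            ftp_username="mychain",
        )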
-------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/doralon.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.engines import Cerberus 2 | from il_supermarket_scarper.utils import DumpFolderNames 3 | 4 | 5 | class DorAlon(Cerberus): 6 | """scraper for dor alon""" 7 | 8 | def __init__(self, folder_name=None): 9 | super().__init__( 10 | folder_name=folder_name, 11 | chain=DumpFolderNames.DOR_ALON, 12 | chain_id=["7290492000005", "729049000005"], 13 | ftp_username="doralon", 14 | ) 15 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/victory.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.engines import Matrix 2 | from il_supermarket_scarper.utils import DumpFolderNames 3 | 4 | 5 | class Victory(Matrix): 6 | """scraper for victory""" 7 | 8 | def __init__(self, folder_name=None): 9 | super().__init__( 10 | chain=DumpFolderNames.VICTORY, 11 | chain_hebrew_name="ויקטורי", 12 | chain_id=["7290696200003", "7290058103393"], 13 | folder_name=folder_name, 14 | ) 15 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/zolvebegadol.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.engines import Bina 2 | from il_supermarket_scarper.utils import DumpFolderNames 3 | 4 | 5 | class ZolVeBegadol(Bina): 6 | """scraper for zol-ve-begadol""" 7 | 8 | def __init__(self, folder_name=None): 9 | super().__init__( 10 | chain=DumpFolderNames.ZOL_VEBEGADOL, 11 | chain_id="7290058173198", 12 | url_perfix="zolvebegadol", 13 | folder_name=folder_name, 14 | ) 15 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/quik.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.engines.publishprice import PublishPrice 2 | from il_supermarket_scarper.utils import DumpFolderNames 3 | 4 | 5 | # @FlakyScraper 6 | class Quik(PublishPrice): 7 | """scraper for quik""" 8 | 9 | def __init__(self, folder_name=None): 10 | super().__init__( 11 | chain=DumpFolderNames.QUIK, 12 | chain_id="7291029710008", 13 | site_infix="quik", 14 | folder_name=folder_name, 15 | ) 16 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/mega.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.engines.publishprice import PublishPrice 2 | from il_supermarket_scarper.utils import DumpFolderNames 3 | 4 | 5 | # removed : 1.7.2025 6 | class Mega(PublishPrice): 7 | """scraper for mega""" 8 | 9 | def __init__(self, folder_name=None): 10 | super().__init__( 11 | chain=DumpFolderNames.MEGA, 12 | chain_id="7290055700007", 13 | site_infix="mega", 14 | folder_name=folder_name, 15 | ) 16 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/ramilevy.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.engines import Cerberus 2 | from il_supermarket_scarper.utils import DumpFolderNames 3 | 4 | 5 | class RamiLevy(Cerberus): 6 | """scraper for rami levy""" 7 | 8 | def __init__(self, folder_name=None): 9 |
super().__init__( 10 | chain=DumpFolderNames.RAMI_LEVY, 11 | chain_id="7290058140886", 12 | folder_name=folder_name, 13 | ftp_username="RamiLevi", 14 | max_threads=10, 15 | ) 16 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/machsani_ashuk.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.engines import Matrix 2 | from il_supermarket_scarper.utils import DumpFolderNames 3 | 4 | 5 | class MahsaniAShuk(Matrix): 6 | """scraper for masani hsuk""" 7 | 8 | def __init__(self, folder_name=None): 9 | super().__init__( 10 | chain=DumpFolderNames.MAHSANI_ASHUK, 11 | chain_id=["7290661400001", "7290633800006"], 12 | folder_name=folder_name, 13 | chain_hebrew_name="מחסני השוק", 14 | ) 15 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/shefa_barcart_ashem.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.engines import Bina 2 | from il_supermarket_scarper.utils import DumpFolderNames 3 | 4 | 5 | class ShefaBarcartAshem(Bina): 6 | """scraper for shefa berkat ashem""" 7 | 8 | def __init__(self, folder_name=None): 9 | super().__init__( 10 | chain=DumpFolderNames.SHEFA_BARCART_ASHEM, 11 | chain_id="7290058134977", 12 | url_perfix="shefabirkathashem", 13 | folder_name=folder_name, 14 | ) 15 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/bitan.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.engines.publishprice import PublishPrice 2 | from il_supermarket_scarper.utils import DumpFolderNames 3 | 4 | 5 | class YaynotBitanAndCarrefour(PublishPrice): 6 | """scaper for yaynot beitan""" 7 | 8 | def __init__(self, folder_name=None): 9 | super().__init__( 10 | chain=DumpFolderNames.YAYNO_BITAN_AND_CARREFOUR, 11 | chain_id="7290055700007", 12 | site_infix="carrefour", 13 | folder_name=folder_name, 14 | ) 15 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/salachdabach.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.engines import Cerberus 2 | from il_supermarket_scarper.utils import DumpFolderNames 3 | 4 | 5 | class SalachDabach(Cerberus): 6 | """scraper for salach dabach""" 7 | 8 | def __init__(self, folder_name=None): 9 | super().__init__( 10 | chain=DumpFolderNames.SALACH_DABACH, 11 | chain_id="7290526500006", 12 | folder_name=folder_name, 13 | ftp_username="SalachD", 14 | ftp_password="12345", 15 | ) 16 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/superdosh.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.engines import Cerberus 2 | from il_supermarket_scarper.utils import DumpFolderNames 3 | 4 | 5 | class FreshMarketAndSuperDosh(Cerberus): 6 | """scraper for fresh market and super dush""" 7 | 8 | def __init__(self, folder_name=None): 9 | super().__init__( 10 | chain=DumpFolderNames.FRESH_MARKET_AND_SUPER_DOSH, 11 | chain_id="7290876100000", 12 | folder_name=folder_name, 13 | ftp_username="freshmarket", 14 | ) 15 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/yellow.py: 
-------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.engines import Cerberus 2 | from il_supermarket_scarper.utils import DumpFolderNames 3 | 4 | 5 | class Yellow(Cerberus): 6 | """scraper for yellow""" 7 | 8 | def __init__(self, folder_name=None): 9 | super().__init__( 10 | chain=DumpFolderNames.YELLOW, 11 | chain_id="7290644700005", 12 | folder_name=folder_name, 13 | ftp_username="Paz_bo", 14 | ftp_password="paz468", 15 | max_threads=10, 16 | ) 17 | -------------------------------------------------------------------------------- /example.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper import ScarpingTask, ScraperFactory 2 | from il_supermarket_scarper.utils import _now, Logger 3 | 4 | Logger.set_logging_level("INFO") 5 | 6 | if __name__ == "__main__": 7 | scraper = ScarpingTask( 8 | dump_folder_name="dumps", 9 | lookup_in_db=False, 10 | multiprocessing=2, 11 | limit=1, 12 | enabled_scrapers=[ScraperFactory.BAREKET.name], 13 | # size_estimation_mode=True, # download files,log size, delete files 14 | when_date=_now(), 15 | ) 16 | scraper.start() 17 | -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | // Use IntelliSense to learn about possible attributes. 3 | // Hover to view descriptions of existing attributes. 4 | // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 5 | "version": "0.2.0", 6 | "configurations": [ 7 | { 8 | "name": "Debug Unit Test", 9 | "type": "python", 10 | "request": "test", 11 | "justMyCode": false, 12 | // "env": { 13 | // "DISABLED_SCRAPPERS" : "BAREKET" 14 | // } 15 | } 16 | ] 17 | } -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/super_yuda.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.engines import Cerberus 2 | from il_supermarket_scarper.utils import DumpFolderNames 3 | 4 | 5 | class SuperYuda(Cerberus): 6 | """scraper for super yuda""" 7 | 8 | def __init__(self, folder_name=None): 9 | super().__init__( 10 | chain=DumpFolderNames.SUPER_YUDA, 11 | chain_id=["7290058198450", "7290058177776"], 12 | ftp_username="yuda_ho", 13 | ftp_password="Yud@147", 14 | ftp_path="/Yuda", 15 | folder_name=folder_name, 16 | ) 17 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/nativ_hashed.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.engines.web import WebBase 2 | from il_supermarket_scarper.utils import DumpFolderNames 3 | 4 | 5 | # possible: NetivHased are down in Shabatz 6 | class NetivHased(WebBase): 7 | """scraper for nativ Hased""" 8 | 9 | utilize_date_param = False 10 | 11 | def __init__(self, folder_name=None): 12 | super().__init__( 13 | chain=DumpFolderNames.NETIV_HASED, 14 | chain_id="7290058160839", 15 | url="http://141.226.203.152/", 16 | folder_name=folder_name, 17 | ) 18 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/stop_market.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.engines import Cerberus 2 | from il_supermarket_scarper.utils import DumpFolderNames 3 | 
4 | 5 | class StopMarket(Cerberus): 6 | """scraper for stop market""" 7 | 8 | def __init__(self, folder_name=None): 9 | super().__init__( 10 | chain=DumpFolderNames.STOP_MARKET, 11 | chain_id=[ 12 | "72906390", 13 | "7290639000004", 14 | ], # in store files for some reason the store id is only 72906390 15 | folder_name=folder_name, 16 | ftp_username="Stop_Market", 17 | ) 18 | -------------------------------------------------------------------------------- /il_supermarket_scarper/utils/tests/test_connection.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | from il_supermarket_scarper.utils.connection import wget_file 4 | 5 | 6 | def test_wget_file_dont_exist(): 7 | """Test wget file that does not exist""" 8 | with pytest.raises(FileNotFoundError): 9 | wget_file( 10 | "https://pricesprodpublic.blob.core.windows.net/price/" 11 | "Price7290027600007-036-202503181800.gz?sv=2014-02-14&sr=b" 12 | "&sig=Me8hez2oy5vClACdE5fVOyyu5Qef%2FlEJSQYfMvQAOKg%3D&" 13 | "se=2025-03-18T18%3A02%3A59Z&sp=r", 14 | "some_file.gz", 15 | ) 16 | 17 | assert not os.path.exists("some_file.gz") 18 | -------------------------------------------------------------------------------- /il_supermarket_scarper/utils/tests/test_gzip_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | 4 | from il_supermarket_scarper.utils.gzip_utils import extract_xml_file_from_gz_file 5 | 6 | 7 | def test_unzip_bad_file(): 8 | """test unziping a bad file""" 9 | 10 | file_path = ( 11 | "il_supermarket_scarper/utils/tests/PriceFull7290876100000-003-202410070010.gz" 12 | ) 13 | file_content = None 14 | if os.path.exists(file_path): 15 | with open(file_path, "rb") as f: 16 | file_content = f.read() 17 | 18 | with pytest.raises(ValueError): 19 | extract_xml_file_from_gz_file(file_path) 20 | 21 | if file_content is not None and not os.path.exists(file_path): 22 | with open(file_path, "wb") as f: 23 | f.write(file_content) 24 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/cofix.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.engines import Cerberus 2 | from il_supermarket_scarper.utils import FileTypesFilters, DumpFolderNames 3 | 4 | 5 | class Cofix(Cerberus): 6 | """scraper for confix""" 7 | 8 | def __init__(self, folder_name=None): 9 | super().__init__( 10 | chain=DumpFolderNames.COFIX, 11 | chain_id="7291056200008", 12 | folder_name=folder_name, 13 | ftp_username="SuperCofixApp", 14 | ) 15 | 16 | def is_valid_file_empty(self, file_name): 17 | """it is valid the file is empty""" 18 | 19 | return super().is_valid_file_empty( 20 | file_name 21 | ) or FileTypesFilters.is_file_from_type( 22 | file_name, FileTypesFilters.STORE_FILE.name 23 | ) 24 | -------------------------------------------------------------------------------- /.github/workflows/pylint.yml: -------------------------------------------------------------------------------- 1 | name: Pylint 2 | 3 | on: 4 | push: 5 | branches: [ "main" ] 6 | pull_request: 7 | branches: [ "main" ] 8 | 9 | concurrency: 10 | group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} 11 | cancel-in-progress: true 12 | 13 | jobs: 14 | build: 15 | runs-on: ubuntu-latest 16 | strategy: 17 | matrix: 18 | python-version: ["3.8"] 19 | steps: 20 | - uses: actions/checkout@v3 21 | - name: Set up Python ${{ 
matrix.python-version }} 22 | uses: actions/setup-python@v3 23 | with: 24 | python-version: ${{ matrix.python-version }} 25 | - name: Install dependencies 26 | run: | 27 | python -m pip install --upgrade pip 28 | pip install pylint 29 | - name: Analysing the code with pylint 30 | run: | 31 | pylint $(git ls-files '*.py') --disable=E0401,R0801,R0903,W0707,R0917,C0114 32 | -------------------------------------------------------------------------------- /il_supermarket_scarper/utils/databases/base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | 4 | class AbstractDataBase(ABC): 5 | """Abstract base class for database operations.""" 6 | 7 | def __init__(self, database_name, collection_status=False) -> None: 8 | self.database_name = database_name.replace(" ", "_").lower() 9 | self.collection_status = collection_status 10 | 11 | def enable_collection_status(self): 12 | """Enable data collection to the database.""" 13 | self.collection_status = True 14 | 15 | @abstractmethod 16 | def insert_document(self, collection_name, document): 17 | """Insert a document into a collection.""" 18 | 19 | @abstractmethod 20 | def find_document(self, collection_name, query): 21 | """Find a document in a collection based on a query.""" 22 | 23 | def is_collection_enabled(self): 24 | """Check if collection is enabled.""" 25 | return self.collection_status 26 | 27 | def set_collection_status(self, status): 28 | """Set the data collection status.""" 29 | self.collection_status = status 30 | -------------------------------------------------------------------------------- /il_supermarket_scarper/utils/tests/test_file_type.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.utils import FileTypesFilters 2 | 3 | 4 | def test_file_type(): 5 | """test parsing a file name to its type enum""" 6 | assert ( 7 | FileTypesFilters.get_type_from_file("Price7290058108879-339-202409181941") 8 | == FileTypesFilters.PRICE_FILE 9 | ) 10 | assert ( 11 | FileTypesFilters.get_type_from_file("PriceFull7290058108879-339-202409181041") 12 | == FileTypesFilters.PRICE_FULL_FILE 13 | ) 14 | 15 | assert ( 16 | FileTypesFilters.get_type_from_file("StoresFull7290058108879-000-202409181041") 17 | == FileTypesFilters.STORE_FILE 18 | ) 19 | assert ( 20 | FileTypesFilters.get_type_from_file("Promo7290058108879-336-202409181544") 21 | == FileTypesFilters.PROMO_FILE 22 | ) 23 | assert ( 24 | FileTypesFilters.get_type_from_file("PromoFull7290058108879-339-202409181149") 25 | == FileTypesFilters.PROMO_FULL_FILE 26 | ) 27 | assert ( 28 | FileTypesFilters.get_type_from_file("Proasdull7290058108879-339-202409181149") 29 | is None 30 | ) 31 | -------------------------------------------------------------------------------- /il_supermarket_scarper/utils/tests/test_status.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | from il_supermarket_scarper.utils.status import ( 4 | get_status, 5 | get_status_date, 6 | get_statue_page, 7 | ) 8 | from il_supermarket_scarper.utils.connection import disable_when_outside_israel 9 | from il_supermarket_scarper.utils.validation import show_text_diff 10 | 11 | 12 | @disable_when_outside_israel 13 | def test_status(): 14 | """check we are able to get the number of scrapers from gov.il""" 15 | num_of_scarpers = get_status() 16 | assert isinstance(num_of_scarpers, int) 17 | 18 | 19 | @disable_when_outside_israel 20 | def
test_status_date(): 21 | """check we are able to get the date the gov.il site was updated""" 22 | date = get_status_date() 23 | assert isinstance(date, datetime.datetime) 24 | 25 | 26 | @disable_when_outside_israel 27 | def test_page_complete_diff(): 28 | """make sure the page content is the same as the cached page""" 29 | cached = get_statue_page(extraction_type="all_text", source="cache") 30 | current = get_statue_page(extraction_type="all_text", source="gov.il") 31 | assert current == cached, show_text_diff(cached, current) 32 | -------------------------------------------------------------------------------- /il_supermarket_scarper/tests/test_scrappers_factory.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper import ScraperStability, ScraperFactory, datetime_in_tlv 2 | from il_supermarket_scarper.utils import _is_saturday_in_israel 3 | 4 | 5 | def test_stable_scraper(): 6 | """test a sample stable scraper""" 7 | assert not ScraperStability.is_validate_scraper_found_no_files( 8 | ScraperFactory.VICTORY.name 9 | ) 10 | 11 | 12 | # def test_after_date(): 13 | # """test scrapers that failed after date""" 14 | # assert ScraperStability.is_validate_scraper_found_no_files( 15 | # ScraperFactory.CITY_MARKET_GIVATAYIM.name, 16 | # when_date=datetime_in_tlv(2024, 12, 12, 0, 0, 0), 17 | # ) 18 | 19 | 20 | def test_not_active(): 21 | """test the gap between active and listed scrapers""" 22 | test_date = datetime_in_tlv(2024, 12, 12, 0, 0, 0) 23 | all_listed = ScraperFactory.all_listed_scrappers() 24 | all_active = ScraperFactory.all_scrapers_name(when_date=test_date) 25 | 26 | expected_to_fail = 0 27 | if _is_saturday_in_israel(test_date): 28 | expected_to_fail += 1 # only 'NetivHased' should fail 29 | 30 | assert len(set(all_listed) - set(all_active)) == expected_to_fail 31 | -------------------------------------------------------------------------------- /il_supermarket_scarper/utils/lock_utils.py: -------------------------------------------------------------------------------- 1 | from threading import Lock 2 | from functools import wraps 3 | 4 | 5 | class LockManager: 6 | """Manages locks based on string values.""" 7 | 8 | def __init__(self): 9 | self.locks = {} 10 | 11 | def get_lock(self, key): 12 | """Get or create a lock based on the string key.""" 13 | if key not in self.locks: 14 | self.locks[key] = Lock() 15 | return self.locks[key] 16 | 17 | 18 | lock_manager = LockManager() 19 | 20 | 21 | def lock_by_string(): 22 | """ 23 | Decorator to apply a lock based on a string key. 24 | The lock key is taken from the decorated function's scraper_status argument (scraper_status.chain.value).
25 | """ 26 | 27 | def decorator(func): 28 | @wraps(func) 29 | def wrapper(scraper_status, *args, **kwargs): 30 | # Get the key for which to acquire the lock (based on the arguments) 31 | lock_key = scraper_status.chain.value 32 | lock = lock_manager.get_lock(lock_key) 33 | 34 | with lock: 35 | return func(scraper_status, *args, **kwargs) 36 | 37 | return wrapper 38 | 39 | return decorator 40 | -------------------------------------------------------------------------------- /il_supermarket_scarper/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .gzip_utils import extract_xml_file_from_gz_file 2 | from .logger import Logger 3 | from .status import ( 4 | get_output_folder, 5 | clean_dump_folder, 6 | summerize_dump_folder_contant, 7 | _is_saturday_in_israel, 8 | _is_holiday_in_israel, 9 | _is_weekend_in_israel, 10 | _now, 11 | datetime_in_tlv, 12 | _testing_now, 13 | hour_files_expected_to_be_accassible, 14 | ) 15 | from .scraper_status import ScraperStatus 16 | from .file_types import FileTypesFilters 17 | from .connection import ( 18 | download_connection_retry, 19 | url_connection_retry, 20 | disable_when_outside_israel, 21 | session_with_cookies, 22 | url_retrieve, 23 | collect_from_ftp, 24 | fetch_temporary_gz_file_from_ftp, 25 | wget_file, 26 | ) 27 | from .loop import execute_in_parallel, multiple_page_aggregtion 28 | from .exceptions import RestartSessionError 29 | from .retry import retry_files 30 | from .validation import is_valid_chain_name, change_xml_encoding 31 | from .folders_name import DumpFolderNames 32 | from .lock_utils import LockManager, lock_by_string 33 | from .status import convert_unit, UnitSize, convert_nl_size_to_bytes, string_to_float 34 | -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | # This workflow uses actions that are not certified by GitHub. 5 | # They are provided by a third-party and are governed by 6 | # separate terms of service, privacy policy, and support 7 | # documentation. 
8 | 9 | name: Upload Python Package 10 | 11 | on: 12 | release: 13 | types: [published] 14 | 15 | permissions: 16 | contents: read 17 | 18 | jobs: 19 | deploy: 20 | 21 | runs-on: ubuntu-latest 22 | 23 | steps: 24 | - uses: actions/checkout@v3 25 | - name: Set up Python 26 | uses: actions/setup-python@v3 27 | with: 28 | python-version: '3.x' 29 | - name: Install dependencies 30 | run: | 31 | python -m pip install --upgrade pip 32 | pip install build 33 | - name: Build package 34 | run: python -m build 35 | - name: Publish package 36 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 37 | with: 38 | user: __token__ 39 | password: ${{ secrets.PYPI }} 40 | -------------------------------------------------------------------------------- /tests/test_integration.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | from il_supermarket_scarper.utils.status import ( 3 | get_status, 4 | get_status_date, 5 | ) 6 | from il_supermarket_scarper.scrappers_factory import ScraperFactory 7 | from il_supermarket_scarper.utils import disable_when_outside_israel, DumpFolderNames 8 | 9 | 10 | def test_scrapers_folders_match(): 11 | """test that every scraper name has a matching dump folder name""" 12 | scrapers_keys = ScraperFactory.all_scrapers_name() 13 | dump_keys = DumpFolderNames.all_folders_names() 14 | 15 | assert set(scrapers_keys) & set(dump_keys) == set(scrapers_keys) 16 | assert set(scrapers_keys) - set(dump_keys) == set() 17 | 18 | 19 | @disable_when_outside_israel 20 | def test_scrapers_are_updated(): 21 | """test the number of scrapers are the same as listed at the gov.il site""" 22 | num_of_scarper_listed = len(ScraperFactory.all_listed_scrappers()) 23 | num_of_scarper_on_gov_site = get_status() 24 | 25 | assert num_of_scarper_listed == num_of_scarper_on_gov_site 26 | 27 | 28 | @disable_when_outside_israel 29 | def test_update_date(): 30 | """test the date the gov.il site was last updated""" 31 | date = get_status_date() 32 | assert date.date() == datetime.datetime(2025, 7, 1).date(), "gov il site changed" 33 | -------------------------------------------------------------------------------- /.github/workflows/user-validation.yml: -------------------------------------------------------------------------------- 1 | name: Reject PR with IgnoreList 2 | on: 3 | pull_request: 4 | types: [opened, edited, synchronize] 5 | 6 | jobs: 7 | check_username: 8 | runs-on: ubuntu-latest 9 | env: 10 | IGNORE_USERS: ${{ secrets.IGNORE_USERS }} 11 | 12 | steps: 13 | - name: Checkout code 14 | uses: actions/checkout@v3 15 | 16 | - name: Fetch all branches 17 | run: git fetch --all 18 | 19 | - name: Check for restricted authors in commits 20 | id: check_commit_authors 21 | run: | 22 | # Convert IGNORE_USERS to an array 23 | IFS=',' read -ra IGNORED_USERS <<< "$IGNORE_USERS" 24 | 25 | # Get the commit authors in the pull request 26 | COMMIT_AUTHORS=$(git log --pretty=format:"%an" origin/main..HEAD) 27 | 28 | # Check if any commit author matches an ignored user 29 | for AUTHOR in "${IGNORED_USERS[@]}"; do 30 | if echo "$COMMIT_AUTHORS" | grep -iq "^$AUTHOR$"; then 31 | echo "Restricted author '$AUTHOR' found in commits." 32 | exit 1 33 | fi 34 | done 35 | 36 | - name: PR Rejected 37 | if: failure() 38 | run: | 39 | echo "This PR contains commits by restricted authors."
40 | exit 1 41 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/__init__.py: -------------------------------------------------------------------------------- 1 | from .bareket import Bareket 2 | from .bitan import YaynotBitanAndCarrefour 3 | from .cofix import Cofix 4 | from .city_market import ( 5 | CityMarketGivatayim, 6 | CityMarketKirtatOno, 7 | CityMarketKiryatGat, 8 | CityMarketShops, 9 | ) 10 | from .doralon import DorAlon 11 | from .good_pharm import GoodPharm 12 | from .hazihinam import HaziHinam 13 | from .het_cohen import HetCohen 14 | from .keshet import Keshet 15 | from .king_store import KingStore 16 | from .maayan2000 import Maayan2000 17 | from .machsani_ashuk import MahsaniAShuk 18 | from .mega import Mega 19 | from .meshnat_yosef import MeshnatYosef1, MeshnatYosef2 20 | from .nativ_hashed import NetivHased 21 | from .osherad import Osherad 22 | from .polizer import Polizer 23 | from .ramilevy import RamiLevy 24 | from .salachdabach import SalachDabach 25 | from .shefa_barcart_ashem import ShefaBarcartAshem 26 | from .shufersal import Shufersal 27 | from .shuk_ahir import ShukAhir 28 | from .stop_market import StopMarket 29 | from .super_pharm import SuperPharm 30 | from .super_yuda import SuperYuda 31 | from .super_sapir import SuperSapir 32 | from .superdosh import FreshMarketAndSuperDosh 33 | from .quik import Quik 34 | from .tivtaam import TivTaam 35 | from .victory import Victory 36 | from .yellow import Yellow 37 | from .yohananof import Yohananof 38 | from .zolvebegadol import ZolVeBegadol 39 | from .wolt import Wolt 40 | -------------------------------------------------------------------------------- /tests/test_main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import tempfile 4 | 5 | from il_supermarket_scarper.main import ScarpingTask 6 | from il_supermarket_scarper.scrappers_factory import ScraperFactory 7 | 8 | 9 | def test_main_with_limit(): 10 | """test the main running with limit of 1 for each chain""" 11 | with tempfile.TemporaryDirectory() as tmpdirname: 12 | expected = ScraperFactory.all_scrapers_name() + ["status"] 13 | scrapper_done = ScarpingTask(limit=1, dump_folder_name=tmpdirname).start() 14 | 15 | folders_from_scraper = list(map(lambda x: x.split("/")[-1], scrapper_done)) + [ 16 | "status" 17 | ] 18 | time.sleep(5) 19 | folders_in_dump_folder = os.listdir(tmpdirname) 20 | folders_in_dump_folder = [ 21 | name for name in folders_in_dump_folder if not name.startswith(".") 22 | ] 23 | assert len(folders_in_dump_folder) == len(expected) 24 | assert sorted(folders_from_scraper) == sorted(folders_in_dump_folder) 25 | 26 | 27 | def test_main_with_one_scarper(): 28 | """the limit only for enabled scarpers""" 29 | scrapper_done = ScarpingTask( 30 | limit=1, enabled_scrapers=ScraperFactory.sample(n=1) 31 | ).start() 32 | assert len(scrapper_done) == 1 33 | 34 | 35 | def test_main_with_size_estimation_mode(): 36 | """test size estmation mode""" 37 | scrapper_done = ScarpingTask( 38 | limit=1, size_estimation_mode=True, enabled_scrapers=ScraperFactory.sample(n=1) 39 | ).start() 40 | assert len(scrapper_done) == 1 41 | -------------------------------------------------------------------------------- /.github/workflows/docker-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow uses actions that are not certified by GitHub. 
2 | # They are provided by a third-party and are governed by 3 | # separate terms of service, privacy policy, and support 4 | # documentation. 5 | 6 | # GitHub recommends pinning actions to a commit SHA. 7 | # To get a newer version, you will need to update the SHA. 8 | # You can also reference a tag or branch, but the action may change without warning. 9 | 10 | name: Publish Docker image 11 | 12 | on: 13 | release: 14 | types: [published] 15 | 16 | jobs: 17 | push_to_registry: 18 | name: Push Docker image to Docker Hub 19 | runs-on: ubuntu-latest 20 | steps: 21 | - name: Check out the repo 22 | uses: actions/checkout@v3 23 | 24 | - name: Log in to Docker Hub 25 | uses: docker/login-action@f054a8b539a109f9f41c372932f1ae047eff08c9 26 | with: 27 | username: ${{ secrets.DOCKER_USERNAME }} 28 | password: ${{ secrets.DOCKER_PASSWORD }} 29 | 30 | - name: Extract metadata (tags, labels) for Docker 31 | id: meta 32 | uses: docker/metadata-action@98669ae865ea3cffbcbaa878cf57c20bbf1c6c38 33 | with: 34 | images: erlichsefi/israeli-supermarket-scarpers 35 | 36 | - name: Build and push Docker image 37 | uses: docker/build-push-action@ad44023a93711e3deb337508980b4b5e9bcdc5dc 38 | with: 39 | context: . 40 | target: prod 41 | push: true 42 | tags: ${{ steps.meta.outputs.tags }} 43 | labels: ${{ steps.meta.outputs.labels }} -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | #syntax=docker/dockerfile:1 2 | 3 | FROM node:20.19.5-bookworm-slim as base 4 | ARG PY_VERSION="3.11.0" 5 | 6 | # setting the enviroment 7 | RUN apt-get update --fix-missing -y && \ 8 | apt-get install cron -y && \ 9 | apt-get install libxml2-dev -y && \ 10 | apt-get install libxslt-dev -y 11 | 12 | 13 | # setting python and more 14 | RUN apt-get install python3-pip -y && \ 15 | apt-get install dieharder -y && \ 16 | apt-get install wget -y && \ 17 | apt-get clean && \ 18 | apt-get autoremove 19 | 20 | # setup python 21 | ENV HOME="/root" 22 | WORKDIR ${HOME} 23 | RUN apt-get install -y git libbz2-dev libncurses-dev libreadline-dev libffi-dev libssl-dev 24 | RUN git clone --depth=1 https://github.com/pyenv/pyenv.git .pyenv 25 | ENV PYENV_ROOT="${HOME}/.pyenv" 26 | ENV PATH="${PYENV_ROOT}/shims:${PYENV_ROOT}/bin:${PATH}" 27 | 28 | RUN pyenv install $PY_VERSION 29 | RUN pyenv global $PY_VERSION 30 | 31 | # setup code 32 | WORKDIR /usr/src/app 33 | COPY . . 34 | RUN python -m pip install . 35 | 36 | 37 | VOLUME ["/usr/src/app/dumps"] 38 | 39 | # development container 40 | FROM base as dev 41 | RUN apt-get -y install git 42 | RUN pip install black 43 | RUN pip install pylint 44 | 45 | 46 | # production image 47 | FROM base as prod 48 | 49 | # ADD crontab /etc/cron.d 50 | # RUN chmod 0644 /etc/cron.d/crontab 51 | # RUN crontab /etc/cron.d/crontab 52 | # RUN touch /var/log/cron.log 53 | # && cron & tail -f /var/log/cron.log 54 | CMD python main.py 55 | 56 | # run test 57 | FROM base as test 58 | 59 | # playwrite 60 | RUN npx -y playwright@1.53.0 install --with-deps 61 | RUN python -m playwright install 62 | 63 | RUN python -m pip install . 
".[test]" 64 | CMD python -m pytest -vv -n 2 65 | 66 | -------------------------------------------------------------------------------- /.github/workflows/test-suite.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Unit & Integration Tests 5 | # env: 6 | # DISABLED_SCRAPPERS: BAREKET 7 | 8 | on: 9 | push: 10 | branches: [ "main" ] 11 | pull_request: 12 | branches: [ "main" ] 13 | schedule: 14 | # * is a special character in YAML so you have to quote this string 15 | - cron: '00 17 * * *' 16 | 17 | concurrency: 18 | group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} 19 | cancel-in-progress: true 20 | 21 | 22 | jobs: 23 | build: 24 | 25 | runs-on: self-hosted 26 | strategy: 27 | fail-fast: false 28 | matrix: 29 | python-version: ["3.11.0"] 30 | 31 | steps: 32 | 33 | - name: Checkout 34 | uses: actions/checkout@v3 35 | - name: Free disk space 36 | run: | 37 | df --human-readable 38 | docker 2>/dev/null 1>&2 rmi $(docker image ls --all --quiet) || true 39 | rm --recursive --force "$AGENT_TOOLSDIRECTORY" 40 | df --human-readable 41 | - name: Build with Docker 42 | run: docker build -t erlichsefi/israeli-supermarket-scarpers:test --target test . 43 | - name: Remove all build 44 | run: (docker stop scraper-test-run 2>/dev/null || true) && (docker rm scraper-test-run 2>/dev/null || true) 45 | - name: Test with pytest 46 | run: docker run --rm --name scraper-test-run -e DISABLED_SCRAPPERS="${{ env.DISABLED_SCRAPPERS }}" erlichsefi/israeli-supermarket-scarpers:test && 47 | docker builder prune -f 48 | -------------------------------------------------------------------------------- /il_supermarket_scarper/main.py: -------------------------------------------------------------------------------- 1 | from .scrapper_runner import MainScrapperRunner 2 | from .utils.file_types import FileTypesFilters 3 | 4 | 5 | class ScarpingTask: # pylint: disable=too-many-instance-attributes 6 | """scraping task encapsulated""" 7 | 8 | def __init__( 9 | self, 10 | size_estimation_mode=False, 11 | enabled_scrapers=None, 12 | limit=None, 13 | when_date=None, 14 | files_types=FileTypesFilters.all_types(), 15 | dump_folder_name=None, 16 | lookup_in_db=True, 17 | multiprocessing=5, 18 | suppress_exception=False, 19 | min_size=None, 20 | max_size=None, 21 | ): 22 | """define the runner""" 23 | self.runner = MainScrapperRunner( 24 | size_estimation_mode=size_estimation_mode, 25 | enabled_scrapers=enabled_scrapers, 26 | dump_folder_name=dump_folder_name, 27 | lookup_in_db=lookup_in_db, 28 | multiprocessing=multiprocessing, 29 | ) 30 | self.dump_folder_name = dump_folder_name 31 | self.limit = limit 32 | self.files_types = files_types 33 | self.when_date = when_date 34 | self.suppress_exception = suppress_exception 35 | self.min_size = min_size 36 | self.max_size = max_size 37 | 38 | def get_dump_folder_name(self): 39 | """get the dump folder name""" 40 | return self.dump_folder_name 41 | 42 | def start(self): 43 | """run the scraping""" 44 | return self.runner.run( 45 | limit=self.limit, 46 | files_types=self.files_types, 47 | when_date=self.when_date, 48 | suppress_exception=self.suppress_exception, 49 | min_size=self.min_size, 50 | max_size=self.max_size, 51 | ) 52 | 
-------------------------------------------------------------------------------- /il_supermarket_scarper/utils/folders_name.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | 4 | class DumpFolderNames(Enum): 5 | """all the folder that files will be download to""" 6 | 7 | BAREKET = "Bareket" 8 | # YAYNO_BITAN = "YaynotBitan" 9 | YAYNO_BITAN_AND_CARREFOUR = "YaynotBitanAndCarrefour" 10 | COFIX = "Cofix" 11 | CITY_MARKET_GIVATAYIM = "CityMarketGivatayim" 12 | CITY_MARKET_KIRYATONO = "CityMarketKiryatOno" 13 | CITY_MARKET_KIRYATGAT = "CityMarketKiryatGat" 14 | CITY_MARKET_SHOPS = "CityMarketShops" 15 | DOR_ALON = "DorAlon" 16 | GOOD_PHARM = "GoodPharm" 17 | HAZI_HINAM = "HaziHinam" 18 | HET_COHEN = "HetCohen" 19 | KESHET = "Keshet" 20 | KING_STORE = "KingStore" 21 | MAAYAN_2000 = "Maayan2000" 22 | MAHSANI_ASHUK = "MahsaniAShuk" 23 | MEGA = "Mega" 24 | NETIV_HASED = "NetivHased" 25 | MESHMAT_YOSEF_1 = "MeshnatYosef1" 26 | MESHMAT_YOSEF_2 = "MeshnatYosef2" 27 | OSHER_AD = "Osherad" 28 | POLIZER = "Polizer" 29 | RAMI_LEVY = "RamiLevy" 30 | SALACH_DABACH = "SalachDabach" 31 | SHEFA_BARCART_ASHEM = "ShefaBarcartAshem" 32 | SHUFERSAL = "Shufersal" 33 | SHUK_AHIR = "ShukAhir" 34 | STOP_MARKET = "StopMarket" 35 | SUPER_PHARM = "SuperPharm" 36 | SUPER_YUDA = "SuperYuda" 37 | SUPER_SAPIR = "SuperSapir" 38 | FRESH_MARKET_AND_SUPER_DOSH = "FreshMarketAndSuperDosh" 39 | QUIK = "Quik" 40 | TIV_TAAM = "TivTaam" 41 | VICTORY = "Victory" 42 | YELLOW = "Yellow" 43 | YOHANANOF = "Yohananof" 44 | ZOL_VEBEGADOL = "ZolVeBegadol" 45 | WOLT = "Wolt" 46 | 47 | @classmethod 48 | def is_valid_folder_name(cls, member): 49 | """check if an folder is part of the cls""" 50 | return isinstance(member, DumpFolderNames) 51 | 52 | @classmethod 53 | def all_folders_names(cls): 54 | """get the name of all listed folders""" 55 | return [e.name for e in cls] 56 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/shufersal.py: -------------------------------------------------------------------------------- 1 | import urllib.parse 2 | 3 | from il_supermarket_scarper.engines import MultiPageWeb 4 | from il_supermarket_scarper.utils import DumpFolderNames, FileTypesFilters 5 | 6 | 7 | class Shufersal(MultiPageWeb): 8 | """scaper for shufersal""" 9 | 10 | utilize_date_param = False 11 | 12 | def __init__(self, folder_name=None): 13 | super().__init__( 14 | url="https://prices.shufersal.co.il/", 15 | total_page_xpath="""//*[@id="gridContainer"]/table/tfoot/tr/td/a[6]/@href""", 16 | total_pages_pattern=r"[?&]page=([0-9]+)", 17 | chain=DumpFolderNames.SHUFERSAL, 18 | chain_id="7290027600007", 19 | folder_name=folder_name, 20 | page_argument="&page", 21 | ) 22 | 23 | def get_file_types_id(self, files_types=None): 24 | """get the file type id""" 25 | if files_types is None: 26 | return ["0"] 27 | 28 | types = [] 29 | for ftype in files_types: 30 | if ftype == FileTypesFilters.STORE_FILE.name: 31 | types.append("5") 32 | if ftype == FileTypesFilters.PRICE_FILE.name: 33 | types.append("1") 34 | if ftype == FileTypesFilters.PROMO_FILE.name: 35 | types.append("3") 36 | if ftype == FileTypesFilters.PRICE_FULL_FILE.name: 37 | types.append("2") 38 | if ftype == FileTypesFilters.PROMO_FULL_FILE.name: 39 | types.append("4") 40 | return types 41 | 42 | def build_params(self, files_types=None, store_id=None, when_date=None): 43 | """build the params for the request""" 44 | params = {"catID": 
",".join(self.get_file_types_id(files_types))} 45 | 46 | if store_id: 47 | params["storeId"] = store_id 48 | return [f"/FileObject/UpdateCategory?{urllib.parse.urlencode(params)}"] 49 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from io import open 2 | from setuptools import setup 3 | 4 | with open("README.md", encoding="utf-8") as f: 5 | long_description = "\n" + f.read() 6 | 7 | with open("requirements.txt", encoding="utf-8") as f: 8 | required = f.read().splitlines() 9 | 10 | with open("requirements-dev.txt", encoding="utf-8") as f: 11 | dev_required = f.read().splitlines() 12 | 13 | setup( 14 | # Needed to silence warnings (and to be a worthwhile package) 15 | name="il-supermarket-scraper", 16 | url="https://github.com/OpenIsraeliSupermarkets/israeli-supermarket-scarpers", 17 | author="Sefi Erlich", 18 | author_email="erlichsefi@gmail.com", 19 | # Needed to actually package something 20 | packages=[ 21 | "il_supermarket_scarper", 22 | "il_supermarket_scarper.engines", 23 | "il_supermarket_scarper.scrappers", 24 | "il_supermarket_scarper.utils", 25 | "il_supermarket_scarper.utils.databases", 26 | ], 27 | # Needed for dependencies 28 | install_requires=required, 29 | tests_require=dev_required, 30 | extras_require={"test": ["pytest", "pytest-xdist"]}, 31 | # *strongly* suggested for sharing 32 | version="0.6.3", 33 | # The license can be anything you like 34 | license="MIT", 35 | description="python package that implement a scraping for israeli supermarket data", 36 | # We will also need a readme eventually (there will be a warning) 37 | long_description=long_description, 38 | long_description_content_type="text/markdown", 39 | keywords=["israel", "israeli", "scraper", "supermarket"], 40 | classifiers=[ 41 | "Development Status :: 3 - Alpha", 42 | "Intended Audience :: Developers", 43 | "Topic :: Software Development :: Build Tools", 44 | "License :: OSI Approved :: MIT License", 45 | "Programming Language :: Python :: 3", 46 | "Programming Language :: Python :: 3.4", 47 | "Programming Language :: Python :: 3.5", 48 | "Programming Language :: Python :: 3.6", 49 | ], 50 | ) 51 | -------------------------------------------------------------------------------- /il_supermarket_scarper/utils/databases/mongo.py: -------------------------------------------------------------------------------- 1 | import os 2 | from ..logger import Logger 3 | from .base import AbstractDataBase 4 | 5 | 6 | PYMONGO_INSTALLED = True 7 | try: 8 | import pymongo 9 | from pymongo.errors import ServerSelectionTimeoutError 10 | except ImportError: 11 | PYMONGO_INSTALLED = False 12 | 13 | 14 | class MongoDataBase(AbstractDataBase): 15 | """A class that represents a MongoDB database.""" 16 | 17 | def __init__(self, database_name) -> None: 18 | super().__init__(database_name) 19 | self.myclient = None 20 | self.store_db = None 21 | 22 | def create_connection(self): 23 | """Create a connection to the MongoDB database.""" 24 | if PYMONGO_INSTALLED: 25 | url = os.environ.get("MONGO_URL", "localhost") 26 | port = os.environ.get("MONGO_PORT", "27017") 27 | self.myclient = pymongo.MongoClient(f"mongodb://{url}:{port}/") 28 | self.store_db = self.myclient[self.database_name] 29 | 30 | def enable_collection_status(self): 31 | """Enable data collection to MongoDB.""" 32 | if PYMONGO_INSTALLED: 33 | self.set_collection_status(True) 34 | self.create_connection() 35 | else: 36 | Logger.info("Can't 
enable collection. Please install pymongo.") 37 | 38 | def insert_document(self, collection_name, document): 39 | """Insert a document into a MongoDB collection.""" 40 | if self.is_collection_enabled(): 41 | try: 42 | self.store_db[collection_name].insert_one(document) 43 | except ServerSelectionTimeoutError: 44 | self.set_collection_status(False) 45 | Logger.error( 46 | "Failed to connect to MongoDB. Collection status disabled." 47 | ) 48 | 49 | def find_document(self, collection_name, query): 50 | """Find a document in a MongoDB collection.""" 51 | if self.is_collection_enabled(): 52 | return self.store_db[collection_name].find_one(query) 53 | return None 54 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/meshnat_yosef.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from il_supermarket_scarper.engines.web import WebBase 4 | from il_supermarket_scarper.engines import Bina 5 | 6 | from il_supermarket_scarper.utils import DumpFolderNames, Logger 7 | 8 | 9 | class MeshnatYosef1(WebBase): 10 | """scraper for Meshnat Yosef""" 11 | 12 | def __init__(self, folder_name=None): 13 | super().__init__( 14 | DumpFolderNames.MESHMAT_YOSEF_1, 15 | chain_id="5144744100002", 16 | url="https://list-files.w5871031-kt.workers.dev/", 17 | folder_name=folder_name, 18 | ) 19 | 20 | def get_data_from_page(self, req_res): 21 | """get the file list from a page""" 22 | response = json.loads(req_res.text) 23 | return response 24 | 25 | def get_file_size_from_entry(self, entry): 26 | """ 27 | Extract file size from a JSON entry. 28 | Returns size in bytes, or None if not found. 29 | """ 30 | # Meshnat Yosef doesn't support file size in the entry 31 | return None 32 | 33 | def extract_task_from_entry(self, all_trs): 34 | """extract download links, file names, and file sizes from page list""" 35 | download_urls = [] 36 | file_names = [] 37 | file_sizes = [] 38 | for x in all_trs: 39 | try: 40 | download_urls.append(x["url"]) 41 | file_names.append(x["name"]) 42 | file_sizes.append(self.get_file_size_from_entry(x)) 43 | except (AttributeError, KeyError, IndexError, TypeError) as e: 44 | Logger.warning(f"Error extracting task from entry: {e}") 45 | 46 | return download_urls, file_names, file_sizes 47 | 48 | 49 | class MeshnatYosef2(Bina): 50 | """scraper for Meshnat Yosef""" 51 | 52 | def __init__(self, folder_name=None): 53 | super().__init__( 54 | DumpFolderNames.MESHMAT_YOSEF_2, 55 | chain_id=["5144744100001", "7290058289400"], 56 | url_perfix="ktshivuk", 57 | folder_name=folder_name, 58 | ) 59 | -------------------------------------------------------------------------------- /il_supermarket_scarper/utils/loop.py: -------------------------------------------------------------------------------- 1 | import concurrent.futures 2 | from .logger import Logger 3 | 4 | 5 | def defualt_aggregtion_function(all_done): 6 | """format the scraping result to the final output""" 7 | result = [] 8 | for response in all_done: 9 | _response = response 10 | if hasattr(_response, "result"): 11 | _response = _response.result() 12 | result.append(_response) 13 | return result 14 | 15 | 16 | def multiple_page_aggregtion(pages_to_scrape): 17 | """format the scraping result to the final output for multipage""" 18 | download_urls = [] 19 | file_names = [] 20 | file_sizes = [] 21 | for result in pages_to_scrape: 22 | if hasattr(result, "result"): 23 | page_result = result.result() 24 | else: 25 | page_result = result 26 | 
page_download_urls, page_file_names, page_file_sizes = page_result 27 | file_sizes.extend(page_file_sizes) 28 | download_urls.extend(page_download_urls) 29 | file_names.extend(page_file_names) 30 | return download_urls, file_names, file_sizes 31 | 32 | 33 | def execute_in_parallel( 34 | function_to_execute, 35 | iterable, 36 | max_threads=None, 37 | aggregtion_function=defualt_aggregtion_function, 38 | ): 39 | """execute a job in the event loop""" 40 | 41 | Logger.info(f"Running {len(iterable)} tasks in parallel") 42 | results = run_tasks( 43 | function_to_execute, 44 | iterable, 45 | max_threads=max_threads, 46 | ) 47 | 48 | all_done = aggregtion_function(results) 49 | Logger.info(f"Done with {len(all_done)} tasks in parallel") 50 | return all_done 51 | 52 | 53 | def run_tasks( 54 | function_to_execute, 55 | iterable, 56 | max_threads: int = None, 57 | ): 58 | """Run tasks in multi-thread or sequentially""" 59 | if max_threads: 60 | # Use multi-thread 61 | with concurrent.futures.ThreadPoolExecutor( 62 | max_workers=max_threads, thread_name_prefix="PullingThread" 63 | ) as executor: 64 | futures = [executor.submit(function_to_execute, arg) for arg in iterable] 65 | return [ 66 | future.result() for future in concurrent.futures.as_completed(futures) 67 | ] 68 | else: 69 | # Or just iterate over all 70 | return [function_to_execute(arg) for arg in iterable] 71 | -------------------------------------------------------------------------------- /il_supermarket_scarper/engines/apsx.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from il_supermarket_scarper.utils import Logger 3 | 4 | from .web import WebBase 5 | 6 | 7 | class Aspx(WebBase, ABC): 8 | """class for aspx scapers""" 9 | 10 | def __init__( 11 | self, chain, chain_id, url, aspx_page, folder_name=None, max_threads=5 12 | ): 13 | super().__init__( 14 | chain, chain_id, url, folder_name=folder_name, max_threads=max_threads 15 | ) 16 | self.aspx_page = aspx_page 17 | 18 | def extract_task_from_entry(self, all_trs): 19 | """from the trs extract the download urls, file names, and file sizes""" 20 | 21 | download_urls = [] 22 | file_names = [] 23 | file_sizes = [] 24 | for x in all_trs: 25 | try: 26 | download_url = self.url + self.get_href_from_entry(x) 27 | download_urls.append(download_url) 28 | file_names.append(self.get_file_name_no_ext_from_entry(download_url)) 29 | file_sizes.append(self.get_file_size_from_entry(x)) 30 | except (AttributeError, KeyError, IndexError, TypeError) as e: 31 | Logger.warning(f"Error extracting task from entry: {e}") 32 | return download_urls, file_names, file_sizes 33 | 34 | @abstractmethod 35 | def _get_all_possible_query_string_params( 36 | self, files_types=None, store_id=None, when_date=None 37 | ): 38 | """list all param to add to the url""" 39 | 40 | @abstractmethod 41 | def _build_query_url(self, query_params, base_urls): 42 | """build the url with the query params""" 43 | 44 | def get_request_url(self, files_types=None, store_id=None, when_date=None): 45 | """build the request given the base url and the query params""" 46 | result = [] 47 | for query_params in self._get_all_possible_query_string_params( 48 | files_types=files_types, store_id=store_id, when_date=when_date 49 | ): 50 | result.extend(self._build_query_url(query_params, [self.url])) 51 | Logger.debug(f"Request url: {result}") 52 | return result 53 | 54 | @abstractmethod 55 | def get_href_from_entry(self, entry): 56 | """get download link for entry (tr)""" 
57 | 58 | @abstractmethod 59 | def get_file_name_no_ext_from_entry(self, entry): 60 | """get the file name without extensions from entey (tr)""" 61 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import os 3 | from il_supermarket_scarper import ScarpingTask, ScraperFactory, FileTypesFilters 4 | 5 | 6 | def load_params(): 7 | """load params from env variables with validation""" 8 | kwargs = {"suppress_exception": True, "lookup_in_db": True} 9 | 10 | # validate scrapers 11 | enabled_scrapers = os.getenv("ENABLED_SCRAPERS", None) 12 | if enabled_scrapers: 13 | enabled_scrapers = enabled_scrapers.split(",") 14 | 15 | not_valid = list( 16 | filter( 17 | lambda scraper: scraper not in ScraperFactory.all_scrapers_name(), 18 | enabled_scrapers, 19 | ) 20 | ) 21 | if not_valid: 22 | raise ValueError(f"ENABLED_SCRAPERS contains invalid {not_valid}") 23 | 24 | kwargs["enabled_scrapers"] = enabled_scrapers 25 | 26 | # validate file types 27 | enabled_file_types = os.getenv("ENABLED_FILE_TYPES", None) 28 | if enabled_file_types: 29 | 30 | enabled_file_types = enabled_file_types.split(",") 31 | 32 | not_valid = list( 33 | filter( 34 | lambda f_types: f_types not in FileTypesFilters.all_types(), 35 | enabled_file_types, 36 | ) 37 | ) 38 | if not_valid: 39 | raise ValueError(f"ENABLED_FILE_TYPES contains invalid {not_valid}") 40 | 41 | kwargs["files_types"] = enabled_file_types 42 | 43 | # validate number of processes 44 | number_of_processes = os.getenv("NUMBER_OF_PROCESSES", None) 45 | if number_of_processes: 46 | try: 47 | kwargs["multiprocessing"] = int(number_of_processes) 48 | except ValueError: 49 | raise ValueError("NUMBER_OF_PROCESSES must be an integer") 50 | 51 | # validate limit 52 | limit = os.getenv("LIMIT", None) 53 | if limit: 54 | try: 55 | kwargs["limit"] = int(limit) 56 | except ValueError: 57 | raise ValueError(f"LIMIT must be an integer, but got {limit}") 58 | 59 | # validate today 60 | today = os.getenv("TODAY", None) 61 | if today: 62 | try: 63 | kwargs["when_date"] = datetime.datetime.strptime(today, "%Y-%m-%d %H:%M") 64 | except ValueError: 65 | raise ValueError("TODAY must be in the format 'YYYY-MM-DD HH:MM'") 66 | 67 | return kwargs 68 | 69 | 70 | if __name__ == "__main__": 71 | 72 | args = load_params() 73 | 74 | ScarpingTask(**args).start() 75 | -------------------------------------------------------------------------------- /il_supermarket_scarper/utils/gzip_utils.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import shutil 3 | import os 4 | import io 5 | import zipfile 6 | from .exceptions import RestartSessionError 7 | 8 | 9 | def extract_xml_file_from_gz_file(file_save_path): 10 | """extract xml from gz""" 11 | target_file_name = os.path.splitext(file_save_path)[0] + ".xml" 12 | try: 13 | with gzip.open(file_save_path, "rb") as infile: 14 | with open(target_file_name, "wb") as outfile: 15 | shutil.copyfileobj(infile, outfile) 16 | except (gzip.BadGzipFile, EOFError) as exception: 17 | try: 18 | with open(file_save_path, "rb") as response_content: 19 | with zipfile.ZipFile(io.BytesIO(response_content.read())) as the_zip: 20 | zip_info = the_zip.infolist()[0] 21 | with the_zip.open(zip_info) as the_file: 22 | with open(target_file_name, "wb") as f_out: 23 | f_out.write(the_file.read()) 24 | 25 | except ( # pylint: disable=broad-except,redefined-outer-name 26 | Exception 
27 | ) as exception: 28 | report_failed_zip(exception, file_save_path, target_file_name) 29 | 30 | except Exception as exception: # pylint: disable=broad-except 31 | report_failed_zip(exception, file_save_path, target_file_name) 32 | 33 | 34 | def report_failed_zip(exception, file_save_path, target_file_name): 35 | """report a file wasn't able to extracted""" 36 | 37 | try: 38 | file_size = os.path.getsize(file_save_path) 39 | 40 | file_contant = "" 41 | with open(file_save_path, "r", encoding="utf-8") as file: 42 | file_contant = file.readlines() 43 | 44 | if "link expired" in str(file_contant): 45 | raise RestartSessionError() 46 | 47 | raise ValueError( 48 | f"Error decoding file:{file_save_path} with " 49 | f"error: {str(exception)} file size {str(file_size)} ," 50 | f"trimed_file_contant {str(file_contant)[:100]}" 51 | ) 52 | except UnicodeDecodeError: 53 | raise ValueError( 54 | f"Error decoding file:{file_save_path} with " 55 | f"error: {str(exception)} file size {str(file_size)} ," 56 | f"can't decode file" 57 | ) 58 | finally: 59 | os.remove(file_save_path) 60 | # remove the corrupted file 61 | if os.path.exists(target_file_name): 62 | os.remove(target_file_name) 63 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/wolt.py: -------------------------------------------------------------------------------- 1 | from datetime import timedelta 2 | from bs4 import BeautifulSoup 3 | 4 | from il_supermarket_scarper.utils import _now, Logger 5 | from il_supermarket_scarper.engines.web import WebBase 6 | 7 | from il_supermarket_scarper.utils import DumpFolderNames 8 | 9 | 10 | class Wolt(WebBase): 11 | """scraper for wolt""" 12 | 13 | def __init__(self, folder_name=None): 14 | super().__init__( 15 | DumpFolderNames.WOLT, 16 | chain_id="7290058249350", 17 | url="https://wm-gateway.wolt.com/isr-prices/public/v1/index.html", 18 | folder_name=folder_name, 19 | ) 20 | 21 | def get_request_url( 22 | self, files_types=None, store_id=None, when_date=None 23 | ): # pylint: disable=unused-argument 24 | """get all links to collect download links from""" 25 | if when_date: 26 | formatted_date = when_date.strftime("%Y-%m-%d") 27 | return [ 28 | { 29 | "url": self.url.replace("index.html", f"{formatted_date}.html"), 30 | "method": "GET", 31 | } 32 | ] 33 | 34 | perspective = _now() 35 | all_pages_to_collect_from = [] 36 | for days_back in range(10): 37 | formatted_date = (perspective - timedelta(days=days_back)).strftime( 38 | "%Y-%m-%d" 39 | ) 40 | all_pages_to_collect_from.append( 41 | { 42 | "url": self.url.replace("index.html", f"{formatted_date}.html"), 43 | "method": "GET", 44 | } 45 | ) 46 | return all_pages_to_collect_from 47 | 48 | def get_data_from_page(self, req_res): 49 | """get the file list from a page""" 50 | soup = BeautifulSoup(req_res.text, features="lxml") 51 | return list( 52 | map( 53 | lambda x: (x.text, self.url.replace("index.html", x.a.attrs["href"])), 54 | list(soup.find_all("li")), 55 | ) 56 | ) 57 | 58 | def extract_task_from_entry(self, all_trs): 59 | """extract download links, file names, and file sizes from page list""" 60 | download_urls = [] 61 | file_names = [] 62 | file_sizes = [] 63 | for x in all_trs: 64 | try: 65 | download_urls.append(x[1]) 66 | file_names.append(x[0]) 67 | file_sizes.append(None) 68 | except (AttributeError, KeyError, IndexError, TypeError) as e: 69 | Logger.warning(f"Error extracting task from entry: {e}") 70 | 71 | return download_urls, file_names, file_sizes 72 | 
-------------------------------------------------------------------------------- /il_supermarket_scarper/utils/logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import sys 3 | 4 | 5 | def build_logger(): 6 | """create the logger instance""" 7 | # Define logger 8 | logger = logging.getLogger("Logger") 9 | 10 | if not logger.handlers: 11 | logger.setLevel(logging.DEBUG) # set logger level 12 | log_formatter = logging.Formatter( 13 | "%(name)-12s %(asctime)s %(levelname)-8s " 14 | "[%(threadName)s] %(filename)s:%(funcName)s %(message)s" 15 | ) 16 | console_handler = logging.StreamHandler( 17 | sys.stdout 18 | ) # set streamhandler to stdout 19 | console_handler.setFormatter(log_formatter) 20 | logger.addHandler(console_handler) 21 | 22 | file_handler = logging.FileHandler("logging.log") 23 | file_handler.setFormatter(log_formatter) 24 | logger.addHandler(file_handler) 25 | 26 | return logger 27 | 28 | 29 | class Logger: 30 | """a static logger class to share with all components""" 31 | 32 | enabled = True 33 | logger = build_logger() 34 | 35 | @classmethod 36 | def change_logging_status(cls, new_status): 37 | """enable or disable status""" 38 | cls.enabled = new_status 39 | 40 | @classmethod 41 | def set_logging_level(cls, level): 42 | """set logging level""" 43 | if level == "DEBUG": 44 | cls.logger.setLevel(logging.DEBUG) 45 | elif level == "INFO": 46 | cls.logger.setLevel(logging.INFO) 47 | elif level == "ERROR": 48 | cls.logger.setLevel(logging.ERROR) 49 | elif level == "WARNING": 50 | cls.logger.setLevel(logging.WARNING) 51 | else: 52 | cls.logger.setLevel(logging.DEBUG) 53 | 54 | @classmethod 55 | def info(cls, msg, *args, **kwargs): 56 | """log info""" 57 | if cls.enabled: 58 | cls.logger.info(msg, *args, **kwargs) 59 | 60 | @classmethod 61 | def debug(cls, msg, *args, **kwargs): 62 | """log debug""" 63 | if cls.enabled: 64 | cls.logger.debug(msg, *args, **kwargs) 65 | 66 | @classmethod 67 | def error(cls, msg, *args, **kwargs): 68 | """log error""" 69 | if cls.enabled: 70 | cls.logger.error(msg, *args, **kwargs) 71 | 72 | @classmethod 73 | def error_execption(cls, _): 74 | """log exception""" 75 | if cls.enabled: 76 | cls.logger.error( 77 | "got an exception:", 78 | exc_info=sys.exc_info(), 79 | ) 80 | 81 | @classmethod 82 | def warning(cls, msg, *args, **kwargs): 83 | """log warning""" 84 | if cls.enabled: 85 | cls.logger.warning(msg, *args, **kwargs) 86 | -------------------------------------------------------------------------------- /stress_test.py: -------------------------------------------------------------------------------- 1 | import time 2 | import json 3 | import datetime 4 | import tempfile 5 | import pstats 6 | import cProfile 7 | import io 8 | from il_supermarket_scarper.scrappers_factory import ScraperFactory 9 | from il_supermarket_scarper.utils import _now 10 | 11 | 12 | def format_stats_as_json(profile, project_name): 13 | """get the stats from the profiler and format them as json""" 14 | stream = io.StringIO() 15 | ps = pstats.Stats(profile, stream=stream) 16 | ps.sort_stats(pstats.SortKey.CUMULATIVE) # Sort by cumulative time 17 | ps.print_stats() 18 | 19 | # Convert the printed stats to a list of lines 20 | stats_output = stream.getvalue().splitlines() 21 | 22 | # Filter the lines to include only functions within the project 23 | project_stats = [] 24 | for line in stats_output: 25 | if project_name in line: # Filter for project-specific lines 26 | 27 | parts = line.split() 28 | if len(parts) >= 5: # 
Basic sanity check for the parts 29 | function_data = { 30 | "function": parts[-1], # Function path 31 | "ncalls": parts[0], # Number of calls 32 | "tottime": parts[1], 33 | "tottime_per_call": parts[2], # Time spent in function 34 | "cumtime": parts[3], # Cumulative time including subcalls 35 | "cumtime_per_call": parts[4], # 36 | } 37 | project_stats.append(function_data) 38 | 39 | return project_stats 40 | 41 | 42 | if __name__ == "__main__": 43 | 44 | result = {} 45 | for scraper_name in ScraperFactory.all_scrapers_name(): 46 | 47 | def full_execution(scraper): 48 | """full execution of the scraper""" 49 | with tempfile.TemporaryDirectory() as tmpdirname: 50 | try: 51 | initer = ScraperFactory.get(scraper)(folder_name=tmpdirname) 52 | return initer.scrape(when_date=_now()), "" 53 | except Exception as e: # pylint: disable=broad-exception-caught 54 | return [], str(e) 55 | 56 | execution_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") 57 | start_time = time.time() 58 | pr = cProfile.Profile() 59 | pr.enable() 60 | 61 | files, error = full_execution(scraper_name) 62 | 63 | pr.disable() 64 | 65 | end_time = time.time() 66 | result[scraper_name] = { 67 | "status": format_stats_as_json(pr, "israeli-supermarket-scarpers"), 68 | "execution_time": execution_time, 69 | "start_time": start_time, 70 | "end_time": end_time, 71 | "time": end_time - start_time, 72 | "files": len(files), 73 | "error": error, 74 | } 75 | 76 | with open("stress_test_results.json", "w", encoding="utf-8") as f: 77 | json.dump(result, f) 78 | -------------------------------------------------------------------------------- /il_supermarket_scarper/utils/validation.py: -------------------------------------------------------------------------------- 1 | import re 2 | import difflib 3 | 4 | 5 | def is_valid_chain_name(input_string): 6 | """check the chain name is in a valid folder foramt""" 7 | # Regular expression pattern to match only letters (a-z, case insensitive) and hyphens (-) 8 | pattern = r"^[a-zA-Z0-9-]+$" 9 | 10 | # Use re.match to check if the entire string matches the pattern 11 | return bool(re.match(pattern, input_string)) 12 | 13 | 14 | def find_index_with_substring(array, substring): 15 | """Find the index of the first element in the array that contains the substring""" 16 | return [i for i, s in enumerate(array) if substring in s][0] 17 | 18 | 19 | def show_text_diff(text1, text2): 20 | """show the difference between two text strings in a git-like format""" 21 | # Split the texts into lines for comparison 22 | text1_lines = text1.splitlines() 23 | text2_lines = text2.splitlines() 24 | 25 | text1_lines = text1_lines[ 26 | find_index_with_substring( 27 | text1_lines, "חוקים ותקנות" 28 | ) : find_index_with_substring(text1_lines, "נוסח החוק המעודכן ביותר") 29 | ] 30 | text2_lines = text2_lines[ 31 | find_index_with_substring( 32 | text2_lines, "חוקים ותקנות" 33 | ) : find_index_with_substring(text2_lines, "נוסח החוק המעודכן ביותר") 34 | ] 35 | 36 | # Use difflib to compare the texts with more context 37 | diff = difflib.unified_diff( 38 | text1_lines, 39 | text2_lines, 40 | lineterm="", 41 | fromfile="Expected", 42 | tofile="Actual", 43 | n=5, # Show 5 lines of context around changes 44 | ) 45 | 46 | # Format the output for better readability 47 | diff_lines = [] 48 | diff_lines.append("\n" + "=" * 80) 49 | diff_lines.append("DIFF:") 50 | diff_lines.append("=" * 80) 51 | 52 | for line in diff: 53 | # Add visual markers for different line types 54 | if line.startswith("---") or 
line.startswith("+++"): 55 | diff_lines.append(line) 56 | elif line.startswith("-"): 57 | diff_lines.append(f"- {line[1:]}") # Removed line 58 | elif line.startswith("+"): 59 | diff_lines.append(f"+ {line[1:]}") # Added line 60 | elif line.startswith("@@"): 61 | diff_lines.append("\n" + line) # Context marker 62 | else: 63 | diff_lines.append(f" {line}") # Context line 64 | 65 | diff_lines.append("=" * 80) 66 | 67 | return "\n".join(diff_lines) 68 | 69 | 70 | def change_xml_encoding(file_path): 71 | """change the encoding if failing with utf-8""" 72 | with open(file_path, "rb") as file: # pylint: disable=unspecified-encoding 73 | # Read the XML file content 74 | content = file.read() 75 | 76 | content = content.decode("ISO-8859-8", errors="replace") 77 | 78 | # Save the file with the new encoding declaration 79 | with open(file_path, "wb") as file: 80 | file.write( 81 | content.replace('encoding="ISO-8859-8"', 'encoding="UTF-8"').encode("utf-8") 82 | ) 83 | -------------------------------------------------------------------------------- /.github/workflows/codeql.yml: -------------------------------------------------------------------------------- 1 | # For most projects, this workflow file will not need changing; you simply need 2 | # to commit it to your repository. 3 | # 4 | # You may wish to alter this file to override the set of languages analyzed, 5 | # or to provide custom queries or build logic. 6 | # 7 | # ******** NOTE ******** 8 | # We have attempted to detect the languages in your repository. Please check 9 | # the `language` matrix defined below to confirm you have the correct set of 10 | # supported CodeQL languages. 11 | # 12 | name: "CodeQL" 13 | 14 | on: 15 | push: 16 | branches: [ "main" ] 17 | pull_request: 18 | # The branches below must be a subset of the branches above 19 | branches: [ "main" ] 20 | 21 | 22 | concurrency: 23 | group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} 24 | cancel-in-progress: true 25 | 26 | jobs: 27 | analyze: 28 | name: Analyze 29 | runs-on: ubuntu-latest 30 | permissions: 31 | actions: read 32 | contents: read 33 | security-events: write 34 | 35 | strategy: 36 | fail-fast: false 37 | matrix: 38 | language: [ 'python' ] 39 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ] 40 | # Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support 41 | 42 | steps: 43 | - name: Checkout repository 44 | uses: actions/checkout@v3 45 | 46 | # Initializes the CodeQL tools for scanning. 47 | - name: Initialize CodeQL 48 | uses: github/codeql-action/init@v2 49 | with: 50 | languages: ${{ matrix.language }} 51 | # If you wish to specify custom queries, you can do so here or in a config file. 52 | # By default, queries listed here will override any specified in a config file. 53 | # Prefix the list here with "+" to use these queries and those in the config file. 54 | 55 | # Details on CodeQL's query packs refer to : https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs 56 | # queries: security-extended,security-and-quality 57 | 58 | 59 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 60 | # If this step fails, then you should remove it and run the build manually (see below) 61 | - name: Autobuild 62 | uses: github/codeql-action/autobuild@v2 63 | 64 | # ℹ️ Command-line programs to run using the OS shell. 
65 | # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun 66 | 67 | # If the Autobuild fails above, remove it and uncomment the following three lines. 68 | # modify them (or add more) to build your code if your project, please refer to the EXAMPLE below for guidance. 69 | 70 | # - run: | 71 | # echo "Run, Build Application using script" 72 | # ./location_of_script_within_repo/buildscript.sh 73 | 74 | - name: Perform CodeQL Analysis 75 | uses: github/codeql-action/analyze@v2 76 | with: 77 | category: "/language:${{matrix.language}}" 78 | -------------------------------------------------------------------------------- /il_supermarket_scarper/engines/publishprice.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | 3 | from il_supermarket_scarper.utils.logger import Logger 4 | from .web import WebBase 5 | 6 | 7 | class PublishPrice(WebBase): 8 | """ 9 | scrape the file of PublishPrice 10 | possibly can support historical search: there is folder for each date. 11 | but this is not implemented. 12 | """ 13 | 14 | def __init__( 15 | self, 16 | chain, 17 | chain_id, 18 | site_infix, 19 | folder_name=None, 20 | domain="prices", 21 | max_threads=5, 22 | ): 23 | super().__init__( 24 | chain, 25 | chain_id, 26 | url=f"https://{domain}.{site_infix}.co.il/", 27 | folder_name=folder_name, 28 | max_threads=max_threads, 29 | ) 30 | self.folder = None 31 | 32 | def get_request_url( 33 | self, files_types=None, store_id=None, when_date=None 34 | ): # pylint: disable=unused-argument 35 | """get all links to collect download links from""" 36 | 37 | formated = "" 38 | if when_date: 39 | formated = when_date.strftime("%Y%m%d") 40 | formated = f"?p=./{formated}" 41 | return [{"url": self.url + formated, "method": "GET"}] 42 | 43 | def get_data_from_page(self, req_res): 44 | soup = BeautifulSoup(req_res.text, features="lxml") 45 | 46 | # the developer hard-coded the files names in the html 47 | all_trs = ( 48 | soup.find_all("script")[-1] 49 | .text.replace("const files_html = [", "") 50 | .replace("];", "") 51 | .split("\n")[5] 52 | .split(",") 53 | ) 54 | return list(map(lambda x: BeautifulSoup(x, features="lxml"), all_trs)) 55 | 56 | def extract_task_from_entry(self, all_trs): 57 | """from the trs extract the download urls, file names, and file sizes""" 58 | 59 | def get_herf_element(x): 60 | herfs = x.find_all("a") 61 | if len(herfs) > 0: 62 | return herfs[-1] 63 | return None 64 | 65 | def get_herf(x): 66 | return get_herf_element(x).attrs["href"] 67 | 68 | def get_path_from_herf(x): 69 | return get_herf(x).replace("\\", "").replace('"', "").replace("./", "") 70 | 71 | def get_name_from_herf(x): 72 | return get_path_from_herf(x).split(".")[0].split("/")[-1] 73 | 74 | all_trs = list( 75 | filter( 76 | lambda x: get_herf_element(x) is not None, 77 | all_trs, 78 | ) 79 | ) 80 | 81 | download_urls = [] 82 | file_names = [] 83 | file_sizes = [] 84 | for x in all_trs: 85 | try: 86 | download_urls.append(self.url + get_path_from_herf(x)) 87 | file_names.append(get_name_from_herf(x)) 88 | file_sizes.append(self.get_file_size_from_entry(x)) 89 | except (AttributeError, KeyError, IndexError, TypeError) as e: 90 | Logger.warning(f"Error extracting task from entry: {e}") 91 | 92 | return download_urls, file_names, file_sizes 93 | -------------------------------------------------------------------------------- /il_supermarket_scarper/utils/file_cache.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import time 4 | from functools import wraps 5 | 6 | 7 | def file_cache(ttl=None): 8 | """Decorator to cache function results in a file with an optional TTL (time-to-live)""" 9 | 10 | def get_cache_file(func_name): 11 | """Generate a cache file path based on the function name""" 12 | cache_dir = ".cache" 13 | return os.path.join(cache_dir, f"{func_name}_cache.json") 14 | 15 | def load_cache(cache_file): 16 | """Load the cache from the specified cache file if it exists""" 17 | if os.path.exists(cache_file): 18 | with open(cache_file, "r", encoding="utf-8") as f: 19 | return json.load(f) 20 | return {} 21 | 22 | def save_cache(cache_file, cache_data): 23 | """Save the cache to the specified cache file""" 24 | if not os.path.exists(".cache"): 25 | os.makedirs(".cache") 26 | with open(cache_file, "w", encoding="utf-8") as f: 27 | json.dump(cache_data, f) 28 | 29 | def decorator(func): 30 | @wraps(func) 31 | def wrapper(*args, **kwargs): 32 | # Generate cache file path based on the function name 33 | cache_file = get_cache_file(func.__name__) 34 | 35 | # Load the cache from the file 36 | cache = load_cache(cache_file) 37 | 38 | # Generate a cache key from function arguments 39 | cache_key = generate_cache_key(args, kwargs) 40 | 41 | # Check if result is cached and valid 42 | if cache_key in cache: 43 | entry = cache[cache_key] 44 | timestamp = entry["timestamp"] 45 | 46 | # If ttl is set, check if cache has expired 47 | if ttl is not None and (time.time() - timestamp) > ttl: 48 | # Cache expired, remove the entry 49 | del cache[cache_key] 50 | else: 51 | # Cache is valid, return cached result 52 | return entry["result"] 53 | 54 | # If not cached or expired, call the function and store the result 55 | result = func(*args, **kwargs) 56 | 57 | # Save the result with the current timestamp in the cache 58 | cache[cache_key] = { 59 | "result": result, 60 | "timestamp": time.time(), # Save the current time 61 | } 62 | save_cache(cache_file, cache) 63 | 64 | return result 65 | 66 | def generate_cache_key(args, kwargs): 67 | key_parts = [] 68 | for arg in args: 69 | if isinstance(arg, (int, float, str, bool)): 70 | key_parts.append(str(arg)) 71 | else: 72 | raise ValueError(f"Unsupported argument type: {type(arg)}") 73 | for k, v in kwargs.items(): 74 | if isinstance(v, (int, float, str, bool)): 75 | key_parts.append(f"{k}={v}") 76 | else: 77 | raise ValueError(f"Unsupported keyword argument type: {type(v)}") 78 | return "|".join(key_parts) 79 | 80 | return wrapper 81 | 82 | return decorator 83 | -------------------------------------------------------------------------------- /il_supermarket_scarper/utils/file_types.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | 4 | class FileTypesFilters(Enum): 5 | """type of files avaliable to download""" 6 | 7 | PROMO_FILE = { 8 | "should_contain": "promo", 9 | "should_not_contain": "full", 10 | } 11 | STORE_FILE = { 12 | "should_contain": "store", 13 | "should_not_contain": None, 14 | } 15 | PRICE_FILE = { 16 | "should_contain": "price", 17 | "should_not_contain": "full", 18 | } 19 | PROMO_FULL_FILE = { 20 | "should_contain": "promofull", 21 | "should_not_contain": None, 22 | } 23 | PRICE_FULL_FILE = { 24 | "should_contain": "pricefull", 25 | "should_not_contain": None, 26 | } 27 | 28 | @classmethod 29 | def all_types(cls): 30 | """Returns a list of all the enum keys.""" 31 | return 
[e.name for e in FileTypesFilters] 32 | 33 | @classmethod 34 | def all_update_files(cls): 35 | """all the update files""" 36 | return [FileTypesFilters.PROMO_FILE.name, FileTypesFilters.PRICE_FILE.name] 37 | 38 | @classmethod 39 | def all_full_files(cls): 40 | """all the full files""" 41 | return [ 42 | FileTypesFilters.PRICE_FULL_FILE.name, 43 | FileTypesFilters.PROMO_FULL_FILE.name, 44 | ] 45 | 46 | @classmethod 47 | def only_promo(cls): 48 | """only files with promotion date""" 49 | return [FileTypesFilters.PROMO_FILE.name, FileTypesFilters.PROMO_FULL_FILE.name] 50 | 51 | @classmethod 52 | def only_store(cls): 53 | """only files with stores date""" 54 | return [FileTypesFilters.STORE_FILE.name] 55 | 56 | @classmethod 57 | def only_price(cls): 58 | """only files with prices date""" 59 | return [FileTypesFilters.PRICE_FILE.name, FileTypesFilters.PRICE_FULL_FILE.name] 60 | 61 | @staticmethod 62 | def filter_file(file_name, should_contain, should_not_contain): 63 | """fillter function""" 64 | return ( 65 | should_contain in file_name.lower() 66 | and "null" not in file_name.lower() 67 | and ( 68 | should_not_contain is None 69 | or should_not_contain not in file_name.lower() 70 | ) 71 | ) 72 | 73 | @classmethod 74 | def is_file_from_type(cls, filename, file_type): 75 | """check if file from certain type""" 76 | string_to_look_in = getattr(cls, file_type).value 77 | return cls.filter_file(filename, **string_to_look_in) 78 | 79 | @classmethod 80 | def get_type_from_file(cls, filename): 81 | """get file type from filename""" 82 | for file_type_name in cls.all_types(): 83 | if cls.is_file_from_type(filename, file_type_name): 84 | return getattr(cls, file_type_name) 85 | return None 86 | 87 | @classmethod 88 | def filter(cls, file_type, iterable, by_function=lambda x: x): 89 | """Returns the type of the file.""" 90 | return list( 91 | filter( 92 | lambda filename: cls.is_file_from_type( 93 | by_function(filename), file_type 94 | ), 95 | iterable, 96 | ) 97 | ) 98 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/super_pharm.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import urllib.parse 3 | import datetime 4 | 5 | import json 6 | from il_supermarket_scarper.engines import MultiPageWeb 7 | from il_supermarket_scarper.utils import ( 8 | Logger, 9 | url_connection_retry, 10 | DumpFolderNames, 11 | FileTypesFilters, 12 | ) 13 | 14 | 15 | class SuperPharm(MultiPageWeb): 16 | """scraper for super pharm""" 17 | 18 | def __init__(self, folder_name=None): 19 | super().__init__( 20 | chain=DumpFolderNames.SUPER_PHARM, 21 | chain_id="7290172900007", 22 | url="http://prices.super-pharm.co.il/", 23 | folder_name=folder_name, 24 | total_page_xpath='//*[@class="mvc-grid-pager"]/button[last()]/@data-page', 25 | total_pages_pattern=r"(\d+)$", 26 | page_argument="&page", 27 | ) 28 | 29 | def collect_files_details_from_page(self, html): 30 | links = [] 31 | filenames = [] 32 | file_sizes = [] 33 | for element in html.xpath("//tbody/tr"): # skip header 34 | links.append(self.url + element.xpath("./td[6]/a/@href")[0]) 35 | filenames.append(element.xpath("./td[2]")[0].text) 36 | file_sizes.append(None) # Super Pharm don't support file size in the entry 37 | return links, filenames, file_sizes 38 | 39 | @url_connection_retry() 40 | def retrieve_file(self, file_link, file_save_path, timeout=15): 41 | Logger.debug(f"On a new Session: calling {file_link}") 42 | 43 | response_content 
= self.session_with_cookies_by_chain( 44 | file_link, timeout=timeout 45 | ) 46 | spath = json.loads(response_content.content) 47 | Logger.debug(f"Found spath: {spath}") 48 | 49 | file_to_save = self.session_with_cookies_by_chain( 50 | self.url + spath["href"], timeout=timeout 51 | ) 52 | file_to_save_with_ext = file_save_path + ".gz" 53 | Path(file_to_save_with_ext).write_bytes(file_to_save.content) 54 | 55 | return file_to_save_with_ext 56 | 57 | def get_file_types_id(self, files_types=None): 58 | """get the file type id""" 59 | if files_types is None: 60 | return [""] 61 | 62 | types = [] 63 | for ftype in files_types: 64 | if ftype == FileTypesFilters.STORE_FILE.name: 65 | types.append("StoresFull") 66 | if ftype == FileTypesFilters.PRICE_FILE.name: 67 | types.append("Price") 68 | if ftype == FileTypesFilters.PROMO_FILE.name: 69 | types.append("Promo") 70 | if ftype == FileTypesFilters.PRICE_FULL_FILE.name: 71 | types.append("PriceFull") 72 | if ftype == FileTypesFilters.PROMO_FULL_FILE.name: 73 | types.append("PromoFull") 74 | return types 75 | 76 | def build_params(self, files_types=None, store_id=None, when_date=None): 77 | """build the params for the request""" 78 | 79 | all_params = [] 80 | for ftype in self.get_file_types_id(files_types): 81 | params = {"type": "", "date": "", "store": ""} 82 | 83 | if store_id: 84 | params["store"] = store_id 85 | if when_date and isinstance(when_date, datetime.datetime): 86 | params["date"] = when_date.strftime("%Y-%m-%d") 87 | if files_types: 88 | params["type"] = ftype 89 | all_params.append(params) 90 | 91 | return ["?" + urllib.parse.urlencode(params) for params in all_params] 92 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/hazihinam.py: -------------------------------------------------------------------------------- 1 | import urllib.parse 2 | import datetime 3 | from il_supermarket_scarper.engines import MultiPageWeb 4 | from il_supermarket_scarper.utils import ( 5 | DumpFolderNames, 6 | FileTypesFilters, 7 | _now, 8 | convert_unit, 9 | UnitSize, 10 | string_to_float, 11 | ) 12 | 13 | # class HaziHinam(Cerberus): 14 | # """scraper for Hazi Hinam""" 15 | 16 | # def __init__(self, folder_name=None): 17 | # super().__init__( 18 | # chain=DumpFolderNames.HAZI_HINAM, 19 | # chain_id="7290700100008", 20 | # folder_name=folder_name, 21 | # ftp_username="HaziHinam", 22 | # ) 23 | 24 | 25 | class HaziHinam(MultiPageWeb): 26 | """scraper for Hazi Hinam""" 27 | 28 | def __init__(self, folder_name=None): 29 | super().__init__( 30 | chain=DumpFolderNames.HAZI_HINAM, 31 | chain_id="7290700100008", 32 | url="https://shop.hazi-hinam.co.il/Prices", 33 | folder_name=folder_name, 34 | total_page_xpath="(//li[contains(concat(' ', normalize-space(@class), ' ')," 35 | + "' pagination-item ')])[last()]/a/@href", 36 | total_pages_pattern=r"\d+", 37 | page_argument="&p", 38 | ) 39 | 40 | def collect_files_details_from_page(self, html): 41 | """collect the details from one page""" 42 | links = [] 43 | filenames = [] 44 | file_sizes = [] 45 | for link in html.xpath("//table/tbody/tr"): 46 | links.append(link.xpath("td[6]/a/@href")[0]) 47 | filenames.append(link.xpath("td[3]")[0].text.strip() + ".xml.gz") 48 | file_sizes.append( 49 | convert_unit( 50 | string_to_float(link.xpath("td[5]")[0].text.strip()), 51 | UnitSize.KB, 52 | UnitSize.BYTES, 53 | ) 54 | ) 55 | return links, filenames, file_sizes 56 | 57 | def get_file_types_id(self, files_types=None): 58 | """get the file type id""" 59 | if 
files_types is None or files_types == FileTypesFilters.all_types(): 60 | return [{"t": "null", "f": "null"}] 61 | 62 | types = [] 63 | for ftype in files_types: 64 | if ftype == FileTypesFilters.STORE_FILE.name: 65 | types.append({"t": "3", "f": "null"}) 66 | if ftype == FileTypesFilters.PRICE_FILE.name: 67 | types.append({"t": "1", "f": "null"}) 68 | if ftype == FileTypesFilters.PROMO_FILE.name: 69 | types.append({"t": "2", "f": "null"}) 70 | if ftype == FileTypesFilters.PRICE_FULL_FILE.name: 71 | types.append({"t": "1", "f": "null"}) 72 | if ftype == FileTypesFilters.PROMO_FULL_FILE.name: 73 | types.append({"t": "2", "f": "null"}) 74 | return types 75 | 76 | def build_params(self, files_types=None, store_id=None, when_date=None): 77 | """build the params for the request""" 78 | 79 | all_params = [] 80 | for type_params in self.get_file_types_id(files_types): 81 | 82 | # filtering store is not supported 83 | # if store_id: 84 | # params["s"] = "null" 85 | if when_date and isinstance(when_date, datetime.datetime): 86 | all_params.append({"d": when_date.strftime("%Y-%m-%d"), **type_params}) 87 | else: 88 | all_params.append({"d": _now().strftime("%Y-%m-%d"), **type_params}) 89 | all_params.append( 90 | { 91 | "d": (_now() - datetime.timedelta(days=1)).strftime("%Y-%m-%d"), 92 | **type_params, 93 | } 94 | ) 95 | 96 | return ["?" + urllib.parse.urlencode(params) for params in all_params] 97 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/city_market.py: -------------------------------------------------------------------------------- 1 | import urllib.parse 2 | import datetime 3 | from il_supermarket_scarper.engines import Bina, MultiPageWeb 4 | from il_supermarket_scarper.utils import ( 5 | DumpFolderNames, 6 | FileTypesFilters, 7 | UnitSize, 8 | ) 9 | from il_supermarket_scarper.utils.status import convert_unit, string_to_float 10 | 11 | 12 | # removed on 28.02.2025 13 | class CityMarketGivatayim(Bina): 14 | """scraper for city market givatayim""" 15 | 16 | def __init__(self, folder_name=None): 17 | super().__init__( 18 | chain=DumpFolderNames.CITY_MARKET_GIVATAYIM, 19 | chain_id="5359000000000", 20 | url_perfix="citymarketgivatayim", 21 | folder_name=folder_name, 22 | ) 23 | 24 | 25 | # removed on 28.10.2024 26 | class CityMarketKirtatOno(Bina): 27 | """scraper for city market kiryat ono""" 28 | 29 | def __init__(self, folder_name=None): 30 | super().__init__( 31 | chain=DumpFolderNames.CITY_MARKET_KIRYATONO, 32 | chain_id="5359000000000", 33 | url_perfix="citymarketkiryatono", 34 | folder_name=folder_name, 35 | ) 36 | 37 | 38 | class CityMarketKiryatGat(Bina): 39 | """scraper for city market kiryat gat""" 40 | 41 | def __init__(self, folder_name=None): 42 | super().__init__( 43 | chain=DumpFolderNames.CITY_MARKET_KIRYATGAT, 44 | chain_id="7290058266241", 45 | url_perfix="citymarketkiryatgat", 46 | folder_name=folder_name, 47 | ) 48 | 49 | 50 | class CityMarketShops(MultiPageWeb): 51 | """scraper for city market shops""" 52 | 53 | def __init__(self, folder_name=None): 54 | super().__init__( 55 | chain=DumpFolderNames.CITY_MARKET_SHOPS, 56 | chain_id="7290000000003", 57 | url="http://www.citymarket-shops.co.il/", 58 | folder_name=folder_name, 59 | total_page_xpath="(//li[contains(concat(' ', normalize-space(@class), ' ')," 60 | + "' pagination-item ')])[last()]/a/@href", 61 | total_pages_pattern=r"\d+", 62 | page_argument="&p", 63 | ) 64 | 65 | def collect_files_details_from_page(self, html): 66 | """collect the details 
deom one page""" 67 | links = [] 68 | filenames = [] 69 | file_sizes = [] 70 | for link in html.xpath("//table/tbody/tr"): 71 | links.append(self.url + link.xpath("td[7]/a/@href")[0]) 72 | filenames.append(link.xpath("td[3]")[0].text.strip() + ".xml.gz") 73 | file_sizes.append( 74 | convert_unit( 75 | string_to_float(link.xpath("td[6]")[0].text.strip()), 76 | UnitSize.KB, 77 | UnitSize.BYTES, 78 | ) 79 | ) 80 | return links, filenames, file_sizes 81 | 82 | def get_file_types_id(self, files_types=None): 83 | """get the file type id""" 84 | if files_types is None or files_types == FileTypesFilters.all_types(): 85 | return [{"t": "", "f": ""}] 86 | 87 | types = [] 88 | for ftype in files_types: 89 | if ftype == FileTypesFilters.STORE_FILE.name: 90 | types.append({"t": 3, "f": ""}) 91 | if ftype == FileTypesFilters.PRICE_FILE.name: 92 | types.append({"t": "1", "f": "0"}) 93 | if ftype == FileTypesFilters.PROMO_FILE.name: 94 | types.append({"t": "2", "f": "0"}) 95 | if ftype == FileTypesFilters.PRICE_FULL_FILE.name: 96 | types.append({"t": "1", "f": "1"}) 97 | if ftype == FileTypesFilters.PROMO_FULL_FILE.name: 98 | types.append({"t": "2", "f": "1"}) 99 | return types 100 | 101 | def build_params(self, files_types=None, store_id=None, when_date=None): 102 | """build the params for the request""" 103 | 104 | all_params = [] 105 | for type_params in self.get_file_types_id(files_types): 106 | params = {"d": "", "s": ""} 107 | 108 | if store_id: 109 | params["s"] = str(store_id).zfill(3) 110 | if when_date and isinstance(when_date, datetime.datetime): 111 | params["d"] = when_date.strftime("%Y-%m-%d") 112 | if files_types: 113 | params = {**params, **type_params} 114 | all_params.append(params) 115 | 116 | return ["?" + urllib.parse.urlencode(params) for params in all_params] 117 | -------------------------------------------------------------------------------- /il_supermarket_scarper/utils/databases/json_file.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | from ..logger import Logger 4 | from .base import AbstractDataBase 5 | 6 | 7 | class JsonDataBase(AbstractDataBase): 8 | """A class that represents a JSON-based database.""" 9 | 10 | def __init__(self, database_name, base_path="json_db") -> None: 11 | super().__init__(database_name, collection_status=True) 12 | self.base_path = base_path 13 | self.database_file = f"{self.database_name}.json" 14 | self._ensure_db_directory_exists() 15 | self._ensure_db_file_exists() 16 | 17 | def _ensure_db_directory_exists(self): 18 | """Ensure the base directory for the JSON database exists.""" 19 | if not os.path.exists(self.base_path): 20 | os.makedirs(self.base_path, exist_ok=True) 21 | 22 | def _ensure_db_file_exists(self): 23 | """Ensure the database file exists.""" 24 | file_path = self._get_database_file_path() 25 | if not os.path.exists(file_path): 26 | with open(file_path, "w", encoding="utf-8") as file: 27 | json.dump({}, file) # Initialize with an empty dict 28 | 29 | def _get_database_file_path(self): 30 | """Get the full path to the database JSON file.""" 31 | return os.path.join(self.base_path, self.database_file) 32 | 33 | def _read_database(self): 34 | """Read the JSON database file and return its contents.""" 35 | file_path = self._get_database_file_path() 36 | data = {} 37 | 38 | # Load existing data from the file 39 | if os.path.exists(file_path): 40 | with open(file_path, "r", encoding="utf-8") as file: 41 | try: 42 | data = json.load(file) 43 | except 
json.JSONDecodeError: 44 | Logger.warning(f"File {file_path} is corrupted, resetting it.") 45 | data = {} 46 | return data 47 | 48 | def _write_database(self, data): 49 | """Write data to the JSON database file.""" 50 | file_path = self._get_database_file_path() 51 | 52 | with open(file_path, "w", encoding="utf-8") as file: 53 | json.dump(dict(sorted(data.items())), file, default=str, indent=4) 54 | 55 | def insert_documents(self, collection_name, document): 56 | """Insert a document into a collection inside the JSON database.""" 57 | if self.collection_status: 58 | 59 | data = self._read_database() 60 | # Ensure the collection exists in the database 61 | if collection_name not in data: 62 | data[collection_name] = [] 63 | 64 | # Add the new document to the collection 65 | data[collection_name].extend(document) 66 | 67 | # Save the updated data back to the file 68 | self._write_database(data) 69 | 70 | def insert_document(self, collection_name, document): 71 | """Insert a document into a collection inside the JSON database.""" 72 | if self.collection_status: 73 | data = self._read_database() 74 | # Ensure the collection exists in the database 75 | if collection_name not in data: 76 | data[collection_name] = [] 77 | 78 | # Add the new document to the collection 79 | data[collection_name].append(document) 80 | 81 | # Save the updated data back to the file 82 | self._write_database(data) 83 | 84 | def find_document(self, collection_name, query): 85 | """Find a document in a collection based on a query.""" 86 | if self.collection_status: 87 | file_path = self._get_database_file_path() 88 | 89 | if os.path.exists(file_path): 90 | with open(file_path, "r", encoding="utf-8") as file: 91 | try: 92 | data = json.load(file) 93 | 94 | # Check if the collection exists 95 | if collection_name in data: 96 | # Filter the documents in the collection based on the query 97 | for document in data[collection_name]: 98 | if all( 99 | item in document.items() for item in query.items() 100 | ): 101 | return document 102 | except json.JSONDecodeError: 103 | Logger.warning(f"File {file_path} is corrupted.") 104 | 105 | return None 106 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrapper_runner.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from multiprocessing import Pool 4 | 5 | from .scrappers_factory import ScraperFactory 6 | from .utils import Logger, summerize_dump_folder_contant, clean_dump_folder 7 | 8 | 9 | class MainScrapperRunner: 10 | """a main scraper to execute all scraping""" 11 | 12 | def __init__( 13 | self, 14 | size_estimation_mode=False, 15 | enabled_scrapers=None, 16 | dump_folder_name=None, 17 | multiprocessing=5, 18 | lookup_in_db=True, 19 | ): 20 | assert isinstance(enabled_scrapers, list) or enabled_scrapers is None 21 | 22 | env_size_estimation_mode = os.getenv("SE_MODE", None) 23 | if env_size_estimation_mode: 24 | Logger.info( 25 | f"Setting size estimation mode from enviroment. 
value={env_size_estimation_mode}" 26 | ) 27 | self.size_estimation_mode = bool(env_size_estimation_mode == "True") 28 | else: 29 | self.size_estimation_mode = size_estimation_mode 30 | Logger.info(f"size_estimation_mode: {self.size_estimation_mode}") 31 | 32 | if not enabled_scrapers: 33 | enabled_scrapers = ScraperFactory.all_scrapers_name() 34 | 35 | self.enabled_scrapers = enabled_scrapers 36 | Logger.info(f"Enabled scrapers: {self.enabled_scrapers}") 37 | self.dump_folder_name = dump_folder_name 38 | self.multiprocessing = multiprocessing 39 | self.lookup_in_db = lookup_in_db 40 | 41 | def run( 42 | self, 43 | limit=None, 44 | files_types=None, 45 | when_date=False, 46 | suppress_exception=False, 47 | min_size=None, 48 | max_size=None, 49 | ): 50 | """run the scraper""" 51 | Logger.info(f"Limit is {limit}") 52 | Logger.info(f"files_types is {files_types}") 53 | Logger.info(f"Start scraping {','.join(self.enabled_scrapers)}.") 54 | 55 | with Pool(self.multiprocessing) as pool: 56 | result = pool.map( 57 | self.scrape_one_wrap, 58 | list( 59 | map( 60 | lambda chainScrapperClass: ( 61 | chainScrapperClass, 62 | { 63 | "limit": limit, 64 | "files_types": files_types, 65 | "when_date": when_date, 66 | "suppress_exception": suppress_exception, 67 | "min_size": min_size, 68 | "max_size": max_size, 69 | }, 70 | ), 71 | self.enabled_scrapers, 72 | ) 73 | ), 74 | ) 75 | 76 | Logger.info("Done scraping all supermarkets.") 77 | 78 | return result 79 | 80 | def scrape_one_wrap(self, arg): 81 | """scrape one warper""" 82 | args, kwargs = arg 83 | return self.scrape_one(args, **kwargs) 84 | 85 | def scrape_one( 86 | self, 87 | chain_scrapper_class, 88 | limit=None, 89 | files_types=None, 90 | store_id=None, 91 | when_date=None, 92 | suppress_exception=False, 93 | min_size=None, 94 | max_size=None, 95 | ): 96 | """scrape one""" 97 | chain_scrapper_constractor = ScraperFactory.get(chain_scrapper_class) 98 | Logger.info(f"Starting scrapper {chain_scrapper_constractor}") 99 | scraper = chain_scrapper_constractor(folder_name=self.dump_folder_name) 100 | chain_name = scraper.get_chain_name() 101 | 102 | Logger.info(f"scraping {chain_name}") 103 | if self.lookup_in_db: 104 | scraper.enable_collection_status() 105 | scraper.enable_aggregation_between_runs() 106 | 107 | scraper.scrape( 108 | limit=limit, 109 | files_types=files_types, 110 | store_id=store_id, 111 | when_date=when_date, 112 | files_names_to_scrape=None, 113 | filter_null=False, 114 | filter_zero=False, 115 | suppress_exception=suppress_exception, 116 | min_size=min_size, 117 | max_size=max_size, 118 | ) 119 | Logger.info(f"done scraping {chain_name}") 120 | 121 | folder_with_files = scraper.get_storage_path() 122 | if self.size_estimation_mode: 123 | Logger.info(f"Summrize test data for {chain_name}") 124 | summerize_dump_folder_contant(folder_with_files) 125 | 126 | Logger.info(f"Cleaning dump folder for {chain_name}") 127 | clean_dump_folder(folder_with_files) 128 | return folder_with_files 129 | -------------------------------------------------------------------------------- /il_supermarket_scarper/engines/bina.py: -------------------------------------------------------------------------------- 1 | import json 2 | import urllib.parse 3 | import datetime 4 | 5 | from il_supermarket_scarper.utils import ( 6 | Logger, 7 | url_connection_retry, 8 | url_retrieve, 9 | FileTypesFilters, 10 | ) 11 | 12 | from .apsx import Aspx 13 | 14 | 15 | class Bina(Aspx): 16 | """scraper for all Bina base site. 17 | Note! 
the websites have the possibility to download historical value as a date search menu. 18 | this class don't support downloading them. 19 | """ 20 | 21 | def __init__( 22 | self, 23 | chain, 24 | chain_id, 25 | url_perfix, 26 | download_postfix="/Download.aspx?FileNm=", 27 | domain="binaprojects.com/", 28 | folder_name=None, 29 | ): 30 | super().__init__( 31 | chain, 32 | chain_id, 33 | url=f"http://{url_perfix}.{domain}", 34 | aspx_page="MainIO_Hok.aspx", 35 | folder_name=folder_name, 36 | ) 37 | self.download_postfix = download_postfix 38 | 39 | def file_type_ids(self, file_types): 40 | """get the file type id""" 41 | file_type_mapping = { 42 | FileTypesFilters.STORE_FILE.name: 1, 43 | FileTypesFilters.PRICE_FILE.name: 2, 44 | FileTypesFilters.PROMO_FILE.name: 3, 45 | FileTypesFilters.PRICE_FULL_FILE.name: 4, 46 | FileTypesFilters.PROMO_FULL_FILE.name: 5, 47 | } 48 | if file_types is None or file_types == FileTypesFilters.all_types(): 49 | yield 0 50 | else: 51 | for file_type in file_types: 52 | if file_type not in file_type_mapping: 53 | raise ValueError(f"File type {file_type} not supported") 54 | yield file_type_mapping[file_type] 55 | 56 | def _build_query_url(self, query_params, base_urls): 57 | res = [] 58 | for base in base_urls: 59 | res.append( 60 | { 61 | "url": base + self.aspx_page + "?" + query_params, 62 | "method": "GET", 63 | } 64 | ) 65 | return res 66 | 67 | def _get_all_possible_query_string_params( 68 | self, files_types=None, store_id=None, when_date=None 69 | ): 70 | """get the arguments need to add to the url""" 71 | chains_urls = [] 72 | 73 | for c_id in self.get_chain_id(): 74 | chains_urls.append( 75 | { 76 | "_": f"{c_id}", 77 | "wReshet": "הכל", 78 | "WFileType": "", 79 | "WDate": "", 80 | "WStore": "", 81 | } 82 | ) 83 | 84 | # add file types to url 85 | if files_types: 86 | chains_urls_with_types = [] 87 | for files_type in self.file_type_ids(files_types): 88 | 89 | for chain_url in chains_urls: 90 | chains_urls_with_types.append( 91 | {**chain_url, "WFileType": files_type} 92 | ) 93 | chains_urls = chains_urls_with_types 94 | 95 | # add store id 96 | if store_id: 97 | for chains_url in chains_urls: 98 | chains_url["WStore"] = store_id 99 | 100 | # posting date 101 | if when_date and isinstance(when_date, datetime.datetime): 102 | for chains_url in chains_urls: 103 | chains_url["WDate"] = when_date.strftime("%d/%m/%Y") 104 | 105 | return [urllib.parse.urlencode(params) for params in chains_urls] 106 | 107 | def get_data_from_page(self, req_res): 108 | return json.loads(req_res.text) 109 | 110 | def get_href_from_entry(self, entry): 111 | """get download link for entry (tr)""" 112 | return self.download_postfix + entry["FileNm"] 113 | 114 | def get_file_name_no_ext_from_entry(self, entry): 115 | """get the file name without extensions from entey (tr)""" 116 | return entry.split(self.download_postfix)[-1].split(".")[0] 117 | 118 | def get_file_size_from_entry(self, entry): 119 | """ 120 | Extract file size from a JSON entry. 121 | Bina returns JSON objects, check for size field. 122 | Returns size in bytes, or None if not found. 
123 | """ 124 | # Bina don't support file size in the entry 125 | return None 126 | 127 | @url_connection_retry() 128 | def retrieve_file(self, file_link, file_save_path, timeout=30): 129 | response_content = self.session_with_cookies_by_chain( 130 | file_link, 131 | ) 132 | spath = json.loads(response_content.content) 133 | Logger.debug(f"Found spath: {spath}") 134 | 135 | url = spath[0]["SPath"] 136 | ext = file_link.split(".")[-1] 137 | 138 | url_retrieve(url, file_save_path + "." + ext, timeout=timeout) 139 | return file_save_path + "." + ext 140 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Custom License Agreement 2 | 3 | This License Agreement ("Agreement") is a legal agreement between Sefi Erlich ("Licensor") and any individual or entity ("Licensee" or "Contributor") who accesses, uses, or contributes to this repository. By accessing, using, or contributing to the Repository, you agree to be bound by the terms of this Agreement. 4 | 5 | 1. Grant of License for Non-Commercial Use 6 | 7 | 1.1 Non-Commercial Use License: The Licensor grants the Licensee a worldwide, royalty-free, non-exclusive, non-transferable license to use, reproduce, modify, and distribute the content of the Repository ("Licensed Material") for non-commercial purposes only, subject to the terms and conditions of this Agreement. 8 | 9 | 1.2 Attribution Requirement: When using or distributing the Licensed Material, the Licensee must provide appropriate credit to the Licensor by: 10 | - Citing the Licensor's name as specified. 11 | - Including a link to the Repository. 12 | - Indicating if changes were made to the Licensed Material. 13 | 14 | 1.3 No Commercial Use: Licensees are expressly prohibited from using the Licensed Material, in whole or in part, for any commercial purpose without prior written permission from the Licensor. 15 | 16 | 2. Reservation of Commercial Rights 17 | 18 | 2.1 Exclusive Commercial Rights: All commercial rights to the Licensed Material are exclusively reserved by the Licensor. The Licensor retains the sole right to use, reproduce, modify, distribute, and sublicense the Licensed Material for commercial purposes. 19 | 20 | 2.2 Requesting Commercial Permission: Parties interested in using the Licensed Material for commercial purposes must obtain explicit written consent from the Licensor. Requests should be directed to the contact information provided at the end of this Agreement. 21 | 22 | 3. Contributions 23 | 24 | 3.1 Contributor License Grant: By submitting any content ("Contribution") to the Repository, the Contributor grants the Licensor a non-exclusive, perpetual, irrevocable, worldwide, royalty-free license to use, reproduce, modify, distribute, sublicense, and create derivative works from the Contribution for any purpose, including commercial purposes. 25 | 26 | 3.2 Warranty of Originality: Contributors represent and warrant that their Contributions are original works and do not infringe upon the intellectual property rights of any third party. 27 | 28 | 3.3 No Commercial Rights for Contributors: Contributors acknowledge that they have no rights to use the Licensed Material for commercial purposes. 29 | 30 | 4. Restrictions 31 | 32 | 4.1 Prohibition of Commercial Exploitation: Licensees and Contributors may not: 33 | - Use the Licensed Material or any Contributions for commercial purposes. 
34 | - Distribute the Licensed Material or any Contributions as part of any commercial product or service. 35 | - Sublicense the Licensed Material or any Contributions for commercial use. 36 | 37 | 4.2 No Endorsement: Licensees and Contributors may not imply endorsement or affiliation with the Licensor without explicit written permission. 38 | 39 | 5. Term and Termination 40 | 41 | 5.1 Term: This Agreement is effective upon acceptance and continues unless terminated as provided herein. 42 | 43 | 5.2 Termination for Breach: The Licensor may terminate this Agreement immediately if the Licensee or Contributor breaches any of its terms. 44 | 45 | 5.3 Effect of Termination: Upon termination, all rights granted under this Agreement cease, and the Licensee or Contributor must destroy all copies of the Licensed Material in their possession. 46 | 47 | 5.4 Survival: Sections 2, 3, 4, 6, and 7 survive termination of this Agreement. 48 | 49 | 6. Disclaimer of Warranties and Limitation of Liability 50 | 51 | 6.1 As-Is Basis: The Licensed Material and any Contributions are provided "AS IS," without warranties or conditions of any kind, either express or implied. 52 | 53 | 6.2 Disclaimer: The Licensor expressly disclaims all warranties, including but not limited to warranties of title, non-infringement, merchantability, and fitness for a particular purpose. 54 | 55 | 6.3 Limitation of Liability: In no event shall the Licensor be liable for any direct, indirect, incidental, special, exemplary, or consequential damages arising in any way out of the use of the Licensed Material or Contributions. 56 | 57 | 7. General Provisions 58 | 59 | 7.1 Entire Agreement: This Agreement constitutes the entire agreement between the parties concerning the subject matter hereof and supersedes all prior agreements and understandings. 60 | 61 | 7.2 Modification: The Licensor reserves the right to modify this Agreement for new versions of the Licensed Material. Such modifications will not apply retroactively to any version of the Licensed Material you have already obtained. 62 | 63 | 7.3 Severability: If any provision of this Agreement is found to be unenforceable, the remainder shall remain in full force and effect. 64 | 65 | 7.4 Waiver: Failure to enforce any provision of this Agreement shall not constitute a waiver of such provision. 66 | 67 | 7.5 Governing Law: This Agreement shall be governed by and construed in accordance with the laws of Israel, without regard to its conflict of law principles. 68 | 69 | 7.6 Dispute Resolution: Any disputes arising under or in connection with this Agreement shall be subject to the exclusive jurisdiction of the courts located in Israel. 70 | 71 | 8. Acceptance by accessing, using, or contributing to the Repository, you acknowledge that you have read, understood, and agree to be bound by the terms and conditions of this Agreement. 
72 | 73 | Contact Information 74 | 75 | For any questions or requests regarding this Agreement, please contact: 76 | 77 | Name: Sefi Erlich 78 | Email: erlichsefi@gmail.com -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/tests/test_all.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.scrappers_factory import ScraperFactory 2 | from .test_cases import make_test_case 3 | 4 | 5 | class BareketTestCase(make_test_case(ScraperFactory.BAREKET, 5)): 6 | """Test case for ScraperFactory.BAREKET.""" 7 | 8 | 9 | class YaynotBitanAndCarrefourTestCase( 10 | make_test_case(ScraperFactory.YAYNO_BITAN_AND_CARREFOUR, 9032) 11 | ): 12 | """Test case for ScraperFactory.YAYNO_BITAN_AND_CARREFOUR.""" 13 | 14 | 15 | class CofixTestCase(make_test_case(ScraperFactory.COFIX, 299)): 16 | """Test case for ScraperFactory.COFIX.""" 17 | 18 | 19 | # class CityMarketGivatayimTestCase( 20 | # make_test_case(ScraperFactory.CITY_MARKET_GIVATAYIM, 1) 21 | # ): 22 | # """Test case for CityMarketGivatay""" 23 | 24 | 25 | # class CityMarketKirtatOnoTestCase( 26 | # make_test_case(ScraperFactory.CITY_MARKET_KIRYATONO, 1) 27 | # ): 28 | # """Test case for CityMarketKirtatOno""" 29 | 30 | 31 | class CityMarketKiryatGatTestCase( 32 | make_test_case(ScraperFactory.CITY_MARKET_KIRYATGAT, 1) 33 | ): 34 | """Test case for CityMarketKiryatGat""" 35 | 36 | 37 | class CityMarketShopsTestCase(make_test_case(ScraperFactory.CITY_MARKET_SHOPS, 1)): 38 | """Test case for CityMarketShops""" 39 | 40 | 41 | class DorAlonTestCase(make_test_case(ScraperFactory.DOR_ALON, 501)): 42 | """Test case for ScraperFactory.DOR_ALON.""" 43 | 44 | 45 | class GoodPharmTestCase(make_test_case(ScraperFactory.GOOD_PHARM, 952)): 46 | """Test case for ScraperFactory.GOOD_PHARM.""" 47 | 48 | 49 | class HaziHinamTestCase(make_test_case(ScraperFactory.HAZI_HINAM, 206)): 50 | """Test case for ScraperFactory.HAZI_HINAM.""" 51 | 52 | 53 | class HetCohen(make_test_case(ScraperFactory.HET_COHEN, 45)): 54 | """Test case for ScraperFactory.HET_COHEN.""" 55 | 56 | 57 | class KeshetTestCase(make_test_case(ScraperFactory.KESHET, 5)): 58 | """Test case for ScraperFactory.KESHET.""" 59 | 60 | 61 | class KingStoreTestCase(make_test_case(ScraperFactory.KING_STORE, 334)): 62 | """Test case for ScraperFactory.KING_STORE.""" 63 | 64 | 65 | class Maayan2000TestCase(make_test_case(ScraperFactory.MAAYAN_2000, 60)): 66 | """Test case for ScraperFactory.MAAYAN_2000.""" 67 | 68 | 69 | class MahsaniAShukTestCase(make_test_case(ScraperFactory.MAHSANI_ASHUK, 98)): 70 | """Test case for ScraperFactory.MAHSANI_ASHUK.""" 71 | 72 | 73 | # class MegaTestCase(make_test_case(ScraperFactory.MEGA, 37)): 74 | # """Test case for ScraperFactory.MEGA.""" 75 | 76 | 77 | class NetivHasefTestCase(make_test_case(ScraperFactory.NETIV_HASED, 1)): 78 | """Test case for ScraperFactory.NETIV_HASED.""" 79 | 80 | 81 | class MeshnatYosef1TestCase(make_test_case(ScraperFactory.MESHMAT_YOSEF_1, 1)): 82 | """Test case for ScraperFactory.MESHMAT_YOSEF_1.""" 83 | 84 | 85 | class MeshnatYosef2TestCase(make_test_case(ScraperFactory.MESHMAT_YOSEF_2, 1)): 86 | """Test case for ScraperFactory.MESHMAT_YOSEF_2.""" 87 | 88 | 89 | class OsheradTestCase(make_test_case(ScraperFactory.OSHER_AD, 1)): 90 | """Test case for ScraperFactory.OSHER_AD.""" 91 | 92 | 93 | class PolizerTestCase(make_test_case(ScraperFactory.POLIZER, 2)): 94 | """Test case for ScraperFactory.POLIZER.""" 95 | 96 | 97 | class 
RamiLevyTestCase(make_test_case(ScraperFactory.RAMI_LEVY, 1)): 98 | """Test case for ScraperFactory.RAMI_LEVY.""" 99 | 100 | 101 | class SalachDabachTestCase(make_test_case(ScraperFactory.SALACH_DABACH, 4)): 102 | """Test case for ScraperFactory.SALACH_DABACH.""" 103 | 104 | 105 | class ShefaBarcartAshemTestCase(make_test_case(ScraperFactory.SHEFA_BARCART_ASHEM, 42)): 106 | """Test case for ScraperFactory.SHEFA_BARCART_ASHEM.""" 107 | 108 | 109 | class ShufersalTestCase(make_test_case(ScraperFactory.SHUFERSAL, 176)): 110 | """Test case for ScraperFactory.SHUFERSAL.""" 111 | 112 | 113 | class ShukAhirTestCase(make_test_case(ScraperFactory.SHUK_AHIR, 4)): 114 | """Test case for ScraperFactory.SHUK_AHIR.""" 115 | 116 | 117 | class StopMarketTestCase(make_test_case(ScraperFactory.STOP_MARKET, 5)): 118 | """Test case for ScraperFactory.STOP_MARKET.""" 119 | 120 | 121 | class SuperPharmTestCase(make_test_case(ScraperFactory.SUPER_PHARM, 224)): 122 | """Test case for ScraperFactory.SUPER_PHARM.""" 123 | 124 | 125 | class SuperYudaTestCase(make_test_case(ScraperFactory.SUPER_YUDA, 204)): 126 | """Test case for ScraperFactory.SUPER_YUDA.""" 127 | 128 | 129 | class SuperSapirTestCase(make_test_case(ScraperFactory.SUPER_SAPIR, 44)): 130 | """Test case for ScraperFactory.SUPER_SAPIR.""" 131 | 132 | 133 | class FreshMarketAndSuperDoshTestCase( 134 | make_test_case(ScraperFactory.FRESH_MARKET_AND_SUPER_DOSH, 1) 135 | ): 136 | """Test case for ScraperFactory.FRESH_MARKET_AND_SUPER_DOSH.""" 137 | 138 | 139 | class QuikTestCase(make_test_case(ScraperFactory.QUIK, None)): 140 | """Test case for ScraperFactory.QUIK.""" 141 | 142 | 143 | class TivTaamTestCase(make_test_case(ScraperFactory.TIV_TAAM, 3)): 144 | """Test case for ScraperFactory.TIV_TAAM.""" 145 | 146 | 147 | class VictoryTestCase(make_test_case(ScraperFactory.VICTORY, 1)): 148 | """Test case for ScraperFactory.VICTORY.""" 149 | 150 | 151 | class YellowTestCase(make_test_case(ScraperFactory.YELLOW, 1272)): 152 | """Test case for ScraperFactory.YELLOW.""" 153 | 154 | 155 | class YohananofTestCase(make_test_case(ScraperFactory.YOHANANOF, 1)): 156 | """Test case for ScraperFactory.YOHANANOF.""" 157 | 158 | 159 | class ZolVeBegadolTestCase(make_test_case(ScraperFactory.ZOL_VEBEGADOL, 4)): 160 | """Test case for ScraperFactory.ZOL_VEBEGADOL.""" 161 | 162 | 163 | class WoltTestCase(make_test_case(ScraperFactory.WOLT, 0)): 164 | """Test case for ScraperFactory.Wolt.""" 165 | -------------------------------------------------------------------------------- /il_supermarket_scarper/engines/matrix.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | from il_supermarket_scarper.utils import Logger 3 | from .apsx import Aspx 4 | 5 | 6 | class Matrix(Aspx): 7 | """scraper for all matrix base site. 
8 | (support adveanced search: follow the instrucation the page)""" 9 | 10 | utilize_date_param = False 11 | 12 | def __init__( 13 | self, 14 | chain, 15 | chain_id, 16 | url="https://laibcatalog.co.il/", 17 | aspx_page="NBCompetitionRegulations.aspx", 18 | chain_hebrew_name=None, 19 | folder_name=None, 20 | ): 21 | super().__init__(chain, chain_id, url, aspx_page, folder_name=folder_name) 22 | self.chain_hebrew_name = chain_hebrew_name 23 | 24 | # def get_file_types_id(self, files_types=None): 25 | # """get the file type id""" 26 | # if files_types is None: 27 | # return "all" 28 | 29 | # types = [] 30 | # for ftype in files_types: 31 | # if ftype == FileTypesFilters.STORE_FILE.name: 32 | # types.append("storefull") 33 | # if ftype == FileTypesFilters.PRICE_FILE.name: 34 | # types.append("price") 35 | # if ftype == FileTypesFilters.PROMO_FILE.name: 36 | # types.append("promo") 37 | # if ftype == FileTypesFilters.PRICE_FULL_FILE.name: 38 | # types.append("pricefull") 39 | # if ftype == FileTypesFilters.PROMO_FULL_FILE.name: 40 | # types.append("promofull") 41 | # return types 42 | 43 | # def get_when(self, when_date): 44 | # """get the when date""" 45 | # if when_date is None: 46 | # when_date = _now() 47 | # return when_date.strftime("%d/%m/%Y") 48 | 49 | # def get_chain_n_stores__id(self, store_id=None, c_id=None): 50 | # """get the store id""" 51 | # if store_id is None: 52 | # chain_id = str(c_id) # + "001" 53 | # store_id = "-1" 54 | # else: 55 | # chain_id = str(c_id) 56 | # store_id = str(c_id) + "001" + str(store_id).zfill(3) 57 | # return chain_id, store_id 58 | 59 | def _build_query_url(self, query_params, base_urls): 60 | res = [] 61 | for base in base_urls: 62 | res.append( 63 | { 64 | "method": "GET", 65 | "url": base, 66 | # "body": query_params, 67 | } 68 | ) 69 | return res 70 | 71 | def _get_all_possible_query_string_params( 72 | self, files_types=None, store_id=None, when_date=None 73 | ): 74 | """get the arguments need to add to the url""" 75 | 76 | return [{}] 77 | # post_body = [] 78 | # if isinstance(self.chain_id, list): 79 | # for c_id in self.chain_id: 80 | # chain_id, store_id = self.get_chain_n_stores__id( 81 | # store_id=store_id, c_id=c_id 82 | # ) 83 | # post_body.append( 84 | # { 85 | 86 | # "ctl00$TextArea": "", 87 | # "ctl00$MainContent$chain": chain_id, 88 | # "ctl00$MainContent$subChain": "-1", 89 | # "ctl00$MainContent$branch": store_id, 90 | # "ctl00$MainContent$txtDate": self.get_when(when_date=when_date), 91 | # "ctl00$MainContent$fileType": "all", 92 | # # "ctl00$MainContent$btnSearch": "חיפוש", 93 | # } 94 | # ) 95 | # else: 96 | # chain_id, store_id = self.get_chain_n_stores__id( 97 | # store_id=store_id, c_id=self.chain_id 98 | # ) 99 | # post_body.append( 100 | # { 101 | # "ctl00$TextArea": "", 102 | # "ctl00$MainContent$chain": chain_id, 103 | # "ctl00$MainContent$subChain": "-1", 104 | # "ctl00$MainContent$branch": store_id, 105 | # "ctl00$MainContent$txtDate": self.get_when(when_date=when_date), 106 | # "ctl00$MainContent$fileType": "all", 107 | # "ctl00$MainContent$btnSearch": "חיפוש", 108 | # } 109 | # ) 110 | 111 | # # add file types to url 112 | # if files_types: 113 | # chains_urls_with_types = [] 114 | # for files_type in self.get_file_types_id(files_types=files_types): 115 | # for chain_url in post_body: 116 | # chain_url["ctl00$MainContent$fileType"] = files_type 117 | # chains_urls_with_types.append(chain_url) 118 | # post_body = chains_urls_with_types 119 | 120 | # return post_body 121 | 122 | def get_href_from_entry(self, entry): 
123 | """get download link for entry (tr)""" 124 | return entry.a.attrs["href"] 125 | 126 | def get_file_name_no_ext_from_entry(self, entry): 127 | """get the file name without extensions from entey (tr)""" 128 | return entry.split("/")[-1].split(".gz")[0].split(".")[0] 129 | 130 | def get_data_from_page(self, req_res): 131 | soup = BeautifulSoup(req_res.text, features="lxml") 132 | all_trs = list(soup.find_all("tr"))[1:] # skip title 133 | 134 | Logger.info(f"Before filtring names found {len(all_trs)} entries") 135 | if self.chain_hebrew_name: 136 | all_trs = list( 137 | filter(lambda x: x and self.chain_hebrew_name in str(x), all_trs) 138 | ) 139 | Logger.info(f"After filtering names found {len(all_trs)} entries") 140 | return all_trs 141 | -------------------------------------------------------------------------------- /il_supermarket_scarper/utils/scraper_status.py: -------------------------------------------------------------------------------- 1 | import os 2 | import traceback 3 | 4 | from .logger import Logger 5 | from .status import log_folder_details 6 | from .databases import JsonDataBase 7 | from .status import _now, get_output_folder 8 | from .lock_utils import lock_by_string 9 | 10 | 11 | class ScraperStatus: 12 | """A class that abstracts the database interface for scraper status.""" 13 | 14 | STARTED = "started" 15 | COLLECTED = "collected" 16 | DOWNLOADED = "downloaded" 17 | FAILED = "fail" 18 | ESTIMATED_SIZE = "estimated_size" 19 | VERIFIED_DOWNLOADS = "verified_downloads" 20 | 21 | def __init__(self, database_name, base_path, folder_name=None) -> None: 22 | self.database = JsonDataBase( 23 | database_name, get_output_folder(base_path, folder_name=folder_name) 24 | ) 25 | self.task_id = _now().strftime("%Y%m%d%H%M%S") 26 | self.filter_between_itrations = False 27 | 28 | @lock_by_string() 29 | def on_scraping_start(self, limit, files_types, **additional_info): 30 | """Report that scraping has started.""" 31 | self._insert_an_update( 32 | ScraperStatus.STARTED, 33 | limit=limit, 34 | files_requested=files_types, 35 | **additional_info, 36 | ) 37 | 38 | def enable_collection_status(self): 39 | """enable data collection to status files""" 40 | self.database.enable_collection_status() 41 | 42 | def enable_aggregation_between_runs(self): 43 | """allow tracking the downloaded file and don't downloading again if downloaded""" 44 | self.filter_between_itrations = True 45 | 46 | @lock_by_string() 47 | def on_collected_details( 48 | self, 49 | file_name_collected_from_site, 50 | links_collected_from_site="", 51 | **additional_info, 52 | ): 53 | """Report that file details have been collected.""" 54 | self._insert_an_update( 55 | ScraperStatus.COLLECTED, 56 | file_name_collected_from_site=file_name_collected_from_site, 57 | links_collected_from_site=links_collected_from_site, 58 | **additional_info, 59 | ) 60 | 61 | @lock_by_string() 62 | def on_download_completed(self, **additional_info): 63 | """Report that the file has been downloaded.""" 64 | self._insert_an_update(ScraperStatus.DOWNLOADED, **additional_info) 65 | self._add_downloaded_files_to_list(**additional_info) 66 | 67 | def filter_already_downloaded( 68 | self, storage_path, files_names_to_scrape, filelist, by_function=lambda x: x 69 | ): 70 | """Filter files already existing in long-term memory or previously downloaded.""" 71 | if self.database.is_collection_enabled() and self.filter_between_itrations: 72 | new_filelist = [] 73 | for file in filelist: 74 | if not self.database.find_document( 75 | self.VERIFIED_DOWNLOADS, 
{"file_name": by_function(file)} 76 | ): 77 | new_filelist.append(file) 78 | else: 79 | Logger.debug( 80 | f"Filtered file {file} since it was already downloaded and extracted" 81 | ) 82 | return new_filelist 83 | 84 | # Fallback: filter according to the disk 85 | exits_on_disk = os.listdir(storage_path) 86 | 87 | if files_names_to_scrape: 88 | # Delete any files we want to retry downloading 89 | for file in exits_on_disk: 90 | if file.split(".")[0] in files_names_to_scrape: 91 | os.remove(os.path.join(storage_path, file)) 92 | 93 | # Filter the files to download 94 | filelist = list( 95 | filter(lambda x: by_function(x) in files_names_to_scrape, filelist) 96 | ) 97 | 98 | return list(filter(lambda x: by_function(x) not in exits_on_disk, filelist)) 99 | 100 | def _add_downloaded_files_to_list(self, results, **_): 101 | """Add downloaded files to the MongoDB collection.""" 102 | if self.database.is_collection_enabled(): 103 | when = _now() 104 | 105 | documents = [] 106 | for res in results: 107 | if res["extract_succefully"]: 108 | documents.append( 109 | {"file_name": res["file_name"], "when": when}, 110 | ) 111 | self.database.insert_documents(self.VERIFIED_DOWNLOADS, documents) 112 | 113 | @lock_by_string() 114 | def on_scrape_completed(self, folder_name, completed_successfully=True): 115 | """Report when scraping is completed.""" 116 | self._insert_an_update( 117 | ScraperStatus.ESTIMATED_SIZE, 118 | folder_size=log_folder_details(folder_name), 119 | completed_successfully=completed_successfully, 120 | ) 121 | 122 | @lock_by_string() 123 | def on_download_fail(self, execption, download_urls=None, file_names=None): 124 | """report when the scraping in failed""" 125 | self._insert_an_update( 126 | ScraperStatus.FAILED, 127 | execption=str(execption), 128 | traceback=traceback.format_exc(), 129 | download_urls=download_urls if download_urls else [], 130 | file_names=file_names if file_names else [], 131 | ) 132 | 133 | def _insert_an_update(self, status, **additional_info): 134 | """Insert an update into the MongoDB collection.""" 135 | document = { 136 | "status": status, 137 | "when": _now(), 138 | **additional_info, 139 | } 140 | self.database.insert_document(self.task_id, document) 141 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Israel Supermarket Scraper: Clients to download the data published by the supermarkets. 2 | ======================================= 3 | This is a scraper for ALL the supermarket chains listed in the GOV.IL site. 
4 | 5 | שקיפות מחירים (השוואת מחירים) - https://www.gov.il/he/departments/legalInfo/cpfta_prices_regulations 6 | 7 | 8 | 9 | 10 | [![Unit & Integration Tests](https://github.com/OpenIsraeliSupermarkets/israeli-supermarket-scarpers/actions/workflows/test-suite.yml/badge.svg?event=push)](https://github.com/OpenIsraeliSupermarkets/israeli-supermarket-scarpers/actions/workflows/test-suite.yml) 11 | [![CodeQL](https://github.com/OpenIsraeliSupermarkets/israeli-supermarket-scarpers/actions/workflows/codeql.yml/badge.svg)](https://github.com/OpenIsraeliSupermarkets/israeli-supermarket-scarpers/actions/workflows/codeql.yml) 12 | [![Pylint](https://github.com/OpenIsraeliSupermarkets/israeli-supermarket-scarpers/actions/workflows/pylint.yml/badge.svg)](https://github.com/OpenIsraeliSupermarkets/israeli-supermarket-scarpers/actions/workflows/pylint.yml) 13 | [![Publish Docker image](https://github.com/OpenIsraeliSupermarkets/israeli-supermarket-scarpers/actions/workflows/docker-publish.yml/badge.svg)](https://github.com/OpenIsraeliSupermarkets/israeli-supermarket-scarpers/actions/workflows/docker-publish.yml) 14 | [![Upload Python Package](https://github.com/OpenIsraeliSupermarkets/israeli-supermarket-scarpers/actions/workflows/python-publish.yml/badge.svg)](https://github.com/OpenIsraeliSupermarkets/israeli-supermarket-scarpers/actions/workflows/python-publish.yml) 15 | 16 | ## 🤗 Want to support my work? 17 |

18 | Buy Me A Coffee 19 | 20 |

21 | 22 | Daily Automatic Testing 23 | ---- 24 | The test suite is scheduled to run daily, so you can see whether the supermarket chains have changed something in their interface that would stop the package from working properly. 25 | 26 | Status: [![Scheduled Tests](https://github.com/OpenIsraeliSupermarkets/israeli-supermarket-scarpers/actions/workflows/test-suite.yml/badge.svg?event=schedule)](https://github.com/OpenIsraeliSupermarkets/israeli-supermarket-scarpers/actions/workflows/test-suite.yml) 27 | 28 | Notice: 29 | - Bareket and Quik are flaky! Their failures will not fail the test suite, but you can still use them. 30 | - Some of the scraped sites are blocked from being accessed from outside of Israel. 31 | 32 | -------- 33 | 34 | 35 | 36 | Got a question? 37 | --------------- 38 | 39 | You can email me at erlichsefi@gmail.com 40 | 41 | If you think you've found a bug: 42 | 43 | - Check the [issue tracker](https://github.com/OpenIsraeliSupermarkets/israeli-supermarket-scarpers/issues) to see if 44 | it has already been reported. 45 | - Please consider solving the issue yourself and opening a pull request. 46 | 47 | What is il_supermarket_scarper? 48 | ------------- 49 | 50 | There are a lot of projects on GitHub trying to scrape the supermarket data, but most of them are unstable or haven't been updated in a while. It's about time there was one codebase that does the job completely. 51 | 52 | You only need to run the following code to get all the data currently shared by the supermarkets. 53 | 54 | ```python 55 | from il_supermarket_scarper import ScarpingTask 56 | 57 | scraper = ScarpingTask() 58 | scraper.start() 59 | ``` 60 | 61 | 62 | Please note! 63 | Since new files are constantly uploaded by the supermarkets to their sites, a single run will only get the current snapshot. To keep collecting data, you will need to run this code periodically to pick up the newly uploaded files. 64 | 65 | Quick start 66 | ----------- 67 | 68 | il_supermarket_scarper can be installed using pip: 69 | 70 | python3 -m pip install il-supermarket-scraper 71 | 72 | If you want to run the latest version of the code, you can install it from the 73 | repo directly: 74 | 75 | python3 -m pip install -U git+https://github.com/OpenIsraeliSupermarkets/israeli-supermarket-scarpers.git 76 | # or if you don't have 'git' installed 77 | python3 -m pip install -U https://github.com/OpenIsraeliSupermarkets/israeli-supermarket-scarpers/archive/main.zip 78 | 79 | 80 | 81 | Running Docker 82 | ----------- 83 | The Docker image is designed to be re-run against the same configuration: in every iteration the scraper collects the files available for download and checks whether each file already exists before fetching it, either by scanning the dump folder or by checking the mongo/status files. 84 | 85 | 86 | Build it yourself: 87 | 88 | docker build -t erlichsefi/israeli-supermarket-scarpers --target prod .
89 | 90 | or pull the existing image from docker hub: 91 | 92 | docker pull erlichsefi/israeli-supermarket-scarpers:latest 93 | 94 | 95 | Then running it using: 96 | 97 | 98 | docker run -v "./dumps:/usr/src/app/dumps" \ 99 | -e ENABLED_SCRAPERS="BAREKET,YAYNO_BITAN" \ # see: il_supermarket_scarper/scrappers_factory.py 100 | -e ENABLED_FILE_TYPES="STORE_FILE" \ # see: il_supermarket_scarper/utils/file_types.py 101 | -e LIMIT=1 \ # number of files you would like to download (remove for unlimited) 102 | -e TODAY="2024-10-23 14:35" \ # the date to download data from 103 | erlichsefi/israeli-supermarket-scarpers 104 | 105 | 106 | 107 | Contributing 108 | ------------ 109 | 110 | Help in testing, development, documentation and other tasks is 111 | highly appreciated and useful to the project. There are tasks for 112 | contributors of all experience levels. 113 | 114 | If you need help getting started, don't hesitate to contact me. 115 | 116 | 117 | Development status 118 | ------------------ 119 | 120 | IL SuperMarket Scraper is beta software, as far as i see devlopment stoped until new issues will be found. 121 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers_factory.py: -------------------------------------------------------------------------------- 1 | import random 2 | import os 3 | from enum import Enum 4 | import il_supermarket_scarper.scrappers as all_scrappers 5 | from il_supermarket_scarper.scraper_stability import ScraperStability 6 | 7 | 8 | class ScraperFactory(Enum): 9 | """all scrapers avaliabe""" 10 | 11 | BAREKET = all_scrappers.Bareket # עוף והודו ברקת - חנות המפעל בע"מ 12 | YAYNO_BITAN_AND_CARREFOUR = all_scrappers.YaynotBitanAndCarrefour # יינות ביתן 13 | # YAYNO_BITAN = all_scrappers.YaynotBitan # יינות ביתן 14 | COFIX = all_scrappers.Cofix # קופיקס בע"מ 15 | # CITY_MARKET_GIVATAYIM = all_scrappers.CityMarketGivatayim 16 | # CITY_MARKET_KIRYATONO = all_scrappers.CityMarketKirtatOno 17 | CITY_MARKET_KIRYATGAT = all_scrappers.CityMarketKiryatGat # סיטי מרקט 18 | CITY_MARKET_SHOPS = all_scrappers.CityMarketShops # סיטי מרקט 19 | DOR_ALON = all_scrappers.DorAlon # דור אלון ניהול מתחמים קמעונאיים בע"מ 20 | GOOD_PHARM = all_scrappers.GoodPharm # גוד פארם בע"מ 21 | HAZI_HINAM = all_scrappers.HaziHinam # כל בו חצי חינם בע"מ 22 | HET_COHEN = all_scrappers.HetCohen # ח. כהן סוכנות מזון ומשקאות בע"מ 23 | KESHET = all_scrappers.Keshet # קשת טעמים בע"מ 24 | KING_STORE = all_scrappers.KingStore # אלמשהדאוי קינג סטור בע"מ 25 | MAAYAN_2000 = all_scrappers.Maayan2000 # ג.מ מעיין אלפיים (07) בע"מ 26 | MAHSANI_ASHUK = all_scrappers.MahsaniAShuk # כ.נ מחסני השוק בע"מ 27 | # MEGA = all_scrappers.Mega # קרפור \ מגה 28 | NETIV_HASED = all_scrappers.NetivHased # נתיב החסד - סופר חסד בע"מ (כולל ברכל) 29 | MESHMAT_YOSEF_1 = ( 30 | all_scrappers.MeshnatYosef1 31 | ) # קיי.טי. יבוא ושיווק בע"מ (משנת יוסף) 32 | MESHMAT_YOSEF_2 = ( 33 | all_scrappers.MeshnatYosef2 34 | ) # קיי.טי. 
יבוא ושיווק בע"מ (משנת יוסף) 35 | OSHER_AD = all_scrappers.Osherad # מרב-מזון כל בע"מ (אושר עד) 36 | POLIZER = all_scrappers.Polizer # פוליצר חדרה (1982) בע"מ 37 | RAMI_LEVY = all_scrappers.RamiLevy # רשת חנויות רמי לוי שיווק השקמה 2006 בע"מ 38 | SALACH_DABACH = all_scrappers.SalachDabach # סאלח דבאח ובניו בע"מ 39 | SHEFA_BARCART_ASHEM = all_scrappers.ShefaBarcartAshem # שפע ברכת השם בע"מ 40 | SHUFERSAL = all_scrappers.Shufersal # שופרסל בע"מ (כולל רשת BE) 41 | SHUK_AHIR = all_scrappers.ShukAhir # שוק העיר (ט.ע.מ.ס) בע"מ 42 | STOP_MARKET = all_scrappers.StopMarket # סטופ מרקט בע"מ 43 | SUPER_PHARM = all_scrappers.SuperPharm # סופר פארם (ישראל) בע"מ 44 | SUPER_YUDA = all_scrappers.SuperYuda # סופר יודה 45 | SUPER_SAPIR = all_scrappers.SuperSapir # סופר ספיר בע"מ 46 | FRESH_MARKET_AND_SUPER_DOSH = all_scrappers.FreshMarketAndSuperDosh # פרשמרקט 47 | QUIK = all_scrappers.Quik # קוויק 48 | TIV_TAAM = all_scrappers.TivTaam # טיב טעם רשתות בע"מ 49 | VICTORY = all_scrappers.Victory # ויקטורי רשת סופרמרקטים בע"מ 50 | YELLOW = all_scrappers.Yellow # יילו 51 | YOHANANOF = all_scrappers.Yohananof # מ. יוחננוף ובניו (1988) בע"מ 52 | ZOL_VEBEGADOL = all_scrappers.ZolVeBegadol # זול ובגדול בע"מ 53 | WOLT = all_scrappers.Wolt # וולט אופריישנס סרוויסס ישראל בע"מ 54 | 55 | @classmethod 56 | def all_listed_scrappers(cls): 57 | """get all the scarpers and filter disabled scrapers""" 58 | return list(member.name for member in cls) 59 | 60 | @classmethod 61 | def all_active(cls, limit=None, files_types=None, when_date=None): 62 | """get all the scarpers and filter disabled scrapers""" 63 | return ( 64 | member 65 | for member in cls 66 | if cls.is_scraper_enabled( 67 | member, 68 | limit=limit, 69 | files_types=files_types, 70 | when_date=when_date, 71 | ) 72 | ) 73 | 74 | @classmethod 75 | def sample(cls, n=1): 76 | """sample n from the scrappers""" 77 | return random.sample(cls.all_scrapers_name(), n) 78 | 79 | @classmethod 80 | def all_scrapers(cls, limit=None, files_types=None, when_date=None): 81 | """list all scrapers possible to use""" 82 | return [ 83 | e.value 84 | for e in ScraperFactory.all_active( 85 | limit=limit, files_types=files_types, when_date=when_date 86 | ) 87 | ] 88 | 89 | @classmethod 90 | def all_scrapers_name(cls, limit=None, files_types=None, when_date=None): 91 | """get the class name of all listed scrapers""" 92 | return [ 93 | e.name 94 | for e in ScraperFactory.all_active( 95 | limit=limit, files_types=files_types, when_date=when_date 96 | ) 97 | ] 98 | 99 | @classmethod 100 | def get(cls, class_name, limit=None, files_types=None, when_date=None): 101 | """get a scraper by class name""" 102 | 103 | enum = None 104 | if isinstance(class_name, ScraperFactory): 105 | enum = class_name 106 | elif class_name in cls.all_scrapers_name(): 107 | enum = getattr(ScraperFactory, class_name) 108 | 109 | if enum is None: 110 | raise ValueError(f"class_names {class_name} not found") 111 | 112 | if not cls.is_scraper_enabled( 113 | enum, limit=limit, files_types=files_types, when_date=when_date 114 | ): 115 | return None 116 | return enum.value 117 | 118 | @classmethod 119 | def is_scraper_enabled(cls, enum, limit=None, files_types=None, when_date=None): 120 | """get scraper value base on the enum value, if it disabled, return None""" 121 | env_var_value = os.environ.get("DISABLED_SCRAPPERS") 122 | if env_var_value is not None: 123 | disabled_scrappers = list(map(str.strip, env_var_value.split(","))) 124 | if enum.name in disabled_scrappers: 125 | return False 126 | # 127 | if 
ScraperStability.is_validate_scraper_found_no_files( 128 | enum.name, 129 | limit=limit, 130 | files_types=files_types, 131 | when_date=when_date, 132 | utilize_date_param=enum.value.utilize_date_param, 133 | ): 134 | return False 135 | return True 136 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scraper_stability.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=arguments-differ,arguments-renamed 2 | from enum import Enum 3 | from il_supermarket_scarper.utils import ( 4 | _is_saturday_in_israel, 5 | _now, 6 | datetime_in_tlv, 7 | FileTypesFilters, 8 | hour_files_expected_to_be_accassible, 9 | ) 10 | 11 | 12 | class FullyStable: 13 | """fully stable is stablity""" 14 | 15 | @classmethod 16 | def executes_between_midnight_and_morning_and_requested_today( 17 | cls, 18 | when_date=None, 19 | utilize_date_param=False, 20 | ): 21 | """it is stable if the execution is between midnight 22 | and morning and the requested date is today fails""" 23 | execution_time = _now() 24 | return ( 25 | when_date is not None 26 | and execution_time.hour >= 0 27 | and execution_time.hour < hour_files_expected_to_be_accassible() 28 | and (not utilize_date_param or when_date.date() == execution_time.date()) 29 | ) 30 | 31 | @classmethod 32 | def executed_after_date(cls, when_date, date): 33 | """check if executed after date""" 34 | return when_date > date 35 | 36 | @classmethod 37 | def failire_valid(cls, when_date=None, utilize_date_param=True, **_): 38 | """return true if the parser is stble""" 39 | 40 | return cls.executes_between_midnight_and_morning_and_requested_today( 41 | when_date=when_date, utilize_date_param=utilize_date_param 42 | ) 43 | 44 | 45 | class SuperFlaky(FullyStable): 46 | """super flaky is stablity""" 47 | 48 | @classmethod 49 | def failire_valid(cls, **_): 50 | return True 51 | 52 | 53 | class NetivHased(FullyStable): 54 | """Netiv Hased is stablity""" 55 | 56 | @classmethod 57 | def executed_in_saturday(cls, when_date=None, **_): 58 | """if the execution is in saturday""" 59 | return _is_saturday_in_israel(when_date) 60 | 61 | @classmethod 62 | def failire_valid(cls, when_date=None, utilize_date_param=False, **_): 63 | """return true if the parser is stble""" 64 | return super().failire_valid( 65 | when_date=when_date, utilize_date_param=utilize_date_param 66 | ) or cls.executed_in_saturday(when_date=when_date) 67 | 68 | 69 | class CityMarketGivataim(FullyStable): 70 | """Netiv Hased is stablity""" 71 | 72 | @classmethod 73 | def searching_for_update_promo(cls, files_types=None, **_): 74 | """if the execution is in saturday""" 75 | return files_types and files_types == [FileTypesFilters.PROMO_FILE.name] 76 | 77 | @classmethod 78 | def failire_valid( 79 | cls, when_date=None, files_types=None, utilize_date_param=True, **_ 80 | ): 81 | """return true if the parser is stble""" 82 | return ( 83 | super().failire_valid(when_date=when_date) 84 | or cls.searching_for_update_promo(files_types=files_types) 85 | or when_date is not None 86 | and cls.executed_after_date( 87 | when_date=when_date, 88 | date=datetime_in_tlv( 89 | year=2024, month=11, day=5, hour=0, minute=0, second=0 90 | ), 91 | ) 92 | ) 93 | 94 | 95 | class CityMarketKiratOno(FullyStable): 96 | """Netiv Hased is stablity""" 97 | 98 | @classmethod 99 | def searching_for_update_promo(cls, files_types=None, **_): 100 | """if the execution is in saturday""" 101 | return files_types and files_types == 
[FileTypesFilters.PROMO_FILE.name] 102 | 103 | @classmethod 104 | def failire_valid( 105 | cls, when_date=None, files_types=None, utilize_date_param=True, **_ 106 | ): 107 | """return true if the parser is stble""" 108 | return super().failire_valid( 109 | when_date=when_date 110 | ) or cls.searching_for_update_promo(files_types=files_types) 111 | 112 | 113 | class CityMarketKiratGat(FullyStable): 114 | """Netiv Hased is stablity""" 115 | 116 | @classmethod 117 | def searching_for_update_promo_full(cls, files_types=None, **_): 118 | """if the execution is in saturday""" 119 | return files_types and files_types == [FileTypesFilters.PROMO_FULL_FILE.name] 120 | 121 | @classmethod 122 | def failire_valid( 123 | cls, when_date=None, files_types=None, utilize_date_param=True, **_ 124 | ): 125 | """return true if the parser is stble""" 126 | return super().failire_valid( 127 | when_date=when_date 128 | ) or cls.searching_for_update_promo_full(files_types=files_types) 129 | 130 | 131 | class DoNotPublishStores(FullyStable): 132 | """stablity for chains that doesn't pubish stores""" 133 | 134 | @classmethod 135 | def searching_for_store_full(cls, files_types=None, **_): 136 | """if the execution is in saturday""" 137 | return files_types and files_types == [FileTypesFilters.STORE_FILE.name] 138 | 139 | @classmethod 140 | def failire_valid( 141 | cls, when_date=None, files_types=None, utilize_date_param=True, **_ 142 | ): 143 | """return true if the parser is stble""" 144 | return super().failire_valid( 145 | when_date=when_date, 146 | files_types=files_types, 147 | utilize_date_param=utilize_date_param, 148 | ) or cls.searching_for_store_full(files_types=files_types) 149 | 150 | 151 | class DoNotPublishPromo(FullyStable): 152 | """stablity for chains that doesn't pubish stores""" 153 | 154 | @classmethod 155 | def searching_for_promo_full(cls, files_types=None, **_): 156 | """if the execution is in saturday""" 157 | return files_types and files_types == [ 158 | FileTypesFilters.PROMO_FILE.name, 159 | FileTypesFilters.PROMO_FULL_FILE.name, 160 | ] 161 | 162 | @classmethod 163 | def failire_valid( 164 | cls, when_date=None, files_types=None, utilize_date_param=True, **_ 165 | ): 166 | """return true if the parser is stble""" 167 | return super().failire_valid( 168 | when_date=when_date, 169 | files_types=files_types, 170 | utilize_date_param=utilize_date_param, 171 | ) or cls.searching_for_promo_full(files_types=files_types) 172 | 173 | 174 | class ScraperStability(Enum): 175 | """tracker for the stablity of the scraper""" 176 | 177 | COFIX = DoNotPublishStores 178 | NETIV_HASED = NetivHased 179 | QUIK = DoNotPublishStores 180 | SALACH_DABACH = DoNotPublishStores 181 | # CITY_MARKET_GIVATAYIM = CityMarketGivataim 182 | CITY_MARKET_KIRYATONO = CityMarketKiratOno 183 | CITY_MARKET_KIRYATGAT = CityMarketKiratGat 184 | MESHMAT_YOSEF_1 = DoNotPublishPromo 185 | YOHANANOF = DoNotPublishStores 186 | 187 | @classmethod 188 | def is_validate_scraper_found_no_files( 189 | cls, 190 | scraper_enum, 191 | limit=None, 192 | files_types=None, 193 | store_id=None, 194 | when_date=None, 195 | utilize_date_param=False, 196 | ): 197 | """return true if its ok the scarper reuturn no enrty""" 198 | 199 | stabler = FullyStable 200 | if scraper_enum in ScraperStability.__members__: 201 | stabler = ScraperStability[scraper_enum].value 202 | 203 | return stabler.failire_valid( 204 | limit=limit, 205 | files_types=files_types, 206 | store_id=store_id, 207 | when_date=when_date, 208 | utilize_date_param=utilize_date_param, 209 
| ) 210 | -------------------------------------------------------------------------------- /il_supermarket_scarper/utils/retry.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import random 3 | import time 4 | import inspect 5 | 6 | from datetime import datetime 7 | from functools import partial 8 | 9 | import functools 10 | 11 | 12 | try: 13 | from decorator import decorator 14 | except ImportError: 15 | 16 | def decorator(caller): 17 | """Turns caller into a decorator. 18 | Unlike decorator module, function signature is not preserved. 19 | 20 | :param caller: caller(f, *args, **kwargs) 21 | """ 22 | 23 | def decor(func): 24 | @functools.wraps(func) 25 | def wrapper(*args, **kwargs): 26 | return caller(func, *args, **kwargs) 27 | 28 | return wrapper 29 | 30 | return decor 31 | 32 | 33 | logging_logger = logging.getLogger(__name__) 34 | 35 | 36 | def __retry_internal( # pylint: disable=broad-except,too-many-locals 37 | func, 38 | exceptions=Exception, 39 | tries=-1, 40 | delay=0, 41 | max_delay=None, 42 | backoff=1, 43 | timeout=None, 44 | max_timeout=None, 45 | backoff_timeout=1, 46 | jitter=0, 47 | logger=logging_logger, 48 | ): 49 | """ 50 | Executes a function and retries it if it failed. 51 | 52 | :param f: the function to execute. 53 | :param exceptions: an exception or a tuple of exceptions to catch. default: Exception. 54 | :param tries: the maximum number of attempts. default: -1 (infinite). 55 | :param delay: initial delay between attempts. default: 0. 56 | :param max_delay: the maximum value of delay. default: None (no limit). 57 | :param backoff: multiplier applied to delay between attempts. default: 1 (no backoff). 58 | :param jitter: extra seconds added to delay between attempts. default: 0. 59 | fixed if a number, random if a range tuple (min, max) 60 | :param logger: logger.warning(fmt, error, delay) will be called on failed attempts. 61 | default: retry.logging_logger. if None, logging is disabled. 62 | :returns: the result of the f function. 63 | """ 64 | _tries, _delay = tries, delay 65 | _timeout = timeout 66 | while _tries: 67 | datetime_start = datetime.now() 68 | try: 69 | if timeout: 70 | return func(timeout=_timeout) 71 | return func() 72 | except exceptions as error: # pylint: disable=broad-except 73 | measured_seconds = (datetime.now() - datetime_start).total_seconds() 74 | _tries -= 1 75 | if not _tries: 76 | raise 77 | 78 | if logger is not None: 79 | logger.warning( 80 | "%s, configured timeout %s,measured time to timeout %s ,retrying in %s seconds", 81 | error, 82 | _timeout, 83 | measured_seconds, 84 | _delay, 85 | ) 86 | logger.error_execption(error) 87 | 88 | time.sleep(_delay) 89 | _delay *= backoff 90 | 91 | if _timeout: 92 | _timeout += backoff_timeout 93 | 94 | if isinstance(jitter, tuple): 95 | _delay += random.uniform(*jitter) 96 | else: 97 | _delay += jitter 98 | 99 | if max_delay is not None: 100 | _delay = min(_delay, max_delay) 101 | 102 | if max_timeout is not None: 103 | _timeout = min(_timeout, max_timeout) 104 | raise ValueError("shouldn't be called!") 105 | 106 | 107 | def retry( 108 | exceptions=Exception, 109 | tries=-1, 110 | delay=0, 111 | max_delay=None, 112 | backoff=1, 113 | timeout=None, 114 | max_timeout=None, 115 | backoff_timeout=1, 116 | jitter=0, 117 | logger=logging_logger, 118 | ): 119 | """Returns a retry decorator. 120 | 121 | :param exceptions: an exception or a tuple of exceptions to catch. default: Exception. 122 | :param tries: the maximum number of attempts. 
default: -1 (infinite). 123 | :param delay: initial delay between attempts. default: 0. 124 | :param max_delay: the maximum value of delay. default: None (no limit). 125 | :param backoff: multiplier applied to delay between attempts. default: 1 (no backoff). 126 | :param jitter: extra seconds added to delay between attempts. default: 0. 127 | fixed if a number, random if a range tuple (min, max) 128 | :param logger: logger.warning(fmt, error, delay) will be called on failed attempts. 129 | default: retry.logging_logger. if None, logging is disabled. 130 | :returns: a retry decorator. 131 | """ 132 | 133 | @decorator 134 | def retry_decorator(func, *fargs, **fkwargs): 135 | args = fargs if fargs else [] 136 | kwargs = fkwargs if fkwargs else {} 137 | return __retry_internal( 138 | partial(func, *args, **kwargs), 139 | exceptions, 140 | tries, 141 | delay, 142 | max_delay, 143 | backoff, 144 | timeout, 145 | max_timeout, 146 | backoff_timeout, 147 | jitter, 148 | logger, 149 | ) 150 | 151 | return retry_decorator 152 | 153 | 154 | def retry_call( 155 | func, 156 | fargs=None, 157 | fkwargs=None, 158 | exceptions=Exception, 159 | tries=-1, 160 | delay=0, 161 | max_delay=None, 162 | backoff=1, 163 | jitter=0, 164 | logger=logging_logger, 165 | ): 166 | """ 167 | Calls a function and re-executes it if it failed. 168 | 169 | :param f: the function to execute. 170 | :param fargs: the positional arguments of the function to execute. 171 | :param fkwargs: the named arguments of the function to execute. 172 | :param exceptions: an exception or a tuple of exceptions to catch. default: Exception. 173 | :param tries: the maximum number of attempts. default: -1 (infinite). 174 | :param delay: initial delay between attempts. default: 0. 175 | :param max_delay: the maximum value of delay. default: None (no limit). 176 | :param backoff: multiplier applied to delay between attempts. default: 1 (no backoff). 177 | :param jitter: extra seconds added to delay between attempts. default: 0. 178 | fixed if a number, random if a range tuple (min, max) 179 | :param logger: logger.warning(fmt, error, delay) will be called on failed attempts. 180 | default: retry.logging_logger. if None, logging is disabled. 181 | :returns: the result of the f function. 182 | """ 183 | args = fargs if fargs else [] 184 | kwargs = fkwargs if fkwargs else {} 185 | return __retry_internal( 186 | partial(func, *args, **kwargs), 187 | exceptions, 188 | tries, 189 | delay, 190 | max_delay, 191 | backoff, 192 | jitter, 193 | logger, 194 | ) 195 | 196 | 197 | def retry_files(num_of_retrys=2, arg_name="files_names_to_scrape"): 198 | """retry only ceritin files""" 199 | 200 | @decorator 201 | def retry_files_decorator(func, *fargs, **fkwargs): 202 | args = fargs if fargs else [] 203 | kwargs = fkwargs if fkwargs else {} 204 | return __retry_files(func, args, kwargs, arg_name, num_of_retrys=num_of_retrys) 205 | 206 | return retry_files_decorator 207 | 208 | 209 | def __retry_files( 210 | func, 211 | args, 212 | kwargs, 213 | arg_name, 214 | num_of_retrys=1, 215 | logger=logging_logger, 216 | ): 217 | retry_list = [] 218 | all_results = [] 219 | for i in range(num_of_retrys): 220 | logger.info(f"File Retry: Itreation #{i},retry_list={retry_list}") 221 | 222 | if retry_list: 223 | # replace the value of 'files_names_to_scrape' 224 | args_names = inspect.getfullargspec(func).args 225 | assert arg_name in args_names, f"{arg_name} wasn't found in {args_names}." 
226 | 227 | arg_list = list(args) 228 | arg_list[args_names.index(arg_name)] = retry_list 229 | args = tuple(arg_list) 230 | 231 | results = func(*args, **kwargs) 232 | 233 | # next iteration 234 | retry_list, other_results = compute_retry(results) 235 | 236 | all_results.extend(other_results) 237 | # if there is not files in the retry list, break 238 | if len(retry_list) == 0: 239 | break 240 | 241 | return all_results 242 | 243 | 244 | def compute_retry(results): 245 | """find the files to retry""" 246 | files_to_retry = [] 247 | other_results = [] 248 | for result in results: 249 | if result["restart_and_retry"]: 250 | files_to_retry.append(result["file_name"]) 251 | else: 252 | other_results.append(result) 253 | return files_to_retry, other_results 254 | -------------------------------------------------------------------------------- /il_supermarket_scarper/utils/status.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import re 3 | import os 4 | import enum 5 | import holidays 6 | import pytz 7 | from .logger import Logger 8 | from .connection import get_from_latast_webpage, get_from_webpage 9 | 10 | 11 | def get_statue_page(extraction_type, source="gov.il"): 12 | """fetch the gov.il site""" 13 | url = "https://www.gov.il/he/departments/legalInfo/cpfta_prices_regulations" 14 | # Create a handle, page, to handle the contents of the website 15 | 16 | if source == "gov.il": 17 | return get_from_latast_webpage(url, extraction_type=extraction_type) 18 | if source == "cache": 19 | return get_from_webpage(get_cached_page(), extraction_type=extraction_type) 20 | raise ValueError(f"source '{source}' is not valid.") 21 | 22 | 23 | def get_cached_page(): 24 | """get the current cached page""" 25 | cache = None 26 | with open( 27 | os.path.join( 28 | os.path.dirname(os.path.abspath(__file__)), 29 | "tests", 30 | "cpfta_prices_regulations", 31 | ), 32 | encoding="utf-8", 33 | ) as page_cache: 34 | cache = page_cache.read() 35 | return cache 36 | 37 | 38 | def get_status(): 39 | """get the number of scarper listed on the gov.il site""" 40 | links_text = get_statue_page(extraction_type="links_name") 41 | # Store the contents of the website under doc 42 | count = 0 43 | for element in links_text: 44 | if "לצפייה במחירים" in str(element) or "לצפיה במחירים" in str(element): 45 | count += 1 46 | 47 | return count 48 | 49 | 50 | def get_status_date(): 51 | """get the date change listed on the gov.il site""" 52 | line_with_date = get_statue_page(extraction_type="update_date") 53 | 54 | Logger.info(f"date in 'line_with_date' is '{line_with_date}'") 55 | 56 | dates = re.findall( 57 | r"([1-9]|1[0-9]|2[0-9]|3[0-1]|0[0-9])(.|-|\/)([1-9]|1[0-2]|0[0-9])(.|-|\/)(20[0-9][0-9])", 58 | line_with_date, 59 | ) 60 | 61 | Logger.info(f"Found {len(dates)} dates") 62 | if len(dates) != 1: 63 | raise ValueError(f"found dates: {dates}") 64 | 65 | return datetime.datetime.strptime("".join(dates[0]), "%d.%m.%Y") 66 | 67 | 68 | def get_output_folder(chain_name, folder_name=None): 69 | """the the folder to write the chain fils in""" 70 | return os.path.join(folder_name if folder_name else _get_dump_folder(), chain_name) 71 | 72 | 73 | def _get_dump_folder(): 74 | """get the dump folder to locate the chains folders in""" 75 | return os.environ.get("XML_STORE_PATH", "dumps") 76 | 77 | 78 | # Enum for size units 79 | class UnitSize(enum.Enum): 80 | """enum represent the unit size in memory""" 81 | 82 | BYTES = "Bytes" 83 | KB = "Kb" 84 | MB = "Mb" 85 | GB = "Gb" 86 | 87 | 
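# Illustrative usage of the size helpers defined below. The literal sizes are
# made-up example values, not data from any chain; the expected results follow
# directly from the conversion logic in convert_unit:
#
#   convert_nl_size_to_bytes("10.5 MB", to_unit=UnitSize.BYTES)
#       -> 10.5 * 1024 * 1024 = 11010048.0
#   convert_nl_size_to_bytes("500 KB")  # default target unit is MB
#       -> 500 * 1024 / (1024 * 1024) = 0.48828125
#   convert_unit(1536, from_unit=UnitSize.BYTES, to_unit=UnitSize.KB)
#       -> 1.5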
88 | def convert_nl_size_to_bytes(size_str, to_unit=UnitSize.MB): 89 | """ 90 | Parse human-readable file size string to bytes. 91 | Supports formats like: "10.5 MB", "1.2GB", "500 KB", "1234", etc. 92 | Returns bytes as integer, or None if parsing fails. 93 | """ 94 | if not size_str: 95 | return None 96 | 97 | # Remove any extra whitespace and convert to uppercase 98 | size_str = size_str.strip().upper() 99 | 100 | # Pattern to match: number (with optional decimal) followed by optional unit 101 | pattern = r"([\d.]+)\s*(B|KB|MB|GB|TB)?" 102 | match = re.match(pattern, size_str) 103 | if not match: 104 | return None 105 | 106 | try: 107 | number = string_to_float(match.group(1)) 108 | unit_str = match.group(2) if match.group(2) else "B" 109 | # Map string units to UnitSize enum where possible 110 | unit_map = { 111 | "B": UnitSize.BYTES, 112 | "KB": UnitSize.KB, 113 | "MB": UnitSize.MB, 114 | "GB": UnitSize.GB, 115 | # You can add "TB": UnitSize.TB if desired and defined 116 | } 117 | from_unit = unit_map.get(unit_str, UnitSize.BYTES) 118 | size_in_from_unit = number 119 | # convert_unit expects size in bytes, so we need to first get bytes from the given unit 120 | return convert_unit(size_in_from_unit, from_unit=from_unit, to_unit=to_unit) 121 | except (ValueError, TypeError, KeyError): 122 | return None 123 | 124 | 125 | def string_to_float(size_str): 126 | """convert a string to a float""" 127 | return float(size_str.replace(",", "")) 128 | 129 | 130 | def convert_unit(size_in_bytes, from_unit=UnitSize.BYTES, to_unit=UnitSize.MB): 131 | """Convert the size from bytes to other units like KB, MB or GB""" 132 | if from_unit == to_unit: 133 | return size_in_bytes 134 | # Convert size_in_bytes (in from_unit) to bytes 135 | if from_unit == UnitSize.KB: 136 | bytes_val = size_in_bytes * 1024 137 | elif from_unit == UnitSize.MB: 138 | bytes_val = size_in_bytes * 1024 * 1024 139 | elif from_unit == UnitSize.GB: 140 | bytes_val = size_in_bytes * 1024 * 1024 * 1024 141 | else: # from_unit == UnitSize.BYTES 142 | bytes_val = size_in_bytes 143 | 144 | # Convert bytes to to_unit 145 | if to_unit == UnitSize.BYTES: 146 | return bytes_val 147 | if to_unit == UnitSize.KB: 148 | return bytes_val / 1024 149 | if to_unit == UnitSize.MB: 150 | return bytes_val / (1024 * 1024) 151 | if to_unit == UnitSize.GB: 152 | return bytes_val / (1024 * 1024 * 1024) 153 | return bytes_val 154 | 155 | 156 | def log_folder_details(folder, unit=UnitSize.MB): 157 | """log details about a folder""" 158 | size = 0 159 | files_scaned = [] 160 | Logger.info(f"Found the following files in {folder}") 161 | 162 | for path, _, files in os.walk(folder): 163 | 164 | # summerize all files 165 | for file in files: 166 | if "xml" in file: 167 | full_file_path = os.path.join(path, file) 168 | size += os.path.getsize(full_file_path) 169 | files_scaned.append(full_file_path) 170 | Logger.info(f"- file {full_file_path}: size {size}") 171 | 172 | # unit_size = 173 | # for sub_folder in dirs: 174 | # unit_size += log_folder_details(os.path.join(path, sub_folder), unit) 175 | 176 | Logger.info( 177 | f"Folder {folder}: Num of Files= {len(files_scaned)}," 178 | f"Size= {convert_unit(size, unit)} {unit.name}" 179 | ) 180 | 181 | return { 182 | "size": convert_unit(size, unit), 183 | "unit": unit.name, 184 | "folder": folder, 185 | "folder_content": files_scaned, 186 | } 187 | 188 | 189 | def summerize_dump_folder_contant(dump_folder): 190 | """collect details about the dump folder""" 191 | 192 | Logger.info(" == Starting summerize dump folder 
== ") 193 | Logger.info(f"dump_folder = {dump_folder}") 194 | for any_file in os.listdir(dump_folder): 195 | current_file = os.path.join(dump_folder, any_file) 196 | if os.path.isdir(current_file): 197 | log_folder_details(current_file) 198 | else: 199 | Logger.info(f"- file {current_file}") 200 | 201 | 202 | def clean_dump_folder(dump_folder): 203 | """clean the dump folder completly""" 204 | for any_file in os.listdir(dump_folder): 205 | current_file = os.path.join(dump_folder, any_file) 206 | if os.path.isdir(current_file): 207 | for file in os.listdir(current_file): 208 | full_file_path = os.path.join(current_file, file) 209 | os.remove(full_file_path) 210 | os.rmdir(current_file) 211 | else: 212 | os.remove(current_file) 213 | 214 | 215 | def hour_files_expected_to_be_accassible(): 216 | """the hour (AM) in which the files are expected to be published in IL time""" 217 | return 12 218 | 219 | 220 | def _now(): 221 | return datetime.datetime.now(pytz.timezone("Asia/Jerusalem")) 222 | 223 | 224 | def _testing_now(hour_consider_stable=hour_files_expected_to_be_accassible()): 225 | current_time = _now() 226 | 227 | if current_time.hour < hour_consider_stable: 228 | current_time = current_time - datetime.timedelta(hours=hour_consider_stable) 229 | return current_time 230 | 231 | 232 | def datetime_in_tlv(year, month, day, hour, minute, second): 233 | """return a datedatiem in tlv timezone""" 234 | return datetime.datetime( 235 | year, month, day, hour, minute, second, tzinfo=pytz.timezone("Asia/Jerusalem") 236 | ) 237 | 238 | 239 | def _is_saturday_in_israel(date=None): 240 | if not date: 241 | date = _now() 242 | return date.weekday() == 5 243 | 244 | 245 | def _is_friday_in_israel(): 246 | return _now().weekday() == 4 247 | 248 | 249 | def _is_weekend_in_israel(): 250 | return _is_friday_in_israel() or _is_saturday_in_israel() 251 | 252 | 253 | def _is_holiday_in_israel(): 254 | return _now().date() in holidays.CountryHoliday("IL") 255 | -------------------------------------------------------------------------------- /il_supermarket_scarper/engines/web.py: -------------------------------------------------------------------------------- 1 | import re 2 | from bs4 import BeautifulSoup 3 | from il_supermarket_scarper.utils import Logger, execute_in_parallel 4 | from il_supermarket_scarper.utils import convert_nl_size_to_bytes, UnitSize 5 | from .engine import Engine 6 | 7 | 8 | class WebBase(Engine): 9 | """scrape the file of websites that the only why to download them is via web""" 10 | 11 | def __init__(self, chain, chain_id, url, folder_name=None, max_threads=5): 12 | super().__init__(chain, chain_id, folder_name, max_threads=max_threads) 13 | self.url = url 14 | self.max_retry = 2 15 | 16 | def get_data_from_page(self, req_res): 17 | """get the file list from a page""" 18 | soup = BeautifulSoup(req_res.text, features="lxml") 19 | return soup.find_all("tr")[1:] 20 | 21 | def get_request_url( 22 | self, files_types=None, store_id=None, when_date=None 23 | ): # pylint: disable=unused-argument 24 | """get all links to collect download links from""" 25 | return [{"url": self.url, "method": "GET"}] 26 | 27 | def get_file_size_from_entry(self, entry): 28 | """ 29 | Extract file size from a table row entry. 30 | Looks for size information in table cells, typically in human-readable format. 31 | Returns size in bytes, or None if not found. 
32 | """ 33 | try: 34 | size_bytes = re.search(r"\b\d+(\.\d+)?\s*(KB|MB|GB)\b", entry.text) 35 | size_bytes = convert_nl_size_to_bytes( 36 | size_bytes.group(0), to_unit=UnitSize.BYTES 37 | ) 38 | return size_bytes 39 | except (AttributeError, TypeError) as e: 40 | Logger.debug(f"Error extracting file size from entry: {e}") 41 | return None 42 | 43 | def extract_task_from_entry(self, all_trs): 44 | """extract download links, file names, and file sizes from page list""" 45 | download_urls = [] 46 | file_names = [] 47 | file_sizes = [] 48 | for x in all_trs: 49 | try: 50 | download_urls.append(self.url + x.a.attrs["href"]) 51 | file_names.append(x.a.attrs["href"].split(".")[0].split("/")[-1]) 52 | file_sizes.append(self.get_file_size_from_entry(x)) 53 | except (AttributeError, KeyError, IndexError, TypeError) as e: 54 | Logger.warning(f"Error extracting task from entry: {e}") 55 | 56 | return download_urls, file_names, file_sizes 57 | 58 | def apply_limit_zip( 59 | self, 60 | file_names, 61 | download_urls, 62 | file_sizes=None, 63 | limit=None, 64 | files_types=None, 65 | by_function=lambda x: x[0], 66 | store_id=None, 67 | when_date=None, 68 | files_names_to_scrape=None, 69 | suppress_exception=False, 70 | ): 71 | """apply limit to zip""" 72 | # Handle both 2-tuple (backward compatibility) and 3-tuple formats 73 | if file_sizes is None: 74 | zipped = list(zip(file_names, download_urls)) 75 | else: 76 | zipped = list(zip(file_names, download_urls, file_sizes)) 77 | 78 | ziped = self.apply_limit( 79 | zipped, 80 | limit=limit, 81 | files_types=files_types, 82 | by_function=by_function, 83 | store_id=store_id, 84 | when_date=when_date, 85 | files_names_to_scrape=files_names_to_scrape, 86 | suppress_exception=suppress_exception, 87 | ) 88 | if len(ziped) == 0: 89 | if file_sizes is None: 90 | return [], [] 91 | return [], [], [] 92 | return list(zip(*ziped)) 93 | 94 | def filter_bad_files_zip( 95 | self, 96 | file_names, 97 | download_urls, 98 | file_sizes=None, 99 | filter_null=False, 100 | filter_zero=False, 101 | by_function=lambda x: x[0], 102 | ): 103 | """apply bad files filtering to zip""" 104 | # Handle both 2-tuple (backward compatibility) and 3-tuple formats 105 | if file_sizes is None: 106 | files = list(zip(file_names, download_urls)) 107 | else: 108 | files = list(zip(file_names, download_urls, file_sizes)) 109 | 110 | files = self.filter_bad_files( 111 | files, 112 | filter_null=filter_null, 113 | filter_zero=filter_zero, 114 | by_function=by_function, 115 | ) 116 | if len(files) == 0: 117 | if file_sizes is None: 118 | return [], [] 119 | return [], [], [] 120 | return list(zip(*files)) 121 | 122 | def collect_files_details_from_site( # pylint: disable=too-many-locals 123 | self, 124 | limit=None, 125 | files_types=None, 126 | store_id=None, 127 | when_date=None, 128 | filter_null=False, 129 | filter_zero=False, 130 | files_names_to_scrape=None, 131 | suppress_exception=False, 132 | min_size=None, 133 | max_size=None, 134 | ): 135 | """collect all enteris to download from site""" 136 | 137 | urls_to_collect_link_from = self.get_request_url( 138 | files_types=files_types, store_id=store_id, when_date=when_date 139 | ) 140 | assert len(urls_to_collect_link_from) > 0, "No pages to scrape" 141 | 142 | all_trs = [] 143 | for url in urls_to_collect_link_from: 144 | req_res = self.session_with_cookies_by_chain(**url) 145 | trs = self.get_data_from_page(req_res) 146 | all_trs.extend(trs) 147 | 148 | Logger.info(f"Found {len(all_trs)} entries") 149 | 150 | download_urls, file_names, 
file_sizes = self.extract_task_from_entry(all_trs) 151 | 152 | Logger.info(f"Found {len(download_urls)} download urls") 153 | 154 | # Filter by file size if specified 155 | if min_size is not None or max_size is not None: 156 | file_names, download_urls, file_sizes = self.filter_by_file_size( 157 | file_names, 158 | download_urls, 159 | file_sizes, 160 | min_size=min_size, 161 | max_size=max_size, 162 | ) 163 | 164 | file_names, download_urls, file_sizes = self.filter_bad_files_zip( 165 | file_names, 166 | download_urls, 167 | file_sizes=file_sizes, 168 | filter_null=filter_null, 169 | filter_zero=filter_zero, 170 | ) 171 | 172 | Logger.info(f"After filtering bad files: Found {len(download_urls)} files") 173 | 174 | # pylint: disable=duplicate-code 175 | file_names, download_urls, file_sizes = self.apply_limit_zip( 176 | file_names, 177 | download_urls, 178 | file_sizes=file_sizes, 179 | limit=limit, 180 | files_types=files_types, 181 | store_id=store_id, 182 | when_date=when_date, 183 | files_names_to_scrape=files_names_to_scrape, 184 | suppress_exception=suppress_exception, 185 | ) 186 | 187 | Logger.info(f"After applying limit: Found {len(download_urls)} entries") 188 | 189 | return download_urls, file_names 190 | 191 | def _scrape( 192 | self, 193 | limit=None, 194 | files_types=None, 195 | store_id=None, 196 | when_date=None, 197 | files_names_to_scrape=None, 198 | filter_null=False, 199 | filter_zero=False, 200 | suppress_exception=False, 201 | min_size=None, 202 | max_size=None, 203 | ): 204 | """scarpe the files from multipage sites""" 205 | download_urls, file_names = [], [] 206 | try: 207 | download_urls, file_names = self.collect_files_details_from_site( 208 | limit=limit, 209 | files_types=files_types, 210 | store_id=store_id, 211 | when_date=when_date, 212 | filter_null=filter_null, 213 | filter_zero=filter_zero, 214 | files_names_to_scrape=files_names_to_scrape, 215 | suppress_exception=suppress_exception, 216 | min_size=min_size, 217 | max_size=max_size, 218 | ) 219 | 220 | self.on_collected_details(file_names, download_urls) 221 | 222 | Logger.info(f"collected {len(download_urls)} to download.") 223 | if len(download_urls) > 0: 224 | results = execute_in_parallel( 225 | self.save_and_extract, 226 | list(zip(download_urls, file_names)), 227 | max_threads=self.max_threads, 228 | ) 229 | else: 230 | results = [] 231 | 232 | return results 233 | except Exception as e: # pylint: disable=broad-except 234 | self.on_download_fail(e, download_urls=download_urls, file_names=file_names) 235 | raise e 236 | -------------------------------------------------------------------------------- /il_supermarket_scarper/engines/cerberus.py: -------------------------------------------------------------------------------- 1 | import os 2 | import datetime 3 | 4 | from il_supermarket_scarper.utils import ( 5 | extract_xml_file_from_gz_file, 6 | Logger, 7 | execute_in_parallel, 8 | collect_from_ftp, 9 | fetch_temporary_gz_file_from_ftp, 10 | FileTypesFilters, 11 | ) 12 | from .engine import Engine 13 | 14 | 15 | class Cerberus(Engine): 16 | """scraper for all Cerberus base site. 
(seems like can't support historical data)""" 17 | 18 | target_file_extensions = ["xml", "gz"] 19 | utilize_date_param = False 20 | 21 | def __init__( 22 | self, 23 | chain, 24 | chain_id, 25 | folder_name=None, 26 | ftp_host="url.retail.publishedprices.co.il", 27 | ftp_path="/", 28 | ftp_username="", 29 | ftp_password="", 30 | max_threads=5, 31 | ): 32 | super().__init__(chain, chain_id, folder_name, max_threads) 33 | self.ftp_host = ftp_host 34 | self.ftp_path = ftp_path 35 | self.ftp_username = ftp_username 36 | self.ftp_password = ftp_password 37 | self.ftp_session = False 38 | 39 | def _scrape( 40 | self, 41 | limit=None, 42 | files_types=None, 43 | store_id=None, 44 | when_date=None, 45 | files_names_to_scrape=None, 46 | filter_null=False, 47 | filter_zero=False, 48 | suppress_exception=False, 49 | min_size=None, 50 | max_size=None, 51 | ): 52 | files = [] 53 | try: 54 | files = self.collect_files_details_from_site( 55 | limit=limit, 56 | files_types=files_types, 57 | filter_null=filter_null, 58 | filter_zero=filter_zero, 59 | store_id=store_id, 60 | when_date=when_date, 61 | files_names_to_scrape=files_names_to_scrape, 62 | suppress_exception=suppress_exception, 63 | min_size=min_size, 64 | max_size=max_size, 65 | ) 66 | self.on_collected_details(files) 67 | 68 | results = execute_in_parallel( 69 | self.persist_from_ftp, list(files), max_threads=self.max_threads 70 | ) 71 | return results 72 | except Exception as e: # pylint: disable=broad-except 73 | self.on_download_fail(e, file_names=files) 74 | raise e 75 | 76 | def get_type_pattern(self, files_types): 77 | """get the file type pattern""" 78 | file_type_mapping = { 79 | FileTypesFilters.STORE_FILE.name: "store", 80 | FileTypesFilters.PRICE_FILE.name: "price", 81 | FileTypesFilters.PROMO_FILE.name: "promo", 82 | FileTypesFilters.PRICE_FULL_FILE.name: "pricef", 83 | FileTypesFilters.PROMO_FULL_FILE.name: "promof", 84 | } 85 | if files_types is None or files_types == FileTypesFilters.all_types(): 86 | return [None] 87 | 88 | responses = [] 89 | for file_type in files_types: 90 | if file_type not in file_type_mapping: 91 | raise ValueError(f"File type {file_type} not supported") 92 | responses.append(file_type_mapping[file_type]) 93 | return responses 94 | 95 | def build_filter_arg(self, store_id=None, when_date=None, files_types=None): 96 | """build the filter arg for the ftp""" 97 | date_pattern = None 98 | if when_date and isinstance(when_date, datetime.datetime): 99 | date_pattern = when_date.strftime("%Y%m%d") 100 | 101 | for type_pattern in self.get_type_pattern(files_types): 102 | output_pattern = [] 103 | if type_pattern: 104 | output_pattern.append(type_pattern) 105 | if store_id: 106 | output_pattern.append(f"{store_id}-") 107 | if date_pattern: 108 | output_pattern.append(date_pattern) 109 | 110 | if len(output_pattern) == 0: 111 | yield None 112 | yield "*" + "*".join(output_pattern) + "*" 113 | 114 | def collect_files_details_from_site( # pylint: disable=too-many-locals 115 | self, 116 | limit=None, 117 | files_types=None, 118 | filter_null=False, 119 | filter_zero=False, 120 | store_id=None, 121 | when_date=None, 122 | files_names_to_scrape=None, 123 | suppress_exception=False, 124 | min_size=None, 125 | max_size=None, 126 | ): 127 | """collect all files to download from the site""" 128 | files = [] 129 | for filter_arg in self.build_filter_arg(store_id, when_date, files_types): 130 | filter_files = collect_from_ftp( 131 | self.ftp_host, 132 | self.ftp_username, 133 | self.ftp_password, 134 | self.ftp_path, 135 | 
arg=filter_arg, 136 | ) 137 | files.extend(filter_files) 138 | 139 | Logger.info(f"Found {len(files)} files") 140 | 141 | # Convert tuples to separate lists for base class filter_by_file_size method 142 | if min_size is not None or max_size is not None: 143 | file_names = [filename for filename, _ in files] 144 | download_urls = [""] * len(files) # FTP doesn't use URLs, use empty strings 145 | file_sizes = [size for _, size in files] 146 | file_names, download_urls, file_sizes = self.filter_by_file_size( 147 | file_names, 148 | download_urls, 149 | file_sizes, 150 | min_size=min_size, 151 | max_size=max_size, 152 | ) 153 | # Convert back to tuples 154 | files = list(zip(file_names, file_sizes)) 155 | 156 | files = self.filter_bad_files( 157 | files, 158 | filter_null=filter_null, 159 | filter_zero=filter_zero, 160 | by_function=lambda x: x[0], 161 | ) 162 | 163 | Logger.info(f"After filtering bad files: Found {len(files)} files") 164 | 165 | files = list( 166 | filter(lambda x: x[0].split(".")[-1] in self.target_file_extensions, files) 167 | ) 168 | Logger.info( 169 | f"After filtering by {self.target_file_extensions}: Found {len(files)} files" 170 | ) 171 | 172 | # apply noraml filter 173 | files = self.apply_limit( 174 | files, 175 | limit=limit, 176 | files_types=files_types, 177 | store_id=store_id, 178 | when_date=when_date, 179 | files_names_to_scrape=files_names_to_scrape, 180 | suppress_exception=suppress_exception, 181 | by_function=lambda x: x[0], 182 | ) 183 | Logger.info(f"After applying limit: Found {len(files)} files") 184 | 185 | # Extract just filenames for backward compatibility with persist_from_ftp 186 | return [filename for filename, _ in files] 187 | 188 | def persist_from_ftp(self, file_name): 189 | """download file to hard drive and extract it.""" 190 | downloaded = False 191 | extract_succefully = False 192 | restart_and_retry = False 193 | error = None 194 | try: 195 | ext = os.path.splitext(file_name)[1] 196 | if ext not in [".gz", ".xml"]: 197 | raise ValueError(f"File {file_name} extension is not .gz or .xml") 198 | 199 | Logger.debug(f"Start persisting file {file_name}") 200 | temporary_gz_file_path = os.path.join(self.storage_path, file_name) 201 | 202 | fetch_temporary_gz_file_from_ftp( 203 | self.ftp_host, 204 | self.ftp_username, 205 | self.ftp_password, 206 | self.ftp_path, 207 | temporary_gz_file_path, 208 | timeout=30, 209 | ) 210 | downloaded = True 211 | 212 | if ext == ".gz": 213 | Logger.debug( 214 | f"File size is {os.path.getsize(temporary_gz_file_path)} bytes." 
215 | ) 216 | extract_xml_file_from_gz_file(temporary_gz_file_path) 217 | 218 | Logger.debug(f"Done persisting file {file_name}") 219 | extract_succefully = True 220 | except Exception as exception: # pylint: disable=broad-except 221 | Logger.error( 222 | f"Error downloading {file_name},extract_succefully={extract_succefully}" 223 | f",downloaded={downloaded}" 224 | ) 225 | Logger.error_execption(exception) 226 | error = str(exception) 227 | restart_and_retry = True 228 | finally: 229 | if ext == ".gz" and os.path.exists(temporary_gz_file_path): 230 | os.remove(temporary_gz_file_path) 231 | 232 | return { 233 | "file_name": file_name, 234 | "downloaded": downloaded, 235 | "extract_succefully": extract_succefully, 236 | "restart_and_retry": restart_and_retry, 237 | "error": error, 238 | } 239 | -------------------------------------------------------------------------------- /il_supermarket_scarper/engines/multipage_web.py: -------------------------------------------------------------------------------- 1 | from urllib.parse import urlsplit 2 | import re 3 | import ntpath 4 | from abc import abstractmethod 5 | from lxml import html as lxml_html 6 | 7 | 8 | from il_supermarket_scarper.utils import ( 9 | Logger, 10 | execute_in_parallel, 11 | multiple_page_aggregtion, 12 | convert_nl_size_to_bytes, 13 | UnitSize, 14 | ) 15 | from .web import WebBase 16 | 17 | 18 | class MultiPageWeb(WebBase): 19 | """scrape the file of websites with multipage""" 20 | 21 | target_file_extension = ".xml" 22 | results_in_page = 20 23 | 24 | def __init__( 25 | self, 26 | chain, 27 | chain_id, 28 | url, 29 | folder_name=None, 30 | total_page_xpath="""//*[@id="gridContainer"]/table/ 31 | tfoot/tr/td/a[6]/@href""", 32 | total_pages_pattern=r"^\/\?page\=([0-9]{3})$", 33 | page_argument="page", 34 | max_threads=5, 35 | ): 36 | super().__init__( 37 | chain, chain_id, url=url, folder_name=folder_name, max_threads=max_threads 38 | ) 39 | self.total_page_xpath = total_page_xpath 40 | self.total_pages_pattern = total_pages_pattern 41 | self.page_argument = page_argument 42 | 43 | @abstractmethod 44 | def build_params(self, files_types=None, store_id=None, when_date=None): 45 | """build the params for the request""" 46 | 47 | def get_request_url( 48 | self, files_types=None, store_id=None, when_date=None 49 | ): # pylint: disable=unused-argument 50 | """get all links to collect download links from""" 51 | 52 | results = [] 53 | for arguments in self.build_params( 54 | files_types=files_types, store_id=store_id, when_date=when_date 55 | ): 56 | results.append( 57 | { 58 | "url": self.url + arguments, 59 | "method": "GET", 60 | } 61 | ) 62 | return results 63 | 64 | def get_number_of_pages(self, response): 65 | """get the number of pages to scarpe""" 66 | 67 | html_body = lxml_html.fromstring(response.content) 68 | 69 | elements = html_body.xpath(self.total_page_xpath) 70 | 71 | if len(elements) == 0: 72 | return None # only one page 73 | 74 | pages = re.findall( 75 | self.total_pages_pattern, 76 | elements[-1], 77 | ) 78 | return int(pages[0]) 79 | 80 | def collect_files_details_from_site( # pylint: disable=too-many-locals 81 | self, 82 | limit=None, 83 | files_types=None, 84 | store_id=None, 85 | when_date=None, 86 | filter_null=False, 87 | filter_zero=False, 88 | files_names_to_scrape=None, 89 | suppress_exception=False, 90 | min_size=None, 91 | max_size=None, 92 | ): 93 | 94 | main_page_requests = self.get_request_url( 95 | files_types=files_types, store_id=store_id, when_date=when_date 96 | ) 97 | assert 
len(main_page_requests) > 0, "No pages to scrape" 98 | 99 | download_urls = [] 100 | file_names = [] 101 | file_sizes = [] 102 | for main_page_request in main_page_requests: 103 | 104 | main_page_response = self.session_with_cookies_by_chain(**main_page_request) 105 | 106 | total_pages = self.get_number_of_pages(main_page_response) 107 | Logger.info(f"Found {total_pages} pages") 108 | 109 | # if there is only one page, call it again, 110 | # in the future, we can skip scrap it again 111 | if total_pages is None: 112 | pages_to_scrape = [main_page_request] 113 | else: 114 | pages_to_scrape = list( 115 | map( 116 | lambda page_number, req=main_page_request: { 117 | **req, 118 | "url": req["url"] 119 | + f"{self.page_argument}=" 120 | + str(page_number), 121 | }, 122 | range(1, total_pages + 1), 123 | ) 124 | ) 125 | 126 | _download_urls, _file_names, _file_sizes = execute_in_parallel( 127 | self.process_links_before_download, 128 | list(pages_to_scrape), 129 | aggregtion_function=multiple_page_aggregtion, 130 | max_threads=self.max_threads, 131 | ) 132 | 133 | download_urls.extend(_download_urls) 134 | file_names.extend(_file_names) 135 | file_sizes.extend( 136 | _file_sizes if _file_sizes else [None] * len(_download_urls) 137 | ) 138 | 139 | Logger.info(f"Found {len(download_urls)} files") 140 | 141 | # Filter by file size if specified 142 | if min_size is not None or max_size is not None: 143 | file_names, download_urls, file_sizes = self.filter_by_file_size( 144 | file_names, 145 | download_urls, 146 | file_sizes, 147 | min_size=min_size, 148 | max_size=max_size, 149 | ) 150 | 151 | file_names, download_urls, file_sizes = self.filter_bad_files_zip( 152 | file_names, 153 | download_urls, 154 | file_sizes=file_sizes, 155 | filter_null=filter_null, 156 | filter_zero=filter_zero, 157 | ) 158 | 159 | Logger.info(f"After filtering bad files: Found {len(download_urls)} files") 160 | 161 | file_names, download_urls, file_sizes = self.apply_limit_zip( 162 | file_names, 163 | download_urls, 164 | file_sizes=file_sizes, 165 | limit=limit, 166 | files_types=files_types, 167 | store_id=store_id, 168 | when_date=when_date, 169 | files_names_to_scrape=files_names_to_scrape, 170 | suppress_exception=suppress_exception, 171 | ) 172 | 173 | return download_urls, file_names 174 | 175 | def get_file_size_from_entry( 176 | self, html, link_element 177 | ): # pylint: disable=arguments-differ,unused-argument 178 | """ 179 | Extract file size from HTML element. 180 | For MultiPageWeb, we need to find the size in the same row as the link. 181 | Returns size in bytes, or None if not found. 
182 | """ 183 | try: 184 | # Find the parent row of the link 185 | row = ( 186 | link_element.getparent().getparent() 187 | if link_element.getparent() 188 | else None 189 | ) 190 | if row is None: 191 | return None 192 | 193 | # Look for size in table cells - typically in a column after the link 194 | cells = row.xpath(".//td") 195 | for cell in cells: 196 | text = cell.text_content().strip() if cell.text_content() else "" 197 | # Parse size using the same logic as WebBase 198 | size_bytes = convert_nl_size_to_bytes(text, to_unit=UnitSize.BYTES) 199 | if size_bytes is not None: 200 | return size_bytes 201 | except (AttributeError, TypeError) as e: 202 | Logger.debug(f"Error extracting file size from entry: {e}") 203 | return None 204 | 205 | def collect_files_details_from_page(self, html): 206 | """collect the details deom one page""" 207 | links = [] 208 | filenames = [] 209 | file_sizes = [] 210 | # Select all rows from the table 211 | rows = html.xpath('//*[@id="gridContainer"]/table/tbody/tr') 212 | for row in rows: 213 | # Extract link from td[1]/a 214 | link_elements = row.xpath("./td[1]/a") 215 | if not link_elements: 216 | continue 217 | link_element = link_elements[0] 218 | link = link_element.get("href") 219 | if not link: 220 | continue 221 | 222 | # Extract size from td[3] (size column) 223 | size_elements = row.xpath("./td[3]") 224 | size_text = size_elements[0].text_content().strip() if size_elements else "" 225 | size_bytes = ( 226 | convert_nl_size_to_bytes(size_text, to_unit=UnitSize.BYTES) 227 | if size_text 228 | else None 229 | ) 230 | 231 | links.append(link) 232 | filenames.append(ntpath.basename(urlsplit(link).path)) 233 | file_sizes.append(size_bytes) 234 | return links, filenames, file_sizes 235 | 236 | def process_links_before_download( 237 | self, 238 | request, 239 | limit=None, 240 | files_types=None, 241 | store_id=None, 242 | when_date=None, 243 | suppress_exception=True, # this is nested limit don't fail 244 | ): 245 | """additional processing to the links before download""" 246 | response = self.session_with_cookies_by_chain(**request) 247 | 248 | html = lxml_html.fromstring(response.text) 249 | 250 | file_links, filenames, file_sizes = self.collect_files_details_from_page(html) 251 | Logger.info(f"Page {request}: Found {len(file_links)} files") 252 | 253 | filenames, file_links, file_sizes = self.apply_limit_zip( 254 | filenames, 255 | file_links, 256 | file_sizes=file_sizes, 257 | limit=limit, 258 | files_types=files_types, 259 | store_id=store_id, 260 | when_date=when_date, 261 | suppress_exception=suppress_exception, 262 | ) 263 | 264 | Logger.info( 265 | f"After applying limit: Page {request}: " 266 | f"Found {len(file_links)} line and {len(filenames)} files" 267 | ) 268 | 269 | return file_links, filenames, file_sizes 270 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/tests/test_cases.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=too-many-statements 2 | import unittest 3 | import tempfile 4 | import re 5 | import os 6 | import uuid 7 | import xml.etree.ElementTree as ET 8 | from lxml import etree 9 | from il_supermarket_scarper.utils import ( 10 | FileTypesFilters, 11 | Logger, 12 | DumpFolderNames, 13 | _testing_now, 14 | change_xml_encoding, 15 | ) 16 | from il_supermarket_scarper.scrappers_factory import ScraperFactory 17 | from il_supermarket_scarper.scraper_stability import ScraperStability 18 | 19 | 20 | def 
make_test_case(scraper_enum, store_id): 21 | """create test suite for scraper""" 22 | 23 | class TestScapers(unittest.TestCase): 24 | """class with all the tests for scraper""" 25 | 26 | def __init__(self, name) -> None: 27 | super().__init__(name) 28 | self.scraper_enum = scraper_enum 29 | self.folder_name = "temp" 30 | 31 | def _delete_folder_and_sub_folder(self, download_path): 32 | """delete a folder and all sub-folder""" 33 | files_found = os.listdir(download_path) 34 | for file in files_found: 35 | file_path = os.path.join(download_path, file) 36 | if os.path.isdir(file_path): 37 | self._delete_folder_and_sub_folder(file_path) 38 | os.rmdir(file_path) 39 | else: 40 | os.remove(file_path) 41 | 42 | def _delete_download_folder(self, download_path): 43 | """delete the download folder""" 44 | if os.path.isdir(download_path): 45 | self._delete_folder_and_sub_folder(download_path) 46 | os.removedirs(download_path) 47 | 48 | def _make_sure_filter_work( 49 | self, 50 | files_found, 51 | file_type=None, 52 | limit=None, 53 | store_id=None, 54 | when_date=None, 55 | ): 56 | """make sure the file type filter works""" 57 | # make sure the file type is applied 58 | if file_type: 59 | filtered_files = 0 60 | for f_type in file_type: 61 | filtered_files += len(FileTypesFilters.filter(f_type, files_found)) 62 | assert len(files_found) == filtered_files 63 | 64 | # check the store id is applied 65 | if store_id: 66 | for file in files_found: 67 | assert re.compile(rf"-0*{store_id}-").search(file) 68 | 69 | # check the date time stamp is applied 70 | if when_date: 71 | for file in files_found: 72 | assert ( 73 | when_date.strftime("%Y%m%d") in file 74 | ), f"{when_date} not in {file}" 75 | 76 | # check limit 77 | assert ( 78 | limit is None or len(files_found) == limit 79 | ), f""" Found {files_found} f"files but should be {limit}""" 80 | 81 | def _make_sure_file_contain_chain_ids(self, chain_ids, file): 82 | """make sure the scraper download only the chain id""" 83 | found_chain_id = False 84 | for possible_chain_ids in chain_ids: 85 | if possible_chain_ids in file: 86 | found_chain_id = True 87 | assert found_chain_id, f"should be one of {chain_ids} but {file}" 88 | 89 | def _make_sure_file_extension_is_xml(self, file_name): 90 | """make sure the file extension is xml""" 91 | file_ext = file_name.split(".")[-1] 92 | assert file_ext == "xml", f" should be xml but {file_ext}, file:{file_name}" 93 | 94 | def _try_to_recover_xml(self, file_path): 95 | """try to recover the xml""" 96 | parser = etree.XMLParser(recover=True, encoding="utf-8") 97 | with open(file_path, "rb") as f: 98 | tree = etree.parse(f, parser) 99 | fixed_xml = etree.tostring( 100 | tree, pretty_print=True, encoding="utf-8" 101 | ).decode("utf-8") 102 | 103 | with open(file_path, "w", encoding="utf-8") as f: 104 | f.write(fixed_xml) 105 | 106 | def _make_sure_file_is_xml_readable(self, full_file_path): 107 | """Ensure the file is a valid XML and readable.""" 108 | try: 109 | ET.parse(full_file_path) 110 | except ET.ParseError: 111 | try: 112 | self._try_to_recover_xml(full_file_path) 113 | ET.parse(full_file_path) 114 | except ET.ParseError: 115 | change_xml_encoding(full_file_path) 116 | ET.parse(full_file_path) 117 | 118 | def _clean_scarpe_delete( 119 | self, 120 | scraper_enum, 121 | store_id=None, 122 | limit=None, 123 | file_type=None, 124 | when_date=None, 125 | ): 126 | with tempfile.TemporaryDirectory() as tmpdirname: 127 | self.__clean_scarpe_delete( 128 | scraper_enum=scraper_enum, 129 | dump_path=tmpdirname, 130 | 
store_id=store_id, 131 | limit=limit, 132 | file_type=file_type, 133 | when_date=when_date, 134 | ) 135 | 136 | def __clean_scarpe_delete( 137 | self, 138 | scraper_enum, 139 | dump_path="temp", 140 | store_id=None, 141 | limit=None, 142 | file_type=None, 143 | when_date=None, 144 | ): 145 | self._delete_download_folder(dump_path) 146 | os.makedirs(dump_path) 147 | init_scraper_function = ScraperFactory.get(scraper_enum) 148 | 149 | if init_scraper_function is None: 150 | Logger.warning(f"{scraper_enum} is disabled.") 151 | else: 152 | try: 153 | scraper = init_scraper_function(folder_name=dump_path) 154 | 155 | kwarg = { 156 | "limit": limit, 157 | "files_types": file_type, 158 | "store_id": store_id, 159 | "when_date": when_date, 160 | "filter_null": True, 161 | "filter_zero": True, 162 | "suppress_exception": True, 163 | "min_size": 100, 164 | "max_size": 10000000, 165 | } 166 | 167 | scraper.scrape(**kwarg) 168 | 169 | files_found = os.listdir(dump_path) 170 | assert ( 171 | len(files_found) == 2 172 | ), "expected exactly two entries: the chain dump folder and the status folder" 173 | assert DumpFolderNames[scraper_enum.name].value in files_found 174 | 175 | download_path = os.path.join( 176 | dump_path, DumpFolderNames[scraper_enum.name].value 177 | ) 178 | files_found = os.listdir(download_path) 179 | 180 | if not ScraperStability.is_validate_scraper_found_no_files( 181 | scraper_enum.name, 182 | limit=limit, 183 | files_types=file_type, 184 | store_id=store_id, 185 | when_date=when_date, 186 | utilize_date_param=scraper_enum.value.utilize_date_param, 187 | ): 188 | self._make_sure_filter_work( 189 | files_found, 190 | file_type=file_type, 191 | limit=limit, 192 | store_id=store_id, 193 | when_date=when_date, 194 | ) 195 | 196 | for file in files_found: 197 | self._make_sure_file_contain_chain_ids( 198 | scraper.get_chain_id(), file 199 | ) 200 | self._make_sure_file_extension_is_xml(file) 201 | 202 | self._make_sure_file_is_xml_readable( 203 | os.path.join(download_path, file) 204 | ) 205 | finally: 206 | self._delete_download_folder(dump_path) 207 | 208 | def _get_temp_folder(self): 209 | """get a temp folder to download the files into""" 210 | return self.folder_name + str(uuid.uuid4().hex) 211 | 212 | def test_scrape_one(self): 213 | """scrape one file and make sure it exists""" 214 | self._clean_scarpe_delete(scraper_enum, limit=1) 215 | 216 | def test_scrape_three(self): 217 | """scrape three files and make sure they exist""" 218 | self._clean_scarpe_delete(scraper_enum, limit=3) 219 | 220 | def test_scrape_promo(self): 221 | """scrape one promo file and make sure it exists""" 222 | self._clean_scarpe_delete( 223 | scraper_enum, 224 | limit=1, 225 | file_type=FileTypesFilters.only_promo(), 226 | ) 227 | 228 | def test_scrape_store(self): 229 | """scrape one store file and make sure it exists""" 230 | self._clean_scarpe_delete( 231 | scraper_enum, limit=1, file_type=FileTypesFilters.only_store() 232 | ) 233 | 234 | def test_scrape_price(self): 235 | """scrape one price file and make sure it exists""" 236 | self._clean_scarpe_delete( 237 | scraper_enum, limit=1, file_type=FileTypesFilters.only_price() 238 | ) 239 | 240 | def test_scrape_file_from_single_store(self): 241 | """test fetching only files from a specific store""" 242 | self._clean_scarpe_delete(scraper_enum, store_id=store_id, limit=1) 243 | 244 | def test_scrape_file_today(self): 245 | """test fetching files from today""" 246 | self._clean_scarpe_delete(scraper_enum, when_date=_testing_now(), limit=1) 247 | 248 | return TestScapers 249 |
--------------------------------------------------------------------------------
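Usage note: make_test_case returns a fully formed unittest.TestCase subclass per scraper, so a test module only needs to bind the generated classes to module-level names for unittest/pytest discovery to collect them. The sketch below is illustrative only and is not the repository's actual test_all.py (whose wiring may differ); it assumes ScraperFactory is an iterable Enum of scrapers and uses a placeholder store_id.

# Illustrative sketch only -- assumes ScraperFactory is an iterable Enum and
# uses a placeholder store_id of 1; the real wiring presumably lives in
# il_supermarket_scarper/scrappers/tests/test_all.py and may differ.
from il_supermarket_scarper.scrappers_factory import ScraperFactory
from il_supermarket_scarper.scrappers.tests.test_cases import make_test_case

for scraper_enum in ScraperFactory:
    # bind each generated TestCase subclass to a module-level name so test
    # discovery picks it up, e.g. TestBareket, TestShufersal, ...
    globals()[f"Test{scraper_enum.name.title()}"] = make_test_case(
        scraper_enum, store_id=1
    )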