├── il_supermarket_scarper ├── scrappers │ ├── tests │ │ ├── __init__.py │ │ ├── test_all.py │ │ └── test_cases.py │ ├── bareket.py │ ├── keshet.py │ ├── good_pharm.py │ ├── king_store.py │ ├── osherad.py │ ├── polizer.py │ ├── shuk_ahir.py │ ├── tivtaam.py │ ├── het_cohen.py │ ├── maayan2000.py │ ├── super_sapir.py │ ├── yohananof.py │ ├── doralon.py │ ├── victory.py │ ├── zolvebegadol.py │ ├── quik.py │ ├── mega.py │ ├── ramilevy.py │ ├── machsani_ashuk.py │ ├── shefa_barcart_ashem.py │ ├── bitan.py │ ├── salachdabach.py │ ├── superdosh.py │ ├── yellow.py │ ├── super_yuda.py │ ├── nativ_hashed.py │ ├── stop_market.py │ ├── cofix.py │ ├── __init__.py │ ├── shufersal.py │ ├── meshnat_yosef.py │ ├── wolt.py │ ├── super_pharm.py │ ├── hazihinam.py │ └── city_market.py ├── utils │ ├── databases │ │ ├── __init__.py │ │ ├── base.py │ │ ├── mongo.py │ │ └── json_file.py │ ├── exceptions.py │ ├── tests │ │ ├── PriceFull7290876100000-003-202410070010.gz │ │ ├── test_connection.py │ │ ├── test_gzip_utils.py │ │ ├── test_file_type.py │ │ └── test_status.py │ ├── lock_utils.py │ ├── __init__.py │ ├── folders_name.py │ ├── loop.py │ ├── gzip_utils.py │ ├── logger.py │ ├── validation.py │ ├── file_cache.py │ ├── file_types.py │ ├── scraper_status.py │ ├── retry.py │ └── status.py ├── engines │ ├── __init__.py │ ├── apsx.py │ ├── publishprice.py │ ├── bina.py │ ├── matrix.py │ ├── web.py │ ├── cerberus.py │ └── multipage_web.py ├── __init__.py ├── tests │ └── test_scrappers_factory.py ├── main.py ├── scrapper_runner.py ├── scrappers_factory.py └── scraper_stability.py ├── pytest.ini ├── MANIFEST.in ├── setup.cfg ├── requirements-dev.txt ├── .pylintrc ├── .gitignore ├── requirements.txt ├── .devcontainer └── devcontainer.json ├── example.py ├── .vscode └── launch.json ├── .github └── workflows │ ├── pylint.yml │ ├── python-publish.yml │ ├── user-validation.yml │ ├── docker-publish.yml │ ├── test-suite.yml │ └── codeql.yml ├── tests ├── test_integration.py └── test_main.py ├── Dockerfile ├── setup.py ├── main.py ├── stress_test.py ├── LICENSE.txt └── README.md /il_supermarket_scarper/scrappers/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | filterwarnings = 3 | ignore::UserWarning -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include requirements.txt 2 | include requirements-dev.txt 3 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | # Inside of setup.cfg 2 | [metadata] 3 | description-file = README.md 4 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | pytest==7.1 2 | zipp==3.19.1 # patch pytest vulnerability 3 | black==24.3.0 4 | pylint==3.0.1 -------------------------------------------------------------------------------- /il_supermarket_scarper/utils/databases/__init__.py: -------------------------------------------------------------------------------- 1 | from .json_file import JsonDataBase 2 | from .mongo import MongoDataBase 3 | 
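# A minimal usage sketch for the database backends exported above, based on
# AbstractDataBase (utils/databases/base.py) and MongoDataBase
# (utils/databases/mongo.py) listed further down in this repository. The
# database and collection names are illustrative; inserts and lookups are
# no-ops until collection is explicitly enabled.
#
#   from il_supermarket_scarper.utils.databases import MongoDataBase
#
#   db = MongoDataBase("example db")   # stored internally as "example_db"
#   db.enable_collection_status()      # connects using MONGO_URL / MONGO_PORT
#   db.insert_document("downloads", {"file": "Price.gz", "downloaded": True})
#   found = db.find_document("downloads", {"file": "Price.gz"})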
-------------------------------------------------------------------------------- /.pylintrc: -------------------------------------------------------------------------------- 1 | [MAIN] 2 | disable= 3 | C0114, # missing-module-docstring 4 | R0913, # too-many-arguments 5 | extension-pkg-allow-list=lxml.etree -------------------------------------------------------------------------------- /il_supermarket_scarper/utils/exceptions.py: -------------------------------------------------------------------------------- 1 | class RestartSessionError(Exception): 2 | """This error will be raised if we would like to retry to downalod after a session restart""" 3 | -------------------------------------------------------------------------------- /il_supermarket_scarper/engines/__init__.py: -------------------------------------------------------------------------------- 1 | from .cerberus import Cerberus 2 | from .multipage_web import MultiPageWeb 3 | from .matrix import Matrix 4 | from .bina import Bina 5 | from .publishprice import PublishPrice 6 | -------------------------------------------------------------------------------- /il_supermarket_scarper/utils/tests/PriceFull7290876100000-003-202410070010.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenIsraeliSupermarkets/israeli-supermarket-scarpers/HEAD/il_supermarket_scarper/utils/tests/PriceFull7290876100000-003-202410070010.gz -------------------------------------------------------------------------------- /il_supermarket_scarper/__init__.py: -------------------------------------------------------------------------------- 1 | from .main import ScarpingTask 2 | from .scrappers_factory import ScraperFactory 3 | from .scraper_stability import ScraperStability 4 | from .utils import FileTypesFilters, DumpFolderNames, datetime_in_tlv 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | env/ 3 | *_cookies.txt 4 | dist/ 5 | il_supermarket_scraper.egg-info/ 6 | build/ 7 | database/* 8 | dumps/* 9 | logging.log 10 | temp*/ 11 | .vscode/settings.json 12 | .DS_Store 13 | test_dump 14 | status/ 15 | .cache/ 16 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | retry==0.9.2 2 | mock==4.0.3 3 | requests==2.32.2 4 | lxml==5.2.1 5 | beautifulsoup4==4.10.0 6 | pymongo==4.6.3 7 | dnspython==2.6.1 # patch pymongo vulnerability 8 | pytz==2022.4 9 | holidays==0.45 10 | cachetools==5.2.0 11 | pytest-playwright==0.7.0 -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/bareket.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.engines import Bina 2 | from il_supermarket_scarper.utils import DumpFolderNames 3 | 4 | 5 | class Bareket(Bina): 6 | """scarper for bareket""" 7 | 8 | def __init__(self, folder_name=None): 9 | super().__init__( 10 | chain=DumpFolderNames.BAREKET, 11 | chain_id="7290875100001", 12 | url_perfix="superbareket", 13 | folder_name=folder_name, 14 | ) 15 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/keshet.py: -------------------------------------------------------------------------------- 1 | from 
il_supermarket_scarper.engines import Cerberus 2 | from il_supermarket_scarper.utils import DumpFolderNames 3 | 4 | 5 | class Keshet(Cerberus): 6 | """scaper for keshet tamim""" 7 | 8 | def __init__(self, folder_name=None): 9 | super().__init__( 10 | chain=DumpFolderNames.KESHET, 11 | chain_id="7290785400000", 12 | folder_name=folder_name, 13 | ftp_username="Keshet", 14 | ) 15 | -------------------------------------------------------------------------------- /.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | { 2 | "build": { 3 | "dockerfile": "../Dockerfile", 4 | "target":"test", 5 | "args": { 6 | "PY_VERSION":"3.11.0" 7 | } 8 | }, 9 | "customizations": { 10 | "vscode": { 11 | "extensions": [ 12 | "ms-python.python", 13 | "ms-python.vscode-pylance", 14 | "ms-toolsai.jupyter", 15 | "LittleFoxTeam.vscode-python-test-adapter" 16 | ] 17 | } 18 | }, 19 | 20 | "forwardPorts": [3000] 21 | } 22 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/good_pharm.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.engines import Bina 2 | from il_supermarket_scarper.utils import DumpFolderNames 3 | 4 | 5 | class GoodPharm(Bina): 6 | """scarper from good pharm""" 7 | 8 | def __init__(self, folder_name=None): 9 | super().__init__( 10 | chain=DumpFolderNames.GOOD_PHARM, 11 | chain_id="7290058197699", 12 | url_perfix="goodpharm", 13 | folder_name=folder_name, 14 | ) 15 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/king_store.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.engines import Bina 2 | from il_supermarket_scarper.utils import DumpFolderNames 3 | 4 | 5 | class KingStore(Bina): 6 | """scraper for king store""" 7 | 8 | def __init__(self, folder_name=None): 9 | super().__init__( 10 | chain=DumpFolderNames.KING_STORE, 11 | chain_id="7290058108879", 12 | url_perfix="kingstore", 13 | folder_name=folder_name, 14 | ) 15 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/osherad.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.engines import Cerberus 2 | from il_supermarket_scarper.utils import DumpFolderNames 3 | 4 | 5 | class Osherad(Cerberus): 6 | """scaper for osher ad""" 7 | 8 | def __init__(self, folder_name=None): 9 | super().__init__( 10 | chain=DumpFolderNames.OSHER_AD, 11 | chain_id="7290103152017", 12 | folder_name=folder_name, 13 | ftp_username="osherad", 14 | ) 15 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/polizer.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.engines import Cerberus 2 | from il_supermarket_scarper.utils import DumpFolderNames 3 | 4 | 5 | class Polizer(Cerberus): 6 | """scarper for polizer""" 7 | 8 | def __init__(self, folder_name=None): 9 | super().__init__( 10 | chain=DumpFolderNames.POLIZER, 11 | chain_id="7291059100008", 12 | folder_name=folder_name, 13 | ftp_username="politzer", 14 | ) 15 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/shuk_ahir.py: 
-------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.engines import Bina 2 | from il_supermarket_scarper.utils import DumpFolderNames 3 | 4 | 5 | class ShukAhir(Bina): 6 | """scraper for shuk a hir""" 7 | 8 | def __init__(self, folder_name=None): 9 | super().__init__( 10 | chain=DumpFolderNames.SHUK_AHIR, 11 | chain_id="7290058148776", 12 | url_perfix="shuk-hayir", 13 | folder_name=folder_name, 14 | ) 15 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/tivtaam.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.engines import Cerberus 2 | from il_supermarket_scarper.utils import DumpFolderNames 3 | 4 | 5 | class TivTaam(Cerberus): 6 | """scraper for tiv taam""" 7 | 8 | def __init__(self, folder_name=None): 9 | super().__init__( 10 | chain=DumpFolderNames.TIV_TAAM, 11 | chain_id="7290873255550", 12 | folder_name=folder_name, 13 | ftp_username="TivTaam", 14 | ) 15 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/het_cohen.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.engines import Matrix 2 | from il_supermarket_scarper.utils import DumpFolderNames 3 | 4 | 5 | class HetCohen(Matrix): 6 | """scraper for ChetCohen""" 7 | 8 | def __init__(self, folder_name=None): 9 | super().__init__( 10 | chain=DumpFolderNames.HET_COHEN, 11 | chain_id=["7290455000004"], 12 | folder_name=folder_name, 13 | chain_hebrew_name="ח. כהן", 14 | ) 15 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/maayan2000.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.engines import Bina 2 | from il_supermarket_scarper.utils import DumpFolderNames 3 | 4 | 5 | class Maayan2000(Bina): 6 | """scaper for maayan 2000""" 7 | 8 | def __init__(self, folder_name=None): 9 | super().__init__( 10 | chain=DumpFolderNames.MAAYAN_2000, 11 | chain_id="7290058159628", 12 | url_perfix="maayan2000", 13 | folder_name=folder_name, 14 | ) 15 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/super_sapir.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.engines import Bina 2 | from il_supermarket_scarper.utils import DumpFolderNames 3 | 4 | 5 | class SuperSapir(Bina): 6 | """scaper for super sapir""" 7 | 8 | def __init__(self, folder_name=None): 9 | super().__init__( 10 | chain=DumpFolderNames.SUPER_SAPIR, 11 | chain_id="7290058156016", 12 | url_perfix="supersapir", 13 | folder_name=folder_name, 14 | ) 15 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/yohananof.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.engines import Cerberus 2 | from il_supermarket_scarper.utils import DumpFolderNames 3 | 4 | 5 | class Yohananof(Cerberus): 6 | """scraper for yohananof""" 7 | 8 | def __init__(self, folder_name=None): 9 | super().__init__( 10 | chain=DumpFolderNames.YOHANANOF, 11 | chain_id="7290803800003", 12 | folder_name=folder_name, 13 | ftp_username="yohananof", 14 | ) 15 | 
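# A usage sketch of the pattern shared by the scraper classes in this package:
# each class only passes chain metadata (dump folder, chain id, FTP username or
# URL prefix) to one of the engines (e.g. Bina, Cerberus, Matrix, PublishPrice).
# Construction is uniform; the folder name below is an illustrative value, and
# in normal use scrapers are selected through ScraperFactory (see example.py)
# rather than instantiated directly.
#
#   from il_supermarket_scarper.scrappers import Yohananof
#
#   scraper = Yohananof(folder_name="dumps")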
-------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/doralon.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.engines import Cerberus 2 | from il_supermarket_scarper.utils import DumpFolderNames 3 | 4 | 5 | class DorAlon(Cerberus): 6 | """scraper for dor alon""" 7 | 8 | def __init__(self, folder_name=None): 9 | super().__init__( 10 | folder_name=folder_name, 11 | chain=DumpFolderNames.DOR_ALON, 12 | chain_id=["7290492000005", "729049000005"], 13 | ftp_username="doralon", 14 | ) 15 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/victory.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.engines import Matrix 2 | from il_supermarket_scarper.utils import DumpFolderNames 3 | 4 | 5 | class Victory(Matrix): 6 | """scraper for victory""" 7 | 8 | def __init__(self, folder_name=None): 9 | super().__init__( 10 | chain=DumpFolderNames.VICTORY, 11 | chain_hebrew_name="ויקטורי", 12 | chain_id=["7290696200003", "7290058103393"], 13 | folder_name=folder_name, 14 | ) 15 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/zolvebegadol.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.engines import Bina 2 | from il_supermarket_scarper.utils import DumpFolderNames 3 | 4 | 5 | class ZolVeBegadol(Bina): 6 | """scraper dfor zol-ve-begodol""" 7 | 8 | def __init__(self, folder_name=None): 9 | super().__init__( 10 | chain=DumpFolderNames.ZOL_VEBEGADOL, 11 | chain_id="7290058173198", 12 | url_perfix="zolvebegadol", 13 | folder_name=folder_name, 14 | ) 15 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/quik.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.engines.publishprice import PublishPrice 2 | from il_supermarket_scarper.utils import DumpFolderNames 3 | 4 | 5 | # @FlakyScraper 6 | class Quik(PublishPrice): 7 | """scaper for quik""" 8 | 9 | def __init__(self, folder_name=None): 10 | super().__init__( 11 | chain=DumpFolderNames.QUIK, 12 | chain_id="7291029710008", 13 | site_infix="quik", 14 | folder_name=folder_name, 15 | ) 16 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/mega.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.engines.publishprice import PublishPrice 2 | from il_supermarket_scarper.utils import DumpFolderNames 3 | 4 | 5 | # removed : 1.7.2025 6 | class Mega(PublishPrice): 7 | """scraper for mege""" 8 | 9 | def __init__(self, folder_name=None): 10 | super().__init__( 11 | chain=DumpFolderNames.MEGA, 12 | chain_id="7290055700007", 13 | site_infix="mega", 14 | folder_name=folder_name, 15 | ) 16 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/ramilevy.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.engines import Cerberus 2 | from il_supermarket_scarper.utils import DumpFolderNames 3 | 4 | 5 | class RamiLevy(Cerberus): 6 | """scaper for rami levi""" 7 | 8 | def __init__(self, folder_name=None): 9 | 
super().__init__( 10 | chain=DumpFolderNames.RAMI_LEVY, 11 | chain_id="7290058140886", 12 | folder_name=folder_name, 13 | ftp_username="RamiLevi", 14 | max_threads=10, 15 | ) 16 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/machsani_ashuk.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.engines import Matrix 2 | from il_supermarket_scarper.utils import DumpFolderNames 3 | 4 | 5 | class MahsaniAShuk(Matrix): 6 | """scraper for masani hsuk""" 7 | 8 | def __init__(self, folder_name=None): 9 | super().__init__( 10 | chain=DumpFolderNames.MAHSANI_ASHUK, 11 | chain_id=["7290661400001", "7290633800006"], 12 | folder_name=folder_name, 13 | chain_hebrew_name="מחסני השוק", 14 | ) 15 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/shefa_barcart_ashem.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.engines import Bina 2 | from il_supermarket_scarper.utils import DumpFolderNames 3 | 4 | 5 | class ShefaBarcartAshem(Bina): 6 | """scraper for shefa berkat ashem""" 7 | 8 | def __init__(self, folder_name=None): 9 | super().__init__( 10 | chain=DumpFolderNames.SHEFA_BARCART_ASHEM, 11 | chain_id="7290058134977", 12 | url_perfix="shefabirkathashem", 13 | folder_name=folder_name, 14 | ) 15 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/bitan.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.engines.publishprice import PublishPrice 2 | from il_supermarket_scarper.utils import DumpFolderNames 3 | 4 | 5 | class YaynotBitanAndCarrefour(PublishPrice): 6 | """scaper for yaynot beitan""" 7 | 8 | def __init__(self, folder_name=None): 9 | super().__init__( 10 | chain=DumpFolderNames.YAYNO_BITAN_AND_CARREFOUR, 11 | chain_id="7290055700007", 12 | site_infix="carrefour", 13 | folder_name=folder_name, 14 | ) 15 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/salachdabach.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.engines import Cerberus 2 | from il_supermarket_scarper.utils import DumpFolderNames 3 | 4 | 5 | class SalachDabach(Cerberus): 6 | """scraper for salach dabach""" 7 | 8 | def __init__(self, folder_name=None): 9 | super().__init__( 10 | chain=DumpFolderNames.SALACH_DABACH, 11 | chain_id="7290526500006", 12 | folder_name=folder_name, 13 | ftp_username="SalachD", 14 | ftp_password="12345", 15 | ) 16 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/superdosh.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.engines import Cerberus 2 | from il_supermarket_scarper.utils import DumpFolderNames 3 | 4 | 5 | class FreshMarketAndSuperDosh(Cerberus): 6 | """scraper for fresh market and super dush""" 7 | 8 | def __init__(self, folder_name=None): 9 | super().__init__( 10 | chain=DumpFolderNames.FRESH_MARKET_AND_SUPER_DOSH, 11 | chain_id="7290876100000", 12 | folder_name=folder_name, 13 | ftp_username="freshmarket", 14 | ) 15 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/yellow.py: 
-------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.engines import Cerberus 2 | from il_supermarket_scarper.utils import DumpFolderNames 3 | 4 | 5 | class Yellow(Cerberus): 6 | """scraper for yellow""" 7 | 8 | def __init__(self, folder_name=None): 9 | super().__init__( 10 | chain=DumpFolderNames.YELLOW, 11 | chain_id="7290644700005", 12 | folder_name=folder_name, 13 | ftp_username="Paz_bo", 14 | ftp_password="paz468", 15 | max_threads=10, 16 | ) 17 | -------------------------------------------------------------------------------- /example.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper import ScarpingTask, ScraperFactory 2 | from il_supermarket_scarper.utils import _now, Logger 3 | 4 | Logger.set_logging_level("INFO") 5 | 6 | if __name__ == "__main__": 7 | scraper = ScarpingTask( 8 | dump_folder_name="dumps", 9 | lookup_in_db=False, 10 | multiprocessing=2, 11 | limit=1, 12 | enabled_scrapers=[ScraperFactory.BAREKET.name], 13 | # size_estimation_mode=True, # download files,log size, delete files 14 | when_date=_now(), 15 | ) 16 | scraper.start() 17 | -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | // Use IntelliSense to learn about possible attributes. 3 | // Hover to view descriptions of existing attributes. 4 | // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 5 | "version": "0.2.0", 6 | "configurations": [ 7 | { 8 | "name": "Debug Unit Test", 9 | "type": "python", 10 | "request": "test", 11 | "justMyCode": false, 12 | // "env": { 13 | // "DISABLED_SCRAPPERS" : "BAREKET" 14 | // } 15 | } 16 | ] 17 | } -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/super_yuda.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.engines import Cerberus 2 | from il_supermarket_scarper.utils import DumpFolderNames 3 | 4 | 5 | class SuperYuda(Cerberus): 6 | """scraper for super yuda""" 7 | 8 | def __init__(self, folder_name=None): 9 | super().__init__( 10 | chain=DumpFolderNames.SUPER_YUDA, 11 | chain_id=["7290058198450", "7290058177776"], 12 | ftp_username="yuda_ho", 13 | ftp_password="Yud@147", 14 | ftp_path="/Yuda", 15 | folder_name=folder_name, 16 | ) 17 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/nativ_hashed.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.engines.web import WebBase 2 | from il_supermarket_scarper.utils import DumpFolderNames 3 | 4 | 5 | # possible: NetivHased are down in Shabatz 6 | class NetivHased(WebBase): 7 | """scraper for nativ Hased""" 8 | 9 | utilize_date_param = False 10 | 11 | def __init__(self, folder_name=None): 12 | super().__init__( 13 | chain=DumpFolderNames.NETIV_HASED, 14 | chain_id="7290058160839", 15 | url="http://141.226.203.152/", 16 | folder_name=folder_name, 17 | ) 18 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/stop_market.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.engines import Cerberus 2 | from il_supermarket_scarper.utils import DumpFolderNames 3 | 
4 | 5 | class StopMarket(Cerberus): 6 | """scraper for stop market""" 7 | 8 | def __init__(self, folder_name=None): 9 | super().__init__( 10 | chain=DumpFolderNames.STOP_MARKET, 11 | chain_id=[ 12 | "72906390", 13 | "7290639000004", 14 | ], # in store files for some reason the store id is only 72906390 15 | folder_name=folder_name, 16 | ftp_username="Stop_Market", 17 | ) 18 | -------------------------------------------------------------------------------- /il_supermarket_scarper/utils/tests/test_connection.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | from il_supermarket_scarper.utils.connection import wget_file 4 | 5 | 6 | def test_wget_file_dont_exist(): 7 | """Test wget file that does not exist""" 8 | with pytest.raises(FileNotFoundError): 9 | wget_file( 10 | "https://pricesprodpublic.blob.core.windows.net/price/" 11 | "Price7290027600007-036-202503181800.gz?sv=2014-02-14&sr=b" 12 | "&sig=Me8hez2oy5vClACdE5fVOyyu5Qef%2FlEJSQYfMvQAOKg%3D&" 13 | "se=2025-03-18T18%3A02%3A59Z&sp=r", 14 | "some_file.gz", 15 | ) 16 | 17 | assert not os.path.exists("some_file.gz") 18 | -------------------------------------------------------------------------------- /il_supermarket_scarper/utils/tests/test_gzip_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | 4 | from il_supermarket_scarper.utils.gzip_utils import extract_xml_file_from_gz_file 5 | 6 | 7 | def test_unzip_bad_file(): 8 | """test unziping a bad file""" 9 | 10 | file_path = ( 11 | "il_supermarket_scarper/utils/tests/PriceFull7290876100000-003-202410070010.gz" 12 | ) 13 | file_content = None 14 | if os.path.exists(file_path): 15 | with open(file_path, "rb") as f: 16 | file_content = f.read() 17 | 18 | with pytest.raises(ValueError): 19 | extract_xml_file_from_gz_file(file_path) 20 | 21 | if file_content is not None and not os.path.exists(file_path): 22 | with open(file_path, "wb") as f: 23 | f.write(file_content) 24 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/cofix.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.engines import Cerberus 2 | from il_supermarket_scarper.utils import FileTypesFilters, DumpFolderNames 3 | 4 | 5 | class Cofix(Cerberus): 6 | """scraper for confix""" 7 | 8 | def __init__(self, folder_name=None): 9 | super().__init__( 10 | chain=DumpFolderNames.COFIX, 11 | chain_id="7291056200008", 12 | folder_name=folder_name, 13 | ftp_username="SuperCofixApp", 14 | ) 15 | 16 | def is_valid_file_empty(self, file_name): 17 | """it is valid the file is empty""" 18 | 19 | return super().is_valid_file_empty( 20 | file_name 21 | ) or FileTypesFilters.is_file_from_type( 22 | file_name, FileTypesFilters.STORE_FILE.name 23 | ) 24 | -------------------------------------------------------------------------------- /.github/workflows/pylint.yml: -------------------------------------------------------------------------------- 1 | name: Pylint 2 | 3 | on: 4 | push: 5 | branches: [ "main" ] 6 | pull_request: 7 | branches: [ "main" ] 8 | 9 | concurrency: 10 | group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} 11 | cancel-in-progress: true 12 | 13 | jobs: 14 | build: 15 | runs-on: ubuntu-latest 16 | strategy: 17 | matrix: 18 | python-version: ["3.8"] 19 | steps: 20 | - uses: actions/checkout@v3 21 | - name: Set up Python ${{ 
matrix.python-version }} 22 | uses: actions/setup-python@v3 23 | with: 24 | python-version: ${{ matrix.python-version }} 25 | - name: Install dependencies 26 | run: | 27 | python -m pip install --upgrade pip 28 | pip install pylint 29 | - name: Analysing the code with pylint 30 | run: | 31 | pylint $(git ls-files '*.py') --disable=E0401,R0801,R0903,W0707,R0917,C0114 32 | -------------------------------------------------------------------------------- /il_supermarket_scarper/utils/databases/base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | 4 | class AbstractDataBase(ABC): 5 | """Abstract base class for database operations.""" 6 | 7 | def __init__(self, database_name, collection_status=False) -> None: 8 | self.database_name = database_name.replace(" ", "_").lower() 9 | self.collection_status = collection_status 10 | 11 | def enable_collection_status(self): 12 | """Enable data collection to the database.""" 13 | self.collection_status = True 14 | 15 | @abstractmethod 16 | def insert_document(self, collection_name, document): 17 | """Insert a document into a collection.""" 18 | 19 | @abstractmethod 20 | def find_document(self, collection_name, query): 21 | """Find a document in a collection based on a query.""" 22 | 23 | def is_collection_enabled(self): 24 | """Check if collection is enabled.""" 25 | return self.collection_status 26 | 27 | def set_collection_status(self, status): 28 | """Enable data collection to JSON storage.""" 29 | self.collection_status = status 30 | -------------------------------------------------------------------------------- /il_supermarket_scarper/utils/tests/test_file_type.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.utils import FileTypesFilters 2 | 3 | 4 | def test_file_type(): 5 | """test prasing file name to enum""" 6 | assert ( 7 | FileTypesFilters.get_type_from_file("Price7290058108879-339-202409181941") 8 | == FileTypesFilters.PRICE_FILE 9 | ) 10 | assert ( 11 | FileTypesFilters.get_type_from_file("PriceFull7290058108879-339-202409181041") 12 | == FileTypesFilters.PRICE_FULL_FILE 13 | ) 14 | 15 | assert ( 16 | FileTypesFilters.get_type_from_file("StoresFull7290058108879-000-202409181041") 17 | == FileTypesFilters.STORE_FILE 18 | ) 19 | assert ( 20 | FileTypesFilters.get_type_from_file("Promo7290058108879-336-202409181544") 21 | == FileTypesFilters.PROMO_FILE 22 | ) 23 | assert ( 24 | FileTypesFilters.get_type_from_file("PromoFull7290058108879-339-202409181149") 25 | == FileTypesFilters.PROMO_FULL_FILE 26 | ) 27 | assert ( 28 | FileTypesFilters.get_type_from_file("Proasdull7290058108879-339-202409181149") 29 | is None 30 | ) 31 | -------------------------------------------------------------------------------- /il_supermarket_scarper/utils/tests/test_status.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | from il_supermarket_scarper.utils.status import ( 4 | get_status, 5 | get_status_date, 6 | get_statue_page, 7 | ) 8 | from il_supermarket_scarper.utils.connection import disable_when_outside_israel 9 | from il_supermarket_scarper.utils.validation import show_text_diff 10 | 11 | 12 | @disable_when_outside_israel 13 | def test_status(): 14 | """check able to get the number of scrapers from gov.il""" 15 | num_of_scarpers = get_status() 16 | assert isinstance(num_of_scarpers, int) 17 | 18 | 19 | @disable_when_outside_israel 20 | def 
test_status_date(): 21 | """check able the get the date the gov.il site was updated""" 22 | date = get_status_date() 23 | assert isinstance(date, datetime.datetime) 24 | 25 | 26 | @disable_when_outside_israel 27 | def test_page_complete_diff(): 28 | """make sure the page content is the same as the cached page""" 29 | cached = get_statue_page(extraction_type="all_text", source="cache") 30 | current = get_statue_page(extraction_type="all_text", source="gov.il") 31 | assert current == cached, show_text_diff(cached, current) 32 | -------------------------------------------------------------------------------- /il_supermarket_scarper/tests/test_scrappers_factory.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper import ScraperStability, ScraperFactory, datetime_in_tlv 2 | from il_supermarket_scarper.utils import _is_saturday_in_israel 3 | 4 | 5 | def test_stable_scraper(): 6 | """test sample stable scarper""" 7 | assert not ScraperStability.is_validate_scraper_found_no_files( 8 | ScraperFactory.VICTORY.name 9 | ) 10 | 11 | 12 | # def test_after_date(): 13 | # """test scrapers that failed after date""" 14 | # assert ScraperStability.is_validate_scraper_found_no_files( 15 | # ScraperFactory.CITY_MARKET_GIVATAYIM.name, 16 | # when_date=datetime_in_tlv(2024, 12, 12, 0, 0, 0), 17 | # ) 18 | 19 | 20 | def test_not_active(): 21 | """test grap between active and not""" 22 | test_date = datetime_in_tlv(2024, 12, 12, 0, 0, 0) 23 | all_listed = ScraperFactory.all_listed_scrappers() 24 | all_active = ScraperFactory.all_scrapers_name(when_date=test_date) 25 | 26 | expected_to_fail = 0 27 | if _is_saturday_in_israel(test_date): 28 | expected_to_fail += 1 # only 'NetivHased' should 29 | 30 | assert len(set(all_listed) - set(all_active)) == expected_to_fail 31 | -------------------------------------------------------------------------------- /il_supermarket_scarper/utils/lock_utils.py: -------------------------------------------------------------------------------- 1 | from threading import Lock 2 | from functools import wraps 3 | 4 | 5 | class LockManager: 6 | """Manages locks based on string values.""" 7 | 8 | def __init__(self): 9 | self.locks = {} 10 | 11 | def get_lock(self, key): 12 | """Get or create a lock based on the string key.""" 13 | if key not in self.locks: 14 | self.locks[key] = Lock() 15 | return self.locks[key] 16 | 17 | 18 | lock_manager = LockManager() 19 | 20 | 21 | def lock_by_string(): 22 | """ 23 | Decorator to apply a lock based on a string key. 24 | :param lock_key_func: A function that returns the string key for which the lock will be applied. 
25 | """ 26 | 27 | def decorator(func): 28 | @wraps(func) 29 | def wrapper(scraper_status, *args, **kwargs): 30 | # Get the key for which to acquire the lock (based on the arguments) 31 | lock_key = scraper_status.chain.value 32 | lock = lock_manager.get_lock(lock_key) 33 | 34 | with lock: 35 | return func(scraper_status, *args, **kwargs) 36 | 37 | return wrapper 38 | 39 | return decorator 40 | -------------------------------------------------------------------------------- /il_supermarket_scarper/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .gzip_utils import extract_xml_file_from_gz_file 2 | from .logger import Logger 3 | from .status import ( 4 | get_output_folder, 5 | clean_dump_folder, 6 | summerize_dump_folder_contant, 7 | _is_saturday_in_israel, 8 | _is_holiday_in_israel, 9 | _is_weekend_in_israel, 10 | _now, 11 | datetime_in_tlv, 12 | _testing_now, 13 | hour_files_expected_to_be_accassible, 14 | ) 15 | from .scraper_status import ScraperStatus 16 | from .file_types import FileTypesFilters 17 | from .connection import ( 18 | download_connection_retry, 19 | url_connection_retry, 20 | disable_when_outside_israel, 21 | session_with_cookies, 22 | url_retrieve, 23 | collect_from_ftp, 24 | fetch_temporary_gz_file_from_ftp, 25 | wget_file, 26 | ) 27 | from .loop import execute_in_parallel, multiple_page_aggregtion 28 | from .exceptions import RestartSessionError 29 | from .retry import retry_files 30 | from .validation import is_valid_chain_name, change_xml_encoding 31 | from .folders_name import DumpFolderNames 32 | from .lock_utils import LockManager, lock_by_string 33 | from .status import convert_unit, UnitSize, convert_nl_size_to_bytes, string_to_float 34 | -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | # This workflow uses actions that are not certified by GitHub. 5 | # They are provided by a third-party and are governed by 6 | # separate terms of service, privacy policy, and support 7 | # documentation. 
8 | 9 | name: Upload Python Package 10 | 11 | on: 12 | release: 13 | types: [published] 14 | 15 | permissions: 16 | contents: read 17 | 18 | jobs: 19 | deploy: 20 | 21 | runs-on: ubuntu-latest 22 | 23 | steps: 24 | - uses: actions/checkout@v3 25 | - name: Set up Python 26 | uses: actions/setup-python@v3 27 | with: 28 | python-version: '3.x' 29 | - name: Install dependencies 30 | run: | 31 | python -m pip install --upgrade pip 32 | pip install build 33 | - name: Build package 34 | run: python -m build 35 | - name: Publish package 36 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 37 | with: 38 | user: __token__ 39 | password: ${{ secrets.PYPI }} 40 | -------------------------------------------------------------------------------- /tests/test_integration.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | from il_supermarket_scarper.utils.status import ( 3 | get_status, 4 | get_status_date, 5 | ) 6 | from il_supermarket_scarper.scrappers_factory import ScraperFactory 7 | from il_supermarket_scarper.utils import disable_when_outside_israel, DumpFolderNames 8 | 9 | 10 | def test_scrapers_folders_match(): 11 | """test the number of scrapers are the same as listed at the gov.il site""" 12 | scrapers_keys = ScraperFactory.all_scrapers_name() 13 | dump_keys = DumpFolderNames.all_folders_names() 14 | 15 | assert set(scrapers_keys) & set(dump_keys) == set(scrapers_keys) 16 | assert set(scrapers_keys) - set(dump_keys) == set() 17 | 18 | 19 | @disable_when_outside_israel 20 | def test_scrapers_are_updated(): 21 | """test the number of scrapers are the same as listed at the gov.il site""" 22 | num_of_scarper_listed = len(ScraperFactory.all_listed_scrappers()) 23 | num_of_scarper_on_gov_site = get_status() 24 | 25 | assert num_of_scarper_listed == num_of_scarper_on_gov_site 26 | 27 | 28 | @disable_when_outside_israel 29 | def test_update_date(): 30 | """test date the site update""" 31 | date = get_status_date() 32 | assert date.date() == datetime.datetime(2025, 7, 1).date(), "gov il site changed" 33 | -------------------------------------------------------------------------------- /.github/workflows/user-validation.yml: -------------------------------------------------------------------------------- 1 | name: Reject PR with IgnoreList 2 | on: 3 | pull_request: 4 | types: [opened, edited, synchronize] 5 | 6 | jobs: 7 | check_username: 8 | runs-on: ubuntu-latest 9 | env: 10 | IGNORE_USERS: ${{ secrets.IGNORE_USERS }} 11 | 12 | steps: 13 | - name: Checkout code 14 | uses: actions/checkout@v3 15 | 16 | - name: Fetch all branches 17 | run: git fetch --all 18 | 19 | - name: Check for restricted authors in commits 20 | id: check_commit_authors 21 | run: | 22 | # Convert IGNORE_USERS to an array 23 | IFS=',' read -ra IGNORED_USERS <<< "$IGNORE_USERS" 24 | 25 | # Get the commit authors in the pull request 26 | COMMIT_AUTHORS=$(git log --pretty=format:"%an" origin/main..HEAD) 27 | 28 | # Check if any commit author matches an ignored user 29 | for AUTHOR in "${IGNORED_USERS[@]}"; do 30 | if echo "$COMMIT_AUTHORS" | grep -iq "^$AUTHOR$"; then 31 | echo "Restricted author '$AUTHOR' found in commits." 32 | exit 1 33 | fi 34 | done 35 | 36 | - name: PR Rejected 37 | if: failure() 38 | run: | 39 | echo "This PR contains commits by restricted authors." 
40 | exit 1 41 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/__init__.py: -------------------------------------------------------------------------------- 1 | from .bareket import Bareket 2 | from .bitan import YaynotBitanAndCarrefour 3 | from .cofix import Cofix 4 | from .city_market import ( 5 | CityMarketGivatayim, 6 | CityMarketKirtatOno, 7 | CityMarketKiryatGat, 8 | CityMarketShops, 9 | ) 10 | from .doralon import DorAlon 11 | from .good_pharm import GoodPharm 12 | from .hazihinam import HaziHinam 13 | from .het_cohen import HetCohen 14 | from .keshet import Keshet 15 | from .king_store import KingStore 16 | from .maayan2000 import Maayan2000 17 | from .machsani_ashuk import MahsaniAShuk 18 | from .mega import Mega 19 | from .meshnat_yosef import MeshnatYosef1, MeshnatYosef2 20 | from .nativ_hashed import NetivHased 21 | from .osherad import Osherad 22 | from .polizer import Polizer 23 | from .ramilevy import RamiLevy 24 | from .salachdabach import SalachDabach 25 | from .shefa_barcart_ashem import ShefaBarcartAshem 26 | from .shufersal import Shufersal 27 | from .shuk_ahir import ShukAhir 28 | from .stop_market import StopMarket 29 | from .super_pharm import SuperPharm 30 | from .super_yuda import SuperYuda 31 | from .super_sapir import SuperSapir 32 | from .superdosh import FreshMarketAndSuperDosh 33 | from .quik import Quik 34 | from .tivtaam import TivTaam 35 | from .victory import Victory 36 | from .yellow import Yellow 37 | from .yohananof import Yohananof 38 | from .zolvebegadol import ZolVeBegadol 39 | from .wolt import Wolt 40 | -------------------------------------------------------------------------------- /tests/test_main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import tempfile 4 | 5 | from il_supermarket_scarper.main import ScarpingTask 6 | from il_supermarket_scarper.scrappers_factory import ScraperFactory 7 | 8 | 9 | def test_main_with_limit(): 10 | """test the main running with limit of 1 for each chain""" 11 | with tempfile.TemporaryDirectory() as tmpdirname: 12 | expected = ScraperFactory.all_scrapers_name() + ["status"] 13 | scrapper_done = ScarpingTask(limit=1, dump_folder_name=tmpdirname).start() 14 | 15 | folders_from_scraper = list(map(lambda x: x.split("/")[-1], scrapper_done)) + [ 16 | "status" 17 | ] 18 | time.sleep(5) 19 | folders_in_dump_folder = os.listdir(tmpdirname) 20 | folders_in_dump_folder = [ 21 | name for name in folders_in_dump_folder if not name.startswith(".") 22 | ] 23 | assert len(folders_in_dump_folder) == len(expected) 24 | assert sorted(folders_from_scraper) == sorted(folders_in_dump_folder) 25 | 26 | 27 | def test_main_with_one_scarper(): 28 | """the limit only for enabled scarpers""" 29 | scrapper_done = ScarpingTask( 30 | limit=1, enabled_scrapers=ScraperFactory.sample(n=1) 31 | ).start() 32 | assert len(scrapper_done) == 1 33 | 34 | 35 | def test_main_with_size_estimation_mode(): 36 | """test size estmation mode""" 37 | scrapper_done = ScarpingTask( 38 | limit=1, size_estimation_mode=True, enabled_scrapers=ScraperFactory.sample(n=1) 39 | ).start() 40 | assert len(scrapper_done) == 1 41 | -------------------------------------------------------------------------------- /.github/workflows/docker-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow uses actions that are not certified by GitHub. 
2 | # They are provided by a third-party and are governed by 3 | # separate terms of service, privacy policy, and support 4 | # documentation. 5 | 6 | # GitHub recommends pinning actions to a commit SHA. 7 | # To get a newer version, you will need to update the SHA. 8 | # You can also reference a tag or branch, but the action may change without warning. 9 | 10 | name: Publish Docker image 11 | 12 | on: 13 | release: 14 | types: [published] 15 | 16 | jobs: 17 | push_to_registry: 18 | name: Push Docker image to Docker Hub 19 | runs-on: ubuntu-latest 20 | steps: 21 | - name: Check out the repo 22 | uses: actions/checkout@v3 23 | 24 | - name: Log in to Docker Hub 25 | uses: docker/login-action@f054a8b539a109f9f41c372932f1ae047eff08c9 26 | with: 27 | username: ${{ secrets.DOCKER_USERNAME }} 28 | password: ${{ secrets.DOCKER_PASSWORD }} 29 | 30 | - name: Extract metadata (tags, labels) for Docker 31 | id: meta 32 | uses: docker/metadata-action@98669ae865ea3cffbcbaa878cf57c20bbf1c6c38 33 | with: 34 | images: erlichsefi/israeli-supermarket-scarpers 35 | 36 | - name: Build and push Docker image 37 | uses: docker/build-push-action@ad44023a93711e3deb337508980b4b5e9bcdc5dc 38 | with: 39 | context: . 40 | target: prod 41 | push: true 42 | tags: ${{ steps.meta.outputs.tags }} 43 | labels: ${{ steps.meta.outputs.labels }} -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | #syntax=docker/dockerfile:1 2 | 3 | FROM node:20.19.5-bookworm-slim as base 4 | ARG PY_VERSION="3.11.0" 5 | 6 | # setting the enviroment 7 | RUN apt-get update --fix-missing -y && \ 8 | apt-get install cron -y && \ 9 | apt-get install libxml2-dev -y && \ 10 | apt-get install libxslt-dev -y 11 | 12 | 13 | # setting python and more 14 | RUN apt-get install python3-pip -y && \ 15 | apt-get install dieharder -y && \ 16 | apt-get install wget -y && \ 17 | apt-get clean && \ 18 | apt-get autoremove 19 | 20 | # setup python 21 | ENV HOME="/root" 22 | WORKDIR ${HOME} 23 | RUN apt-get install -y git libbz2-dev libncurses-dev libreadline-dev libffi-dev libssl-dev 24 | RUN git clone --depth=1 https://github.com/pyenv/pyenv.git .pyenv 25 | ENV PYENV_ROOT="${HOME}/.pyenv" 26 | ENV PATH="${PYENV_ROOT}/shims:${PYENV_ROOT}/bin:${PATH}" 27 | 28 | RUN pyenv install $PY_VERSION 29 | RUN pyenv global $PY_VERSION 30 | 31 | # setup code 32 | WORKDIR /usr/src/app 33 | COPY . . 34 | RUN python -m pip install . 35 | 36 | 37 | VOLUME ["/usr/src/app/dumps"] 38 | 39 | # development container 40 | FROM base as dev 41 | RUN apt-get -y install git 42 | RUN pip install black 43 | RUN pip install pylint 44 | 45 | 46 | # production image 47 | FROM base as prod 48 | 49 | # ADD crontab /etc/cron.d 50 | # RUN chmod 0644 /etc/cron.d/crontab 51 | # RUN crontab /etc/cron.d/crontab 52 | # RUN touch /var/log/cron.log 53 | # && cron & tail -f /var/log/cron.log 54 | CMD python main.py 55 | 56 | # run test 57 | FROM base as test 58 | 59 | # playwrite 60 | RUN npx -y playwright@1.53.0 install --with-deps 61 | RUN python -m playwright install 62 | 63 | RUN python -m pip install . 
".[test]" 64 | CMD python -m pytest -vv -n 2 65 | 66 | -------------------------------------------------------------------------------- /.github/workflows/test-suite.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Unit & Integration Tests 5 | # env: 6 | # DISABLED_SCRAPPERS: BAREKET 7 | 8 | on: 9 | push: 10 | branches: [ "main" ] 11 | pull_request: 12 | branches: [ "main" ] 13 | schedule: 14 | # * is a special character in YAML so you have to quote this string 15 | - cron: '00 17 * * *' 16 | 17 | concurrency: 18 | group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} 19 | cancel-in-progress: true 20 | 21 | 22 | jobs: 23 | build: 24 | 25 | runs-on: self-hosted 26 | strategy: 27 | fail-fast: false 28 | matrix: 29 | python-version: ["3.11.0"] 30 | 31 | steps: 32 | 33 | - name: Checkout 34 | uses: actions/checkout@v3 35 | - name: Free disk space 36 | run: | 37 | df --human-readable 38 | docker 2>/dev/null 1>&2 rmi $(docker image ls --all --quiet) || true 39 | rm --recursive --force "$AGENT_TOOLSDIRECTORY" 40 | df --human-readable 41 | - name: Build with Docker 42 | run: docker build -t erlichsefi/israeli-supermarket-scarpers:test --target test . 43 | - name: Remove all build 44 | run: (docker stop scraper-test-run 2>/dev/null || true) && (docker rm scraper-test-run 2>/dev/null || true) 45 | - name: Test with pytest 46 | run: docker run --rm --name scraper-test-run -e DISABLED_SCRAPPERS="${{ env.DISABLED_SCRAPPERS }}" erlichsefi/israeli-supermarket-scarpers:test && 47 | docker builder prune -f 48 | -------------------------------------------------------------------------------- /il_supermarket_scarper/main.py: -------------------------------------------------------------------------------- 1 | from .scrapper_runner import MainScrapperRunner 2 | from .utils.file_types import FileTypesFilters 3 | 4 | 5 | class ScarpingTask: # pylint: disable=too-many-instance-attributes 6 | """scraping task encapsulated""" 7 | 8 | def __init__( 9 | self, 10 | size_estimation_mode=False, 11 | enabled_scrapers=None, 12 | limit=None, 13 | when_date=None, 14 | files_types=FileTypesFilters.all_types(), 15 | dump_folder_name=None, 16 | lookup_in_db=True, 17 | multiprocessing=5, 18 | suppress_exception=False, 19 | min_size=None, 20 | max_size=None, 21 | ): 22 | """define the runner""" 23 | self.runner = MainScrapperRunner( 24 | size_estimation_mode=size_estimation_mode, 25 | enabled_scrapers=enabled_scrapers, 26 | dump_folder_name=dump_folder_name, 27 | lookup_in_db=lookup_in_db, 28 | multiprocessing=multiprocessing, 29 | ) 30 | self.dump_folder_name = dump_folder_name 31 | self.limit = limit 32 | self.files_types = files_types 33 | self.when_date = when_date 34 | self.suppress_exception = suppress_exception 35 | self.min_size = min_size 36 | self.max_size = max_size 37 | 38 | def get_dump_folder_name(self): 39 | """get the dump folder name""" 40 | return self.dump_folder_name 41 | 42 | def start(self): 43 | """run the scraping""" 44 | return self.runner.run( 45 | limit=self.limit, 46 | files_types=self.files_types, 47 | when_date=self.when_date, 48 | suppress_exception=self.suppress_exception, 49 | min_size=self.min_size, 50 | max_size=self.max_size, 51 | ) 52 | 
-------------------------------------------------------------------------------- /il_supermarket_scarper/utils/folders_name.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | 4 | class DumpFolderNames(Enum): 5 | """all the folder that files will be download to""" 6 | 7 | BAREKET = "Bareket" 8 | # YAYNO_BITAN = "YaynotBitan" 9 | YAYNO_BITAN_AND_CARREFOUR = "YaynotBitanAndCarrefour" 10 | COFIX = "Cofix" 11 | CITY_MARKET_GIVATAYIM = "CityMarketGivatayim" 12 | CITY_MARKET_KIRYATONO = "CityMarketKiryatOno" 13 | CITY_MARKET_KIRYATGAT = "CityMarketKiryatGat" 14 | CITY_MARKET_SHOPS = "CityMarketShops" 15 | DOR_ALON = "DorAlon" 16 | GOOD_PHARM = "GoodPharm" 17 | HAZI_HINAM = "HaziHinam" 18 | HET_COHEN = "HetCohen" 19 | KESHET = "Keshet" 20 | KING_STORE = "KingStore" 21 | MAAYAN_2000 = "Maayan2000" 22 | MAHSANI_ASHUK = "MahsaniAShuk" 23 | MEGA = "Mega" 24 | NETIV_HASED = "NetivHased" 25 | MESHMAT_YOSEF_1 = "MeshnatYosef1" 26 | MESHMAT_YOSEF_2 = "MeshnatYosef2" 27 | OSHER_AD = "Osherad" 28 | POLIZER = "Polizer" 29 | RAMI_LEVY = "RamiLevy" 30 | SALACH_DABACH = "SalachDabach" 31 | SHEFA_BARCART_ASHEM = "ShefaBarcartAshem" 32 | SHUFERSAL = "Shufersal" 33 | SHUK_AHIR = "ShukAhir" 34 | STOP_MARKET = "StopMarket" 35 | SUPER_PHARM = "SuperPharm" 36 | SUPER_YUDA = "SuperYuda" 37 | SUPER_SAPIR = "SuperSapir" 38 | FRESH_MARKET_AND_SUPER_DOSH = "FreshMarketAndSuperDosh" 39 | QUIK = "Quik" 40 | TIV_TAAM = "TivTaam" 41 | VICTORY = "Victory" 42 | YELLOW = "Yellow" 43 | YOHANANOF = "Yohananof" 44 | ZOL_VEBEGADOL = "ZolVeBegadol" 45 | WOLT = "Wolt" 46 | 47 | @classmethod 48 | def is_valid_folder_name(cls, member): 49 | """check if an folder is part of the cls""" 50 | return isinstance(member, DumpFolderNames) 51 | 52 | @classmethod 53 | def all_folders_names(cls): 54 | """get the name of all listed folders""" 55 | return [e.name for e in cls] 56 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/shufersal.py: -------------------------------------------------------------------------------- 1 | import urllib.parse 2 | 3 | from il_supermarket_scarper.engines import MultiPageWeb 4 | from il_supermarket_scarper.utils import DumpFolderNames, FileTypesFilters 5 | 6 | 7 | class Shufersal(MultiPageWeb): 8 | """scaper for shufersal""" 9 | 10 | utilize_date_param = False 11 | 12 | def __init__(self, folder_name=None): 13 | super().__init__( 14 | url="https://prices.shufersal.co.il/", 15 | total_page_xpath="""//*[@id="gridContainer"]/table/tfoot/tr/td/a[6]/@href""", 16 | total_pages_pattern=r"[?&]page=([0-9]+)", 17 | chain=DumpFolderNames.SHUFERSAL, 18 | chain_id="7290027600007", 19 | folder_name=folder_name, 20 | page_argument="&page", 21 | ) 22 | 23 | def get_file_types_id(self, files_types=None): 24 | """get the file type id""" 25 | if files_types is None: 26 | return ["0"] 27 | 28 | types = [] 29 | for ftype in files_types: 30 | if ftype == FileTypesFilters.STORE_FILE.name: 31 | types.append("5") 32 | if ftype == FileTypesFilters.PRICE_FILE.name: 33 | types.append("1") 34 | if ftype == FileTypesFilters.PROMO_FILE.name: 35 | types.append("3") 36 | if ftype == FileTypesFilters.PRICE_FULL_FILE.name: 37 | types.append("2") 38 | if ftype == FileTypesFilters.PROMO_FULL_FILE.name: 39 | types.append("4") 40 | return types 41 | 42 | def build_params(self, files_types=None, store_id=None, when_date=None): 43 | """build the params for the request""" 44 | params = {"catID": 
",".join(self.get_file_types_id(files_types))} 45 | 46 | if store_id: 47 | params["storeId"] = store_id 48 | return [f"/FileObject/UpdateCategory?{urllib.parse.urlencode(params)}"] 49 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from io import open 2 | from setuptools import setup 3 | 4 | with open("README.md", encoding="utf-8") as f: 5 | long_description = "\n" + f.read() 6 | 7 | with open("requirements.txt", encoding="utf-8") as f: 8 | required = f.read().splitlines() 9 | 10 | with open("requirements-dev.txt", encoding="utf-8") as f: 11 | dev_required = f.read().splitlines() 12 | 13 | setup( 14 | # Needed to silence warnings (and to be a worthwhile package) 15 | name="il-supermarket-scraper", 16 | url="https://github.com/OpenIsraeliSupermarkets/israeli-supermarket-scarpers", 17 | author="Sefi Erlich", 18 | author_email="erlichsefi@gmail.com", 19 | # Needed to actually package something 20 | packages=[ 21 | "il_supermarket_scarper", 22 | "il_supermarket_scarper.engines", 23 | "il_supermarket_scarper.scrappers", 24 | "il_supermarket_scarper.utils", 25 | "il_supermarket_scarper.utils.databases", 26 | ], 27 | # Needed for dependencies 28 | install_requires=required, 29 | tests_require=dev_required, 30 | extras_require={"test": ["pytest", "pytest-xdist"]}, 31 | # *strongly* suggested for sharing 32 | version="0.6.3", 33 | # The license can be anything you like 34 | license="MIT", 35 | description="python package that implement a scraping for israeli supermarket data", 36 | # We will also need a readme eventually (there will be a warning) 37 | long_description=long_description, 38 | long_description_content_type="text/markdown", 39 | keywords=["israel", "israeli", "scraper", "supermarket"], 40 | classifiers=[ 41 | "Development Status :: 3 - Alpha", 42 | "Intended Audience :: Developers", 43 | "Topic :: Software Development :: Build Tools", 44 | "License :: OSI Approved :: MIT License", 45 | "Programming Language :: Python :: 3", 46 | "Programming Language :: Python :: 3.4", 47 | "Programming Language :: Python :: 3.5", 48 | "Programming Language :: Python :: 3.6", 49 | ], 50 | ) 51 | -------------------------------------------------------------------------------- /il_supermarket_scarper/utils/databases/mongo.py: -------------------------------------------------------------------------------- 1 | import os 2 | from ..logger import Logger 3 | from .base import AbstractDataBase 4 | 5 | 6 | PYMONGO_INSTALLED = True 7 | try: 8 | import pymongo 9 | from pymongo.errors import ServerSelectionTimeoutError 10 | except ImportError: 11 | PYMONGO_INSTALLED = False 12 | 13 | 14 | class MongoDataBase(AbstractDataBase): 15 | """A class that represents a MongoDB database.""" 16 | 17 | def __init__(self, database_name) -> None: 18 | super().__init__(database_name) 19 | self.myclient = None 20 | self.store_db = None 21 | 22 | def create_connection(self): 23 | """Create a connection to the MongoDB database.""" 24 | if PYMONGO_INSTALLED: 25 | url = os.environ.get("MONGO_URL", "localhost") 26 | port = os.environ.get("MONGO_PORT", "27017") 27 | self.myclient = pymongo.MongoClient(f"mongodb://{url}:{port}/") 28 | self.store_db = self.myclient[self.database_name] 29 | 30 | def enable_collection_status(self): 31 | """Enable data collection to MongoDB.""" 32 | if PYMONGO_INSTALLED: 33 | self.set_collection_status(True) 34 | self.create_connection() 35 | else: 36 | Logger.info("Can't 
enable collection. Please install pymongo.") 37 | 38 | def insert_document(self, collection_name, document): 39 | """Insert a document into a MongoDB collection.""" 40 | if self.is_collection_enabled(): 41 | try: 42 | self.store_db[collection_name].insert_one(document) 43 | except ServerSelectionTimeoutError: 44 | self.set_collection_status(False) 45 | Logger.error( 46 | "Failed to connect to MongoDB. Collection status disabled." 47 | ) 48 | 49 | def find_document(self, collection_name, query): 50 | """Find a document in a MongoDB collection.""" 51 | if self.is_collection_enabled(): 52 | return self.store_db[collection_name].find_one(query) 53 | return None 54 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/meshnat_yosef.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from il_supermarket_scarper.engines.web import WebBase 4 | from il_supermarket_scarper.engines import Bina 5 | 6 | from il_supermarket_scarper.utils import DumpFolderNames, Logger 7 | 8 | 9 | class MeshnatYosef1(WebBase): 10 | """scraper for meshnat yoosef""" 11 | 12 | def __init__(self, folder_name=None): 13 | super().__init__( 14 | DumpFolderNames.MESHMAT_YOSEF_1, 15 | chain_id="5144744100002", 16 | url="https://list-files.w5871031-kt.workers.dev/", 17 | folder_name=folder_name, 18 | ) 19 | 20 | def get_data_from_page(self, req_res): 21 | """get the file list from a page""" 22 | response = json.loads(req_res.text) 23 | return response 24 | 25 | def get_file_size_from_entry(self, entry): 26 | """ 27 | Extract file size from a JSON entry. 28 | Returns size in bytes, or None if not found. 29 | """ 30 | # Meshnat Yosef don't support file size in the entry 31 | return None 32 | 33 | def extract_task_from_entry(self, all_trs): 34 | """extract download links, file names, and file sizes from page list""" 35 | download_urls = [] 36 | file_names = [] 37 | file_sizes = [] 38 | for x in all_trs: 39 | try: 40 | download_urls.append(x["url"]) 41 | file_names.append(x["name"]) 42 | file_sizes.append(self.get_file_size_from_entry(x)) 43 | except (AttributeError, KeyError, IndexError, TypeError) as e: 44 | Logger.warning(f"Error extracting task from entry: {e}") 45 | 46 | return download_urls, file_names, file_sizes 47 | 48 | 49 | class MeshnatYosef2(Bina): 50 | """scaper for Meshnat Yosef""" 51 | 52 | def __init__(self, folder_name=None): 53 | super().__init__( 54 | DumpFolderNames.MESHMAT_YOSEF_2, 55 | chain_id=["5144744100001", "7290058289400"], 56 | url_perfix="ktshivuk", 57 | folder_name=folder_name, 58 | ) 59 | -------------------------------------------------------------------------------- /il_supermarket_scarper/utils/loop.py: -------------------------------------------------------------------------------- 1 | import concurrent.futures 2 | from .logger import Logger 3 | 4 | 5 | def defualt_aggregtion_function(all_done): 6 | """format the scraping result to the final input""" 7 | result = [] 8 | for response in all_done: 9 | _response = response 10 | if hasattr(_response, "result"): 11 | _response = _response.result() 12 | result.append(_response) 13 | return result 14 | 15 | 16 | def multiple_page_aggregtion(pages_to_scrape): 17 | """format the scraping result to the final input for multipage""" 18 | download_urls = [] 19 | file_names = [] 20 | file_sizes = [] 21 | for result in pages_to_scrape: 22 | if hasattr(result, "result"): 23 | page_result = result.result() 24 | else: 25 | page_result = result 26 | 
page_download_urls, page_file_names, page_file_sizes = page_result 27 | file_sizes.extend(page_file_sizes) 28 | download_urls.extend(page_download_urls) 29 | file_names.extend(page_file_names) 30 | return download_urls, file_names, file_sizes 31 | 32 | 33 | def execute_in_parallel( 34 | function_to_execute, 35 | iterable, 36 | max_threads=None, 37 | aggregtion_function=defualt_aggregtion_function, 38 | ): 39 | """execute a job in the event loop""" 40 | 41 | Logger.info(f"Running {len(iterable)} tasks in parallel") 42 | results = run_tasks( 43 | function_to_execute, 44 | iterable, 45 | max_threads=max_threads, 46 | ) 47 | 48 | all_done = aggregtion_function(results) 49 | Logger.info(f"Done with {len(all_done)} tasks in parallel") 50 | return all_done 51 | 52 | 53 | def run_tasks( 54 | function_to_execute, 55 | iterable, 56 | max_threads: int = None, 57 | ): 58 | """Run tasks in multi-thread or sequentially""" 59 | if max_threads: 60 | # Use multi-thread 61 | with concurrent.futures.ThreadPoolExecutor( 62 | max_workers=max_threads, thread_name_prefix="PullingThread" 63 | ) as executor: 64 | futures = [executor.submit(function_to_execute, arg) for arg in iterable] 65 | return [ 66 | future.result() for future in concurrent.futures.as_completed(futures) 67 | ] 68 | else: 69 | # Or just iterate over all 70 | return [function_to_execute(arg) for arg in iterable] 71 | -------------------------------------------------------------------------------- /il_supermarket_scarper/engines/apsx.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from il_supermarket_scarper.utils import Logger 3 | 4 | from .web import WebBase 5 | 6 | 7 | class Aspx(WebBase, ABC): 8 | """class for aspx scapers""" 9 | 10 | def __init__( 11 | self, chain, chain_id, url, aspx_page, folder_name=None, max_threads=5 12 | ): 13 | super().__init__( 14 | chain, chain_id, url, folder_name=folder_name, max_threads=max_threads 15 | ) 16 | self.aspx_page = aspx_page 17 | 18 | def extract_task_from_entry(self, all_trs): 19 | """from the trs extract the download urls, file names, and file sizes""" 20 | 21 | download_urls = [] 22 | file_names = [] 23 | file_sizes = [] 24 | for x in all_trs: 25 | try: 26 | download_url = self.url + self.get_href_from_entry(x) 27 | download_urls.append(download_url) 28 | file_names.append(self.get_file_name_no_ext_from_entry(download_url)) 29 | file_sizes.append(self.get_file_size_from_entry(x)) 30 | except (AttributeError, KeyError, IndexError, TypeError) as e: 31 | Logger.warning(f"Error extracting task from entry: {e}") 32 | return download_urls, file_names, file_sizes 33 | 34 | @abstractmethod 35 | def _get_all_possible_query_string_params( 36 | self, files_types=None, store_id=None, when_date=None 37 | ): 38 | """list all param to add to the url""" 39 | 40 | @abstractmethod 41 | def _build_query_url(self, query_params, base_urls): 42 | """build the url with the query params""" 43 | 44 | def get_request_url(self, files_types=None, store_id=None, when_date=None): 45 | """build the request given the base url and the query params""" 46 | result = [] 47 | for query_params in self._get_all_possible_query_string_params( 48 | files_types=files_types, store_id=store_id, when_date=when_date 49 | ): 50 | result.extend(self._build_query_url(query_params, [self.url])) 51 | Logger.debug(f"Request url: {result}") 52 | return result 53 | 54 | @abstractmethod 55 | def get_href_from_entry(self, entry): 56 | """get download link for entry (tr)""" 
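# A rough, hypothetical illustration (simplified from the Bina engine that appears
# later in this dump) of how a concrete subclass fills in the Aspx hooks; the exact
# query-string keys are chain specific and only sketched here:
#
#     def _get_all_possible_query_string_params(self, files_types=None, store_id=None, when_date=None):
#         return [urllib.parse.urlencode({"WFileType": "", "WDate": "", "WStore": store_id or ""})]
#
#     def _build_query_url(self, query_params, base_urls):
#         return [{"url": base + self.aspx_page + "?" + query_params, "method": "GET"}
#                 for base in base_urls]
#
# get_request_url() above then yields one request descriptor per combination of
# chain id, file type, store and date.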
57 | 58 | @abstractmethod 59 | def get_file_name_no_ext_from_entry(self, entry): 60 | """get the file name without extensions from entey (tr)""" 61 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import os 3 | from il_supermarket_scarper import ScarpingTask, ScraperFactory, FileTypesFilters 4 | 5 | 6 | def load_params(): 7 | """load params from env variables with validation""" 8 | kwargs = {"suppress_exception": True, "lookup_in_db": True} 9 | 10 | # validate scrapers 11 | enabled_scrapers = os.getenv("ENABLED_SCRAPERS", None) 12 | if enabled_scrapers: 13 | enabled_scrapers = enabled_scrapers.split(",") 14 | 15 | not_valid = list( 16 | filter( 17 | lambda scraper: scraper not in ScraperFactory.all_scrapers_name(), 18 | enabled_scrapers, 19 | ) 20 | ) 21 | if not_valid: 22 | raise ValueError(f"ENABLED_SCRAPERS contains invalid {not_valid}") 23 | 24 | kwargs["enabled_scrapers"] = enabled_scrapers 25 | 26 | # validate file types 27 | enabled_file_types = os.getenv("ENABLED_FILE_TYPES", None) 28 | if enabled_file_types: 29 | 30 | enabled_file_types = enabled_file_types.split(",") 31 | 32 | not_valid = list( 33 | filter( 34 | lambda f_types: f_types not in FileTypesFilters.all_types(), 35 | enabled_file_types, 36 | ) 37 | ) 38 | if not_valid: 39 | raise ValueError(f"ENABLED_FILE_TYPES contains invalid {not_valid}") 40 | 41 | kwargs["files_types"] = enabled_file_types 42 | 43 | # validate number of processes 44 | number_of_processes = os.getenv("NUMBER_OF_PROCESSES", None) 45 | if number_of_processes: 46 | try: 47 | kwargs["multiprocessing"] = int(number_of_processes) 48 | except ValueError: 49 | raise ValueError("NUMBER_OF_PROCESSES must be an integer") 50 | 51 | # validate limit 52 | limit = os.getenv("LIMIT", None) 53 | if limit: 54 | try: 55 | kwargs["limit"] = int(limit) 56 | except ValueError: 57 | raise ValueError(f"LIMIT must be an integer, but got {limit}") 58 | 59 | # validate today 60 | today = os.getenv("TODAY", None) 61 | if today: 62 | try: 63 | kwargs["when_date"] = datetime.datetime.strptime(today, "%Y-%m-%d %H:%M") 64 | except ValueError: 65 | raise ValueError("TODAY must be in the format 'YYYY-MM-DD HH:MM'") 66 | 67 | return kwargs 68 | 69 | 70 | if __name__ == "__main__": 71 | 72 | args = load_params() 73 | 74 | ScarpingTask(**args).start() 75 | -------------------------------------------------------------------------------- /il_supermarket_scarper/utils/gzip_utils.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import shutil 3 | import os 4 | import io 5 | import zipfile 6 | from .exceptions import RestartSessionError 7 | 8 | 9 | def extract_xml_file_from_gz_file(file_save_path): 10 | """extract xml from gz""" 11 | target_file_name = os.path.splitext(file_save_path)[0] + ".xml" 12 | try: 13 | with gzip.open(file_save_path, "rb") as infile: 14 | with open(target_file_name, "wb") as outfile: 15 | shutil.copyfileobj(infile, outfile) 16 | except (gzip.BadGzipFile, EOFError) as exception: 17 | try: 18 | with open(file_save_path, "rb") as response_content: 19 | with zipfile.ZipFile(io.BytesIO(response_content.read())) as the_zip: 20 | zip_info = the_zip.infolist()[0] 21 | with the_zip.open(zip_info) as the_file: 22 | with open(target_file_name, "wb") as f_out: 23 | f_out.write(the_file.read()) 24 | 25 | except ( # pylint: disable=broad-except,redefined-outer-name 26 | Exception 
27 | ) as exception: 28 | report_failed_zip(exception, file_save_path, target_file_name) 29 | 30 | except Exception as exception: # pylint: disable=broad-except 31 | report_failed_zip(exception, file_save_path, target_file_name) 32 | 33 | 34 | def report_failed_zip(exception, file_save_path, target_file_name): 35 | """report a file wasn't able to extracted""" 36 | 37 | try: 38 | file_size = os.path.getsize(file_save_path) 39 | 40 | file_contant = "" 41 | with open(file_save_path, "r", encoding="utf-8") as file: 42 | file_contant = file.readlines() 43 | 44 | if "link expired" in str(file_contant): 45 | raise RestartSessionError() 46 | 47 | raise ValueError( 48 | f"Error decoding file:{file_save_path} with " 49 | f"error: {str(exception)} file size {str(file_size)} ," 50 | f"trimed_file_contant {str(file_contant)[:100]}" 51 | ) 52 | except UnicodeDecodeError: 53 | raise ValueError( 54 | f"Error decoding file:{file_save_path} with " 55 | f"error: {str(exception)} file size {str(file_size)} ," 56 | f"can't decode file" 57 | ) 58 | finally: 59 | os.remove(file_save_path) 60 | # remove the corrupted file 61 | if os.path.exists(target_file_name): 62 | os.remove(target_file_name) 63 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/wolt.py: -------------------------------------------------------------------------------- 1 | from datetime import timedelta 2 | from bs4 import BeautifulSoup 3 | 4 | from il_supermarket_scarper.utils import _now, Logger 5 | from il_supermarket_scarper.engines.web import WebBase 6 | 7 | from il_supermarket_scarper.utils import DumpFolderNames 8 | 9 | 10 | class Wolt(WebBase): 11 | """scraper for wolt""" 12 | 13 | def __init__(self, folder_name=None): 14 | super().__init__( 15 | DumpFolderNames.WOLT, 16 | chain_id="7290058249350", 17 | url="https://wm-gateway.wolt.com/isr-prices/public/v1/index.html", 18 | folder_name=folder_name, 19 | ) 20 | 21 | def get_request_url( 22 | self, files_types=None, store_id=None, when_date=None 23 | ): # pylint: disable=unused-argument 24 | """get all links to collect download links from""" 25 | if when_date: 26 | formatted_date = when_date.strftime("%Y-%m-%d") 27 | return [ 28 | { 29 | "url": self.url.replace("index.html", f"{formatted_date}.html"), 30 | "method": "GET", 31 | } 32 | ] 33 | 34 | perspective = _now() 35 | all_pages_to_collect_from = [] 36 | for days_back in range(10): 37 | formatted_date = (perspective - timedelta(days=days_back)).strftime( 38 | "%Y-%m-%d" 39 | ) 40 | all_pages_to_collect_from.append( 41 | { 42 | "url": self.url.replace("index.html", f"{formatted_date}.html"), 43 | "method": "GET", 44 | } 45 | ) 46 | return all_pages_to_collect_from 47 | 48 | def get_data_from_page(self, req_res): 49 | """get the file list from a page""" 50 | soup = BeautifulSoup(req_res.text, features="lxml") 51 | return list( 52 | map( 53 | lambda x: (x.text, self.url.replace("index.html", x.a.attrs["href"])), 54 | list(soup.find_all("li")), 55 | ) 56 | ) 57 | 58 | def extract_task_from_entry(self, all_trs): 59 | """extract download links, file names, and file sizes from page list""" 60 | download_urls = [] 61 | file_names = [] 62 | file_sizes = [] 63 | for x in all_trs: 64 | try: 65 | download_urls.append(x[1]) 66 | file_names.append(x[0]) 67 | file_sizes.append(None) 68 | except (AttributeError, KeyError, IndexError, TypeError) as e: 69 | Logger.warning(f"Error extracting task from entry: {e}") 70 | 71 | return download_urls, file_names, file_sizes 72 | 
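The Wolt scraper above is a compact example of the WebBase flow: get_request_url() lists one dated index page per day, get_data_from_page() parses the page's li entries, and extract_task_from_entry() turns them into download URLs, file names and (here, unknown) sizes. A minimal driver for running a single scraper end to end is sketched below; it assumes Wolt is importable from its module path and that scrape() accepts the limit and when_date keywords that scrapper_runner.py passes, so treat it as a usage sketch rather than a documented API.

import tempfile

from il_supermarket_scarper.scrappers.wolt import Wolt
from il_supermarket_scarper.utils import _now

with tempfile.TemporaryDirectory() as tmp_folder:
    # hypothetical usage sketch: download at most one file from today's index page
    scraper = Wolt(folder_name=tmp_folder)
    downloaded = scraper.scrape(limit=1, when_date=_now())
    # scrape() returns the per-file results collected during the run
    print(f"downloaded {len(downloaded)} files into {tmp_folder}")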
-------------------------------------------------------------------------------- /il_supermarket_scarper/utils/logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import sys 3 | 4 | 5 | def build_logger(): 6 | """create the logger instance""" 7 | # Define logger 8 | logger = logging.getLogger("Logger") 9 | 10 | if not logger.handlers: 11 | logger.setLevel(logging.DEBUG) # set logger level 12 | log_formatter = logging.Formatter( 13 | "%(name)-12s %(asctime)s %(levelname)-8s " 14 | "[%(threadName)s] %(filename)s:%(funcName)s %(message)s" 15 | ) 16 | console_handler = logging.StreamHandler( 17 | sys.stdout 18 | ) # set streamhandler to stdout 19 | console_handler.setFormatter(log_formatter) 20 | logger.addHandler(console_handler) 21 | 22 | file_handler = logging.FileHandler("logging.log") 23 | file_handler.setFormatter(log_formatter) 24 | logger.addHandler(file_handler) 25 | 26 | return logger 27 | 28 | 29 | class Logger: 30 | """a static logger class to share will all components""" 31 | 32 | enabled = True 33 | logger = build_logger() 34 | 35 | @classmethod 36 | def change_logging_status(cls, new_status): 37 | """enable or disable status""" 38 | cls.enabled = new_status 39 | 40 | @classmethod 41 | def set_logging_level(cls, level): 42 | """set logging level""" 43 | if level == "DEBUG": 44 | cls.logger.setLevel(logging.DEBUG) 45 | elif level == "INFO": 46 | cls.logger.setLevel(logging.INFO) 47 | elif level == "ERROR": 48 | cls.logger.setLevel(logging.ERROR) 49 | elif level == "WARNING": 50 | cls.logger.setLevel(logging.WARNING) 51 | else: 52 | cls.logger.setLevel(logging.DEBUG) 53 | 54 | @classmethod 55 | def info(cls, msg, *args, **kwargs): 56 | """log info""" 57 | if cls.enabled: 58 | cls.logger.info(msg, *args, **kwargs) 59 | 60 | @classmethod 61 | def debug(cls, msg, *args, **kwargs): 62 | """log info""" 63 | if cls.enabled: 64 | cls.logger.debug(msg, *args, **kwargs) 65 | 66 | @classmethod 67 | def error(cls, msg, *args, **kwargs): 68 | """log error""" 69 | if cls.enabled: 70 | cls.logger.error(msg, *args, **kwargs) 71 | 72 | @classmethod 73 | def error_execption(cls, _): 74 | """log execption""" 75 | if cls.enabled: 76 | cls.logger.error( 77 | "got an execption:", 78 | exc_info=sys.exc_info(), 79 | ) 80 | 81 | @classmethod 82 | def warning(cls, msg, *args, **kwargs): 83 | """log warning""" 84 | if cls.enabled: 85 | cls.logger.warning(msg, *args, **kwargs) 86 | -------------------------------------------------------------------------------- /stress_test.py: -------------------------------------------------------------------------------- 1 | import time 2 | import json 3 | import datetime 4 | import tempfile 5 | import pstats 6 | import cProfile 7 | import io 8 | from il_supermarket_scarper.scrappers_factory import ScraperFactory 9 | from il_supermarket_scarper.utils import _now 10 | 11 | 12 | def format_stats_as_json(profile, project_name): 13 | """get the stats from the profiler and format them as json""" 14 | stream = io.StringIO() 15 | ps = pstats.Stats(profile, stream=stream) 16 | ps.sort_stats(pstats.SortKey.CUMULATIVE) # Sort by cumulative time 17 | ps.print_stats() 18 | 19 | # Convert the printed stats to a list of lines 20 | stats_output = stream.getvalue().splitlines() 21 | 22 | # Filter the lines to include only functions within the project 23 | project_stats = [] 24 | for line in stats_output: 25 | if project_name in line: # Filter for project-specific lines 26 | 27 | parts = line.split() 28 | if len(parts) >= 5: # 
Basic sanity check for the parts 29 | function_data = { 30 | "function": parts[-1], # Function path 31 | "ncalls": parts[0], # Number of calls 32 | "tottime": parts[1], 33 | "tottime_per_call": parts[2], # Time spent in function 34 | "cumtime": parts[3], # Cumulative time including subcalls 35 | "cumtime_per_call": parts[4], # 36 | } 37 | project_stats.append(function_data) 38 | 39 | return project_stats 40 | 41 | 42 | if __name__ == "__main__": 43 | 44 | result = {} 45 | for scraper_name in ScraperFactory.all_scrapers_name(): 46 | 47 | def full_execution(scraper): 48 | """full execution of the scraper""" 49 | with tempfile.TemporaryDirectory() as tmpdirname: 50 | try: 51 | initer = ScraperFactory.get(scraper)(folder_name=tmpdirname) 52 | return initer.scrape(when_date=_now()), "" 53 | except Exception as e: # pylint: disable=broad-exception-caught 54 | return [], str(e) 55 | 56 | execution_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") 57 | start_time = time.time() 58 | pr = cProfile.Profile() 59 | pr.enable() 60 | 61 | files, error = full_execution(scraper_name) 62 | 63 | pr.disable() 64 | 65 | end_time = time.time() 66 | result[scraper_name] = { 67 | "status": format_stats_as_json(pr, "israeli-supermarket-scarpers"), 68 | "execution_time": execution_time, 69 | "start_time": start_time, 70 | "end_time": end_time, 71 | "time": end_time - start_time, 72 | "files": len(files), 73 | "error": error, 74 | } 75 | 76 | with open("stress_test_results.json", "w", encoding="utf-8") as f: 77 | json.dump(result, f) 78 | -------------------------------------------------------------------------------- /il_supermarket_scarper/utils/validation.py: -------------------------------------------------------------------------------- 1 | import re 2 | import difflib 3 | 4 | 5 | def is_valid_chain_name(input_string): 6 | """check the chain name is in a valid folder foramt""" 7 | # Regular expression pattern to match only letters (a-z, case insensitive) and hyphens (-) 8 | pattern = r"^[a-zA-Z0-9-]+$" 9 | 10 | # Use re.match to check if the entire string matches the pattern 11 | return bool(re.match(pattern, input_string)) 12 | 13 | 14 | def find_index_with_substring(array, substring): 15 | """Find the index of the first element in the array that contains the substring""" 16 | return [i for i, s in enumerate(array) if substring in s][0] 17 | 18 | 19 | def show_text_diff(text1, text2): 20 | """show the difference between two text strings in a git-like format""" 21 | # Split the texts into lines for comparison 22 | text1_lines = text1.splitlines() 23 | text2_lines = text2.splitlines() 24 | 25 | text1_lines = text1_lines[ 26 | find_index_with_substring( 27 | text1_lines, "חוקים ותקנות" 28 | ) : find_index_with_substring(text1_lines, "נוסח החוק המעודכן ביותר") 29 | ] 30 | text2_lines = text2_lines[ 31 | find_index_with_substring( 32 | text2_lines, "חוקים ותקנות" 33 | ) : find_index_with_substring(text2_lines, "נוסח החוק המעודכן ביותר") 34 | ] 35 | 36 | # Use difflib to compare the texts with more context 37 | diff = difflib.unified_diff( 38 | text1_lines, 39 | text2_lines, 40 | lineterm="", 41 | fromfile="Expected", 42 | tofile="Actual", 43 | n=5, # Show 5 lines of context around changes 44 | ) 45 | 46 | # Format the output for better readability 47 | diff_lines = [] 48 | diff_lines.append("\n" + "=" * 80) 49 | diff_lines.append("DIFF:") 50 | diff_lines.append("=" * 80) 51 | 52 | for line in diff: 53 | # Add visual markers for different line types 54 | if line.startswith("---") or 
line.startswith("+++"): 55 | diff_lines.append(line) 56 | elif line.startswith("-"): 57 | diff_lines.append(f"- {line[1:]}") # Removed line 58 | elif line.startswith("+"): 59 | diff_lines.append(f"+ {line[1:]}") # Added line 60 | elif line.startswith("@@"): 61 | diff_lines.append("\n" + line) # Context marker 62 | else: 63 | diff_lines.append(f" {line}") # Context line 64 | 65 | diff_lines.append("=" * 80) 66 | 67 | return "\n".join(diff_lines) 68 | 69 | 70 | def change_xml_encoding(file_path): 71 | """change the encoding if failing with utf-8""" 72 | with open(file_path, "rb") as file: # pylint: disable=unspecified-encoding 73 | # Read the XML file content 74 | content = file.read() 75 | 76 | content = content.decode("ISO-8859-8", errors="replace") 77 | 78 | # Save the file with the new encoding declaration 79 | with open(file_path, "wb") as file: 80 | file.write( 81 | content.replace('encoding="ISO-8859-8"', 'encoding="UTF-8"').encode("utf-8") 82 | ) 83 | -------------------------------------------------------------------------------- /.github/workflows/codeql.yml: -------------------------------------------------------------------------------- 1 | # For most projects, this workflow file will not need changing; you simply need 2 | # to commit it to your repository. 3 | # 4 | # You may wish to alter this file to override the set of languages analyzed, 5 | # or to provide custom queries or build logic. 6 | # 7 | # ******** NOTE ******** 8 | # We have attempted to detect the languages in your repository. Please check 9 | # the `language` matrix defined below to confirm you have the correct set of 10 | # supported CodeQL languages. 11 | # 12 | name: "CodeQL" 13 | 14 | on: 15 | push: 16 | branches: [ "main" ] 17 | pull_request: 18 | # The branches below must be a subset of the branches above 19 | branches: [ "main" ] 20 | 21 | 22 | concurrency: 23 | group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} 24 | cancel-in-progress: true 25 | 26 | jobs: 27 | analyze: 28 | name: Analyze 29 | runs-on: ubuntu-latest 30 | permissions: 31 | actions: read 32 | contents: read 33 | security-events: write 34 | 35 | strategy: 36 | fail-fast: false 37 | matrix: 38 | language: [ 'python' ] 39 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ] 40 | # Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support 41 | 42 | steps: 43 | - name: Checkout repository 44 | uses: actions/checkout@v3 45 | 46 | # Initializes the CodeQL tools for scanning. 47 | - name: Initialize CodeQL 48 | uses: github/codeql-action/init@v2 49 | with: 50 | languages: ${{ matrix.language }} 51 | # If you wish to specify custom queries, you can do so here or in a config file. 52 | # By default, queries listed here will override any specified in a config file. 53 | # Prefix the list here with "+" to use these queries and those in the config file. 54 | 55 | # Details on CodeQL's query packs refer to : https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs 56 | # queries: security-extended,security-and-quality 57 | 58 | 59 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 60 | # If this step fails, then you should remove it and run the build manually (see below) 61 | - name: Autobuild 62 | uses: github/codeql-action/autobuild@v2 63 | 64 | # ℹ️ Command-line programs to run using the OS shell. 
65 | # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun 66 | 67 | # If the Autobuild fails above, remove it and uncomment the following three lines. 68 | # modify them (or add more) to build your code if your project, please refer to the EXAMPLE below for guidance. 69 | 70 | # - run: | 71 | # echo "Run, Build Application using script" 72 | # ./location_of_script_within_repo/buildscript.sh 73 | 74 | - name: Perform CodeQL Analysis 75 | uses: github/codeql-action/analyze@v2 76 | with: 77 | category: "/language:${{matrix.language}}" 78 | -------------------------------------------------------------------------------- /il_supermarket_scarper/engines/publishprice.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | 3 | from il_supermarket_scarper.utils.logger import Logger 4 | from .web import WebBase 5 | 6 | 7 | class PublishPrice(WebBase): 8 | """ 9 | scrape the file of PublishPrice 10 | possibly can support historical search: there is folder for each date. 11 | but this is not implemented. 12 | """ 13 | 14 | def __init__( 15 | self, 16 | chain, 17 | chain_id, 18 | site_infix, 19 | folder_name=None, 20 | domain="prices", 21 | max_threads=5, 22 | ): 23 | super().__init__( 24 | chain, 25 | chain_id, 26 | url=f"https://{domain}.{site_infix}.co.il/", 27 | folder_name=folder_name, 28 | max_threads=max_threads, 29 | ) 30 | self.folder = None 31 | 32 | def get_request_url( 33 | self, files_types=None, store_id=None, when_date=None 34 | ): # pylint: disable=unused-argument 35 | """get all links to collect download links from""" 36 | 37 | formated = "" 38 | if when_date: 39 | formated = when_date.strftime("%Y%m%d") 40 | formated = f"?p=./{formated}" 41 | return [{"url": self.url + formated, "method": "GET"}] 42 | 43 | def get_data_from_page(self, req_res): 44 | soup = BeautifulSoup(req_res.text, features="lxml") 45 | 46 | # the developer hard-coded the files names in the html 47 | all_trs = ( 48 | soup.find_all("script")[-1] 49 | .text.replace("const files_html = [", "") 50 | .replace("];", "") 51 | .split("\n")[5] 52 | .split(",") 53 | ) 54 | return list(map(lambda x: BeautifulSoup(x, features="lxml"), all_trs)) 55 | 56 | def extract_task_from_entry(self, all_trs): 57 | """from the trs extract the download urls, file names, and file sizes""" 58 | 59 | def get_herf_element(x): 60 | herfs = x.find_all("a") 61 | if len(herfs) > 0: 62 | return herfs[-1] 63 | return None 64 | 65 | def get_herf(x): 66 | return get_herf_element(x).attrs["href"] 67 | 68 | def get_path_from_herf(x): 69 | return get_herf(x).replace("\\", "").replace('"', "").replace("./", "") 70 | 71 | def get_name_from_herf(x): 72 | return get_path_from_herf(x).split(".")[0].split("/")[-1] 73 | 74 | all_trs = list( 75 | filter( 76 | lambda x: get_herf_element(x) is not None, 77 | all_trs, 78 | ) 79 | ) 80 | 81 | download_urls = [] 82 | file_names = [] 83 | file_sizes = [] 84 | for x in all_trs: 85 | try: 86 | download_urls.append(self.url + get_path_from_herf(x)) 87 | file_names.append(get_name_from_herf(x)) 88 | file_sizes.append(self.get_file_size_from_entry(x)) 89 | except (AttributeError, KeyError, IndexError, TypeError) as e: 90 | Logger.warning(f"Error extracting task from entry: {e}") 91 | 92 | return download_urls, file_names, file_sizes 93 | -------------------------------------------------------------------------------- /il_supermarket_scarper/utils/file_cache.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import time 4 | from functools import wraps 5 | 6 | 7 | def file_cache(ttl=None): 8 | """Decorator to cache function results in a file with an optional TTL (time-to-live)""" 9 | 10 | def get_cache_file(func_name): 11 | """Generate a cache file path based on the function name""" 12 | cache_dir = ".cache" 13 | return os.path.join(cache_dir, f"{func_name}_cache.json") 14 | 15 | def load_cache(cache_file): 16 | """Load the cache from the specified cache file if it exists""" 17 | if os.path.exists(cache_file): 18 | with open(cache_file, "r", encoding="utf-8") as f: 19 | return json.load(f) 20 | return {} 21 | 22 | def save_cache(cache_file, cache_data): 23 | """Save the cache to the specified cache file""" 24 | if not os.path.exists(".cache"): 25 | os.makedirs(".cache") 26 | with open(cache_file, "w", encoding="utf-8") as f: 27 | json.dump(cache_data, f) 28 | 29 | def decorator(func): 30 | @wraps(func) 31 | def wrapper(*args, **kwargs): 32 | # Generate cache file path based on the function name 33 | cache_file = get_cache_file(func.__name__) 34 | 35 | # Load the cache from the file 36 | cache = load_cache(cache_file) 37 | 38 | # Generate a cache key from function arguments 39 | cache_key = generate_cache_key(args, kwargs) 40 | 41 | # Check if result is cached and valid 42 | if cache_key in cache: 43 | entry = cache[cache_key] 44 | timestamp = entry["timestamp"] 45 | 46 | # If ttl is set, check if cache has expired 47 | if ttl is not None and (time.time() - timestamp) > ttl: 48 | # Cache expired, remove the entry 49 | del cache[cache_key] 50 | else: 51 | # Cache is valid, return cached result 52 | return entry["result"] 53 | 54 | # If not cached or expired, call the function and store the result 55 | result = func(*args, **kwargs) 56 | 57 | # Save the result with the current timestamp in the cache 58 | cache[cache_key] = { 59 | "result": result, 60 | "timestamp": time.time(), # Save the current time 61 | } 62 | save_cache(cache_file, cache) 63 | 64 | return result 65 | 66 | def generate_cache_key(args, kwargs): 67 | key_parts = [] 68 | for arg in args: 69 | if isinstance(arg, (int, float, str, bool)): 70 | key_parts.append(str(arg)) 71 | else: 72 | raise ValueError(f"Unsupported argument type: {type(arg)}") 73 | for k, v in kwargs.items(): 74 | if isinstance(v, (int, float, str, bool)): 75 | key_parts.append(f"{k}={v}") 76 | else: 77 | raise ValueError(f"Unsupported keyword argument type: {type(v)}") 78 | return "|".join(key_parts) 79 | 80 | return wrapper 81 | 82 | return decorator 83 | -------------------------------------------------------------------------------- /il_supermarket_scarper/utils/file_types.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | 4 | class FileTypesFilters(Enum): 5 | """type of files avaliable to download""" 6 | 7 | PROMO_FILE = { 8 | "should_contain": "promo", 9 | "should_not_contain": "full", 10 | } 11 | STORE_FILE = { 12 | "should_contain": "store", 13 | "should_not_contain": None, 14 | } 15 | PRICE_FILE = { 16 | "should_contain": "price", 17 | "should_not_contain": "full", 18 | } 19 | PROMO_FULL_FILE = { 20 | "should_contain": "promofull", 21 | "should_not_contain": None, 22 | } 23 | PRICE_FULL_FILE = { 24 | "should_contain": "pricefull", 25 | "should_not_contain": None, 26 | } 27 | 28 | @classmethod 29 | def all_types(cls): 30 | """Returns a list of all the enum keys.""" 31 | return 
[e.name for e in FileTypesFilters] 32 | 33 | @classmethod 34 | def all_update_files(cls): 35 | """all the update files""" 36 | return [FileTypesFilters.PROMO_FILE.name, FileTypesFilters.PRICE_FILE.name] 37 | 38 | @classmethod 39 | def all_full_files(cls): 40 | """all the full files""" 41 | return [ 42 | FileTypesFilters.PRICE_FULL_FILE.name, 43 | FileTypesFilters.PROMO_FULL_FILE.name, 44 | ] 45 | 46 | @classmethod 47 | def only_promo(cls): 48 | """only files with promotion date""" 49 | return [FileTypesFilters.PROMO_FILE.name, FileTypesFilters.PROMO_FULL_FILE.name] 50 | 51 | @classmethod 52 | def only_store(cls): 53 | """only files with stores date""" 54 | return [FileTypesFilters.STORE_FILE.name] 55 | 56 | @classmethod 57 | def only_price(cls): 58 | """only files with prices date""" 59 | return [FileTypesFilters.PRICE_FILE.name, FileTypesFilters.PRICE_FULL_FILE.name] 60 | 61 | @staticmethod 62 | def filter_file(file_name, should_contain, should_not_contain): 63 | """fillter function""" 64 | return ( 65 | should_contain in file_name.lower() 66 | and "null" not in file_name.lower() 67 | and ( 68 | should_not_contain is None 69 | or should_not_contain not in file_name.lower() 70 | ) 71 | ) 72 | 73 | @classmethod 74 | def is_file_from_type(cls, filename, file_type): 75 | """check if file from certain type""" 76 | string_to_look_in = getattr(cls, file_type).value 77 | return cls.filter_file(filename, **string_to_look_in) 78 | 79 | @classmethod 80 | def get_type_from_file(cls, filename): 81 | """get file type from filename""" 82 | for file_type_name in cls.all_types(): 83 | if cls.is_file_from_type(filename, file_type_name): 84 | return getattr(cls, file_type_name) 85 | return None 86 | 87 | @classmethod 88 | def filter(cls, file_type, iterable, by_function=lambda x: x): 89 | """Returns the type of the file.""" 90 | return list( 91 | filter( 92 | lambda filename: cls.is_file_from_type( 93 | by_function(filename), file_type 94 | ), 95 | iterable, 96 | ) 97 | ) 98 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/super_pharm.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import urllib.parse 3 | import datetime 4 | 5 | import json 6 | from il_supermarket_scarper.engines import MultiPageWeb 7 | from il_supermarket_scarper.utils import ( 8 | Logger, 9 | url_connection_retry, 10 | DumpFolderNames, 11 | FileTypesFilters, 12 | ) 13 | 14 | 15 | class SuperPharm(MultiPageWeb): 16 | """scraper for super pharm""" 17 | 18 | def __init__(self, folder_name=None): 19 | super().__init__( 20 | chain=DumpFolderNames.SUPER_PHARM, 21 | chain_id="7290172900007", 22 | url="http://prices.super-pharm.co.il/", 23 | folder_name=folder_name, 24 | total_page_xpath='//*[@class="mvc-grid-pager"]/button[last()]/@data-page', 25 | total_pages_pattern=r"(\d+)$", 26 | page_argument="&page", 27 | ) 28 | 29 | def collect_files_details_from_page(self, html): 30 | links = [] 31 | filenames = [] 32 | file_sizes = [] 33 | for element in html.xpath("//tbody/tr"): # skip header 34 | links.append(self.url + element.xpath("./td[6]/a/@href")[0]) 35 | filenames.append(element.xpath("./td[2]")[0].text) 36 | file_sizes.append(None) # Super Pharm don't support file size in the entry 37 | return links, filenames, file_sizes 38 | 39 | @url_connection_retry() 40 | def retrieve_file(self, file_link, file_save_path, timeout=15): 41 | Logger.debug(f"On a new Session: calling {file_link}") 42 | 43 | response_content 
= self.session_with_cookies_by_chain( 44 | file_link, timeout=timeout 45 | ) 46 | spath = json.loads(response_content.content) 47 | Logger.debug(f"Found spath: {spath}") 48 | 49 | file_to_save = self.session_with_cookies_by_chain( 50 | self.url + spath["href"], timeout=timeout 51 | ) 52 | file_to_save_with_ext = file_save_path + ".gz" 53 | Path(file_to_save_with_ext).write_bytes(file_to_save.content) 54 | 55 | return file_to_save_with_ext 56 | 57 | def get_file_types_id(self, files_types=None): 58 | """get the file type id""" 59 | if files_types is None: 60 | return [""] 61 | 62 | types = [] 63 | for ftype in files_types: 64 | if ftype == FileTypesFilters.STORE_FILE.name: 65 | types.append("StoresFull") 66 | if ftype == FileTypesFilters.PRICE_FILE.name: 67 | types.append("Price") 68 | if ftype == FileTypesFilters.PROMO_FILE.name: 69 | types.append("Promo") 70 | if ftype == FileTypesFilters.PRICE_FULL_FILE.name: 71 | types.append("PriceFull") 72 | if ftype == FileTypesFilters.PROMO_FULL_FILE.name: 73 | types.append("PromoFull") 74 | return types 75 | 76 | def build_params(self, files_types=None, store_id=None, when_date=None): 77 | """build the params for the request""" 78 | 79 | all_params = [] 80 | for ftype in self.get_file_types_id(files_types): 81 | params = {"type": "", "date": "", "store": ""} 82 | 83 | if store_id: 84 | params["store"] = store_id 85 | if when_date and isinstance(when_date, datetime.datetime): 86 | params["date"] = when_date.strftime("%Y-%m-%d") 87 | if files_types: 88 | params["type"] = ftype 89 | all_params.append(params) 90 | 91 | return ["?" + urllib.parse.urlencode(params) for params in all_params] 92 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/hazihinam.py: -------------------------------------------------------------------------------- 1 | import urllib.parse 2 | import datetime 3 | from il_supermarket_scarper.engines import MultiPageWeb 4 | from il_supermarket_scarper.utils import ( 5 | DumpFolderNames, 6 | FileTypesFilters, 7 | _now, 8 | convert_unit, 9 | UnitSize, 10 | string_to_float, 11 | ) 12 | 13 | # class HaziHinam(Cerberus): 14 | # """scrper fro hazi hinam""" 15 | 16 | # def __init__(self, folder_name=None): 17 | # super().__init__( 18 | # chain=DumpFolderNames.HAZI_HINAM, 19 | # chain_id="7290700100008", 20 | # folder_name=folder_name, 21 | # ftp_username="HaziHinam", 22 | # ) 23 | 24 | 25 | class HaziHinam(MultiPageWeb): 26 | """scrper fro hazi hinam""" 27 | 28 | def __init__(self, folder_name=None): 29 | super().__init__( 30 | chain=DumpFolderNames.HAZI_HINAM, 31 | chain_id="7290700100008", 32 | url="https://shop.hazi-hinam.co.il/Prices", 33 | folder_name=folder_name, 34 | total_page_xpath="(//li[contains(concat(' ', normalize-space(@class), ' ')," 35 | + "' pagination-item ')])[last()]/a/@href", 36 | total_pages_pattern=r"\d+", 37 | page_argument="&p", 38 | ) 39 | 40 | def collect_files_details_from_page(self, html): 41 | """collect the details deom one page""" 42 | links = [] 43 | filenames = [] 44 | file_sizes = [] 45 | for link in html.xpath("//table/tbody/tr"): 46 | links.append(link.xpath("td[6]/a/@href")[0]) 47 | filenames.append(link.xpath("td[3]")[0].text.strip() + ".xml.gz") 48 | file_sizes.append( 49 | convert_unit( 50 | string_to_float(link.xpath("td[5]")[0].text.strip()), 51 | UnitSize.KB, 52 | UnitSize.BYTES, 53 | ) 54 | ) 55 | return links, filenames, file_sizes 56 | 57 | def get_file_types_id(self, files_types=None): 58 | """get the file type id""" 59 | if 
files_types is None or files_types == FileTypesFilters.all_types(): 60 | return [{"t": "null", "f": "null"}] 61 | 62 | types = [] 63 | for ftype in files_types: 64 | if ftype == FileTypesFilters.STORE_FILE.name: 65 | types.append({"t": "3", "f": "null"}) 66 | if ftype == FileTypesFilters.PRICE_FILE.name: 67 | types.append({"t": "1", "f": "null"}) 68 | if ftype == FileTypesFilters.PROMO_FILE.name: 69 | types.append({"t": "2", "f": "null"}) 70 | if ftype == FileTypesFilters.PRICE_FULL_FILE.name: 71 | types.append({"t": "1", "f": "null"}) 72 | if ftype == FileTypesFilters.PROMO_FULL_FILE.name: 73 | types.append({"t": "2", "f": "null"}) 74 | return types 75 | 76 | def build_params(self, files_types=None, store_id=None, when_date=None): 77 | """build the params for the request""" 78 | 79 | all_params = [] 80 | for type_params in self.get_file_types_id(files_types): 81 | 82 | # filtering store is not supported 83 | # if store_id: 84 | # params["s"] = "null" 85 | if when_date and isinstance(when_date, datetime.datetime): 86 | all_params.append({"d": when_date.strftime("%Y-%m-%d"), **type_params}) 87 | else: 88 | all_params.append({"d": _now().strftime("%Y-%m-%d"), **type_params}) 89 | all_params.append( 90 | { 91 | "d": (_now() - datetime.timedelta(days=1)).strftime("%Y-%m-%d"), 92 | **type_params, 93 | } 94 | ) 95 | 96 | return ["?" + urllib.parse.urlencode(params) for params in all_params] 97 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/city_market.py: -------------------------------------------------------------------------------- 1 | import urllib.parse 2 | import datetime 3 | from il_supermarket_scarper.engines import Bina, MultiPageWeb 4 | from il_supermarket_scarper.utils import ( 5 | DumpFolderNames, 6 | FileTypesFilters, 7 | UnitSize, 8 | ) 9 | from il_supermarket_scarper.utils.status import convert_unit, string_to_float 10 | 11 | 12 | # removed on 28.02.2025 13 | class CityMarketGivatayim(Bina): 14 | """scraper for city market givatayim""" 15 | 16 | def __init__(self, folder_name=None): 17 | super().__init__( 18 | chain=DumpFolderNames.CITY_MARKET_GIVATAYIM, 19 | chain_id="5359000000000", 20 | url_perfix="citymarketgivatayim", 21 | folder_name=folder_name, 22 | ) 23 | 24 | 25 | # removed on 28.10.2024 26 | class CityMarketKirtatOno(Bina): 27 | """scraper for city market givatayim""" 28 | 29 | def __init__(self, folder_name=None): 30 | super().__init__( 31 | chain=DumpFolderNames.CITY_MARKET_KIRYATONO, 32 | chain_id="5359000000000", 33 | url_perfix="citymarketkiryatono", 34 | folder_name=folder_name, 35 | ) 36 | 37 | 38 | class CityMarketKiryatGat(Bina): 39 | """scraper for city market givatayim""" 40 | 41 | def __init__(self, folder_name=None): 42 | super().__init__( 43 | chain=DumpFolderNames.CITY_MARKET_KIRYATGAT, 44 | chain_id="7290058266241", 45 | url_perfix="citymarketkiryatgat", 46 | folder_name=folder_name, 47 | ) 48 | 49 | 50 | class CityMarketShops(MultiPageWeb): 51 | """scraper for city market givatayim""" 52 | 53 | def __init__(self, folder_name=None): 54 | super().__init__( 55 | chain=DumpFolderNames.CITY_MARKET_SHOPS, 56 | chain_id="7290000000003", 57 | url="http://www.citymarket-shops.co.il/", 58 | folder_name=folder_name, 59 | total_page_xpath="(//li[contains(concat(' ', normalize-space(@class), ' ')," 60 | + "' pagination-item ')])[last()]/a/@href", 61 | total_pages_pattern=r"\d+", 62 | page_argument="&p", 63 | ) 64 | 65 | def collect_files_details_from_page(self, html): 66 | """collect the details 
deom one page""" 67 | links = [] 68 | filenames = [] 69 | file_sizes = [] 70 | for link in html.xpath("//table/tbody/tr"): 71 | links.append(self.url + link.xpath("td[7]/a/@href")[0]) 72 | filenames.append(link.xpath("td[3]")[0].text.strip() + ".xml.gz") 73 | file_sizes.append( 74 | convert_unit( 75 | string_to_float(link.xpath("td[6]")[0].text.strip()), 76 | UnitSize.KB, 77 | UnitSize.BYTES, 78 | ) 79 | ) 80 | return links, filenames, file_sizes 81 | 82 | def get_file_types_id(self, files_types=None): 83 | """get the file type id""" 84 | if files_types is None or files_types == FileTypesFilters.all_types(): 85 | return [{"t": "", "f": ""}] 86 | 87 | types = [] 88 | for ftype in files_types: 89 | if ftype == FileTypesFilters.STORE_FILE.name: 90 | types.append({"t": 3, "f": ""}) 91 | if ftype == FileTypesFilters.PRICE_FILE.name: 92 | types.append({"t": "1", "f": "0"}) 93 | if ftype == FileTypesFilters.PROMO_FILE.name: 94 | types.append({"t": "2", "f": "0"}) 95 | if ftype == FileTypesFilters.PRICE_FULL_FILE.name: 96 | types.append({"t": "1", "f": "1"}) 97 | if ftype == FileTypesFilters.PROMO_FULL_FILE.name: 98 | types.append({"t": "2", "f": "1"}) 99 | return types 100 | 101 | def build_params(self, files_types=None, store_id=None, when_date=None): 102 | """build the params for the request""" 103 | 104 | all_params = [] 105 | for type_params in self.get_file_types_id(files_types): 106 | params = {"d": "", "s": ""} 107 | 108 | if store_id: 109 | params["s"] = str(store_id).zfill(3) 110 | if when_date and isinstance(when_date, datetime.datetime): 111 | params["d"] = when_date.strftime("%Y-%m-%d") 112 | if files_types: 113 | params = {**params, **type_params} 114 | all_params.append(params) 115 | 116 | return ["?" + urllib.parse.urlencode(params) for params in all_params] 117 | -------------------------------------------------------------------------------- /il_supermarket_scarper/utils/databases/json_file.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | from ..logger import Logger 4 | from .base import AbstractDataBase 5 | 6 | 7 | class JsonDataBase(AbstractDataBase): 8 | """A class that represents a JSON-based database.""" 9 | 10 | def __init__(self, database_name, base_path="json_db") -> None: 11 | super().__init__(database_name, collection_status=True) 12 | self.base_path = base_path 13 | self.database_file = f"{self.database_name}.json" 14 | self._ensure_db_directory_exists() 15 | self._ensure_db_file_exists() 16 | 17 | def _ensure_db_directory_exists(self): 18 | """Ensure the base directory for the JSON database exists.""" 19 | if not os.path.exists(self.base_path): 20 | os.makedirs(self.base_path, exist_ok=True) 21 | 22 | def _ensure_db_file_exists(self): 23 | """Ensure the database file exists.""" 24 | file_path = self._get_database_file_path() 25 | if not os.path.exists(file_path): 26 | with open(file_path, "w", encoding="utf-8") as file: 27 | json.dump({}, file) # Initialize with an empty dict 28 | 29 | def _get_database_file_path(self): 30 | """Get the full path to the database JSON file.""" 31 | return os.path.join(self.base_path, self.database_file) 32 | 33 | def _read_database(self): 34 | """Read the JSON database file and return its contents.""" 35 | file_path = self._get_database_file_path() 36 | data = {} 37 | 38 | # Load existing data from the file 39 | if os.path.exists(file_path): 40 | with open(file_path, "r", encoding="utf-8") as file: 41 | try: 42 | data = json.load(file) 43 | except 
json.JSONDecodeError: 44 | Logger.warning(f"File {file_path} is corrupted, resetting it.") 45 | data = {} 46 | return data 47 | 48 | def _write_database(self, data): 49 | """Write data to the JSON database file.""" 50 | file_path = self._get_database_file_path() 51 | 52 | with open(file_path, "w", encoding="utf-8") as file: 53 | json.dump(dict(sorted(data.items())), file, default=str, indent=4) 54 | 55 | def insert_documents(self, collection_name, document): 56 | """Insert a document into a collection inside the JSON database.""" 57 | if self.collection_status: 58 | 59 | data = self._read_database() 60 | # Ensure the collection exists in the database 61 | if collection_name not in data: 62 | data[collection_name] = [] 63 | 64 | # Add the new document to the collection 65 | data[collection_name].extend(document) 66 | 67 | # Save the updated data back to the file 68 | self._write_database(data) 69 | 70 | def insert_document(self, collection_name, document): 71 | """Insert a document into a collection inside the JSON database.""" 72 | if self.collection_status: 73 | data = self._read_database() 74 | # Ensure the collection exists in the database 75 | if collection_name not in data: 76 | data[collection_name] = [] 77 | 78 | # Add the new document to the collection 79 | data[collection_name].append(document) 80 | 81 | # Save the updated data back to the file 82 | self._write_database(data) 83 | 84 | def find_document(self, collection_name, query): 85 | """Find a document in a collection based on a query.""" 86 | if self.collection_status: 87 | file_path = self._get_database_file_path() 88 | 89 | if os.path.exists(file_path): 90 | with open(file_path, "r", encoding="utf-8") as file: 91 | try: 92 | data = json.load(file) 93 | 94 | # Check if the collection exists 95 | if collection_name in data: 96 | # Filter the documents in the collection based on the query 97 | for document in data[collection_name]: 98 | if all( 99 | item in document.items() for item in query.items() 100 | ): 101 | return document 102 | except json.JSONDecodeError: 103 | Logger.warning(f"File {file_path} is corrupted.") 104 | 105 | return None 106 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrapper_runner.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from multiprocessing import Pool 4 | 5 | from .scrappers_factory import ScraperFactory 6 | from .utils import Logger, summerize_dump_folder_contant, clean_dump_folder 7 | 8 | 9 | class MainScrapperRunner: 10 | """a main scraper to execute all scraping""" 11 | 12 | def __init__( 13 | self, 14 | size_estimation_mode=False, 15 | enabled_scrapers=None, 16 | dump_folder_name=None, 17 | multiprocessing=5, 18 | lookup_in_db=True, 19 | ): 20 | assert isinstance(enabled_scrapers, list) or enabled_scrapers is None 21 | 22 | env_size_estimation_mode = os.getenv("SE_MODE", None) 23 | if env_size_estimation_mode: 24 | Logger.info( 25 | f"Setting size estimation mode from enviroment. 
value={env_size_estimation_mode}" 26 | ) 27 | self.size_estimation_mode = bool(env_size_estimation_mode == "True") 28 | else: 29 | self.size_estimation_mode = size_estimation_mode 30 | Logger.info(f"size_estimation_mode: {self.size_estimation_mode}") 31 | 32 | if not enabled_scrapers: 33 | enabled_scrapers = ScraperFactory.all_scrapers_name() 34 | 35 | self.enabled_scrapers = enabled_scrapers 36 | Logger.info(f"Enabled scrapers: {self.enabled_scrapers}") 37 | self.dump_folder_name = dump_folder_name 38 | self.multiprocessing = multiprocessing 39 | self.lookup_in_db = lookup_in_db 40 | 41 | def run( 42 | self, 43 | limit=None, 44 | files_types=None, 45 | when_date=False, 46 | suppress_exception=False, 47 | min_size=None, 48 | max_size=None, 49 | ): 50 | """run the scraper""" 51 | Logger.info(f"Limit is {limit}") 52 | Logger.info(f"files_types is {files_types}") 53 | Logger.info(f"Start scraping {','.join(self.enabled_scrapers)}.") 54 | 55 | with Pool(self.multiprocessing) as pool: 56 | result = pool.map( 57 | self.scrape_one_wrap, 58 | list( 59 | map( 60 | lambda chainScrapperClass: ( 61 | chainScrapperClass, 62 | { 63 | "limit": limit, 64 | "files_types": files_types, 65 | "when_date": when_date, 66 | "suppress_exception": suppress_exception, 67 | "min_size": min_size, 68 | "max_size": max_size, 69 | }, 70 | ), 71 | self.enabled_scrapers, 72 | ) 73 | ), 74 | ) 75 | 76 | Logger.info("Done scraping all supermarkets.") 77 | 78 | return result 79 | 80 | def scrape_one_wrap(self, arg): 81 | """scrape one warper""" 82 | args, kwargs = arg 83 | return self.scrape_one(args, **kwargs) 84 | 85 | def scrape_one( 86 | self, 87 | chain_scrapper_class, 88 | limit=None, 89 | files_types=None, 90 | store_id=None, 91 | when_date=None, 92 | suppress_exception=False, 93 | min_size=None, 94 | max_size=None, 95 | ): 96 | """scrape one""" 97 | chain_scrapper_constractor = ScraperFactory.get(chain_scrapper_class) 98 | Logger.info(f"Starting scrapper {chain_scrapper_constractor}") 99 | scraper = chain_scrapper_constractor(folder_name=self.dump_folder_name) 100 | chain_name = scraper.get_chain_name() 101 | 102 | Logger.info(f"scraping {chain_name}") 103 | if self.lookup_in_db: 104 | scraper.enable_collection_status() 105 | scraper.enable_aggregation_between_runs() 106 | 107 | scraper.scrape( 108 | limit=limit, 109 | files_types=files_types, 110 | store_id=store_id, 111 | when_date=when_date, 112 | files_names_to_scrape=None, 113 | filter_null=False, 114 | filter_zero=False, 115 | suppress_exception=suppress_exception, 116 | min_size=min_size, 117 | max_size=max_size, 118 | ) 119 | Logger.info(f"done scraping {chain_name}") 120 | 121 | folder_with_files = scraper.get_storage_path() 122 | if self.size_estimation_mode: 123 | Logger.info(f"Summrize test data for {chain_name}") 124 | summerize_dump_folder_contant(folder_with_files) 125 | 126 | Logger.info(f"Cleaning dump folder for {chain_name}") 127 | clean_dump_folder(folder_with_files) 128 | return folder_with_files 129 | -------------------------------------------------------------------------------- /il_supermarket_scarper/engines/bina.py: -------------------------------------------------------------------------------- 1 | import json 2 | import urllib.parse 3 | import datetime 4 | 5 | from il_supermarket_scarper.utils import ( 6 | Logger, 7 | url_connection_retry, 8 | url_retrieve, 9 | FileTypesFilters, 10 | ) 11 | 12 | from .apsx import Aspx 13 | 14 | 15 | class Bina(Aspx): 16 | """scraper for all Bina base site. 17 | Note! 
the websites offer a date-search menu for downloading historical files. 18 | this class does not support downloading them. 19 | """ 20 | 21 | def __init__( 22 | self, 23 | chain, 24 | chain_id, 25 | url_perfix, 26 | download_postfix="/Download.aspx?FileNm=", 27 | domain="binaprojects.com/", 28 | folder_name=None, 29 | ): 30 | super().__init__( 31 | chain, 32 | chain_id, 33 | url=f"http://{url_perfix}.{domain}", 34 | aspx_page="MainIO_Hok.aspx", 35 | folder_name=folder_name, 36 | ) 37 | self.download_postfix = download_postfix 38 | 39 | def file_type_ids(self, file_types): 40 | """get the file type id""" 41 | file_type_mapping = { 42 | FileTypesFilters.STORE_FILE.name: 1, 43 | FileTypesFilters.PRICE_FILE.name: 2, 44 | FileTypesFilters.PROMO_FILE.name: 3, 45 | FileTypesFilters.PRICE_FULL_FILE.name: 4, 46 | FileTypesFilters.PROMO_FULL_FILE.name: 5, 47 | } 48 | if file_types is None or file_types == FileTypesFilters.all_types(): 49 | yield 0 50 | else: 51 | for file_type in file_types: 52 | if file_type not in file_type_mapping: 53 | raise ValueError(f"File type {file_type} not supported") 54 | yield file_type_mapping[file_type] 55 | 56 | def _build_query_url(self, query_params, base_urls): 57 | res = [] 58 | for base in base_urls: 59 | res.append( 60 | { 61 | "url": base + self.aspx_page + "?" + query_params, 62 | "method": "GET", 63 | } 64 | ) 65 | return res 66 | 67 | def _get_all_possible_query_string_params( 68 | self, files_types=None, store_id=None, when_date=None 69 | ): 70 | """get the arguments needed to add to the url""" 71 | chains_urls = [] 72 | 73 | for c_id in self.get_chain_id(): 74 | chains_urls.append( 75 | { 76 | "_": f"{c_id}", 77 | "wReshet": "הכל", 78 | "WFileType": "", 79 | "WDate": "", 80 | "WStore": "", 81 | } 82 | ) 83 | 84 | # add file types to url 85 | if files_types: 86 | chains_urls_with_types = [] 87 | for files_type in self.file_type_ids(files_types): 88 | 89 | for chain_url in chains_urls: 90 | chains_urls_with_types.append( 91 | {**chain_url, "WFileType": files_type} 92 | ) 93 | chains_urls = chains_urls_with_types 94 | 95 | # add store id 96 | if store_id: 97 | for chains_url in chains_urls: 98 | chains_url["WStore"] = store_id 99 | 100 | # posting date 101 | if when_date and isinstance(when_date, datetime.datetime): 102 | for chains_url in chains_urls: 103 | chains_url["WDate"] = when_date.strftime("%d/%m/%Y") 104 | 105 | return [urllib.parse.urlencode(params) for params in chains_urls] 106 | 107 | def get_data_from_page(self, req_res): 108 | return json.loads(req_res.text) 109 | 110 | def get_href_from_entry(self, entry): 111 | """get download link for entry (tr)""" 112 | return self.download_postfix + entry["FileNm"] 113 | 114 | def get_file_name_no_ext_from_entry(self, entry): 115 | """get the file name without extensions from entry (tr)""" 116 | return entry.split(self.download_postfix)[-1].split(".")[0] 117 | 118 | def get_file_size_from_entry(self, entry): 119 | """ 120 | Extract file size from a JSON entry. 121 | Bina returns JSON objects, check for size field. 122 | Returns size in bytes, or None if not found. 
123 | """ 124 | # Bina don't support file size in the entry 125 | return None 126 | 127 | @url_connection_retry() 128 | def retrieve_file(self, file_link, file_save_path, timeout=30): 129 | response_content = self.session_with_cookies_by_chain( 130 | file_link, 131 | ) 132 | spath = json.loads(response_content.content) 133 | Logger.debug(f"Found spath: {spath}") 134 | 135 | url = spath[0]["SPath"] 136 | ext = file_link.split(".")[-1] 137 | 138 | url_retrieve(url, file_save_path + "." + ext, timeout=timeout) 139 | return file_save_path + "." + ext 140 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Custom License Agreement 2 | 3 | This License Agreement ("Agreement") is a legal agreement between Sefi Erlich ("Licensor") and any individual or entity ("Licensee" or "Contributor") who accesses, uses, or contributes to this repository. By accessing, using, or contributing to the Repository, you agree to be bound by the terms of this Agreement. 4 | 5 | 1. Grant of License for Non-Commercial Use 6 | 7 | 1.1 Non-Commercial Use License: The Licensor grants the Licensee a worldwide, royalty-free, non-exclusive, non-transferable license to use, reproduce, modify, and distribute the content of the Repository ("Licensed Material") for non-commercial purposes only, subject to the terms and conditions of this Agreement. 8 | 9 | 1.2 Attribution Requirement: When using or distributing the Licensed Material, the Licensee must provide appropriate credit to the Licensor by: 10 | - Citing the Licensor's name as specified. 11 | - Including a link to the Repository. 12 | - Indicating if changes were made to the Licensed Material. 13 | 14 | 1.3 No Commercial Use: Licensees are expressly prohibited from using the Licensed Material, in whole or in part, for any commercial purpose without prior written permission from the Licensor. 15 | 16 | 2. Reservation of Commercial Rights 17 | 18 | 2.1 Exclusive Commercial Rights: All commercial rights to the Licensed Material are exclusively reserved by the Licensor. The Licensor retains the sole right to use, reproduce, modify, distribute, and sublicense the Licensed Material for commercial purposes. 19 | 20 | 2.2 Requesting Commercial Permission: Parties interested in using the Licensed Material for commercial purposes must obtain explicit written consent from the Licensor. Requests should be directed to the contact information provided at the end of this Agreement. 21 | 22 | 3. Contributions 23 | 24 | 3.1 Contributor License Grant: By submitting any content ("Contribution") to the Repository, the Contributor grants the Licensor a non-exclusive, perpetual, irrevocable, worldwide, royalty-free license to use, reproduce, modify, distribute, sublicense, and create derivative works from the Contribution for any purpose, including commercial purposes. 25 | 26 | 3.2 Warranty of Originality: Contributors represent and warrant that their Contributions are original works and do not infringe upon the intellectual property rights of any third party. 27 | 28 | 3.3 No Commercial Rights for Contributors: Contributors acknowledge that they have no rights to use the Licensed Material for commercial purposes. 29 | 30 | 4. Restrictions 31 | 32 | 4.1 Prohibition of Commercial Exploitation: Licensees and Contributors may not: 33 | - Use the Licensed Material or any Contributions for commercial purposes. 
34 | - Distribute the Licensed Material or any Contributions as part of any commercial product or service. 35 | - Sublicense the Licensed Material or any Contributions for commercial use. 36 | 37 | 4.2 No Endorsement: Licensees and Contributors may not imply endorsement or affiliation with the Licensor without explicit written permission. 38 | 39 | 5. Term and Termination 40 | 41 | 5.1 Term: This Agreement is effective upon acceptance and continues unless terminated as provided herein. 42 | 43 | 5.2 Termination for Breach: The Licensor may terminate this Agreement immediately if the Licensee or Contributor breaches any of its terms. 44 | 45 | 5.3 Effect of Termination: Upon termination, all rights granted under this Agreement cease, and the Licensee or Contributor must destroy all copies of the Licensed Material in their possession. 46 | 47 | 5.4 Survival: Sections 2, 3, 4, 6, and 7 survive termination of this Agreement. 48 | 49 | 6. Disclaimer of Warranties and Limitation of Liability 50 | 51 | 6.1 As-Is Basis: The Licensed Material and any Contributions are provided "AS IS," without warranties or conditions of any kind, either express or implied. 52 | 53 | 6.2 Disclaimer: The Licensor expressly disclaims all warranties, including but not limited to warranties of title, non-infringement, merchantability, and fitness for a particular purpose. 54 | 55 | 6.3 Limitation of Liability: In no event shall the Licensor be liable for any direct, indirect, incidental, special, exemplary, or consequential damages arising in any way out of the use of the Licensed Material or Contributions. 56 | 57 | 7. General Provisions 58 | 59 | 7.1 Entire Agreement: This Agreement constitutes the entire agreement between the parties concerning the subject matter hereof and supersedes all prior agreements and understandings. 60 | 61 | 7.2 Modification: The Licensor reserves the right to modify this Agreement for new versions of the Licensed Material. Such modifications will not apply retroactively to any version of the Licensed Material you have already obtained. 62 | 63 | 7.3 Severability: If any provision of this Agreement is found to be unenforceable, the remainder shall remain in full force and effect. 64 | 65 | 7.4 Waiver: Failure to enforce any provision of this Agreement shall not constitute a waiver of such provision. 66 | 67 | 7.5 Governing Law: This Agreement shall be governed by and construed in accordance with the laws of Israel, without regard to its conflict of law principles. 68 | 69 | 7.6 Dispute Resolution: Any disputes arising under or in connection with this Agreement shall be subject to the exclusive jurisdiction of the courts located in Israel. 70 | 71 | 8. Acceptance by accessing, using, or contributing to the Repository, you acknowledge that you have read, understood, and agree to be bound by the terms and conditions of this Agreement. 
72 | 73 | Contact Information 74 | 75 | For any questions or requests regarding this Agreement, please contact: 76 | 77 | Name: Sefi Erlich 78 | Email: erlichsefi@gmail.com -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/tests/test_all.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.scrappers_factory import ScraperFactory 2 | from .test_cases import make_test_case 3 | 4 | 5 | class BareketTestCase(make_test_case(ScraperFactory.BAREKET, 5)): 6 | """Test case for ScraperFactory.BAREKET.""" 7 | 8 | 9 | class YaynotBitanAndCarrefourTestCase( 10 | make_test_case(ScraperFactory.YAYNO_BITAN_AND_CARREFOUR, 9032) 11 | ): 12 | """Test case for ScraperFactory.YAYNO_BITAN_AND_CARREFOUR.""" 13 | 14 | 15 | class CofixTestCase(make_test_case(ScraperFactory.COFIX, 299)): 16 | """Test case for ScraperFactory.COFIX.""" 17 | 18 | 19 | # class CityMarketGivatayimTestCase( 20 | # make_test_case(ScraperFactory.CITY_MARKET_GIVATAYIM, 1) 21 | # ): 22 | # """Test case for CityMarketGivatay""" 23 | 24 | 25 | # class CityMarketKirtatOnoTestCase( 26 | # make_test_case(ScraperFactory.CITY_MARKET_KIRYATONO, 1) 27 | # ): 28 | # """Test case for CityMarketKirtatOno""" 29 | 30 | 31 | class CityMarketKiryatGatTestCase( 32 | make_test_case(ScraperFactory.CITY_MARKET_KIRYATGAT, 1) 33 | ): 34 | """Test case for CityMarketKiryatGat""" 35 | 36 | 37 | class CityMarketShopsTestCase(make_test_case(ScraperFactory.CITY_MARKET_SHOPS, 1)): 38 | """Test case for CityMarketShops""" 39 | 40 | 41 | class DorAlonTestCase(make_test_case(ScraperFactory.DOR_ALON, 501)): 42 | """Test case for ScraperFactory.DOR_ALON.""" 43 | 44 | 45 | class GoodPharmTestCase(make_test_case(ScraperFactory.GOOD_PHARM, 952)): 46 | """Test case for ScraperFactory.GOOD_PHARM.""" 47 | 48 | 49 | class HaziHinamTestCase(make_test_case(ScraperFactory.HAZI_HINAM, 206)): 50 | """Test case for ScraperFactory.HAZI_HINAM.""" 51 | 52 | 53 | class HetCohen(make_test_case(ScraperFactory.HET_COHEN, 45)): 54 | """Test case for ScraperFactory.HET_COHEN.""" 55 | 56 | 57 | class KeshetTestCase(make_test_case(ScraperFactory.KESHET, 5)): 58 | """Test case for ScraperFactory.KESHET.""" 59 | 60 | 61 | class KingStoreTestCase(make_test_case(ScraperFactory.KING_STORE, 334)): 62 | """Test case for ScraperFactory.KING_STORE.""" 63 | 64 | 65 | class Maayan2000TestCase(make_test_case(ScraperFactory.MAAYAN_2000, 60)): 66 | """Test case for ScraperFactory.MAAYAN_2000.""" 67 | 68 | 69 | class MahsaniAShukTestCase(make_test_case(ScraperFactory.MAHSANI_ASHUK, 98)): 70 | """Test case for ScraperFactory.MAHSANI_ASHUK.""" 71 | 72 | 73 | # class MegaTestCase(make_test_case(ScraperFactory.MEGA, 37)): 74 | # """Test case for ScraperFactory.MEGA.""" 75 | 76 | 77 | class NetivHasefTestCase(make_test_case(ScraperFactory.NETIV_HASED, 1)): 78 | """Test case for ScraperFactory.NETIV_HASED.""" 79 | 80 | 81 | class MeshnatYosef1TestCase(make_test_case(ScraperFactory.MESHMAT_YOSEF_1, 1)): 82 | """Test case for ScraperFactory.MESHMAT_YOSEF_1.""" 83 | 84 | 85 | class MeshnatYosef2TestCase(make_test_case(ScraperFactory.MESHMAT_YOSEF_2, 1)): 86 | """Test case for ScraperFactory.MESHMAT_YOSEF_2.""" 87 | 88 | 89 | class OsheradTestCase(make_test_case(ScraperFactory.OSHER_AD, 1)): 90 | """Test case for ScraperFactory.OSHER_AD.""" 91 | 92 | 93 | class PolizerTestCase(make_test_case(ScraperFactory.POLIZER, 2)): 94 | """Test case for ScraperFactory.POLIZER.""" 95 | 96 | 97 | class 
RamiLevyTestCase(make_test_case(ScraperFactory.RAMI_LEVY, 1)): 98 | """Test case for ScraperFactory.RAMI_LEVY.""" 99 | 100 | 101 | class SalachDabachTestCase(make_test_case(ScraperFactory.SALACH_DABACH, 4)): 102 | """Test case for ScraperFactory.SALACH_DABACH.""" 103 | 104 | 105 | class ShefaBarcartAshemTestCase(make_test_case(ScraperFactory.SHEFA_BARCART_ASHEM, 42)): 106 | """Test case for ScraperFactory.SHEFA_BARCART_ASHEM.""" 107 | 108 | 109 | class ShufersalTestCase(make_test_case(ScraperFactory.SHUFERSAL, 176)): 110 | """Test case for ScraperFactory.SHUFERSAL.""" 111 | 112 | 113 | class ShukAhirTestCase(make_test_case(ScraperFactory.SHUK_AHIR, 4)): 114 | """Test case for ScraperFactory.SHUK_AHIR.""" 115 | 116 | 117 | class StopMarketTestCase(make_test_case(ScraperFactory.STOP_MARKET, 5)): 118 | """Test case for ScraperFactory.STOP_MARKET.""" 119 | 120 | 121 | class SuperPharmTestCase(make_test_case(ScraperFactory.SUPER_PHARM, 224)): 122 | """Test case for ScraperFactory.SUPER_PHARM.""" 123 | 124 | 125 | class SuperYudaTestCase(make_test_case(ScraperFactory.SUPER_YUDA, 204)): 126 | """Test case for ScraperFactory.SUPER_YUDA.""" 127 | 128 | 129 | class SuperSapirTestCase(make_test_case(ScraperFactory.SUPER_SAPIR, 44)): 130 | """Test case for ScraperFactory.SUPER_SAPIR.""" 131 | 132 | 133 | class FreshMarketAndSuperDoshTestCase( 134 | make_test_case(ScraperFactory.FRESH_MARKET_AND_SUPER_DOSH, 1) 135 | ): 136 | """Test case for ScraperFactory.FRESH_MARKET_AND_SUPER_DOSH.""" 137 | 138 | 139 | class QuikTestCase(make_test_case(ScraperFactory.QUIK, None)): 140 | """Test case for ScraperFactory.QUIK.""" 141 | 142 | 143 | class TivTaamTestCase(make_test_case(ScraperFactory.TIV_TAAM, 3)): 144 | """Test case for ScraperFactory.TIV_TAAM.""" 145 | 146 | 147 | class VictoryTestCase(make_test_case(ScraperFactory.VICTORY, 1)): 148 | """Test case for ScraperFactory.VICTORY.""" 149 | 150 | 151 | class YellowTestCase(make_test_case(ScraperFactory.YELLOW, 1272)): 152 | """Test case for ScraperFactory.YELLOW.""" 153 | 154 | 155 | class YohananofTestCase(make_test_case(ScraperFactory.YOHANANOF, 1)): 156 | """Test case for ScraperFactory.YOHANANOF.""" 157 | 158 | 159 | class ZolVeBegadolTestCase(make_test_case(ScraperFactory.ZOL_VEBEGADOL, 4)): 160 | """Test case for ScraperFactory.ZOL_VEBEGADOL.""" 161 | 162 | 163 | class WoltTestCase(make_test_case(ScraperFactory.WOLT, 0)): 164 | """Test case for ScraperFactory.Wolt.""" 165 | -------------------------------------------------------------------------------- /il_supermarket_scarper/engines/matrix.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | from il_supermarket_scarper.utils import Logger 3 | from .apsx import Aspx 4 | 5 | 6 | class Matrix(Aspx): 7 | """scraper for all matrix base site. 
8 | (support adveanced search: follow the instrucation the page)""" 9 | 10 | utilize_date_param = False 11 | 12 | def __init__( 13 | self, 14 | chain, 15 | chain_id, 16 | url="https://laibcatalog.co.il/", 17 | aspx_page="NBCompetitionRegulations.aspx", 18 | chain_hebrew_name=None, 19 | folder_name=None, 20 | ): 21 | super().__init__(chain, chain_id, url, aspx_page, folder_name=folder_name) 22 | self.chain_hebrew_name = chain_hebrew_name 23 | 24 | # def get_file_types_id(self, files_types=None): 25 | # """get the file type id""" 26 | # if files_types is None: 27 | # return "all" 28 | 29 | # types = [] 30 | # for ftype in files_types: 31 | # if ftype == FileTypesFilters.STORE_FILE.name: 32 | # types.append("storefull") 33 | # if ftype == FileTypesFilters.PRICE_FILE.name: 34 | # types.append("price") 35 | # if ftype == FileTypesFilters.PROMO_FILE.name: 36 | # types.append("promo") 37 | # if ftype == FileTypesFilters.PRICE_FULL_FILE.name: 38 | # types.append("pricefull") 39 | # if ftype == FileTypesFilters.PROMO_FULL_FILE.name: 40 | # types.append("promofull") 41 | # return types 42 | 43 | # def get_when(self, when_date): 44 | # """get the when date""" 45 | # if when_date is None: 46 | # when_date = _now() 47 | # return when_date.strftime("%d/%m/%Y") 48 | 49 | # def get_chain_n_stores__id(self, store_id=None, c_id=None): 50 | # """get the store id""" 51 | # if store_id is None: 52 | # chain_id = str(c_id) # + "001" 53 | # store_id = "-1" 54 | # else: 55 | # chain_id = str(c_id) 56 | # store_id = str(c_id) + "001" + str(store_id).zfill(3) 57 | # return chain_id, store_id 58 | 59 | def _build_query_url(self, query_params, base_urls): 60 | res = [] 61 | for base in base_urls: 62 | res.append( 63 | { 64 | "method": "GET", 65 | "url": base, 66 | # "body": query_params, 67 | } 68 | ) 69 | return res 70 | 71 | def _get_all_possible_query_string_params( 72 | self, files_types=None, store_id=None, when_date=None 73 | ): 74 | """get the arguments need to add to the url""" 75 | 76 | return [{}] 77 | # post_body = [] 78 | # if isinstance(self.chain_id, list): 79 | # for c_id in self.chain_id: 80 | # chain_id, store_id = self.get_chain_n_stores__id( 81 | # store_id=store_id, c_id=c_id 82 | # ) 83 | # post_body.append( 84 | # { 85 | 86 | # "ctl00$TextArea": "", 87 | # "ctl00$MainContent$chain": chain_id, 88 | # "ctl00$MainContent$subChain": "-1", 89 | # "ctl00$MainContent$branch": store_id, 90 | # "ctl00$MainContent$txtDate": self.get_when(when_date=when_date), 91 | # "ctl00$MainContent$fileType": "all", 92 | # # "ctl00$MainContent$btnSearch": "חיפוש", 93 | # } 94 | # ) 95 | # else: 96 | # chain_id, store_id = self.get_chain_n_stores__id( 97 | # store_id=store_id, c_id=self.chain_id 98 | # ) 99 | # post_body.append( 100 | # { 101 | # "ctl00$TextArea": "", 102 | # "ctl00$MainContent$chain": chain_id, 103 | # "ctl00$MainContent$subChain": "-1", 104 | # "ctl00$MainContent$branch": store_id, 105 | # "ctl00$MainContent$txtDate": self.get_when(when_date=when_date), 106 | # "ctl00$MainContent$fileType": "all", 107 | # "ctl00$MainContent$btnSearch": "חיפוש", 108 | # } 109 | # ) 110 | 111 | # # add file types to url 112 | # if files_types: 113 | # chains_urls_with_types = [] 114 | # for files_type in self.get_file_types_id(files_types=files_types): 115 | # for chain_url in post_body: 116 | # chain_url["ctl00$MainContent$fileType"] = files_type 117 | # chains_urls_with_types.append(chain_url) 118 | # post_body = chains_urls_with_types 119 | 120 | # return post_body 121 | 122 | def get_href_from_entry(self, entry): 
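The entries handled by the accessors below are BeautifulSoup `<tr>` tags taken from the results table parsed in `get_data_from_page`. A minimal sketch, using a fabricated row, of how the download link and the extension-free file name are recovered:

```python
from bs4 import BeautifulSoup

# A fabricated results row; the real page layout may differ.
html = (
    "<table>"
    "<tr><th>file</th></tr>"
    "<tr><td><a href='/files/PriceFull0000000000000-000-202410070010.gz'>download</a></td></tr>"
    "</table>"
)
rows = BeautifulSoup(html, features="lxml").find_all("tr")[1:]  # skip the title row

for row in rows:
    href = row.a.attrs["href"]  # what get_href_from_entry returns
    name = href.split("/")[-1].split(".gz")[0].split(".")[0]  # get_file_name_no_ext_from_entry
    print(href, name)
```

When several chains share the same catalog page, `get_data_from_page` additionally filters the rows by `chain_hebrew_name`, as shown further below.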
123 | """get download link for entry (tr)""" 124 | return entry.a.attrs["href"] 125 | 126 | def get_file_name_no_ext_from_entry(self, entry): 127 | """get the file name without extensions from entey (tr)""" 128 | return entry.split("/")[-1].split(".gz")[0].split(".")[0] 129 | 130 | def get_data_from_page(self, req_res): 131 | soup = BeautifulSoup(req_res.text, features="lxml") 132 | all_trs = list(soup.find_all("tr"))[1:] # skip title 133 | 134 | Logger.info(f"Before filtring names found {len(all_trs)} entries") 135 | if self.chain_hebrew_name: 136 | all_trs = list( 137 | filter(lambda x: x and self.chain_hebrew_name in str(x), all_trs) 138 | ) 139 | Logger.info(f"After filtering names found {len(all_trs)} entries") 140 | return all_trs 141 | -------------------------------------------------------------------------------- /il_supermarket_scarper/utils/scraper_status.py: -------------------------------------------------------------------------------- 1 | import os 2 | import traceback 3 | 4 | from .logger import Logger 5 | from .status import log_folder_details 6 | from .databases import JsonDataBase 7 | from .status import _now, get_output_folder 8 | from .lock_utils import lock_by_string 9 | 10 | 11 | class ScraperStatus: 12 | """A class that abstracts the database interface for scraper status.""" 13 | 14 | STARTED = "started" 15 | COLLECTED = "collected" 16 | DOWNLOADED = "downloaded" 17 | FAILED = "fail" 18 | ESTIMATED_SIZE = "estimated_size" 19 | VERIFIED_DOWNLOADS = "verified_downloads" 20 | 21 | def __init__(self, database_name, base_path, folder_name=None) -> None: 22 | self.database = JsonDataBase( 23 | database_name, get_output_folder(base_path, folder_name=folder_name) 24 | ) 25 | self.task_id = _now().strftime("%Y%m%d%H%M%S") 26 | self.filter_between_itrations = False 27 | 28 | @lock_by_string() 29 | def on_scraping_start(self, limit, files_types, **additional_info): 30 | """Report that scraping has started.""" 31 | self._insert_an_update( 32 | ScraperStatus.STARTED, 33 | limit=limit, 34 | files_requested=files_types, 35 | **additional_info, 36 | ) 37 | 38 | def enable_collection_status(self): 39 | """enable data collection to status files""" 40 | self.database.enable_collection_status() 41 | 42 | def enable_aggregation_between_runs(self): 43 | """allow tracking the downloaded file and don't downloading again if downloaded""" 44 | self.filter_between_itrations = True 45 | 46 | @lock_by_string() 47 | def on_collected_details( 48 | self, 49 | file_name_collected_from_site, 50 | links_collected_from_site="", 51 | **additional_info, 52 | ): 53 | """Report that file details have been collected.""" 54 | self._insert_an_update( 55 | ScraperStatus.COLLECTED, 56 | file_name_collected_from_site=file_name_collected_from_site, 57 | links_collected_from_site=links_collected_from_site, 58 | **additional_info, 59 | ) 60 | 61 | @lock_by_string() 62 | def on_download_completed(self, **additional_info): 63 | """Report that the file has been downloaded.""" 64 | self._insert_an_update(ScraperStatus.DOWNLOADED, **additional_info) 65 | self._add_downloaded_files_to_list(**additional_info) 66 | 67 | def filter_already_downloaded( 68 | self, storage_path, files_names_to_scrape, filelist, by_function=lambda x: x 69 | ): 70 | """Filter files already existing in long-term memory or previously downloaded.""" 71 | if self.database.is_collection_enabled() and self.filter_between_itrations: 72 | new_filelist = [] 73 | for file in filelist: 74 | if not self.database.find_document( 75 | self.VERIFIED_DOWNLOADS, 
{"file_name": by_function(file)} 76 | ): 77 | new_filelist.append(file) 78 | else: 79 | Logger.debug( 80 | f"Filtered file {file} since it was already downloaded and extracted" 81 | ) 82 | return new_filelist 83 | 84 | # Fallback: filter according to the disk 85 | exits_on_disk = os.listdir(storage_path) 86 | 87 | if files_names_to_scrape: 88 | # Delete any files we want to retry downloading 89 | for file in exits_on_disk: 90 | if file.split(".")[0] in files_names_to_scrape: 91 | os.remove(os.path.join(storage_path, file)) 92 | 93 | # Filter the files to download 94 | filelist = list( 95 | filter(lambda x: by_function(x) in files_names_to_scrape, filelist) 96 | ) 97 | 98 | return list(filter(lambda x: by_function(x) not in exits_on_disk, filelist)) 99 | 100 | def _add_downloaded_files_to_list(self, results, **_): 101 | """Add downloaded files to the MongoDB collection.""" 102 | if self.database.is_collection_enabled(): 103 | when = _now() 104 | 105 | documents = [] 106 | for res in results: 107 | if res["extract_succefully"]: 108 | documents.append( 109 | {"file_name": res["file_name"], "when": when}, 110 | ) 111 | self.database.insert_documents(self.VERIFIED_DOWNLOADS, documents) 112 | 113 | @lock_by_string() 114 | def on_scrape_completed(self, folder_name, completed_successfully=True): 115 | """Report when scraping is completed.""" 116 | self._insert_an_update( 117 | ScraperStatus.ESTIMATED_SIZE, 118 | folder_size=log_folder_details(folder_name), 119 | completed_successfully=completed_successfully, 120 | ) 121 | 122 | @lock_by_string() 123 | def on_download_fail(self, execption, download_urls=None, file_names=None): 124 | """report when the scraping in failed""" 125 | self._insert_an_update( 126 | ScraperStatus.FAILED, 127 | execption=str(execption), 128 | traceback=traceback.format_exc(), 129 | download_urls=download_urls if download_urls else [], 130 | file_names=file_names if file_names else [], 131 | ) 132 | 133 | def _insert_an_update(self, status, **additional_info): 134 | """Insert an update into the MongoDB collection.""" 135 | document = { 136 | "status": status, 137 | "when": _now(), 138 | **additional_info, 139 | } 140 | self.database.insert_document(self.task_id, document) 141 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Israel Supermarket Scraper: Clients to download the data published by the supermarkets. 2 | ======================================= 3 | This is a scraper for ALL the supermarket chains listed in the GOV.IL site. 4 | 5 | שקיפות מחירים (השוואת מחירים) - https://www.gov.il/he/departments/legalInfo/cpfta_prices_regulations 6 | 7 | 8 | 9 | 10 | [](https://github.com/OpenIsraeliSupermarkets/israeli-supermarket-scarpers/actions/workflows/test-suite.yml) 11 | [](https://github.com/OpenIsraeliSupermarkets/israeli-supermarket-scarpers/actions/workflows/codeql.yml) 12 | [](https://github.com/OpenIsraeliSupermarkets/israeli-supermarket-scarpers/actions/workflows/pylint.yml) 13 | [](https://github.com/OpenIsraeliSupermarkets/israeli-supermarket-scarpers/actions/workflows/docker-publish.yml) 14 | [](https://github.com/OpenIsraeliSupermarkets/israeli-supermarket-scarpers/actions/workflows/python-publish.yml) 15 | 16 | ## 🤗 Want to support my work? 17 |
18 |
19 |
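Returning to `retrieve_file` in il_supermarket_scarper/engines/bina.py above: the download link does not point at the file itself but at a small JSON document whose `SPath` field holds the real storage URL. A rough sketch of that two-step flow, using plain `requests` instead of the engine's cookie-aware session, with a made-up link:

```python
import json
import requests

# Made-up link for illustration; the engine builds it from the entry's FileNm field.
file_link = "http://example.binaprojects.com/Download.aspx?FileNm=PriceFull0000000000000-000-202410070010.gz"

response = requests.get(file_link, timeout=30)
spath = json.loads(response.content)  # e.g. [{"SPath": "https://storage.example/PriceFull-example.gz"}]
real_url = spath[0]["SPath"]

ext = file_link.split(".")[-1]  # "gz"
with open("PriceFull-example." + ext, "wb") as out:
    out.write(requests.get(real_url, timeout=30).content)
```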
20 |
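Finally, the status-tracking lifecycle implemented in il_supermarket_scarper/utils/scraper_status.py above can be driven roughly as follows; the database name, paths, and file entries are placeholders, and the last call assumes the dump folder already exists:

```python
from il_supermarket_scarper.utils.scraper_status import ScraperStatus

# Placeholder database name and base path; the JSON status files are written
# under the output folder derived from base_path/folder_name.
status = ScraperStatus("example_chain", base_path="./dumps")

status.on_scraping_start(limit=5, files_types=["PRICE_FULL_FILE"])
status.on_collected_details(
    file_name_collected_from_site=["PriceFull-example.gz"],
    links_collected_from_site=["http://example.test/PriceFull-example.gz"],
)
status.on_download_completed(
    results=[{"file_name": "PriceFull-example", "extract_succefully": True}]
)
status.on_scrape_completed("./dumps/ExampleChain", completed_successfully=True)
```

Calling `enable_collection_status()` and `enable_aggregation_between_runs()` additionally records verified downloads, so `filter_already_downloaded` can skip files that were already fetched in earlier runs.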