├── il_supermarket_scarper ├── scrappers │ ├── tests │ │ ├── __init__.py │ │ ├── test_all.py │ │ └── test_cases.py │ ├── bareket.py │ ├── keshet.py │ ├── good_pharm.py │ ├── king_store.py │ ├── osherad.py │ ├── polizer.py │ ├── shuk_ahir.py │ ├── tivtaam.py │ ├── het_cohen.py │ ├── maayan2000.py │ ├── super_sapir.py │ ├── yohananof.py │ ├── doralon.py │ ├── victory.py │ ├── zolvebegadol.py │ ├── quik.py │ ├── mega.py │ ├── ramilevy.py │ ├── machsani_ashuk.py │ ├── shefa_barcart_ashem.py │ ├── bitan.py │ ├── salachdabach.py │ ├── superdosh.py │ ├── yellow.py │ ├── super_yuda.py │ ├── nativ_hashed.py │ ├── stop_market.py │ ├── cofix.py │ ├── __init__.py │ ├── shufersal.py │ ├── meshnat_yosef.py │ ├── wolt.py │ ├── super_pharm.py │ ├── hazihinam.py │ └── city_market.py ├── utils │ ├── databases │ │ ├── __init__.py │ │ ├── base.py │ │ ├── mongo.py │ │ └── json_file.py │ ├── exceptions.py │ ├── tests │ │ ├── PriceFull7290876100000-003-202410070010.gz │ │ ├── test_connection.py │ │ ├── test_gzip_utils.py │ │ ├── test_file_type.py │ │ └── test_status.py │ ├── lock_utils.py │ ├── __init__.py │ ├── folders_name.py │ ├── loop.py │ ├── gzip_utils.py │ ├── logger.py │ ├── validation.py │ ├── file_cache.py │ ├── file_types.py │ ├── scraper_status.py │ ├── retry.py │ └── status.py ├── engines │ ├── __init__.py │ ├── apsx.py │ ├── publishprice.py │ ├── bina.py │ ├── matrix.py │ ├── web.py │ ├── cerberus.py │ └── multipage_web.py ├── __init__.py ├── tests │ └── test_scrappers_factory.py ├── main.py ├── scrapper_runner.py ├── scrappers_factory.py └── scraper_stability.py ├── pytest.ini ├── MANIFEST.in ├── setup.cfg ├── requirements-dev.txt ├── .pylintrc ├── .gitignore ├── requirements.txt ├── .devcontainer └── devcontainer.json ├── example.py ├── .vscode └── launch.json ├── .github └── workflows │ ├── pylint.yml │ ├── python-publish.yml │ ├── user-validation.yml │ ├── docker-publish.yml │ ├── test-suite.yml │ └── codeql.yml ├── tests ├── test_integration.py └── test_main.py ├── Dockerfile ├── setup.py ├── main.py ├── stress_test.py ├── LICENSE.txt └── README.md /il_supermarket_scarper/scrappers/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | filterwarnings = 3 | ignore::UserWarning -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include requirements.txt 2 | include requirements-dev.txt 3 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | # Inside of setup.cfg 2 | [metadata] 3 | description-file = README.md 4 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | pytest==7.1 2 | zipp==3.19.1 # patch pytest vulnerability 3 | black==24.3.0 4 | pylint==3.0.1 -------------------------------------------------------------------------------- /il_supermarket_scarper/utils/databases/__init__.py: -------------------------------------------------------------------------------- 1 | from .json_file import JsonDataBase 2 | from .mongo import MongoDataBase 3 | 
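# Illustrative sketch, not a repository file: JsonDataBase and MongoDataBase are
# interchangeable because both implement the AbstractDataBase contract defined in
# il_supermarket_scarper/utils/databases/base.py further below. A minimal in-memory
# backend written against only that contract could look like this; the
# InMemoryDataBase name is hypothetical.
from il_supermarket_scarper.utils.databases.base import AbstractDataBase


class InMemoryDataBase(AbstractDataBase):
    """toy backend that keeps documents in a per-collection list, handy for tests"""

    def __init__(self, database_name):
        super().__init__(database_name)
        self.collections = {}

    def insert_document(self, collection_name, document):
        # mirror the real backends: write only when collection is enabled
        if self.is_collection_enabled():
            self.collections.setdefault(collection_name, []).append(document)

    def find_document(self, collection_name, query):
        # return the first document whose fields match the query
        for document in self.collections.get(collection_name, []):
            if all(document.get(key) == value for key, value in query.items()):
                return document
        return None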
-------------------------------------------------------------------------------- /.pylintrc: -------------------------------------------------------------------------------- 1 | [MAIN] 2 | disable= 3 | C0114, # missing-module-docstring 4 | R0913, # too-many-arguments 5 | extension-pkg-allow-list=lxml.etree -------------------------------------------------------------------------------- /il_supermarket_scarper/utils/exceptions.py: -------------------------------------------------------------------------------- 1 | class RestartSessionError(Exception): 2 | """This error will be raised if we would like to retry to downalod after a session restart""" 3 | -------------------------------------------------------------------------------- /il_supermarket_scarper/engines/__init__.py: -------------------------------------------------------------------------------- 1 | from .cerberus import Cerberus 2 | from .multipage_web import MultiPageWeb 3 | from .matrix import Matrix 4 | from .bina import Bina 5 | from .publishprice import PublishPrice 6 | -------------------------------------------------------------------------------- /il_supermarket_scarper/utils/tests/PriceFull7290876100000-003-202410070010.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenIsraeliSupermarkets/israeli-supermarket-scarpers/HEAD/il_supermarket_scarper/utils/tests/PriceFull7290876100000-003-202410070010.gz -------------------------------------------------------------------------------- /il_supermarket_scarper/__init__.py: -------------------------------------------------------------------------------- 1 | from .main import ScarpingTask 2 | from .scrappers_factory import ScraperFactory 3 | from .scraper_stability import ScraperStability 4 | from .utils import FileTypesFilters, DumpFolderNames, datetime_in_tlv 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | env/ 3 | *_cookies.txt 4 | dist/ 5 | il_supermarket_scraper.egg-info/ 6 | build/ 7 | database/* 8 | dumps/* 9 | logging.log 10 | temp*/ 11 | .vscode/settings.json 12 | .DS_Store 13 | test_dump 14 | status/ 15 | .cache/ 16 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | retry==0.9.2 2 | mock==4.0.3 3 | requests==2.32.2 4 | lxml==5.2.1 5 | beautifulsoup4==4.10.0 6 | pymongo==4.6.3 7 | dnspython==2.6.1 # patch pymongo vulnerability 8 | pytz==2022.4 9 | holidays==0.45 10 | cachetools==5.2.0 11 | pytest-playwright==0.7.0 -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/bareket.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.engines import Bina 2 | from il_supermarket_scarper.utils import DumpFolderNames 3 | 4 | 5 | class Bareket(Bina): 6 | """scarper for bareket""" 7 | 8 | def __init__(self, folder_name=None): 9 | super().__init__( 10 | chain=DumpFolderNames.BAREKET, 11 | chain_id="7290875100001", 12 | url_perfix="superbareket", 13 | folder_name=folder_name, 14 | ) 15 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/keshet.py: -------------------------------------------------------------------------------- 1 | from 
il_supermarket_scarper.engines import Cerberus 2 | from il_supermarket_scarper.utils import DumpFolderNames 3 | 4 | 5 | class Keshet(Cerberus): 6 | """scraper for keshet tamim""" 7 | 8 | def __init__(self, folder_name=None): 9 | super().__init__( 10 | chain=DumpFolderNames.KESHET, 11 | chain_id="7290785400000", 12 | folder_name=folder_name, 13 | ftp_username="Keshet", 14 | ) 15 | -------------------------------------------------------------------------------- /.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | { 2 | "build": { 3 | "dockerfile": "../Dockerfile", 4 | "target":"test", 5 | "args": { 6 | "PY_VERSION":"3.11.0" 7 | } 8 | }, 9 | "customizations": { 10 | "vscode": { 11 | "extensions": [ 12 | "ms-python.python", 13 | "ms-python.vscode-pylance", 14 | "ms-toolsai.jupyter", 15 | "LittleFoxTeam.vscode-python-test-adapter" 16 | ] 17 | } 18 | }, 19 | 20 | "forwardPorts": [3000] 21 | } 22 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/good_pharm.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.engines import Bina 2 | from il_supermarket_scarper.utils import DumpFolderNames 3 | 4 | 5 | class GoodPharm(Bina): 6 | """scraper for good pharm""" 7 | 8 | def __init__(self, folder_name=None): 9 | super().__init__( 10 | chain=DumpFolderNames.GOOD_PHARM, 11 | chain_id="7290058197699", 12 | url_perfix="goodpharm", 13 | folder_name=folder_name, 14 | ) 15 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/king_store.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.engines import Bina 2 | from il_supermarket_scarper.utils import DumpFolderNames 3 | 4 | 5 | class KingStore(Bina): 6 | """scraper for king store""" 7 | 8 | def __init__(self, folder_name=None): 9 | super().__init__( 10 | chain=DumpFolderNames.KING_STORE, 11 | chain_id="7290058108879", 12 | url_perfix="kingstore", 13 | folder_name=folder_name, 14 | ) 15 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/osherad.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.engines import Cerberus 2 | from il_supermarket_scarper.utils import DumpFolderNames 3 | 4 | 5 | class Osherad(Cerberus): 6 | """scraper for osher ad""" 7 | 8 | def __init__(self, folder_name=None): 9 | super().__init__( 10 | chain=DumpFolderNames.OSHER_AD, 11 | chain_id="7290103152017", 12 | folder_name=folder_name, 13 | ftp_username="osherad", 14 | ) 15 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/polizer.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.engines import Cerberus 2 | from il_supermarket_scarper.utils import DumpFolderNames 3 | 4 | 5 | class Polizer(Cerberus): 6 | """scraper for polizer""" 7 | 8 | def __init__(self, folder_name=None): 9 | super().__init__( 10 | chain=DumpFolderNames.POLIZER, 11 | chain_id="7291059100008", 12 | folder_name=folder_name, 13 | ftp_username="politzer", 14 | ) 15 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/shuk_ahir.py:
-------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.engines import Bina 2 | from il_supermarket_scarper.utils import DumpFolderNames 3 | 4 | 5 | class ShukAhir(Bina): 6 | """scraper for shuk a hir""" 7 | 8 | def __init__(self, folder_name=None): 9 | super().__init__( 10 | chain=DumpFolderNames.SHUK_AHIR, 11 | chain_id="7290058148776", 12 | url_perfix="shuk-hayir", 13 | folder_name=folder_name, 14 | ) 15 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/tivtaam.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.engines import Cerberus 2 | from il_supermarket_scarper.utils import DumpFolderNames 3 | 4 | 5 | class TivTaam(Cerberus): 6 | """scraper for tiv taam""" 7 | 8 | def __init__(self, folder_name=None): 9 | super().__init__( 10 | chain=DumpFolderNames.TIV_TAAM, 11 | chain_id="7290873255550", 12 | folder_name=folder_name, 13 | ftp_username="TivTaam", 14 | ) 15 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/het_cohen.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.engines import Matrix 2 | from il_supermarket_scarper.utils import DumpFolderNames 3 | 4 | 5 | class HetCohen(Matrix): 6 | """scraper for ChetCohen""" 7 | 8 | def __init__(self, folder_name=None): 9 | super().__init__( 10 | chain=DumpFolderNames.HET_COHEN, 11 | chain_id=["7290455000004"], 12 | folder_name=folder_name, 13 | chain_hebrew_name="ח. כהן", 14 | ) 15 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/maayan2000.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.engines import Bina 2 | from il_supermarket_scarper.utils import DumpFolderNames 3 | 4 | 5 | class Maayan2000(Bina): 6 | """scaper for maayan 2000""" 7 | 8 | def __init__(self, folder_name=None): 9 | super().__init__( 10 | chain=DumpFolderNames.MAAYAN_2000, 11 | chain_id="7290058159628", 12 | url_perfix="maayan2000", 13 | folder_name=folder_name, 14 | ) 15 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/super_sapir.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.engines import Bina 2 | from il_supermarket_scarper.utils import DumpFolderNames 3 | 4 | 5 | class SuperSapir(Bina): 6 | """scaper for super sapir""" 7 | 8 | def __init__(self, folder_name=None): 9 | super().__init__( 10 | chain=DumpFolderNames.SUPER_SAPIR, 11 | chain_id="7290058156016", 12 | url_perfix="supersapir", 13 | folder_name=folder_name, 14 | ) 15 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/yohananof.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.engines import Cerberus 2 | from il_supermarket_scarper.utils import DumpFolderNames 3 | 4 | 5 | class Yohananof(Cerberus): 6 | """scraper for yohananof""" 7 | 8 | def __init__(self, folder_name=None): 9 | super().__init__( 10 | chain=DumpFolderNames.YOHANANOF, 11 | chain_id="7290803800003", 12 | folder_name=folder_name, 13 | ftp_username="yohananof", 14 | ) 15 | 
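# Illustrative sketch, not a repository file: every chain scraper above follows the
# same pattern -- subclass the engine that matches the chain's hosting provider
# (Cerberus, Bina, Matrix, ...) and pass the chain's identifiers. "MyChain",
# DumpFolderNames.MY_CHAIN and the chain_id below are hypothetical placeholders;
# a real addition would also need a matching DumpFolderNames entry.
from il_supermarket_scarper.engines import Cerberus
from il_supermarket_scarper.utils import DumpFolderNames


class MyChain(Cerberus):
    """scraper for a hypothetical chain hosted on the Cerberus provider"""

    def __init__(self, folder_name=None):
        super().__init__(
            chain=DumpFolderNames.MY_CHAIN,
            chain_id="7290000000000",
            folder_name=folder_name,
            ftp_username="mychain",
        )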
-------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/doralon.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.engines import Cerberus 2 | from il_supermarket_scarper.utils import DumpFolderNames 3 | 4 | 5 | class DorAlon(Cerberus): 6 | """scraper for dor alon""" 7 | 8 | def __init__(self, folder_name=None): 9 | super().__init__( 10 | folder_name=folder_name, 11 | chain=DumpFolderNames.DOR_ALON, 12 | chain_id=["7290492000005", "729049000005"], 13 | ftp_username="doralon", 14 | ) 15 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/victory.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.engines import Matrix 2 | from il_supermarket_scarper.utils import DumpFolderNames 3 | 4 | 5 | class Victory(Matrix): 6 | """scraper for victory""" 7 | 8 | def __init__(self, folder_name=None): 9 | super().__init__( 10 | chain=DumpFolderNames.VICTORY, 11 | chain_hebrew_name="ויקטורי", 12 | chain_id=["7290696200003", "7290058103393"], 13 | folder_name=folder_name, 14 | ) 15 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/zolvebegadol.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.engines import Bina 2 | from il_supermarket_scarper.utils import DumpFolderNames 3 | 4 | 5 | class ZolVeBegadol(Bina): 6 | """scraper for zol-ve-begadol""" 7 | 8 | def __init__(self, folder_name=None): 9 | super().__init__( 10 | chain=DumpFolderNames.ZOL_VEBEGADOL, 11 | chain_id="7290058173198", 12 | url_perfix="zolvebegadol", 13 | folder_name=folder_name, 14 | ) 15 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/quik.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.engines.publishprice import PublishPrice 2 | from il_supermarket_scarper.utils import DumpFolderNames 3 | 4 | 5 | # @FlakyScraper 6 | class Quik(PublishPrice): 7 | """scraper for quik""" 8 | 9 | def __init__(self, folder_name=None): 10 | super().__init__( 11 | chain=DumpFolderNames.QUIK, 12 | chain_id="7291029710008", 13 | site_infix="quik", 14 | folder_name=folder_name, 15 | ) 16 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/mega.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.engines.publishprice import PublishPrice 2 | from il_supermarket_scarper.utils import DumpFolderNames 3 | 4 | 5 | # removed : 1.7.2025 6 | class Mega(PublishPrice): 7 | """scraper for mega""" 8 | 9 | def __init__(self, folder_name=None): 10 | super().__init__( 11 | chain=DumpFolderNames.MEGA, 12 | chain_id="7290055700007", 13 | site_infix="mega", 14 | folder_name=folder_name, 15 | ) 16 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/ramilevy.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.engines import Cerberus 2 | from il_supermarket_scarper.utils import DumpFolderNames 3 | 4 | 5 | class RamiLevy(Cerberus): 6 | """scraper for rami levy""" 7 | 8 | def __init__(self, folder_name=None): 9 |
super().__init__( 10 | chain=DumpFolderNames.RAMI_LEVY, 11 | chain_id="7290058140886", 12 | folder_name=folder_name, 13 | ftp_username="RamiLevi", 14 | max_threads=10, 15 | ) 16 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/machsani_ashuk.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.engines import Matrix 2 | from il_supermarket_scarper.utils import DumpFolderNames 3 | 4 | 5 | class MahsaniAShuk(Matrix): 6 | """scraper for masani hsuk""" 7 | 8 | def __init__(self, folder_name=None): 9 | super().__init__( 10 | chain=DumpFolderNames.MAHSANI_ASHUK, 11 | chain_id=["7290661400001", "7290633800006"], 12 | folder_name=folder_name, 13 | chain_hebrew_name="מחסני השוק", 14 | ) 15 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/shefa_barcart_ashem.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.engines import Bina 2 | from il_supermarket_scarper.utils import DumpFolderNames 3 | 4 | 5 | class ShefaBarcartAshem(Bina): 6 | """scraper for shefa berkat ashem""" 7 | 8 | def __init__(self, folder_name=None): 9 | super().__init__( 10 | chain=DumpFolderNames.SHEFA_BARCART_ASHEM, 11 | chain_id="7290058134977", 12 | url_perfix="shefabirkathashem", 13 | folder_name=folder_name, 14 | ) 15 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/bitan.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.engines.publishprice import PublishPrice 2 | from il_supermarket_scarper.utils import DumpFolderNames 3 | 4 | 5 | class YaynotBitanAndCarrefour(PublishPrice): 6 | """scaper for yaynot beitan""" 7 | 8 | def __init__(self, folder_name=None): 9 | super().__init__( 10 | chain=DumpFolderNames.YAYNO_BITAN_AND_CARREFOUR, 11 | chain_id="7290055700007", 12 | site_infix="carrefour", 13 | folder_name=folder_name, 14 | ) 15 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/salachdabach.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.engines import Cerberus 2 | from il_supermarket_scarper.utils import DumpFolderNames 3 | 4 | 5 | class SalachDabach(Cerberus): 6 | """scraper for salach dabach""" 7 | 8 | def __init__(self, folder_name=None): 9 | super().__init__( 10 | chain=DumpFolderNames.SALACH_DABACH, 11 | chain_id="7290526500006", 12 | folder_name=folder_name, 13 | ftp_username="SalachD", 14 | ftp_password="12345", 15 | ) 16 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/superdosh.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.engines import Cerberus 2 | from il_supermarket_scarper.utils import DumpFolderNames 3 | 4 | 5 | class FreshMarketAndSuperDosh(Cerberus): 6 | """scraper for fresh market and super dush""" 7 | 8 | def __init__(self, folder_name=None): 9 | super().__init__( 10 | chain=DumpFolderNames.FRESH_MARKET_AND_SUPER_DOSH, 11 | chain_id="7290876100000", 12 | folder_name=folder_name, 13 | ftp_username="freshmarket", 14 | ) 15 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/yellow.py: 
-------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.engines import Cerberus 2 | from il_supermarket_scarper.utils import DumpFolderNames 3 | 4 | 5 | class Yellow(Cerberus): 6 | """scraper for yellow""" 7 | 8 | def __init__(self, folder_name=None): 9 | super().__init__( 10 | chain=DumpFolderNames.YELLOW, 11 | chain_id="7290644700005", 12 | folder_name=folder_name, 13 | ftp_username="Paz_bo", 14 | ftp_password="paz468", 15 | max_threads=10, 16 | ) 17 | -------------------------------------------------------------------------------- /example.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper import ScarpingTask, ScraperFactory 2 | from il_supermarket_scarper.utils import _now, Logger 3 | 4 | Logger.set_logging_level("INFO") 5 | 6 | if __name__ == "__main__": 7 | scraper = ScarpingTask( 8 | dump_folder_name="dumps", 9 | lookup_in_db=False, 10 | multiprocessing=2, 11 | limit=1, 12 | enabled_scrapers=[ScraperFactory.BAREKET.name], 13 | # size_estimation_mode=True, # download files,log size, delete files 14 | when_date=_now(), 15 | ) 16 | scraper.start() 17 | -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | // Use IntelliSense to learn about possible attributes. 3 | // Hover to view descriptions of existing attributes. 4 | // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 5 | "version": "0.2.0", 6 | "configurations": [ 7 | { 8 | "name": "Debug Unit Test", 9 | "type": "python", 10 | "request": "test", 11 | "justMyCode": false, 12 | // "env": { 13 | // "DISABLED_SCRAPPERS" : "BAREKET" 14 | // } 15 | } 16 | ] 17 | } -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/super_yuda.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.engines import Cerberus 2 | from il_supermarket_scarper.utils import DumpFolderNames 3 | 4 | 5 | class SuperYuda(Cerberus): 6 | """scraper for super yuda""" 7 | 8 | def __init__(self, folder_name=None): 9 | super().__init__( 10 | chain=DumpFolderNames.SUPER_YUDA, 11 | chain_id=["7290058198450", "7290058177776"], 12 | ftp_username="yuda_ho", 13 | ftp_password="Yud@147", 14 | ftp_path="/Yuda", 15 | folder_name=folder_name, 16 | ) 17 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/nativ_hashed.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.engines.web import WebBase 2 | from il_supermarket_scarper.utils import DumpFolderNames 3 | 4 | 5 | # possible: NetivHased are down in Shabatz 6 | class NetivHased(WebBase): 7 | """scraper for nativ Hased""" 8 | 9 | utilize_date_param = False 10 | 11 | def __init__(self, folder_name=None): 12 | super().__init__( 13 | chain=DumpFolderNames.NETIV_HASED, 14 | chain_id="7290058160839", 15 | url="http://141.226.203.152/", 16 | folder_name=folder_name, 17 | ) 18 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/stop_market.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.engines import Cerberus 2 | from il_supermarket_scarper.utils import DumpFolderNames 3 | 
4 | 5 | class StopMarket(Cerberus): 6 | """scraper for stop market""" 7 | 8 | def __init__(self, folder_name=None): 9 | super().__init__( 10 | chain=DumpFolderNames.STOP_MARKET, 11 | chain_id=[ 12 | "72906390", 13 | "7290639000004", 14 | ], # in store files for some reason the store id is only 72906390 15 | folder_name=folder_name, 16 | ftp_username="Stop_Market", 17 | ) 18 | -------------------------------------------------------------------------------- /il_supermarket_scarper/utils/tests/test_connection.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | from il_supermarket_scarper.utils.connection import wget_file 4 | 5 | 6 | def test_wget_file_dont_exist(): 7 | """Test wget file that does not exist""" 8 | with pytest.raises(FileNotFoundError): 9 | wget_file( 10 | "https://pricesprodpublic.blob.core.windows.net/price/" 11 | "Price7290027600007-036-202503181800.gz?sv=2014-02-14&sr=b" 12 | "&sig=Me8hez2oy5vClACdE5fVOyyu5Qef%2FlEJSQYfMvQAOKg%3D&" 13 | "se=2025-03-18T18%3A02%3A59Z&sp=r", 14 | "some_file.gz", 15 | ) 16 | 17 | assert not os.path.exists("some_file.gz") 18 | -------------------------------------------------------------------------------- /il_supermarket_scarper/utils/tests/test_gzip_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | 4 | from il_supermarket_scarper.utils.gzip_utils import extract_xml_file_from_gz_file 5 | 6 | 7 | def test_unzip_bad_file(): 8 | """test unziping a bad file""" 9 | 10 | file_path = ( 11 | "il_supermarket_scarper/utils/tests/PriceFull7290876100000-003-202410070010.gz" 12 | ) 13 | file_content = None 14 | if os.path.exists(file_path): 15 | with open(file_path, "rb") as f: 16 | file_content = f.read() 17 | 18 | with pytest.raises(ValueError): 19 | extract_xml_file_from_gz_file(file_path) 20 | 21 | if file_content is not None and not os.path.exists(file_path): 22 | with open(file_path, "wb") as f: 23 | f.write(file_content) 24 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/cofix.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.engines import Cerberus 2 | from il_supermarket_scarper.utils import FileTypesFilters, DumpFolderNames 3 | 4 | 5 | class Cofix(Cerberus): 6 | """scraper for confix""" 7 | 8 | def __init__(self, folder_name=None): 9 | super().__init__( 10 | chain=DumpFolderNames.COFIX, 11 | chain_id="7291056200008", 12 | folder_name=folder_name, 13 | ftp_username="SuperCofixApp", 14 | ) 15 | 16 | def is_valid_file_empty(self, file_name): 17 | """it is valid the file is empty""" 18 | 19 | return super().is_valid_file_empty( 20 | file_name 21 | ) or FileTypesFilters.is_file_from_type( 22 | file_name, FileTypesFilters.STORE_FILE.name 23 | ) 24 | -------------------------------------------------------------------------------- /.github/workflows/pylint.yml: -------------------------------------------------------------------------------- 1 | name: Pylint 2 | 3 | on: 4 | push: 5 | branches: [ "main" ] 6 | pull_request: 7 | branches: [ "main" ] 8 | 9 | concurrency: 10 | group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} 11 | cancel-in-progress: true 12 | 13 | jobs: 14 | build: 15 | runs-on: ubuntu-latest 16 | strategy: 17 | matrix: 18 | python-version: ["3.8"] 19 | steps: 20 | - uses: actions/checkout@v3 21 | - name: Set up Python ${{ 
matrix.python-version }} 22 | uses: actions/setup-python@v3 23 | with: 24 | python-version: ${{ matrix.python-version }} 25 | - name: Install dependencies 26 | run: | 27 | python -m pip install --upgrade pip 28 | pip install pylint 29 | - name: Analysing the code with pylint 30 | run: | 31 | pylint $(git ls-files '*.py') --disable=E0401,R0801,R0903,W0707,R0917,C0114 32 | -------------------------------------------------------------------------------- /il_supermarket_scarper/utils/databases/base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | 4 | class AbstractDataBase(ABC): 5 | """Abstract base class for database operations.""" 6 | 7 | def __init__(self, database_name, collection_status=False) -> None: 8 | self.database_name = database_name.replace(" ", "_").lower() 9 | self.collection_status = collection_status 10 | 11 | def enable_collection_status(self): 12 | """Enable data collection to the database.""" 13 | self.collection_status = True 14 | 15 | @abstractmethod 16 | def insert_document(self, collection_name, document): 17 | """Insert a document into a collection.""" 18 | 19 | @abstractmethod 20 | def find_document(self, collection_name, query): 21 | """Find a document in a collection based on a query.""" 22 | 23 | def is_collection_enabled(self): 24 | """Check if collection is enabled.""" 25 | return self.collection_status 26 | 27 | def set_collection_status(self, status): 28 | """Set the data collection status.""" 29 | self.collection_status = status 30 | -------------------------------------------------------------------------------- /il_supermarket_scarper/utils/tests/test_file_type.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.utils import FileTypesFilters 2 | 3 | 4 | def test_file_type(): 5 | """test parsing a file name to its type enum""" 6 | assert ( 7 | FileTypesFilters.get_type_from_file("Price7290058108879-339-202409181941") 8 | == FileTypesFilters.PRICE_FILE 9 | ) 10 | assert ( 11 | FileTypesFilters.get_type_from_file("PriceFull7290058108879-339-202409181041") 12 | == FileTypesFilters.PRICE_FULL_FILE 13 | ) 14 | 15 | assert ( 16 | FileTypesFilters.get_type_from_file("StoresFull7290058108879-000-202409181041") 17 | == FileTypesFilters.STORE_FILE 18 | ) 19 | assert ( 20 | FileTypesFilters.get_type_from_file("Promo7290058108879-336-202409181544") 21 | == FileTypesFilters.PROMO_FILE 22 | ) 23 | assert ( 24 | FileTypesFilters.get_type_from_file("PromoFull7290058108879-339-202409181149") 25 | == FileTypesFilters.PROMO_FULL_FILE 26 | ) 27 | assert ( 28 | FileTypesFilters.get_type_from_file("Proasdull7290058108879-339-202409181149") 29 | is None 30 | ) 31 | -------------------------------------------------------------------------------- /il_supermarket_scarper/utils/tests/test_status.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | from il_supermarket_scarper.utils.status import ( 4 | get_status, 5 | get_status_date, 6 | get_statue_page, 7 | ) 8 | from il_supermarket_scarper.utils.connection import disable_when_outside_israel 9 | from il_supermarket_scarper.utils.validation import show_text_diff 10 | 11 | 12 | @disable_when_outside_israel 13 | def test_status(): 14 | """check we are able to get the number of scrapers from gov.il""" 15 | num_of_scarpers = get_status() 16 | assert isinstance(num_of_scarpers, int) 17 | 18 | 19 | @disable_when_outside_israel 20 | def
test_status_date(): 21 | """check we are able to get the date the gov.il site was updated""" 22 | date = get_status_date() 23 | assert isinstance(date, datetime.datetime) 24 | 25 | 26 | @disable_when_outside_israel 27 | def test_page_complete_diff(): 28 | """make sure the page content is the same as the cached page""" 29 | cached = get_statue_page(extraction_type="all_text", source="cache") 30 | current = get_statue_page(extraction_type="all_text", source="gov.il") 31 | assert current == cached, show_text_diff(cached, current) 32 | -------------------------------------------------------------------------------- /il_supermarket_scarper/tests/test_scrappers_factory.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper import ScraperStability, ScraperFactory, datetime_in_tlv 2 | from il_supermarket_scarper.utils import _is_saturday_in_israel 3 | 4 | 5 | def test_stable_scraper(): 6 | """test a sample stable scraper""" 7 | assert not ScraperStability.is_validate_scraper_found_no_files( 8 | ScraperFactory.VICTORY.name 9 | ) 10 | 11 | 12 | # def test_after_date(): 13 | # """test scrapers that failed after date""" 14 | # assert ScraperStability.is_validate_scraper_found_no_files( 15 | # ScraperFactory.CITY_MARKET_GIVATAYIM.name, 16 | # when_date=datetime_in_tlv(2024, 12, 12, 0, 0, 0), 17 | # ) 18 | 19 | 20 | def test_not_active(): 21 | """test the gap between active and listed scrapers""" 22 | test_date = datetime_in_tlv(2024, 12, 12, 0, 0, 0) 23 | all_listed = ScraperFactory.all_listed_scrappers() 24 | all_active = ScraperFactory.all_scrapers_name(when_date=test_date) 25 | 26 | expected_to_fail = 0 27 | if _is_saturday_in_israel(test_date): 28 | expected_to_fail += 1 # only 'NetivHased' should fail 29 | 30 | assert len(set(all_listed) - set(all_active)) == expected_to_fail 31 | -------------------------------------------------------------------------------- /il_supermarket_scarper/utils/lock_utils.py: -------------------------------------------------------------------------------- 1 | from threading import Lock 2 | from functools import wraps 3 | 4 | 5 | class LockManager: 6 | """Manages locks based on string values.""" 7 | 8 | def __init__(self): 9 | self.locks = {} 10 | 11 | def get_lock(self, key): 12 | """Get or create a lock based on the string key.""" 13 | if key not in self.locks: 14 | self.locks[key] = Lock() 15 | return self.locks[key] 16 | 17 | 18 | lock_manager = LockManager() 19 | 20 | 21 | def lock_by_string(): 22 | """ 23 | Decorator to apply a lock based on a string key. 24 | The lock key is taken from the decorated function's scraper_status argument (scraper_status.chain.value).
25 | """ 26 | 27 | def decorator(func): 28 | @wraps(func) 29 | def wrapper(scraper_status, *args, **kwargs): 30 | # Get the key for which to acquire the lock (based on the arguments) 31 | lock_key = scraper_status.chain.value 32 | lock = lock_manager.get_lock(lock_key) 33 | 34 | with lock: 35 | return func(scraper_status, *args, **kwargs) 36 | 37 | return wrapper 38 | 39 | return decorator 40 | -------------------------------------------------------------------------------- /il_supermarket_scarper/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .gzip_utils import extract_xml_file_from_gz_file 2 | from .logger import Logger 3 | from .status import ( 4 | get_output_folder, 5 | clean_dump_folder, 6 | summerize_dump_folder_contant, 7 | _is_saturday_in_israel, 8 | _is_holiday_in_israel, 9 | _is_weekend_in_israel, 10 | _now, 11 | datetime_in_tlv, 12 | _testing_now, 13 | hour_files_expected_to_be_accassible, 14 | ) 15 | from .scraper_status import ScraperStatus 16 | from .file_types import FileTypesFilters 17 | from .connection import ( 18 | download_connection_retry, 19 | url_connection_retry, 20 | disable_when_outside_israel, 21 | session_with_cookies, 22 | url_retrieve, 23 | collect_from_ftp, 24 | fetch_temporary_gz_file_from_ftp, 25 | wget_file, 26 | ) 27 | from .loop import execute_in_parallel, multiple_page_aggregtion 28 | from .exceptions import RestartSessionError 29 | from .retry import retry_files 30 | from .validation import is_valid_chain_name, change_xml_encoding 31 | from .folders_name import DumpFolderNames 32 | from .lock_utils import LockManager, lock_by_string 33 | from .status import convert_unit, UnitSize, convert_nl_size_to_bytes, string_to_float 34 | -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | # This workflow uses actions that are not certified by GitHub. 5 | # They are provided by a third-party and are governed by 6 | # separate terms of service, privacy policy, and support 7 | # documentation. 
8 | 9 | name: Upload Python Package 10 | 11 | on: 12 | release: 13 | types: [published] 14 | 15 | permissions: 16 | contents: read 17 | 18 | jobs: 19 | deploy: 20 | 21 | runs-on: ubuntu-latest 22 | 23 | steps: 24 | - uses: actions/checkout@v3 25 | - name: Set up Python 26 | uses: actions/setup-python@v3 27 | with: 28 | python-version: '3.x' 29 | - name: Install dependencies 30 | run: | 31 | python -m pip install --upgrade pip 32 | pip install build 33 | - name: Build package 34 | run: python -m build 35 | - name: Publish package 36 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 37 | with: 38 | user: __token__ 39 | password: ${{ secrets.PYPI }} 40 | -------------------------------------------------------------------------------- /tests/test_integration.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | from il_supermarket_scarper.utils.status import ( 3 | get_status, 4 | get_status_date, 5 | ) 6 | from il_supermarket_scarper.scrappers_factory import ScraperFactory 7 | from il_supermarket_scarper.utils import disable_when_outside_israel, DumpFolderNames 8 | 9 | 10 | def test_scrapers_folders_match(): 11 | """test that every scraper name has a matching dump folder name""" 12 | scrapers_keys = ScraperFactory.all_scrapers_name() 13 | dump_keys = DumpFolderNames.all_folders_names() 14 | 15 | assert set(scrapers_keys) & set(dump_keys) == set(scrapers_keys) 16 | assert set(scrapers_keys) - set(dump_keys) == set() 17 | 18 | 19 | @disable_when_outside_israel 20 | def test_scrapers_are_updated(): 21 | """test the number of scrapers are the same as listed at the gov.il site""" 22 | num_of_scarper_listed = len(ScraperFactory.all_listed_scrappers()) 23 | num_of_scarper_on_gov_site = get_status() 24 | 25 | assert num_of_scarper_listed == num_of_scarper_on_gov_site 26 | 27 | 28 | @disable_when_outside_israel 29 | def test_update_date(): 30 | """test the date the gov.il site was last updated""" 31 | date = get_status_date() 32 | assert date.date() == datetime.datetime(2025, 7, 1).date(), "gov il site changed" 33 | -------------------------------------------------------------------------------- /.github/workflows/user-validation.yml: -------------------------------------------------------------------------------- 1 | name: Reject PR with IgnoreList 2 | on: 3 | pull_request: 4 | types: [opened, edited, synchronize] 5 | 6 | jobs: 7 | check_username: 8 | runs-on: ubuntu-latest 9 | env: 10 | IGNORE_USERS: ${{ secrets.IGNORE_USERS }} 11 | 12 | steps: 13 | - name: Checkout code 14 | uses: actions/checkout@v3 15 | 16 | - name: Fetch all branches 17 | run: git fetch --all 18 | 19 | - name: Check for restricted authors in commits 20 | id: check_commit_authors 21 | run: | 22 | # Convert IGNORE_USERS to an array 23 | IFS=',' read -ra IGNORED_USERS <<< "$IGNORE_USERS" 24 | 25 | # Get the commit authors in the pull request 26 | COMMIT_AUTHORS=$(git log --pretty=format:"%an" origin/main..HEAD) 27 | 28 | # Check if any commit author matches an ignored user 29 | for AUTHOR in "${IGNORED_USERS[@]}"; do 30 | if echo "$COMMIT_AUTHORS" | grep -iq "^$AUTHOR$"; then 31 | echo "Restricted author '$AUTHOR' found in commits." 32 | exit 1 33 | fi 34 | done 35 | 36 | - name: PR Rejected 37 | if: failure() 38 | run: | 39 | echo "This PR contains commits by restricted authors."
40 | exit 1 41 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/__init__.py: -------------------------------------------------------------------------------- 1 | from .bareket import Bareket 2 | from .bitan import YaynotBitanAndCarrefour 3 | from .cofix import Cofix 4 | from .city_market import ( 5 | CityMarketGivatayim, 6 | CityMarketKirtatOno, 7 | CityMarketKiryatGat, 8 | CityMarketShops, 9 | ) 10 | from .doralon import DorAlon 11 | from .good_pharm import GoodPharm 12 | from .hazihinam import HaziHinam 13 | from .het_cohen import HetCohen 14 | from .keshet import Keshet 15 | from .king_store import KingStore 16 | from .maayan2000 import Maayan2000 17 | from .machsani_ashuk import MahsaniAShuk 18 | from .mega import Mega 19 | from .meshnat_yosef import MeshnatYosef1, MeshnatYosef2 20 | from .nativ_hashed import NetivHased 21 | from .osherad import Osherad 22 | from .polizer import Polizer 23 | from .ramilevy import RamiLevy 24 | from .salachdabach import SalachDabach 25 | from .shefa_barcart_ashem import ShefaBarcartAshem 26 | from .shufersal import Shufersal 27 | from .shuk_ahir import ShukAhir 28 | from .stop_market import StopMarket 29 | from .super_pharm import SuperPharm 30 | from .super_yuda import SuperYuda 31 | from .super_sapir import SuperSapir 32 | from .superdosh import FreshMarketAndSuperDosh 33 | from .quik import Quik 34 | from .tivtaam import TivTaam 35 | from .victory import Victory 36 | from .yellow import Yellow 37 | from .yohananof import Yohananof 38 | from .zolvebegadol import ZolVeBegadol 39 | from .wolt import Wolt 40 | -------------------------------------------------------------------------------- /tests/test_main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import tempfile 4 | 5 | from il_supermarket_scarper.main import ScarpingTask 6 | from il_supermarket_scarper.scrappers_factory import ScraperFactory 7 | 8 | 9 | def test_main_with_limit(): 10 | """test the main running with limit of 1 for each chain""" 11 | with tempfile.TemporaryDirectory() as tmpdirname: 12 | expected = ScraperFactory.all_scrapers_name() + ["status"] 13 | scrapper_done = ScarpingTask(limit=1, dump_folder_name=tmpdirname).start() 14 | 15 | folders_from_scraper = list(map(lambda x: x.split("/")[-1], scrapper_done)) + [ 16 | "status" 17 | ] 18 | time.sleep(5) 19 | folders_in_dump_folder = os.listdir(tmpdirname) 20 | folders_in_dump_folder = [ 21 | name for name in folders_in_dump_folder if not name.startswith(".") 22 | ] 23 | assert len(folders_in_dump_folder) == len(expected) 24 | assert sorted(folders_from_scraper) == sorted(folders_in_dump_folder) 25 | 26 | 27 | def test_main_with_one_scarper(): 28 | """the limit only for enabled scarpers""" 29 | scrapper_done = ScarpingTask( 30 | limit=1, enabled_scrapers=ScraperFactory.sample(n=1) 31 | ).start() 32 | assert len(scrapper_done) == 1 33 | 34 | 35 | def test_main_with_size_estimation_mode(): 36 | """test size estmation mode""" 37 | scrapper_done = ScarpingTask( 38 | limit=1, size_estimation_mode=True, enabled_scrapers=ScraperFactory.sample(n=1) 39 | ).start() 40 | assert len(scrapper_done) == 1 41 | -------------------------------------------------------------------------------- /.github/workflows/docker-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow uses actions that are not certified by GitHub. 
2 | # They are provided by a third-party and are governed by 3 | # separate terms of service, privacy policy, and support 4 | # documentation. 5 | 6 | # GitHub recommends pinning actions to a commit SHA. 7 | # To get a newer version, you will need to update the SHA. 8 | # You can also reference a tag or branch, but the action may change without warning. 9 | 10 | name: Publish Docker image 11 | 12 | on: 13 | release: 14 | types: [published] 15 | 16 | jobs: 17 | push_to_registry: 18 | name: Push Docker image to Docker Hub 19 | runs-on: ubuntu-latest 20 | steps: 21 | - name: Check out the repo 22 | uses: actions/checkout@v3 23 | 24 | - name: Log in to Docker Hub 25 | uses: docker/login-action@f054a8b539a109f9f41c372932f1ae047eff08c9 26 | with: 27 | username: ${{ secrets.DOCKER_USERNAME }} 28 | password: ${{ secrets.DOCKER_PASSWORD }} 29 | 30 | - name: Extract metadata (tags, labels) for Docker 31 | id: meta 32 | uses: docker/metadata-action@98669ae865ea3cffbcbaa878cf57c20bbf1c6c38 33 | with: 34 | images: erlichsefi/israeli-supermarket-scarpers 35 | 36 | - name: Build and push Docker image 37 | uses: docker/build-push-action@ad44023a93711e3deb337508980b4b5e9bcdc5dc 38 | with: 39 | context: . 40 | target: prod 41 | push: true 42 | tags: ${{ steps.meta.outputs.tags }} 43 | labels: ${{ steps.meta.outputs.labels }} -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | #syntax=docker/dockerfile:1 2 | 3 | FROM node:20.19.5-bookworm-slim as base 4 | ARG PY_VERSION="3.11.0" 5 | 6 | # setting the enviroment 7 | RUN apt-get update --fix-missing -y && \ 8 | apt-get install cron -y && \ 9 | apt-get install libxml2-dev -y && \ 10 | apt-get install libxslt-dev -y 11 | 12 | 13 | # setting python and more 14 | RUN apt-get install python3-pip -y && \ 15 | apt-get install dieharder -y && \ 16 | apt-get install wget -y && \ 17 | apt-get clean && \ 18 | apt-get autoremove 19 | 20 | # setup python 21 | ENV HOME="/root" 22 | WORKDIR ${HOME} 23 | RUN apt-get install -y git libbz2-dev libncurses-dev libreadline-dev libffi-dev libssl-dev 24 | RUN git clone --depth=1 https://github.com/pyenv/pyenv.git .pyenv 25 | ENV PYENV_ROOT="${HOME}/.pyenv" 26 | ENV PATH="${PYENV_ROOT}/shims:${PYENV_ROOT}/bin:${PATH}" 27 | 28 | RUN pyenv install $PY_VERSION 29 | RUN pyenv global $PY_VERSION 30 | 31 | # setup code 32 | WORKDIR /usr/src/app 33 | COPY . . 34 | RUN python -m pip install . 35 | 36 | 37 | VOLUME ["/usr/src/app/dumps"] 38 | 39 | # development container 40 | FROM base as dev 41 | RUN apt-get -y install git 42 | RUN pip install black 43 | RUN pip install pylint 44 | 45 | 46 | # production image 47 | FROM base as prod 48 | 49 | # ADD crontab /etc/cron.d 50 | # RUN chmod 0644 /etc/cron.d/crontab 51 | # RUN crontab /etc/cron.d/crontab 52 | # RUN touch /var/log/cron.log 53 | # && cron & tail -f /var/log/cron.log 54 | CMD python main.py 55 | 56 | # run test 57 | FROM base as test 58 | 59 | # playwrite 60 | RUN npx -y playwright@1.53.0 install --with-deps 61 | RUN python -m playwright install 62 | 63 | RUN python -m pip install . 
".[test]" 64 | CMD python -m pytest -vv -n 2 65 | 66 | -------------------------------------------------------------------------------- /.github/workflows/test-suite.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Unit & Integration Tests 5 | # env: 6 | # DISABLED_SCRAPPERS: BAREKET 7 | 8 | on: 9 | push: 10 | branches: [ "main" ] 11 | pull_request: 12 | branches: [ "main" ] 13 | schedule: 14 | # * is a special character in YAML so you have to quote this string 15 | - cron: '00 17 * * *' 16 | 17 | concurrency: 18 | group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} 19 | cancel-in-progress: true 20 | 21 | 22 | jobs: 23 | build: 24 | 25 | runs-on: self-hosted 26 | strategy: 27 | fail-fast: false 28 | matrix: 29 | python-version: ["3.11.0"] 30 | 31 | steps: 32 | 33 | - name: Checkout 34 | uses: actions/checkout@v3 35 | - name: Free disk space 36 | run: | 37 | df --human-readable 38 | docker 2>/dev/null 1>&2 rmi $(docker image ls --all --quiet) || true 39 | rm --recursive --force "$AGENT_TOOLSDIRECTORY" 40 | df --human-readable 41 | - name: Build with Docker 42 | run: docker build -t erlichsefi/israeli-supermarket-scarpers:test --target test . 43 | - name: Remove all build 44 | run: (docker stop scraper-test-run 2>/dev/null || true) && (docker rm scraper-test-run 2>/dev/null || true) 45 | - name: Test with pytest 46 | run: docker run --rm --name scraper-test-run -e DISABLED_SCRAPPERS="${{ env.DISABLED_SCRAPPERS }}" erlichsefi/israeli-supermarket-scarpers:test && 47 | docker builder prune -f 48 | -------------------------------------------------------------------------------- /il_supermarket_scarper/main.py: -------------------------------------------------------------------------------- 1 | from .scrapper_runner import MainScrapperRunner 2 | from .utils.file_types import FileTypesFilters 3 | 4 | 5 | class ScarpingTask: # pylint: disable=too-many-instance-attributes 6 | """scraping task encapsulated""" 7 | 8 | def __init__( 9 | self, 10 | size_estimation_mode=False, 11 | enabled_scrapers=None, 12 | limit=None, 13 | when_date=None, 14 | files_types=FileTypesFilters.all_types(), 15 | dump_folder_name=None, 16 | lookup_in_db=True, 17 | multiprocessing=5, 18 | suppress_exception=False, 19 | min_size=None, 20 | max_size=None, 21 | ): 22 | """define the runner""" 23 | self.runner = MainScrapperRunner( 24 | size_estimation_mode=size_estimation_mode, 25 | enabled_scrapers=enabled_scrapers, 26 | dump_folder_name=dump_folder_name, 27 | lookup_in_db=lookup_in_db, 28 | multiprocessing=multiprocessing, 29 | ) 30 | self.dump_folder_name = dump_folder_name 31 | self.limit = limit 32 | self.files_types = files_types 33 | self.when_date = when_date 34 | self.suppress_exception = suppress_exception 35 | self.min_size = min_size 36 | self.max_size = max_size 37 | 38 | def get_dump_folder_name(self): 39 | """get the dump folder name""" 40 | return self.dump_folder_name 41 | 42 | def start(self): 43 | """run the scraping""" 44 | return self.runner.run( 45 | limit=self.limit, 46 | files_types=self.files_types, 47 | when_date=self.when_date, 48 | suppress_exception=self.suppress_exception, 49 | min_size=self.min_size, 50 | max_size=self.max_size, 51 | ) 52 | 
-------------------------------------------------------------------------------- /il_supermarket_scarper/utils/folders_name.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | 4 | class DumpFolderNames(Enum): 5 | """all the folder that files will be download to""" 6 | 7 | BAREKET = "Bareket" 8 | # YAYNO_BITAN = "YaynotBitan" 9 | YAYNO_BITAN_AND_CARREFOUR = "YaynotBitanAndCarrefour" 10 | COFIX = "Cofix" 11 | CITY_MARKET_GIVATAYIM = "CityMarketGivatayim" 12 | CITY_MARKET_KIRYATONO = "CityMarketKiryatOno" 13 | CITY_MARKET_KIRYATGAT = "CityMarketKiryatGat" 14 | CITY_MARKET_SHOPS = "CityMarketShops" 15 | DOR_ALON = "DorAlon" 16 | GOOD_PHARM = "GoodPharm" 17 | HAZI_HINAM = "HaziHinam" 18 | HET_COHEN = "HetCohen" 19 | KESHET = "Keshet" 20 | KING_STORE = "KingStore" 21 | MAAYAN_2000 = "Maayan2000" 22 | MAHSANI_ASHUK = "MahsaniAShuk" 23 | MEGA = "Mega" 24 | NETIV_HASED = "NetivHased" 25 | MESHMAT_YOSEF_1 = "MeshnatYosef1" 26 | MESHMAT_YOSEF_2 = "MeshnatYosef2" 27 | OSHER_AD = "Osherad" 28 | POLIZER = "Polizer" 29 | RAMI_LEVY = "RamiLevy" 30 | SALACH_DABACH = "SalachDabach" 31 | SHEFA_BARCART_ASHEM = "ShefaBarcartAshem" 32 | SHUFERSAL = "Shufersal" 33 | SHUK_AHIR = "ShukAhir" 34 | STOP_MARKET = "StopMarket" 35 | SUPER_PHARM = "SuperPharm" 36 | SUPER_YUDA = "SuperYuda" 37 | SUPER_SAPIR = "SuperSapir" 38 | FRESH_MARKET_AND_SUPER_DOSH = "FreshMarketAndSuperDosh" 39 | QUIK = "Quik" 40 | TIV_TAAM = "TivTaam" 41 | VICTORY = "Victory" 42 | YELLOW = "Yellow" 43 | YOHANANOF = "Yohananof" 44 | ZOL_VEBEGADOL = "ZolVeBegadol" 45 | WOLT = "Wolt" 46 | 47 | @classmethod 48 | def is_valid_folder_name(cls, member): 49 | """check if an folder is part of the cls""" 50 | return isinstance(member, DumpFolderNames) 51 | 52 | @classmethod 53 | def all_folders_names(cls): 54 | """get the name of all listed folders""" 55 | return [e.name for e in cls] 56 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/shufersal.py: -------------------------------------------------------------------------------- 1 | import urllib.parse 2 | 3 | from il_supermarket_scarper.engines import MultiPageWeb 4 | from il_supermarket_scarper.utils import DumpFolderNames, FileTypesFilters 5 | 6 | 7 | class Shufersal(MultiPageWeb): 8 | """scaper for shufersal""" 9 | 10 | utilize_date_param = False 11 | 12 | def __init__(self, folder_name=None): 13 | super().__init__( 14 | url="https://prices.shufersal.co.il/", 15 | total_page_xpath="""//*[@id="gridContainer"]/table/tfoot/tr/td/a[6]/@href""", 16 | total_pages_pattern=r"[?&]page=([0-9]+)", 17 | chain=DumpFolderNames.SHUFERSAL, 18 | chain_id="7290027600007", 19 | folder_name=folder_name, 20 | page_argument="&page", 21 | ) 22 | 23 | def get_file_types_id(self, files_types=None): 24 | """get the file type id""" 25 | if files_types is None: 26 | return ["0"] 27 | 28 | types = [] 29 | for ftype in files_types: 30 | if ftype == FileTypesFilters.STORE_FILE.name: 31 | types.append("5") 32 | if ftype == FileTypesFilters.PRICE_FILE.name: 33 | types.append("1") 34 | if ftype == FileTypesFilters.PROMO_FILE.name: 35 | types.append("3") 36 | if ftype == FileTypesFilters.PRICE_FULL_FILE.name: 37 | types.append("2") 38 | if ftype == FileTypesFilters.PROMO_FULL_FILE.name: 39 | types.append("4") 40 | return types 41 | 42 | def build_params(self, files_types=None, store_id=None, when_date=None): 43 | """build the params for the request""" 44 | params = {"catID": 
",".join(self.get_file_types_id(files_types))} 45 | 46 | if store_id: 47 | params["storeId"] = store_id 48 | return [f"/FileObject/UpdateCategory?{urllib.parse.urlencode(params)}"] 49 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from io import open 2 | from setuptools import setup 3 | 4 | with open("README.md", encoding="utf-8") as f: 5 | long_description = "\n" + f.read() 6 | 7 | with open("requirements.txt", encoding="utf-8") as f: 8 | required = f.read().splitlines() 9 | 10 | with open("requirements-dev.txt", encoding="utf-8") as f: 11 | dev_required = f.read().splitlines() 12 | 13 | setup( 14 | # Needed to silence warnings (and to be a worthwhile package) 15 | name="il-supermarket-scraper", 16 | url="https://github.com/OpenIsraeliSupermarkets/israeli-supermarket-scarpers", 17 | author="Sefi Erlich", 18 | author_email="erlichsefi@gmail.com", 19 | # Needed to actually package something 20 | packages=[ 21 | "il_supermarket_scarper", 22 | "il_supermarket_scarper.engines", 23 | "il_supermarket_scarper.scrappers", 24 | "il_supermarket_scarper.utils", 25 | "il_supermarket_scarper.utils.databases", 26 | ], 27 | # Needed for dependencies 28 | install_requires=required, 29 | tests_require=dev_required, 30 | extras_require={"test": ["pytest", "pytest-xdist"]}, 31 | # *strongly* suggested for sharing 32 | version="0.6.3", 33 | # The license can be anything you like 34 | license="MIT", 35 | description="python package that implement a scraping for israeli supermarket data", 36 | # We will also need a readme eventually (there will be a warning) 37 | long_description=long_description, 38 | long_description_content_type="text/markdown", 39 | keywords=["israel", "israeli", "scraper", "supermarket"], 40 | classifiers=[ 41 | "Development Status :: 3 - Alpha", 42 | "Intended Audience :: Developers", 43 | "Topic :: Software Development :: Build Tools", 44 | "License :: OSI Approved :: MIT License", 45 | "Programming Language :: Python :: 3", 46 | "Programming Language :: Python :: 3.4", 47 | "Programming Language :: Python :: 3.5", 48 | "Programming Language :: Python :: 3.6", 49 | ], 50 | ) 51 | -------------------------------------------------------------------------------- /il_supermarket_scarper/utils/databases/mongo.py: -------------------------------------------------------------------------------- 1 | import os 2 | from ..logger import Logger 3 | from .base import AbstractDataBase 4 | 5 | 6 | PYMONGO_INSTALLED = True 7 | try: 8 | import pymongo 9 | from pymongo.errors import ServerSelectionTimeoutError 10 | except ImportError: 11 | PYMONGO_INSTALLED = False 12 | 13 | 14 | class MongoDataBase(AbstractDataBase): 15 | """A class that represents a MongoDB database.""" 16 | 17 | def __init__(self, database_name) -> None: 18 | super().__init__(database_name) 19 | self.myclient = None 20 | self.store_db = None 21 | 22 | def create_connection(self): 23 | """Create a connection to the MongoDB database.""" 24 | if PYMONGO_INSTALLED: 25 | url = os.environ.get("MONGO_URL", "localhost") 26 | port = os.environ.get("MONGO_PORT", "27017") 27 | self.myclient = pymongo.MongoClient(f"mongodb://{url}:{port}/") 28 | self.store_db = self.myclient[self.database_name] 29 | 30 | def enable_collection_status(self): 31 | """Enable data collection to MongoDB.""" 32 | if PYMONGO_INSTALLED: 33 | self.set_collection_status(True) 34 | self.create_connection() 35 | else: 36 | Logger.info("Can't 
enable collection. Please install pymongo.") 37 | 38 | def insert_document(self, collection_name, document): 39 | """Insert a document into a MongoDB collection.""" 40 | if self.is_collection_enabled(): 41 | try: 42 | self.store_db[collection_name].insert_one(document) 43 | except ServerSelectionTimeoutError: 44 | self.set_collection_status(False) 45 | Logger.error( 46 | "Failed to connect to MongoDB. Collection status disabled." 47 | ) 48 | 49 | def find_document(self, collection_name, query): 50 | """Find a document in a MongoDB collection.""" 51 | if self.is_collection_enabled(): 52 | return self.store_db[collection_name].find_one(query) 53 | return None 54 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/meshnat_yosef.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from il_supermarket_scarper.engines.web import WebBase 4 | from il_supermarket_scarper.engines import Bina 5 | 6 | from il_supermarket_scarper.utils import DumpFolderNames, Logger 7 | 8 | 9 | class MeshnatYosef1(WebBase): 10 | """scraper for Meshnat Yosef""" 11 | 12 | def __init__(self, folder_name=None): 13 | super().__init__( 14 | DumpFolderNames.MESHMAT_YOSEF_1, 15 | chain_id="5144744100002", 16 | url="https://list-files.w5871031-kt.workers.dev/", 17 | folder_name=folder_name, 18 | ) 19 | 20 | def get_data_from_page(self, req_res): 21 | """get the file list from a page""" 22 | response = json.loads(req_res.text) 23 | return response 24 | 25 | def get_file_size_from_entry(self, entry): 26 | """ 27 | Extract file size from a JSON entry. 28 | Returns size in bytes, or None if not found. 29 | """ 30 | # Meshnat Yosef doesn't support file size in the entry 31 | return None 32 | 33 | def extract_task_from_entry(self, all_trs): 34 | """extract download links, file names, and file sizes from page list""" 35 | download_urls = [] 36 | file_names = [] 37 | file_sizes = [] 38 | for x in all_trs: 39 | try: 40 | download_urls.append(x["url"]) 41 | file_names.append(x["name"]) 42 | file_sizes.append(self.get_file_size_from_entry(x)) 43 | except (AttributeError, KeyError, IndexError, TypeError) as e: 44 | Logger.warning(f"Error extracting task from entry: {e}") 45 | 46 | return download_urls, file_names, file_sizes 47 | 48 | 49 | class MeshnatYosef2(Bina): 50 | """scraper for Meshnat Yosef""" 51 | 52 | def __init__(self, folder_name=None): 53 | super().__init__( 54 | DumpFolderNames.MESHMAT_YOSEF_2, 55 | chain_id=["5144744100001", "7290058289400"], 56 | url_perfix="ktshivuk", 57 | folder_name=folder_name, 58 | ) 59 | -------------------------------------------------------------------------------- /il_supermarket_scarper/utils/loop.py: -------------------------------------------------------------------------------- 1 | import concurrent.futures 2 | from .logger import Logger 3 | 4 | 5 | def defualt_aggregtion_function(all_done): 6 | """format the scraping result to the final output""" 7 | result = [] 8 | for response in all_done: 9 | _response = response 10 | if hasattr(_response, "result"): 11 | _response = _response.result() 12 | result.append(_response) 13 | return result 14 | 15 | 16 | def multiple_page_aggregtion(pages_to_scrape): 17 | """format the scraping result to the final output for multipage""" 18 | download_urls = [] 19 | file_names = [] 20 | file_sizes = [] 21 | for result in pages_to_scrape: 22 | if hasattr(result, "result"): 23 | page_result = result.result() 24 | else: 25 | page_result = result 26 | 
page_download_urls, page_file_names, page_file_sizes = page_result 27 | file_sizes.extend(page_file_sizes) 28 | download_urls.extend(page_download_urls) 29 | file_names.extend(page_file_names) 30 | return download_urls, file_names, file_sizes 31 | 32 | 33 | def execute_in_parallel( 34 | function_to_execute, 35 | iterable, 36 | max_threads=None, 37 | aggregtion_function=defualt_aggregtion_function, 38 | ): 39 | """execute a job in the event loop""" 40 | 41 | Logger.info(f"Running {len(iterable)} tasks in parallel") 42 | results = run_tasks( 43 | function_to_execute, 44 | iterable, 45 | max_threads=max_threads, 46 | ) 47 | 48 | all_done = aggregtion_function(results) 49 | Logger.info(f"Done with {len(all_done)} tasks in parallel") 50 | return all_done 51 | 52 | 53 | def run_tasks( 54 | function_to_execute, 55 | iterable, 56 | max_threads: int = None, 57 | ): 58 | """Run tasks in multi-thread or sequentially""" 59 | if max_threads: 60 | # Use multi-thread 61 | with concurrent.futures.ThreadPoolExecutor( 62 | max_workers=max_threads, thread_name_prefix="PullingThread" 63 | ) as executor: 64 | futures = [executor.submit(function_to_execute, arg) for arg in iterable] 65 | return [ 66 | future.result() for future in concurrent.futures.as_completed(futures) 67 | ] 68 | else: 69 | # Or just iterate over all 70 | return [function_to_execute(arg) for arg in iterable] 71 | -------------------------------------------------------------------------------- /il_supermarket_scarper/engines/apsx.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from il_supermarket_scarper.utils import Logger 3 | 4 | from .web import WebBase 5 | 6 | 7 | class Aspx(WebBase, ABC): 8 | """class for aspx scapers""" 9 | 10 | def __init__( 11 | self, chain, chain_id, url, aspx_page, folder_name=None, max_threads=5 12 | ): 13 | super().__init__( 14 | chain, chain_id, url, folder_name=folder_name, max_threads=max_threads 15 | ) 16 | self.aspx_page = aspx_page 17 | 18 | def extract_task_from_entry(self, all_trs): 19 | """from the trs extract the download urls, file names, and file sizes""" 20 | 21 | download_urls = [] 22 | file_names = [] 23 | file_sizes = [] 24 | for x in all_trs: 25 | try: 26 | download_url = self.url + self.get_href_from_entry(x) 27 | download_urls.append(download_url) 28 | file_names.append(self.get_file_name_no_ext_from_entry(download_url)) 29 | file_sizes.append(self.get_file_size_from_entry(x)) 30 | except (AttributeError, KeyError, IndexError, TypeError) as e: 31 | Logger.warning(f"Error extracting task from entry: {e}") 32 | return download_urls, file_names, file_sizes 33 | 34 | @abstractmethod 35 | def _get_all_possible_query_string_params( 36 | self, files_types=None, store_id=None, when_date=None 37 | ): 38 | """list all param to add to the url""" 39 | 40 | @abstractmethod 41 | def _build_query_url(self, query_params, base_urls): 42 | """build the url with the query params""" 43 | 44 | def get_request_url(self, files_types=None, store_id=None, when_date=None): 45 | """build the request given the base url and the query params""" 46 | result = [] 47 | for query_params in self._get_all_possible_query_string_params( 48 | files_types=files_types, store_id=store_id, when_date=when_date 49 | ): 50 | result.extend(self._build_query_url(query_params, [self.url])) 51 | Logger.debug(f"Request url: {result}") 52 | return result 53 | 54 | @abstractmethod 55 | def get_href_from_entry(self, entry): 56 | """get download link for entry (tr)""" 
57 | 58 | @abstractmethod 59 | def get_file_name_no_ext_from_entry(self, entry): 60 | """get the file name without extensions from entey (tr)""" 61 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import os 3 | from il_supermarket_scarper import ScarpingTask, ScraperFactory, FileTypesFilters 4 | 5 | 6 | def load_params(): 7 | """load params from env variables with validation""" 8 | kwargs = {"suppress_exception": True, "lookup_in_db": True} 9 | 10 | # validate scrapers 11 | enabled_scrapers = os.getenv("ENABLED_SCRAPERS", None) 12 | if enabled_scrapers: 13 | enabled_scrapers = enabled_scrapers.split(",") 14 | 15 | not_valid = list( 16 | filter( 17 | lambda scraper: scraper not in ScraperFactory.all_scrapers_name(), 18 | enabled_scrapers, 19 | ) 20 | ) 21 | if not_valid: 22 | raise ValueError(f"ENABLED_SCRAPERS contains invalid {not_valid}") 23 | 24 | kwargs["enabled_scrapers"] = enabled_scrapers 25 | 26 | # validate file types 27 | enabled_file_types = os.getenv("ENABLED_FILE_TYPES", None) 28 | if enabled_file_types: 29 | 30 | enabled_file_types = enabled_file_types.split(",") 31 | 32 | not_valid = list( 33 | filter( 34 | lambda f_types: f_types not in FileTypesFilters.all_types(), 35 | enabled_file_types, 36 | ) 37 | ) 38 | if not_valid: 39 | raise ValueError(f"ENABLED_FILE_TYPES contains invalid {not_valid}") 40 | 41 | kwargs["files_types"] = enabled_file_types 42 | 43 | # validate number of processes 44 | number_of_processes = os.getenv("NUMBER_OF_PROCESSES", None) 45 | if number_of_processes: 46 | try: 47 | kwargs["multiprocessing"] = int(number_of_processes) 48 | except ValueError: 49 | raise ValueError("NUMBER_OF_PROCESSES must be an integer") 50 | 51 | # validate limit 52 | limit = os.getenv("LIMIT", None) 53 | if limit: 54 | try: 55 | kwargs["limit"] = int(limit) 56 | except ValueError: 57 | raise ValueError(f"LIMIT must be an integer, but got {limit}") 58 | 59 | # validate today 60 | today = os.getenv("TODAY", None) 61 | if today: 62 | try: 63 | kwargs["when_date"] = datetime.datetime.strptime(today, "%Y-%m-%d %H:%M") 64 | except ValueError: 65 | raise ValueError("TODAY must be in the format 'YYYY-MM-DD HH:MM'") 66 | 67 | return kwargs 68 | 69 | 70 | if __name__ == "__main__": 71 | 72 | args = load_params() 73 | 74 | ScarpingTask(**args).start() 75 | -------------------------------------------------------------------------------- /il_supermarket_scarper/utils/gzip_utils.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import shutil 3 | import os 4 | import io 5 | import zipfile 6 | from .exceptions import RestartSessionError 7 | 8 | 9 | def extract_xml_file_from_gz_file(file_save_path): 10 | """extract xml from gz""" 11 | target_file_name = os.path.splitext(file_save_path)[0] + ".xml" 12 | try: 13 | with gzip.open(file_save_path, "rb") as infile: 14 | with open(target_file_name, "wb") as outfile: 15 | shutil.copyfileobj(infile, outfile) 16 | except (gzip.BadGzipFile, EOFError) as exception: 17 | try: 18 | with open(file_save_path, "rb") as response_content: 19 | with zipfile.ZipFile(io.BytesIO(response_content.read())) as the_zip: 20 | zip_info = the_zip.infolist()[0] 21 | with the_zip.open(zip_info) as the_file: 22 | with open(target_file_name, "wb") as f_out: 23 | f_out.write(the_file.read()) 24 | 25 | except ( # pylint: disable=broad-except,redefined-outer-name 26 | Exception 
27 | ) as exception: 28 | report_failed_zip(exception, file_save_path, target_file_name) 29 | 30 | except Exception as exception: # pylint: disable=broad-except 31 | report_failed_zip(exception, file_save_path, target_file_name) 32 | 33 | 34 | def report_failed_zip(exception, file_save_path, target_file_name): 35 | """report a file wasn't able to extracted""" 36 | 37 | try: 38 | file_size = os.path.getsize(file_save_path) 39 | 40 | file_contant = "" 41 | with open(file_save_path, "r", encoding="utf-8") as file: 42 | file_contant = file.readlines() 43 | 44 | if "link expired" in str(file_contant): 45 | raise RestartSessionError() 46 | 47 | raise ValueError( 48 | f"Error decoding file:{file_save_path} with " 49 | f"error: {str(exception)} file size {str(file_size)} ," 50 | f"trimed_file_contant {str(file_contant)[:100]}" 51 | ) 52 | except UnicodeDecodeError: 53 | raise ValueError( 54 | f"Error decoding file:{file_save_path} with " 55 | f"error: {str(exception)} file size {str(file_size)} ," 56 | f"can't decode file" 57 | ) 58 | finally: 59 | os.remove(file_save_path) 60 | # remove the corrupted file 61 | if os.path.exists(target_file_name): 62 | os.remove(target_file_name) 63 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/wolt.py: -------------------------------------------------------------------------------- 1 | from datetime import timedelta 2 | from bs4 import BeautifulSoup 3 | 4 | from il_supermarket_scarper.utils import _now, Logger 5 | from il_supermarket_scarper.engines.web import WebBase 6 | 7 | from il_supermarket_scarper.utils import DumpFolderNames 8 | 9 | 10 | class Wolt(WebBase): 11 | """scraper for wolt""" 12 | 13 | def __init__(self, folder_name=None): 14 | super().__init__( 15 | DumpFolderNames.WOLT, 16 | chain_id="7290058249350", 17 | url="https://wm-gateway.wolt.com/isr-prices/public/v1/index.html", 18 | folder_name=folder_name, 19 | ) 20 | 21 | def get_request_url( 22 | self, files_types=None, store_id=None, when_date=None 23 | ): # pylint: disable=unused-argument 24 | """get all links to collect download links from""" 25 | if when_date: 26 | formatted_date = when_date.strftime("%Y-%m-%d") 27 | return [ 28 | { 29 | "url": self.url.replace("index.html", f"{formatted_date}.html"), 30 | "method": "GET", 31 | } 32 | ] 33 | 34 | perspective = _now() 35 | all_pages_to_collect_from = [] 36 | for days_back in range(10): 37 | formatted_date = (perspective - timedelta(days=days_back)).strftime( 38 | "%Y-%m-%d" 39 | ) 40 | all_pages_to_collect_from.append( 41 | { 42 | "url": self.url.replace("index.html", f"{formatted_date}.html"), 43 | "method": "GET", 44 | } 45 | ) 46 | return all_pages_to_collect_from 47 | 48 | def get_data_from_page(self, req_res): 49 | """get the file list from a page""" 50 | soup = BeautifulSoup(req_res.text, features="lxml") 51 | return list( 52 | map( 53 | lambda x: (x.text, self.url.replace("index.html", x.a.attrs["href"])), 54 | list(soup.find_all("li")), 55 | ) 56 | ) 57 | 58 | def extract_task_from_entry(self, all_trs): 59 | """extract download links, file names, and file sizes from page list""" 60 | download_urls = [] 61 | file_names = [] 62 | file_sizes = [] 63 | for x in all_trs: 64 | try: 65 | download_urls.append(x[1]) 66 | file_names.append(x[0]) 67 | file_sizes.append(None) 68 | except (AttributeError, KeyError, IndexError, TypeError) as e: 69 | Logger.warning(f"Error extracting task from entry: {e}") 70 | 71 | return download_urls, file_names, file_sizes 72 | 
-------------------------------------------------------------------------------- /il_supermarket_scarper/utils/logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import sys 3 | 4 | 5 | def build_logger(): 6 | """create the logger instance""" 7 | # Define logger 8 | logger = logging.getLogger("Logger") 9 | 10 | if not logger.handlers: 11 | logger.setLevel(logging.DEBUG) # set logger level 12 | log_formatter = logging.Formatter( 13 | "%(name)-12s %(asctime)s %(levelname)-8s " 14 | "[%(threadName)s] %(filename)s:%(funcName)s %(message)s" 15 | ) 16 | console_handler = logging.StreamHandler( 17 | sys.stdout 18 | ) # set streamhandler to stdout 19 | console_handler.setFormatter(log_formatter) 20 | logger.addHandler(console_handler) 21 | 22 | file_handler = logging.FileHandler("logging.log") 23 | file_handler.setFormatter(log_formatter) 24 | logger.addHandler(file_handler) 25 | 26 | return logger 27 | 28 | 29 | class Logger: 30 | """a static logger class to share with all components""" 31 | 32 | enabled = True 33 | logger = build_logger() 34 | 35 | @classmethod 36 | def change_logging_status(cls, new_status): 37 | """enable or disable status""" 38 | cls.enabled = new_status 39 | 40 | @classmethod 41 | def set_logging_level(cls, level): 42 | """set logging level""" 43 | if level == "DEBUG": 44 | cls.logger.setLevel(logging.DEBUG) 45 | elif level == "INFO": 46 | cls.logger.setLevel(logging.INFO) 47 | elif level == "ERROR": 48 | cls.logger.setLevel(logging.ERROR) 49 | elif level == "WARNING": 50 | cls.logger.setLevel(logging.WARNING) 51 | else: 52 | cls.logger.setLevel(logging.DEBUG) 53 | 54 | @classmethod 55 | def info(cls, msg, *args, **kwargs): 56 | """log info""" 57 | if cls.enabled: 58 | cls.logger.info(msg, *args, **kwargs) 59 | 60 | @classmethod 61 | def debug(cls, msg, *args, **kwargs): 62 | """log debug""" 63 | if cls.enabled: 64 | cls.logger.debug(msg, *args, **kwargs) 65 | 66 | @classmethod 67 | def error(cls, msg, *args, **kwargs): 68 | """log error""" 69 | if cls.enabled: 70 | cls.logger.error(msg, *args, **kwargs) 71 | 72 | @classmethod 73 | def error_execption(cls, _): 74 | """log exception""" 75 | if cls.enabled: 76 | cls.logger.error( 77 | "got an exception:", 78 | exc_info=sys.exc_info(), 79 | ) 80 | 81 | @classmethod 82 | def warning(cls, msg, *args, **kwargs): 83 | """log warning""" 84 | if cls.enabled: 85 | cls.logger.warning(msg, *args, **kwargs) 86 | -------------------------------------------------------------------------------- /stress_test.py: -------------------------------------------------------------------------------- 1 | import time 2 | import json 3 | import datetime 4 | import tempfile 5 | import pstats 6 | import cProfile 7 | import io 8 | from il_supermarket_scarper.scrappers_factory import ScraperFactory 9 | from il_supermarket_scarper.utils import _now 10 | 11 | 12 | def format_stats_as_json(profile, project_name): 13 | """get the stats from the profiler and format them as json""" 14 | stream = io.StringIO() 15 | ps = pstats.Stats(profile, stream=stream) 16 | ps.sort_stats(pstats.SortKey.CUMULATIVE) # Sort by cumulative time 17 | ps.print_stats() 18 | 19 | # Convert the printed stats to a list of lines 20 | stats_output = stream.getvalue().splitlines() 21 | 22 | # Filter the lines to include only functions within the project 23 | project_stats = [] 24 | for line in stats_output: 25 | if project_name in line: # Filter for project-specific lines 26 | 27 | parts = line.split() 28 | if len(parts) >= 5: # 
Basic sanity check for the parts 29 | function_data = { 30 | "function": parts[-1], # Function path 31 | "ncalls": parts[0], # Number of calls 32 | "tottime": parts[1], 33 | "tottime_per_call": parts[2], # Time spent in function 34 | "cumtime": parts[3], # Cumulative time including subcalls 35 | "cumtime_per_call": parts[4], # 36 | } 37 | project_stats.append(function_data) 38 | 39 | return project_stats 40 | 41 | 42 | if __name__ == "__main__": 43 | 44 | result = {} 45 | for scraper_name in ScraperFactory.all_scrapers_name(): 46 | 47 | def full_execution(scraper): 48 | """full execution of the scraper""" 49 | with tempfile.TemporaryDirectory() as tmpdirname: 50 | try: 51 | initer = ScraperFactory.get(scraper)(folder_name=tmpdirname) 52 | return initer.scrape(when_date=_now()), "" 53 | except Exception as e: # pylint: disable=broad-exception-caught 54 | return [], str(e) 55 | 56 | execution_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") 57 | start_time = time.time() 58 | pr = cProfile.Profile() 59 | pr.enable() 60 | 61 | files, error = full_execution(scraper_name) 62 | 63 | pr.disable() 64 | 65 | end_time = time.time() 66 | result[scraper_name] = { 67 | "status": format_stats_as_json(pr, "israeli-supermarket-scarpers"), 68 | "execution_time": execution_time, 69 | "start_time": start_time, 70 | "end_time": end_time, 71 | "time": end_time - start_time, 72 | "files": len(files), 73 | "error": error, 74 | } 75 | 76 | with open("stress_test_results.json", "w", encoding="utf-8") as f: 77 | json.dump(result, f) 78 | -------------------------------------------------------------------------------- /il_supermarket_scarper/utils/validation.py: -------------------------------------------------------------------------------- 1 | import re 2 | import difflib 3 | 4 | 5 | def is_valid_chain_name(input_string): 6 | """check the chain name is in a valid folder foramt""" 7 | # Regular expression pattern to match only letters (a-z, case insensitive) and hyphens (-) 8 | pattern = r"^[a-zA-Z0-9-]+$" 9 | 10 | # Use re.match to check if the entire string matches the pattern 11 | return bool(re.match(pattern, input_string)) 12 | 13 | 14 | def find_index_with_substring(array, substring): 15 | """Find the index of the first element in the array that contains the substring""" 16 | return [i for i, s in enumerate(array) if substring in s][0] 17 | 18 | 19 | def show_text_diff(text1, text2): 20 | """show the difference between two text strings in a git-like format""" 21 | # Split the texts into lines for comparison 22 | text1_lines = text1.splitlines() 23 | text2_lines = text2.splitlines() 24 | 25 | text1_lines = text1_lines[ 26 | find_index_with_substring( 27 | text1_lines, "חוקים ותקנות" 28 | ) : find_index_with_substring(text1_lines, "נוסח החוק המעודכן ביותר") 29 | ] 30 | text2_lines = text2_lines[ 31 | find_index_with_substring( 32 | text2_lines, "חוקים ותקנות" 33 | ) : find_index_with_substring(text2_lines, "נוסח החוק המעודכן ביותר") 34 | ] 35 | 36 | # Use difflib to compare the texts with more context 37 | diff = difflib.unified_diff( 38 | text1_lines, 39 | text2_lines, 40 | lineterm="", 41 | fromfile="Expected", 42 | tofile="Actual", 43 | n=5, # Show 5 lines of context around changes 44 | ) 45 | 46 | # Format the output for better readability 47 | diff_lines = [] 48 | diff_lines.append("\n" + "=" * 80) 49 | diff_lines.append("DIFF:") 50 | diff_lines.append("=" * 80) 51 | 52 | for line in diff: 53 | # Add visual markers for different line types 54 | if line.startswith("---") or 
line.startswith("+++"): 55 | diff_lines.append(line) 56 | elif line.startswith("-"): 57 | diff_lines.append(f"- {line[1:]}") # Removed line 58 | elif line.startswith("+"): 59 | diff_lines.append(f"+ {line[1:]}") # Added line 60 | elif line.startswith("@@"): 61 | diff_lines.append("\n" + line) # Context marker 62 | else: 63 | diff_lines.append(f" {line}") # Context line 64 | 65 | diff_lines.append("=" * 80) 66 | 67 | return "\n".join(diff_lines) 68 | 69 | 70 | def change_xml_encoding(file_path): 71 | """change the encoding if failing with utf-8""" 72 | with open(file_path, "rb") as file: # pylint: disable=unspecified-encoding 73 | # Read the XML file content 74 | content = file.read() 75 | 76 | content = content.decode("ISO-8859-8", errors="replace") 77 | 78 | # Save the file with the new encoding declaration 79 | with open(file_path, "wb") as file: 80 | file.write( 81 | content.replace('encoding="ISO-8859-8"', 'encoding="UTF-8"').encode("utf-8") 82 | ) 83 | -------------------------------------------------------------------------------- /.github/workflows/codeql.yml: -------------------------------------------------------------------------------- 1 | # For most projects, this workflow file will not need changing; you simply need 2 | # to commit it to your repository. 3 | # 4 | # You may wish to alter this file to override the set of languages analyzed, 5 | # or to provide custom queries or build logic. 6 | # 7 | # ******** NOTE ******** 8 | # We have attempted to detect the languages in your repository. Please check 9 | # the `language` matrix defined below to confirm you have the correct set of 10 | # supported CodeQL languages. 11 | # 12 | name: "CodeQL" 13 | 14 | on: 15 | push: 16 | branches: [ "main" ] 17 | pull_request: 18 | # The branches below must be a subset of the branches above 19 | branches: [ "main" ] 20 | 21 | 22 | concurrency: 23 | group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} 24 | cancel-in-progress: true 25 | 26 | jobs: 27 | analyze: 28 | name: Analyze 29 | runs-on: ubuntu-latest 30 | permissions: 31 | actions: read 32 | contents: read 33 | security-events: write 34 | 35 | strategy: 36 | fail-fast: false 37 | matrix: 38 | language: [ 'python' ] 39 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ] 40 | # Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support 41 | 42 | steps: 43 | - name: Checkout repository 44 | uses: actions/checkout@v3 45 | 46 | # Initializes the CodeQL tools for scanning. 47 | - name: Initialize CodeQL 48 | uses: github/codeql-action/init@v2 49 | with: 50 | languages: ${{ matrix.language }} 51 | # If you wish to specify custom queries, you can do so here or in a config file. 52 | # By default, queries listed here will override any specified in a config file. 53 | # Prefix the list here with "+" to use these queries and those in the config file. 54 | 55 | # Details on CodeQL's query packs refer to : https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs 56 | # queries: security-extended,security-and-quality 57 | 58 | 59 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 60 | # If this step fails, then you should remove it and run the build manually (see below) 61 | - name: Autobuild 62 | uses: github/codeql-action/autobuild@v2 63 | 64 | # ℹ️ Command-line programs to run using the OS shell. 
65 | # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun 66 | 67 | # If the Autobuild fails above, remove it and uncomment the following three lines. 68 | # modify them (or add more) to build your code if your project, please refer to the EXAMPLE below for guidance. 69 | 70 | # - run: | 71 | # echo "Run, Build Application using script" 72 | # ./location_of_script_within_repo/buildscript.sh 73 | 74 | - name: Perform CodeQL Analysis 75 | uses: github/codeql-action/analyze@v2 76 | with: 77 | category: "/language:${{matrix.language}}" 78 | -------------------------------------------------------------------------------- /il_supermarket_scarper/engines/publishprice.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | 3 | from il_supermarket_scarper.utils.logger import Logger 4 | from .web import WebBase 5 | 6 | 7 | class PublishPrice(WebBase): 8 | """ 9 | scrape the file of PublishPrice 10 | possibly can support historical search: there is folder for each date. 11 | but this is not implemented. 12 | """ 13 | 14 | def __init__( 15 | self, 16 | chain, 17 | chain_id, 18 | site_infix, 19 | folder_name=None, 20 | domain="prices", 21 | max_threads=5, 22 | ): 23 | super().__init__( 24 | chain, 25 | chain_id, 26 | url=f"https://{domain}.{site_infix}.co.il/", 27 | folder_name=folder_name, 28 | max_threads=max_threads, 29 | ) 30 | self.folder = None 31 | 32 | def get_request_url( 33 | self, files_types=None, store_id=None, when_date=None 34 | ): # pylint: disable=unused-argument 35 | """get all links to collect download links from""" 36 | 37 | formated = "" 38 | if when_date: 39 | formated = when_date.strftime("%Y%m%d") 40 | formated = f"?p=./{formated}" 41 | return [{"url": self.url + formated, "method": "GET"}] 42 | 43 | def get_data_from_page(self, req_res): 44 | soup = BeautifulSoup(req_res.text, features="lxml") 45 | 46 | # the developer hard-coded the files names in the html 47 | all_trs = ( 48 | soup.find_all("script")[-1] 49 | .text.replace("const files_html = [", "") 50 | .replace("];", "") 51 | .split("\n")[5] 52 | .split(",") 53 | ) 54 | return list(map(lambda x: BeautifulSoup(x, features="lxml"), all_trs)) 55 | 56 | def extract_task_from_entry(self, all_trs): 57 | """from the trs extract the download urls, file names, and file sizes""" 58 | 59 | def get_herf_element(x): 60 | herfs = x.find_all("a") 61 | if len(herfs) > 0: 62 | return herfs[-1] 63 | return None 64 | 65 | def get_herf(x): 66 | return get_herf_element(x).attrs["href"] 67 | 68 | def get_path_from_herf(x): 69 | return get_herf(x).replace("\\", "").replace('"', "").replace("./", "") 70 | 71 | def get_name_from_herf(x): 72 | return get_path_from_herf(x).split(".")[0].split("/")[-1] 73 | 74 | all_trs = list( 75 | filter( 76 | lambda x: get_herf_element(x) is not None, 77 | all_trs, 78 | ) 79 | ) 80 | 81 | download_urls = [] 82 | file_names = [] 83 | file_sizes = [] 84 | for x in all_trs: 85 | try: 86 | download_urls.append(self.url + get_path_from_herf(x)) 87 | file_names.append(get_name_from_herf(x)) 88 | file_sizes.append(self.get_file_size_from_entry(x)) 89 | except (AttributeError, KeyError, IndexError, TypeError) as e: 90 | Logger.warning(f"Error extracting task from entry: {e}") 91 | 92 | return download_urls, file_names, file_sizes 93 | -------------------------------------------------------------------------------- /il_supermarket_scarper/utils/file_cache.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import time 4 | from functools import wraps 5 | 6 | 7 | def file_cache(ttl=None): 8 | """Decorator to cache function results in a file with an optional TTL (time-to-live)""" 9 | 10 | def get_cache_file(func_name): 11 | """Generate a cache file path based on the function name""" 12 | cache_dir = ".cache" 13 | return os.path.join(cache_dir, f"{func_name}_cache.json") 14 | 15 | def load_cache(cache_file): 16 | """Load the cache from the specified cache file if it exists""" 17 | if os.path.exists(cache_file): 18 | with open(cache_file, "r", encoding="utf-8") as f: 19 | return json.load(f) 20 | return {} 21 | 22 | def save_cache(cache_file, cache_data): 23 | """Save the cache to the specified cache file""" 24 | if not os.path.exists(".cache"): 25 | os.makedirs(".cache") 26 | with open(cache_file, "w", encoding="utf-8") as f: 27 | json.dump(cache_data, f) 28 | 29 | def decorator(func): 30 | @wraps(func) 31 | def wrapper(*args, **kwargs): 32 | # Generate cache file path based on the function name 33 | cache_file = get_cache_file(func.__name__) 34 | 35 | # Load the cache from the file 36 | cache = load_cache(cache_file) 37 | 38 | # Generate a cache key from function arguments 39 | cache_key = generate_cache_key(args, kwargs) 40 | 41 | # Check if result is cached and valid 42 | if cache_key in cache: 43 | entry = cache[cache_key] 44 | timestamp = entry["timestamp"] 45 | 46 | # If ttl is set, check if cache has expired 47 | if ttl is not None and (time.time() - timestamp) > ttl: 48 | # Cache expired, remove the entry 49 | del cache[cache_key] 50 | else: 51 | # Cache is valid, return cached result 52 | return entry["result"] 53 | 54 | # If not cached or expired, call the function and store the result 55 | result = func(*args, **kwargs) 56 | 57 | # Save the result with the current timestamp in the cache 58 | cache[cache_key] = { 59 | "result": result, 60 | "timestamp": time.time(), # Save the current time 61 | } 62 | save_cache(cache_file, cache) 63 | 64 | return result 65 | 66 | def generate_cache_key(args, kwargs): 67 | key_parts = [] 68 | for arg in args: 69 | if isinstance(arg, (int, float, str, bool)): 70 | key_parts.append(str(arg)) 71 | else: 72 | raise ValueError(f"Unsupported argument type: {type(arg)}") 73 | for k, v in kwargs.items(): 74 | if isinstance(v, (int, float, str, bool)): 75 | key_parts.append(f"{k}={v}") 76 | else: 77 | raise ValueError(f"Unsupported keyword argument type: {type(v)}") 78 | return "|".join(key_parts) 79 | 80 | return wrapper 81 | 82 | return decorator 83 | -------------------------------------------------------------------------------- /il_supermarket_scarper/utils/file_types.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | 4 | class FileTypesFilters(Enum): 5 | """type of files avaliable to download""" 6 | 7 | PROMO_FILE = { 8 | "should_contain": "promo", 9 | "should_not_contain": "full", 10 | } 11 | STORE_FILE = { 12 | "should_contain": "store", 13 | "should_not_contain": None, 14 | } 15 | PRICE_FILE = { 16 | "should_contain": "price", 17 | "should_not_contain": "full", 18 | } 19 | PROMO_FULL_FILE = { 20 | "should_contain": "promofull", 21 | "should_not_contain": None, 22 | } 23 | PRICE_FULL_FILE = { 24 | "should_contain": "pricefull", 25 | "should_not_contain": None, 26 | } 27 | 28 | @classmethod 29 | def all_types(cls): 30 | """Returns a list of all the enum keys.""" 31 | return 
[e.name for e in FileTypesFilters] 32 | 33 | @classmethod 34 | def all_update_files(cls): 35 | """all the update files""" 36 | return [FileTypesFilters.PROMO_FILE.name, FileTypesFilters.PRICE_FILE.name] 37 | 38 | @classmethod 39 | def all_full_files(cls): 40 | """all the full files""" 41 | return [ 42 | FileTypesFilters.PRICE_FULL_FILE.name, 43 | FileTypesFilters.PROMO_FULL_FILE.name, 44 | ] 45 | 46 | @classmethod 47 | def only_promo(cls): 48 | """only files with promotion date""" 49 | return [FileTypesFilters.PROMO_FILE.name, FileTypesFilters.PROMO_FULL_FILE.name] 50 | 51 | @classmethod 52 | def only_store(cls): 53 | """only files with stores date""" 54 | return [FileTypesFilters.STORE_FILE.name] 55 | 56 | @classmethod 57 | def only_price(cls): 58 | """only files with prices date""" 59 | return [FileTypesFilters.PRICE_FILE.name, FileTypesFilters.PRICE_FULL_FILE.name] 60 | 61 | @staticmethod 62 | def filter_file(file_name, should_contain, should_not_contain): 63 | """fillter function""" 64 | return ( 65 | should_contain in file_name.lower() 66 | and "null" not in file_name.lower() 67 | and ( 68 | should_not_contain is None 69 | or should_not_contain not in file_name.lower() 70 | ) 71 | ) 72 | 73 | @classmethod 74 | def is_file_from_type(cls, filename, file_type): 75 | """check if file from certain type""" 76 | string_to_look_in = getattr(cls, file_type).value 77 | return cls.filter_file(filename, **string_to_look_in) 78 | 79 | @classmethod 80 | def get_type_from_file(cls, filename): 81 | """get file type from filename""" 82 | for file_type_name in cls.all_types(): 83 | if cls.is_file_from_type(filename, file_type_name): 84 | return getattr(cls, file_type_name) 85 | return None 86 | 87 | @classmethod 88 | def filter(cls, file_type, iterable, by_function=lambda x: x): 89 | """Returns the type of the file.""" 90 | return list( 91 | filter( 92 | lambda filename: cls.is_file_from_type( 93 | by_function(filename), file_type 94 | ), 95 | iterable, 96 | ) 97 | ) 98 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/super_pharm.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import urllib.parse 3 | import datetime 4 | 5 | import json 6 | from il_supermarket_scarper.engines import MultiPageWeb 7 | from il_supermarket_scarper.utils import ( 8 | Logger, 9 | url_connection_retry, 10 | DumpFolderNames, 11 | FileTypesFilters, 12 | ) 13 | 14 | 15 | class SuperPharm(MultiPageWeb): 16 | """scraper for super pharm""" 17 | 18 | def __init__(self, folder_name=None): 19 | super().__init__( 20 | chain=DumpFolderNames.SUPER_PHARM, 21 | chain_id="7290172900007", 22 | url="http://prices.super-pharm.co.il/", 23 | folder_name=folder_name, 24 | total_page_xpath='//*[@class="mvc-grid-pager"]/button[last()]/@data-page', 25 | total_pages_pattern=r"(\d+)$", 26 | page_argument="&page", 27 | ) 28 | 29 | def collect_files_details_from_page(self, html): 30 | links = [] 31 | filenames = [] 32 | file_sizes = [] 33 | for element in html.xpath("//tbody/tr"): # skip header 34 | links.append(self.url + element.xpath("./td[6]/a/@href")[0]) 35 | filenames.append(element.xpath("./td[2]")[0].text) 36 | file_sizes.append(None) # Super Pharm don't support file size in the entry 37 | return links, filenames, file_sizes 38 | 39 | @url_connection_retry() 40 | def retrieve_file(self, file_link, file_save_path, timeout=15): 41 | Logger.debug(f"On a new Session: calling {file_link}") 42 | 43 | response_content 
= self.session_with_cookies_by_chain( 44 | file_link, timeout=timeout 45 | ) 46 | spath = json.loads(response_content.content) 47 | Logger.debug(f"Found spath: {spath}") 48 | 49 | file_to_save = self.session_with_cookies_by_chain( 50 | self.url + spath["href"], timeout=timeout 51 | ) 52 | file_to_save_with_ext = file_save_path + ".gz" 53 | Path(file_to_save_with_ext).write_bytes(file_to_save.content) 54 | 55 | return file_to_save_with_ext 56 | 57 | def get_file_types_id(self, files_types=None): 58 | """get the file type id""" 59 | if files_types is None: 60 | return [""] 61 | 62 | types = [] 63 | for ftype in files_types: 64 | if ftype == FileTypesFilters.STORE_FILE.name: 65 | types.append("StoresFull") 66 | if ftype == FileTypesFilters.PRICE_FILE.name: 67 | types.append("Price") 68 | if ftype == FileTypesFilters.PROMO_FILE.name: 69 | types.append("Promo") 70 | if ftype == FileTypesFilters.PRICE_FULL_FILE.name: 71 | types.append("PriceFull") 72 | if ftype == FileTypesFilters.PROMO_FULL_FILE.name: 73 | types.append("PromoFull") 74 | return types 75 | 76 | def build_params(self, files_types=None, store_id=None, when_date=None): 77 | """build the params for the request""" 78 | 79 | all_params = [] 80 | for ftype in self.get_file_types_id(files_types): 81 | params = {"type": "", "date": "", "store": ""} 82 | 83 | if store_id: 84 | params["store"] = store_id 85 | if when_date and isinstance(when_date, datetime.datetime): 86 | params["date"] = when_date.strftime("%Y-%m-%d") 87 | if files_types: 88 | params["type"] = ftype 89 | all_params.append(params) 90 | 91 | return ["?" + urllib.parse.urlencode(params) for params in all_params] 92 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/hazihinam.py: -------------------------------------------------------------------------------- 1 | import urllib.parse 2 | import datetime 3 | from il_supermarket_scarper.engines import MultiPageWeb 4 | from il_supermarket_scarper.utils import ( 5 | DumpFolderNames, 6 | FileTypesFilters, 7 | _now, 8 | convert_unit, 9 | UnitSize, 10 | string_to_float, 11 | ) 12 | 13 | # class HaziHinam(Cerberus): 14 | # """scraper for Hazi Hinam""" 15 | 16 | # def __init__(self, folder_name=None): 17 | # super().__init__( 18 | # chain=DumpFolderNames.HAZI_HINAM, 19 | # chain_id="7290700100008", 20 | # folder_name=folder_name, 21 | # ftp_username="HaziHinam", 22 | # ) 23 | 24 | 25 | class HaziHinam(MultiPageWeb): 26 | """scraper for Hazi Hinam""" 27 | 28 | def __init__(self, folder_name=None): 29 | super().__init__( 30 | chain=DumpFolderNames.HAZI_HINAM, 31 | chain_id="7290700100008", 32 | url="https://shop.hazi-hinam.co.il/Prices", 33 | folder_name=folder_name, 34 | total_page_xpath="(//li[contains(concat(' ', normalize-space(@class), ' ')," 35 | + "' pagination-item ')])[last()]/a/@href", 36 | total_pages_pattern=r"\d+", 37 | page_argument="&p", 38 | ) 39 | 40 | def collect_files_details_from_page(self, html): 41 | """collect the details from one page""" 42 | links = [] 43 | filenames = [] 44 | file_sizes = [] 45 | for link in html.xpath("//table/tbody/tr"): 46 | links.append(link.xpath("td[6]/a/@href")[0]) 47 | filenames.append(link.xpath("td[3]")[0].text.strip() + ".xml.gz") 48 | file_sizes.append( 49 | convert_unit( 50 | string_to_float(link.xpath("td[5]")[0].text.strip()), 51 | UnitSize.KB, 52 | UnitSize.BYTES, 53 | ) 54 | ) 55 | return links, filenames, file_sizes 56 | 57 | def get_file_types_id(self, files_types=None): 58 | """get the file type id""" 59 | if 
files_types is None or files_types == FileTypesFilters.all_types(): 60 | return [{"t": "null", "f": "null"}] 61 | 62 | types = [] 63 | for ftype in files_types: 64 | if ftype == FileTypesFilters.STORE_FILE.name: 65 | types.append({"t": "3", "f": "null"}) 66 | if ftype == FileTypesFilters.PRICE_FILE.name: 67 | types.append({"t": "1", "f": "null"}) 68 | if ftype == FileTypesFilters.PROMO_FILE.name: 69 | types.append({"t": "2", "f": "null"}) 70 | if ftype == FileTypesFilters.PRICE_FULL_FILE.name: 71 | types.append({"t": "1", "f": "null"}) 72 | if ftype == FileTypesFilters.PROMO_FULL_FILE.name: 73 | types.append({"t": "2", "f": "null"}) 74 | return types 75 | 76 | def build_params(self, files_types=None, store_id=None, when_date=None): 77 | """build the params for the request""" 78 | 79 | all_params = [] 80 | for type_params in self.get_file_types_id(files_types): 81 | 82 | # filtering store is not supported 83 | # if store_id: 84 | # params["s"] = "null" 85 | if when_date and isinstance(when_date, datetime.datetime): 86 | all_params.append({"d": when_date.strftime("%Y-%m-%d"), **type_params}) 87 | else: 88 | all_params.append({"d": _now().strftime("%Y-%m-%d"), **type_params}) 89 | all_params.append( 90 | { 91 | "d": (_now() - datetime.timedelta(days=1)).strftime("%Y-%m-%d"), 92 | **type_params, 93 | } 94 | ) 95 | 96 | return ["?" + urllib.parse.urlencode(params) for params in all_params] 97 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/city_market.py: -------------------------------------------------------------------------------- 1 | import urllib.parse 2 | import datetime 3 | from il_supermarket_scarper.engines import Bina, MultiPageWeb 4 | from il_supermarket_scarper.utils import ( 5 | DumpFolderNames, 6 | FileTypesFilters, 7 | UnitSize, 8 | ) 9 | from il_supermarket_scarper.utils.status import convert_unit, string_to_float 10 | 11 | 12 | # removed on 28.02.2025 13 | class CityMarketGivatayim(Bina): 14 | """scraper for city market givatayim""" 15 | 16 | def __init__(self, folder_name=None): 17 | super().__init__( 18 | chain=DumpFolderNames.CITY_MARKET_GIVATAYIM, 19 | chain_id="5359000000000", 20 | url_perfix="citymarketgivatayim", 21 | folder_name=folder_name, 22 | ) 23 | 24 | 25 | # removed on 28.10.2024 26 | class CityMarketKirtatOno(Bina): 27 | """scraper for city market kiryat ono""" 28 | 29 | def __init__(self, folder_name=None): 30 | super().__init__( 31 | chain=DumpFolderNames.CITY_MARKET_KIRYATONO, 32 | chain_id="5359000000000", 33 | url_perfix="citymarketkiryatono", 34 | folder_name=folder_name, 35 | ) 36 | 37 | 38 | class CityMarketKiryatGat(Bina): 39 | """scraper for city market kiryat gat""" 40 | 41 | def __init__(self, folder_name=None): 42 | super().__init__( 43 | chain=DumpFolderNames.CITY_MARKET_KIRYATGAT, 44 | chain_id="7290058266241", 45 | url_perfix="citymarketkiryatgat", 46 | folder_name=folder_name, 47 | ) 48 | 49 | 50 | class CityMarketShops(MultiPageWeb): 51 | """scraper for city market shops""" 52 | 53 | def __init__(self, folder_name=None): 54 | super().__init__( 55 | chain=DumpFolderNames.CITY_MARKET_SHOPS, 56 | chain_id="7290000000003", 57 | url="http://www.citymarket-shops.co.il/", 58 | folder_name=folder_name, 59 | total_page_xpath="(//li[contains(concat(' ', normalize-space(@class), ' ')," 60 | + "' pagination-item ')])[last()]/a/@href", 61 | total_pages_pattern=r"\d+", 62 | page_argument="&p", 63 | ) 64 | 65 | def collect_files_details_from_page(self, html): 66 | """collect the details 
deom one page""" 67 | links = [] 68 | filenames = [] 69 | file_sizes = [] 70 | for link in html.xpath("//table/tbody/tr"): 71 | links.append(self.url + link.xpath("td[7]/a/@href")[0]) 72 | filenames.append(link.xpath("td[3]")[0].text.strip() + ".xml.gz") 73 | file_sizes.append( 74 | convert_unit( 75 | string_to_float(link.xpath("td[6]")[0].text.strip()), 76 | UnitSize.KB, 77 | UnitSize.BYTES, 78 | ) 79 | ) 80 | return links, filenames, file_sizes 81 | 82 | def get_file_types_id(self, files_types=None): 83 | """get the file type id""" 84 | if files_types is None or files_types == FileTypesFilters.all_types(): 85 | return [{"t": "", "f": ""}] 86 | 87 | types = [] 88 | for ftype in files_types: 89 | if ftype == FileTypesFilters.STORE_FILE.name: 90 | types.append({"t": 3, "f": ""}) 91 | if ftype == FileTypesFilters.PRICE_FILE.name: 92 | types.append({"t": "1", "f": "0"}) 93 | if ftype == FileTypesFilters.PROMO_FILE.name: 94 | types.append({"t": "2", "f": "0"}) 95 | if ftype == FileTypesFilters.PRICE_FULL_FILE.name: 96 | types.append({"t": "1", "f": "1"}) 97 | if ftype == FileTypesFilters.PROMO_FULL_FILE.name: 98 | types.append({"t": "2", "f": "1"}) 99 | return types 100 | 101 | def build_params(self, files_types=None, store_id=None, when_date=None): 102 | """build the params for the request""" 103 | 104 | all_params = [] 105 | for type_params in self.get_file_types_id(files_types): 106 | params = {"d": "", "s": ""} 107 | 108 | if store_id: 109 | params["s"] = str(store_id).zfill(3) 110 | if when_date and isinstance(when_date, datetime.datetime): 111 | params["d"] = when_date.strftime("%Y-%m-%d") 112 | if files_types: 113 | params = {**params, **type_params} 114 | all_params.append(params) 115 | 116 | return ["?" + urllib.parse.urlencode(params) for params in all_params] 117 | -------------------------------------------------------------------------------- /il_supermarket_scarper/utils/databases/json_file.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | from ..logger import Logger 4 | from .base import AbstractDataBase 5 | 6 | 7 | class JsonDataBase(AbstractDataBase): 8 | """A class that represents a JSON-based database.""" 9 | 10 | def __init__(self, database_name, base_path="json_db") -> None: 11 | super().__init__(database_name, collection_status=True) 12 | self.base_path = base_path 13 | self.database_file = f"{self.database_name}.json" 14 | self._ensure_db_directory_exists() 15 | self._ensure_db_file_exists() 16 | 17 | def _ensure_db_directory_exists(self): 18 | """Ensure the base directory for the JSON database exists.""" 19 | if not os.path.exists(self.base_path): 20 | os.makedirs(self.base_path, exist_ok=True) 21 | 22 | def _ensure_db_file_exists(self): 23 | """Ensure the database file exists.""" 24 | file_path = self._get_database_file_path() 25 | if not os.path.exists(file_path): 26 | with open(file_path, "w", encoding="utf-8") as file: 27 | json.dump({}, file) # Initialize with an empty dict 28 | 29 | def _get_database_file_path(self): 30 | """Get the full path to the database JSON file.""" 31 | return os.path.join(self.base_path, self.database_file) 32 | 33 | def _read_database(self): 34 | """Read the JSON database file and return its contents.""" 35 | file_path = self._get_database_file_path() 36 | data = {} 37 | 38 | # Load existing data from the file 39 | if os.path.exists(file_path): 40 | with open(file_path, "r", encoding="utf-8") as file: 41 | try: 42 | data = json.load(file) 43 | except 
json.JSONDecodeError: 44 | Logger.warning(f"File {file_path} is corrupted, resetting it.") 45 | data = {} 46 | return data 47 | 48 | def _write_database(self, data): 49 | """Write data to the JSON database file.""" 50 | file_path = self._get_database_file_path() 51 | 52 | with open(file_path, "w", encoding="utf-8") as file: 53 | json.dump(dict(sorted(data.items())), file, default=str, indent=4) 54 | 55 | def insert_documents(self, collection_name, document): 56 | """Insert a document into a collection inside the JSON database.""" 57 | if self.collection_status: 58 | 59 | data = self._read_database() 60 | # Ensure the collection exists in the database 61 | if collection_name not in data: 62 | data[collection_name] = [] 63 | 64 | # Add the new document to the collection 65 | data[collection_name].extend(document) 66 | 67 | # Save the updated data back to the file 68 | self._write_database(data) 69 | 70 | def insert_document(self, collection_name, document): 71 | """Insert a document into a collection inside the JSON database.""" 72 | if self.collection_status: 73 | data = self._read_database() 74 | # Ensure the collection exists in the database 75 | if collection_name not in data: 76 | data[collection_name] = [] 77 | 78 | # Add the new document to the collection 79 | data[collection_name].append(document) 80 | 81 | # Save the updated data back to the file 82 | self._write_database(data) 83 | 84 | def find_document(self, collection_name, query): 85 | """Find a document in a collection based on a query.""" 86 | if self.collection_status: 87 | file_path = self._get_database_file_path() 88 | 89 | if os.path.exists(file_path): 90 | with open(file_path, "r", encoding="utf-8") as file: 91 | try: 92 | data = json.load(file) 93 | 94 | # Check if the collection exists 95 | if collection_name in data: 96 | # Filter the documents in the collection based on the query 97 | for document in data[collection_name]: 98 | if all( 99 | item in document.items() for item in query.items() 100 | ): 101 | return document 102 | except json.JSONDecodeError: 103 | Logger.warning(f"File {file_path} is corrupted.") 104 | 105 | return None 106 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrapper_runner.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from multiprocessing import Pool 4 | 5 | from .scrappers_factory import ScraperFactory 6 | from .utils import Logger, summerize_dump_folder_contant, clean_dump_folder 7 | 8 | 9 | class MainScrapperRunner: 10 | """a main scraper to execute all scraping""" 11 | 12 | def __init__( 13 | self, 14 | size_estimation_mode=False, 15 | enabled_scrapers=None, 16 | dump_folder_name=None, 17 | multiprocessing=5, 18 | lookup_in_db=True, 19 | ): 20 | assert isinstance(enabled_scrapers, list) or enabled_scrapers is None 21 | 22 | env_size_estimation_mode = os.getenv("SE_MODE", None) 23 | if env_size_estimation_mode: 24 | Logger.info( 25 | f"Setting size estimation mode from enviroment. 
value={env_size_estimation_mode}" 26 | ) 27 | self.size_estimation_mode = bool(env_size_estimation_mode == "True") 28 | else: 29 | self.size_estimation_mode = size_estimation_mode 30 | Logger.info(f"size_estimation_mode: {self.size_estimation_mode}") 31 | 32 | if not enabled_scrapers: 33 | enabled_scrapers = ScraperFactory.all_scrapers_name() 34 | 35 | self.enabled_scrapers = enabled_scrapers 36 | Logger.info(f"Enabled scrapers: {self.enabled_scrapers}") 37 | self.dump_folder_name = dump_folder_name 38 | self.multiprocessing = multiprocessing 39 | self.lookup_in_db = lookup_in_db 40 | 41 | def run( 42 | self, 43 | limit=None, 44 | files_types=None, 45 | when_date=False, 46 | suppress_exception=False, 47 | min_size=None, 48 | max_size=None, 49 | ): 50 | """run the scraper""" 51 | Logger.info(f"Limit is {limit}") 52 | Logger.info(f"files_types is {files_types}") 53 | Logger.info(f"Start scraping {','.join(self.enabled_scrapers)}.") 54 | 55 | with Pool(self.multiprocessing) as pool: 56 | result = pool.map( 57 | self.scrape_one_wrap, 58 | list( 59 | map( 60 | lambda chainScrapperClass: ( 61 | chainScrapperClass, 62 | { 63 | "limit": limit, 64 | "files_types": files_types, 65 | "when_date": when_date, 66 | "suppress_exception": suppress_exception, 67 | "min_size": min_size, 68 | "max_size": max_size, 69 | }, 70 | ), 71 | self.enabled_scrapers, 72 | ) 73 | ), 74 | ) 75 | 76 | Logger.info("Done scraping all supermarkets.") 77 | 78 | return result 79 | 80 | def scrape_one_wrap(self, arg): 81 | """scrape one warper""" 82 | args, kwargs = arg 83 | return self.scrape_one(args, **kwargs) 84 | 85 | def scrape_one( 86 | self, 87 | chain_scrapper_class, 88 | limit=None, 89 | files_types=None, 90 | store_id=None, 91 | when_date=None, 92 | suppress_exception=False, 93 | min_size=None, 94 | max_size=None, 95 | ): 96 | """scrape one""" 97 | chain_scrapper_constractor = ScraperFactory.get(chain_scrapper_class) 98 | Logger.info(f"Starting scrapper {chain_scrapper_constractor}") 99 | scraper = chain_scrapper_constractor(folder_name=self.dump_folder_name) 100 | chain_name = scraper.get_chain_name() 101 | 102 | Logger.info(f"scraping {chain_name}") 103 | if self.lookup_in_db: 104 | scraper.enable_collection_status() 105 | scraper.enable_aggregation_between_runs() 106 | 107 | scraper.scrape( 108 | limit=limit, 109 | files_types=files_types, 110 | store_id=store_id, 111 | when_date=when_date, 112 | files_names_to_scrape=None, 113 | filter_null=False, 114 | filter_zero=False, 115 | suppress_exception=suppress_exception, 116 | min_size=min_size, 117 | max_size=max_size, 118 | ) 119 | Logger.info(f"done scraping {chain_name}") 120 | 121 | folder_with_files = scraper.get_storage_path() 122 | if self.size_estimation_mode: 123 | Logger.info(f"Summrize test data for {chain_name}") 124 | summerize_dump_folder_contant(folder_with_files) 125 | 126 | Logger.info(f"Cleaning dump folder for {chain_name}") 127 | clean_dump_folder(folder_with_files) 128 | return folder_with_files 129 | -------------------------------------------------------------------------------- /il_supermarket_scarper/engines/bina.py: -------------------------------------------------------------------------------- 1 | import json 2 | import urllib.parse 3 | import datetime 4 | 5 | from il_supermarket_scarper.utils import ( 6 | Logger, 7 | url_connection_retry, 8 | url_retrieve, 9 | FileTypesFilters, 10 | ) 11 | 12 | from .apsx import Aspx 13 | 14 | 15 | class Bina(Aspx): 16 | """scraper for all Bina base site. 17 | Note! 
the websites have the possibility to download historical value as a date search menu. 18 | this class don't support downloading them. 19 | """ 20 | 21 | def __init__( 22 | self, 23 | chain, 24 | chain_id, 25 | url_perfix, 26 | download_postfix="/Download.aspx?FileNm=", 27 | domain="binaprojects.com/", 28 | folder_name=None, 29 | ): 30 | super().__init__( 31 | chain, 32 | chain_id, 33 | url=f"http://{url_perfix}.{domain}", 34 | aspx_page="MainIO_Hok.aspx", 35 | folder_name=folder_name, 36 | ) 37 | self.download_postfix = download_postfix 38 | 39 | def file_type_ids(self, file_types): 40 | """get the file type id""" 41 | file_type_mapping = { 42 | FileTypesFilters.STORE_FILE.name: 1, 43 | FileTypesFilters.PRICE_FILE.name: 2, 44 | FileTypesFilters.PROMO_FILE.name: 3, 45 | FileTypesFilters.PRICE_FULL_FILE.name: 4, 46 | FileTypesFilters.PROMO_FULL_FILE.name: 5, 47 | } 48 | if file_types is None or file_types == FileTypesFilters.all_types(): 49 | yield 0 50 | else: 51 | for file_type in file_types: 52 | if file_type not in file_type_mapping: 53 | raise ValueError(f"File type {file_type} not supported") 54 | yield file_type_mapping[file_type] 55 | 56 | def _build_query_url(self, query_params, base_urls): 57 | res = [] 58 | for base in base_urls: 59 | res.append( 60 | { 61 | "url": base + self.aspx_page + "?" + query_params, 62 | "method": "GET", 63 | } 64 | ) 65 | return res 66 | 67 | def _get_all_possible_query_string_params( 68 | self, files_types=None, store_id=None, when_date=None 69 | ): 70 | """get the arguments need to add to the url""" 71 | chains_urls = [] 72 | 73 | for c_id in self.get_chain_id(): 74 | chains_urls.append( 75 | { 76 | "_": f"{c_id}", 77 | "wReshet": "הכל", 78 | "WFileType": "", 79 | "WDate": "", 80 | "WStore": "", 81 | } 82 | ) 83 | 84 | # add file types to url 85 | if files_types: 86 | chains_urls_with_types = [] 87 | for files_type in self.file_type_ids(files_types): 88 | 89 | for chain_url in chains_urls: 90 | chains_urls_with_types.append( 91 | {**chain_url, "WFileType": files_type} 92 | ) 93 | chains_urls = chains_urls_with_types 94 | 95 | # add store id 96 | if store_id: 97 | for chains_url in chains_urls: 98 | chains_url["WStore"] = store_id 99 | 100 | # posting date 101 | if when_date and isinstance(when_date, datetime.datetime): 102 | for chains_url in chains_urls: 103 | chains_url["WDate"] = when_date.strftime("%d/%m/%Y") 104 | 105 | return [urllib.parse.urlencode(params) for params in chains_urls] 106 | 107 | def get_data_from_page(self, req_res): 108 | return json.loads(req_res.text) 109 | 110 | def get_href_from_entry(self, entry): 111 | """get download link for entry (tr)""" 112 | return self.download_postfix + entry["FileNm"] 113 | 114 | def get_file_name_no_ext_from_entry(self, entry): 115 | """get the file name without extensions from entey (tr)""" 116 | return entry.split(self.download_postfix)[-1].split(".")[0] 117 | 118 | def get_file_size_from_entry(self, entry): 119 | """ 120 | Extract file size from a JSON entry. 121 | Bina returns JSON objects, check for size field. 122 | Returns size in bytes, or None if not found. 
123 | """ 124 | # Bina don't support file size in the entry 125 | return None 126 | 127 | @url_connection_retry() 128 | def retrieve_file(self, file_link, file_save_path, timeout=30): 129 | response_content = self.session_with_cookies_by_chain( 130 | file_link, 131 | ) 132 | spath = json.loads(response_content.content) 133 | Logger.debug(f"Found spath: {spath}") 134 | 135 | url = spath[0]["SPath"] 136 | ext = file_link.split(".")[-1] 137 | 138 | url_retrieve(url, file_save_path + "." + ext, timeout=timeout) 139 | return file_save_path + "." + ext 140 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Custom License Agreement 2 | 3 | This License Agreement ("Agreement") is a legal agreement between Sefi Erlich ("Licensor") and any individual or entity ("Licensee" or "Contributor") who accesses, uses, or contributes to this repository. By accessing, using, or contributing to the Repository, you agree to be bound by the terms of this Agreement. 4 | 5 | 1. Grant of License for Non-Commercial Use 6 | 7 | 1.1 Non-Commercial Use License: The Licensor grants the Licensee a worldwide, royalty-free, non-exclusive, non-transferable license to use, reproduce, modify, and distribute the content of the Repository ("Licensed Material") for non-commercial purposes only, subject to the terms and conditions of this Agreement. 8 | 9 | 1.2 Attribution Requirement: When using or distributing the Licensed Material, the Licensee must provide appropriate credit to the Licensor by: 10 | - Citing the Licensor's name as specified. 11 | - Including a link to the Repository. 12 | - Indicating if changes were made to the Licensed Material. 13 | 14 | 1.3 No Commercial Use: Licensees are expressly prohibited from using the Licensed Material, in whole or in part, for any commercial purpose without prior written permission from the Licensor. 15 | 16 | 2. Reservation of Commercial Rights 17 | 18 | 2.1 Exclusive Commercial Rights: All commercial rights to the Licensed Material are exclusively reserved by the Licensor. The Licensor retains the sole right to use, reproduce, modify, distribute, and sublicense the Licensed Material for commercial purposes. 19 | 20 | 2.2 Requesting Commercial Permission: Parties interested in using the Licensed Material for commercial purposes must obtain explicit written consent from the Licensor. Requests should be directed to the contact information provided at the end of this Agreement. 21 | 22 | 3. Contributions 23 | 24 | 3.1 Contributor License Grant: By submitting any content ("Contribution") to the Repository, the Contributor grants the Licensor a non-exclusive, perpetual, irrevocable, worldwide, royalty-free license to use, reproduce, modify, distribute, sublicense, and create derivative works from the Contribution for any purpose, including commercial purposes. 25 | 26 | 3.2 Warranty of Originality: Contributors represent and warrant that their Contributions are original works and do not infringe upon the intellectual property rights of any third party. 27 | 28 | 3.3 No Commercial Rights for Contributors: Contributors acknowledge that they have no rights to use the Licensed Material for commercial purposes. 29 | 30 | 4. Restrictions 31 | 32 | 4.1 Prohibition of Commercial Exploitation: Licensees and Contributors may not: 33 | - Use the Licensed Material or any Contributions for commercial purposes. 
34 | - Distribute the Licensed Material or any Contributions as part of any commercial product or service. 35 | - Sublicense the Licensed Material or any Contributions for commercial use. 36 | 37 | 4.2 No Endorsement: Licensees and Contributors may not imply endorsement or affiliation with the Licensor without explicit written permission. 38 | 39 | 5. Term and Termination 40 | 41 | 5.1 Term: This Agreement is effective upon acceptance and continues unless terminated as provided herein. 42 | 43 | 5.2 Termination for Breach: The Licensor may terminate this Agreement immediately if the Licensee or Contributor breaches any of its terms. 44 | 45 | 5.3 Effect of Termination: Upon termination, all rights granted under this Agreement cease, and the Licensee or Contributor must destroy all copies of the Licensed Material in their possession. 46 | 47 | 5.4 Survival: Sections 2, 3, 4, 6, and 7 survive termination of this Agreement. 48 | 49 | 6. Disclaimer of Warranties and Limitation of Liability 50 | 51 | 6.1 As-Is Basis: The Licensed Material and any Contributions are provided "AS IS," without warranties or conditions of any kind, either express or implied. 52 | 53 | 6.2 Disclaimer: The Licensor expressly disclaims all warranties, including but not limited to warranties of title, non-infringement, merchantability, and fitness for a particular purpose. 54 | 55 | 6.3 Limitation of Liability: In no event shall the Licensor be liable for any direct, indirect, incidental, special, exemplary, or consequential damages arising in any way out of the use of the Licensed Material or Contributions. 56 | 57 | 7. General Provisions 58 | 59 | 7.1 Entire Agreement: This Agreement constitutes the entire agreement between the parties concerning the subject matter hereof and supersedes all prior agreements and understandings. 60 | 61 | 7.2 Modification: The Licensor reserves the right to modify this Agreement for new versions of the Licensed Material. Such modifications will not apply retroactively to any version of the Licensed Material you have already obtained. 62 | 63 | 7.3 Severability: If any provision of this Agreement is found to be unenforceable, the remainder shall remain in full force and effect. 64 | 65 | 7.4 Waiver: Failure to enforce any provision of this Agreement shall not constitute a waiver of such provision. 66 | 67 | 7.5 Governing Law: This Agreement shall be governed by and construed in accordance with the laws of Israel, without regard to its conflict of law principles. 68 | 69 | 7.6 Dispute Resolution: Any disputes arising under or in connection with this Agreement shall be subject to the exclusive jurisdiction of the courts located in Israel. 70 | 71 | 8. Acceptance by accessing, using, or contributing to the Repository, you acknowledge that you have read, understood, and agree to be bound by the terms and conditions of this Agreement. 
72 | 73 | Contact Information 74 | 75 | For any questions or requests regarding this Agreement, please contact: 76 | 77 | Name: Sefi Erlich 78 | Email: erlichsefi@gmail.com -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/tests/test_all.py: -------------------------------------------------------------------------------- 1 | from il_supermarket_scarper.scrappers_factory import ScraperFactory 2 | from .test_cases import make_test_case 3 | 4 | 5 | class BareketTestCase(make_test_case(ScraperFactory.BAREKET, 5)): 6 | """Test case for ScraperFactory.BAREKET.""" 7 | 8 | 9 | class YaynotBitanAndCarrefourTestCase( 10 | make_test_case(ScraperFactory.YAYNO_BITAN_AND_CARREFOUR, 9032) 11 | ): 12 | """Test case for ScraperFactory.YAYNO_BITAN_AND_CARREFOUR.""" 13 | 14 | 15 | class CofixTestCase(make_test_case(ScraperFactory.COFIX, 299)): 16 | """Test case for ScraperFactory.COFIX.""" 17 | 18 | 19 | # class CityMarketGivatayimTestCase( 20 | # make_test_case(ScraperFactory.CITY_MARKET_GIVATAYIM, 1) 21 | # ): 22 | # """Test case for CityMarketGivatay""" 23 | 24 | 25 | # class CityMarketKirtatOnoTestCase( 26 | # make_test_case(ScraperFactory.CITY_MARKET_KIRYATONO, 1) 27 | # ): 28 | # """Test case for CityMarketKirtatOno""" 29 | 30 | 31 | class CityMarketKiryatGatTestCase( 32 | make_test_case(ScraperFactory.CITY_MARKET_KIRYATGAT, 1) 33 | ): 34 | """Test case for CityMarketKiryatGat""" 35 | 36 | 37 | class CityMarketShopsTestCase(make_test_case(ScraperFactory.CITY_MARKET_SHOPS, 1)): 38 | """Test case for CityMarketShops""" 39 | 40 | 41 | class DorAlonTestCase(make_test_case(ScraperFactory.DOR_ALON, 501)): 42 | """Test case for ScraperFactory.DOR_ALON.""" 43 | 44 | 45 | class GoodPharmTestCase(make_test_case(ScraperFactory.GOOD_PHARM, 952)): 46 | """Test case for ScraperFactory.GOOD_PHARM.""" 47 | 48 | 49 | class HaziHinamTestCase(make_test_case(ScraperFactory.HAZI_HINAM, 206)): 50 | """Test case for ScraperFactory.HAZI_HINAM.""" 51 | 52 | 53 | class HetCohen(make_test_case(ScraperFactory.HET_COHEN, 45)): 54 | """Test case for ScraperFactory.HET_COHEN.""" 55 | 56 | 57 | class KeshetTestCase(make_test_case(ScraperFactory.KESHET, 5)): 58 | """Test case for ScraperFactory.KESHET.""" 59 | 60 | 61 | class KingStoreTestCase(make_test_case(ScraperFactory.KING_STORE, 334)): 62 | """Test case for ScraperFactory.KING_STORE.""" 63 | 64 | 65 | class Maayan2000TestCase(make_test_case(ScraperFactory.MAAYAN_2000, 60)): 66 | """Test case for ScraperFactory.MAAYAN_2000.""" 67 | 68 | 69 | class MahsaniAShukTestCase(make_test_case(ScraperFactory.MAHSANI_ASHUK, 98)): 70 | """Test case for ScraperFactory.MAHSANI_ASHUK.""" 71 | 72 | 73 | # class MegaTestCase(make_test_case(ScraperFactory.MEGA, 37)): 74 | # """Test case for ScraperFactory.MEGA.""" 75 | 76 | 77 | class NetivHasefTestCase(make_test_case(ScraperFactory.NETIV_HASED, 1)): 78 | """Test case for ScraperFactory.NETIV_HASED.""" 79 | 80 | 81 | class MeshnatYosef1TestCase(make_test_case(ScraperFactory.MESHMAT_YOSEF_1, 1)): 82 | """Test case for ScraperFactory.MESHMAT_YOSEF_1.""" 83 | 84 | 85 | class MeshnatYosef2TestCase(make_test_case(ScraperFactory.MESHMAT_YOSEF_2, 1)): 86 | """Test case for ScraperFactory.MESHMAT_YOSEF_2.""" 87 | 88 | 89 | class OsheradTestCase(make_test_case(ScraperFactory.OSHER_AD, 1)): 90 | """Test case for ScraperFactory.OSHER_AD.""" 91 | 92 | 93 | class PolizerTestCase(make_test_case(ScraperFactory.POLIZER, 2)): 94 | """Test case for ScraperFactory.POLIZER.""" 95 | 96 | 97 | class 
RamiLevyTestCase(make_test_case(ScraperFactory.RAMI_LEVY, 1)): 98 | """Test case for ScraperFactory.RAMI_LEVY.""" 99 | 100 | 101 | class SalachDabachTestCase(make_test_case(ScraperFactory.SALACH_DABACH, 4)): 102 | """Test case for ScraperFactory.SALACH_DABACH.""" 103 | 104 | 105 | class ShefaBarcartAshemTestCase(make_test_case(ScraperFactory.SHEFA_BARCART_ASHEM, 42)): 106 | """Test case for ScraperFactory.SHEFA_BARCART_ASHEM.""" 107 | 108 | 109 | class ShufersalTestCase(make_test_case(ScraperFactory.SHUFERSAL, 176)): 110 | """Test case for ScraperFactory.SHUFERSAL.""" 111 | 112 | 113 | class ShukAhirTestCase(make_test_case(ScraperFactory.SHUK_AHIR, 4)): 114 | """Test case for ScraperFactory.SHUK_AHIR.""" 115 | 116 | 117 | class StopMarketTestCase(make_test_case(ScraperFactory.STOP_MARKET, 5)): 118 | """Test case for ScraperFactory.STOP_MARKET.""" 119 | 120 | 121 | class SuperPharmTestCase(make_test_case(ScraperFactory.SUPER_PHARM, 224)): 122 | """Test case for ScraperFactory.SUPER_PHARM.""" 123 | 124 | 125 | class SuperYudaTestCase(make_test_case(ScraperFactory.SUPER_YUDA, 204)): 126 | """Test case for ScraperFactory.SUPER_YUDA.""" 127 | 128 | 129 | class SuperSapirTestCase(make_test_case(ScraperFactory.SUPER_SAPIR, 44)): 130 | """Test case for ScraperFactory.SUPER_SAPIR.""" 131 | 132 | 133 | class FreshMarketAndSuperDoshTestCase( 134 | make_test_case(ScraperFactory.FRESH_MARKET_AND_SUPER_DOSH, 1) 135 | ): 136 | """Test case for ScraperFactory.FRESH_MARKET_AND_SUPER_DOSH.""" 137 | 138 | 139 | class QuikTestCase(make_test_case(ScraperFactory.QUIK, None)): 140 | """Test case for ScraperFactory.QUIK.""" 141 | 142 | 143 | class TivTaamTestCase(make_test_case(ScraperFactory.TIV_TAAM, 3)): 144 | """Test case for ScraperFactory.TIV_TAAM.""" 145 | 146 | 147 | class VictoryTestCase(make_test_case(ScraperFactory.VICTORY, 1)): 148 | """Test case for ScraperFactory.VICTORY.""" 149 | 150 | 151 | class YellowTestCase(make_test_case(ScraperFactory.YELLOW, 1272)): 152 | """Test case for ScraperFactory.YELLOW.""" 153 | 154 | 155 | class YohananofTestCase(make_test_case(ScraperFactory.YOHANANOF, 1)): 156 | """Test case for ScraperFactory.YOHANANOF.""" 157 | 158 | 159 | class ZolVeBegadolTestCase(make_test_case(ScraperFactory.ZOL_VEBEGADOL, 4)): 160 | """Test case for ScraperFactory.ZOL_VEBEGADOL.""" 161 | 162 | 163 | class WoltTestCase(make_test_case(ScraperFactory.WOLT, 0)): 164 | """Test case for ScraperFactory.Wolt.""" 165 | -------------------------------------------------------------------------------- /il_supermarket_scarper/engines/matrix.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | from il_supermarket_scarper.utils import Logger 3 | from .apsx import Aspx 4 | 5 | 6 | class Matrix(Aspx): 7 | """scraper for all matrix base site. 
8 | (support adveanced search: follow the instrucation the page)""" 9 | 10 | utilize_date_param = False 11 | 12 | def __init__( 13 | self, 14 | chain, 15 | chain_id, 16 | url="https://laibcatalog.co.il/", 17 | aspx_page="NBCompetitionRegulations.aspx", 18 | chain_hebrew_name=None, 19 | folder_name=None, 20 | ): 21 | super().__init__(chain, chain_id, url, aspx_page, folder_name=folder_name) 22 | self.chain_hebrew_name = chain_hebrew_name 23 | 24 | # def get_file_types_id(self, files_types=None): 25 | # """get the file type id""" 26 | # if files_types is None: 27 | # return "all" 28 | 29 | # types = [] 30 | # for ftype in files_types: 31 | # if ftype == FileTypesFilters.STORE_FILE.name: 32 | # types.append("storefull") 33 | # if ftype == FileTypesFilters.PRICE_FILE.name: 34 | # types.append("price") 35 | # if ftype == FileTypesFilters.PROMO_FILE.name: 36 | # types.append("promo") 37 | # if ftype == FileTypesFilters.PRICE_FULL_FILE.name: 38 | # types.append("pricefull") 39 | # if ftype == FileTypesFilters.PROMO_FULL_FILE.name: 40 | # types.append("promofull") 41 | # return types 42 | 43 | # def get_when(self, when_date): 44 | # """get the when date""" 45 | # if when_date is None: 46 | # when_date = _now() 47 | # return when_date.strftime("%d/%m/%Y") 48 | 49 | # def get_chain_n_stores__id(self, store_id=None, c_id=None): 50 | # """get the store id""" 51 | # if store_id is None: 52 | # chain_id = str(c_id) # + "001" 53 | # store_id = "-1" 54 | # else: 55 | # chain_id = str(c_id) 56 | # store_id = str(c_id) + "001" + str(store_id).zfill(3) 57 | # return chain_id, store_id 58 | 59 | def _build_query_url(self, query_params, base_urls): 60 | res = [] 61 | for base in base_urls: 62 | res.append( 63 | { 64 | "method": "GET", 65 | "url": base, 66 | # "body": query_params, 67 | } 68 | ) 69 | return res 70 | 71 | def _get_all_possible_query_string_params( 72 | self, files_types=None, store_id=None, when_date=None 73 | ): 74 | """get the arguments need to add to the url""" 75 | 76 | return [{}] 77 | # post_body = [] 78 | # if isinstance(self.chain_id, list): 79 | # for c_id in self.chain_id: 80 | # chain_id, store_id = self.get_chain_n_stores__id( 81 | # store_id=store_id, c_id=c_id 82 | # ) 83 | # post_body.append( 84 | # { 85 | 86 | # "ctl00$TextArea": "", 87 | # "ctl00$MainContent$chain": chain_id, 88 | # "ctl00$MainContent$subChain": "-1", 89 | # "ctl00$MainContent$branch": store_id, 90 | # "ctl00$MainContent$txtDate": self.get_when(when_date=when_date), 91 | # "ctl00$MainContent$fileType": "all", 92 | # # "ctl00$MainContent$btnSearch": "חיפוש", 93 | # } 94 | # ) 95 | # else: 96 | # chain_id, store_id = self.get_chain_n_stores__id( 97 | # store_id=store_id, c_id=self.chain_id 98 | # ) 99 | # post_body.append( 100 | # { 101 | # "ctl00$TextArea": "", 102 | # "ctl00$MainContent$chain": chain_id, 103 | # "ctl00$MainContent$subChain": "-1", 104 | # "ctl00$MainContent$branch": store_id, 105 | # "ctl00$MainContent$txtDate": self.get_when(when_date=when_date), 106 | # "ctl00$MainContent$fileType": "all", 107 | # "ctl00$MainContent$btnSearch": "חיפוש", 108 | # } 109 | # ) 110 | 111 | # # add file types to url 112 | # if files_types: 113 | # chains_urls_with_types = [] 114 | # for files_type in self.get_file_types_id(files_types=files_types): 115 | # for chain_url in post_body: 116 | # chain_url["ctl00$MainContent$fileType"] = files_type 117 | # chains_urls_with_types.append(chain_url) 118 | # post_body = chains_urls_with_types 119 | 120 | # return post_body 121 | 122 | def get_href_from_entry(self, entry): 
123 | """get download link for entry (tr)""" 124 | return entry.a.attrs["href"] 125 | 126 | def get_file_name_no_ext_from_entry(self, entry): 127 | """get the file name without extensions from entey (tr)""" 128 | return entry.split("/")[-1].split(".gz")[0].split(".")[0] 129 | 130 | def get_data_from_page(self, req_res): 131 | soup = BeautifulSoup(req_res.text, features="lxml") 132 | all_trs = list(soup.find_all("tr"))[1:] # skip title 133 | 134 | Logger.info(f"Before filtring names found {len(all_trs)} entries") 135 | if self.chain_hebrew_name: 136 | all_trs = list( 137 | filter(lambda x: x and self.chain_hebrew_name in str(x), all_trs) 138 | ) 139 | Logger.info(f"After filtering names found {len(all_trs)} entries") 140 | return all_trs 141 | -------------------------------------------------------------------------------- /il_supermarket_scarper/utils/scraper_status.py: -------------------------------------------------------------------------------- 1 | import os 2 | import traceback 3 | 4 | from .logger import Logger 5 | from .status import log_folder_details 6 | from .databases import JsonDataBase 7 | from .status import _now, get_output_folder 8 | from .lock_utils import lock_by_string 9 | 10 | 11 | class ScraperStatus: 12 | """A class that abstracts the database interface for scraper status.""" 13 | 14 | STARTED = "started" 15 | COLLECTED = "collected" 16 | DOWNLOADED = "downloaded" 17 | FAILED = "fail" 18 | ESTIMATED_SIZE = "estimated_size" 19 | VERIFIED_DOWNLOADS = "verified_downloads" 20 | 21 | def __init__(self, database_name, base_path, folder_name=None) -> None: 22 | self.database = JsonDataBase( 23 | database_name, get_output_folder(base_path, folder_name=folder_name) 24 | ) 25 | self.task_id = _now().strftime("%Y%m%d%H%M%S") 26 | self.filter_between_itrations = False 27 | 28 | @lock_by_string() 29 | def on_scraping_start(self, limit, files_types, **additional_info): 30 | """Report that scraping has started.""" 31 | self._insert_an_update( 32 | ScraperStatus.STARTED, 33 | limit=limit, 34 | files_requested=files_types, 35 | **additional_info, 36 | ) 37 | 38 | def enable_collection_status(self): 39 | """enable data collection to status files""" 40 | self.database.enable_collection_status() 41 | 42 | def enable_aggregation_between_runs(self): 43 | """allow tracking the downloaded file and don't downloading again if downloaded""" 44 | self.filter_between_itrations = True 45 | 46 | @lock_by_string() 47 | def on_collected_details( 48 | self, 49 | file_name_collected_from_site, 50 | links_collected_from_site="", 51 | **additional_info, 52 | ): 53 | """Report that file details have been collected.""" 54 | self._insert_an_update( 55 | ScraperStatus.COLLECTED, 56 | file_name_collected_from_site=file_name_collected_from_site, 57 | links_collected_from_site=links_collected_from_site, 58 | **additional_info, 59 | ) 60 | 61 | @lock_by_string() 62 | def on_download_completed(self, **additional_info): 63 | """Report that the file has been downloaded.""" 64 | self._insert_an_update(ScraperStatus.DOWNLOADED, **additional_info) 65 | self._add_downloaded_files_to_list(**additional_info) 66 | 67 | def filter_already_downloaded( 68 | self, storage_path, files_names_to_scrape, filelist, by_function=lambda x: x 69 | ): 70 | """Filter files already existing in long-term memory or previously downloaded.""" 71 | if self.database.is_collection_enabled() and self.filter_between_itrations: 72 | new_filelist = [] 73 | for file in filelist: 74 | if not self.database.find_document( 75 | self.VERIFIED_DOWNLOADS, 
{"file_name": by_function(file)} 76 | ): 77 | new_filelist.append(file) 78 | else: 79 | Logger.debug( 80 | f"Filtered file {file} since it was already downloaded and extracted" 81 | ) 82 | return new_filelist 83 | 84 | # Fallback: filter according to the disk 85 | exits_on_disk = os.listdir(storage_path) 86 | 87 | if files_names_to_scrape: 88 | # Delete any files we want to retry downloading 89 | for file in exits_on_disk: 90 | if file.split(".")[0] in files_names_to_scrape: 91 | os.remove(os.path.join(storage_path, file)) 92 | 93 | # Filter the files to download 94 | filelist = list( 95 | filter(lambda x: by_function(x) in files_names_to_scrape, filelist) 96 | ) 97 | 98 | return list(filter(lambda x: by_function(x) not in exits_on_disk, filelist)) 99 | 100 | def _add_downloaded_files_to_list(self, results, **_): 101 | """Add downloaded files to the MongoDB collection.""" 102 | if self.database.is_collection_enabled(): 103 | when = _now() 104 | 105 | documents = [] 106 | for res in results: 107 | if res["extract_succefully"]: 108 | documents.append( 109 | {"file_name": res["file_name"], "when": when}, 110 | ) 111 | self.database.insert_documents(self.VERIFIED_DOWNLOADS, documents) 112 | 113 | @lock_by_string() 114 | def on_scrape_completed(self, folder_name, completed_successfully=True): 115 | """Report when scraping is completed.""" 116 | self._insert_an_update( 117 | ScraperStatus.ESTIMATED_SIZE, 118 | folder_size=log_folder_details(folder_name), 119 | completed_successfully=completed_successfully, 120 | ) 121 | 122 | @lock_by_string() 123 | def on_download_fail(self, execption, download_urls=None, file_names=None): 124 | """report when the scraping in failed""" 125 | self._insert_an_update( 126 | ScraperStatus.FAILED, 127 | execption=str(execption), 128 | traceback=traceback.format_exc(), 129 | download_urls=download_urls if download_urls else [], 130 | file_names=file_names if file_names else [], 131 | ) 132 | 133 | def _insert_an_update(self, status, **additional_info): 134 | """Insert an update into the MongoDB collection.""" 135 | document = { 136 | "status": status, 137 | "when": _now(), 138 | **additional_info, 139 | } 140 | self.database.insert_document(self.task_id, document) 141 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Israel Supermarket Scraper: Clients to download the data published by the supermarkets. 2 | ======================================= 3 | This is a scraper for ALL the supermarket chains listed in the GOV.IL site. 
4 | 5 | שקיפות מחירים (השוואת מחירים) - https://www.gov.il/he/departments/legalInfo/cpfta_prices_regulations 6 | 7 | 8 | 9 | 10 | [![Unit & Integration Tests](https://github.com/OpenIsraeliSupermarkets/israeli-supermarket-scarpers/actions/workflows/test-suite.yml/badge.svg?event=push)](https://github.com/OpenIsraeliSupermarkets/israeli-supermarket-scarpers/actions/workflows/test-suite.yml) 11 | [![CodeQL](https://github.com/OpenIsraeliSupermarkets/israeli-supermarket-scarpers/actions/workflows/codeql.yml/badge.svg)](https://github.com/OpenIsraeliSupermarkets/israeli-supermarket-scarpers/actions/workflows/codeql.yml) 12 | [![Pylint](https://github.com/OpenIsraeliSupermarkets/israeli-supermarket-scarpers/actions/workflows/pylint.yml/badge.svg)](https://github.com/OpenIsraeliSupermarkets/israeli-supermarket-scarpers/actions/workflows/pylint.yml) 13 | [![Publish Docker image](https://github.com/OpenIsraeliSupermarkets/israeli-supermarket-scarpers/actions/workflows/docker-publish.yml/badge.svg)](https://github.com/OpenIsraeliSupermarkets/israeli-supermarket-scarpers/actions/workflows/docker-publish.yml) 14 | [![Upload Python Package](https://github.com/OpenIsraeliSupermarkets/israeli-supermarket-scarpers/actions/workflows/python-publish.yml/badge.svg)](https://github.com/OpenIsraeliSupermarkets/israeli-supermarket-scarpers/actions/workflows/python-publish.yml) 15 | 16 | ## 🤗 Want to support my work? 17 |

18 | Buy Me A Coffee 19 | 20 |

21 | 22 | Daily Automatic Testing 23 | ---- 24 | The test suite is scheduled to run daily, so you can see whether the supermarket chains have changed something in their interface that would stop the package from working properly. 25 | 26 | Status: [![Scheduled Tests](https://github.com/OpenIsraeliSupermarkets/israeli-supermarket-scarpers/actions/workflows/test-suite.yml/badge.svg?event=schedule)](https://github.com/OpenIsraeliSupermarkets/israeli-supermarket-scarpers/actions/workflows/test-suite.yml) 27 | 28 | Notice: 29 | - Bareket and Quik are flaky! Their failures will not fail the test suite, but you can still use them. 30 | - Some of the scraped sites are blocked from being accessed from outside of Israel. 31 | 32 | -------- 33 | 34 | 35 | 36 | Got a question? 37 | --------------- 38 | 39 | You can email me at erlichsefi@gmail.com 40 | 41 | If you think you've found a bug: 42 | 43 | - Check the [issue tracker](https://github.com/OpenIsraeliSupermarkets/israeli-supermarket-scarpers/issues) to see if 44 | it has already been reported. 45 | - Please consider solving the issue yourself and opening a pull request. 46 | 47 | What is il_supermarket_scarper? 48 | ------------- 49 | 50 | There are a lot of projects on GitHub trying to scrape the supermarket data, but most of them are unstable or haven't been updated in a while. It's about time there was one codebase that does the job completely. 51 | 52 | You only need to run the following code to get all the data currently shared by the supermarkets. 53 | 54 | ```python 55 | from il_supermarket_scarper import ScarpingTask 56 | 57 | scraper = ScarpingTask() 58 | scraper.start() 59 | ``` 60 | 61 | 62 | Please note! 63 | Since new files are constantly uploaded by the supermarkets to their sites, a single run will only get the current snapshot. To keep collecting data, you will need to run this code periodically to pick up the newly uploaded files. 64 | 65 | Quick start 66 | ----------- 67 | 68 | il_supermarket_scarper can be installed using pip: 69 | 70 | python3 -m pip install il-supermarket-scraper 71 | 72 | If you want to run the latest version of the code, you can install it from the 73 | repo directly: 74 | 75 | python3 -m pip install -U git+https://github.com/OpenIsraeliSupermarkets/israeli-supermarket-scarpers.git 76 | # or if you don't have 'git' installed 77 | python3 -m pip install -U https://github.com/OpenIsraeliSupermarkets/israeli-supermarket-scarpers/archive/main.zip 78 | 79 | 80 | 81 | Running Docker 82 | ----------- 83 | The Docker image is designed to be re-run against the same configuration: in every iteration the scraper collects the files available for download and checks whether each file already exists before fetching it, either by scanning the dump folder or by checking the mongo/status files. 84 | 85 | 86 | Build it yourself: 87 | 88 | docker build -t erlichsefi/israeli-supermarket-scarpers --target prod .
89 | 90 | or pull the existing image from docker hub: 91 | 92 | docker pull erlichsefi/israeli-supermarket-scarpers:latest 93 | 94 | 95 | Then running it using: 96 | 97 | 98 | docker run -v "./dumps:/usr/src/app/dumps" \ 99 | -e ENABLED_SCRAPERS="BAREKET,YAYNO_BITAN" \ # see: il_supermarket_scarper/scrappers_factory.py 100 | -e ENABLED_FILE_TYPES="STORE_FILE" \ # see: il_supermarket_scarper/utils/file_types.py 101 | -e LIMIT=1 \ # number of files you would like to download (remove for unlimited) 102 | -e TODAY="2024-10-23 14:35" \ # the date to download data from 103 | erlichsefi/israeli-supermarket-scarpers 104 | 105 | 106 | 107 | Contributing 108 | ------------ 109 | 110 | Help in testing, development, documentation and other tasks is 111 | highly appreciated and useful to the project. There are tasks for 112 | contributors of all experience levels. 113 | 114 | If you need help getting started, don't hesitate to contact me. 115 | 116 | 117 | Development status 118 | ------------------ 119 | 120 | IL SuperMarket Scraper is beta software, as far as i see devlopment stoped until new issues will be found. 121 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers_factory.py: -------------------------------------------------------------------------------- 1 | import random 2 | import os 3 | from enum import Enum 4 | import il_supermarket_scarper.scrappers as all_scrappers 5 | from il_supermarket_scarper.scraper_stability import ScraperStability 6 | 7 | 8 | class ScraperFactory(Enum): 9 | """all scrapers avaliabe""" 10 | 11 | BAREKET = all_scrappers.Bareket # עוף והודו ברקת - חנות המפעל בע"מ 12 | YAYNO_BITAN_AND_CARREFOUR = all_scrappers.YaynotBitanAndCarrefour # יינות ביתן 13 | # YAYNO_BITAN = all_scrappers.YaynotBitan # יינות ביתן 14 | COFIX = all_scrappers.Cofix # קופיקס בע"מ 15 | # CITY_MARKET_GIVATAYIM = all_scrappers.CityMarketGivatayim 16 | # CITY_MARKET_KIRYATONO = all_scrappers.CityMarketKirtatOno 17 | CITY_MARKET_KIRYATGAT = all_scrappers.CityMarketKiryatGat # סיטי מרקט 18 | CITY_MARKET_SHOPS = all_scrappers.CityMarketShops # סיטי מרקט 19 | DOR_ALON = all_scrappers.DorAlon # דור אלון ניהול מתחמים קמעונאיים בע"מ 20 | GOOD_PHARM = all_scrappers.GoodPharm # גוד פארם בע"מ 21 | HAZI_HINAM = all_scrappers.HaziHinam # כל בו חצי חינם בע"מ 22 | HET_COHEN = all_scrappers.HetCohen # ח. כהן סוכנות מזון ומשקאות בע"מ 23 | KESHET = all_scrappers.Keshet # קשת טעמים בע"מ 24 | KING_STORE = all_scrappers.KingStore # אלמשהדאוי קינג סטור בע"מ 25 | MAAYAN_2000 = all_scrappers.Maayan2000 # ג.מ מעיין אלפיים (07) בע"מ 26 | MAHSANI_ASHUK = all_scrappers.MahsaniAShuk # כ.נ מחסני השוק בע"מ 27 | # MEGA = all_scrappers.Mega # קרפור \ מגה 28 | NETIV_HASED = all_scrappers.NetivHased # נתיב החסד - סופר חסד בע"מ (כולל ברכל) 29 | MESHMAT_YOSEF_1 = ( 30 | all_scrappers.MeshnatYosef1 31 | ) # קיי.טי. יבוא ושיווק בע"מ (משנת יוסף) 32 | MESHMAT_YOSEF_2 = ( 33 | all_scrappers.MeshnatYosef2 34 | ) # קיי.טי. 
יבוא ושיווק בע"מ (משנת יוסף) 35 | OSHER_AD = all_scrappers.Osherad # מרב-מזון כל בע"מ (אושר עד) 36 | POLIZER = all_scrappers.Polizer # פוליצר חדרה (1982) בע"מ 37 | RAMI_LEVY = all_scrappers.RamiLevy # רשת חנויות רמי לוי שיווק השקמה 2006 בע"מ 38 | SALACH_DABACH = all_scrappers.SalachDabach # סאלח דבאח ובניו בע"מ 39 | SHEFA_BARCART_ASHEM = all_scrappers.ShefaBarcartAshem # שפע ברכת השם בע"מ 40 | SHUFERSAL = all_scrappers.Shufersal # שופרסל בע"מ (כולל רשת BE) 41 | SHUK_AHIR = all_scrappers.ShukAhir # שוק העיר (ט.ע.מ.ס) בע"מ 42 | STOP_MARKET = all_scrappers.StopMarket # סטופ מרקט בע"מ 43 | SUPER_PHARM = all_scrappers.SuperPharm # סופר פארם (ישראל) בע"מ 44 | SUPER_YUDA = all_scrappers.SuperYuda # סופר יודה 45 | SUPER_SAPIR = all_scrappers.SuperSapir # סופר ספיר בע"מ 46 | FRESH_MARKET_AND_SUPER_DOSH = all_scrappers.FreshMarketAndSuperDosh # פרשמרקט 47 | QUIK = all_scrappers.Quik # קוויק 48 | TIV_TAAM = all_scrappers.TivTaam # טיב טעם רשתות בע"מ 49 | VICTORY = all_scrappers.Victory # ויקטורי רשת סופרמרקטים בע"מ 50 | YELLOW = all_scrappers.Yellow # יילו 51 | YOHANANOF = all_scrappers.Yohananof # מ. יוחננוף ובניו (1988) בע"מ 52 | ZOL_VEBEGADOL = all_scrappers.ZolVeBegadol # זול ובגדול בע"מ 53 | WOLT = all_scrappers.Wolt # וולט אופריישנס סרוויסס ישראל בע"מ 54 | 55 | @classmethod 56 | def all_listed_scrappers(cls): 57 | """get all the scarpers and filter disabled scrapers""" 58 | return list(member.name for member in cls) 59 | 60 | @classmethod 61 | def all_active(cls, limit=None, files_types=None, when_date=None): 62 | """get all the scarpers and filter disabled scrapers""" 63 | return ( 64 | member 65 | for member in cls 66 | if cls.is_scraper_enabled( 67 | member, 68 | limit=limit, 69 | files_types=files_types, 70 | when_date=when_date, 71 | ) 72 | ) 73 | 74 | @classmethod 75 | def sample(cls, n=1): 76 | """sample n from the scrappers""" 77 | return random.sample(cls.all_scrapers_name(), n) 78 | 79 | @classmethod 80 | def all_scrapers(cls, limit=None, files_types=None, when_date=None): 81 | """list all scrapers possible to use""" 82 | return [ 83 | e.value 84 | for e in ScraperFactory.all_active( 85 | limit=limit, files_types=files_types, when_date=when_date 86 | ) 87 | ] 88 | 89 | @classmethod 90 | def all_scrapers_name(cls, limit=None, files_types=None, when_date=None): 91 | """get the class name of all listed scrapers""" 92 | return [ 93 | e.name 94 | for e in ScraperFactory.all_active( 95 | limit=limit, files_types=files_types, when_date=when_date 96 | ) 97 | ] 98 | 99 | @classmethod 100 | def get(cls, class_name, limit=None, files_types=None, when_date=None): 101 | """get a scraper by class name""" 102 | 103 | enum = None 104 | if isinstance(class_name, ScraperFactory): 105 | enum = class_name 106 | elif class_name in cls.all_scrapers_name(): 107 | enum = getattr(ScraperFactory, class_name) 108 | 109 | if enum is None: 110 | raise ValueError(f"class_names {class_name} not found") 111 | 112 | if not cls.is_scraper_enabled( 113 | enum, limit=limit, files_types=files_types, when_date=when_date 114 | ): 115 | return None 116 | return enum.value 117 | 118 | @classmethod 119 | def is_scraper_enabled(cls, enum, limit=None, files_types=None, when_date=None): 120 | """get scraper value base on the enum value, if it disabled, return None""" 121 | env_var_value = os.environ.get("DISABLED_SCRAPPERS") 122 | if env_var_value is not None: 123 | disabled_scrappers = list(map(str.strip, env_var_value.split(","))) 124 | if enum.name in disabled_scrappers: 125 | return False 126 | # 127 | if 
ScraperStability.is_validate_scraper_found_no_files( 128 | enum.name, 129 | limit=limit, 130 | files_types=files_types, 131 | when_date=when_date, 132 | utilize_date_param=enum.value.utilize_date_param, 133 | ): 134 | return False 135 | return True 136 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scraper_stability.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=arguments-differ,arguments-renamed 2 | from enum import Enum 3 | from il_supermarket_scarper.utils import ( 4 | _is_saturday_in_israel, 5 | _now, 6 | datetime_in_tlv, 7 | FileTypesFilters, 8 | hour_files_expected_to_be_accassible, 9 | ) 10 | 11 | 12 | class FullyStable: 13 | """fully stable is stablity""" 14 | 15 | @classmethod 16 | def executes_between_midnight_and_morning_and_requested_today( 17 | cls, 18 | when_date=None, 19 | utilize_date_param=False, 20 | ): 21 | """it is stable if the execution is between midnight 22 | and morning and the requested date is today fails""" 23 | execution_time = _now() 24 | return ( 25 | when_date is not None 26 | and execution_time.hour >= 0 27 | and execution_time.hour < hour_files_expected_to_be_accassible() 28 | and (not utilize_date_param or when_date.date() == execution_time.date()) 29 | ) 30 | 31 | @classmethod 32 | def executed_after_date(cls, when_date, date): 33 | """check if executed after date""" 34 | return when_date > date 35 | 36 | @classmethod 37 | def failire_valid(cls, when_date=None, utilize_date_param=True, **_): 38 | """return true if the parser is stble""" 39 | 40 | return cls.executes_between_midnight_and_morning_and_requested_today( 41 | when_date=when_date, utilize_date_param=utilize_date_param 42 | ) 43 | 44 | 45 | class SuperFlaky(FullyStable): 46 | """super flaky is stablity""" 47 | 48 | @classmethod 49 | def failire_valid(cls, **_): 50 | return True 51 | 52 | 53 | class NetivHased(FullyStable): 54 | """Netiv Hased is stablity""" 55 | 56 | @classmethod 57 | def executed_in_saturday(cls, when_date=None, **_): 58 | """if the execution is in saturday""" 59 | return _is_saturday_in_israel(when_date) 60 | 61 | @classmethod 62 | def failire_valid(cls, when_date=None, utilize_date_param=False, **_): 63 | """return true if the parser is stble""" 64 | return super().failire_valid( 65 | when_date=when_date, utilize_date_param=utilize_date_param 66 | ) or cls.executed_in_saturday(when_date=when_date) 67 | 68 | 69 | class CityMarketGivataim(FullyStable): 70 | """Netiv Hased is stablity""" 71 | 72 | @classmethod 73 | def searching_for_update_promo(cls, files_types=None, **_): 74 | """if the execution is in saturday""" 75 | return files_types and files_types == [FileTypesFilters.PROMO_FILE.name] 76 | 77 | @classmethod 78 | def failire_valid( 79 | cls, when_date=None, files_types=None, utilize_date_param=True, **_ 80 | ): 81 | """return true if the parser is stble""" 82 | return ( 83 | super().failire_valid(when_date=when_date) 84 | or cls.searching_for_update_promo(files_types=files_types) 85 | or when_date is not None 86 | and cls.executed_after_date( 87 | when_date=when_date, 88 | date=datetime_in_tlv( 89 | year=2024, month=11, day=5, hour=0, minute=0, second=0 90 | ), 91 | ) 92 | ) 93 | 94 | 95 | class CityMarketKiratOno(FullyStable): 96 | """Netiv Hased is stablity""" 97 | 98 | @classmethod 99 | def searching_for_update_promo(cls, files_types=None, **_): 100 | """if the execution is in saturday""" 101 | return files_types and files_types == 
[FileTypesFilters.PROMO_FILE.name] 102 | 103 | @classmethod 104 | def failire_valid( 105 | cls, when_date=None, files_types=None, utilize_date_param=True, **_ 106 | ): 107 | """return true if the parser is stble""" 108 | return super().failire_valid( 109 | when_date=when_date 110 | ) or cls.searching_for_update_promo(files_types=files_types) 111 | 112 | 113 | class CityMarketKiratGat(FullyStable): 114 | """Netiv Hased is stablity""" 115 | 116 | @classmethod 117 | def searching_for_update_promo_full(cls, files_types=None, **_): 118 | """if the execution is in saturday""" 119 | return files_types and files_types == [FileTypesFilters.PROMO_FULL_FILE.name] 120 | 121 | @classmethod 122 | def failire_valid( 123 | cls, when_date=None, files_types=None, utilize_date_param=True, **_ 124 | ): 125 | """return true if the parser is stble""" 126 | return super().failire_valid( 127 | when_date=when_date 128 | ) or cls.searching_for_update_promo_full(files_types=files_types) 129 | 130 | 131 | class DoNotPublishStores(FullyStable): 132 | """stablity for chains that doesn't pubish stores""" 133 | 134 | @classmethod 135 | def searching_for_store_full(cls, files_types=None, **_): 136 | """if the execution is in saturday""" 137 | return files_types and files_types == [FileTypesFilters.STORE_FILE.name] 138 | 139 | @classmethod 140 | def failire_valid( 141 | cls, when_date=None, files_types=None, utilize_date_param=True, **_ 142 | ): 143 | """return true if the parser is stble""" 144 | return super().failire_valid( 145 | when_date=when_date, 146 | files_types=files_types, 147 | utilize_date_param=utilize_date_param, 148 | ) or cls.searching_for_store_full(files_types=files_types) 149 | 150 | 151 | class DoNotPublishPromo(FullyStable): 152 | """stablity for chains that doesn't pubish stores""" 153 | 154 | @classmethod 155 | def searching_for_promo_full(cls, files_types=None, **_): 156 | """if the execution is in saturday""" 157 | return files_types and files_types == [ 158 | FileTypesFilters.PROMO_FILE.name, 159 | FileTypesFilters.PROMO_FULL_FILE.name, 160 | ] 161 | 162 | @classmethod 163 | def failire_valid( 164 | cls, when_date=None, files_types=None, utilize_date_param=True, **_ 165 | ): 166 | """return true if the parser is stble""" 167 | return super().failire_valid( 168 | when_date=when_date, 169 | files_types=files_types, 170 | utilize_date_param=utilize_date_param, 171 | ) or cls.searching_for_promo_full(files_types=files_types) 172 | 173 | 174 | class ScraperStability(Enum): 175 | """tracker for the stablity of the scraper""" 176 | 177 | COFIX = DoNotPublishStores 178 | NETIV_HASED = NetivHased 179 | QUIK = DoNotPublishStores 180 | SALACH_DABACH = DoNotPublishStores 181 | # CITY_MARKET_GIVATAYIM = CityMarketGivataim 182 | CITY_MARKET_KIRYATONO = CityMarketKiratOno 183 | CITY_MARKET_KIRYATGAT = CityMarketKiratGat 184 | MESHMAT_YOSEF_1 = DoNotPublishPromo 185 | YOHANANOF = DoNotPublishStores 186 | 187 | @classmethod 188 | def is_validate_scraper_found_no_files( 189 | cls, 190 | scraper_enum, 191 | limit=None, 192 | files_types=None, 193 | store_id=None, 194 | when_date=None, 195 | utilize_date_param=False, 196 | ): 197 | """return true if its ok the scarper reuturn no enrty""" 198 | 199 | stabler = FullyStable 200 | if scraper_enum in ScraperStability.__members__: 201 | stabler = ScraperStability[scraper_enum].value 202 | 203 | return stabler.failire_valid( 204 | limit=limit, 205 | files_types=files_types, 206 | store_id=store_id, 207 | when_date=when_date, 208 | utilize_date_param=utilize_date_param, 209 
| ) 210 | -------------------------------------------------------------------------------- /il_supermarket_scarper/utils/retry.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import random 3 | import time 4 | import inspect 5 | 6 | from datetime import datetime 7 | from functools import partial 8 | 9 | import functools 10 | 11 | 12 | try: 13 | from decorator import decorator 14 | except ImportError: 15 | 16 | def decorator(caller): 17 | """Turns caller into a decorator. 18 | Unlike decorator module, function signature is not preserved. 19 | 20 | :param caller: caller(f, *args, **kwargs) 21 | """ 22 | 23 | def decor(func): 24 | @functools.wraps(func) 25 | def wrapper(*args, **kwargs): 26 | return caller(func, *args, **kwargs) 27 | 28 | return wrapper 29 | 30 | return decor 31 | 32 | 33 | logging_logger = logging.getLogger(__name__) 34 | 35 | 36 | def __retry_internal( # pylint: disable=broad-except,too-many-locals 37 | func, 38 | exceptions=Exception, 39 | tries=-1, 40 | delay=0, 41 | max_delay=None, 42 | backoff=1, 43 | timeout=None, 44 | max_timeout=None, 45 | backoff_timeout=1, 46 | jitter=0, 47 | logger=logging_logger, 48 | ): 49 | """ 50 | Executes a function and retries it if it failed. 51 | 52 | :param f: the function to execute. 53 | :param exceptions: an exception or a tuple of exceptions to catch. default: Exception. 54 | :param tries: the maximum number of attempts. default: -1 (infinite). 55 | :param delay: initial delay between attempts. default: 0. 56 | :param max_delay: the maximum value of delay. default: None (no limit). 57 | :param backoff: multiplier applied to delay between attempts. default: 1 (no backoff). 58 | :param jitter: extra seconds added to delay between attempts. default: 0. 59 | fixed if a number, random if a range tuple (min, max) 60 | :param logger: logger.warning(fmt, error, delay) will be called on failed attempts. 61 | default: retry.logging_logger. if None, logging is disabled. 62 | :returns: the result of the f function. 63 | """ 64 | _tries, _delay = tries, delay 65 | _timeout = timeout 66 | while _tries: 67 | datetime_start = datetime.now() 68 | try: 69 | if timeout: 70 | return func(timeout=_timeout) 71 | return func() 72 | except exceptions as error: # pylint: disable=broad-except 73 | measured_seconds = (datetime.now() - datetime_start).total_seconds() 74 | _tries -= 1 75 | if not _tries: 76 | raise 77 | 78 | if logger is not None: 79 | logger.warning( 80 | "%s, configured timeout %s,measured time to timeout %s ,retrying in %s seconds", 81 | error, 82 | _timeout, 83 | measured_seconds, 84 | _delay, 85 | ) 86 | logger.error_execption(error) 87 | 88 | time.sleep(_delay) 89 | _delay *= backoff 90 | 91 | if _timeout: 92 | _timeout += backoff_timeout 93 | 94 | if isinstance(jitter, tuple): 95 | _delay += random.uniform(*jitter) 96 | else: 97 | _delay += jitter 98 | 99 | if max_delay is not None: 100 | _delay = min(_delay, max_delay) 101 | 102 | if max_timeout is not None: 103 | _timeout = min(_timeout, max_timeout) 104 | raise ValueError("shouldn't be called!") 105 | 106 | 107 | def retry( 108 | exceptions=Exception, 109 | tries=-1, 110 | delay=0, 111 | max_delay=None, 112 | backoff=1, 113 | timeout=None, 114 | max_timeout=None, 115 | backoff_timeout=1, 116 | jitter=0, 117 | logger=logging_logger, 118 | ): 119 | """Returns a retry decorator. 120 | 121 | :param exceptions: an exception or a tuple of exceptions to catch. default: Exception. 122 | :param tries: the maximum number of attempts. 
default: -1 (infinite). 123 | :param delay: initial delay between attempts. default: 0. 124 | :param max_delay: the maximum value of delay. default: None (no limit). 125 | :param backoff: multiplier applied to delay between attempts. default: 1 (no backoff). 126 | :param jitter: extra seconds added to delay between attempts. default: 0. 127 | fixed if a number, random if a range tuple (min, max) 128 | :param logger: logger.warning(fmt, error, delay) will be called on failed attempts. 129 | default: retry.logging_logger. if None, logging is disabled. 130 | :returns: a retry decorator. 131 | """ 132 | 133 | @decorator 134 | def retry_decorator(func, *fargs, **fkwargs): 135 | args = fargs if fargs else [] 136 | kwargs = fkwargs if fkwargs else {} 137 | return __retry_internal( 138 | partial(func, *args, **kwargs), 139 | exceptions, 140 | tries, 141 | delay, 142 | max_delay, 143 | backoff, 144 | timeout, 145 | max_timeout, 146 | backoff_timeout, 147 | jitter, 148 | logger, 149 | ) 150 | 151 | return retry_decorator 152 | 153 | 154 | def retry_call( 155 | func, 156 | fargs=None, 157 | fkwargs=None, 158 | exceptions=Exception, 159 | tries=-1, 160 | delay=0, 161 | max_delay=None, 162 | backoff=1, 163 | jitter=0, 164 | logger=logging_logger, 165 | ): 166 | """ 167 | Calls a function and re-executes it if it failed. 168 | 169 | :param f: the function to execute. 170 | :param fargs: the positional arguments of the function to execute. 171 | :param fkwargs: the named arguments of the function to execute. 172 | :param exceptions: an exception or a tuple of exceptions to catch. default: Exception. 173 | :param tries: the maximum number of attempts. default: -1 (infinite). 174 | :param delay: initial delay between attempts. default: 0. 175 | :param max_delay: the maximum value of delay. default: None (no limit). 176 | :param backoff: multiplier applied to delay between attempts. default: 1 (no backoff). 177 | :param jitter: extra seconds added to delay between attempts. default: 0. 178 | fixed if a number, random if a range tuple (min, max) 179 | :param logger: logger.warning(fmt, error, delay) will be called on failed attempts. 180 | default: retry.logging_logger. if None, logging is disabled. 181 | :returns: the result of the f function. 182 | """ 183 | args = fargs if fargs else [] 184 | kwargs = fkwargs if fkwargs else {} 185 | return __retry_internal( 186 | partial(func, *args, **kwargs), 187 | exceptions, 188 | tries, 189 | delay, 190 | max_delay, 191 | backoff, 192 | jitter, 193 | logger, 194 | ) 195 | 196 | 197 | def retry_files(num_of_retrys=2, arg_name="files_names_to_scrape"): 198 | """retry only ceritin files""" 199 | 200 | @decorator 201 | def retry_files_decorator(func, *fargs, **fkwargs): 202 | args = fargs if fargs else [] 203 | kwargs = fkwargs if fkwargs else {} 204 | return __retry_files(func, args, kwargs, arg_name, num_of_retrys=num_of_retrys) 205 | 206 | return retry_files_decorator 207 | 208 | 209 | def __retry_files( 210 | func, 211 | args, 212 | kwargs, 213 | arg_name, 214 | num_of_retrys=1, 215 | logger=logging_logger, 216 | ): 217 | retry_list = [] 218 | all_results = [] 219 | for i in range(num_of_retrys): 220 | logger.info(f"File Retry: Itreation #{i},retry_list={retry_list}") 221 | 222 | if retry_list: 223 | # replace the value of 'files_names_to_scrape' 224 | args_names = inspect.getfullargspec(func).args 225 | assert arg_name in args_names, f"{arg_name} wasn't found in {args_names}." 
226 | 227 | arg_list = list(args) 228 | arg_list[args_names.index(arg_name)] = retry_list 229 | args = tuple(arg_list) 230 | 231 | results = func(*args, **kwargs) 232 | 233 | # next iteration 234 | retry_list, other_results = compute_retry(results) 235 | 236 | all_results.extend(other_results) 237 | # if there is not files in the retry list, break 238 | if len(retry_list) == 0: 239 | break 240 | 241 | return all_results 242 | 243 | 244 | def compute_retry(results): 245 | """find the files to retry""" 246 | files_to_retry = [] 247 | other_results = [] 248 | for result in results: 249 | if result["restart_and_retry"]: 250 | files_to_retry.append(result["file_name"]) 251 | else: 252 | other_results.append(result) 253 | return files_to_retry, other_results 254 | -------------------------------------------------------------------------------- /il_supermarket_scarper/utils/status.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import re 3 | import os 4 | import enum 5 | import holidays 6 | import pytz 7 | from .logger import Logger 8 | from .connection import get_from_latast_webpage, get_from_webpage 9 | 10 | 11 | def get_statue_page(extraction_type, source="gov.il"): 12 | """fetch the gov.il site""" 13 | url = "https://www.gov.il/he/departments/legalInfo/cpfta_prices_regulations" 14 | # Create a handle, page, to handle the contents of the website 15 | 16 | if source == "gov.il": 17 | return get_from_latast_webpage(url, extraction_type=extraction_type) 18 | if source == "cache": 19 | return get_from_webpage(get_cached_page(), extraction_type=extraction_type) 20 | raise ValueError(f"source '{source}' is not valid.") 21 | 22 | 23 | def get_cached_page(): 24 | """get the current cached page""" 25 | cache = None 26 | with open( 27 | os.path.join( 28 | os.path.dirname(os.path.abspath(__file__)), 29 | "tests", 30 | "cpfta_prices_regulations", 31 | ), 32 | encoding="utf-8", 33 | ) as page_cache: 34 | cache = page_cache.read() 35 | return cache 36 | 37 | 38 | def get_status(): 39 | """get the number of scarper listed on the gov.il site""" 40 | links_text = get_statue_page(extraction_type="links_name") 41 | # Store the contents of the website under doc 42 | count = 0 43 | for element in links_text: 44 | if "לצפייה במחירים" in str(element) or "לצפיה במחירים" in str(element): 45 | count += 1 46 | 47 | return count 48 | 49 | 50 | def get_status_date(): 51 | """get the date change listed on the gov.il site""" 52 | line_with_date = get_statue_page(extraction_type="update_date") 53 | 54 | Logger.info(f"date in 'line_with_date' is '{line_with_date}'") 55 | 56 | dates = re.findall( 57 | r"([1-9]|1[0-9]|2[0-9]|3[0-1]|0[0-9])(.|-|\/)([1-9]|1[0-2]|0[0-9])(.|-|\/)(20[0-9][0-9])", 58 | line_with_date, 59 | ) 60 | 61 | Logger.info(f"Found {len(dates)} dates") 62 | if len(dates) != 1: 63 | raise ValueError(f"found dates: {dates}") 64 | 65 | return datetime.datetime.strptime("".join(dates[0]), "%d.%m.%Y") 66 | 67 | 68 | def get_output_folder(chain_name, folder_name=None): 69 | """the the folder to write the chain fils in""" 70 | return os.path.join(folder_name if folder_name else _get_dump_folder(), chain_name) 71 | 72 | 73 | def _get_dump_folder(): 74 | """get the dump folder to locate the chains folders in""" 75 | return os.environ.get("XML_STORE_PATH", "dumps") 76 | 77 | 78 | # Enum for size units 79 | class UnitSize(enum.Enum): 80 | """enum represent the unit size in memory""" 81 | 82 | BYTES = "Bytes" 83 | KB = "Kb" 84 | MB = "Mb" 85 | GB = "Gb" 86 | 87 | 
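# Illustrative usage of the size helpers defined below. The literal sizes are
# made-up example values, not data from any chain; the expected results follow
# directly from the conversion logic in convert_unit:
#
#   convert_nl_size_to_bytes("10.5 MB", to_unit=UnitSize.BYTES)
#       -> 10.5 * 1024 * 1024 = 11010048.0
#   convert_nl_size_to_bytes("500 KB")  # default target unit is MB
#       -> 500 * 1024 / (1024 * 1024) = 0.48828125
#   convert_unit(1536, from_unit=UnitSize.BYTES, to_unit=UnitSize.KB)
#       -> 1.5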
88 | def convert_nl_size_to_bytes(size_str, to_unit=UnitSize.MB): 89 | """ 90 | Parse human-readable file size string to bytes. 91 | Supports formats like: "10.5 MB", "1.2GB", "500 KB", "1234", etc. 92 | Returns bytes as integer, or None if parsing fails. 93 | """ 94 | if not size_str: 95 | return None 96 | 97 | # Remove any extra whitespace and convert to uppercase 98 | size_str = size_str.strip().upper() 99 | 100 | # Pattern to match: number (with optional decimal) followed by optional unit 101 | pattern = r"([\d.]+)\s*(B|KB|MB|GB|TB)?" 102 | match = re.match(pattern, size_str) 103 | if not match: 104 | return None 105 | 106 | try: 107 | number = string_to_float(match.group(1)) 108 | unit_str = match.group(2) if match.group(2) else "B" 109 | # Map string units to UnitSize enum where possible 110 | unit_map = { 111 | "B": UnitSize.BYTES, 112 | "KB": UnitSize.KB, 113 | "MB": UnitSize.MB, 114 | "GB": UnitSize.GB, 115 | # You can add "TB": UnitSize.TB if desired and defined 116 | } 117 | from_unit = unit_map.get(unit_str, UnitSize.BYTES) 118 | size_in_from_unit = number 119 | # convert_unit expects size in bytes, so we need to first get bytes from the given unit 120 | return convert_unit(size_in_from_unit, from_unit=from_unit, to_unit=to_unit) 121 | except (ValueError, TypeError, KeyError): 122 | return None 123 | 124 | 125 | def string_to_float(size_str): 126 | """convert a string to a float""" 127 | return float(size_str.replace(",", "")) 128 | 129 | 130 | def convert_unit(size_in_bytes, from_unit=UnitSize.BYTES, to_unit=UnitSize.MB): 131 | """Convert the size from bytes to other units like KB, MB or GB""" 132 | if from_unit == to_unit: 133 | return size_in_bytes 134 | # Convert size_in_bytes (in from_unit) to bytes 135 | if from_unit == UnitSize.KB: 136 | bytes_val = size_in_bytes * 1024 137 | elif from_unit == UnitSize.MB: 138 | bytes_val = size_in_bytes * 1024 * 1024 139 | elif from_unit == UnitSize.GB: 140 | bytes_val = size_in_bytes * 1024 * 1024 * 1024 141 | else: # from_unit == UnitSize.BYTES 142 | bytes_val = size_in_bytes 143 | 144 | # Convert bytes to to_unit 145 | if to_unit == UnitSize.BYTES: 146 | return bytes_val 147 | if to_unit == UnitSize.KB: 148 | return bytes_val / 1024 149 | if to_unit == UnitSize.MB: 150 | return bytes_val / (1024 * 1024) 151 | if to_unit == UnitSize.GB: 152 | return bytes_val / (1024 * 1024 * 1024) 153 | return bytes_val 154 | 155 | 156 | def log_folder_details(folder, unit=UnitSize.MB): 157 | """log details about a folder""" 158 | size = 0 159 | files_scaned = [] 160 | Logger.info(f"Found the following files in {folder}") 161 | 162 | for path, _, files in os.walk(folder): 163 | 164 | # summerize all files 165 | for file in files: 166 | if "xml" in file: 167 | full_file_path = os.path.join(path, file) 168 | size += os.path.getsize(full_file_path) 169 | files_scaned.append(full_file_path) 170 | Logger.info(f"- file {full_file_path}: size {size}") 171 | 172 | # unit_size = 173 | # for sub_folder in dirs: 174 | # unit_size += log_folder_details(os.path.join(path, sub_folder), unit) 175 | 176 | Logger.info( 177 | f"Folder {folder}: Num of Files= {len(files_scaned)}," 178 | f"Size= {convert_unit(size, unit)} {unit.name}" 179 | ) 180 | 181 | return { 182 | "size": convert_unit(size, unit), 183 | "unit": unit.name, 184 | "folder": folder, 185 | "folder_content": files_scaned, 186 | } 187 | 188 | 189 | def summerize_dump_folder_contant(dump_folder): 190 | """collect details about the dump folder""" 191 | 192 | Logger.info(" == Starting summerize dump folder 
== ") 193 | Logger.info(f"dump_folder = {dump_folder}") 194 | for any_file in os.listdir(dump_folder): 195 | current_file = os.path.join(dump_folder, any_file) 196 | if os.path.isdir(current_file): 197 | log_folder_details(current_file) 198 | else: 199 | Logger.info(f"- file {current_file}") 200 | 201 | 202 | def clean_dump_folder(dump_folder): 203 | """clean the dump folder completly""" 204 | for any_file in os.listdir(dump_folder): 205 | current_file = os.path.join(dump_folder, any_file) 206 | if os.path.isdir(current_file): 207 | for file in os.listdir(current_file): 208 | full_file_path = os.path.join(current_file, file) 209 | os.remove(full_file_path) 210 | os.rmdir(current_file) 211 | else: 212 | os.remove(current_file) 213 | 214 | 215 | def hour_files_expected_to_be_accassible(): 216 | """the hour (AM) in which the files are expected to be published in IL time""" 217 | return 12 218 | 219 | 220 | def _now(): 221 | return datetime.datetime.now(pytz.timezone("Asia/Jerusalem")) 222 | 223 | 224 | def _testing_now(hour_consider_stable=hour_files_expected_to_be_accassible()): 225 | current_time = _now() 226 | 227 | if current_time.hour < hour_consider_stable: 228 | current_time = current_time - datetime.timedelta(hours=hour_consider_stable) 229 | return current_time 230 | 231 | 232 | def datetime_in_tlv(year, month, day, hour, minute, second): 233 | """return a datedatiem in tlv timezone""" 234 | return datetime.datetime( 235 | year, month, day, hour, minute, second, tzinfo=pytz.timezone("Asia/Jerusalem") 236 | ) 237 | 238 | 239 | def _is_saturday_in_israel(date=None): 240 | if not date: 241 | date = _now() 242 | return date.weekday() == 5 243 | 244 | 245 | def _is_friday_in_israel(): 246 | return _now().weekday() == 4 247 | 248 | 249 | def _is_weekend_in_israel(): 250 | return _is_friday_in_israel() or _is_saturday_in_israel() 251 | 252 | 253 | def _is_holiday_in_israel(): 254 | return _now().date() in holidays.CountryHoliday("IL") 255 | -------------------------------------------------------------------------------- /il_supermarket_scarper/engines/web.py: -------------------------------------------------------------------------------- 1 | import re 2 | from bs4 import BeautifulSoup 3 | from il_supermarket_scarper.utils import Logger, execute_in_parallel 4 | from il_supermarket_scarper.utils import convert_nl_size_to_bytes, UnitSize 5 | from .engine import Engine 6 | 7 | 8 | class WebBase(Engine): 9 | """scrape the file of websites that the only why to download them is via web""" 10 | 11 | def __init__(self, chain, chain_id, url, folder_name=None, max_threads=5): 12 | super().__init__(chain, chain_id, folder_name, max_threads=max_threads) 13 | self.url = url 14 | self.max_retry = 2 15 | 16 | def get_data_from_page(self, req_res): 17 | """get the file list from a page""" 18 | soup = BeautifulSoup(req_res.text, features="lxml") 19 | return soup.find_all("tr")[1:] 20 | 21 | def get_request_url( 22 | self, files_types=None, store_id=None, when_date=None 23 | ): # pylint: disable=unused-argument 24 | """get all links to collect download links from""" 25 | return [{"url": self.url, "method": "GET"}] 26 | 27 | def get_file_size_from_entry(self, entry): 28 | """ 29 | Extract file size from a table row entry. 30 | Looks for size information in table cells, typically in human-readable format. 31 | Returns size in bytes, or None if not found. 
32 | """ 33 | try: 34 | size_bytes = re.search(r"\b\d+(\.\d+)?\s*(KB|MB|GB)\b", entry.text) 35 | size_bytes = convert_nl_size_to_bytes( 36 | size_bytes.group(0), to_unit=UnitSize.BYTES 37 | ) 38 | return size_bytes 39 | except (AttributeError, TypeError) as e: 40 | Logger.debug(f"Error extracting file size from entry: {e}") 41 | return None 42 | 43 | def extract_task_from_entry(self, all_trs): 44 | """extract download links, file names, and file sizes from page list""" 45 | download_urls = [] 46 | file_names = [] 47 | file_sizes = [] 48 | for x in all_trs: 49 | try: 50 | download_urls.append(self.url + x.a.attrs["href"]) 51 | file_names.append(x.a.attrs["href"].split(".")[0].split("/")[-1]) 52 | file_sizes.append(self.get_file_size_from_entry(x)) 53 | except (AttributeError, KeyError, IndexError, TypeError) as e: 54 | Logger.warning(f"Error extracting task from entry: {e}") 55 | 56 | return download_urls, file_names, file_sizes 57 | 58 | def apply_limit_zip( 59 | self, 60 | file_names, 61 | download_urls, 62 | file_sizes=None, 63 | limit=None, 64 | files_types=None, 65 | by_function=lambda x: x[0], 66 | store_id=None, 67 | when_date=None, 68 | files_names_to_scrape=None, 69 | suppress_exception=False, 70 | ): 71 | """apply limit to zip""" 72 | # Handle both 2-tuple (backward compatibility) and 3-tuple formats 73 | if file_sizes is None: 74 | zipped = list(zip(file_names, download_urls)) 75 | else: 76 | zipped = list(zip(file_names, download_urls, file_sizes)) 77 | 78 | ziped = self.apply_limit( 79 | zipped, 80 | limit=limit, 81 | files_types=files_types, 82 | by_function=by_function, 83 | store_id=store_id, 84 | when_date=when_date, 85 | files_names_to_scrape=files_names_to_scrape, 86 | suppress_exception=suppress_exception, 87 | ) 88 | if len(ziped) == 0: 89 | if file_sizes is None: 90 | return [], [] 91 | return [], [], [] 92 | return list(zip(*ziped)) 93 | 94 | def filter_bad_files_zip( 95 | self, 96 | file_names, 97 | download_urls, 98 | file_sizes=None, 99 | filter_null=False, 100 | filter_zero=False, 101 | by_function=lambda x: x[0], 102 | ): 103 | """apply bad files filtering to zip""" 104 | # Handle both 2-tuple (backward compatibility) and 3-tuple formats 105 | if file_sizes is None: 106 | files = list(zip(file_names, download_urls)) 107 | else: 108 | files = list(zip(file_names, download_urls, file_sizes)) 109 | 110 | files = self.filter_bad_files( 111 | files, 112 | filter_null=filter_null, 113 | filter_zero=filter_zero, 114 | by_function=by_function, 115 | ) 116 | if len(files) == 0: 117 | if file_sizes is None: 118 | return [], [] 119 | return [], [], [] 120 | return list(zip(*files)) 121 | 122 | def collect_files_details_from_site( # pylint: disable=too-many-locals 123 | self, 124 | limit=None, 125 | files_types=None, 126 | store_id=None, 127 | when_date=None, 128 | filter_null=False, 129 | filter_zero=False, 130 | files_names_to_scrape=None, 131 | suppress_exception=False, 132 | min_size=None, 133 | max_size=None, 134 | ): 135 | """collect all enteris to download from site""" 136 | 137 | urls_to_collect_link_from = self.get_request_url( 138 | files_types=files_types, store_id=store_id, when_date=when_date 139 | ) 140 | assert len(urls_to_collect_link_from) > 0, "No pages to scrape" 141 | 142 | all_trs = [] 143 | for url in urls_to_collect_link_from: 144 | req_res = self.session_with_cookies_by_chain(**url) 145 | trs = self.get_data_from_page(req_res) 146 | all_trs.extend(trs) 147 | 148 | Logger.info(f"Found {len(all_trs)} entries") 149 | 150 | download_urls, file_names, 
file_sizes = self.extract_task_from_entry(all_trs) 151 | 152 | Logger.info(f"Found {len(download_urls)} download urls") 153 | 154 | # Filter by file size if specified 155 | if min_size is not None or max_size is not None: 156 | file_names, download_urls, file_sizes = self.filter_by_file_size( 157 | file_names, 158 | download_urls, 159 | file_sizes, 160 | min_size=min_size, 161 | max_size=max_size, 162 | ) 163 | 164 | file_names, download_urls, file_sizes = self.filter_bad_files_zip( 165 | file_names, 166 | download_urls, 167 | file_sizes=file_sizes, 168 | filter_null=filter_null, 169 | filter_zero=filter_zero, 170 | ) 171 | 172 | Logger.info(f"After filtering bad files: Found {len(download_urls)} files") 173 | 174 | # pylint: disable=duplicate-code 175 | file_names, download_urls, file_sizes = self.apply_limit_zip( 176 | file_names, 177 | download_urls, 178 | file_sizes=file_sizes, 179 | limit=limit, 180 | files_types=files_types, 181 | store_id=store_id, 182 | when_date=when_date, 183 | files_names_to_scrape=files_names_to_scrape, 184 | suppress_exception=suppress_exception, 185 | ) 186 | 187 | Logger.info(f"After applying limit: Found {len(download_urls)} entries") 188 | 189 | return download_urls, file_names 190 | 191 | def _scrape( 192 | self, 193 | limit=None, 194 | files_types=None, 195 | store_id=None, 196 | when_date=None, 197 | files_names_to_scrape=None, 198 | filter_null=False, 199 | filter_zero=False, 200 | suppress_exception=False, 201 | min_size=None, 202 | max_size=None, 203 | ): 204 | """scarpe the files from multipage sites""" 205 | download_urls, file_names = [], [] 206 | try: 207 | download_urls, file_names = self.collect_files_details_from_site( 208 | limit=limit, 209 | files_types=files_types, 210 | store_id=store_id, 211 | when_date=when_date, 212 | filter_null=filter_null, 213 | filter_zero=filter_zero, 214 | files_names_to_scrape=files_names_to_scrape, 215 | suppress_exception=suppress_exception, 216 | min_size=min_size, 217 | max_size=max_size, 218 | ) 219 | 220 | self.on_collected_details(file_names, download_urls) 221 | 222 | Logger.info(f"collected {len(download_urls)} to download.") 223 | if len(download_urls) > 0: 224 | results = execute_in_parallel( 225 | self.save_and_extract, 226 | list(zip(download_urls, file_names)), 227 | max_threads=self.max_threads, 228 | ) 229 | else: 230 | results = [] 231 | 232 | return results 233 | except Exception as e: # pylint: disable=broad-except 234 | self.on_download_fail(e, download_urls=download_urls, file_names=file_names) 235 | raise e 236 | -------------------------------------------------------------------------------- /il_supermarket_scarper/engines/cerberus.py: -------------------------------------------------------------------------------- 1 | import os 2 | import datetime 3 | 4 | from il_supermarket_scarper.utils import ( 5 | extract_xml_file_from_gz_file, 6 | Logger, 7 | execute_in_parallel, 8 | collect_from_ftp, 9 | fetch_temporary_gz_file_from_ftp, 10 | FileTypesFilters, 11 | ) 12 | from .engine import Engine 13 | 14 | 15 | class Cerberus(Engine): 16 | """scraper for all Cerberus base site. 
(seems like can't support historical data)""" 17 | 18 | target_file_extensions = ["xml", "gz"] 19 | utilize_date_param = False 20 | 21 | def __init__( 22 | self, 23 | chain, 24 | chain_id, 25 | folder_name=None, 26 | ftp_host="url.retail.publishedprices.co.il", 27 | ftp_path="/", 28 | ftp_username="", 29 | ftp_password="", 30 | max_threads=5, 31 | ): 32 | super().__init__(chain, chain_id, folder_name, max_threads) 33 | self.ftp_host = ftp_host 34 | self.ftp_path = ftp_path 35 | self.ftp_username = ftp_username 36 | self.ftp_password = ftp_password 37 | self.ftp_session = False 38 | 39 | def _scrape( 40 | self, 41 | limit=None, 42 | files_types=None, 43 | store_id=None, 44 | when_date=None, 45 | files_names_to_scrape=None, 46 | filter_null=False, 47 | filter_zero=False, 48 | suppress_exception=False, 49 | min_size=None, 50 | max_size=None, 51 | ): 52 | files = [] 53 | try: 54 | files = self.collect_files_details_from_site( 55 | limit=limit, 56 | files_types=files_types, 57 | filter_null=filter_null, 58 | filter_zero=filter_zero, 59 | store_id=store_id, 60 | when_date=when_date, 61 | files_names_to_scrape=files_names_to_scrape, 62 | suppress_exception=suppress_exception, 63 | min_size=min_size, 64 | max_size=max_size, 65 | ) 66 | self.on_collected_details(files) 67 | 68 | results = execute_in_parallel( 69 | self.persist_from_ftp, list(files), max_threads=self.max_threads 70 | ) 71 | return results 72 | except Exception as e: # pylint: disable=broad-except 73 | self.on_download_fail(e, file_names=files) 74 | raise e 75 | 76 | def get_type_pattern(self, files_types): 77 | """get the file type pattern""" 78 | file_type_mapping = { 79 | FileTypesFilters.STORE_FILE.name: "store", 80 | FileTypesFilters.PRICE_FILE.name: "price", 81 | FileTypesFilters.PROMO_FILE.name: "promo", 82 | FileTypesFilters.PRICE_FULL_FILE.name: "pricef", 83 | FileTypesFilters.PROMO_FULL_FILE.name: "promof", 84 | } 85 | if files_types is None or files_types == FileTypesFilters.all_types(): 86 | return [None] 87 | 88 | responses = [] 89 | for file_type in files_types: 90 | if file_type not in file_type_mapping: 91 | raise ValueError(f"File type {file_type} not supported") 92 | responses.append(file_type_mapping[file_type]) 93 | return responses 94 | 95 | def build_filter_arg(self, store_id=None, when_date=None, files_types=None): 96 | """build the filter arg for the ftp""" 97 | date_pattern = None 98 | if when_date and isinstance(when_date, datetime.datetime): 99 | date_pattern = when_date.strftime("%Y%m%d") 100 | 101 | for type_pattern in self.get_type_pattern(files_types): 102 | output_pattern = [] 103 | if type_pattern: 104 | output_pattern.append(type_pattern) 105 | if store_id: 106 | output_pattern.append(f"{store_id}-") 107 | if date_pattern: 108 | output_pattern.append(date_pattern) 109 | 110 | if len(output_pattern) == 0: 111 | yield None 112 | yield "*" + "*".join(output_pattern) + "*" 113 | 114 | def collect_files_details_from_site( # pylint: disable=too-many-locals 115 | self, 116 | limit=None, 117 | files_types=None, 118 | filter_null=False, 119 | filter_zero=False, 120 | store_id=None, 121 | when_date=None, 122 | files_names_to_scrape=None, 123 | suppress_exception=False, 124 | min_size=None, 125 | max_size=None, 126 | ): 127 | """collect all files to download from the site""" 128 | files = [] 129 | for filter_arg in self.build_filter_arg(store_id, when_date, files_types): 130 | filter_files = collect_from_ftp( 131 | self.ftp_host, 132 | self.ftp_username, 133 | self.ftp_password, 134 | self.ftp_path, 135 | 
arg=filter_arg, 136 | ) 137 | files.extend(filter_files) 138 | 139 | Logger.info(f"Found {len(files)} files") 140 | 141 | # Convert tuples to separate lists for base class filter_by_file_size method 142 | if min_size is not None or max_size is not None: 143 | file_names = [filename for filename, _ in files] 144 | download_urls = [""] * len(files) # FTP doesn't use URLs, use empty strings 145 | file_sizes = [size for _, size in files] 146 | file_names, download_urls, file_sizes = self.filter_by_file_size( 147 | file_names, 148 | download_urls, 149 | file_sizes, 150 | min_size=min_size, 151 | max_size=max_size, 152 | ) 153 | # Convert back to tuples 154 | files = list(zip(file_names, file_sizes)) 155 | 156 | files = self.filter_bad_files( 157 | files, 158 | filter_null=filter_null, 159 | filter_zero=filter_zero, 160 | by_function=lambda x: x[0], 161 | ) 162 | 163 | Logger.info(f"After filtering bad files: Found {len(files)} files") 164 | 165 | files = list( 166 | filter(lambda x: x[0].split(".")[-1] in self.target_file_extensions, files) 167 | ) 168 | Logger.info( 169 | f"After filtering by {self.target_file_extensions}: Found {len(files)} files" 170 | ) 171 | 172 | # apply noraml filter 173 | files = self.apply_limit( 174 | files, 175 | limit=limit, 176 | files_types=files_types, 177 | store_id=store_id, 178 | when_date=when_date, 179 | files_names_to_scrape=files_names_to_scrape, 180 | suppress_exception=suppress_exception, 181 | by_function=lambda x: x[0], 182 | ) 183 | Logger.info(f"After applying limit: Found {len(files)} files") 184 | 185 | # Extract just filenames for backward compatibility with persist_from_ftp 186 | return [filename for filename, _ in files] 187 | 188 | def persist_from_ftp(self, file_name): 189 | """download file to hard drive and extract it.""" 190 | downloaded = False 191 | extract_succefully = False 192 | restart_and_retry = False 193 | error = None 194 | try: 195 | ext = os.path.splitext(file_name)[1] 196 | if ext not in [".gz", ".xml"]: 197 | raise ValueError(f"File {file_name} extension is not .gz or .xml") 198 | 199 | Logger.debug(f"Start persisting file {file_name}") 200 | temporary_gz_file_path = os.path.join(self.storage_path, file_name) 201 | 202 | fetch_temporary_gz_file_from_ftp( 203 | self.ftp_host, 204 | self.ftp_username, 205 | self.ftp_password, 206 | self.ftp_path, 207 | temporary_gz_file_path, 208 | timeout=30, 209 | ) 210 | downloaded = True 211 | 212 | if ext == ".gz": 213 | Logger.debug( 214 | f"File size is {os.path.getsize(temporary_gz_file_path)} bytes." 
215 | ) 216 | extract_xml_file_from_gz_file(temporary_gz_file_path) 217 | 218 | Logger.debug(f"Done persisting file {file_name}") 219 | extract_succefully = True 220 | except Exception as exception: # pylint: disable=broad-except 221 | Logger.error( 222 | f"Error downloading {file_name},extract_succefully={extract_succefully}" 223 | f",downloaded={downloaded}" 224 | ) 225 | Logger.error_execption(exception) 226 | error = str(exception) 227 | restart_and_retry = True 228 | finally: 229 | if ext == ".gz" and os.path.exists(temporary_gz_file_path): 230 | os.remove(temporary_gz_file_path) 231 | 232 | return { 233 | "file_name": file_name, 234 | "downloaded": downloaded, 235 | "extract_succefully": extract_succefully, 236 | "restart_and_retry": restart_and_retry, 237 | "error": error, 238 | } 239 | -------------------------------------------------------------------------------- /il_supermarket_scarper/engines/multipage_web.py: -------------------------------------------------------------------------------- 1 | from urllib.parse import urlsplit 2 | import re 3 | import ntpath 4 | from abc import abstractmethod 5 | from lxml import html as lxml_html 6 | 7 | 8 | from il_supermarket_scarper.utils import ( 9 | Logger, 10 | execute_in_parallel, 11 | multiple_page_aggregtion, 12 | convert_nl_size_to_bytes, 13 | UnitSize, 14 | ) 15 | from .web import WebBase 16 | 17 | 18 | class MultiPageWeb(WebBase): 19 | """scrape the file of websites with multipage""" 20 | 21 | target_file_extension = ".xml" 22 | results_in_page = 20 23 | 24 | def __init__( 25 | self, 26 | chain, 27 | chain_id, 28 | url, 29 | folder_name=None, 30 | total_page_xpath="""//*[@id="gridContainer"]/table/ 31 | tfoot/tr/td/a[6]/@href""", 32 | total_pages_pattern=r"^\/\?page\=([0-9]{3})$", 33 | page_argument="page", 34 | max_threads=5, 35 | ): 36 | super().__init__( 37 | chain, chain_id, url=url, folder_name=folder_name, max_threads=max_threads 38 | ) 39 | self.total_page_xpath = total_page_xpath 40 | self.total_pages_pattern = total_pages_pattern 41 | self.page_argument = page_argument 42 | 43 | @abstractmethod 44 | def build_params(self, files_types=None, store_id=None, when_date=None): 45 | """build the params for the request""" 46 | 47 | def get_request_url( 48 | self, files_types=None, store_id=None, when_date=None 49 | ): # pylint: disable=unused-argument 50 | """get all links to collect download links from""" 51 | 52 | results = [] 53 | for arguments in self.build_params( 54 | files_types=files_types, store_id=store_id, when_date=when_date 55 | ): 56 | results.append( 57 | { 58 | "url": self.url + arguments, 59 | "method": "GET", 60 | } 61 | ) 62 | return results 63 | 64 | def get_number_of_pages(self, response): 65 | """get the number of pages to scarpe""" 66 | 67 | html_body = lxml_html.fromstring(response.content) 68 | 69 | elements = html_body.xpath(self.total_page_xpath) 70 | 71 | if len(elements) == 0: 72 | return None # only one page 73 | 74 | pages = re.findall( 75 | self.total_pages_pattern, 76 | elements[-1], 77 | ) 78 | return int(pages[0]) 79 | 80 | def collect_files_details_from_site( # pylint: disable=too-many-locals 81 | self, 82 | limit=None, 83 | files_types=None, 84 | store_id=None, 85 | when_date=None, 86 | filter_null=False, 87 | filter_zero=False, 88 | files_names_to_scrape=None, 89 | suppress_exception=False, 90 | min_size=None, 91 | max_size=None, 92 | ): 93 | 94 | main_page_requests = self.get_request_url( 95 | files_types=files_types, store_id=store_id, when_date=when_date 96 | ) 97 | assert 
len(main_page_requests) > 0, "No pages to scrape" 98 | 99 | download_urls = [] 100 | file_names = [] 101 | file_sizes = [] 102 | for main_page_request in main_page_requests: 103 | 104 | main_page_response = self.session_with_cookies_by_chain(**main_page_request) 105 | 106 | total_pages = self.get_number_of_pages(main_page_response) 107 | Logger.info(f"Found {total_pages} pages") 108 | 109 | # if there is only one page, call it again, 110 | # in the future, we can skip scrap it again 111 | if total_pages is None: 112 | pages_to_scrape = [main_page_request] 113 | else: 114 | pages_to_scrape = list( 115 | map( 116 | lambda page_number, req=main_page_request: { 117 | **req, 118 | "url": req["url"] 119 | + f"{self.page_argument}=" 120 | + str(page_number), 121 | }, 122 | range(1, total_pages + 1), 123 | ) 124 | ) 125 | 126 | _download_urls, _file_names, _file_sizes = execute_in_parallel( 127 | self.process_links_before_download, 128 | list(pages_to_scrape), 129 | aggregtion_function=multiple_page_aggregtion, 130 | max_threads=self.max_threads, 131 | ) 132 | 133 | download_urls.extend(_download_urls) 134 | file_names.extend(_file_names) 135 | file_sizes.extend( 136 | _file_sizes if _file_sizes else [None] * len(_download_urls) 137 | ) 138 | 139 | Logger.info(f"Found {len(download_urls)} files") 140 | 141 | # Filter by file size if specified 142 | if min_size is not None or max_size is not None: 143 | file_names, download_urls, file_sizes = self.filter_by_file_size( 144 | file_names, 145 | download_urls, 146 | file_sizes, 147 | min_size=min_size, 148 | max_size=max_size, 149 | ) 150 | 151 | file_names, download_urls, file_sizes = self.filter_bad_files_zip( 152 | file_names, 153 | download_urls, 154 | file_sizes=file_sizes, 155 | filter_null=filter_null, 156 | filter_zero=filter_zero, 157 | ) 158 | 159 | Logger.info(f"After filtering bad files: Found {len(download_urls)} files") 160 | 161 | file_names, download_urls, file_sizes = self.apply_limit_zip( 162 | file_names, 163 | download_urls, 164 | file_sizes=file_sizes, 165 | limit=limit, 166 | files_types=files_types, 167 | store_id=store_id, 168 | when_date=when_date, 169 | files_names_to_scrape=files_names_to_scrape, 170 | suppress_exception=suppress_exception, 171 | ) 172 | 173 | return download_urls, file_names 174 | 175 | def get_file_size_from_entry( 176 | self, html, link_element 177 | ): # pylint: disable=arguments-differ,unused-argument 178 | """ 179 | Extract file size from HTML element. 180 | For MultiPageWeb, we need to find the size in the same row as the link. 181 | Returns size in bytes, or None if not found. 
182 | """ 183 | try: 184 | # Find the parent row of the link 185 | row = ( 186 | link_element.getparent().getparent() 187 | if link_element.getparent() 188 | else None 189 | ) 190 | if row is None: 191 | return None 192 | 193 | # Look for size in table cells - typically in a column after the link 194 | cells = row.xpath(".//td") 195 | for cell in cells: 196 | text = cell.text_content().strip() if cell.text_content() else "" 197 | # Parse size using the same logic as WebBase 198 | size_bytes = convert_nl_size_to_bytes(text, to_unit=UnitSize.BYTES) 199 | if size_bytes is not None: 200 | return size_bytes 201 | except (AttributeError, TypeError) as e: 202 | Logger.debug(f"Error extracting file size from entry: {e}") 203 | return None 204 | 205 | def collect_files_details_from_page(self, html): 206 | """collect the details deom one page""" 207 | links = [] 208 | filenames = [] 209 | file_sizes = [] 210 | # Select all rows from the table 211 | rows = html.xpath('//*[@id="gridContainer"]/table/tbody/tr') 212 | for row in rows: 213 | # Extract link from td[1]/a 214 | link_elements = row.xpath("./td[1]/a") 215 | if not link_elements: 216 | continue 217 | link_element = link_elements[0] 218 | link = link_element.get("href") 219 | if not link: 220 | continue 221 | 222 | # Extract size from td[3] (size column) 223 | size_elements = row.xpath("./td[3]") 224 | size_text = size_elements[0].text_content().strip() if size_elements else "" 225 | size_bytes = ( 226 | convert_nl_size_to_bytes(size_text, to_unit=UnitSize.BYTES) 227 | if size_text 228 | else None 229 | ) 230 | 231 | links.append(link) 232 | filenames.append(ntpath.basename(urlsplit(link).path)) 233 | file_sizes.append(size_bytes) 234 | return links, filenames, file_sizes 235 | 236 | def process_links_before_download( 237 | self, 238 | request, 239 | limit=None, 240 | files_types=None, 241 | store_id=None, 242 | when_date=None, 243 | suppress_exception=True, # this is nested limit don't fail 244 | ): 245 | """additional processing to the links before download""" 246 | response = self.session_with_cookies_by_chain(**request) 247 | 248 | html = lxml_html.fromstring(response.text) 249 | 250 | file_links, filenames, file_sizes = self.collect_files_details_from_page(html) 251 | Logger.info(f"Page {request}: Found {len(file_links)} files") 252 | 253 | filenames, file_links, file_sizes = self.apply_limit_zip( 254 | filenames, 255 | file_links, 256 | file_sizes=file_sizes, 257 | limit=limit, 258 | files_types=files_types, 259 | store_id=store_id, 260 | when_date=when_date, 261 | suppress_exception=suppress_exception, 262 | ) 263 | 264 | Logger.info( 265 | f"After applying limit: Page {request}: " 266 | f"Found {len(file_links)} line and {len(filenames)} files" 267 | ) 268 | 269 | return file_links, filenames, file_sizes 270 | -------------------------------------------------------------------------------- /il_supermarket_scarper/scrappers/tests/test_cases.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=too-many-statements 2 | import unittest 3 | import tempfile 4 | import re 5 | import os 6 | import uuid 7 | import xml.etree.ElementTree as ET 8 | from lxml import etree 9 | from il_supermarket_scarper.utils import ( 10 | FileTypesFilters, 11 | Logger, 12 | DumpFolderNames, 13 | _testing_now, 14 | change_xml_encoding, 15 | ) 16 | from il_supermarket_scarper.scrappers_factory import ScraperFactory 17 | from il_supermarket_scarper.scraper_stability import ScraperStability 18 | 19 | 20 | def 
make_test_case(scraper_enum, store_id): 21 | """create test suite for scraper""" 22 | 23 | class TestScapers(unittest.TestCase): 24 | """class with all the tests for scraper""" 25 | 26 | def __init__(self, name) -> None: 27 | super().__init__(name) 28 | self.scraper_enum = scraper_enum 29 | self.folder_name = "temp" 30 | 31 | def _delete_folder_and_sub_folder(self, download_path): 32 | """delete a folder and all sub-folder""" 33 | files_found = os.listdir(download_path) 34 | for file in files_found: 35 | file_path = os.path.join(download_path, file) 36 | if os.path.isdir(file_path): 37 | self._delete_folder_and_sub_folder(file_path) 38 | os.rmdir(file_path) 39 | else: 40 | os.remove(file_path) 41 | 42 | def _delete_download_folder(self, download_path): 43 | """delete the download folder""" 44 | if os.path.isdir(download_path): 45 | self._delete_folder_and_sub_folder(download_path) 46 | os.removedirs(download_path) 47 | 48 | def _make_sure_filter_work( 49 | self, 50 | files_found, 51 | file_type=None, 52 | limit=None, 53 | store_id=None, 54 | when_date=None, 55 | ): 56 | """make sure the file type filter works""" 57 | # make sure the file type is applied 58 | if file_type: 59 | filtered_files = 0 60 | for f_type in file_type: 61 | filtered_files += len(FileTypesFilters.filter(f_type, files_found)) 62 | assert len(files_found) == filtered_files 63 | 64 | # check the store id is applied 65 | if store_id: 66 | for file in files_found: 67 | assert re.compile(rf"-0*{store_id}-").search(file) 68 | 69 | # check the date time stamp is applied 70 | if when_date: 71 | for file in files_found: 72 | assert ( 73 | when_date.strftime("%Y%m%d") in file 74 | ), f"{when_date} not in {file}" 75 | 76 | # check limit 77 | assert ( 78 | limit is None or len(files_found) == limit 79 | ), f""" Found {files_found} f"files but should be {limit}""" 80 | 81 | def _make_sure_file_contain_chain_ids(self, chain_ids, file): 82 | """make sure the scraper download only the chain id""" 83 | found_chain_id = False 84 | for possible_chain_ids in chain_ids: 85 | if possible_chain_ids in file: 86 | found_chain_id = True 87 | assert found_chain_id, f"should be one of {chain_ids} but {file}" 88 | 89 | def _make_sure_file_extension_is_xml(self, file_name): 90 | """make sure the file extension is xml""" 91 | file_ext = file_name.split(".")[-1] 92 | assert file_ext == "xml", f" should be xml but {file_ext}, file:{file_name}" 93 | 94 | def _try_to_recover_xml(self, file_path): 95 | """try to recover the xml""" 96 | parser = etree.XMLParser(recover=True, encoding="utf-8") 97 | with open(file_path, "rb") as f: 98 | tree = etree.parse(f, parser) 99 | fixed_xml = etree.tostring( 100 | tree, pretty_print=True, encoding="utf-8" 101 | ).decode("utf-8") 102 | 103 | with open(file_path, "w", encoding="utf-8") as f: 104 | f.write(fixed_xml) 105 | 106 | def _make_sure_file_is_xml_readable(self, full_file_path): 107 | """Ensure the file is a valid XML and readable.""" 108 | try: 109 | ET.parse(full_file_path) 110 | except ET.ParseError: 111 | try: 112 | self._try_to_recover_xml(full_file_path) 113 | ET.parse(full_file_path) 114 | except ET.ParseError: 115 | change_xml_encoding(full_file_path) 116 | ET.parse(full_file_path) 117 | 118 | def _clean_scarpe_delete( 119 | self, 120 | scraper_enum, 121 | store_id=None, 122 | limit=None, 123 | file_type=None, 124 | when_date=None, 125 | ): 126 | with tempfile.TemporaryDirectory() as tmpdirname: 127 | self.__clean_scarpe_delete( 128 | scraper_enum=scraper_enum, 129 | dump_path=tmpdirname, 130 | 
store_id=store_id, 131 | limit=limit, 132 | file_type=file_type, 133 | when_date=when_date, 134 | ) 135 | 136 | def __clean_scarpe_delete( 137 | self, 138 | scraper_enum, 139 | dump_path="temp", 140 | store_id=None, 141 | limit=None, 142 | file_type=None, 143 | when_date=None, 144 | ): 145 | self._delete_download_folder(dump_path) 146 | os.makedirs(dump_path) 147 | init_scraper_function = ScraperFactory.get(scraper_enum) 148 | 149 | if init_scraper_function is None: 150 | Logger.warning(f"{scraper_enum} is disabled.") 151 | else: 152 | try: 153 | scraper = init_scraper_function(folder_name=dump_path) 154 | 155 | kwarg = { 156 | "limit": limit, 157 | "files_types": file_type, 158 | "store_id": store_id, 159 | "when_date": when_date, 160 | "filter_null": True, 161 | "filter_zero": True, 162 | "suppress_exception": True, 163 | "min_size": 100, 164 | "max_size": 10000000, 165 | } 166 | 167 | scraper.scrape(**kwarg) 168 | 169 | files_found = os.listdir(dump_path) 170 | assert ( 171 | len(files_found) == 2 172 | ), "expected exactly two entries: the chain dump folder and the status folder" 173 | assert DumpFolderNames[scraper_enum.name].value in files_found 174 | 175 | download_path = os.path.join( 176 | dump_path, DumpFolderNames[scraper_enum.name].value 177 | ) 178 | files_found = os.listdir(download_path) 179 | 180 | if not ScraperStability.is_validate_scraper_found_no_files( 181 | scraper_enum.name, 182 | limit=limit, 183 | files_types=file_type, 184 | store_id=store_id, 185 | when_date=when_date, 186 | utilize_date_param=scraper_enum.value.utilize_date_param, 187 | ): 188 | self._make_sure_filter_work( 189 | files_found, 190 | file_type=file_type, 191 | limit=limit, 192 | store_id=store_id, 193 | when_date=when_date, 194 | ) 195 | 196 | for file in files_found: 197 | self._make_sure_file_contain_chain_ids( 198 | scraper.get_chain_id(), file 199 | ) 200 | self._make_sure_file_extension_is_xml(file) 201 | 202 | self._make_sure_file_is_xml_readable( 203 | os.path.join(download_path, file) 204 | ) 205 | finally: 206 | self._delete_download_folder(dump_path) 207 | 208 | def _get_temp_folder(self): 209 | """get a temp folder to download the files into""" 210 | return self.folder_name + str(uuid.uuid4().hex) 211 | 212 | def test_scrape_one(self): 213 | """scrape one file and make sure it exists""" 214 | self._clean_scarpe_delete(scraper_enum, limit=1) 215 | 216 | def test_scrape_three(self): 217 | """scrape three files and make sure they exist""" 218 | self._clean_scarpe_delete(scraper_enum, limit=3) 219 | 220 | def test_scrape_promo(self): 221 | """scrape one promo file and make sure it exists""" 222 | self._clean_scarpe_delete( 223 | scraper_enum, 224 | limit=1, 225 | file_type=FileTypesFilters.only_promo(), 226 | ) 227 | 228 | def test_scrape_store(self): 229 | """scrape one store file and make sure it exists""" 230 | self._clean_scarpe_delete( 231 | scraper_enum, limit=1, file_type=FileTypesFilters.only_store() 232 | ) 233 | 234 | def test_scrape_price(self): 235 | """scrape one price file and make sure it exists""" 236 | self._clean_scarpe_delete( 237 | scraper_enum, limit=1, file_type=FileTypesFilters.only_price() 238 | ) 239 | 240 | def test_scrape_file_from_single_store(self): 241 | """test fetching only files from a specific store""" 242 | self._clean_scarpe_delete(scraper_enum, store_id=store_id, limit=1) 243 | 244 | def test_scrape_file_today(self): 245 | """test fetching files from today""" 246 | self._clean_scarpe_delete(scraper_enum, when_date=_testing_now(), limit=1) 247 | 248 | return TestScapers 249 |
--------------------------------------------------------------------------------
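Usage note: make_test_case returns a fully formed unittest.TestCase subclass per scraper, so a test module only needs to bind the generated classes to module-level names for unittest/pytest discovery to collect them. The sketch below is illustrative only and is not the repository's actual test_all.py (whose wiring may differ); it assumes ScraperFactory is an iterable Enum of scrapers and uses a placeholder store_id.

# Illustrative sketch only -- assumes ScraperFactory is an iterable Enum and
# uses a placeholder store_id of 1; the real wiring presumably lives in
# il_supermarket_scarper/scrappers/tests/test_all.py and may differ.
from il_supermarket_scarper.scrappers_factory import ScraperFactory
from il_supermarket_scarper.scrappers.tests.test_cases import make_test_case

for scraper_enum in ScraperFactory:
    # bind each generated TestCase subclass to a module-level name so test
    # discovery picks it up, e.g. TestBareket, TestShufersal, ...
    globals()[f"Test{scraper_enum.name.title()}"] = make_test_case(
        scraper_enum, store_id=1
    )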