├── .dockerignore ├── .flake8 ├── .gitattributes ├── .gitignore ├── .pylintrc ├── .streamlit └── config.toml ├── Dockerfile ├── LICENSE ├── Makefile ├── Packages.md ├── README.md ├── Virtualenv.md ├── packages.txt ├── requirements.txt ├── resources ├── python.png ├── selenium.png ├── selenium_base.png └── streamlit.png ├── scratchpad ├── .dockerignore ├── Dockerfile ├── README.md ├── packages.txt ├── proxies.ipynb ├── requirements.txt └── selenium.ipynb └── streamlit_app.py /.dockerignore: -------------------------------------------------------------------------------- 1 | ### Python ### 2 | # Byte-compiled / optimized / DLL files 3 | __pycache__/ 4 | *.py[cod] 5 | *$py.class 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | wheels/ 22 | pip-wheel-metadata/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # Environments 30 | .env 31 | .venv 32 | env/ 33 | venv/ 34 | 35 | .vscode/ 36 | *.exe 37 | *.md 38 | *.zip 39 | *.gz 40 | *.log 41 | LICENSE 42 | Makefile 43 | .git/ 44 | .gitignore 45 | .gitattributes 46 | .streamlit/secrets.toml 47 | .flake8 48 | .pylintrc 49 | scratchpad/ 50 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = E501,E128 -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Only use LF line endings of our bash scripts and txt files 2 | *.sh -lf 3 | *.txt -lf 4 | .env -lf 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # File created using '.gitignore Generator' for Visual Studio Code: https://bit.ly/vscode-gig 2 | 3 | # Created by https://www.toptal.com/developers/gitignore/api/windows,visualstudiocode,jupyternotebooks,python 4 | # Edit at https://www.toptal.com/developers/gitignore?templates=windows,visualstudiocode,jupyternotebooks,python 5 | 6 | ### JupyterNotebooks ### 7 | # gitignore template for Jupyter Notebooks 8 | # website: http://jupyter.org/ 9 | 10 | .ipynb_checkpoints 11 | */.ipynb_checkpoints/* 12 | 13 | # IPython 14 | profile_default/ 15 | ipython_config.py 16 | 17 | # Remove previous ipynb_checkpoints 18 | # git rm -r .ipynb_checkpoints/ 19 | 20 | ### Python ### 21 | # Byte-compiled / optimized / DLL files 22 | __pycache__/ 23 | *.py[cod] 24 | *$py.class 25 | 26 | # C extensions 27 | *.so 28 | 29 | # Distribution / packaging 30 | .Python 31 | build/ 32 | develop-eggs/ 33 | dist/ 34 | downloads/ 35 | eggs/ 36 | .eggs/ 37 | lib/ 38 | lib64/ 39 | parts/ 40 | sdist/ 41 | var/ 42 | wheels/ 43 | share/python-wheels/ 44 | *.egg-info/ 45 | .installed.cfg 46 | *.egg 47 | MANIFEST 48 | 49 | # PyInstaller 50 | # Usually these files are written by a python script from a template 51 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
52 | *.manifest 53 | *.spec 54 | 55 | # Installer logs 56 | pip-log.txt 57 | pip-delete-this-directory.txt 58 | 59 | # Unit test / coverage reports 60 | htmlcov/ 61 | .tox/ 62 | .nox/ 63 | .coverage 64 | .coverage.* 65 | .cache 66 | nosetests.xml 67 | coverage.xml 68 | *.cover 69 | *.py,cover 70 | .hypothesis/ 71 | .pytest_cache/ 72 | cover/ 73 | 74 | # Translations 75 | *.mo 76 | *.pot 77 | 78 | # Django stuff: 79 | *.log 80 | local_settings.py 81 | db.sqlite3 82 | db.sqlite3-journal 83 | 84 | # Flask stuff: 85 | instance/ 86 | .webassets-cache 87 | 88 | # Scrapy stuff: 89 | .scrapy 90 | 91 | # Sphinx documentation 92 | docs/_build/ 93 | 94 | # PyBuilder 95 | .pybuilder/ 96 | target/ 97 | 98 | # Jupyter Notebook 99 | 100 | # IPython 101 | 102 | # pyenv 103 | # For a library or package, you might want to ignore these files since the code is 104 | # intended to run in multiple environments; otherwise, check them in: 105 | # .python-version 106 | 107 | # pipenv 108 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 109 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 110 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 111 | # install all needed dependencies. 112 | #Pipfile.lock 113 | 114 | # poetry 115 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 116 | # This is especially recommended for binary packages to ensure reproducibility, and is more 117 | # commonly ignored for libraries. 118 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 119 | #poetry.lock 120 | 121 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 122 | __pypackages__/ 123 | 124 | # Celery stuff 125 | celerybeat-schedule 126 | celerybeat.pid 127 | 128 | # SageMath parsed files 129 | *.sage.py 130 | 131 | # Environments 132 | .env 133 | .venv 134 | env/ 135 | venv/ 136 | ENV/ 137 | env.bak/ 138 | venv.bak/ 139 | 140 | # Spyder project settings 141 | .spyderproject 142 | .spyproject 143 | 144 | # Rope project settings 145 | .ropeproject 146 | 147 | # mkdocs documentation 148 | /site 149 | 150 | # mypy 151 | .mypy_cache/ 152 | .dmypy.json 153 | dmypy.json 154 | 155 | # Pyre type checker 156 | .pyre/ 157 | 158 | # pytype static type analyzer 159 | .pytype/ 160 | 161 | # Cython debug symbols 162 | cython_debug/ 163 | 164 | # PyCharm 165 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 166 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 167 | # and can be added to the global gitignore or merged into this file. For a more nuclear 168 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
169 | .idea/ 170 | 171 | ### VisualStudioCode ### 172 | .vscode/ 173 | 174 | # Local History for Visual Studio Code 175 | .history/ 176 | 177 | # Built Visual Studio Code Extensions 178 | *.vsix 179 | 180 | ### VisualStudioCode Patch ### 181 | # Ignore all local history of files 182 | .history 183 | .ionide 184 | 185 | # Support for Project snippet scope 186 | 187 | ### Windows ### 188 | # Windows thumbnail cache files 189 | Thumbs.db 190 | Thumbs.db:encryptable 191 | ehthumbs.db 192 | ehthumbs_vista.db 193 | 194 | # Dump file 195 | *.stackdump 196 | 197 | # Folder config file 198 | [Dd]esktop.ini 199 | 200 | # Recycle Bin used on file shares 201 | $RECYCLE.BIN/ 202 | 203 | # Windows Installer files 204 | *.cab 205 | *.msi 206 | *.msix 207 | *.msm 208 | *.msp 209 | 210 | # Windows shortcuts 211 | *.lnk 212 | 213 | # End of https://www.toptal.com/developers/gitignore/api/windows,visualstudiocode,jupyternotebooks,python 214 | 215 | # Custom rules (everything added below won't be overriden by 'Generate .gitignore File' if you use 'Update' option) 216 | 217 | -------------------------------------------------------------------------------- /.pylintrc: -------------------------------------------------------------------------------- 1 | [MAIN] 2 | disable= 3 | C0114, # missing-module-docstring 4 | C0115, # missing-class-docstring 5 | C0116, # missing-function-docstring 6 | R0903, # too-few-public-methods 7 | ; C0301, # line-too-long 8 | 9 | [FORMAT] 10 | max-line-length=140 11 | -------------------------------------------------------------------------------- /.streamlit/config.toml: -------------------------------------------------------------------------------- 1 | # # this is needed for local development with docker 2 | # [server] 3 | # # if you don't want to start the default browser: 4 | # headless = true 5 | # # you will need this for local development: 6 | # runOnSave = true 7 | # # you will need this if running docker on windows host: 8 | # fileWatcherType = "poll" 9 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # this base image seems to be quite similar to the streamlit cloud environment 2 | FROM python:3.11-slim-bullseye 3 | 4 | ENV PYTHONUNBUFFERED=1 \ 5 | PYTHONDONTWRITEBYTECODE=1 \ 6 | PIP_NO_CACHE_DIR=1 \ 7 | PIP_DISABLE_PIP_VERSION_CHECK=1 \ 8 | PIP_DEFAULT_TIMEOUT=120 \ 9 | LC_ALL=C.UTF-8 \ 10 | LANG=C.UTF-8 11 | 12 | # we need some build tools for installing additional python pip packages 13 | RUN apt-get update \ 14 | && apt-get install --yes \ 15 | software-properties-common \ 16 | build-essential \ 17 | gcc \ 18 | g++ \ 19 | cmake \ 20 | git \ 21 | curl \ 22 | python3-dev 23 | 24 | WORKDIR /app 25 | 26 | # if we have a packages.txt, install it 27 | COPY packages.txt packages.txt 28 | RUN xargs -a packages.txt apt-get install --yes 29 | 30 | RUN pip install --no-cache-dir --upgrade pip setuptools wheel uv 31 | COPY requirements.txt requirements.txt 32 | RUN uv pip install --system --no-cache -r requirements.txt 33 | 34 | EXPOSE 8501 35 | 36 | HEALTHCHECK --interval=1m --timeout=20s \ 37 | CMD curl --fail http://localhost:8501/_stcore/health 38 | 39 | COPY . . 40 | 41 | CMD ["streamlit", "run", "streamlit_app.py"] 42 | 43 | # docker build --progress=plain --tag streamlit-selenium:latest . 
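# note: the -v ${pwd}:/app variants below mount the local working directory over the image's /app,
# so code changes are picked up without rebuilding (see the runOnSave hints in .streamlit/config.toml)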
44 | # docker run -ti -p 8501:8501 --rm streamlit-selenium:latest /bin/bash 45 | # docker run -ti -p 8501:8501 --rm streamlit-selenium:latest 46 | # docker run -ti -p 8501:8501 -v ${pwd}:/app --rm streamlit-selenium:latest 47 | # docker run -ti -p 8501:8501 -v ${pwd}:/app --rm streamlit-selenium:latest /bin/bash 48 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Franky1 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: all update venv venvupdate docker cleanpy cleanvenv cleanall 2 | 3 | # run one shell only 4 | .ONESHELL: all update venv venvupdate docker cleanpy cleanvenv cleanall 5 | 6 | # disable running of targets in parallel 7 | .NOTPARALLEL: all update venv venvupdate docker cleanpy cleanvenv cleanall 8 | 9 | # predefined variables 10 | CURRDIRECTORY := "$(notdir $(CURDIR))" 11 | DOCKERTAG := "$(shell python -c "print('$(CURRDIRECTORY)'.lower())"):latest" 12 | 13 | # check if os is windows or linux/mac 14 | ifeq ($(OS),Windows_NT) 15 | # windows 16 | # set python executable path for python virtualenv 17 | PYTHONVENV := .venv/Scripts/ 18 | PYTHONVENVEXE := .venv/Scripts/python.exe 19 | else 20 | # linux or mac 21 | # set python executable path for python virtualenv 22 | PYTHONVENV := .venv/bin/ 23 | PYTHONVENVEXE := .venv/bin/python 24 | endif 25 | 26 | # default target 27 | all: cleanpy update venv 28 | @echo 29 | @echo "******************* all FINISHED *******************" 30 | @echo 31 | 32 | # local update of pip/virtualenv 33 | update: 34 | @echo "+++++++++++++++++++ update START +++++++++++++++++++" 35 | @echo 36 | python -m pip install --upgrade pip setuptools wheel poetry virtualenv uv ruff 37 | @echo 38 | @echo "******************* update FINISHED *******************" 39 | @echo 40 | 41 | # target for bulding the python venv 42 | venv: 43 | @echo "+++++++++++++++++++ virtualenv venv START +++++++++++++++++++" 44 | @echo 45 | @echo "Local Python Version..." 46 | python --version 47 | which python 48 | @echo 49 | @echo "Make Virtual Environment..." 
50 | # python -m venv .venv --clear --upgrade-deps 51 | python -m uv venv --seed 52 | @echo 53 | @echo "Check Virtual Environment Python Version..." 54 | $(PYTHONVENVEXE) --version 55 | $(PYTHONVENVEXE) -c "import sys; print(sys.executable)" 56 | @echo 57 | @echo "Install/Update venv dependencies..." 58 | # $(PYTHONVENVEXE) -m pip install --upgrade pip setuptools wheel poetry 59 | uv pip install --upgrade pip setuptools wheel poetry jupyter 60 | @echo 61 | @echo "Install project dependencies..." 62 | # $(PYTHONVENVEXE) -m pip install --upgrade -r requirements.txt 63 | uv pip install --upgrade --requirement requirements.txt 64 | @echo 65 | @echo "Check for outdated dependencies and just list them..." 66 | $(PYTHONVENVEXE) -m pip list --outdated 67 | @echo 68 | @echo "******************* virtualenv venv FINISHED *******************" 69 | @echo 70 | 71 | # target for upgrading venv 72 | venvupdate: 73 | @echo "+++++++++++++++++++ venvupdate START +++++++++++++++++++" 74 | @echo 75 | @echo "Check Virtual Environment Python Version..." 76 | $(PYTHONVENVEXE) --version 77 | $(PYTHONVENVEXE) -c "import sys; print(sys.executable)" 78 | @echo 79 | @echo "Update venv dependencies..." 80 | # $(PYTHONVENVEXE) -m pip install --upgrade pip setuptools wheel poetry 81 | uv pip install --upgrade pip setuptools wheel poetry jupyter 82 | @echo 83 | @echo "Update project dependencies..." 84 | # $(PYTHONVENVEXE) -m pip install --upgrade -r requirements.txt 85 | uv pip install --upgrade --requirement requirements.txt 86 | @echo 87 | @echo "Check for outdated dependencies and just list them..." 88 | $(PYTHONVENVEXE) -m pip list --outdated 89 | @echo 90 | @echo "******************* venvupdate FINISHED *******************" 91 | @echo 92 | 93 | # build docker image 94 | docker: 95 | @echo "+++++++++++++++++++ docker START +++++++++++++++++++" 96 | @echo 97 | @echo "Build docker image with TAG: $(DOCKERTAG)" 98 | @echo 99 | docker build --pull --progress=plain --tag $(DOCKERTAG) . 100 | @echo 101 | @echo "******************* docker FINISHED *******************" 102 | @echo 103 | 104 | # remove cache files 105 | cleanpy: 106 | @echo "+++++++++++++++++++ cleanpy START +++++++++++++++++++" 107 | @echo 108 | rm -rf __pycache__ 109 | @echo 110 | @echo "******************* cleanpy FINISHED *******************" 111 | @echo 112 | 113 | # remove venv 114 | cleanvenv: 115 | @echo "+++++++++++++++++++ cleanvenv START +++++++++++++++++++" 116 | @echo 117 | rm -rf .venv 118 | @echo 119 | @echo "******************* cleanvenv FINISHED *******************" 120 | @echo 121 | 122 | # remove docker image and dangling layers 123 | cleandocker: 124 | @echo "+++++++++++++++++++ cleandocker START +++++++++++++++++++" 125 | @echo 126 | docker image rm -f $(DOCKERTAG) 127 | docker builder prune -a -f 128 | @echo 129 | @echo "******************* cleandocker FINISHED *******************" 130 | @echo 131 | 132 | # clean all 133 | cleanall: cleanpy cleanvenv 134 | @echo 135 | @echo "******************* cleanall FINISHED *******************" 136 | @echo 137 | -------------------------------------------------------------------------------- /Packages.md: -------------------------------------------------------------------------------- 1 | # Packages 2 | 3 | Just a scratchpad of useful commands regarding packages. 
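
## check installed browser and driver versions

The browser and its driver must have matching versions; a quick check (the same commands `streamlit_app.py` runs via `subprocess`):

```sh
chromium --version
chromedriver --version
```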
4 | 5 | ## search for apt packages 6 | 7 | ```sh 8 | apt update 9 | apt-cache search chrome 10 | apt-cache search chromium 11 | apt-cache search firefox 12 | apt-cache search firefox-geckodriver 13 | apt-cache search geckodriver 14 | apt-cache search chromedriver 15 | cat /etc/apt/sources.list 16 | ``` 17 | 18 | ### apt packages found in Docker Container python:3.7.10-slim 19 | 20 | ```log 21 | chromium - web browser 22 | chromium-common - web browser - common resources used by the chromium packages 23 | chromium-driver - web browser - WebDriver support 24 | chromium-sandbox - web browser - setuid security sandbox for chromium 25 | chromium-shell - web browser - minimal shell 26 | firefox-esr - Mozilla Firefox web browser - Extended Support Release (ESR) 27 | ``` 28 | 29 | ### apt package installation in Docker Container python:3.7.10-slim 30 | 31 | ```sh 32 | apt install chromium chromium-common chromium-driver -y 33 | ``` 34 | 35 | ### apt sources 36 | 37 | ```sh 38 | cat /etc/apt/sources.list 39 | ``` 40 | 41 | ```log 42 | # deb http://snapshot.debian.org/archive/debian/20210329T000000Z buster main 43 | deb http://deb.debian.org/debian buster main 44 | # deb http://snapshot.debian.org/archive/debian-security/20210329T000000Z buster/updates main 45 | deb http://security.debian.org/debian-security buster/updates main 46 | # deb http://snapshot.debian.org/archive/debian/20210329T000000Z buster-updates main 47 | deb http://deb.debian.org/debian buster-updates main 48 | ``` 49 | 50 | --- 51 | 52 | ## Issue 53 | 54 | ```log 55 | E: Failed to fetch http://security.debian.org/debian-security/pool/updates/main/c/chromium/chromium-common_89.0.4389.114-1~deb10u1_amd64.deb 404 Not Found [IP: 151.101.54.132 80] 56 | E: Failed to fetch http://security.debian.org/debian-security/pool/updates/main/c/chromium/chromium_89.0.4389.114-1~deb10u1_amd64.deb 404 Not Found [IP: 151.101.54.132 80] 57 | E: Failed to fetch http://security.debian.org/debian-security/pool/updates/main/c/chromium/chromium-driver_89.0.4389.114-1~deb10u1_amd64.deb 404 Not Found [IP: 151.101.54.132 80] 58 | E: Unable to fetch some archives, maybe run apt-get update or try with --fix-missing? 
59 | ``` 60 | 61 | ### apt-cache policy 62 | 63 | #### apt-cache policy chromium 64 | 65 | ```shell 66 | apt-cache policy chromium 67 | ``` 68 | 69 | Result: 70 | 71 | ```shell 72 | chromium: 73 | Installed: (none) 74 | Candidate: 89.0.4389.114-1~deb10u1 75 | Version table: 76 | 89.0.4389.114-1~deb10u1 500 77 | 500 http://security.debian.org/debian-security buster/updates/main amd64 Packages 78 | 88.0.4324.182-1~deb10u1 500 79 | 500 http://deb.debian.org/debian buster/main amd64 Packages 80 | ``` 81 | 82 | #### apt-cache policy chromium-common 83 | 84 | ```shell 85 | apt-cache policy chromium-common 86 | ``` 87 | 88 | Result: 89 | 90 | ```shell 91 | chromium-common: 92 | Installed: (none) 93 | Candidate: 89.0.4389.114-1~deb10u1 94 | Version table: 95 | 89.0.4389.114-1~deb10u1 500 96 | 500 http://security.debian.org/debian-security buster/updates/main amd64 Packages 97 | 88.0.4324.182-1~deb10u1 500 98 | 500 http://deb.debian.org/debian buster/main amd64 Packages 99 | ``` 100 | 101 | #### apt-cache policy chromium-driver 102 | 103 | ```shell 104 | apt-cache policy chromium-driver 105 | ``` 106 | 107 | Result: 108 | 109 | ```shell 110 | chromium-driver: 111 | Installed: (none) 112 | Candidate: 89.0.4389.114-1~deb10u1 113 | Version table: 114 | 89.0.4389.114-1~deb10u1 500 115 | 500 http://security.debian.org/debian-security buster/updates/main amd64 Packages 116 | 88.0.4324.182-1~deb10u1 500 117 | 500 http://deb.debian.org/debian buster/main amd64 Packages 118 | ``` 119 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # Streamlit Selenium Test 3 | 4 | Streamlit project to test Selenium running in Streamlit Cloud runtime. 5 | 6 | - [x] **Local Windows 10** machine works 7 | - [x] **Local Docker** container works 8 | - [x] **Streamlit Cloud** runtime works, see example app here: [![Docker](https://img.shields.io/badge/Go%20To-Streamlit%20Cloud-red?logo=streamlit)](https://selenium-example.streamlit.app/) 9 | 10 | ## Issues :bug: 11 | 12 | - Example fails on Streamlit Cloud with a `TimeoutException`, due to a `403` response, because **GeoIP blocking** is active on the target website. Therefore a **proxy** can be enabled optionally to bypass this. 13 | - However, the proxies are not very reliable, because only free proxies are used here. Therefore, the example is not very stable with enabled proxies and can fail sometimes. Sometimes, no proxies are available. 14 | 15 | ## ToDo :ballot_box_with_check: 16 | 17 | - [ ] improve example 18 | - [ ] fix proxy issues 19 | - [ ] try also `undetected_chromedriver` package 20 | - [ ] try also `seleniumbase` package 21 | 22 | ## Problem :thinking: 23 | 24 | The suggestion for this repo came from a post on the Streamlit Community Forum. 25 | 26 | 27 | 28 | It is not that easy to install and use Selenium based webscraper in container based environments. 29 | On the local computer, this usually works much more smoothly because a browser is already installed and can be controlled by the associated webdriver. 30 | In container-based environments, however, **headless** operation is **mandatory** because no UI can be used there. 
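
A minimal headless setup looks roughly like the following sketch, which condenses the options and service wiring used later in `streamlit_app.py` and the scratchpad notebook:

```python
import shutil

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

options = Options()
options.add_argument("--headless")               # mandatory in containers: no display available
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")  # avoid /dev/shm issues in small containers
service = Service(executable_path=shutil.which("chromedriver"))  # driver installed via packages.txt

with webdriver.Chrome(options=options, service=service) as driver:
    driver.get("https://api.ipify.org/")
    print(driver.find_element(By.TAG_NAME, "body").text)
```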
31 | 32 | Therefore, in this repository a small example is given to get Selenium working on: 33 | 34 | - **Local Windows 10** machine 35 | - **Local Docker** container that mimics the Streamlit Cloud runtime 36 | - **Streamlit Community Cloud** runtime 37 | 38 | ## Proxy :sunglasses: 39 | 40 | Because some websites block requests based on countries (aka geoip blocking) or from certain IP ranges, a proxy can be used to bypass this. The example app has a checkbox to enable a proxy. You can choose between socks4 and socks5 proxies. However, socks4 does not work at all. The socks5 proxy is a free socks5 proxy from a public list and is not very reliable. Therefore, the example is not very stable with proxies enabled and can fail quite often. 41 | 42 | ## Pitfalls :triangular_flag_on_post: 43 | 44 | - To use Selenium (even headless in a container) you always need **two** components installed on your machine: 45 | - A **webbrowser** and its associated **webdriver**. 46 | - The **version** of the headless webbrowser and its associated webdriver must always **match**. 47 | - If you are using Selenium in a docker container or on Streamlit Cloud, the `--headless` option is mandatory because there is no graphical user interface available. 48 | - There are three webbrowser/webdriver combinations available for Selenium: 49 | 1. `chrome & chromedriver` 50 | 2. `chromium & chromedriver` 51 | 3. `firefox & geckodriver` 52 | - Unfortunately, not all of these packages are available in the default Debian Bullseye apt package repositories. If we want an installation from the default repositories, only `chromium & chromedriver` is left. 53 | - The chromedriver has a lot of options that can be set. It may be necessary to tweak these options on different platforms to make headless operation work. 54 | - The chromedriver, selenium and their options change quite a lot over time. A lot of information on Stack Overflow regarding chromedriver/selenium is outdated. 55 | - The deployment to Streamlit Cloud has unfortunately failed sometimes in the past. A concrete cause of the error or an informative error message could not be identified. Currently it seems to be stable on Streamlit Cloud. 56 | - To run this Streamlit app on **Windows**, the Windows `chromedriver.exe` must be stored here in the root folder or added to the Windows PATH. Be aware that the version of this chromedriver must match the version of your installed Chrome browser. 57 | 58 | ## Development Setup :hammer_and_wrench: 59 | 60 | In the Streamlit Cloud runtime, neither chrome, chromedriver nor geckodriver are available in the default apt package sources. 61 | 62 | The Streamlit Cloud runtime seems to be very similar to the official docker image `python:3.XX-slim-bullseye` on Docker Hub, which is based on Debian Bullseye. 63 | 64 | In this repository a [Dockerfile](Dockerfile) is provided that mimics the Streamlit Cloud runtime. It can be used for local testing. 65 | 66 | A `packages.txt` is provided with the following minimal content: 67 | 68 | ```txt 69 | chromium 70 | chromium-driver 71 | ``` 72 | 73 | A `requirements.txt` is provided with the following minimal content: 74 | 75 | ```txt 76 | streamlit 77 | selenium 78 | ``` 79 | 80 | ## Docker :whale2: 81 | 82 | ### Docker Container local 83 | 84 | The provided [Dockerfile](Dockerfile) tries to mimic the Streamlit Cloud runtime. 85 | 86 | Build local custom Docker Image from Dockerfile 87 | 88 | ```shell 89 | docker build --progress=plain --tag selenium:latest .
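# the Makefile docker target additionally passes --pull to refresh the base image before building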
90 | ``` 91 | 92 | Run custom Docker Container 93 | 94 | ```shell 95 | docker run -ti -p 8501:8501 --rm selenium:latest 96 | docker run -ti -p 8501:8501 --rm selenium:latest /bin/bash 97 | docker run -ti -p 8501:8501 -v $(pwd):/app --rm selenium:latest # linux 98 | docker run -ti -p 8501:8501 -v ${pwd}:/app --rm selenium:latest # powershell 99 | docker run -ti -p 8501:8501 -v %cd%:/app --rm selenium:latest # cmd.exe 100 | ``` 101 | 102 | ## Selenium :eye: 103 | 104 | 105 | 106 | ```sh 107 | pip install selenium 108 | ``` 109 | 110 | ### Chromium :spider_web: 111 | 112 | Required packages to install 113 | 114 | ```shell 115 | apt install chromium 116 | apt install chromium-driver 117 | ``` 118 | 119 | ### Chromium Options 120 | 121 | 122 | 123 | ## undetected_chromedriver :man_shrugging: 124 | 125 | > Another option to try, not yet done... 126 | 127 | - 128 | - *Resources* 129 | - 130 | - 131 | - 132 | - 133 | - 134 | - 135 | 136 | ## Status :heavy_check_mark: 137 | 138 | > Last changed: 2024-06-13 139 | -------------------------------------------------------------------------------- /Virtualenv.md: -------------------------------------------------------------------------------- 1 | # Python **virtualenv** Setup 2 | 3 | ```shell 4 | pip install --upgrade virtualenv 5 | python -m venv .venv --clear --upgrade-deps 6 | .venv\Scripts\activate.bat 7 | python -m pip install --upgrade pip 8 | pip install --upgrade -r requirements.txt 9 | # ...... 10 | deactivate.bat 11 | ``` 12 | -------------------------------------------------------------------------------- /packages.txt: -------------------------------------------------------------------------------- 1 | chromium 2 | chromium-driver 3 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | streamlit 3 | selenium 4 | requests 5 | lxml 6 | countryflag 7 | -------------------------------------------------------------------------------- /resources/python.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Franky1/Streamlit-Selenium/f21831192738dbc6881fb84f92da53a6266b7fe3/resources/python.png -------------------------------------------------------------------------------- /resources/selenium.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Franky1/Streamlit-Selenium/f21831192738dbc6881fb84f92da53a6266b7fe3/resources/selenium.png -------------------------------------------------------------------------------- /resources/selenium_base.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Franky1/Streamlit-Selenium/f21831192738dbc6881fb84f92da53a6266b7fe3/resources/selenium_base.png -------------------------------------------------------------------------------- /resources/streamlit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Franky1/Streamlit-Selenium/f21831192738dbc6881fb84f92da53a6266b7fe3/resources/streamlit.png -------------------------------------------------------------------------------- /scratchpad/.dockerignore: -------------------------------------------------------------------------------- 1 | ### Python ### 2 | # Byte-compiled / optimized / DLL files 3 | __pycache__/ 4 | *.py[cod] 5 | *$py.class 6 | 7 | # C extensions 8 | 
*.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | wheels/ 22 | pip-wheel-metadata/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # Environments 30 | .env 31 | .venv 32 | env/ 33 | venv/ 34 | 35 | .vscode/ 36 | *.exe 37 | *.md 38 | *.zip 39 | *.gz 40 | *.log 41 | LICENSE 42 | Makefile 43 | .git/ 44 | .gitignore 45 | .gitattributes 46 | .pylintrc 47 | -------------------------------------------------------------------------------- /scratchpad/Dockerfile: -------------------------------------------------------------------------------- 1 | # this base image seems to be quite similar to the streamlit cloud environment 2 | FROM python:3.11-slim-bullseye 3 | 4 | ENV PYTHONUNBUFFERED=1 \ 5 | PYTHONDONTWRITEBYTECODE=1 \ 6 | PIP_NO_CACHE_DIR=1 \ 7 | PIP_DISABLE_PIP_VERSION_CHECK=1 \ 8 | PIP_DEFAULT_TIMEOUT=120 \ 9 | LC_ALL=C.UTF-8 \ 10 | LANG=C.UTF-8 11 | 12 | # we need some build tools for installing additional python pip packages 13 | RUN apt-get update \ 14 | && apt-get install --yes \ 15 | software-properties-common \ 16 | build-essential \ 17 | gcc \ 18 | g++ \ 19 | cmake \ 20 | git \ 21 | curl \ 22 | python3-dev \ 23 | nano 24 | 25 | WORKDIR /app 26 | 27 | # if we have a packages.txt, install it 28 | COPY packages.txt packages.txt 29 | RUN xargs -a packages.txt apt-get install --yes 30 | 31 | RUN pip install --no-cache-dir --upgrade pip setuptools wheel uv 32 | COPY requirements.txt requirements.txt 33 | RUN uv pip install --system --no-cache -r requirements.txt 34 | 35 | # jupyter notebook default port 36 | EXPOSE 8888 37 | 38 | COPY . . 39 | 40 | CMD ["jupyter", "notebook", "--ip=0.0.0.0", "--no-browser", "--allow-root"] 41 | 42 | # docker build --progress=plain --tag jupyter:latest . 43 | # docker run -ti -p 8888:8888 --rm jupyter:latest /bin/bash 44 | # docker run -ti -p 8888:8888 --rm jupyter:latest 45 | # docker run -ti -p 8888:8888 -v ${pwd}:/app --rm jupyter:latest 46 | # docker run -ti -p 8888:8888 -v ${pwd}:/app --rm jupyter:latest /bin/bash 47 | -------------------------------------------------------------------------------- /scratchpad/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Scratchpad :memo: 3 | 4 | This subfolder is a scratchpad for testing and experimenting with new ideas. It is not part of the main project and can be used for any purpose. 
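
For example, the `proxies.ipynb` notebook in this folder pulls free proxy lists from the sources linked in the Proxies section below; stripped down, its proxyscrape request looks roughly like this:

```python
import requests

PROXYSCRAPE_URL = "https://api.proxyscrape.com/v3/free-proxy-list/get"
params = {
    "request": "displayproxies",
    "proxy_format": "protocolipport",
    "format": "json",
    "protocol": "socks4",   # the notebook uses socks4 here
    "timeout": 3000,        # same timeout value as in the notebook
    "anonymity": "all",
    "country": "all",
}
response = requests.get(PROXYSCRAPE_URL, params=params, timeout=3)
response.raise_for_status()
proxies = response.json().get("proxies", [])
print(f"{len(proxies)} proxies returned")
```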
5 | 6 | ## Proxies :sunglasses: 7 | 8 | - 9 | - 10 | - 11 | - 12 | - 13 | - 14 | - 15 | - 16 | - 17 | -------------------------------------------------------------------------------- /scratchpad/packages.txt: -------------------------------------------------------------------------------- 1 | chromium 2 | chromium-driver 3 | -------------------------------------------------------------------------------- /scratchpad/proxies.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Playground for Proxies" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import json\n", 17 | "from io import StringIO\n", 18 | "\n", 19 | "import countryflag\n", 20 | "import pandas as pd\n", 21 | "import requests" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "#### https://proxyscrape.com/free-proxy-list" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "PROXYSCRAPE_URL = 'https://api.proxyscrape.com/v3/free-proxy-list/get'" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "def write_dict_to_json_file(data: dict, filename: str):\n", 47 | " with open(filename, mode='w', encoding='utf-8') as f:\n", 48 | " json.dump(data, f, indent=4)" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "def write_list_to_txt_file(data: list, filename: str):\n", 58 | " with open(filename, mode='w', encoding='utf-8') as f:\n", 59 | " text = '\\n'.join(data)\n", 60 | " f.write(text)" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "def get_proxyscrape_list(country: str = 'FR', protocol: str = 'socks4'):\n", 70 | " url = PROXYSCRAPE_URL\n", 71 | " params = {\n", 72 | " 'request': 'displayproxies',\n", 73 | " 'proxy_format' : 'protocolipport',\n", 74 | " 'format': 'text',\n", 75 | " 'protocol': protocol,\n", 76 | " 'timeout': 3000,\n", 77 | " 'anonymity': 'all',\n", 78 | " 'country': country,\n", 79 | " }\n", 80 | " try:\n", 81 | " response = requests.get(url=url, params=params, timeout=3)\n", 82 | " response.raise_for_status()\n", 83 | " # convert the response to a list\n", 84 | " response = response.text.strip().split('\\r\\n')\n", 85 | " write_list_to_txt_file(response, f'proxyscrape_{country.lower()}.txt')\n", 86 | " return True, response\n", 87 | " except Exception as e:\n", 88 | " return False, str(e)" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "def get_proxyscrape_dict(country: str, protocol: str) -> tuple:\n", 98 | " params = {\n", 99 | " 'request': 'displayproxies',\n", 100 | " 'proxy_format' : 'protocolipport',\n", 101 | " 'format': 'json',\n", 102 | " 'protocol': protocol,\n", 103 | " 'timeout': 3000,\n", 104 | " 'anonymity': 'all',\n", 105 | " 'country': country,\n", 106 | " }\n", 107 | " try:\n", 108 | " response = requests.get(url=PROXYSCRAPE_URL, params=params, timeout=3)\n", 109 | " response.raise_for_status()\n", 110 | " response = response.json()\n", 111 | " # write_dict_to_json_file(response, 
f'proxyscrape_{country.lower()}.json')\n", 112 | " return True, response\n", 113 | " except Exception as e:\n", 114 | " return False, str(e)" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": { 121 | "scrolled": true 122 | }, 123 | "outputs": [], 124 | "source": [ 125 | "success, proxies = get_proxyscrape_dict(country='all', protocol='socks4')\n", 126 | "if success:\n", 127 | " df = pd.json_normalize(proxies.get('proxies')).astype(str)\n", 128 | " df.to_json('proxyscrape_all.json', indent=4, orient='records')\n", 129 | " if not df.empty:\n", 130 | " countries = sorted(df['ip_data.countryCode'].unique().tolist())\n", 131 | " print(countries)\n", 132 | " print(df[['ip_data.countryCode', 'proxy']].head(10))\n", 133 | "else:\n", 134 | " print(proxies)" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "df[['ip', 'port']].apply(lambda x: f\"{x[0]}:{x[1]}\", axis=1).tolist()" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "def get_flag(country: str):\n", 153 | " return countryflag.getflag([country])" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "metadata": {}, 160 | "outputs": [], 161 | "source": [ 162 | "countries_and_flags = [f'{c} {get_flag(c)}' for c in countries]\n", 163 | "for country_and_flag in countries_and_flags:\n", 164 | " print(country_and_flag)" 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": {}, 170 | "source": [ 171 | "#### https://www.socks-proxy.net/" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": null, 177 | "metadata": {}, 178 | "outputs": [], 179 | "source": [ 180 | "headers = {\n", 181 | " 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:123.0) Gecko/20100101 Firefox/123.0',\n", 182 | " 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',\n", 183 | " 'Accept-Language': 'de',\n", 184 | " 'Referer': 'https://www.socks-proxy.net/',\n", 185 | " 'Connection': 'keep-alive',\n", 186 | " 'Upgrade-Insecure-Requests': '1',\n", 187 | " 'Sec-Fetch-Dest': 'document',\n", 188 | " 'Sec-Fetch-Mode': 'navigate',\n", 189 | " 'Sec-Fetch-Site': 'cross-site',\n", 190 | " 'Sec-Fetch-User': '?1',\n", 191 | " 'DNT': '1',\n", 192 | " 'Sec-GPC': '1',\n", 193 | "}" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": null, 199 | "metadata": {}, 200 | "outputs": [], 201 | "source": [ 202 | "# read table from html with pandas\n", 203 | "url = \"https://www.socks-proxy.net/\"\n", 204 | "response = requests.get(url, headers=headers)\n", 205 | "tables = pd.read_html(StringIO(response.text))" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": null, 211 | "metadata": {}, 212 | "outputs": [], 213 | "source": [ 214 | "tables[0]" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": null, 220 | "metadata": {}, 221 | "outputs": [], 222 | "source": [ 223 | "df = tables[0].astype(str)\n", 224 | "# filter the table by country code and socks4\n", 225 | "df = df[(df['Code'] == 'FR') & (df['Version'] == 'Socks4')]\n", 226 | "df.head(10)" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": null, 232 | "metadata": {}, 233 | "outputs": [], 234 | "source": [ 235 | "# save pandas dataframe to json\n", 236 | 
"df.to_json('socks_proxy.json', indent=4, orient='records')" 237 | ] 238 | }, 239 | { 240 | "cell_type": "markdown", 241 | "metadata": {}, 242 | "source": [ 243 | "#### https://mtpro.xyz/api-overview" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "metadata": {}, 250 | "outputs": [], 251 | "source": [ 252 | "url = \"https://mtpro.xyz/api/\"\n", 253 | "params = {\n", 254 | " 'type': 'socks'\n", 255 | "}\n", 256 | "response = requests.get(url, params=params)\n", 257 | "response = response.json()\n", 258 | "df = pd.DataFrame(response).astype(str)\n", 259 | "df.to_json('mtpro.json', indent=4, orient='records')\n", 260 | "countries = sorted(df['country'].unique().tolist())\n", 261 | "print(countries)\n", 262 | "df.head(10)" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": null, 268 | "metadata": {}, 269 | "outputs": [], 270 | "source": [ 271 | "proxies = df[['ip', 'port']].apply(lambda x: f\"{x[0]}:{x[1]}\", axis=1).tolist()\n", 272 | "for p in proxies:\n", 273 | " print(p)" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": null, 279 | "metadata": {}, 280 | "outputs": [], 281 | "source": [] 282 | } 283 | ], 284 | "metadata": { 285 | "kernelspec": { 286 | "display_name": "Python 3 (ipykernel)", 287 | "language": "python", 288 | "name": "python3" 289 | }, 290 | "language_info": { 291 | "codemirror_mode": { 292 | "name": "ipython", 293 | "version": 3 294 | }, 295 | "file_extension": ".py", 296 | "mimetype": "text/x-python", 297 | "name": "python", 298 | "nbconvert_exporter": "python", 299 | "pygments_lexer": "ipython3", 300 | "version": "3.9.10" 301 | } 302 | }, 303 | "nbformat": 4, 304 | "nbformat_minor": 4 305 | } 306 | -------------------------------------------------------------------------------- /scratchpad/requirements.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | selenium 3 | requests 4 | lxml 5 | countryflag 6 | jupyter 7 | jupyterlab 8 | -------------------------------------------------------------------------------- /scratchpad/selenium.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Selenium\n", 8 | "\n", 9 | "Testing the selenium options in jupyter notebook" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import os\n", 19 | "import shutil\n", 20 | "\n", 21 | "from selenium import webdriver\n", 22 | "from selenium.webdriver.chrome.options import Options\n", 23 | "from selenium.webdriver.chrome.service import Service\n", 24 | "from selenium.webdriver.common.by import By" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "def get_logpath() -> str:\n", 34 | " return os.path.join(os.getcwd(), 'selenium.log')" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "def get_chromedriver_path() -> str:\n", 44 | " return shutil.which('chromedriver')" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "def get_webdriver_service(logpath) -> Service:\n", 54 | " service = Service(\n", 55 | " executable_path=get_chromedriver_path(),\n", 56 | " 
log_output=logpath,\n", 57 | " )\n", 58 | " return service" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "def get_webdriver_options(proxy: str, socksStr: str) -> Options:\n", 68 | " options = Options()\n", 69 | " options.add_argument(\"--headless\")\n", 70 | " options.add_argument(\"--no-sandbox\")\n", 71 | " options.add_argument(\"--disable-dev-shm-usage\")\n", 72 | " options.add_argument(\"--disable-gpu\")\n", 73 | " options.add_argument(\"--disable-features=NetworkService\")\n", 74 | " options.add_argument(\"--window-size=1920x1080\")\n", 75 | " options.add_argument(\"--disable-features=VizDisplayCompositor\")\n", 76 | " options.add_argument('--ignore-certificate-errors')\n", 77 | " if proxy is not None:\n", 78 | " options.add_argument(f\"--proxy-server={socksStr}://{proxy}\")\n", 79 | " options.set_capability('goog:loggingPrefs', {'performance': 'ALL'})\n", 80 | " return options" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "logpath = get_logpath()\n", 90 | "service = get_webdriver_service(logpath=logpath)" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "print(logpath)" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [ 108 | "def get_ip_address(options, service):\n", 109 | " with webdriver.Chrome(options=options, service=service) as driver:\n", 110 | " driver.get(\"https://api.ipify.org/\")\n", 111 | " print(driver.find_element(By.TAG_NAME, \"body\").text)" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "proxy = \"23.19.244.109:1080\" # socks5\n", 121 | "options = get_webdriver_options(proxy=proxy, socksStr='socks5')\n", 122 | "get_ip_address(options=options, service=service)" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "proxy = \"184.181.217.201:4145\" # socks4\n", 132 | "options = get_webdriver_options(proxy=proxy, socksStr='socks4')\n", 133 | "get_ip_address(options=options, service=service)" 134 | ] 135 | } 136 | ], 137 | "metadata": { 138 | "kernelspec": { 139 | "display_name": "Python 3 (ipykernel)", 140 | "language": "python", 141 | "name": "python3" 142 | }, 143 | "language_info": { 144 | "codemirror_mode": { 145 | "name": "ipython", 146 | "version": 3 147 | }, 148 | "file_extension": ".py", 149 | "mimetype": "text/x-python", 150 | "name": "python", 151 | "nbconvert_exporter": "python", 152 | "pygments_lexer": "ipython3", 153 | "version": "3.11.9" 154 | } 155 | }, 156 | "nbformat": 4, 157 | "nbformat_minor": 4 158 | } 159 | -------------------------------------------------------------------------------- /streamlit_app.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import shutil 4 | import subprocess 5 | import time 6 | from typing import List, Tuple 7 | 8 | import countryflag 9 | import pandas as pd 10 | import requests 11 | import streamlit as st 12 | from lxml import etree, html 13 | from selenium import webdriver 14 | from selenium.webdriver.chrome.options import Options 15 | from selenium.webdriver.chrome.service import 
Service 16 | from selenium.webdriver.common.by import By 17 | from selenium.webdriver.support.wait import WebDriverWait 18 | 19 | 20 | @st.cache_data(show_spinner=False, ttl=180) 21 | def get_proxyscrape_socks4(country: str = 'all', protocol: str = 'socks4') -> tuple: 22 | PROXYSCRAPE_URL = 'https://api.proxyscrape.com/v3/free-proxy-list/get' 23 | params = { 24 | 'request': 'displayproxies', 25 | 'proxy_format' : 'protocolipport', 26 | 'format': 'json', 27 | 'protocol': protocol, 28 | 'timeout': 2000, 29 | 'anonymity': 'all', 30 | 'country': country, 31 | } 32 | try: 33 | response = requests.get(url=PROXYSCRAPE_URL, params=params, timeout=3) 34 | response.raise_for_status() 35 | response = response.json() 36 | response = pd.json_normalize(response.get('proxies')).astype(str) 37 | except Exception as e: 38 | return False, str(e) 39 | else: 40 | return True, response 41 | 42 | 43 | @st.cache_data(show_spinner=False, ttl=180) 44 | def get_mtproto_socks5() -> tuple: 45 | url = "https://mtpro.xyz/api/" 46 | params = { 47 | 'type': 'socks' 48 | } 49 | try: 50 | response = requests.get(url, params=params) 51 | response = response.json() 52 | response = pd.DataFrame(response).astype(str) 53 | except Exception as e: 54 | return False, str(e) 55 | else: 56 | return True, response 57 | 58 | 59 | @st.cache_resource(show_spinner=False) 60 | def get_flag(country: str): 61 | return countryflag.getflag([country]) 62 | 63 | 64 | @st.cache_resource(show_spinner=False) 65 | def get_python_version() -> str: 66 | try: 67 | result = subprocess.run(['python', '--version'], capture_output=True, text=True) 68 | version = result.stdout.split()[1] 69 | return version 70 | except Exception as e: 71 | return str(e) 72 | 73 | 74 | @st.cache_resource(show_spinner=False) 75 | def get_chromium_version() -> str: 76 | try: 77 | result = subprocess.run(['chromium', '--version'], capture_output=True, text=True) 78 | version = result.stdout.split()[1] 79 | return version 80 | except Exception as e: 81 | return str(e) 82 | 83 | 84 | @st.cache_resource(show_spinner=False) 85 | def get_chromedriver_version() -> str: 86 | try: 87 | result = subprocess.run(['chromedriver', '--version'], capture_output=True, text=True) 88 | version = result.stdout.split()[1] 89 | return version 90 | except Exception as e: 91 | return str(e) 92 | 93 | 94 | @st.cache_resource(show_spinner=False) 95 | def get_logpath() -> str: 96 | return os.path.join(os.getcwd(), 'selenium.log') 97 | 98 | 99 | @st.cache_resource(show_spinner=False) 100 | def get_chromedriver_path() -> str: 101 | return shutil.which('chromedriver') 102 | 103 | 104 | @st.cache_resource(show_spinner=False) 105 | def get_webdriver_options(proxy: str = None, socksStr: str = None) -> Options: 106 | options = Options() 107 | options.add_argument("--headless") 108 | options.add_argument("--no-sandbox") 109 | options.add_argument("--disable-dev-shm-usage") 110 | options.add_argument("--disable-gpu") 111 | options.add_argument("--disable-features=NetworkService") 112 | options.add_argument("--window-size=1920x1080") 113 | options.add_argument("--disable-features=VizDisplayCompositor") 114 | options.add_argument('--ignore-certificate-errors') 115 | if proxy is not None and socksStr is not None: 116 | options.add_argument(f"--proxy-server={socksStr}://{proxy}") 117 | options.set_capability('goog:loggingPrefs', {'performance': 'ALL'}) 118 | return options 119 | 120 | 121 | def get_messages_from_log(logs) -> List: 122 | messages = list() 123 | for entry in logs: 124 | logmsg = 
json.loads(entry["message"])["message"] 125 | if logmsg["method"] == "Network.responseReceived": # Filter out HTTP responses 126 | # check for 200 and 204 status codes 127 | if logmsg["params"]["response"]["status"] not in [200, 204]: 128 | messages.append(logmsg) 129 | elif logmsg["method"] == "Network.responseReceivedExtraInfo": 130 | if logmsg["params"]["statusCode"] not in [200, 204]: 131 | messages.append(logmsg) 132 | if len(messages) == 0: 133 | return None 134 | return messages 135 | 136 | 137 | def prettify_html(html_content) -> str: 138 | return etree.tostring(html.fromstring(html_content), pretty_print=True).decode('utf-8') 139 | 140 | 141 | def get_webdriver_service(logpath) -> Service: 142 | service = Service( 143 | executable_path=get_chromedriver_path(), 144 | log_output=logpath, 145 | ) 146 | return service 147 | 148 | 149 | def delete_selenium_log(logpath: str): 150 | if os.path.exists(logpath): 151 | os.remove(logpath) 152 | 153 | 154 | def show_selenium_log(logpath: str): 155 | if os.path.exists(logpath): 156 | with open(logpath) as f: 157 | content = f.read() 158 | st.code(body=content, language='log', line_numbers=True) 159 | else: 160 | st.error('No log file found!', icon='🔥') 161 | 162 | 163 | def run_selenium(logpath: str, proxy: str, socksStr: str) -> Tuple[str, List, List, str]: 164 | name = None 165 | html_content = None 166 | options = get_webdriver_options(proxy=proxy, socksStr=socksStr) 167 | service = get_webdriver_service(logpath=logpath) 168 | with webdriver.Chrome(options=options, service=service) as driver: 169 | url = "https://www.unibet.fr/sport/hub/euro-2024" 170 | try: 171 | driver.get(url) 172 | time.sleep(2) 173 | # Wait for the element to be rendered: 174 | element = WebDriverWait(driver=driver, timeout=10).until(lambda x: x.find_elements(by=By.CSS_SELECTOR, value="h2.eventcard-content-name")) 175 | name = element[0].get_property('attributes')[0]['name'] 176 | html_content = driver.page_source 177 | except Exception as e: 178 | st.error(body='Selenium Exception occured!', icon='🔥') 179 | st.error(body=str(e), icon='🔥') 180 | finally: 181 | performance_log = driver.get_log('performance') 182 | browser_log = driver.get_log('browser') 183 | return name, performance_log, browser_log, html_content 184 | 185 | 186 | if __name__ == "__main__": 187 | if "proxy" not in st.session_state: 188 | st.session_state.proxy = None 189 | if "proxies" not in st.session_state: 190 | st.session_state.proxies = None 191 | if "socks5" not in st.session_state: 192 | st.session_state.socks5 = False 193 | if "df" not in st.session_state: 194 | st.session_state.df = None 195 | if "countries" not in st.session_state: 196 | st.session_state.countries = None 197 | logpath=get_logpath() 198 | delete_selenium_log(logpath=logpath) 199 | st.set_page_config(page_title="Selenium Test", page_icon='🕸️', layout="wide", 200 | initial_sidebar_state='collapsed') 201 | left, middle, right = st.columns([2, 11, 1], gap="small") 202 | with middle: 203 | st.title('Selenium on Streamlit Cloud 🕸️') 204 | st.markdown('''This app is only a very simple test for **Selenium** running on **Streamlit Cloud** runtime. 205 | The suggestion for this demo app came from a post on the Streamlit Community Forum.
206 |

207 | This is just a very very simple example and more a proof of concept. 208 | A link is called and waited for the existence of a specific class to read a specific property. 209 | If there is no error message, the action was successful. Afterwards the log files are displayed. 210 | Since the target website has geoip blocking enabled, a proxy is required to bypass this and can be selected optionally. 211 | However, the use of proxies is not guaranteed to work, as they may not working properly. 212 | If you disable the proxy, the app will usually fail on streamlit cloud to load the page. 213 | ''', unsafe_allow_html=True) 214 | st.markdown('---') 215 | middle_left, middle_right = st.columns([9, 10], gap="medium") 216 | with middle_left: 217 | st.header('Proxy') 218 | st.session_state.useproxy = st.toggle(label='Enable proxy to bypass geoip blocking', value=True, disabled=False) 219 | if st.session_state.useproxy: 220 | socks5 = st.toggle(label='Use Socks5 proxy', value=True, disabled=False) 221 | if socks5 != st.session_state.socks5: 222 | st.session_state.socks5 = socks5 223 | st.session_state.proxy = None 224 | st.session_state.proxies = None 225 | st.session_state.df = None 226 | if st.session_state.socks5: 227 | # try to gather and use socks5 proxies 228 | if st.button(label='Refresh proxies from free Socks5 list'): 229 | success, proxies = get_mtproto_socks5() 230 | if not success: 231 | st.error(f"No socks5 proxies found", icon='🔥') 232 | st.error(proxies, icon='🔥') 233 | st.session_state.df = None 234 | else: 235 | if not proxies.empty: 236 | countries = sorted(proxies['country'].unique().tolist()) 237 | st.session_state.df = proxies.copy() 238 | st.session_state.countries = countries 239 | else: 240 | st.session_state.df = None 241 | st.session_state.countries = None 242 | else: 243 | # try to gather and use socks4 proxies 244 | if st.button(label='Refresh proxies from free Socks4 list'): 245 | success, proxies = get_proxyscrape_socks4(country='all', protocol='socks4') 246 | if not success: 247 | st.error(f"No socks4 proxies found", icon='🔥') 248 | st.error(proxies, icon='🔥') 249 | st.session_state.df = None 250 | else: 251 | if not proxies.empty: 252 | countries = sorted(proxies['ip_data.countryCode'].unique().tolist()) 253 | st.session_state.df = proxies.copy() 254 | st.session_state.countries = countries 255 | else: 256 | st.session_state.df = None 257 | st.session_state.countries = None 258 | if st.session_state.countries is not None: 259 | # limit countries to a set of countries 260 | allowed_countries = ['FR', 'GB', 'DE', 'ES', 'CH', 'US'] 261 | st.session_state.countries = [country for country in st.session_state.countries if country in allowed_countries] 262 | if st.session_state.df is not None and st.session_state.countries is not None: 263 | selected_country = st.selectbox(label='Select a country', options=st.session_state.countries) 264 | selected_country_flag = get_flag(selected_country) 265 | st.info(f'Selected Country: {selected_country} {selected_country_flag}', icon='🌍') 266 | if st.session_state.socks5: 267 | selected_country_proxies = st.session_state.df[st.session_state.df['country'] == selected_country] 268 | else: 269 | selected_country_proxies = st.session_state.df[st.session_state.df['ip_data.countryCode'] == selected_country] 270 | st.session_state.proxies = set(selected_country_proxies[['ip', 'port']].apply(lambda x: f"{x.iloc[0]}:{x.iloc[1]}", axis=1).tolist()) 271 | if st.session_state.proxies: 272 | st.session_state.proxy = st.selectbox(label='Select a 
proxy from the list', options=st.session_state.proxies, index=0) 273 | st.info(body=f'{st.session_state.proxy} {get_flag(selected_country)}', icon='😎') 274 | else: 275 | st.session_state.proxy = None 276 | st.session_state.proxies = None 277 | st.session_state.df = None 278 | st.info('Proxy is disabled', icon='🔒') 279 | with middle_right: 280 | st.header('Versions') 281 | st.text('This is only for debugging purposes.\n' 282 | 'Checking versions installed in environment:\n\n' 283 | f'- Python: {get_python_version()}\n' 284 | f'- Streamlit: {st.__version__}\n' 285 | f'- Selenium: {webdriver.__version__}\n' 286 | f'- Chromedriver: {get_chromedriver_version()}\n' 287 | f'- Chromium: {get_chromium_version()}') 288 | st.markdown('---') 289 | 290 | if st.button('Start Selenium run'): 291 | st.info(f'Selected Proxy: {st.session_state.proxy}', icon='☢️') 292 | if st.session_state.useproxy: 293 | socksStr = 'socks5' if st.session_state.socks5 else 'socks4' 294 | st.info(f'Selected Socks: {socksStr}', icon='🧦') 295 | else: 296 | socksStr = None 297 | with st.spinner('Selenium is running, please wait...'): 298 | result, performance_log, browser_log, html_content = run_selenium(logpath=logpath, proxy=st.session_state.proxy, socksStr=socksStr) 299 | if result is None: 300 | st.error('There was an error, no result found!', icon='🔥') 301 | else: 302 | st.success(body=f'Result: {result}', icon='🎉') 303 | st.info('Selenium log files are shown below...', icon='⬇️') 304 | performance_log_msg = get_messages_from_log(performance_log) 305 | if performance_log_msg is not None: 306 | st.header('Performance Log (filtered) - only non 200/204 status codes') 307 | st.code(body=json.dumps(performance_log_msg, indent=4), language='json', line_numbers=True) 308 | st.header('Selenium Log') 309 | show_selenium_log(logpath=logpath) 310 | if result is None and html_content is not None: 311 | st.header('HTML Content') 312 | st.code(body=prettify_html(html_content), language='html', line_numbers=True) 313 | st.balloons() 314 | --------------------------------------------------------------------------------