├── .codecov.yml ├── .dockerignore ├── .flake8.cfg ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── feature_request.md │ └── general-question.md └── workflows │ ├── codeql.yml │ ├── docker-gh-publish.yml │ ├── docker-hub-release.yml │ └── pythonpackage.yml ├── .gitignore ├── .readthedocs.yml ├── .vscode └── launch.json ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── Dockerfile ├── Dockerfile-alpine ├── LICENSE ├── MANIFEST.in ├── Makefile ├── PULL_REQUEST_TEMPLATE.md ├── README.md ├── README.rst ├── docker-compose.yml ├── docs ├── Makefile ├── requirements-rtfd.txt └── source │ ├── _templates │ └── sidebarintro.html │ ├── conf.py │ ├── index.rst │ ├── modules.rst │ ├── requests_integration.rst │ ├── scylla.providers.rst │ ├── scylla.proxy.rst │ ├── scylla.rst │ ├── scylla.web.rst │ └── validation_policy.rst ├── docs_zh ├── Makefile └── source │ ├── _templates │ └── sidebarintro.html │ ├── conf.py │ ├── index.rst │ ├── modules.rst │ ├── requests_integration.rst │ ├── scylla.providers.rst │ ├── scylla.proxy.rst │ ├── scylla.rst │ ├── scylla.web.rst │ └── validation_policy.rst ├── frontend ├── .eslintrc.cjs ├── .gitignore ├── README.md ├── index.html ├── package.json ├── public │ └── vite.svg ├── src │ ├── App.css │ ├── App.tsx │ ├── LegacyApp.tsx │ ├── assets │ │ ├── react.svg │ │ └── scylla_banner.png │ ├── components │ │ ├── GeoDistribution.tsx │ │ ├── ProxyList.tsx │ │ ├── ProxyListFilter.tsx │ │ └── Statistics.tsx │ ├── index.css │ ├── index.html │ ├── index.scss │ ├── main.tsx │ ├── utils.ts │ └── vite-env.d.ts ├── tsconfig.json ├── tsconfig.node.json └── vite.config.ts ├── requirements.txt ├── scripts └── ubuntu_dependence.sh ├── scylla ├── __init__.py ├── __main__.py ├── _version.py ├── cli.py ├── config.py ├── database.py ├── jobs.py ├── loggings.py ├── providers │ ├── __init__.py │ ├── a2u_provider.py │ ├── base_provider.py │ ├── comp0_provider.py │ ├── cool_proxy_provider.py │ ├── data5u_provider.py │ ├── free_proxy_list_provider.py │ ├── 
http_proxy_provider.py │ ├── ipaddress_provider.py │ ├── kuaidaili_provider.py │ ├── plain_text_provider.py │ ├── proxy_list_provider.py │ ├── proxy_scraper_provider.py │ ├── proxylists_provider.py │ ├── proxynova_provider.py │ ├── pubproxy_provider.py │ ├── rmccurdy_provider.py │ ├── rudnkh_provider.py │ ├── spys_me_provider.py │ ├── spys_one_provider.py │ ├── the_speedX_provider.py │ └── xici_provider.py ├── proxy │ ├── __init__.py │ └── server.py ├── proxy_check_services.py ├── scheduler.py ├── tcpping.py ├── validation_policy.py ├── validator.py ├── web │ ├── __init__.py │ └── server.py └── worker.py ├── setup.cfg ├── setup.py ├── tests ├── __init__.py ├── cli_test.py ├── config_test.py ├── conftest.py ├── database_test.py ├── jobs_test.py ├── requirements-test.txt ├── scheduler_test.py ├── tcpping_test.py ├── test_loggings.py ├── validation_policy_test.py ├── validator_test.py ├── web │ └── __init__.py └── worker_test.py └── tsconfig.json /.codecov.yml: -------------------------------------------------------------------------------- 1 | ignore: 2 | - "docs/" 3 | - "frontend/" 4 | - ".gitignore" 5 | - ".readthedocs.yml" 6 | - ".travis.yml" 7 | - "Dockerfile" 8 | - "LICENSE" 9 | - "Makefile" 10 | - "MANIFEST.in" 11 | - "package.json" 12 | - "README.md" 13 | - "README.rst" 14 | - "requirements.txt" 15 | - "*_test.py" -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | *.DS_Store 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | scylla/assets/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written 
by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | .hypothesis/ 50 | .pytest_cache/ 51 | 52 | # Translations 53 | *.mo 54 | *.pot 55 | 56 | # Django stuff: 57 | *.log 58 | .static_storage/ 59 | .media/ 60 | local_settings.py 61 | 62 | # Flask stuff: 63 | instance/ 64 | .webassets-cache 65 | 66 | # Scrapy stuff: 67 | .scrapy 68 | 69 | # Sphinx documentation 70 | docs/build/ 71 | 72 | # PyBuilder 73 | target/ 74 | 75 | # Jupyter Notebook 76 | .ipynb_checkpoints 77 | 78 | # pyenv 79 | .python-version 80 | 81 | # celery beat schedule file 82 | celerybeat-schedule 83 | 84 | # SageMath parsed files 85 | *.sage.py 86 | 87 | # Environments 88 | .env 89 | .venv 90 | env/ 91 | venv/ 92 | ENV/ 93 | env.bak/ 94 | venv.bak/ 95 | 96 | # Spyder project settings 97 | .spyderproject 98 | .spyproject 99 | 100 | # Rope project settings 101 | .ropeproject 102 | 103 | # mkdocs documentation 104 | /site 105 | 106 | # mypy 107 | .mypy_cache/ 108 | 109 | # idea 110 | .idea/ 111 | 112 | # vscode 113 | .vscode 114 | 115 | out/ 116 | 117 | *.db 118 | 119 | # node 120 | node_modules 121 | 122 | *.bak 123 | 124 | package-lock.json 125 | scylla.db-journal 126 | 127 | 128 | venv/ 129 | -------------------------------------------------------------------------------- /.flake8.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = D203,E501,E701,E712 3 | exclude = 4 | .git, 5 | __pycache__, 6 | docs/source/conf.py, 7 | old, 8 | build, 9 | dist, 10 | docs_zh/source/conf.py, 11 | node_modules, 12 | venv 13 | max-complexity = 10 -------------------------------------------------------------------------------- 
/.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | 5 | --- 6 | 7 | **Describe the bug** 8 | A clear and concise description of what the bug is. 9 | 10 | **To Reproduce** 11 | Steps to reproduce the behavior: 12 | 1. Go to '...' 13 | 2. Click on '....' 14 | 3. Scroll down to '....' 15 | 4. See error 16 | 17 | **Expected behavior** 18 | A clear and concise description of what you expected to happen. 19 | 20 | **Screenshots** 21 | If applicable, add screenshots to help explain your problem. 22 | 23 | **Desktop (please complete the following information):** 24 | - OS: [e.g. iOS] 25 | - Browser [e.g. chrome, safari] 26 | - Version [e.g. 22] 27 | 28 | **Smartphone (please complete the following information):** 29 | - Device: [e.g. iPhone6] 30 | - OS: [e.g. iOS8.1] 31 | - Browser [e.g. stock browser, safari] 32 | - Version [e.g. 22] 33 | 34 | **Additional context** 35 | Add any other context about the problem here. 36 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | 5 | --- 6 | 7 | **Is your feature request related to a problem? Please describe.** 8 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 9 | 10 | **Describe the solution you'd like** 11 | A clear and concise description of what you want to happen. 12 | 13 | **Describe alternatives you've considered** 14 | A clear and concise description of any alternative solutions or features you've considered. 15 | 16 | **Additional context** 17 | Add any other context or screenshots about the feature request here. 
18 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/general-question.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: General question 3 | about: Template for asking question 4 | 5 | --- 6 | 7 | Please provide the following information if applicable: 8 | 9 | - Operating system and its version: 10 | - Version number of Scylla: 11 | - Version number of your Chromium (if related): 12 | -------------------------------------------------------------------------------- /.github/workflows/codeql.yml: -------------------------------------------------------------------------------- 1 | # For most projects, this workflow file will not need changing; you simply need 2 | # to commit it to your repository. 3 | # 4 | # You may wish to alter this file to override the set of languages analyzed, 5 | # or to provide custom queries or build logic. 6 | # 7 | # ******** NOTE ******** 8 | # We have attempted to detect the languages in your repository. Please check 9 | # the `language` matrix defined below to confirm you have the correct set of 10 | # supported CodeQL languages. 11 | # 12 | name: "CodeQL" 13 | 14 | on: 15 | push: 16 | branches: [ "main" ] 17 | pull_request: 18 | branches: [ "main" ] 19 | schedule: 20 | - cron: '21 8 * * 1' 21 | 22 | jobs: 23 | analyze: 24 | name: Analyze (${{ matrix.language }}) 25 | # Runner size impacts CodeQL analysis time. To learn more, please see: 26 | # - https://gh.io/recommended-hardware-resources-for-running-codeql 27 | # - https://gh.io/supported-runners-and-hardware-resources 28 | # - https://gh.io/using-larger-runners (GitHub.com only) 29 | # Consider using larger runners or machines with greater resources for possible analysis time improvements. 
30 | runs-on: ${{ (matrix.language == 'swift' && 'macos-latest') || 'ubuntu-latest' }} 31 | timeout-minutes: ${{ (matrix.language == 'swift' && 120) || 360 }} 32 | permissions: 33 | # required for all workflows 34 | security-events: write 35 | 36 | # required to fetch internal or private CodeQL packs 37 | packages: read 38 | 39 | # only required for workflows in private repositories 40 | actions: read 41 | contents: read 42 | 43 | strategy: 44 | fail-fast: false 45 | matrix: 46 | include: 47 | - language: javascript-typescript 48 | build-mode: none 49 | - language: python 50 | build-mode: none 51 | # CodeQL supports the following values keywords for 'language': 'c-cpp', 'csharp', 'go', 'java-kotlin', 'javascript-typescript', 'python', 'ruby', 'swift' 52 | # Use `c-cpp` to analyze code written in C, C++ or both 53 | # Use 'java-kotlin' to analyze code written in Java, Kotlin or both 54 | # Use 'javascript-typescript' to analyze code written in JavaScript, TypeScript or both 55 | # To learn more about changing the languages that are analyzed or customizing the build mode for your analysis, 56 | # see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/customizing-your-advanced-setup-for-code-scanning. 57 | # If you are analyzing a compiled language, you can modify the 'build-mode' for that language to customize how 58 | # your codebase is analyzed, see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/codeql-code-scanning-for-compiled-languages 59 | steps: 60 | - name: Checkout repository 61 | uses: actions/checkout@v4 62 | 63 | # Initializes the CodeQL tools for scanning. 64 | - name: Initialize CodeQL 65 | uses: github/codeql-action/init@v3 66 | with: 67 | languages: ${{ matrix.language }} 68 | build-mode: ${{ matrix.build-mode }} 69 | # If you wish to specify custom queries, you can do so here or in a config file. 
70 | # By default, queries listed here will override any specified in a config file. 71 | # Prefix the list here with "+" to use these queries and those in the config file. 72 | 73 | # For more details on CodeQL's query packs, refer to: https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs 74 | # queries: security-extended,security-and-quality 75 | 76 | # If the analyze step fails for one of the languages you are analyzing with 77 | # "We were unable to automatically build your code", modify the matrix above 78 | # to set the build mode to "manual" for that language. Then modify this step 79 | # to build your code. 80 | # ℹ️ Command-line programs to run using the OS shell. 81 | # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun 82 | - if: matrix.build-mode == 'manual' 83 | shell: bash 84 | run: | 85 | echo 'If you are using a "manual" build mode for one or more of the' \ 86 | 'languages you are analyzing, replace this with the commands to build' \ 87 | 'your code, for example:' 88 | echo ' make bootstrap' 89 | echo ' make release' 90 | exit 1 91 | 92 | - name: Perform CodeQL Analysis 93 | uses: github/codeql-action/analyze@v3 94 | with: 95 | category: "/language:${{matrix.language}}" 96 | -------------------------------------------------------------------------------- /.github/workflows/docker-gh-publish.yml: -------------------------------------------------------------------------------- 1 | name: Docker (GitHub Packages) 2 | 3 | # This workflow uses actions that are not certified by GitHub. 4 | # They are provided by a third-party and are governed by 5 | # separate terms of service, privacy policy, and support 6 | # documentation. 7 | 8 | on: 9 | schedule: 10 | - cron: '20 21 * * *' 11 | push: 12 | branches: [ "main" ] 13 | # Publish semver tags as releases. 
14 | tags: [ 'v*.*.*' ] 15 | pull_request: 16 | branches: [ "main" ] 17 | 18 | env: 19 | # Use docker.io for Docker Hub if empty 20 | REGISTRY: ghcr.io 21 | # github.repository as <account>/<repo> 22 | IMAGE_NAME: ${{ github.repository }} 23 | 24 | 25 | jobs: 26 | build: 27 | 28 | runs-on: ubuntu-latest 29 | permissions: 30 | contents: read 31 | packages: write 32 | # This is used to complete the identity challenge 33 | # with sigstore/fulcio when running outside of PRs. 34 | id-token: write 35 | 36 | steps: 37 | - name: Checkout repository 38 | uses: actions/checkout@v3 39 | 40 | # Install the cosign tool except on PR 41 | # https://github.com/sigstore/cosign-installer 42 | - name: Install cosign 43 | if: github.event_name != 'pull_request' 44 | uses: sigstore/cosign-installer@6e04d228eb30da1757ee4e1dd75a0ec73a653e06 #v3.1.1 45 | with: 46 | cosign-release: 'v2.1.1' 47 | 48 | # Set up BuildKit Docker container builder to be able to build 49 | # multi-platform images and export cache 50 | # https://github.com/docker/setup-buildx-action 51 | - name: Set up Docker Buildx 52 | uses: docker/setup-buildx-action@f95db51fddba0c2d1ec667646a06c2ce06100226 # v3.0.0 53 | 54 | # Login against a Docker registry except on PR 55 | # https://github.com/docker/login-action 56 | - name: Log into registry ${{ env.REGISTRY }} 57 | if: github.event_name != 'pull_request' 58 | uses: docker/login-action@343f7c4344506bcbf9b4de18042ae17996df046d # v3.0.0 59 | with: 60 | registry: ${{ env.REGISTRY }} 61 | username: ${{ github.actor }} 62 | password: ${{ secrets.GITHUB_TOKEN }} 63 | 64 | # Extract metadata (tags, labels) for Docker 65 | # https://github.com/docker/metadata-action 66 | - name: Extract Docker metadata 67 | id: meta 68 | uses: docker/metadata-action@96383f45573cb7f253c731d3b3ab81c87ef81934 # v5.0.0 69 | with: 70 | images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} 71 | 72 | # Build and push Docker image with Buildx (don't push on PR) 73 | # https://github.com/docker/build-push-action 74 | - 
name: Build and push Docker image 75 | id: build-and-push 76 | uses: docker/build-push-action@0565240e2d4ab88bba5387d719585280857ece09 # v5.0.0 77 | with: 78 | context: . 79 | push: ${{ github.event_name != 'pull_request' }} 80 | tags: ${{ steps.meta.outputs.tags }} 81 | labels: ${{ steps.meta.outputs.labels }} 82 | cache-from: type=gha 83 | cache-to: type=gha,mode=max 84 | -------------------------------------------------------------------------------- /.github/workflows/docker-hub-release.yml: -------------------------------------------------------------------------------- 1 | # This workflow uses actions that are not certified by GitHub. 2 | # They are provided by a third-party and are governed by 3 | # separate terms of service, privacy policy, and support 4 | # documentation. 5 | # Source: https://docs.github.com/en/actions/publishing-packages/publishing-docker-images 6 | 7 | name: Docker Hub 8 | 9 | on: 10 | workflow_dispatch: 11 | push: 12 | branches: 13 | - 'main' 14 | tags: 15 | - 'v*' 16 | pull_request: 17 | branches: 18 | - 'main' 19 | 20 | jobs: 21 | push_to_registry: 22 | name: Push Docker image to Docker Hub 23 | runs-on: ubuntu-latest 24 | steps: 25 | - name: Check out the repo 26 | uses: actions/checkout@v4 27 | 28 | - name: Log in to Docker Hub 29 | if: github.event_name != 'pull_request' 30 | uses: docker/login-action@v3 31 | with: 32 | username: ${{ secrets.DOCKERHUB_USERNAME }} 33 | password: ${{ secrets.DOCKERHUB_TOKEN }} 34 | 35 | - name: Extract metadata (tags, labels) for Docker 36 | id: meta 37 | uses: docker/metadata-action@v5 38 | with: 39 | images: wildcat/scylla 40 | 41 | - name: Build and push Docker image 42 | uses: docker/build-push-action@v5 43 | with: 44 | context: . 
45 | push: ${{ github.event_name != 'pull_request' }} 46 | tags: ${{ steps.meta.outputs.tags }} 47 | labels: ${{ steps.meta.outputs.labels }} -------------------------------------------------------------------------------- /.github/workflows/pythonpackage.yml: -------------------------------------------------------------------------------- 1 | name: 'Python package: Scylla' 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | build: 7 | 8 | runs-on: ubuntu-latest 9 | strategy: 10 | max-parallel: 2 11 | matrix: 12 | python-version: [ 3.11 ] 13 | 14 | steps: 15 | - uses: actions/checkout@v2 16 | - name: Set up Python ${{ matrix.python-version }} 17 | uses: actions/setup-python@v2 18 | with: 19 | python-version: ${{ matrix.python-version }} 20 | - name: Install dependencies 21 | run: | 22 | sudo apt-get update -y 23 | sudo apt-get install libgnutls28-dev libcurl4-openssl-dev libssl-dev -y 24 | pip install -r tests/requirements-test.txt 25 | pip install -e . 26 | python -m playwright install --with-deps chromium 27 | - name: Build web 28 | run: | 29 | cd frontend 30 | npm install 31 | npm run build:scylla:prod 32 | - name: Lint with flake8 33 | run: | 34 | make style-check 35 | - name: Test with pytest 36 | run: | 37 | pytest --cov=./scylla tests -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | *.DS_Store 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | scylla/assets/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as 
to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | .hypothesis/ 50 | .pytest_cache/ 51 | 52 | # Translations 53 | *.mo 54 | *.pot 55 | 56 | # Django stuff: 57 | *.log 58 | .static_storage/ 59 | .media/ 60 | local_settings.py 61 | 62 | # Flask stuff: 63 | instance/ 64 | .webassets-cache 65 | 66 | # Scrapy stuff: 67 | .scrapy 68 | 69 | # Sphinx documentation 70 | docs/build/ 71 | 72 | # PyBuilder 73 | target/ 74 | 75 | # Jupyter Notebook 76 | .ipynb_checkpoints 77 | 78 | # pyenv 79 | .python-version 80 | 81 | # celery beat schedule file 82 | celerybeat-schedule 83 | 84 | # SageMath parsed files 85 | *.sage.py 86 | 87 | # Environments 88 | .env 89 | .venv 90 | env/ 91 | venv/ 92 | ENV/ 93 | env.bak/ 94 | venv.bak/ 95 | 96 | # Spyder project settings 97 | .spyderproject 98 | .spyproject 99 | 100 | # Rope project settings 101 | .ropeproject 102 | 103 | # mkdocs documentation 104 | /site 105 | 106 | # mypy 107 | .mypy_cache/ 108 | 109 | # idea 110 | .idea/ 111 | 112 | out/ 113 | 114 | *.db 115 | 116 | # node 117 | node_modules 118 | 119 | .parcel-cache 120 | 121 | *.bak 122 | 123 | package-lock.json 124 | scylla.db-journal 125 | 126 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | formats: 2 | - epub 3 | - htmlzip 4 | 5 | build: 6 | image: latest 7 | 8 | python: 9 | version: 3.6 10 | setup_py_install: false 11 | 12 | -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | // Use IntelliSense to learn about possible attributes. 
3 | // Hover to view descriptions of existing attributes. 4 | // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 5 | "version": "0.2.0", 6 | "configurations": [ 7 | { 8 | "name": "Python: Scylla", 9 | "type": "python", 10 | "request": "launch", 11 | "program": "scylla/__main__.py", 12 | "console": "integratedTerminal", 13 | "env": {"PYTHONPATH": "${workspaceRoot}"} 14 | }, 15 | 16 | { 17 | "name": "Python: Scylla: Test", 18 | "type": "python", 19 | "request": "launch", 20 | "module": "pytest", 21 | "args": [ 22 | "--no-cov" 23 | ], 24 | "console": "integratedTerminal", 25 | "env": {"PYTHONPATH": "${workspaceRoot}"} 26 | } 27 | ] 28 | } -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, race, religion, or sexual identity and orientation. 
6 | 7 | ## Our Standards 8 | 9 | Examples of behavior that contributes to creating a positive environment include: 10 | 11 | * Using welcoming and inclusive language 12 | * Being respectful of differing viewpoints and experiences 13 | * Gracefully accepting constructive criticism 14 | * Focusing on what is best for the community 15 | * Showing empathy towards other community members 16 | 17 | Examples of unacceptable behavior by participants include: 18 | 19 | * The use of sexualized language or imagery and unwelcome sexual attention or advances 20 | * Trolling, insulting/derogatory comments, and personal or political attacks 21 | * Public or private harassment 22 | * Publishing others' private information, such as a physical or electronic address, without explicit permission 23 | * Other conduct which could reasonably be considered inappropriate in a professional setting 24 | 25 | ## Our Responsibilities 26 | 27 | Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. 28 | 29 | Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. 30 | 31 | ## Scope 32 | 33 | This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers. 
34 | 35 | ## Enforcement 36 | 37 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at wildcat.name@gmail.com. The project team will review and investigate all complaints, and will respond in a way that it deems appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. 38 | 39 | Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. 40 | 41 | ## Attribution 42 | 43 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at [http://contributor-covenant.org/version/1/4][version] 44 | 45 | [homepage]: http://contributor-covenant.org 46 | [version]: http://contributor-covenant.org/version/1/4/ 47 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | When contributing to this repository, please first discuss the change you wish to make via issue, 4 | email, or any other method with the owners of this repository before making a change. 5 | 6 | Please note we have a code of conduct; please follow it in all your interactions with the project. 7 | 8 | ## Pull Request Process 9 | 10 | 1. Ensure any install or build dependencies are removed before the end of the layer when doing a 11 | build. 12 | 2. Update the README.md with details of changes to the interface; this includes new environment 13 | variables, exposed ports, useful file locations and container parameters. 14 | 3. Increase the version numbers in any example files and the README.md to the new version that this 15 | Pull Request would represent. 
The versioning scheme we use is [SemVer](http://semver.org/). 16 | 4. You may merge the Pull Request in once you have the sign-off of two other developers, or if you 17 | do not have permission to do that, you may request the second reviewer to merge it for you. 18 | 19 | ## Code of Conduct 20 | 21 | ### Our Pledge 22 | 23 | In the interest of fostering an open and welcoming environment, we as 24 | contributors and maintainers pledge to making participation in our project and 25 | our community a harassment-free experience for everyone, regardless of age, body 26 | size, disability, ethnicity, gender identity and expression, level of experience, 27 | nationality, personal appearance, race, religion, or sexual identity and 28 | orientation. 29 | 30 | ### Our Standards 31 | 32 | Examples of behavior that contributes to creating a positive environment 33 | include: 34 | 35 | * Using welcoming and inclusive language 36 | * Being respectful of differing viewpoints and experiences 37 | * Gracefully accepting constructive criticism 38 | * Focusing on what is best for the community 39 | * Showing empathy towards other community members 40 | 41 | Examples of unacceptable behavior by participants include: 42 | 43 | * The use of sexualized language or imagery and unwelcome sexual attention or 44 | advances 45 | * Trolling, insulting/derogatory comments, and personal or political attacks 46 | * Public or private harassment 47 | * Publishing others' private information, such as a physical or electronic 48 | address, without explicit permission 49 | * Other conduct which could reasonably be considered inappropriate in a 50 | professional setting 51 | 52 | ### Our Responsibilities 53 | 54 | Project maintainers are responsible for clarifying the standards of acceptable 55 | behavior and are expected to take appropriate and fair corrective action in 56 | response to any instances of unacceptable behavior. 
57 | 58 | Project maintainers have the right and responsibility to remove, edit, or 59 | reject comments, commits, code, wiki edits, issues, and other contributions 60 | that are not aligned to this Code of Conduct, or to ban temporarily or 61 | permanently any contributor for other behaviors that they deem inappropriate, 62 | threatening, offensive, or harmful. 63 | 64 | ### Scope 65 | 66 | This Code of Conduct applies both within project spaces and in public spaces 67 | when an individual is representing the project or its community. Examples of 68 | representing a project or community include using an official project e-mail 69 | address, posting via an official social media account, or acting as an appointed 70 | representative at an online or offline event. Representation of a project may be 71 | further defined and clarified by project maintainers. 72 | 73 | ### Enforcement 74 | 75 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 76 | reported by contacting the project team at wildcat.name@gmail.com. All 77 | complaints will be reviewed and investigated and will result in a response that 78 | is deemed necessary and appropriate to the circumstances. The project team is 79 | obligated to maintain confidentiality with regard to the reporter of an incident. 80 | Further details of specific enforcement policies may be posted separately. 81 | 82 | Project maintainers who do not follow or enforce the Code of Conduct in good 83 | faith may face temporary or permanent repercussions as determined by other 84 | members of the project's leadership. 
85 | 86 | ### Attribution 87 | 88 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 89 | available at [http://contributor-covenant.org/version/1/4][version] 90 | 91 | [homepage]: http://contributor-covenant.org 92 | [version]: http://contributor-covenant.org/version/1/4/ 93 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM node:lts as node-build 2 | 3 | WORKDIR /root 4 | 5 | COPY . . 6 | RUN cd frontend && npm install 7 | RUN make assets-build 8 | 9 | FROM ubuntu:focal as python-build 10 | 11 | ENV DEBIAN_FRONTEND=noninteractive 12 | ENV TZ=America/Los_Angeles 13 | 14 | RUN apt-get update && \ 15 | apt-get install -y python3 python3-distutils libpython3-dev curl g++ gcc libxslt-dev make libcurl4-openssl-dev build-essential libssl-dev && \ 16 | update-alternatives --install /usr/bin/python python /usr/bin/python3 1 && \ 17 | curl -sSL https://bootstrap.pypa.io/get-pip.py -o get-pip.py && \ 18 | python get-pip.py && \ 19 | rm get-pip.py && \ 20 | # Feature-parity with node.js base images. 21 | apt-get install -y --no-install-recommends git openssh-client && \ 22 | # clean apt cache 23 | rm -rf /var/lib/apt/lists/* && \ 24 | # Create the pwuser 25 | adduser pwuser 26 | 27 | WORKDIR /app 28 | 29 | COPY --from=node-build /root/scylla/assets ./scylla/assets 30 | COPY requirements.txt . 31 | RUN pip3 install -r requirements.txt 32 | RUN python3 -m playwright install --with-deps chromium 33 | COPY . . 
34 | RUN python3 setup.py install 35 | 36 | RUN mkdir -p /var/www/scylla 37 | VOLUME /var/www/scylla 38 | 39 | RUN python3 -m playwright install chromium --with-deps 40 | 41 | EXPOSE 8899 42 | EXPOSE 8081 43 | 44 | CMD python3 -m scylla --db-path /var/www/scylla/scylla.db 45 | -------------------------------------------------------------------------------- /Dockerfile-alpine: -------------------------------------------------------------------------------- 1 | FROM python:3.9-alpine as build 2 | 3 | RUN apk add --update --no-cache g++ gcc libxslt-dev make build-base curl-dev openssl-dev 4 | 5 | RUN mkdir -p /var/www/scylla 6 | WORKDIR /var/www/scylla 7 | 8 | RUN pip install scylla 9 | 10 | FROM python:3.9-alpine as prod 11 | 12 | LABEL maintainer="WildCat " 13 | 14 | RUN apk add --update --no-cache libxslt-dev 15 | 16 | COPY --from=build /usr/local/lib/python3.9/site-packages/ /usr/local/lib/python3.9/site-packages/ 17 | 18 | WORKDIR /var/www/scylla 19 | VOLUME /var/www/scylla 20 | 21 | EXPOSE 8899 22 | EXPOSE 8081 23 | 24 | CMD python -m scylla 25 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. 
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2018 Michael Chong 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | # Include the README 2 | include *.md 3 | include *.rst 4 | 5 | # Include the license file 6 | include LICENSE 7 | 8 | # requirements.txt 9 | include requirements.txt 10 | 11 | # Include the assets 12 | recursive-include scylla/assets * -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | default: 2 | make assets-build 3 | make package-build 4 | make upload 5 | package-build: 6 | rm -rf dist 7 | python setup.py bdist_wheel --universal 8 | upload: 9 | twine upload dist/* 10 | tag: 11 | git tag $(TAG) -m '$(MSG)' && git push --tags origin master 12 | delete-tag: 13 | git tag --delete $(TAG); git push --delete origin $(TAG) 14 | assets-build: 15 | make assets-clean 16 | cd frontend && NODE_ENV=production npm run build:scylla:prod 17 | assets-dev: 18 | cd frontend && npm run dev 19 | assets-clean: 20 | rm -rf scylla/assets 21 | rm -rf 
build/lib/scylla/assets 22 | rm -rf dist/scylla/scylla/assets 23 | doc: 24 | make doc-en 25 | make doc-zh 26 | doc-en: 27 | cd docs/source && sphinx-apidoc -f -o . ../../scylla 28 | cd docs && PYTHONPATH=../ make html 29 | doc-zh: 30 | cd docs_zh/source && sphinx-apidoc -f -o . ../../scylla 31 | cd docs_zh && PYTHONPATH=../ make html 32 | style-check: 33 | flake8 . --count --config=.flake8.cfg --select=E901,E999,F821,F822,F823 --show-source --statistics 34 | test: 35 | make style-check 36 | pytest --cov=./scylla tests 37 | -------------------------------------------------------------------------------- /PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | Thank you very much for contributing to this project. You are making the world better! 🎉 2 | You might follow this template, but it is not mandatory. 3 | 4 | ## Proposed Changes 5 | 6 | - 7 | - 8 | - 9 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![banner_scylla](https://github.com/imWildCat/scylla/assets/2396817/62498a29-8105-4281-8eb0-73436d4ed5b0) [![Build Status](https://travis-ci.org/imWildCat/scylla.svg?branch=master)](https://travis-ci.org/imWildCat/scylla) 2 | [![codecov](https://codecov.io/gh/imWildCat/scylla/branch/master/graph/badge.svg)](https://codecov.io/gh/imWildCat/scylla) 3 | [![Documentation Status](https://readthedocs.org/projects/scylla-py/badge/?version=latest)](https://scylla.wildcat.io/en/latest/?badge=latest) 4 | [![PyPI version](https://badge.fury.io/py/scylla.svg)](https://badge.fury.io/py/scylla) 5 | [![Docker Pull](https://img.shields.io/docker/pulls/wildcat/scylla.svg)](https://hub.docker.com/r/wildcat/scylla/) 6 | [![Donate](https://img.shields.io/badge/Donate-PayPal-green.svg)](https://www.paypal.com/cgi-bin/webscr?cmd=_s-xclick&hosted_button_id=5DXFA7WGWPZBN) 7 | 8 | 9 | # Scylla 10 | 11 |
An intelligent proxy pool for humanities, to extract content from the internet and build your own Large Language Models in this new AI era. 12 | 13 | Key features: 14 | 15 | - Automatic proxy ip crawling and validation 16 | - Easy-to-use JSON API 17 | - Simple but beautiful web-based user interface (eg. geographical 18 | distribution of proxies) 19 | - Get started with only **1 command** minimally 20 | - Simple HTTP Forward proxy server 21 | - [Scrapy] and [requests] integration with only 1 line of code 22 | minimally 23 | - Headless browser crawling 24 | 25 | 26 | Get started 27 | =========== 28 | 29 | Installation 30 | ------------ 31 | 32 | ### Install with Docker (highly recommended) 33 | 34 | ```bash 35 | docker run -d -p 8899:8899 -p 8081:8081 -v /var/www/scylla:/var/www/scylla --name scylla wildcat/scylla:latest 36 | ``` 37 | 38 | ### Install directly via pip 39 | 40 | ```bash 41 | pip install scylla 42 | scylla --help 43 | scylla # Run the crawler and web server for JSON API 44 | ``` 45 | 46 | ### Install from source 47 | 48 | ```bash 49 | git clone https://github.com/imWildCat/scylla.git 50 | cd scylla 51 | 52 | pip install -r requirements.txt 53 | 54 | cd frontend 55 | npm install 56 | cd .. 57 | 58 | make assets-build 59 | 60 | python -m scylla 61 | ``` 62 | 63 | Usage 64 | ----- 65 | 66 | This is an example of running a service locally (`localhost`), using 67 | port `8899`. 68 | 69 | Note: You might have to wait for 1 to 2 minutes in order to get some proxy ips populated in the database for the first time you use Scylla. 
70 | 71 | ### JSON API 72 | 73 | #### Proxy IP List 74 | 75 | ```bash 76 | http://localhost:8899/api/v1/proxies 77 | ``` 78 | 79 | Optional URL parameters: 80 | 81 | | Parameters | Default value | Description | 82 | | ----------- | ------------- | ------------------------------------------------------------ | 83 | | `page` | `1` | The page number | 84 | | `limit` | `20` | The number of proxies shown on each page | 85 | | `anonymous` | `any` | Show anonymous proxies or not. Possible values:`true`, only anonymous proxies; `false`, only transparent proxies | 86 | | `https` | `any` | Show HTTPS proxies or not. Possible values:`true`, only HTTPS proxies; `false`, only HTTP proxies | 87 | | `countries` | None | Filter proxies for specific countries. Format example: ``US``, or multi-countries: `US,GB` | 88 | 89 | Sample result: 90 | 91 | ```json 92 | { 93 | "proxies": [{ 94 | "id": 599, 95 | "ip": "91.229.222.163", 96 | "port": 53281, 97 | "is_valid": true, 98 | "created_at": 1527590947, 99 | "updated_at": 1527593751, 100 | "latency": 23.0, 101 | "stability": 0.1, 102 | "is_anonymous": true, 103 | "is_https": true, 104 | "attempts": 1, 105 | "https_attempts": 0, 106 | "location": "54.0451,-0.8053", 107 | "organization": "AS57099 Boundless Networks Limited", 108 | "region": "England", 109 | "country": "GB", 110 | "city": "Malton" 111 | }, { 112 | "id": 75, 113 | "ip": "75.151.213.85", 114 | "port": 8080, 115 | "is_valid": true, 116 | "created_at": 1527590676, 117 | "updated_at": 1527593702, 118 | "latency": 268.0, 119 | "stability": 0.3, 120 | "is_anonymous": true, 121 | "is_https": true, 122 | "attempts": 1, 123 | "https_attempts": 0, 124 | "location": "32.3706,-90.1755", 125 | "organization": "AS7922 Comcast Cable Communications, LLC", 126 | "region": "Mississippi", 127 | "country": "US", 128 | "city": "Jackson" 129 | }, 130 | ... 
131 | ], 132 | "count": 1025, 133 | "per_page": 20, 134 | "page": 1, 135 | "total_page": 52 136 | } 137 | ``` 138 | 139 | #### System Statistics 140 | 141 | ```bash 142 | http://localhost:8899/api/v1/stats 143 | ``` 144 | 145 | Sample result: 146 | 147 | ```json 148 | { 149 | "median": 181.2566407083, 150 | "valid_count": 1780, 151 | "total_count": 9528, 152 | "mean": 174.3290085201 153 | } 154 | ``` 155 | 156 | ### HTTP Forward Proxy Server 157 | 158 | By default, Scylla will start a HTTP Forward Proxy Server on port 159 | `8081`. This server will select one proxy updated recently from the 160 | database and it will be used for forward proxy. Whenever an HTTP request 161 | comes, the proxy server will select a proxy randomly. 162 | 163 | Note: HTTPS requests are not supported at present. 164 | 165 | The example for `curl` using this proxy server is shown below: 166 | 167 | ```bash 168 | curl http://api.ipify.org -x http://127.0.0.1:8081 169 | ``` 170 | 171 | You could also use this feature with [requests][]: 172 | 173 | ```python 174 | requests.get('http://api.ipify.org', proxies={'http': 'http://127.0.0.1:8081'}) 175 | ``` 176 | 177 | ### Web UI 178 | 179 | Open `http://localhost:8899` in your browser to see the Web UI of this 180 | project. 181 | 182 | #### Proxy IP List 183 | 184 | ``` 185 | http://localhost:8899/ 186 | ``` 187 | 188 | Screenshot: 189 | 190 | ![screenshot-proxy-list](https://user-images.githubusercontent.com/2396817/40653600-946eae6e-6333-11e8-8bbd-9d2f347c5461.png) 191 | 192 | #### Globally Geographical Distribution Map 193 | 194 | ``` 195 | http://localhost:8899/#/geo 196 | ``` 197 | 198 | Screenshot: 199 | 200 | ![screenshot-geo-distribution](https://user-images.githubusercontent.com/2396817/40653599-9458b6b8-6333-11e8-8e6e-1d90271fc083.png) 201 | 202 | API Documentation 203 | ================= 204 | 205 | Please read [Module 206 | Index](https://scylla.wildcat.io/en/latest/py-modindex.html). 
207 | 208 | Roadmap 209 | ======= 210 | 211 | Please see [Projects](https://github.com/imWildCat/scylla/projects). 212 | 213 | Development and Contribution 214 | ============================ 215 | 216 | ```bash 217 | git clone https://github.com/imWildCat/scylla.git 218 | cd scylla 219 | 220 | pip install -r requirements.txt 221 | 222 | npm install 223 | make assets-build 224 | ``` 225 | 226 | Testing 227 | ======= 228 | 229 | If you wish to run tests locally, the commands are shown below: 230 | 231 | ```bash 232 | pip install -r tests/requirements-test.txt 233 | pytest tests/ 234 | ``` 235 | 236 | You are welcome to add more test cases to this project, increasing the 237 | robustness of this project. 238 | 239 | Naming of This Project 240 | ====================== 241 | 242 | [Scylla](http://prisonbreak.wikia.com/wiki/Scylla) is derived from the 243 | name of a group of memory chips in the American TV series, [Prison 244 | Break](https://en.wikipedia.org/wiki/Prison_Break). This project was 245 | named after this American TV series to pay tribute to it. 246 | 247 | Help 248 | ====================== 249 | [How to install Python Scylla on CentOS7](https://digcodes.com/how-to-install-python-scylla-on-centos7/) 250 | 251 | 252 | Donation 253 | ======== 254 | 255 | If you find this project useful, could you please donate some money to 256 | it? 257 | 258 | No matter how much the money is, your donation will inspire the author 259 | to develop new features continuously! 🎉 Thank you! 260 | 261 | The ways for donation are shown below: 262 | 263 | GitHub Sponsor 264 | ------ 265 | 266 | I would greatly appreciate it if you could join my sponsors here. 267 | 268 | 269 | 270 | PayPal 271 | ------ 272 | 273 | [![paypal_donation](https://www.paypalobjects.com/en_US/i/btn/btn_donateCC_LG.gif)](https://www.paypal.com/cgi-bin/webscr?cmd=_s-xclick&hosted_button_id=5DXFA7WGWPZBN) 274 | 275 | 276 | License 277 | ======= 278 | 279 | Apache License 2.0.
For more details, please read the 280 | [LICENSE](https://github.com/imWildCat/scylla/blob/master/LICENSE) file. 281 | 282 | [Alipay and WeChat Donation]: https://user-images.githubusercontent.com/2396817/40589594-cfb0e49e-61e7-11e8-8f7d-c55a29676c40.png 283 | 284 | 285 | [Scrapy]: https://scrapy.org 286 | [requests]: http://docs.python-requests.org/ 287 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | |Scylla Banner| |Build Status| |codecov| |Documentation Status| |PyPI version| |Docker Build Status| |PayPal Donation| 2 | ============================================================================================================================================== 3 | 4 | An intelligent proxy pool for humanities, only supports Python 3.6. Key features: 5 | 6 | - Automatic proxy ip crawling and validation 7 | - Easy-to-use JSON API 8 | - Simple but beautiful web-based user interface (eg. geographical 9 | distribution of proxies) 10 | - Get started with only **1 command** minimally 11 | - Simple HTTP Forward proxy server 12 | - `Scrapy`_ and `requests`_ integration with only 1 line of code minimally 13 | - Headless browser crawling 14 | 15 | 16 | Documentation 17 | ------------- 18 | 19 | Please read the `Documentation`_. 20 | 21 | Quick start 22 | ----------- 23 | 24 | Installation 25 | """""""""""" 26 | 27 | Install with Docker (highly recommended) 28 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 29 | .. code-block:: shell 30 | 31 | docker run -d -p 8899:8899 -p 8081:8081 -v /var/www/scylla:/var/www/scylla --name scylla wildcat/scylla:latest 32 | 33 | Install directly via pip 34 | ^^^^^^^^^^^^^^^^^^^^^^^^^ 35 | 36 | .. code:: bash 37 | 38 | pip install scylla 39 | scylla --help 40 | scylla # Run the crawler and web server for JSON API 41 | 42 | Install from source 43 | ^^^^^^^^^^^^^^^^^^^^^^^ 44 | 45 | .. 
code:: bash 46 | 47 | git clone https://github.com/imWildCat/scylla.git 48 | cd scylla 49 | 50 | pip install -r requirements.txt 51 | 52 | npm install 53 | make assets-build 54 | 55 | python -m scylla 56 | 57 | For Windows users who fail at installing ``sanic`` due to ``uvloop does not support Windows at the moment``: 58 | 59 | .. code:: bash 60 | 61 | export SANIC_NO_UVLOOP=true 62 | export SANIC_NO_UJSON=true 63 | pip3 install sanic 64 | 65 | If this also fails, you will need to manually install sanic from source. 66 | 67 | 68 | Roadmap 69 | -------------- 70 | 71 | Please see `Projects`_. 72 | 73 | Development and Contribution 74 | ---------------------------- 75 | 76 | .. code:: bash 77 | 78 | git clone https://github.com/imWildCat/scylla.git 79 | cd scylla 80 | 81 | pip install -r requirements.txt 82 | 83 | npm install 84 | make assets-build 85 | 86 | Testing 87 | ------- 88 | 89 | If you wish to run tests locally, the commands are shown below: 90 | 91 | .. code:: bash 92 | 93 | pip install -r tests/requirements-test.txt 94 | pytest tests/ 95 | 96 | You are welcome to add more test cases to this project, increasing the robustness of this project. 97 | 98 | Naming of This Project 99 | ---------------------- 100 | `Scylla`_ is derived from the name of a group of memory chips in the American TV series, `Prison Break`_. This project was named after this American TV series to pay tribute to it. 101 | 102 | 103 | Donation 104 | ---------------------- 105 | If you find this project useful, could you please donate some money to it? 106 | 107 | No matter how much the money is, your donation will inspire the author to develop new features continuously! 🎉 108 | Thank you! 109 | 110 | The ways for donation are shown below: 111 | 112 | PayPal 113 | """""" 114 | |PayPal Donation Official| 115 | 116 | Alipay or WeChat Pay 117 | """""""""""""""""""" 118 | |Alipay and WeChat Donation| 119 | 120 | 121 | License 122 | ------- 123 | 124 | Apache License 2.0.
For more details, please read the 125 | `LICENSE`_ file. 126 | 127 | .. _Module Index: https://scylla.wildcat.io/en/latest/py-modindex.html 128 | .. _Projects: https://github.com/imWildCat/scylla/projects 129 | .. _LICENSE: https://github.com/imWildCat/scylla/blob/master/LICENSE 130 | .. _Travis CI: https://travis-ci.org/imWildCat/scylla 131 | .. _Scylla: http://prisonbreak.wikia.com/wiki/Scylla 132 | .. _Prison Break: https://en.wikipedia.org/wiki/Prison_Break 133 | .. _中文文档: https://scylla.wildcat.io/zh/latest/ 134 | .. _Chinese Documentation: https://scylla.wildcat.io/zh/stable/ 135 | .. _Documentation: https://scylla.wildcat.io/en/stable/ 136 | .. _Scrapy: https://scrapy.org 137 | .. _requests: http://docs.python-requests.org/ 138 | 139 | 140 | .. |Scylla Banner| image:: https://user-images.githubusercontent.com/2396817/40580477-f15a15b8-6136-11e8-9f4b-1f012e90712c.png 141 | .. |Build Status| image:: https://travis-ci.org/imWildCat/scylla.svg?branch=master 142 | :target: https://travis-ci.org/imWildCat/scylla 143 | .. |codecov| image:: https://codecov.io/gh/imWildCat/scylla/branch/master/graph/badge.svg 144 | :target: https://codecov.io/gh/imWildCat/scylla 145 | .. |Documentation Status| image:: https://readthedocs.org/projects/scylla-py/badge/?version=latest 146 | :target: https://scylla.wildcat.io/en/latest/?badge=latest 147 | .. |PyPI version| image:: https://badge.fury.io/py/scylla.svg 148 | :target: https://badge.fury.io/py/scylla 149 | .. |Docker Build Status| image:: https://img.shields.io/docker/build/wildcat/scylla.svg 150 | :target: https://hub.docker.com/r/wildcat/scylla/ 151 | .. |PayPal Donation| image:: https://img.shields.io/badge/Donate-PayPal-green.svg 152 | :target: https://www.paypal.com/cgi-bin/webscr?cmd=_s-xclick&hosted_button_id=5DXFA7WGWPZBN 153 | .. 
|PayPal Donation Official| image:: https://www.paypalobjects.com/en_US/i/btn/btn_donateCC_LG.gif 154 | :target: https://www.paypal.com/cgi-bin/webscr?cmd=_s-xclick&hosted_button_id=5DXFA7WGWPZBN 155 | .. |Alipay and WeChat Donation| image:: https://user-images.githubusercontent.com/2396817/40589594-cfb0e49e-61e7-11e8-8f7d-c55a29676c40.png 156 | :target: https://user-images.githubusercontent.com/2396817/40589594-cfb0e49e-61e7-11e8-8f7d-c55a29676c40.png 157 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.8' 2 | 3 | services: 4 | scylla: 5 | build: 6 | context: . 7 | target: prod 8 | volumes: 9 | - /var/www/scylla:/var/www/scylla 10 | ports: 11 | - "8899:8899" 12 | - "8081:8081" 13 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = scylla 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /docs/requirements-rtfd.txt: -------------------------------------------------------------------------------- 1 | tornado==6.3.3 2 | peewee==3.2.2 3 | requests==2.31.0 4 | pycurl==7.43.0.1 5 | schedule==0.5.0 6 | six==1.11.0 7 | mock 8 | playwright==1.9.2 9 | pyquery==1.4.3 10 | -------------------------------------------------------------------------------- /docs/source/_templates/sidebarintro.html: -------------------------------------------------------------------------------- 1 | 7 | 8 |

9 | 11 |

12 | 13 |

Quick Links

14 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Configuration file for the Sphinx documentation builder. 4 | # 5 | # This file does only contain a selection of the most common options. For a 6 | # full list see the documentation: 7 | # http://www.sphinx-doc.org/en/stable/config 8 | 9 | # -- Path setup -------------------------------------------------------------- 10 | 11 | # If extensions (or modules to document with autodoc) are in another directory, 12 | # add these directories to sys.path here. If the directory is relative to the 13 | # documentation root, use os.path.abspath to make it absolute, like shown here. 14 | # 15 | # import os 16 | # sys.path.insert(0, os.path.abspath('.')) 17 | import sys 18 | from unittest.mock import MagicMock 19 | 20 | # -- Project information ----------------------------------------------------- 21 | 22 | project = 'scylla' 23 | copyright = '2018, WildCat' 24 | author = 'WildCat' 25 | 26 | # The short X.Y version 27 | version = '' 28 | # The full version, including alpha/beta/rc tags 29 | release = '' 30 | 31 | # -- General configuration --------------------------------------------------- 32 | 33 | # If your documentation needs a minimal Sphinx version, state it here. 34 | # 35 | # needs_sphinx = '1.0' 36 | 37 | # Add any Sphinx extension module names here, as strings. They can be 38 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 39 | # ones. 40 | extensions = [ 41 | 'sphinx.ext.autodoc', 42 | 'sphinx.ext.githubpages', 43 | 'sphinx.ext.viewcode', 44 | ] 45 | 46 | # Add any paths that contain templates here, relative to this directory. 47 | templates_path = ['_templates'] 48 | 49 | # The suffix(es) of source filenames. 
50 | # You can specify multiple suffix as a list of string: 51 | # 52 | # source_suffix = ['.rst', '.md'] 53 | source_suffix = '.rst' 54 | 55 | # The master toctree document. 56 | master_doc = 'index' 57 | 58 | # The language for content autogenerated by Sphinx. Refer to documentation 59 | # for a list of supported languages. 60 | # 61 | # This is also used if you do content translation via gettext catalogs. 62 | # Usually you set "language" from the command line for these cases. 63 | language = None 64 | 65 | # List of patterns, relative to source directory, that match files and 66 | # directories to ignore when looking for source files. 67 | # This pattern also affects html_static_path and html_extra_path . 68 | exclude_patterns = [] 69 | 70 | # The name of the Pygments (syntax highlighting) style to use. 71 | pygments_style = 'sphinx' 72 | 73 | # -- Options for HTML output ------------------------------------------------- 74 | 75 | # The theme to use for HTML and HTML Help pages. See the documentation for 76 | # a list of builtin themes. 77 | # 78 | html_theme = 'alabaster' 79 | 80 | # Theme options are theme-specific and customize the look and feel of a theme 81 | # further. For a list of options available for each theme, see the 82 | # documentation. 83 | # 84 | # html_theme_options = {} 85 | 86 | # Add any paths that contain custom static files (such as style sheets) here, 87 | # relative to this directory. They are copied after the builtin static files, 88 | # so a file named "default.css" will overwrite the builtin "default.css". 89 | html_static_path = ['_static'] 90 | 91 | # Custom sidebar templates, must be a dictionary that maps document names 92 | # to template names. 93 | # 94 | # The default sidebars (for documents that don't match any pattern) are 95 | # defined by theme itself. Builtin themes are using these templates by 96 | # default: ``['localtoc.html', 'relations.html', 'sourcelink.html', 97 | # 'searchbox.html']``. 
98 | # 99 | # html_sidebars = {} 100 | html_sidebars = { 101 | '**': ['sidebarintro.html', 'localtoc.html', 'relations.html', 'sourcelink.html', 'searchbox.html', ], 102 | } 103 | 104 | # -- Options for HTMLHelp output --------------------------------------------- 105 | 106 | # Output file base name for HTML help builder. 107 | htmlhelp_basename = 'scylladoc' 108 | 109 | # -- Options for LaTeX output ------------------------------------------------ 110 | 111 | latex_elements = { 112 | # The paper size ('letterpaper' or 'a4paper'). 113 | # 114 | # 'papersize': 'letterpaper', 115 | 116 | # The font size ('10pt', '11pt' or '12pt'). 117 | # 118 | # 'pointsize': '10pt', 119 | 120 | # Additional stuff for the LaTeX preamble. 121 | # 122 | # 'preamble': '', 123 | 124 | # Latex figure (float) alignment 125 | # 126 | # 'figure_align': 'htbp', 127 | } 128 | 129 | # Grouping the document tree into LaTeX files. List of tuples 130 | # (source start file, target name, title, 131 | # author, documentclass [howto, manual, or own class]). 132 | latex_documents = [ 133 | (master_doc, 'scylla.tex', 'scylla Documentation', 134 | 'WildCat', 'manual'), 135 | ] 136 | 137 | # -- Options for manual page output ------------------------------------------ 138 | 139 | # One entry per manual page. List of tuples 140 | # (source start file, name, description, authors, manual section). 141 | man_pages = [ 142 | (master_doc, 'scylla', 'scylla Documentation', 143 | [author], 1) 144 | ] 145 | 146 | # -- Options for Texinfo output ---------------------------------------------- 147 | 148 | # Grouping the document tree into Texinfo files. 
List of tuples 149 | # (source start file, target name, title, author, 150 | # dir menu entry, description, category) 151 | texinfo_documents = [ 152 | (master_doc, 'scylla', 'scylla Documentation', 153 | author, 'scylla', 'One line description of project.', 154 | 'Miscellaneous'), 155 | ] 156 | 157 | members_to_watch = ['function', ] 158 | 159 | # Re: https://github.com/dabercro/OpsSpace/blob/880c58f6a6172924ca03145916f6a27cf6633684/docs/conf.py 160 | 161 | 162 | class Mock(MagicMock): 163 | @classmethod 164 | def __getattr__(cls, name): 165 | return MagicMock() 166 | 167 | 168 | MOCK_MODULES = ['pycurl', ] 169 | sys.modules.update((mod_name, Mock()) for mod_name in MOCK_MODULES) 170 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | Scylla: An Intelligent Proxy Pool for Humanities™ 2 | ================================================== 3 | 4 | An intelligent proxy pool for humanities, only supports Python 3.6. Key 5 | features: 6 | 7 | - Automatic proxy ip crawling and validation 8 | - Easy-to-use JSON API 9 | - Simple but beautiful web-based user interface (eg. geographical 10 | distribution of proxies) 11 | - Get started with only **1 command** minimally 12 | - Simple HTTP Forward proxy server 13 | - `Scrapy`_ and `requests`_ integration with only 1 line of code minimally 14 | - Headless browser crawling 15 | 16 | 17 | Get started 18 | ----------- 19 | 20 | Installation 21 | """""""""""" 22 | 23 | Install with Docker (highly recommended) 24 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 25 | .. code-block:: shell 26 | 27 | docker run -d -p 8899:8899 -p 8081:8081 -v /var/www/scylla:/var/www/scylla --name scylla wildcat/scylla:latest 28 | 29 | Install directly via pip 30 | ^^^^^^^^^^^^^^^^^^^^^^^^^ 31 | 32 | .. 
code:: bash 33 | 34 | pip install scylla 35 | scylla --help 36 | scylla # Run the crawler and web server for JSON API 37 | 38 | Install from source 39 | ^^^^^^^^^^^^^^^^^^^^^^^ 40 | 41 | .. code:: bash 42 | 43 | git clone https://github.com/imWildCat/scylla.git 44 | cd scylla 45 | 46 | pip install -r requirements.txt 47 | python -m playwright install --with-deps chromium 48 | 49 | npm install 50 | make build-assets 51 | 52 | python -m scylla 53 | 54 | For Windows users who fail to install ``sanic`` due to ``uvloop does not support Windows at the moment``: 55 | 56 | .. code:: bash 57 | 58 | export SANIC_NO_UVLOOP=true 59 | export SANIC_NO_UJSON=true 60 | pip3 install sanic 61 | 62 | If this also fails, you will need to manually install sanic from source. 63 | 64 | Usage 65 | """"" 66 | 67 | This is an example of running a service locally (``localhost``), using port ``8899``. 68 | 69 | Note: You might have to wait for 1 to 2 minutes in order to get some proxy ips populated in the database for the first time you use Scylla. 70 | 71 | JSON API 72 | ^^^^^^^^^^^^^^^^^^ 73 | 74 | Proxy IP List 75 | ~~~~~~~~~~~~~~~~~~~~ 76 | 77 | .. code:: shell 78 | 79 | http://localhost:8899/api/v1/proxies 80 | 81 | Optional URL parameters: 82 | 83 | ========== ============= ================================================================= 84 | Parameters Default value Description 85 | ========== ============= ================================================================= 86 | page ``1`` The page number 87 | limit ``20`` The number of proxies shown on each page 88 | anonymous ``any`` Show anonymous proxies or not. Possible values:``true``, only anonymous proxies; ``false``, only transparent proxies 89 | https ``any`` Show HTTPS proxies or not. Possible values:``true``, only HTTPS proxies; ``false``, only HTTP proxies 90 | countries None Filter proxies for specific countries.
Format example: ``US``, or multi-countries: ``US,GB`` 91 | ========== ============= ================================================================= 92 | 93 | Sample result: 94 | 95 | .. code:: json 96 | 97 | { 98 | "proxies": [{ 99 | "id": 599, 100 | "ip": "91.229.222.163", 101 | "port": 53281, 102 | "is_valid": true, 103 | "created_at": 1527590947, 104 | "updated_at": 1527593751, 105 | "latency": 23.0, 106 | "stability": 0.1, 107 | "is_anonymous": true, 108 | "is_https": true, 109 | "attempts": 1, 110 | "https_attempts": 0, 111 | "location": "54.0451,-0.8053", 112 | "organization": "AS57099 Boundless Networks Limited", 113 | "region": "England", 114 | "country": "GB", 115 | "city": "Malton" 116 | }, { 117 | "id": 75, 118 | "ip": "75.151.213.85", 119 | "port": 8080, 120 | "is_valid": true, 121 | "created_at": 1527590676, 122 | "updated_at": 1527593702, 123 | "latency": 268.0, 124 | "stability": 0.3, 125 | "is_anonymous": true, 126 | "is_https": true, 127 | "attempts": 1, 128 | "https_attempts": 0, 129 | "location": "32.3706,-90.1755", 130 | "organization": "AS7922 Comcast Cable Communications, LLC", 131 | "region": "Mississippi", 132 | "country": "US", 133 | "city": "Jackson" 134 | }, 135 | ... 136 | ], 137 | "count": 1025, 138 | "per_page": 20, 139 | "page": 1, 140 | "total_page": 52 141 | } 142 | 143 | System Statistics 144 | ~~~~~~~~~~~~~~~~~ 145 | 146 | .. code:: shell 147 | 148 | http://localhost:8899/api/v1/stats 149 | 150 | Sample result: 151 | 152 | .. code:: json 153 | 154 | { 155 | "median": 181.2566407083, 156 | "valid_count": 1780, 157 | "total_count": 9528, 158 | "mean": 174.3290085201 159 | } 160 | 161 | HTTP Forward Proxy Server 162 | ^^^^^^^^^^^^^^^^^^^^^^^^^ 163 | 164 | By default, Scylla will start a HTTP Forward Proxy Server on port ``8081``. 165 | This server will select one proxy updated recently from the database and it will be used for forward proxy. 166 | Whenever an HTTP request comes, the proxy server will select a proxy randomly. 
167 | 168 | Note: HTTPS requests are not supported at present. 169 | 170 | The example for ``curl`` using this proxy server is shown below: 171 | 172 | .. code:: shell 173 | 174 | curl http://api.ipify.org -x http://127.0.0.1:8081 175 | 176 | You could also use this feature with `requests`_: 177 | 178 | .. code:: python 179 | 180 | requests.get('http://api.ipify.org', proxies={'http': 'http://127.0.0.1:8081'}) 181 | 182 | Web UI 183 | ^^^^^^^^^^^^^^^^^^ 184 | 185 | Open ``http://localhost:8899`` in your browser to see the Web UI of this project. 186 | 187 | Proxy IP List 188 | ~~~~~~~~~~~~~~~~~~~~ 189 | 190 | .. code:: shell 191 | 192 | http://localhost:8899/ 193 | 194 | Screenshot: 195 | 196 | |screenshot-proxy-list| 197 | 198 | Globally Geographical Distribution Map 199 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 200 | 201 | .. code:: shell 202 | 203 | http://localhost:8899/#/geo 204 | 205 | Screenshot: 206 | 207 | |screenshot-geo-distribution| 208 | 209 | Other Examples 210 | -------------- 211 | 212 | .. toctree:: 213 | :maxdepth: 1 214 | 215 | requests_integration 216 | 217 | System Design 218 | ------------- 219 | 220 | .. toctree:: 221 | :maxdepth: 1 222 | 223 | validation_policy 224 | 225 | API Documentation 226 | ----------------- 227 | 228 | Please read :ref:`modindex`. 229 | 230 | Roadmap 231 | -------------- 232 | 233 | Please see `Projects`_. 234 | 235 | Development and Contribution 236 | ---------------------------- 237 | 238 | .. code:: bash 239 | 240 | git clone https://github.com/imWildCat/scylla.git 241 | cd scylla 242 | 243 | pip install -r requirements.txt 244 | 245 | npm install 246 | make build-assets 247 | 248 | Testing 249 | ------- 250 | 251 | If you wish to run tests locally, the commands are shown below: 252 | 253 | .. code:: bash 254 | 255 | pip install -r tests/requirements-test.txt 256 | pytest tests/ 257 | 258 | You are welcome to add more test cases to improve the robustness of this project.
259 | 260 | Naming of This Project 261 | ---------------------- 262 | `Scylla`_ is derived from the name of a group of memory chips in the American TV series, `Prison Break`_. This project was named after this American TV series to pay tribute to it. 263 | 264 | Donation 265 | ---------------------- 266 | If you find this project useful, could you please donate some money to it? 267 | 268 | No matter the amount, your donation will inspire the author to keep developing new features! 🎉 269 | 270 | Thank you! 271 | 272 | The ways for donation are shown below: 273 | 274 | PayPal 275 | """""" 276 | |PayPal Donation Official| 277 | 278 | Alipay or WeChat Pay 279 | """""""""""""""""""" 280 | |Alipay and WeChat Donation| 281 | 282 | License 283 | ------- 284 | 285 | Apache License 2.0. For more details, please read the 286 | `LICENSE`_ file. 287 | 288 | 289 | .. _Projects: https://github.com/imWildCat/scylla/projects 290 | .. _LICENSE: https://github.com/imWildCat/scylla/blob/master/LICENSE 291 | .. _Travis CI: https://travis-ci.org/imWildCat/scylla 292 | .. _Scylla: http://prisonbreak.wikia.com/wiki/Scylla 293 | .. _Prison Break: https://en.wikipedia.org/wiki/Prison_Break 294 | .. _中文文档: https://scylla.wildcat.io/zh/latest/ 295 | .. _Chinese Documentation: https://scylla.wildcat.io/zh/latest/ 296 | .. _Scrapy: https://scrapy.org 297 | .. _requests: http://docs.python-requests.org/ 298 | 299 | .. |screenshot-geo-distribution| image:: https://user-images.githubusercontent.com/2396817/40653599-9458b6b8-6333-11e8-8e6e-1d90271fc083.png 300 | .. |screenshot-proxy-list| image:: https://user-images.githubusercontent.com/2396817/40653600-946eae6e-6333-11e8-8bbd-9d2f347c5461.png 301 | 302 | .. |PayPal Donation Official| image:: https://www.paypalobjects.com/en_US/i/btn/btn_donateCC_LG.gif 303 | :target: https://www.paypal.com/cgi-bin/webscr?cmd=_s-xclick&hosted_button_id=5DXFA7WGWPZBN 304 | ..
|Alipay and WeChat Donation| image:: https://user-images.githubusercontent.com/2396817/40589594-cfb0e49e-61e7-11e8-8f7d-c55a29676c40.png 305 | :target: https://user-images.githubusercontent.com/2396817/40589594-cfb0e49e-61e7-11e8-8f7d-c55a29676c40.png 306 | 307 | Indices and tables 308 | ================== 309 | 310 | * :ref:`genindex` 311 | * :ref:`modindex` 312 | * :ref:`search` 313 | -------------------------------------------------------------------------------- /docs/source/modules.rst: -------------------------------------------------------------------------------- 1 | scylla 2 | ====== 3 | 4 | .. toctree:: 5 | :maxdepth: 4 6 | 7 | scylla 8 | -------------------------------------------------------------------------------- /docs/source/requests_integration.rst: -------------------------------------------------------------------------------- 1 | .. _requests_integration: 2 | 3 | Example with Requests 4 | ===================== 5 | 6 | `Requests`_ is a very nice and mature HTTP library for Python. Using Scylla with this library is very easy. 7 | 8 | With the JSON API 9 | ----------------- 10 | 11 | .. code:: python 12 | 13 | import requests 14 | import random 15 | 16 | json_resp = requests.get('http://localhost:8899/api/v1/proxies').json() 17 | proxy = random.choice(json_resp['proxies']) 18 | 19 | requests.get('http://api.ipify.org', proxies={'http': 'http://{}:{}'.format(proxy['ip'], proxy['port'])}) 20 | 21 | HTTPS proxies are supported as well: 22 | 23 | .. code:: python 24 | 25 | import requests 26 | import random 27 | 28 | json_resp = requests.get('http://localhost:8899/api/v1/proxies?https=true').json() 29 | proxy = random.choice(json_resp['proxies']) 30 | 31 | requests.get('https://api.ipify.org', proxies={'https': 'https://{}:{}'.format(proxy['ip'], proxy['port'])}) 32 | 33 | 34 | 35 | 36 | With the forward proxy server 37 | ----------------------------- 38 | 39 | ..
code:: python 40 | 41 | requests.get('http://api.ipify.org', proxies={'http': 'http://127.0.0.1:8081'}) 42 | 43 | 44 | 45 | 46 | .. _Requests: http://docs.python-requests.org/ -------------------------------------------------------------------------------- /docs/source/scylla.providers.rst: -------------------------------------------------------------------------------- 1 | scylla.providers package 2 | ======================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | scylla.providers.a2u\_provider module 8 | ------------------------------------- 9 | 10 | .. automodule:: scylla.providers.a2u_provider 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | scylla.providers.base\_provider module 16 | -------------------------------------- 17 | 18 | .. automodule:: scylla.providers.base_provider 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | scylla.providers.cool\_proxy\_provider module 24 | --------------------------------------------- 25 | 26 | .. automodule:: scylla.providers.cool_proxy_provider 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | scylla.providers.data5u\_provider module 32 | ---------------------------------------- 33 | 34 | .. automodule:: scylla.providers.data5u_provider 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | scylla.providers.free\_proxy\_list\_provider module 40 | --------------------------------------------------- 41 | 42 | .. automodule:: scylla.providers.free_proxy_list_provider 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | 47 | scylla.providers.http\_proxy\_provider module 48 | --------------------------------------------- 49 | 50 | .. automodule:: scylla.providers.http_proxy_provider 51 | :members: 52 | :undoc-members: 53 | :show-inheritance: 54 | 55 | scylla.providers.kuaidaili\_provider module 56 | ------------------------------------------- 57 | 58 | .. 
automodule:: scylla.providers.kuaidaili_provider 59 | :members: 60 | :undoc-members: 61 | :show-inheritance: 62 | 63 | scylla.providers.spys\_me\_provider module 64 | ------------------------------------------ 65 | 66 | .. automodule:: scylla.providers.spys_me_provider 67 | :members: 68 | :undoc-members: 69 | :show-inheritance: 70 | 71 | scylla.providers.spys\_one\_provider module 72 | ------------------------------------------- 73 | 74 | .. automodule:: scylla.providers.spys_one_provider 75 | :members: 76 | :undoc-members: 77 | :show-inheritance: 78 | 79 | scylla.providers.xici\_provider module 80 | -------------------------------------- 81 | 82 | .. automodule:: scylla.providers.xici_provider 83 | :members: 84 | :undoc-members: 85 | :show-inheritance: 86 | 87 | 88 | Module contents 89 | --------------- 90 | 91 | .. automodule:: scylla.providers 92 | :members: 93 | :undoc-members: 94 | :show-inheritance: 95 | -------------------------------------------------------------------------------- /docs/source/scylla.proxy.rst: -------------------------------------------------------------------------------- 1 | scylla.proxy package 2 | ==================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | scylla.proxy.server module 8 | -------------------------- 9 | 10 | .. automodule:: scylla.proxy.server 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | 16 | Module contents 17 | --------------- 18 | 19 | .. automodule:: scylla.proxy 20 | :members: 21 | :undoc-members: 22 | :show-inheritance: 23 | -------------------------------------------------------------------------------- /docs/source/scylla.rst: -------------------------------------------------------------------------------- 1 | scylla package 2 | ============== 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | 9 | scylla.providers 10 | scylla.proxy 11 | scylla.web 12 | 13 | Submodules 14 | ---------- 15 | 16 | scylla.cli module 17 | ----------------- 18 | 19 | .. 
automodule:: scylla.cli 20 | :members: 21 | :undoc-members: 22 | :show-inheritance: 23 | 24 | scylla.config module 25 | -------------------- 26 | 27 | .. automodule:: scylla.config 28 | :members: 29 | :undoc-members: 30 | :show-inheritance: 31 | 32 | scylla.database module 33 | ---------------------- 34 | 35 | .. automodule:: scylla.database 36 | :members: 37 | :undoc-members: 38 | :show-inheritance: 39 | 40 | scylla.jobs module 41 | ------------------ 42 | 43 | .. automodule:: scylla.jobs 44 | :members: 45 | :undoc-members: 46 | :show-inheritance: 47 | 48 | scylla.loggings module 49 | ---------------------- 50 | 51 | .. automodule:: scylla.loggings 52 | :members: 53 | :undoc-members: 54 | :show-inheritance: 55 | 56 | scylla.scheduler module 57 | ----------------------- 58 | 59 | .. automodule:: scylla.scheduler 60 | :members: 61 | :undoc-members: 62 | :show-inheritance: 63 | 64 | scylla.tcpping module 65 | --------------------- 66 | 67 | .. automodule:: scylla.tcpping 68 | :members: 69 | :undoc-members: 70 | :show-inheritance: 71 | 72 | scylla.validation\_policy module 73 | -------------------------------- 74 | 75 | .. automodule:: scylla.validation_policy 76 | :members: 77 | :undoc-members: 78 | :show-inheritance: 79 | 80 | scylla.validator module 81 | ----------------------- 82 | 83 | .. automodule:: scylla.validator 84 | :members: 85 | :undoc-members: 86 | :show-inheritance: 87 | 88 | scylla.worker module 89 | -------------------- 90 | 91 | .. automodule:: scylla.worker 92 | :members: 93 | :undoc-members: 94 | :show-inheritance: 95 | 96 | 97 | Module contents 98 | --------------- 99 | 100 | .. 
automodule:: scylla 101 | :members: 102 | :undoc-members: 103 | :show-inheritance: 104 | -------------------------------------------------------------------------------- /docs/source/scylla.web.rst: -------------------------------------------------------------------------------- 1 | scylla.web package 2 | ================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | scylla.web.server module 8 | ------------------------ 9 | 10 | .. automodule:: scylla.web.server 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | 16 | Module contents 17 | --------------- 18 | 19 | .. automodule:: scylla.web 20 | :members: 21 | :undoc-members: 22 | :show-inheritance: 23 | -------------------------------------------------------------------------------- /docs/source/validation_policy.rst: -------------------------------------------------------------------------------- 1 | Validation Policy 2 | ================= 3 | 4 | The validation policy for proxy IPs is described in ``validation_policy.py``: 5 | 6 | .. literalinclude:: ../../scylla/validation_policy.py 7 | :language: python 8 | :emphasize-lines: 23-42, 44-50 9 | :linenos: -------------------------------------------------------------------------------- /docs_zh/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = Scylla 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /docs_zh/source/_templates/sidebarintro.html: -------------------------------------------------------------------------------- 1 | 7 | 8 |

9 | 11 |

12 | 13 |

常用链接

14 | -------------------------------------------------------------------------------- /docs_zh/source/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Configuration file for the Sphinx documentation builder. 4 | # 5 | # This file does only contain a selection of the most common options. For a 6 | # full list see the documentation: 7 | # http://www.sphinx-doc.org/en/master/config 8 | 9 | # -- Path setup -------------------------------------------------------------- 10 | 11 | # If extensions (or modules to document with autodoc) are in another directory, 12 | # add these directories to sys.path here. If the directory is relative to the 13 | # documentation root, use os.path.abspath to make it absolute, like shown here. 14 | # 15 | # import os 16 | # sys.path.insert(0, os.path.abspath('.')) 17 | import sys 18 | from unittest.mock import MagicMock 19 | 20 | # -- Project information ----------------------------------------------------- 21 | 22 | project = 'Scylla' 23 | copyright = '2018, WildCat' 24 | author = 'WildCat' 25 | 26 | # The short X.Y version 27 | version = '' 28 | # The full version, including alpha/beta/rc tags 29 | release = '' 30 | 31 | 32 | # -- General configuration --------------------------------------------------- 33 | 34 | # If your documentation needs a minimal Sphinx version, state it here. 35 | # 36 | # needs_sphinx = '1.0' 37 | 38 | # Add any Sphinx extension module names here, as strings. They can be 39 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 40 | # ones. 41 | extensions = [ 42 | 'sphinx.ext.autodoc', 43 | 'sphinx.ext.viewcode', 44 | 'sphinx.ext.githubpages', 45 | ] 46 | 47 | # Add any paths that contain templates here, relative to this directory. 48 | templates_path = ['_templates'] 49 | 50 | # The suffix(es) of source filenames. 
51 | # You can specify multiple suffix as a list of string: 52 | # 53 | # source_suffix = ['.rst', '.md'] 54 | source_suffix = '.rst' 55 | 56 | # The master toctree document. 57 | master_doc = 'index' 58 | 59 | # The language for content autogenerated by Sphinx. Refer to documentation 60 | # for a list of supported languages. 61 | # 62 | # This is also used if you do content translation via gettext catalogs. 63 | # Usually you set "language" from the command line for these cases. 64 | language = 'zh_CN' 65 | 66 | # List of patterns, relative to source directory, that match files and 67 | # directories to ignore when looking for source files. 68 | # This pattern also affects html_static_path and html_extra_path . 69 | exclude_patterns = [] 70 | 71 | # The name of the Pygments (syntax highlighting) style to use. 72 | pygments_style = 'sphinx' 73 | 74 | # -- Options for HTML output ------------------------------------------------- 75 | 76 | # The theme to use for HTML and HTML Help pages. See the documentation for 77 | # a list of builtin themes. 78 | # 79 | html_theme = 'alabaster' 80 | 81 | # Theme options are theme-specific and customize the look and feel of a theme 82 | # further. For a list of options available for each theme, see the 83 | # documentation. 84 | # 85 | # html_theme_options = {} 86 | 87 | # Add any paths that contain custom static files (such as style sheets) here, 88 | # relative to this directory. They are copied after the builtin static files, 89 | # so a file named "default.css" will overwrite the builtin "default.css". 90 | html_static_path = ['_static'] 91 | 92 | # Custom sidebar templates, must be a dictionary that maps document names 93 | # to template names. 94 | # 95 | # The default sidebars (for documents that don't match any pattern) are 96 | # defined by theme itself. Builtin themes are using these templates by 97 | # default: ``['localtoc.html', 'relations.html', 'sourcelink.html', 98 | # 'searchbox.html']``. 
99 | # 100 | # html_sidebars = {} 101 | html_sidebars = { 102 | '**': ['sidebarintro.html', 'localtoc.html', 'relations.html', 'sourcelink.html', 'searchbox.html', ], 103 | } 104 | 105 | # -- Options for HTMLHelp output --------------------------------------------- 106 | 107 | # Output file base name for HTML help builder. 108 | htmlhelp_basename = 'Scylla-doc' 109 | 110 | 111 | # -- Options for LaTeX output ------------------------------------------------ 112 | 113 | latex_elements = { 114 | # The paper size ('letterpaper' or 'a4paper'). 115 | # 116 | # 'papersize': 'letterpaper', 117 | 118 | # The font size ('10pt', '11pt' or '12pt'). 119 | # 120 | # 'pointsize': '10pt', 121 | 122 | # Additional stuff for the LaTeX preamble. 123 | # 124 | # 'preamble': '', 125 | 126 | # Latex figure (float) alignment 127 | # 128 | # 'figure_align': 'htbp', 129 | } 130 | 131 | # Grouping the document tree into LaTeX files. List of tuples 132 | # (source start file, target name, title, 133 | # author, documentclass [howto, manual, or own class]). 134 | latex_documents = [ 135 | (master_doc, 'Scylla.tex', 'Scylla Documentation', 136 | 'WildCat', 'manual'), 137 | ] 138 | 139 | # -- Options for manual page output ------------------------------------------ 140 | 141 | # One entry per manual page. List of tuples 142 | # (source start file, name, description, authors, manual section). 143 | man_pages = [ 144 | (master_doc, 'scylla', 'Scylla Documentation', 145 | [author], 1) 146 | ] 147 | 148 | # -- Options for Texinfo output ---------------------------------------------- 149 | 150 | # Grouping the document tree into Texinfo files. 
List of tuples 151 | # (source start file, target name, title, author, 152 | # dir menu entry, description, category) 153 | texinfo_documents = [ 154 | (master_doc, 'Scylla', 'Scylla Documentation', 155 | author, 'Scylla', 'One line description of project.', 156 | 'Miscellaneous'), 157 | ] 158 | 159 | # -- Options for Epub output ------------------------------------------------- 160 | 161 | # Bibliographic Dublin Core info. 162 | epub_title = project 163 | epub_author = author 164 | epub_publisher = author 165 | epub_copyright = copyright 166 | 167 | # The unique identifier of the text. This can be a ISBN number 168 | # or the project homepage. 169 | # 170 | # epub_identifier = '' 171 | 172 | # A unique identification for the text. 173 | # 174 | # epub_uid = '' 175 | 176 | # A list of files that should not be packed into the epub file. 177 | epub_exclude_files = ['search.html'] 178 | 179 | # -- Extension configuration ------------------------------------------------- 180 | 181 | # Re: https://github.com/dabercro/OpsSpace/blob/880c58f6a6172924ca03145916f6a27cf6633684/docs/conf.py 182 | 183 | 184 | class Mock(MagicMock): 185 | @classmethod 186 | def __getattr__(cls, name): 187 | return MagicMock() 188 | 189 | 190 | MOCK_MODULES = ['pycurl', ] 191 | sys.modules.update((mod_name, Mock()) for mod_name in MOCK_MODULES) 192 | -------------------------------------------------------------------------------- /docs_zh/source/index.rst: -------------------------------------------------------------------------------- 1 | Scylla 中文文档 2 | ================================== 3 | 4 | .. 
toctree:: 5 | :maxdepth: 2 6 | :caption: Contents: 7 | 8 | 9 | 10 | Scylla 是一款高质量的免费代理 IP 池工具,仅支持 Python 3.6。特性如下: 11 | 12 | - 自动化的代理 IP 爬取与验证 13 | - 易用的 JSON API 14 | - 简单但美观的 web 用户界面,基于 TypeScript 和 React(例如,代理的地理分布) 15 | - 最少仅用\ **一条命令**\ 即可启动 16 | - 简明直接的编程 API(将在 1.1 版本中加入) 17 | - 最少仅用一行代码即可与 `Scrapy`_ 和 `requests`_ 进行集成 18 | - 无头浏览器(headless browser crawling)爬虫 19 | 20 | 快速开始 21 | -------- 22 | 23 | 安装 24 | """"""" 25 | 26 | Docker 安装(推荐) 27 | ^^^^^^^^^^^^^^^^^^^^^^^ 28 | .. code-block:: shell 29 | 30 | docker run -d -p 8899:8899 -p 8081:8081 -v /var/www/scylla:/var/www/scylla --name scylla wildcat/scylla:latest 31 | 32 | 使用 pip 直接安装 33 | ^^^^^^^^^^^^^^^^^^^^^^^ 34 | 35 | .. code:: bash 36 | 37 | pip install scylla 38 | scylla --help 39 | scylla # 运行爬虫和 Web 服务器 40 | 41 | 从源代码安装 42 | ^^^^^^^^^^^^^^^^^^^^^^^ 43 | 44 | .. code:: bash 45 | 46 | git clone https://github.com/imWildCat/scylla.git 47 | cd scylla 48 | 49 | pip install -r requirements.txt 50 | 51 | npm install # 或 yarn install 52 | make build-assets 53 | 54 | python -m scylla 55 | 56 | Windows用户在安装 ``sanic`` 时假如遇到 ``uvloop does not support Windows at the moment``: 57 | 58 | .. code:: bash 59 | 60 | export SANIC_NO_UVLOOP=true 61 | export SANIC_NO_UJSON=true 62 | pip3 install sanic 63 | 64 | 如果仍是失败,你需要从源码安装sanic。 65 | 66 | 使用 67 | """"""" 68 | 69 | 这里以服务运行在本地(``localhost``)为例,使用端口号 ``8899``。 70 | 注意:首次运行本项目时,您可能需要等待 1~2 分钟以爬取一定量的代理 IP。 71 | 72 | JSON API 73 | ^^^^^^^^^^^^^^^^^^ 74 | 75 | 代理 IP 列表 76 | ~~~~~~~~~~~~~~~~~~~~ 77 | 78 | ..
code:: shell 79 | 80 | http://localhost:8899/api/v1/proxies 81 | 82 | 可选 URL 参数: 83 | 84 | ========= ======== ================================================================ 85 | 参数 默认值 说明 86 | ========= ======== ================================================================ 87 | page ``1`` 页码 88 | limit ``20`` 每页显示代理 IP 的数量 89 | anonymous ``any`` 是否显示匿名代理。可选值:``true``,只显示匿名代理;``false``,只显示透明代理。 90 | https ``any`` 是否显示 HTTPS 代理。可选值:``true``,只显示 HTTPS 代理;``false``,只显示 HTTP 代理。 91 | countries 无 只选取特定国家的代理,格式示例:``US``,或者多国家:``US,GB`` 92 | ========= ======== ================================================================ 93 | 94 | 结果样例: 95 | 96 | .. code:: json 97 | 98 | { 99 | "proxies": [{ 100 | "id": 599, 101 | "ip": "91.229.222.163", 102 | "port": 53281, 103 | "is_valid": true, 104 | "created_at": 1527590947, 105 | "updated_at": 1527593751, 106 | "latency": 23.0, 107 | "stability": 0.1, 108 | "is_anonymous": true, 109 | "is_https": true, 110 | "attempts": 1, 111 | "https_attempts": 0, 112 | "location": "54.0451,-0.8053", 113 | "organization": "AS57099 Boundless Networks Limited", 114 | "region": "England", 115 | "country": "GB", 116 | "city": "Malton" 117 | }, { 118 | "id": 75, 119 | "ip": "75.151.213.85", 120 | "port": 8080, 121 | "is_valid": true, 122 | "created_at": 1527590676, 123 | "updated_at": 1527593702, 124 | "latency": 268.0, 125 | "stability": 0.3, 126 | "is_anonymous": true, 127 | "is_https": true, 128 | "attempts": 1, 129 | "https_attempts": 0, 130 | "location": "32.3706,-90.1755", 131 | "organization": "AS7922 Comcast Cable Communications, LLC", 132 | "region": "Mississippi", 133 | "country": "US", 134 | "city": "Jackson" 135 | }, 136 | ... 137 | ], 138 | "count": 1025, 139 | "per_page": 20, 140 | "page": 1, 141 | "total_page": 52 142 | } 143 | 144 | 系统统计 145 | ~~~~~~~~~~~~~~~~~ 146 | 147 | .. code:: shell 148 | 149 | http://localhost:8899/api/v1/stats 150 | 151 | 结果样例: 152 | 153 | .. 
code:: json 154 | 155 | { 156 | "median": 181.2566407083, 157 | "valid_count": 1780, 158 | "total_count": 9528, 159 | "mean": 174.3290085201 160 | } 161 | 162 | HTTP 正向代理服务器 163 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 164 | 165 | 默认情况下,Scylla 会在端口 ``8081`` 启动一个 HTTP 正向代理服务器(Forward Proxy Server)。 166 | 这个服务器会从数据库中选择一个刚更新过的代理,并将其用作正向代理。 167 | 每当发出 HTTP 请求时,代理服务器将随机选择一个代理。 168 | 169 | 注意:目前不支持 HTTPS 请求。 170 | 171 | 使用此代理服务器的 “curl” 示例如下: 172 | 173 | .. code:: shell 174 | 175 | curl http://api.ipify.org -x http://127.0.0.1:8081 176 | 177 | 你也可以在 `requests`_ 中使用这个特性: 178 | 179 | .. code:: python 180 | 181 | requests.get('http://api.ipify.org', proxies={'http': 'http://127.0.0.1:8081'}) 182 | 183 | Web 界面 184 | ^^^^^^^^^^^^^^^^^^ 185 | 186 | 打开 ``http://localhost:8899`` 即可访问本项目的 Web 界面。 187 | 188 | 代理 IP 列表 189 | ~~~~~~~~~~~~~~~~~~~~ 190 | 191 | .. code:: shell 192 | 193 | http://localhost:8899/ 194 | 195 | 截图: 196 | 197 | |screenshot-proxy-list| 198 | 199 | 代理 IP 全球分布 200 | ~~~~~~~~~~~~~~~~~~~~ 201 | 202 | .. code:: shell 203 | 204 | http://localhost:8899/#/geo 205 | 206 | 截图: 207 | 208 | |screenshot-geo-distribution| 209 | 210 | 211 | 其他示例 212 | ----------------- 213 | 214 | .. toctree:: 215 | :maxdepth: 1 216 | 217 | requests_integration 218 | 219 | 系统设计 220 | ------------- 221 | 222 | .. toctree:: 223 | :maxdepth: 1 224 | 225 | validation_policy 226 | 227 | API 文档 228 | -------------- 229 | 230 | 请阅读 :ref:`modindex`。更易用的编程接口正在开发中。 231 | 232 | 开发路线图 233 | -------------- 234 | 235 | 请查看 `Projects`_。 236 | 237 | 开发与贡献 238 | ---------------------------- 239 | 240 | .. code:: bash 241 | 242 | git clone https://github.com/imWildCat/scylla.git 243 | cd scylla 244 | 245 | pip install -r requirements.txt 246 | python -m playwright install --with-deps chromium 247 | 248 | npm install 249 | make build-assets 250 | 251 | 测试 252 | ------- 253 | 254 | 本项目使用了较多的单元测试来保证代码的质量,并集成 `Travis CI`_ 来实现持续集成。如需在本地运行测试,命令如下: 255 | 256 | .. 
code:: bash 257 | 258 | pip install -r tests/requirements-test.txt 259 | pytest tests/ 260 | 261 | 十分欢迎您添加更多的测试用例以增强本项目的鲁棒性。 262 | 263 | 项目命名 264 | -------------- 265 | `Scylla`_,或被称为“锡拉”(中文里),源自于美剧《`越狱`_》中的一组记忆芯片的名字。本项目以此命名,是为了致敬这部美剧。 266 | 267 | 捐助 268 | ---------------------- 269 | 如果您认为这个项目有帮助,不妨为它捐助一点钱? 270 | 271 | 不管钱有多少,您的捐助将会激励作者持续开发新功能!🎉 272 | 273 | 感谢您的支持! 274 | 275 | 捐助方法如下: 276 | 277 | PayPal 278 | """""" 279 | |PayPal Donation Official| 280 | 281 | 支付宝或微信 282 | """""""""""""""""""" 283 | |Alipay and WeChat Donation| 284 | 285 | 协议 286 | ------- 287 | 288 | Apache License 2.0. 如需了解详情,请阅读 `LICENSE`_ 这个文件。 289 | 290 | 291 | 索引表 292 | ================== 293 | 294 | * :ref:`genindex` 295 | * :ref:`modindex` 296 | * :ref:`search` 297 | 298 | .. _Projects: https://github.com/imWildCat/scylla/projects 299 | .. _LICENSE: https://github.com/imWildCat/scylla/blob/master/LICENSE 300 | .. _Travis CI: https://travis-ci.org/imWildCat/scylla 301 | .. _Scylla: http://prisonbreak.wikia.com/wiki/Scylla 302 | .. _越狱: https://zh.wikipedia.org/zh-hans/%E8%B6%8A%E7%8B%B1_(%E7%94%B5%E8%A7%86%E5%89%A7) 303 | .. _Scrapy: https://scrapy.org 304 | .. _requests: http://docs.python-requests.org/ 305 | 306 | .. |screenshot-geo-distribution| image:: https://user-images.githubusercontent.com/2396817/40653599-9458b6b8-6333-11e8-8e6e-1d90271fc083.png 307 | .. |screenshot-proxy-list| image:: https://user-images.githubusercontent.com/2396817/40653600-946eae6e-6333-11e8-8bbd-9d2f347c5461.png 308 | 309 | .. |PayPal Donation Official| image:: https://www.paypalobjects.com/en_US/i/btn/btn_donateCC_LG.gif 310 | :target: https://www.paypal.com/cgi-bin/webscr?cmd=_s-xclick&hosted_button_id=5DXFA7WGWPZBN 311 | .. 
|Alipay and WeChat Donation| image:: https://user-images.githubusercontent.com/2396817/40589594-cfb0e49e-61e7-11e8-8f7d-c55a29676c40.png 312 | :target: https://user-images.githubusercontent.com/2396817/40589594-cfb0e49e-61e7-11e8-8f7d-c55a29676c40.png 313 | -------------------------------------------------------------------------------- /docs_zh/source/modules.rst: -------------------------------------------------------------------------------- 1 | scylla 2 | ====== 3 | 4 | .. toctree:: 5 | :maxdepth: 4 6 | 7 | scylla 8 | -------------------------------------------------------------------------------- /docs_zh/source/requests_integration.rst: -------------------------------------------------------------------------------- 1 | .. _requests_integration: 2 | 3 | Requests 的一些例子 4 | ========================== 5 | 6 | `Requests`_ 是一个非常好用而且成熟的 Python HTTP 框架。和它一起使用 Scylla 非常简单。 7 | 8 | 调用 JSON API 9 | ----------------- 10 | 11 | .. code:: python 12 | 13 | import requests 14 | import random 15 | 16 | json_resp = requests.get('http://localhost:8899/api/v1/proxies').json() 17 | proxy = random.choice(json_resp['proxies']) 18 | 19 | requests.get('http://api.ipify.org', proxies={'http': 'http://{}:{}'.format(proxy['ip'], proxy['port'])}) 20 | 21 | 也支持 HTTPS 代理: 22 | 23 | .. code:: python 24 | 25 | import requests 26 | import random 27 | 28 | json_resp = requests.get('http://localhost:8899/api/v1/proxies?https=true').json() 29 | proxy = random.choice(json_resp['proxies']) 30 | 31 | requests.get('https://api.ipify.org', proxies={'https': 'https://{}:{}'.format(proxy['ip'], proxy['port'])}) 32 | 33 | 34 | 35 | 36 | 使用正向代理服务器 37 | ----------------------------- 38 | 39 | .. code:: python 40 | 41 | requests.get('http://api.ipify.org', proxies={'http': 'http://127.0.0.1:8081'}) 42 | 43 | 44 | 45 | 46 | .. 
_Requests: http://docs.python-requests.org/ -------------------------------------------------------------------------------- /docs_zh/source/scylla.providers.rst: -------------------------------------------------------------------------------- 1 | scylla.providers package 2 | ======================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | scylla.providers.a2u\_provider module 8 | ------------------------------------- 9 | 10 | .. automodule:: scylla.providers.a2u_provider 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | scylla.providers.base\_provider module 16 | -------------------------------------- 17 | 18 | .. automodule:: scylla.providers.base_provider 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | scylla.providers.cool\_proxy\_provider module 24 | --------------------------------------------- 25 | 26 | .. automodule:: scylla.providers.cool_proxy_provider 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | scylla.providers.data5u\_provider module 32 | ---------------------------------------- 33 | 34 | .. automodule:: scylla.providers.data5u_provider 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | scylla.providers.free\_proxy\_list\_provider module 40 | --------------------------------------------------- 41 | 42 | .. automodule:: scylla.providers.free_proxy_list_provider 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | 47 | scylla.providers.http\_proxy\_provider module 48 | --------------------------------------------- 49 | 50 | .. automodule:: scylla.providers.http_proxy_provider 51 | :members: 52 | :undoc-members: 53 | :show-inheritance: 54 | 55 | scylla.providers.kuaidaili\_provider module 56 | ------------------------------------------- 57 | 58 | .. 
automodule:: scylla.providers.kuaidaili_provider 59 | :members: 60 | :undoc-members: 61 | :show-inheritance: 62 | 63 | scylla.providers.spys\_me\_provider module 64 | ------------------------------------------ 65 | 66 | .. automodule:: scylla.providers.spys_me_provider 67 | :members: 68 | :undoc-members: 69 | :show-inheritance: 70 | 71 | scylla.providers.spys\_one\_provider module 72 | ------------------------------------------- 73 | 74 | .. automodule:: scylla.providers.spys_one_provider 75 | :members: 76 | :undoc-members: 77 | :show-inheritance: 78 | 79 | scylla.providers.xici\_provider module 80 | -------------------------------------- 81 | 82 | .. automodule:: scylla.providers.xici_provider 83 | :members: 84 | :undoc-members: 85 | :show-inheritance: 86 | 87 | 88 | Module contents 89 | --------------- 90 | 91 | .. automodule:: scylla.providers 92 | :members: 93 | :undoc-members: 94 | :show-inheritance: 95 | -------------------------------------------------------------------------------- /docs_zh/source/scylla.proxy.rst: -------------------------------------------------------------------------------- 1 | scylla.proxy package 2 | ==================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | scylla.proxy.server module 8 | -------------------------- 9 | 10 | .. automodule:: scylla.proxy.server 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | 16 | Module contents 17 | --------------- 18 | 19 | .. automodule:: scylla.proxy 20 | :members: 21 | :undoc-members: 22 | :show-inheritance: 23 | -------------------------------------------------------------------------------- /docs_zh/source/scylla.rst: -------------------------------------------------------------------------------- 1 | scylla package 2 | ============== 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | 9 | scylla.providers 10 | scylla.proxy 11 | scylla.web 12 | 13 | Submodules 14 | ---------- 15 | 16 | scylla.cli module 17 | ----------------- 18 | 19 | .. 
automodule:: scylla.cli 20 | :members: 21 | :undoc-members: 22 | :show-inheritance: 23 | 24 | scylla.config module 25 | -------------------- 26 | 27 | .. automodule:: scylla.config 28 | :members: 29 | :undoc-members: 30 | :show-inheritance: 31 | 32 | scylla.database module 33 | ---------------------- 34 | 35 | .. automodule:: scylla.database 36 | :members: 37 | :undoc-members: 38 | :show-inheritance: 39 | 40 | scylla.jobs module 41 | ------------------ 42 | 43 | .. automodule:: scylla.jobs 44 | :members: 45 | :undoc-members: 46 | :show-inheritance: 47 | 48 | scylla.loggings module 49 | ---------------------- 50 | 51 | .. automodule:: scylla.loggings 52 | :members: 53 | :undoc-members: 54 | :show-inheritance: 55 | 56 | scylla.scheduler module 57 | ----------------------- 58 | 59 | .. automodule:: scylla.scheduler 60 | :members: 61 | :undoc-members: 62 | :show-inheritance: 63 | 64 | scylla.tcpping module 65 | --------------------- 66 | 67 | .. automodule:: scylla.tcpping 68 | :members: 69 | :undoc-members: 70 | :show-inheritance: 71 | 72 | scylla.validation\_policy module 73 | -------------------------------- 74 | 75 | .. automodule:: scylla.validation_policy 76 | :members: 77 | :undoc-members: 78 | :show-inheritance: 79 | 80 | scylla.validator module 81 | ----------------------- 82 | 83 | .. automodule:: scylla.validator 84 | :members: 85 | :undoc-members: 86 | :show-inheritance: 87 | 88 | scylla.worker module 89 | -------------------- 90 | 91 | .. automodule:: scylla.worker 92 | :members: 93 | :undoc-members: 94 | :show-inheritance: 95 | 96 | 97 | Module contents 98 | --------------- 99 | 100 | .. 
automodule:: scylla 101 | :members: 102 | :undoc-members: 103 | :show-inheritance: 104 | -------------------------------------------------------------------------------- /docs_zh/source/scylla.web.rst: -------------------------------------------------------------------------------- 1 | scylla.web package 2 | ================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | scylla.web.server module 8 | ------------------------ 9 | 10 | .. automodule:: scylla.web.server 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | 16 | Module contents 17 | --------------- 18 | 19 | .. automodule:: scylla.web 20 | :members: 21 | :undoc-members: 22 | :show-inheritance: 23 | -------------------------------------------------------------------------------- /docs_zh/source/validation_policy.rst: -------------------------------------------------------------------------------- 1 | 验证策略 2 | ================= 3 | 4 | 代理 IP 的验证策略在 ``validation_policy.py`` 可见: 5 | 6 | .. literalinclude:: ../../scylla/validation_policy.py 7 | :language: python 8 | :emphasize-lines: 23-42, 44-50 9 | :linenos: -------------------------------------------------------------------------------- /frontend/.eslintrc.cjs: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | root: true, 3 | env: { browser: true, es2020: true }, 4 | extends: [ 5 | 'eslint:recommended', 6 | 'plugin:@typescript-eslint/recommended', 7 | 'plugin:react-hooks/recommended', 8 | ], 9 | ignorePatterns: ['dist', '.eslintrc.cjs'], 10 | parser: '@typescript-eslint/parser', 11 | plugins: ['react-refresh'], 12 | rules: { 13 | 'react-refresh/only-export-components': [ 14 | 'warn', 15 | { allowConstantExport: true }, 16 | ], 17 | }, 18 | } 19 | -------------------------------------------------------------------------------- /frontend/.gitignore: -------------------------------------------------------------------------------- 1 | # Logs 2 | logs 3 | *.log 4 | npm-debug.log* 5 | 
yarn-debug.log* 6 | yarn-error.log* 7 | pnpm-debug.log* 8 | lerna-debug.log* 9 | 10 | node_modules 11 | dist 12 | dist-ssr 13 | *.local 14 | 15 | # Editor directories and files 16 | .vscode/* 17 | !.vscode/extensions.json 18 | .idea 19 | .DS_Store 20 | *.suo 21 | *.ntvs* 22 | *.njsproj 23 | *.sln 24 | *.sw? 25 | -------------------------------------------------------------------------------- /frontend/README.md: -------------------------------------------------------------------------------- 1 | # React + TypeScript + Vite 2 | 3 | This template provides a minimal setup to get React working in Vite with HMR and some ESLint rules. 4 | 5 | Currently, two official plugins are available: 6 | 7 | - [@vitejs/plugin-react](https://github.com/vitejs/vite-plugin-react/blob/main/packages/plugin-react/README.md) uses [Babel](https://babeljs.io/) for Fast Refresh 8 | - [@vitejs/plugin-react-swc](https://github.com/vitejs/vite-plugin-react-swc) uses [SWC](https://swc.rs/) for Fast Refresh 9 | 10 | ## Expanding the ESLint configuration 11 | 12 | If you are developing a production application, we recommend updating the configuration to enable type aware lint rules: 13 | 14 | - Configure the top-level `parserOptions` property like this: 15 | 16 | ```js 17 | export default { 18 | // other rules... 
19 | parserOptions: { 20 | ecmaVersion: 'latest', 21 | sourceType: 'module', 22 | project: ['./tsconfig.json', './tsconfig.node.json'], 23 | tsconfigRootDir: __dirname, 24 | }, 25 | } 26 | ``` 27 | 28 | - Replace `plugin:@typescript-eslint/recommended` with `plugin:@typescript-eslint/recommended-type-checked` or `plugin:@typescript-eslint/strict-type-checked` 29 | - Optionally add `plugin:@typescript-eslint/stylistic-type-checked` 30 | - Install [eslint-plugin-react](https://github.com/jsx-eslint/eslint-plugin-react) and add `plugin:react/recommended` & `plugin:react/jsx-runtime` to the `extends` list 31 | -------------------------------------------------------------------------------- /frontend/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Vite + React + TS 8 | 9 | 10 |
11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /frontend/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "frontend", 3 | "private": true, 4 | "version": "0.0.0", 5 | "type": "module", 6 | "scripts": { 7 | "dev": "vite", 8 | "build": "tsc && vite build", 9 | "build:scylla:prod": "tsc && vite build --outDir ../scylla/assets", 10 | "lint": "eslint . --ext ts,tsx --report-unused-disable-directives --max-warnings 0", 11 | "preview": "vite preview" 12 | }, 13 | "dependencies": { 14 | "axios": "^1.6.2", 15 | "milligram": "^1.4.1", 16 | "moment": "^2.29.4", 17 | "query-string": "^8.1.0", 18 | "react": "^18.2.0", 19 | "react-dom": "^18.2.0", 20 | "react-router": "^6.19.0", 21 | "react-router-dom": "^6.19.0", 22 | "react-simple-maps": "^3.0.0", 23 | "react-tooltip": "^5.23.0" 24 | }, 25 | "devDependencies": { 26 | "@types/node": "^20.9.1", 27 | "@types/react": "^18.2.37", 28 | "@types/react-dom": "^18.2.15", 29 | "@types/react-simple-maps": "^3.0.3", 30 | "@typescript-eslint/eslint-plugin": "^6.10.0", 31 | "@typescript-eslint/parser": "^6.10.0", 32 | "@vitejs/plugin-react-swc": "^3.5.0", 33 | "eslint": "^8.53.0", 34 | "eslint-plugin-react-hooks": "^4.6.0", 35 | "eslint-plugin-react-refresh": "^0.4.4", 36 | "sass": "^1.69.5", 37 | "typescript": "^5.2.2", 38 | "vite": "^5.0.0" 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /frontend/public/vite.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /frontend/src/App.css: -------------------------------------------------------------------------------- 1 | #root { 2 | max-width: 1280px; 3 | margin: 0 auto; 4 | padding: 2rem; 5 | text-align: center; 6 | } 7 | 8 | .logo { 9 | height: 6em; 10 | padding: 1.5em; 11 | will-change: filter; 12 | 
transition: filter 300ms; 13 | } 14 | .logo:hover { 15 | filter: drop-shadow(0 0 2em #646cffaa); 16 | } 17 | .logo.react:hover { 18 | filter: drop-shadow(0 0 2em #61dafbaa); 19 | } 20 | 21 | @keyframes logo-spin { 22 | from { 23 | transform: rotate(0deg); 24 | } 25 | to { 26 | transform: rotate(360deg); 27 | } 28 | } 29 | 30 | @media (prefers-reduced-motion: no-preference) { 31 | a:nth-of-type(2) .logo { 32 | animation: logo-spin infinite 20s linear; 33 | } 34 | } 35 | 36 | .card { 37 | padding: 2em; 38 | } 39 | 40 | .read-the-docs { 41 | color: #888; 42 | } 43 | -------------------------------------------------------------------------------- /frontend/src/App.tsx: -------------------------------------------------------------------------------- 1 | import { useState } from 'react' 2 | import reactLogo from './assets/react.svg' 3 | import viteLogo from '/vite.svg' 4 | import './App.css' 5 | 6 | function App() { 7 | const [count, setCount] = useState(0) 8 | 9 | return ( 10 | <> 11 |
12 | 13 | Vite logo 14 | 15 | 16 | React logo 17 | 18 |
19 |

Vite + React

20 |
21 | 24 |

25 | Edit src/App.tsx and save to test HMR 26 |

27 |
28 |

29 | Click on the Vite and React logos to learn more 30 |

31 | 32 | ) 33 | } 34 | 35 | export default App 36 | -------------------------------------------------------------------------------- /frontend/src/LegacyApp.tsx: -------------------------------------------------------------------------------- 1 | import 'milligram'; 2 | import "./index.scss"; 3 | import ScyllaBannerImage from './assets/scylla_banner.png'; 4 | 5 | import ProxyIPList from "./components/ProxyList"; 6 | import GeoDistribution from "./components/GeoDistribution"; 7 | import Statistics from "./components/Statistics"; 8 | 9 | import { BrowserRouter as Router, Routes, Route, NavLink } from 'react-router-dom'; 10 | 11 | export const AppRoute = () => ( 12 | 13 |
14 |
15 | banner 16 |
17 |
    18 |
  • Proxy IP List
  • 19 |
  • Geometric Distribution
  • 20 |
  • Statistics
  • 21 |
22 | 23 | 24 | } /> 25 | } /> 26 | } /> 27 | 28 | 29 |
30 |
31 | All rights reserved. Project Scylla. 32 |
33 |
34 |
35 |
36 | ); 37 | -------------------------------------------------------------------------------- /frontend/src/assets/react.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /frontend/src/assets/scylla_banner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imWildCat/scylla/b051fd586f2e3268bb07f8d94a0b27dce01dea12/frontend/src/assets/scylla_banner.png -------------------------------------------------------------------------------- /frontend/src/components/GeoDistribution.tsx: -------------------------------------------------------------------------------- 1 | import * as React from 'react'; 2 | import axios from "axios"; 3 | import {getBaseURL, Proxy, ResponseJSON} from "../utils"; 4 | import {Tooltip} from "react-tooltip" 5 | 6 | import { 7 | ComposableMap, 8 | ZoomableGroup, 9 | Geographies, 10 | Geography, 11 | Marker, 12 | } from 'react-simple-maps'; 13 | 14 | export interface GeoDistributionProps { 15 | } 16 | 17 | export interface GeoDistributionState { 18 | proxies: Proxy[], 19 | } 20 | 21 | export default class GeoDistribution extends React.Component { 22 | 23 | constructor(props: GeoDistributionProps) { 24 | super(props); 25 | this.state = { 26 | proxies: [], 27 | }; 28 | } 29 | 30 | componentDidMount() { 31 | this.loadData(); 32 | } 33 | render() { 34 | return ( 35 |
36 | 37 | 38 | 39 | {({ geographies }) => geographies.map((geography) => ( 40 | 49 | ))} 50 | 51 | {/* Render markers here */} 52 | 53 | 54 | 55 |
56 | ); 57 | } 58 | renderMarker(proxy: Proxy): JSX.Element | null { 59 | const locationStr = proxy.location; 60 | if (locationStr) { 61 | const locations = locationStr.split(',').map(coord => parseFloat(coord)); 62 | 63 | return ( 64 | 68 | {/* ... */} 69 | 70 | ); 71 | } else { 72 | return null; 73 | } 74 | } 75 | 76 | mapProxyColor(proxy: Proxy): string { 77 | if (proxy.latency < 180 && proxy.stability >= 0.6) { 78 | return '#417505'; 79 | } else if (proxy.latency < 300 && proxy.stability >= 0.4) { 80 | return '#F8E71C'; 81 | } else if (proxy.latency < 500 && proxy.stability > 0.0) { 82 | return '#FF3824'; 83 | } else { 84 | return '#000'; 85 | } 86 | } 87 | 88 | async loadData() { 89 | const response = await axios.get(`${getBaseURL()}/api/v1/proxies?limit=4095`); 90 | const res: ResponseJSON = response.data; 91 | const proxies: Proxy[] = res.proxies; 92 | this.setState({ 93 | proxies: proxies, 94 | }); 95 | } 96 | } 97 | 98 | -------------------------------------------------------------------------------- /frontend/src/components/ProxyList.tsx: -------------------------------------------------------------------------------- 1 | import * as React from "react"; 2 | import axios from 'axios'; 3 | import {Link} from 'react-router-dom'; 4 | 5 | import {getBaseURL, Proxy, ResponseJSON} from '../utils'; 6 | import queryString from "query-string"; 7 | import ProxyListFilter from "./ProxyListFilter"; 8 | 9 | import moment from 'moment' 10 | 11 | export interface AppState { 12 | proxies: Proxy[]; 13 | count: number; 14 | per_page: number; 15 | page: number; 16 | total_page: number; 17 | } 18 | 19 | export interface Props { 20 | location: any; 21 | } 22 | 23 | export default class ProxyIPList extends React.Component { 24 | private initialState: AppState = { 25 | proxies: [], 26 | count: 0, 27 | per_page: 0, 28 | page: 0, 29 | total_page: 0, 30 | }; 31 | 32 | constructor(props: Props) { 33 | super(props); 34 | this.state = this.initialState; 35 | } 36 | 37 | render(): 
JSX.Element { 38 | // const { timesClicked, on } = this.state; 39 | return ( 40 |
41 | {this.renderPagination()} 42 | {this.renderList()} 43 | {this.renderPagination()} 44 |
45 | ); 46 | } 47 | 48 | renderList(): JSX.Element { 49 | const list = this.state.proxies; 50 | return ( 51 |
52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | {list.map(r => 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | )} 75 | 76 |
IPPortAnonymousProtocolLatencyUpdated at
{r.ip}{r.port}{r.is_anonymous ? 'Yes' : 'No'}{r.is_https ? 'HTTPS' : 'HTTP'}{r.latency.toFixed(0)} ms{moment.unix(r.updated_at).format('YYYYMMDD HH:mm:ss')}
77 |
78 | ); 79 | } 80 | 81 | componentDidMount() { 82 | this.loadData(this.props); 83 | } 84 | 85 | componentWillReceiveProps(nextProp: any) { 86 | this.loadData(nextProp) 87 | } 88 | 89 | async loadData(props: any) { 90 | 91 | const parsed = queryString.parse(props.location.search); 92 | 93 | const page = parsed['page'] || 1; 94 | const https = parsed['https'] || null; 95 | const anonymous = parsed['anonymous'] || null; 96 | 97 | const params: any = {}; 98 | 99 | if (page) { 100 | params['page'] = page; 101 | } 102 | 103 | if (https) { 104 | params['https'] = https; 105 | } 106 | 107 | if (anonymous) { 108 | params['anonymous'] = anonymous; 109 | } 110 | 111 | const response = await axios.get(`${getBaseURL()}/api/v1/proxies?${queryString.stringify(params)}`); 112 | const res: ResponseJSON = response.data; 113 | const proxies: Proxy[] = res.proxies; 114 | this.setState({ 115 | proxies: proxies, 116 | count: res.count, 117 | per_page: res.per_page, 118 | page: res.page, 119 | total_page: res.total_page, 120 | }); 121 | } 122 | 123 | renderPagination(): JSX.Element { 124 | const {total_page, page} = this.state; 125 | 126 | const pagination = []; 127 | 128 | if (page !== 1) { 129 | pagination.push(this.renderPageLink(page - 1, 'Previous page')) 130 | } 131 | 132 | if (page !== total_page) { 133 | pagination.push(this.renderPageLink(page + 1, 'Next page')) 134 | } 135 | 136 | return ( 137 |
    138 | {pagination.map(e => e)} 139 |
140 | ) 141 | } 142 | 143 | private renderPageLink(pageNumber: number, label: string): JSX.Element { 144 | const parsed = queryString.parse(this.props.location.search); 145 | 146 | 147 | return ( 148 |
  • {label}
  • 149 | ); 150 | } 151 | 152 | } -------------------------------------------------------------------------------- /frontend/src/components/ProxyListFilter.tsx: -------------------------------------------------------------------------------- 1 | import * as React from 'react'; 2 | import {Link} from "react-router-dom"; 3 | import queryString from "query-string"; 4 | 5 | export interface ProxyListFilterProps { 6 | location: any; 7 | } 8 | 9 | export interface ProxyListFilterState { 10 | https: boolean | null; 11 | anonymous: boolean | null; 12 | } 13 | 14 | export default class ProxyListFilter extends React.Component { 15 | constructor(props: ProxyListFilterProps) { 16 | super(props); 17 | 18 | this.state = { 19 | https: null, 20 | anonymous: null, 21 | } 22 | } 23 | 24 | render() { 25 | return ( 26 |
    27 | HTTPS 28 | ANONYMOUS 29 |
    30 | ); 31 | } 32 | 33 | componentDidMount() { 34 | this.handleProps(this.props); 35 | } 36 | 37 | componentWillReceiveProps(nextProps: ProxyListFilterProps) { 38 | this.handleProps(nextProps); 39 | } 40 | 41 | handleProps(props: ProxyListFilterProps) { 42 | const parsed = queryString.parse(props.location.search); 43 | 44 | const https = parsed['https'] == 'true' ? true : null; 45 | const anonymous = parsed['anonymous'] == 'true' ? true : null; 46 | 47 | this.setState({https: https, anonymous: anonymous}) 48 | } 49 | 50 | genLink(key: string): string { 51 | 52 | const {anonymous, https} = this.state; 53 | let params: any = { 54 | anonymous, 55 | https, 56 | }; 57 | 58 | if (key === 'HTTPS') { 59 | params['https'] = https == true ? null : true; 60 | } else if (key === 'ANONYMOUS') { 61 | params['anonymous'] = anonymous == true ? null : true; 62 | } 63 | 64 | 65 | return `/?${queryString.stringify(params)}`; 66 | } 67 | 68 | 69 | genClassName(key: string): string { 70 | const {anonymous, https} = this.state; 71 | 72 | let baseClassName = 'button'; 73 | 74 | if (key === 'HTTPS') { 75 | if (!https) { 76 | baseClassName += ' button-outline' 77 | } 78 | } else if (key === 'ANONYMOUS') { 79 | if (!anonymous) { 80 | baseClassName += ' button-outline' 81 | } 82 | } 83 | return baseClassName; 84 | } 85 | 86 | } 87 | -------------------------------------------------------------------------------- /frontend/src/components/Statistics.tsx: -------------------------------------------------------------------------------- 1 | import * as React from 'react'; 2 | import axios from "axios"; 3 | import {getBaseURL, StatsResponseJSON} from "../utils"; 4 | 5 | export interface StatisticsProps { 6 | } 7 | 8 | export interface StatisticsState { 9 | mean: number; 10 | median: number; 11 | total_count: number; 12 | valid_count: number; 13 | } 14 | 15 | export default class Statistics extends React.Component { 16 | constructor(props: StatisticsProps) { 17 | super(props); 18 | 19 | 
this.state = { 20 | mean: 0, 21 | median: 0, 22 | total_count: 0, 23 | valid_count: 0, 24 | } 25 | } 26 | 27 | render() { 28 | const {mean, median, total_count, valid_count} = this.state; 29 | 30 | return ( 31 |
    32 | At present, the system has crawled: 33 |
      34 |
    • 35 | {total_count} proxy ips in total, 36 |
    • 37 |
    • 38 | {valid_count} of them are valid. 39 |
    • 40 |
    • The mean latency of them is: {mean.toFixed(2)} ms
    • 41 |
    • The median latency of them is: {median.toFixed(2)} ms
    • 42 |
    43 |
    44 | ); 45 | } 46 | 47 | componentDidMount() { 48 | this.loadData() 49 | } 50 | 51 | async loadData() { 52 | const response = await axios.get(`${getBaseURL()}/api/v1/stats`); 53 | const res: StatsResponseJSON = response.data; 54 | this.setState({ 55 | mean: res.mean, 56 | median: res.median, 57 | total_count: res.total_count, 58 | valid_count: res.valid_count, 59 | }); 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /frontend/src/index.css: -------------------------------------------------------------------------------- 1 | :root { 2 | font-family: Inter, system-ui, Avenir, Helvetica, Arial, sans-serif; 3 | line-height: 1.5; 4 | font-weight: 400; 5 | 6 | color-scheme: light dark; 7 | color: rgba(255, 255, 255, 0.87); 8 | background-color: #242424; 9 | 10 | font-synthesis: none; 11 | text-rendering: optimizeLegibility; 12 | -webkit-font-smoothing: antialiased; 13 | -moz-osx-font-smoothing: grayscale; 14 | } 15 | 16 | a { 17 | font-weight: 500; 18 | color: #646cff; 19 | text-decoration: inherit; 20 | } 21 | a:hover { 22 | color: #535bf2; 23 | } 24 | 25 | body { 26 | margin: 0; 27 | display: flex; 28 | place-items: center; 29 | min-width: 320px; 30 | min-height: 100vh; 31 | } 32 | 33 | h1 { 34 | font-size: 3.2em; 35 | line-height: 1.1; 36 | } 37 | 38 | button { 39 | border-radius: 8px; 40 | border: 1px solid transparent; 41 | padding: 0.6em 1.2em; 42 | font-size: 1em; 43 | font-weight: 500; 44 | font-family: inherit; 45 | background-color: #1a1a1a; 46 | cursor: pointer; 47 | transition: border-color 0.25s; 48 | } 49 | button:hover { 50 | border-color: #646cff; 51 | } 52 | button:focus, 53 | button:focus-visible { 54 | outline: 4px auto -webkit-focus-ring-color; 55 | } 56 | 57 | @media (prefers-color-scheme: light) { 58 | :root { 59 | color: #213547; 60 | background-color: #ffffff; 61 | } 62 | a:hover { 63 | color: #747bff; 64 | } 65 | button { 66 | background-color: #f9f9f9; 67 | } 68 | } 69 | 
-------------------------------------------------------------------------------- /frontend/src/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | Scylla 4 | 5 | 6 | 7 | 8 |
    9 | 10 | 11 | -------------------------------------------------------------------------------- /frontend/src/index.scss: -------------------------------------------------------------------------------- 1 | .banner { 2 | &, img { 3 | height: 60px; 4 | } 5 | } 6 | 7 | .navigation { 8 | min-width: 696px; 9 | list-style: none; 10 | padding-top: 20px; 11 | li { 12 | display: inline; 13 | font-weight: bold; 14 | a { 15 | color: gray; 16 | &.active { 17 | color: #000000; 18 | font-weight: bolder; 19 | } 20 | } 21 | } 22 | li+li { 23 | margin-left: 25px; 24 | } 25 | } 26 | 27 | ul.pagination { 28 | min-width: 360px; 29 | list-style: none; 30 | padding-top: 5px; 31 | li { 32 | display: inline; 33 | } 34 | 35 | li+li { 36 | margin-left: 25px; 37 | } 38 | } 39 | 40 | .button + .button { 41 | margin-left: 5px; 42 | } 43 | 44 | footer { 45 | div { 46 | text-align: center; 47 | font-size: 13px; 48 | } 49 | } -------------------------------------------------------------------------------- /frontend/src/main.tsx: -------------------------------------------------------------------------------- 1 | import React from 'react' 2 | import ReactDOM from 'react-dom/client' 3 | import './index.css' 4 | import { AppRoute } from './LegacyApp.tsx' 5 | 6 | ReactDOM.createRoot(document.getElementById('root')!).render( 7 | 8 | 9 | , 10 | ) 11 | -------------------------------------------------------------------------------- /frontend/src/utils.ts: -------------------------------------------------------------------------------- 1 | import * as process from 'process' 2 | 3 | export function getBaseURL(): string { 4 | return process.env['NODE_ENV'] === 'production' ? 
prodURL() : 'http://localhost:8899'; 5 | } 6 | 7 | function prodURL(): string { 8 | const location = window.location; 9 | return location.protocol + "//" + location.host; 10 | } 11 | 12 | export interface Proxy { 13 | id: number; 14 | ip: string; 15 | port: number; 16 | is_valid: boolean; 17 | created_at: number; 18 | updated_at: number; 19 | latency: number; 20 | stability: number; 21 | is_anonymous: boolean; 22 | is_https: boolean; 23 | location: string; 24 | organization: string; 25 | region: string; 26 | country: string; 27 | city: string; 28 | } 29 | 30 | export interface ResponseJSON { 31 | proxies: Proxy[]; 32 | count: number; 33 | per_page: number; 34 | page: number; 35 | total_page: number; 36 | } 37 | 38 | export interface StatsResponseJSON { 39 | mean: number; 40 | median: number; 41 | total_count: number; 42 | valid_count: number; 43 | } -------------------------------------------------------------------------------- /frontend/src/vite-env.d.ts: -------------------------------------------------------------------------------- 1 | /// 2 | -------------------------------------------------------------------------------- /frontend/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "target": "ES2020", 4 | "useDefineForClassFields": true, 5 | "lib": ["ES2020", "DOM", "DOM.Iterable"], 6 | "module": "ESNext", 7 | "skipLibCheck": true, 8 | 9 | /* Bundler mode */ 10 | "moduleResolution": "bundler", 11 | "allowImportingTsExtensions": true, 12 | "resolveJsonModule": true, 13 | "isolatedModules": true, 14 | "noEmit": true, 15 | "jsx": "react-jsx", 16 | 17 | /* Linting */ 18 | "strict": true, 19 | "noUnusedLocals": true, 20 | "noUnusedParameters": true, 21 | "noFallthroughCasesInSwitch": true 22 | }, 23 | "include": ["src"], 24 | "references": [{ "path": "./tsconfig.node.json" }] 25 | } 26 | -------------------------------------------------------------------------------- 
/frontend/tsconfig.node.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "composite": true, 4 | "skipLibCheck": true, 5 | "module": "ESNext", 6 | "moduleResolution": "bundler", 7 | "allowSyntheticDefaultImports": true 8 | }, 9 | "include": ["vite.config.ts"] 10 | } 11 | -------------------------------------------------------------------------------- /frontend/vite.config.ts: -------------------------------------------------------------------------------- 1 | import { defineConfig } from 'vite' 2 | import react from '@vitejs/plugin-react-swc' 3 | 4 | // https://vitejs.dev/config/ 5 | export default defineConfig({ 6 | plugins: [react()], 7 | }) 8 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | peewee==3.14.4 2 | requests>=2.32.3 3 | pycurl==7.45.3 4 | schedule==1.1.0 5 | six==1.16.0 6 | playwright>=1.33.0 7 | pyquery==2.0.0 8 | fastapi==0.111.1 9 | uvicorn[standard]==0.30.3 10 | tornado==6.4.1 11 | -------------------------------------------------------------------------------- /scripts/ubuntu_dependence.sh: -------------------------------------------------------------------------------- 1 | sudo apt-get install -y gstreamer1.0-libav libnss3-tools libatk-bridge2.0-0 libcups2-dev libxkbcommon-x11-0 libxcomposite-dev libxrandr2 libgbm-dev libgtk-3-0 -------------------------------------------------------------------------------- /scylla/__init__.py: -------------------------------------------------------------------------------- 1 | from scylla import cli 2 | from ._version import __version__ 3 | 4 | __author__ = 'WildCat' 5 | __copyright__ = 'Copyright 2018, WildCat' 6 | -------------------------------------------------------------------------------- /scylla/__main__.py: -------------------------------------------------------------------------------- 1 | from 
multiprocessing.spawn import freeze_support 2 | 3 | from scylla.cli import app_main 4 | 5 | if __name__ == '__main__': 6 | freeze_support() 7 | app_main() 8 | -------------------------------------------------------------------------------- /scylla/_version.py: -------------------------------------------------------------------------------- 1 | __version__ = '1.1.7' -------------------------------------------------------------------------------- /scylla/cli.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import sys 3 | 4 | from scylla.config import batch_set_config, get_config 5 | from ._version import __version__ 6 | 7 | CMD_DESCRIPTION = """Scylla command line mode 8 | This command could start a scheduler for crawling and validating proxies. 9 | In addition, a web server with APIs can also be launched. 10 | 11 | """ 12 | 13 | 14 | def main(args) -> int: 15 | parser = argparse.ArgumentParser(description=CMD_DESCRIPTION, 16 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 17 | parser.add_argument('--no-webserver', '-no-ws', action='store_true', 18 | help='Prevent starting a web server for JSON API') 19 | parser.add_argument('--web-port', '-wp', type=int, default=8899, 20 | help='The port number for the web server') 21 | parser.add_argument('--web-host', '-wh', type=str, default='0.0.0.0', 22 | help='The hostname for the web server') 23 | parser.add_argument('--skip-scheduler', action='store_true', 24 | help='Prevent the scheduler from crawling') 25 | parser.add_argument('--version', '-v', action='store_true', 26 | help='Print the version of Scylla') 27 | parser.add_argument('--db-path', type=str, default='./scylla.db', 28 | help='The sqlite database file location') 29 | parser.add_argument('--validation-pool', type=int, default=31, 30 | help='The validation pool size (i.e. 
def handle_special_flags(args: dict):
    """Handle flags that end the program before anything else is started.

    Only the ``version`` entry is inspected: when it is truthy the version
    string is printed and the process exits with status 0. Otherwise the
    function returns without doing anything.

    :param args: the parsed command line arguments as a dictionary
    :raises KeyError: if ``args`` has no ``version`` entry
    """
    if not args['version']:
        return

    version_line = 'v{}'.format(__version__)
    print(version_line)
    sys.exit(0)
def get_config(key: str, default: str = None) -> Union[str, None]:
    """Look up a configuration value.

    :param key: name of the configuration entry
    :param default: value handed back when ``key`` has not been set
    :return: the stored value, or ``default`` if the key is missing
    """
    # The config store is a plain dict, so .get() mirrors a KeyError fallback.
    return _config_data_instance().get(key, default)
@pre_save(sender=ProxyIP)
def proxy_ip_on_pre_save_handler(model_class, instance: ProxyIP, created):
    """Round the latency down to a whole number before a ProxyIP is saved.

    Registered as a ``pre_save`` receiver with ``ProxyIP`` as the sender.

    :param model_class: the sender handed over by the signal (not used)
    :param instance: the ProxyIP that is about to be saved
    :param created: flag handed over by the signal (not used)
    """
    floored_latency = math.floor(instance.latency)
    instance.latency = floored_latency
def validate_proxy_ip(p: ProxyIP):
    """
    Validate a ProxyIP object and save the outcome

    Nothing happens when the ValidationPolicy built for ``p`` says that no
    validation is due. Otherwise a Validator is run against the proxy and a
    new ProxyIP carrying the results is passed to ``save_ip``.

    Args:
        p (ProxyIP): ProxyIP object
    """
    policy = ValidationPolicy(proxy_ip=p)

    if not policy.should_validate():
        return

    v = Validator(host=p.ip, port=int(p.port), using_https=policy.should_try_https())

    try:
        v.validate()
    except (KeyboardInterrupt, SystemExit):
        # The interruption is only logged; the state of the validator at
        # this point is still saved below.
        logger.info("KeyboardInterrupt terminates validate_proxy_ip: " + p.ip)

    meta = v.meta if v.meta else {}
    validated_ip = ProxyIP(ip=p.ip, port=p.port, **meta)
    # save valid ip into database
    validated_ip.latency = v.latency
    validated_ip.stability = v.success_rate
    validated_ip.is_valid = v.valid
    validated_ip.is_anonymous = v.anonymous

    # Increase attempts and https_attempts.
    # `validated_ip` is a brand-new instance whose counters start at their
    # default of 0, so the counts have to continue from the proxy that was
    # passed in. Otherwise every run stores 1 and the history kept on an
    # existing record is overwritten when save_ip() copies the values over.
    previous_attempts = p.attempts or 0
    previous_https_attempts = p.https_attempts or 0
    validated_ip.attempts = previous_attempts + 1
    validated_ip.https_attempts = previous_https_attempts + (1 if v.using_https else 0)

    if v.valid:
        validated_ip.is_https = v.using_https

    save_ip(validated_ip)
# The provider classes exported by this package as `all_providers`.
# KuaidailiProvider and XiciProvider are imported above but left commented
# out of the list. NOTE(review): the reason is not visible here — confirm
# before re-enabling them.
all_providers = [
    A2uProvider,
    CoolProxyProvider,
    Data5uProvider,
    FreeProxyListProvider,
    HttpProxyProvider,
    # KuaidailiProvider,
    SpyMeProvider,
    SpysOneProvider,
    # XiciProvider
    IpaddressProvider,
    ProxyListProvider,
    ProxyScraperProvider,
    ProxylistsProvider,
    ProxyNovaProvider,
    PubproxyProvider,
    RmccurdyProvider,
    RudnkhProvider,
    TheSpeedXProvider
]

# Provider references:
# https://github.com/franklingu/proxypool/issues/2
class A2uProvider(BaseProvider):
    """Provider for the plain-text list in the a2u/free-proxy-list repository."""

    def urls(self) -> [str]:
        """Return the raw list URL to crawl."""
        return [
            'https://raw.githubusercontent.com/a2u/free-proxy-list/master/free-proxy-list.txt',
        ]

    def parse(self, document: PyQuery) -> [ProxyIP]:
        """Collect every ``ip:port`` pair found in the document.

        :param document: the fetched page
        :return: a list of proxy ips, empty when the document has no content
        """
        ip_list: [ProxyIP] = []

        raw_html = document.html() if document is not None else None
        if not raw_html:
            # html() may return None (nothing to scan); re.findall would
            # raise TypeError on it
            return ip_list

        # One scan with two capture groups yields the same ip/port values as
        # matching the whole pair first and searching each hit twice.
        for ip, port in re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}):(\d{2,5})', raw_html):
            ip_list.append(ProxyIP(ip=ip, port=port))

        return ip_list

    @staticmethod
    def should_render_js() -> bool:
        """This provider does not ask for js rendering."""
        return False
class Comp0Provider(PlainTextProvider):
    """Plain-text provider; parsing is inherited from PlainTextProvider."""

    def urls(self) -> [str]:
        """Return the single plain-text list URL of this provider."""
        # NOTE(review): this is the same URL RudnkhProvider returns —
        # confirm whether a different source was intended here.
        source_url = 'https://proxy.rudnkh.me/txt'
        return [source_url]
class Data5uProvider(BaseProvider):
    """Provider for the proxy list shown on www.data5u.com."""

    def urls(self) -> [str]:
        """Return the page to crawl."""
        return [
            'http://www.data5u.com',
        ]

    def parse(self, document: PyQuery) -> [ProxyIP]:
        """Read ip and port from each row of the list.

        :param document: the fetched page
        :return: a list of proxy ips
        """
        ip_list: [ProxyIP] = []

        if document is None:
            return ip_list

        for ip_row in document.find('.wlist > ul > li:nth-child(2) .l2'):
            # Iterating a PyQuery selection yields raw lxml elements, which
            # support neither CSS selectors in find() nor text(). Wrap the
            # row again, as FreeProxyListProvider does.
            ip_row: PyQuery = PyQuery(ip_row)
            ip_element = ip_row.find('span:nth-child(1)')
            port_element = ip_row.find('span:nth-child(2)')

            if ip_element and port_element:
                p = ProxyIP(ip=ip_element.text(), port=port_element.text())
                ip_list.append(p)

        return ip_list

    @staticmethod
    def should_render_js() -> bool:
        """This provider does not ask for js rendering."""
        return False
class HttpProxyProvider(BaseProvider):
    """Provider for the proxy tables published on proxyhttp.net."""

    def parse(self, document: PyQuery) -> [ProxyIP]:
        """Read ip and port from each row of the ``proxytbl`` table.

        Rows whose port cell does not contain the expected ``//]]> <digits>``
        text are skipped.

        :param document: the fetched page
        :return: a list of proxy ips
        """
        ip_list: [ProxyIP] = []

        if document is None:
            return ip_list

        for ip_row in document.find('table.proxytbl tr'):
            # Iterating a PyQuery selection yields raw lxml elements, which
            # support neither CSS selectors in find() nor text(). Wrap the
            # row again, as FreeProxyListProvider does.
            ip_row: PyQuery = PyQuery(ip_row)
            ip_element = ip_row.find('td:nth-child(1)')
            port_element = ip_row.find('td:nth-child(2)')

            if not ip_element or not port_element:
                continue

            # An explicit no-match check replaces the former blanket
            # `except AttributeError`, which could also hide unrelated errors.
            port_match = re.search(r'//]]> (\d+)', port_element.text())
            if port_match is None:
                continue

            ip_list.append(ProxyIP(ip=ip_element.text(), port=port_match.group(1)))

        return ip_list

    def urls(self) -> [str]:
        """Return the pages to crawl."""
        return [
            'https://proxyhttp.net/free-list/proxy-anonymous-hide-ip-address/',
            'https://proxyhttp.net/',
            'https://proxyhttp.net/free-list/anonymous-server-hide-ip-address/2#proxylist',
        ]

    @staticmethod
    def should_render_js() -> bool:
        """This provider asks for js rendering."""
        return True
class KuaidailiProvider(BaseProvider):
    """Provider for the free proxy tables on www.kuaidaili.com."""

    def urls(self) -> [str]:
        """Return the pages to crawl."""
        return [
            'https://www.kuaidaili.com/free/',
            'https://www.kuaidaili.com/free/inha/2/',
        ]

    def parse(self, document: PyQuery) -> [ProxyIP]:
        """Read the ``IP`` and ``PORT`` cells from each table row.

        :param document: the fetched page
        :return: a list of proxy ips
        """
        ip_list: [ProxyIP] = []

        if document is None:
            return ip_list

        for ip_row in document.find('#list table tr'):
            # Iterating a PyQuery selection yields raw lxml elements, which
            # support neither CSS selectors in find() nor text(). Wrap the
            # row again, as FreeProxyListProvider does.
            ip_row: PyQuery = PyQuery(ip_row)
            ip_element = ip_row.find('td[data-title="IP"]')
            port_element = ip_row.find('td[data-title="PORT"]')

            if ip_element and port_element:
                p = ProxyIP(ip=ip_element.text(), port=port_element.text())
                ip_list.append(p)

        return ip_list

    @staticmethod
    def should_render_js() -> bool:
        """This provider asks for js rendering."""
        return True
[ProxyIP] = [] 16 | 17 | if document is None: 18 | return [] 19 | 20 | text = document.html() 21 | 22 | for ip_port in text.split('\n'): 23 | if ip_port.strip() == '' or not re.match(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:(\d{2,5})', ip_port): 24 | continue 25 | ip = re.search(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', ip_port).group(0) 26 | port = re.search(r':(\d{2,5})', ip_port).group(1) 27 | 28 | if ip and port: 29 | p = ProxyIP(ip=ip, port=port) 30 | ip_list.append(p) 31 | 32 | return ip_list 33 | 34 | @staticmethod 35 | def should_render_js() -> bool: 36 | return False 37 | -------------------------------------------------------------------------------- /scylla/providers/proxy_list_provider.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import re 3 | from typing import List 4 | 5 | from pyquery import PyQuery 6 | 7 | from scylla.database import ProxyIP 8 | from scylla.worker import Worker 9 | from .base_provider import BaseProvider 10 | import urllib.parse 11 | 12 | class ProxyListProvider(BaseProvider): 13 | 14 | def __init__(self): 15 | super().__init__() 16 | self.w = Worker() 17 | 18 | def parse(self, document: PyQuery) -> List[ProxyIP]: 19 | ip_list: List[ProxyIP] = [] 20 | 21 | if document is None: 22 | return [] 23 | 24 | for ul in document.find('#proxy-table > div.table-wrap ul'): 25 | js_code_element = ul.find('li.proxy script') 26 | 27 | if not js_code_element: 28 | return [] 29 | 30 | js_code = js_code_element.text() 31 | matched = re.findall(r"Proxy\('(.+)'\)", js_code) 32 | if matched and len(matched) > 0: 33 | encoded = matched[0] 34 | ip_port = base64.b64decode(encoded).decode("utf-8") 35 | ip = re.findall(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', ip_port)[0] 36 | port = re.findall(r':(\d{2,5})', ip_port)[0] 37 | ip_list.append(ProxyIP(ip=ip, port=port)) 38 | 39 | return ip_list 40 | 41 | def urls(self) -> List[str]: 42 | ret = [] 43 | first_url = 'http://proxy-list.org/english/index.php?p=1' 
class ProxyScraperProvider(BaseProvider):
    """Provider for the JSON list in the sunny9577/proxy-scraper repository."""

    def urls(self) -> [str]:
        """Return the raw JSON URL to crawl."""
        return ['https://raw.githubusercontent.com/sunny9577/proxy-scraper/master/proxies.json']

    def parse(self, document: PyQuery) -> [ProxyIP]:
        """Turn the JSON array of ``{"ip": ..., "port": ...}`` objects into proxies.

        :param document: the fetched page
        :return: a list of proxy ips, empty when there is no content or the
            payload is not a non-empty JSON array
        :raises json.JSONDecodeError: if the content is not valid JSON
        """
        ip_list: [ProxyIP] = []

        text = document.html() if document is not None else None
        if not text:
            # html() may return None; json.loads would raise TypeError on it
            return ip_list

        json_object = json.loads(text)
        if not json_object or not isinstance(json_object, list):
            return ip_list

        for ip_port in json_object:
            # Skip malformed entries instead of failing the whole list
            if not isinstance(ip_port, dict) or 'ip' not in ip_port or 'port' not in ip_port:
                continue
            p = ProxyIP(ip=ip_port['ip'], port=ip_port['port'])
            ip_list.append(p)

        return ip_list

    @staticmethod
    def should_render_js() -> bool:
        """This provider does not ask for js rendering."""
        return False
document: PyQuery) -> [ProxyIP]: 18 | ip_list: [ProxyIP] = [] 19 | 20 | for tr in document.find('table table tr'): 21 | tr: PyQuery = tr 22 | ip_element = tr.find('td:nth-of-type(1)') 23 | port_element = tr.find('td:nth-of-type(2)') 24 | if ip_element and port_element: 25 | ip = re.search(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', ip_element.text).group(0) 26 | port = re.search(r'\d{2,5}', port_element.text).group(0) 27 | ip_list.append(ProxyIP(ip=ip, port=port)) 28 | 29 | return ip_list 30 | 31 | def urls(self) -> [str]: 32 | ret = set([]) 33 | country_url = 'http://www.proxylists.net/countries.html' 34 | country_page = self.w.get_html(country_url, False) 35 | if country_page: 36 | for a in country_page.find('a'): 37 | relative_path = a.attrib['href'] 38 | if self.country_patten.match(relative_path) : 39 | ret.update(self.gen_url_for_country(self.country_patten.findall(relative_path)[0])) 40 | break 41 | return list(ret) 42 | 43 | def gen_url_for_country(self, country) -> [str]: 44 | ret = [] 45 | first_page = self.w.get_html('http://www.proxylists.net/{}_0.html'.format(country), False) 46 | for a in first_page.find('table table tr:last-of-type a'): 47 | ret.append('http://www.proxylists.net/{}'.format(a.attrs['href'])) 48 | return ret 49 | 50 | @staticmethod 51 | def should_render_js() -> bool: 52 | return True 53 | -------------------------------------------------------------------------------- /scylla/providers/proxynova_provider.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from pyquery import PyQuery 4 | 5 | from scylla.database import ProxyIP 6 | from .base_provider import BaseProvider 7 | 8 | 9 | class ProxyNovaProvider(BaseProvider): 10 | 11 | def parse(self, document: PyQuery) -> [ProxyIP]: 12 | ip_list: [ProxyIP] = [] 13 | 14 | for tr in document.find('#tbl_proxy_list > tbody:nth-child(2) > tr'): 15 | tr: PyQuery = tr 16 | if not tr.attr('data-proxy-id'): 17 | continue 18 | 19 | script_element = 
class PubproxyProvider(PlainTextProvider):
    """Plain-text provider for the pubproxy.com API; parsing is inherited."""

    def urls(self) -> [str]:
        """Return the single API url to crawl."""
        api_url = 'http://pubproxy.com/api/proxy?limit=5&format=txt&type=http&level=anonymous&last_check=60&no_country=CN'
        return [api_url]
class SpyMeProvider(BaseProvider):
    """Provider for the plain-text list in the clarketm/proxy-list repository."""

    def urls(self) -> [str]:
        """Return the raw list URL to crawl."""
        return [
            'https://raw.githubusercontent.com/clarketm/proxy-list/master/proxy-list.txt',
        ]

    def parse(self, document: PyQuery) -> [ProxyIP]:
        """Collect every ``ip:port`` pair found in the document.

        :param document: the fetched page
        :return: a list of proxy ips, empty when the document has no content
        """
        ip_list: [ProxyIP] = []

        text = document.html() if document is not None else None
        if not text:
            # html() may return None (nothing to scan); re.findall would
            # raise TypeError on it
            return ip_list

        # One scan with two capture groups yields the same ip/port values as
        # matching the whole pair first and searching each hit twice.
        for ip, port in re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}):(\d{2,5})', text):
            ip_list.append(ProxyIP(ip=ip, port=port))

        return ip_list

    @staticmethod
    def should_render_js() -> bool:
        """This provider does not ask for js rendering."""
        return False
class XiciProvider(BaseProvider):
    """Provider that reads the proxy table (``#ip_list``) of xicidaili.com."""

    def urls(self) -> [str]:
        """Return the URLs this provider downloads."""
        return [
            'http://www.xicidaili.com/nn',
            'http://www.xicidaili.com/wn',
        ]

    def parse(self, document: PyQuery) -> [ProxyIP]:
        """
        Extract one ProxyIP per table row that has both an IP and a port cell.

        :param document: the fetched page wrapped in a PyQuery object
        :return: the proxies found in the ``#ip_list`` table
        """
        ip_list: [ProxyIP] = []

        # Iterating a PyQuery selection directly yields raw lxml elements,
        # which offer neither CSS-selector find() nor a text() method.
        # items() yields each row wrapped as a PyQuery object instead.
        for ip_row in document.find('#ip_list tr').items():
            ip_element = ip_row.find('td:nth-child(2)')
            port_element = ip_row.find('td:nth-child(3)')

            # Rows without both cells produce empty (falsy) selections
            # and are skipped.
            if ip_element and port_element:
                ip_list.append(ProxyIP(ip=ip_element.text(), port=port_element.text()))

        return ip_list

    @staticmethod
    def should_render_js() -> bool:
        """The table is read from the fetched HTML without JavaScript rendering."""
        return False
def get_proxy_and_forward(self):
    """
    Forward the current request, through a pooled proxy unless disabled.

    When the ``disable_forward_proxy`` config value is truthy the request is
    forwarded directly; otherwise a proxy is picked with get_proxy() and its
    ip/port are handed to forward().
    """
    # The scheme of the requested URI decides whether an HTTPS-capable
    # proxy is asked for.
    wants_https = self.request.uri.startswith('https')

    if get_config('disable_forward_proxy', default=False):
        self.forward()
        return

    upstream_proxy = get_proxy(https=wants_https)
    self.forward(host=upstream_proxy.ip, port=upstream_proxy.port)
def handle_response(self, response: HTTPResponse):
    """
    Relay the upstream response to the client and finish the request.

    A non-empty body is written back; otherwise an upstream error, if any,
    is logged at debug level. The request is finished in every case.

    :param response: the response object produced by the HTTP client
    """
    payload = response.body

    if payload:
        self.write(payload)
    elif response.error:
        logger.debug('The forward proxy has an error: {}'.format(response.error))

    # Every path ends the request, with or without a body.
    self.finish()
class ProxyCheckServicesBase(object):
    """
    ProxyCheckServicesBase is the abstract class for proxy checking services (i.e. IP checking services).

    :raises NotImplementedError: The check() method must be implemented by the subclasses
    """

    def check(self, ip: str, port: int):
        """
        Check a proxy; must be overridden by subclasses.

        ``self`` is declared explicitly so that calling ``check(ip, port)`` on
        an instance reaches this method and raises NotImplementedError, rather
        than failing earlier with a TypeError about the argument count.

        :param ip: the IP address of the proxy to check
        :param port: the port of the proxy to check
        :raises NotImplementedError: always, in this base class
        """
        raise NotImplementedError
def validate_ips(validator_queue: Queue, validator_pool: ThreadPoolExecutor):
    """
    Move proxies from the validator queue into the validation thread pool.

    Blocks on the queue forever; returns only when a KeyboardInterrupt or
    SystemExit is raised while waiting for or submitting a proxy.

    :param validator_queue: queue delivering the proxies to validate
    :param validator_pool: executor that runs validate_proxy_ip for each proxy
    """
    try:
        while True:
            candidate = validator_queue.get()
            validator_pool.submit(validate_proxy_ip, p=candidate)
    except (KeyboardInterrupt, SystemExit):
        return
def join(self):
    """
    Wait for worker processes and validator threads

    Keeps joining as long as the worker process or the validator thread is
    alive. Members that are still None (for example when start() has not been
    called) are skipped instead of being dereferenced. A KeyboardInterrupt or
    SystemExit raised while waiting ends the wait.
    """
    while True:
        alive = [
            member
            for member in (self.worker_process, self.validator_thread)
            if member is not None and member.is_alive()
        ]
        if not alive:
            return

        try:
            for member in alive:
                member.join()
        except (KeyboardInterrupt, SystemExit):
            return
class Timer(object):
    """Stopwatch that records a start and a stop reading of ``timer()``."""

    def __init__(self):
        self._start = self._stop = 0

    def start(self):
        """Record the current timer reading as the start mark."""
        self._start = timer()

    def stop(self):
        """Record the current timer reading as the stop mark."""
        self._stop = timer()

    def cost(self, funcs, args):
        """
        Run the callables in order and return the elapsed time between marks.

        :param funcs: callables to invoke, in order
        :param args: positional-argument tuples paired with ``funcs``; a falsy
                     entry (such as None) means "call without arguments"
        :return: the stop reading minus the start reading
        """
        # TODO: handle ConnectionRefusedError
        self.start()
        for callback, params in zip_longest(funcs, args):
            callback(*(params or ()))
        self.stop()

        return self._stop - self._start
def ping(host: str, port: int, count: int = 10, sleep: float = 0.2, timeout: float = 1) -> (float, float):
    """
    Ping a server and port with tcp socket
    :param host: the hostname
    :param port: the port number
    :param count: number of connection tries, by default it is 10
    :param sleep: length of sleep time in between sequent pings, by default it is 0.2
    :param timeout: socket timeout handed to Ping for every attempt, by default it is 1
    :return: a tuple for (average_latency, success_rate)
    """
    p = Ping(host=host, port=port, timeout=timeout)
    p.ping(count=count, sleep=sleep)
    return p.get_average(), p.get_success_rate()
Whether or not to validate the proxy 10 | 2. Use http or https to validate the proxy 11 | 12 | After 3 attempts, the validator should try no more attempts in 24 hours after its creation. 13 | """ 14 | proxy_ip: ProxyIP = None 15 | 16 | def __init__(self, proxy_ip: ProxyIP): 17 | """ 18 | Constructor of ValidationPolicy 19 | :param proxy_ip: the ProxyIP instance to be validated 20 | """ 21 | self.proxy_ip = proxy_ip 22 | 23 | def should_validate(self) -> bool: 24 | if self.proxy_ip.attempts == 0: 25 | return True 26 | elif self.proxy_ip.attempts < 3 \ 27 | and datetime.now() - self.proxy_ip.created_at < timedelta(hours=24) \ 28 | and not self.proxy_ip.is_valid: 29 | # If the proxy is created within 24 hours, the maximum attempt count is 3 30 | return True 31 | elif timedelta(hours=48) > datetime.now() - self.proxy_ip.created_at > timedelta(hours=24) \ 32 | and self.proxy_ip.attempts < 6: 33 | # The proxy will be validated up to 6 times with in 48 hours after 24 hours 34 | return True 35 | elif datetime.now() - self.proxy_ip.created_at < timedelta(days=7) \ 36 | and self.proxy_ip.attempts < 21 \ 37 | and self.proxy_ip.is_valid: 38 | # After 48 hours the proxy is created, the proxy will be validated up to 39 | # 21 times (3 times a day on average) if it is valid within 7 days. 
def validate_proxy(self):
    """
    Check the proxy against the IP checker API and load its meta data.

    A successful first request marks the proxy as valid, and as anonymous
    when the IP reported through the proxy differs from get_current_ip().
    A second request asks the geoip service for meta information about the
    reported IP.

    Request failures and malformed replies (invalid JSON, missing keys) are
    logged at debug level instead of being raised; whatever was established
    before the failure is kept.
    """
    protocol = 'https' if self._using_https else 'http'
    proxy_str = '{}://{}:{}'.format(protocol, self._host, self._port)
    checking_api = IP_CHECKER_API_SSL if self._using_https else IP_CHECKER_API

    try:
        # First request for checking IP
        r = requests.get(checking_api, proxies={'https': proxy_str, 'http': proxy_str}, verify=False, timeout=15)
        if not r.ok:
            return

        # A reply that is not the expected JSON object raises here and is
        # handled by the ValueError/KeyError/TypeError clause below.
        seen_ip = json.loads(r.text)['ip']

        if seen_ip != get_current_ip():
            self._anonymous = True
        self._valid = True

        # A second request for meta info
        r2 = requests.get('https://api.ip.sb/geoip/{}'.format(seen_ip), timeout=15)
        jresponse = r2.json()

        # Load meta data
        # TODO: better location check
        self._meta = {
            'location': '{},{}'.format(jresponse['latitude'], jresponse['longitude']),
            'organization': jresponse['organization'] if 'organization' in jresponse else None,
            'region': jresponse['region'],
            'country': jresponse['country_code'],
            'city': jresponse['city'],
        }

    except requests.Timeout:
        logger.debug('Catch requests.Timeout for proxy ip: {}'.format(self._host))
    except requests.RequestException as e:
        logger.debug('Catch requests.RequestException for proxy ip: {}'.format(self._host))
        logger.debug(e.__str__())
    except (ValueError, KeyError, TypeError) as e:
        # Invalid JSON or an unexpected payload shape from either service.
        logger.debug('Catch malformed response for proxy ip: {}'.format(self._host))
        logger.debug(e.__str__())
@app.get('/api/v1/proxies')
async def api_v1_proxies(limit: int = 20, page: int = 1, anonymous: str = 'any', https: str = 'true',
                         countries: Optional[str] = None):
    """
    List valid proxies, filtered and paginated.

    :param limit: number of proxies per page
    :param page: 1-based page number
    :param anonymous: 'true' or 'false' to filter on anonymity, anything else for no filter
    :param https: 'true' or 'false' to filter on HTTPS support, anything else for no filter
    :param countries: optional comma-separated list of country codes
    :return: a dict with the proxies of the requested page and paging info
    """
    proxy_query = _get_valid_proxies_query()

    # Every filter narrows the query built so far, so the anonymous, https and
    # country filters combine instead of the later ones replacing the earlier.
    # peewee builds SQL expressions from `==`, so the comparisons with
    # True/False below are intentional.
    if anonymous == 'true':
        proxy_query = proxy_query.where(ProxyIP.is_anonymous == True)  # noqa: E712
    elif anonymous == 'false':
        proxy_query = proxy_query.where(ProxyIP.is_anonymous == False)  # noqa: E712

    if https == 'true':
        proxy_query = proxy_query.where(ProxyIP.is_https == True)  # noqa: E712
    elif https == 'false':
        proxy_query = proxy_query.where(ProxyIP.is_https == False)  # noqa: E712

    # Ignore empty entries such as the ones produced by "US,,DE" or a trailing comma.
    country_list = [c.strip() for c in countries.split(',') if c.strip()] if countries else []
    if country_list:
        proxy_query = proxy_query.where(ProxyIP.country << country_list)

    count = proxy_query.count()  # count before sorting

    proxies = proxy_query.order_by(
        ProxyIP.updated_at.desc(), ProxyIP.latency
    ).offset((page - 1) * limit).limit(limit)

    logger.debug(f'Perform SQL query: {proxy_query.sql()}')

    proxy_list = []

    for p in proxies:
        dict_model = model_to_dict(p)
        dict_model['created_at'] = dict_model['created_at'].isoformat()
        dict_model['updated_at'] = dict_model['updated_at'].isoformat()
        proxy_list.append(dict_model)

    return {
        'proxies': proxy_list,
        'count': count,
        'per_page': limit,
        'page': page,
        # A non-positive limit cannot be used as a page size; report zero
        # pages rather than dividing by it.
        'total_page': math.ceil(count / limit) if limit > 0 else 0,
    }
class Worker:
    """Fetches pages either with a requests session or with a Chromium browser."""

    def __init__(self):
        """Initialize the worker object

        Starts Playwright, launches Chromium and prepares the requests session.
        """

        # Playwright has to stay running for as long as the browser is used.
        # A `with sync_playwright()` block would stop it on exit and leave
        # self.browser unusable, so it is started here and stopped in stop().
        self.playwright = sync_playwright().start()
        try:
            self.browser = self.playwright.chromium.launch()
        except Exception:
            # Do not leave Playwright running when the browser cannot start.
            self.playwright.stop()
            raise

        self.requests_session = requests.Session()
        self.requests_session.headers['User-Agent'] = DEFAULT_USER_AGENT

    def stop(self):
        """Clean the session

        Closes the browser, stops Playwright and closes the requests session.
        """

        try:
            self.browser.close()
            self.playwright.stop()
        finally:
            # Release the HTTP session even if shutting the browser down fails.
            self.requests_session.close()

    def get_html(self, url: str, render_js: bool = True) -> Union[PyQuery, None]:
        """Get html from a specific URL

        :param url: the URL
        :param render_js: whether to load the page in the browser (True) or
                          with a plain HTTP request (False), defaults to True
        :return: the parsed document, or None when the page could not be fetched
        """

        if render_js:
            return self._get_html_js(url)
        else:
            return self._get_html_no_js(url)

    def _get_html_no_js(self, url: str) -> Union[PyQuery, None]:
        """Fetch ``url`` with the requests session and parse it with PyQuery."""
        try:
            # TODO: load config for timeout
            response: Response = self.requests_session.get(url, timeout=DEFAULT_TIMEOUT_SECONDS)
        except requests.RequestException:
            logger.warning('[Worker] Cannot get this url: ' + url)
            return None
        except (KeyboardInterrupt, SystemExit, InterruptedError):
            self.stop()
            return None

        if response.ok:
            doc = PyQuery(response.text)
            return doc
        else:
            logger.debug(f'Request for {url} failed, status code: {response.status_code}')
            return None

    def _get_html_js(self, url: str) -> Union[PyQuery, None]:
        """Load ``url`` in a browser page and parse the resulting content."""
        page = self.browser.new_page()
        try:
            # Playwright expects the timeout in milliseconds.
            response = page.goto(url=url, timeout=DEFAULT_TIMEOUT_SECONDS * 1000, wait_until='domcontentloaded')

            if not response:
                logger.debug(f'Request for {url} failed because response is None')
                return None

            if response.ok:
                doc = PyQuery(page.content())
                return doc
            else:
                logger.debug(f'Request for {url} failed, status code: {response.status}')
                return None
        finally:
            # Each call opens a new page; close it so pages do not pile up
            # in the long-lived browser.
            page.close()
import io
import os

from setuptools import setup, find_packages

import scylla

here = os.path.abspath(os.path.dirname(__file__))
# Import the README and use it as the long-description.
with io.open(os.path.join(here, 'README.rst'), encoding='utf-8') as f:
    long_description = '\n' + f.read()

# Resolve requirements.txt next to this file (like the README above) so the
# script does not depend on the current working directory. Blank lines and
# comment lines are not requirement specifiers and are left out.
with io.open(os.path.join(here, 'requirements.txt'), encoding='utf-8') as f:
    required = [
        line.strip()
        for line in f
        if line.strip() and not line.lstrip().startswith('#')
    ]


setup(
    name='scylla',
    python_requires='>=3.6.0',
    # If your package is a single module, use this instead of 'packages':
    packages=find_packages(exclude=["*.tests", "*.tests.*", "tests.*", "tests"]),
    entry_points={
        'console_scripts': ['scylla = scylla.cli:app_main']
    },
    version=scylla.__version__,
    description='Intelligent proxy pool for Humans™',
    long_description=long_description,
    author=scylla.__author__,
    author_email='wildcat.name@gmail.com',
    url='https://github.com/imWildCat/scylla',
    # download_url='https://github.com/imWildCat/scylla/archive/0.1.0.tar.gz',
    keywords=['proxy', 'api', 'scylla'],
    classifiers=[
        # Trove classifiers
        # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers
        'Programming Language :: Python',
        'Programming Language :: Python :: 3.6',
        'Programming Language :: Python :: Implementation :: CPython',
        'Programming Language :: Python :: Implementation :: PyPy',
        'License :: OSI Approved :: Apache Software License'
    ],
    install_requires=required,
    include_package_data=True,
)
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/imWildCat/scylla/b051fd586f2e3268bb07f8d94a0b27dce01dea12/tests/__init__.py -------------------------------------------------------------------------------- /tests/cli_test.py: -------------------------------------------------------------------------------- 1 | from scylla.cli import main 2 | 3 | 4 | def test_cli(mocker): 5 | # batch_set_config_func = mocker.patch('scylla.app_config.batch_set_config') 6 | scheduler_start = mocker.patch('scylla.scheduler.Scheduler.start') 7 | # create_db_tables = mocker.patch('scylla.database.create_db_tables') 8 | 9 | ret = main(['-no-ws']) 10 | 11 | assert 0 == ret 12 | 13 | # batch_set_config_func.assert_called_once() # FIXME: assert not called but actually called 14 | # create_db_tables.assert_called_once() # FIXME: assert not called but actually called 15 | scheduler_start.assert_called_once() 16 | -------------------------------------------------------------------------------- /tests/config_test.py: -------------------------------------------------------------------------------- 1 | from scylla.config import set_config, get_config, batch_set_config 2 | 3 | 4 | def test_config(): 5 | set_config('foo', 'bar') 6 | config_str = get_config('foo') 7 | assert 'bar' == config_str 8 | 9 | 10 | def test_config_default(): 11 | config_str = get_config('empty', default='baz') 12 | assert 'baz' == config_str 13 | 14 | 15 | def test_batch_set(mocker): 16 | m = mocker.patch('scylla.config.set_config') 17 | batch_set_config(**{'a': 1, 'b': 2}) 18 | m.assert_called() 19 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | def _create_db_file(): 2 | from scylla.database import create_connection, create_db_tables 3 | create_connection() 4 | create_db_tables() 5 | 6 | 7 | def _delete_db_file(): 8 | 
def gen_random_ip() -> str:
    """
    Return a random IPv4 address in dotted-quad notation.

    A random integer in ``[1, 0xffffffff]`` is packed as a big-endian
    unsigned 32-bit value and rendered with ``socket.inet_ntoa``.
    From: https://stackoverflow.com/questions/21014618/python-randomly-generated-ip-address-of-the-string
    """
    address_number = random.randint(1, 0xffffffff)
    packed_address = struct.pack('>I', address_number)
    return socket.inet_ntoa(packed_address)
def test_save_ip():
    """Check that ``save_ip`` keeps a single row per (ip, port) pair.

    Saving the same ip/port twice must leave one row; saving the same ip
    with a different port must add a second row.

    Only rows for the test address are removed, and removal happens in a
    ``finally`` block, so a failing assertion does not leak rows and rows
    created elsewhere are left alone.
    """
    test_ip = '192.168.0.1'

    def count_rows() -> int:
        return ProxyIP.select().where(ProxyIP.ip == test_ip).count()

    def remove_rows():
        ProxyIP.delete().where(ProxyIP.ip == test_ip).execute()

    # Start clean: leftovers from an earlier aborted run would skew the counts.
    remove_rows()
    try:
        save_ip(ProxyIP(ip=test_ip, port=443, latency=200, stability=0.5))
        # basically the same ip
        save_ip(ProxyIP(ip=test_ip, port=443, latency=200, stability=0.5))
        assert count_rows() == 1

        save_ip(ProxyIP(ip=test_ip, port=80, latency=200, stability=0.5))
        assert count_rows() == 2
    finally:
        remove_rows()
def test_ping_class():
    """Ping www.example.com on port 80 and sanity-check the statistics."""
    pinger = Ping('www.example.com', port=80)
    pinger.ping(5, 0.01)

    # The average has to sit between the extremes and cannot be negative.
    assert pinger.get_maximum() >= pinger.get_average()
    assert pinger.get_minimum() <= pinger.get_average()
    assert pinger.get_average() >= 0

    # The success rate is only required to be non-negative here.
    assert pinger.get_success_rate() >= 0.0
def test_should_validate_policy_attempts_3_after_24h_in_48h(p: ProxyIP):
    """A proxy with 3 attempts that was created 25 hours ago is validated again."""
    created_25_hours_ago = datetime.now() - timedelta(hours=25)

    p.attempts = 3
    p.created_at = created_25_hours_ago

    assert ValidationPolicy(proxy_ip=p).should_validate()
# NOTE: these three tests were originally all named ``test_proxy``. Later
# definitions replaced earlier ones in the module namespace, so pytest only
# collected the last one. They now have distinct names so each one runs.
# NOTE(review): the first two call validate_proxy() un-mocked against fixed
# hosts, so they presumably need network access - confirm this is acceptable
# in CI.


def test_proxy(validator):
    """Smoke test: ``validate_proxy`` runs on the ``validator`` fixture without raising."""
    validator.validate_proxy()


def test_proxy_validator2(validator2):
    """Smoke test: ``validate_proxy`` runs on the ``validator2`` fixture without raising."""
    validator2.validate_proxy()


def test_validate(validator2, mocker):
    """``validate`` must call ``validate_latency`` and ``validate_proxy`` once each."""
    latency_mock = mocker.patch('scylla.validator.Validator.validate_latency')
    proxy_mock = mocker.patch('scylla.validator.Validator.validate_proxy')

    validator2.validate()

    latency_mock.assert_called_once()
    proxy_mock.assert_called_once()
render_js=True).html() 34 | # assert '<title>' in html 35 | # assert '<html' in html 36 | # assert '<body' in html # Note: The actual body tag is `<body style="">` 37 | -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "module": "commonjs", 4 | "moduleResolution": "node", 5 | "newLine": "LF", 6 | "outDir": "./lib/", 7 | "target": "es5", 8 | "sourceMap": true, 9 | "declaration": true, 10 | "jsx": "preserve", 11 | "lib": [ 12 | "es2017", 13 | "dom" 14 | ], 15 | "strict": true, 16 | "noUnusedLocals": true, 17 | "noUnusedParameters": true, 18 | "noImplicitReturns": true, 19 | "noFallthroughCasesInSwitch": true 20 | }, 21 | "include": [ 22 | "frontend_legacy/src/**/*" 23 | , "frontend/src/LegacyApp.tsx" ], 24 | "exclude": [ 25 | ".git", 26 | "node_modules", 27 | "scylla", 28 | "./tests", 29 | "./docs" 30 | ] 31 | } --------------------------------------------------------------------------------