├── README.md ├── get_cf_clearance.py ├── server.py └── test ├── response.txt └── test.py /README.md: -------------------------------------------------------------------------------- 1 | # cf-scraper-py 2 | 3 | 4 | 1. A remake of https://github.com/zfcsoftware/cf-clearance-scraper/tree/main for a python based solution 5 | 2. Invokes a browser automation tool to open up a website with https://github.com/kaliiiiiiiiii/undetected-playwright-python 6 | 3. Sends back a useragent used and the token for scraping purposes 7 | 8 | 9 | INSTRUCTIONS 10 | 1. Run server.py 11 | 2. Format your python request like shown in the test folder, please make sure to set the exec_path in get_cf and enable permissions if not running localhost. 12 | 13 | # Capsolver Sponsor 14 | [![CapSolver Ads](https://github.com/user-attachments/assets/793acd61-2ad9-46cf-bdec-60a61be962e1)](https://www.capsolver.com/?utm_source=github&utm_medium=repo&utm_campaign=scraping&utm_term=cf-scraper-py) 15 | 16 | Alternatively, you may use Capsolver. For more information and to get started, visit the official [Capsolver website](https://www.capsolver.com/?utm_source=github&utm_medium=repo&utm_campaign=scraping&utm_term=cf-scraper-py). 17 | -------------------------------------------------------------------------------- /get_cf_clearance.py: -------------------------------------------------------------------------------- 1 | from undetected_playwright.sync_api import sync_playwright 2 | import sys 3 | import json 4 | import time 5 | 6 | def get_cf_clearance_cookie(url): 7 | browser = None 8 | used_user_agent = None 9 | cf_clearance = None 10 | 11 | try: 12 | with sync_playwright() as p: 13 | args = ["--disable-blink-features=AutomationControlled"] 14 | browser = p.chromium.launch(args=args, headless=False, executable_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome') 15 | context = browser.new_context() 16 | page = context.new_page() 17 | 18 | used_user_agent = page.evaluate("navigator.userAgent") 19 | page.goto(url) 20 | time.sleep(4) 21 | 22 | cookies = context.cookies() 23 | cf_clearance = next((cookie['value'] for cookie in cookies if cookie['name'] == 'cf_clearance'), None) 24 | 25 | except Exception as e: 26 | print(f"Error during Playwright execution: {e}", file=sys.stderr) 27 | finally: 28 | if browser: 29 | try: 30 | browser.close() 31 | except Exception as e: 32 | print(f"Error while closing the browser: {e}", file=sys.stderr) 33 | 34 | result = { 35 | "cf_clearance": cf_clearance, 36 | "user_agent": used_user_agent 37 | } 38 | print(json.dumps(result, indent=4)) # Pretty-print the JSON result 39 | 40 | if __name__ == '__main__': 41 | if len(sys.argv) != 2: 42 | print("Usage: python get_cf_clearance.py ", file=sys.stderr) 43 | sys.exit(1) 44 | 45 | url = sys.argv[1] 46 | get_cf_clearance_cookie(url) 47 | -------------------------------------------------------------------------------- /server.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, request, jsonify 2 | import subprocess 3 | import json 4 | import sys 5 | 6 | app = Flask(__name__) 7 | 8 | @app.route('/get_cf_clearance', methods=['POST']) 9 | def get_cf_clearance(): 10 | data = request.json 11 | url = data.get('url') 12 | 13 | if not url: 14 | return jsonify({'error': 'URL is required'}), 400 15 | 16 | try: 17 | print(f"Received request for URL: {url}") 18 | print(f"Flask server is using Python executable: {sys.executable}") 19 | result = subprocess.run( 20 | [sys.executable, 'get_cf_clearance.py', url], 21 | capture_output=True, text=True 22 | ) 23 | 24 | print(f"Subprocess stdout: {result.stdout}") 25 | print(f"Subprocess stderr: {result.stderr}") 26 | try: 27 | output = json.loads(result.stdout) 28 | except json.JSONDecodeError as e: 29 | print("JSON decode error:", e, file=sys.stderr) 30 | print(f"Subprocess stdout: {result.stdout}", file=sys.stderr) 31 | return jsonify({'error': 'Failed to parse Playwright script output'}), 500 32 | 33 | cf_clearance = output.get('cf_clearance') 34 | user_agent = output.get('user_agent') 35 | 36 | if cf_clearance and user_agent: 37 | response_data = { 38 | 'cf_clearance': cf_clearance, 39 | 'user_agent': user_agent 40 | } 41 | formatted_response = json.dumps(response_data, indent=4) 42 | return app.response_class(formatted_response, content_type='application/json') 43 | else: 44 | return jsonify({'error': 'Could not retrieve cf_clearance token'}), 500 45 | except subprocess.CalledProcessError as e: 46 | print(f"Subprocess error: {e}", file=sys.stderr) 47 | print(f"Standard Output: {e.stdout}") 48 | print(f"Standard Error: {e.stderr}") 49 | return jsonify({'error': 'Failed to execute Playwright script', 'stdout': e.stdout, 'stderr': e.stderr}), 500 50 | 51 | if __name__ == '__main__': 52 | app.run(host='0.0.0.0', port=5001) # Make sure to use a free port 53 | -------------------------------------------------------------------------------- /test/response.txt: -------------------------------------------------------------------------------- 1 | { 2 | 'cf_clearance': 'N6oYZRzN8P4z_j637KZPjdzRBxJtFe5F8ZACShnaeQA-1718988889-1.0.1.1-KJj4enwxWVVGe6TSrANKrOB0A5sbHmTnefnXuhNFCcBOS.MocmcE8nVUo8mMeAaCB8yUfjKUQfEyLl25twSgkQ', 3 | 'user_agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36' 4 | } 5 | -------------------------------------------------------------------------------- /test/test.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | url = 'http://localhost:5001/get_cf_clearance' 4 | #used deliveroo for the interstit test 5 | data = {'url': 'https://deliveroo.co.uk/login'} 6 | 7 | response = requests.post(url, json=data) 8 | 9 | if response.status_code == 200: 10 | print(response.json()) 11 | else: 12 | print(f"Failed to get cf_clearance token: {response.text}") 13 | --------------------------------------------------------------------------------