├── utils
│   ├── text_dealer.py
│   ├── json_dealer.py
│   ├── config_loader.py
│   ├── template_op_gener.py
│   ├── template_field_gener.py
│   └── wq_info_loader.py
├── test_script.sh
├── .gitignore
├── prompts
│   ├── template_evaluating.yaml
│   └── template_generating.yaml
├── evaluator
│   ├── construct_prompts.py
│   ├── backtest_with_wq.py
│   └── backtest_with_wq_mul.py
├── main_scraper.py
├── config.yaml
├── main_evaluator.py
├── main_researcher.py
├── main.py
├── environment.yml
├── researcher
│   ├── generate_alpha.py
│   ├── generate_template.py
│   └── construct_prompts.py
├── scraper
│   ├── preprocess_texts.py
│   └── scrap_posts_from_wq.py
└── README.md

/utils/text_dealer.py:
--------------------------------------------------------------------------------
1 | def truncate_text(text, max_chars=5000):
2 |     """Truncate the string if it is too long (preferably at sentence/paragraph separators)."""
3 |     if len(text) <= max_chars:
4 |         return text
5 |     # Simple approach: truncate by character count
6 |     return text[:max_chars] + "... [TRUNCATED]"
--------------------------------------------------------------------------------
/test_script.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Run backtests in an automatic loop: 57 minutes per round, then pause for 3 minutes
4 | 
5 | # Infinite loop
6 | while true
7 | do
8 |   echo "🚀 Starting main_evaluator.py ..."
9 |   python main_evaluator.py &
10 |   PID=$!
11 | 
12 |   # Run for 57 minutes (57*60 seconds)
13 |   sleep $((57 * 60))
14 | 
15 |   echo "⏹ Stopping main_evaluator.py (PID=$PID)..."
16 |   kill $PID
17 | 
18 |   # Wait 3 minutes
19 |   echo "🕒 Waiting 3 minutes before restarting..."
20 |   sleep $((3 * 60))
21 | done
22 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Python caches and virtual environments
2 | __pycache__/
3 | *.py[cod]
4 | *.so
5 | *.egg
6 | *.egg-info/
7 | .eggs/
8 | *.pyo
9 | *.pyd
10 | .Python
11 | venv/
12 | env/
13 | .venv/
14 | ENV/
15 | *.sqlite3
16 | *.db
17 | 
18 | # IDE/editor files
19 | .vscode/
20 | .idea/
21 | *.swp
22 | *.swo
23 | 
24 | # Operating system
25 | .DS_Store
26 | Thumbs.db
27 | 
28 | # Logs/temporary files
29 | *.log
30 | *.tmp
31 | 
32 | # Jupyter/experiment data
33 | .ipynb_checkpoints/
34 | 
35 | # Project drafts and data
36 | data/alpha_db*
37 | data/hypothesis_db*
38 | data/template_db*
39 | test/
40 | 
41 | # Data auto-generated by Playwright/browser
42 | playwright/.cache/
43 | 
44 | # Other executable or temporary files
45 | *.bak
--------------------------------------------------------------------------------
/prompts/template_evaluating.yaml:
--------------------------------------------------------------------------------
1 | fix_fast_expression: |-
2 |   # Role
3 |   You are an expert in writing WorldQuant Brain Fast Expressions.
4 | 
5 |   # Task
6 |   Your task is to correct a syntactically invalid Fast Expression based on the provided simulation error message.
7 |   You should preserve the original intent of the expression as much as possible and output only the corrected expression (do not include any additional text).
8 | 
9 |   Here is an invalid WorldQuant Brain Fast Expression:
10 |   {{ fast_expression }}
11 | 
12 |   And the simulation error message is:
13 |   {{ error_mes }}
14 | 
15 |   Now please generate the corrected expression below (do not include any additional text):
--------------------------------------------------------------------------------
/evaluator/construct_prompts.py:
--------------------------------------------------------------------------------
1 | import yaml
2 | from pathlib import Path
3 | 
4 | # --- Paths ---
5 | BASE_DIR = Path(__file__).resolve().parents[1]
6 | PROMPT_FILE = BASE_DIR / "prompts" / "template_evaluating.yaml"
7 | 
8 | 
9 | def build_fix_fast_expression_prompt(alpha_expression: str, error_mes: str):
10 |     with open(PROMPT_FILE, "r", encoding="utf-8") as f:
11 |         prompt_yaml = yaml.safe_load(f)
12 |     template_str = prompt_yaml.get("fix_fast_expression", "")
13 |     if not template_str:
14 |         raise ValueError("fix_fast_expression not found in template_evaluating.yaml")
15 | 
16 |     prompt_filled = (
17 |         template_str
18 |         .replace("{{ fast_expression }}", alpha_expression)
19 |         .replace("{{ error_mes }}", error_mes)
20 |     )
21 |     return prompt_filled
--------------------------------------------------------------------------------
/main_scraper.py:
--------------------------------------------------------------------------------
1 | import random
2 | from pathlib import Path
3 | 
4 | from evaluator.backtest_with_wq import run_backtest_by_wq_api
5 | from evaluator.backtest_with_wq_mul import run_backtest_mul_by_wq_api
6 | from researcher.construct_prompts import build_wq_knowledge_prompt, build_check_if_blog_helpful, \
7 |     build_blog_to_hypothesis
8 | from researcher.generate_alpha import generate_alphas_from_template
9 | from researcher.generate_template import from_post_to_template
10 | from scraper.preprocess_texts import preprocess_all_html_posts
11 | from scraper.scrap_posts_from_wq import scrape_new_posts
12 | from utils.template_field_gener import generate_template_fields_v2
13 | from utils.template_op_gener import generate_template_ops
14 | from utils.wq_info_loader import OpAndFeature
15 | 
16 | if __name__ == "__main__":
17 |     # data scraper ------------------------------------
18 |     scrape_new_posts(limit=200)
19 |     preprocess_all_html_posts()
20 | 
--------------------------------------------------------------------------------
/config.yaml:
--------------------------------------------------------------------------------
1 | # ===============================
2 | # Global Configuration File
3 | # ===============================
4 | 
5 | # --- OpenAI API Settings ---
6 | openai_base_url: "todo" # e.g. https://api.deepseek.com
7 | openai_api_key: "todo" # e.g. sk-...
8 | openai_model_name: "todo" # e.g. deepseek-chat
9 | reasoner_model_name: "todo" # e.g. deepseek-reasoner
10 | 
11 | # --- WorldQuant Platform Credentials ---
12 | worldquant_account: "todo"
13 | worldquant_password: "todo"
14 | 
15 | worldquant_login_url: "https://platform.worldquantbrain.com/sign-in"
16 | worldquant_api_auth: "https://api.worldquantbrain.com/authentication"
17 | worldquant_consultant_posts_url: "https://support.worldquantbrain.com/hc/en-us/community/topics/18910956638743-顾问专属中文论坛"
18 | # You can also choose any other WorldQuant Forum URL you have access to.
19 | 
20 | # --- Dataset from WorldQuant Brain
21 | enabled_field_datasets: # Select the field database you want to use to build alphas.
22 | - pv1 # Database name reference ./data/wq_fields 23 | - fundamental6 24 | - analyst4 25 | - model16 26 | - news12 -------------------------------------------------------------------------------- /main_evaluator.py: -------------------------------------------------------------------------------- 1 | import random 2 | from pathlib import Path 3 | 4 | from evaluator.backtest_with_wq import run_backtest_by_wq_api 5 | from evaluator.backtest_with_wq_mul import run_backtest_mul_by_wq_api 6 | from researcher.construct_prompts import build_wq_knowledge_prompt, build_check_if_blog_helpful, \ 7 | build_blog_to_hypothesis 8 | from researcher.generate_alpha import generate_alphas_from_template 9 | from researcher.generate_template import from_post_to_template 10 | from scraper.preprocess_texts import preprocess_all_html_posts 11 | from scraper.scrap_posts_from_wq import scrape_new_posts 12 | from utils.template_field_gener import generate_template_fields_v2 13 | from utils.template_op_gener import generate_template_ops 14 | from utils.wq_info_loader import OpAndFeature 15 | 16 | if __name__ == "__main__": 17 | 18 | # alpha evaluator ---------------------------------- 19 | ALPHA_DIR = Path("data/alpha_db_v2/all_alphas") 20 | json_files = list(ALPHA_DIR.glob("*.json")) 21 | random.shuffle(json_files) 22 | for json_file in json_files: 23 | backtest_result = run_backtest_mul_by_wq_api(json_file) 24 | -------------------------------------------------------------------------------- /utils/json_dealer.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | 4 | def extract_json(text: str): 5 | """ 6 | 尝试从大模型返回的 text 中提取并解析 JSON。 7 | - 自动去掉 Markdown 代码块、注释、解释文字 8 | - 尝试匹配第一个 {...} 或 [...] 的完整 JSON 9 | 返回 Python 对象 10 | """ 11 | if not text: 12 | raise ValueError("❌ Empty text, cannot parse JSON") 13 | 14 | # 1. 去掉Markdown代码块 ```json ... ``` 15 | cleaned = re.sub(r"```(?:json)?", "", text, flags=re.IGNORECASE).strip() 16 | 17 | # 2. 在字符串中找到第一个 { 或 [ 18 | start = min( 19 | (cleaned.find("{") if "{" in cleaned else float("inf")), 20 | (cleaned.find("[") if "[" in cleaned else float("inf")), 21 | ) 22 | if start == float("inf"): 23 | raise ValueError(f"❌ No JSON start symbol found in: {text[:200]}") 24 | 25 | # 3. 截取可能的JSON部分 26 | candidate = cleaned[start:] 27 | 28 | # 4. 尝试从后往前找到匹配的 } 或 ] 29 | end_brace = candidate.rfind("}") 30 | end_bracket = candidate.rfind("]") 31 | end = max(end_brace, end_bracket) 32 | if end == -1: 33 | raise ValueError(f"❌ No JSON end symbol found in: {text[:200]}") 34 | 35 | candidate = candidate[:end + 1] 36 | 37 | # 5. 
尝试解析 38 | try: 39 | return json.loads(candidate) 40 | except json.JSONDecodeError as e: 41 | raise ValueError(f"❌ JSON decode error: {e}\nExtracted candidate:\n{candidate[:500]}") 42 | -------------------------------------------------------------------------------- /main_researcher.py: -------------------------------------------------------------------------------- 1 | import random 2 | from pathlib import Path 3 | 4 | from evaluator.backtest_with_wq import run_backtest_by_wq_api 5 | from evaluator.backtest_with_wq_mul import run_backtest_mul_by_wq_api 6 | from researcher.construct_prompts import build_wq_knowledge_prompt, build_check_if_blog_helpful, \ 7 | build_blog_to_hypothesis 8 | from researcher.generate_alpha import generate_alphas_from_template 9 | from researcher.generate_template import from_post_to_template 10 | from scraper.preprocess_texts import preprocess_all_html_posts 11 | from scraper.scrap_posts_from_wq import scrape_new_posts 12 | from utils.template_field_gener import generate_template_fields_v2 13 | from utils.template_op_gener import generate_template_ops 14 | from utils.wq_info_loader import OpAndFeature 15 | 16 | if __name__ == "__main__": 17 | 18 | # alpha researcher -------------------------------- 19 | opAndFeature = OpAndFeature() 20 | opAndFeature.get_operators() 21 | opAndFeature.get_data_fields() 22 | 23 | generate_template_ops() 24 | generate_template_fields_v2() 25 | 26 | POSTS_DIR = Path("data/wq_posts/helpful_posts") 27 | for json_file in POSTS_DIR.glob("*.json"): 28 | 29 | template_file = from_post_to_template(str(json_file)) 30 | if template_file is None: 31 | continue 32 | alphas_file = generate_alphas_from_template(template_file) 33 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import random 2 | from pathlib import Path 3 | 4 | from evaluator.backtest_with_wq import run_backtest_by_wq_api 5 | from evaluator.backtest_with_wq_mul import run_backtest_mul_by_wq_api 6 | from researcher.construct_prompts import build_wq_knowledge_prompt, build_check_if_blog_helpful, \ 7 | build_blog_to_hypothesis 8 | from researcher.generate_alpha import generate_alphas_from_template 9 | from researcher.generate_template import from_post_to_template 10 | from scraper.preprocess_texts import preprocess_all_html_posts 11 | from scraper.scrap_posts_from_wq import scrape_new_posts 12 | from utils.template_field_gener import generate_template_fields_v2 13 | from utils.template_op_gener import generate_template_ops 14 | from utils.wq_info_loader import OpAndFeature 15 | 16 | 17 | if __name__ == "__main__": 18 | # data scraper ------------------------------------ 19 | scrape_new_posts(limit=200) 20 | preprocess_all_html_posts() 21 | 22 | 23 | # alpha researcher -------------------------------- 24 | opAndFeature = OpAndFeature() 25 | opAndFeature.get_operators() 26 | opAndFeature.get_data_fields() 27 | 28 | generate_template_ops() 29 | generate_template_fields_v2() 30 | 31 | POSTS_DIR = Path("data/wq_posts/helpful_posts") 32 | for json_file in POSTS_DIR.glob("*.json"): 33 | 34 | template_file = from_post_to_template(str(json_file)) 35 | if template_file is None: 36 | continue 37 | alphas_file = generate_alphas_from_template(template_file) 38 | 39 | 40 | # alpha evaluator ---------------------------------- 41 | ALPHA_DIR = Path("data/alpha_db_v2/all_alphas") 42 | json_files = list(ALPHA_DIR.glob("*.json")) 43 | random.shuffle(json_files) 44 | for 
json_file in json_files: 45 | backtest_result = run_backtest_mul_by_wq_api(json_file) 46 | -------------------------------------------------------------------------------- /utils/config_loader.py: -------------------------------------------------------------------------------- 1 | import os 2 | import yaml 3 | from pathlib import Path 4 | from threading import Lock 5 | 6 | 7 | class ConfigLoader: 8 | """ 9 | A singleton configuration loader for the entire project. 10 | Priority: Environment Variables > config.yaml 11 | """ 12 | _instance = None 13 | _config = {} 14 | _lock = Lock() 15 | 16 | def __new__(cls, config_path: str = "config.yaml"): 17 | with cls._lock: 18 | if cls._instance is None: 19 | cls._instance = super().__new__(cls) 20 | cls._instance._load_config(config_path) 21 | return cls._instance 22 | 23 | def _load_config(self, config_path: str): 24 | config_file = Path(config_path) 25 | if not config_file.exists(): 26 | raise FileNotFoundError(f"Config file not found at {config_path}") 27 | 28 | with open(config_file, "r", encoding="utf-8") as f: 29 | yaml_config = yaml.safe_load(f) or {} 30 | 31 | # 环境变量优先(如不在环境变量中则用 config.yaml 值) 32 | self._config = { 33 | "openai_base_url": os.getenv("OPENAI_BASE_URL", yaml_config.get("openai_base_url")), 34 | "openai_api_key": os.getenv("OPENAI_API_KEY", yaml_config.get("openai_api_key")), 35 | "openai_model_name": os.getenv("OPENAI_MODEL_NAME", yaml_config.get("openai_model_name")), 36 | "reasoner_model_name": os.getenv("REASONER_MODEL_NAME", yaml_config.get("reasoner_model_name")), 37 | 38 | "worldquant_account": os.getenv("WORLDQUANT_ACCOUNT", yaml_config.get("worldquant_account")), 39 | "worldquant_password": os.getenv("WORLDQUANT_PASSWORD", yaml_config.get("worldquant_password")), 40 | "worldquant_login_url": os.getenv("WORLDQUAN_LOGIN_URL", yaml_config.get("worldquant_login_url")), 41 | "worldquant_api_auth": os.getenv("WORLDQUAN_API_AUTH", yaml_config.get("worldquant_api_auth")), 42 | "worldquant_consultant_posts_url": os.getenv("WORLDQUANT_CONSULTANT_POSTS_URL", 43 | yaml_config.get("worldquant_consultant_posts_url")), 44 | 45 | "enabled_field_datasets": yaml_config.get("enabled_field_datasets", []) 46 | } 47 | 48 | # 确保是列表格式 49 | if not isinstance(self._config["enabled_field_datasets"], list): 50 | self._config["enabled_field_datasets"] = [self._config["enabled_field_datasets"]] 51 | 52 | @classmethod 53 | def get(cls, key: str, default=None): 54 | """ 55 | 获取配置值。 56 | 使用方法:ConfigLoader.get("openai_api_key") 57 | """ 58 | if cls._instance is None: 59 | cls() # 初始化 60 | return cls._instance._config.get(key, default) 61 | 62 | @classmethod 63 | def all(cls) -> dict: 64 | """ 65 | 获取完整配置字典。 66 | """ 67 | if cls._instance is None: 68 | cls() 69 | return cls._instance._config.copy() 70 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: alphaspire 2 | channels: 3 | - defaults 4 | - conda-forge 5 | dependencies: 6 | - annotated-types=0.6.0 7 | - anyio=4.10.0 8 | - async-timeout=4.0.3 9 | - beautifulsoup4=4.13.5 10 | - blas=1.0 11 | - bottleneck=1.4.2 12 | - brotlicffi=1.0.9.2 13 | - bzip2=1.0.8 14 | - ca-certificates=2025.9.9 15 | - certifi=2025.8.3 16 | - cffi=1.17.1 17 | - charset-normalizer=3.3.2 18 | - distro=1.9.0 19 | - expat=2.7.1 20 | - greenlet=3.2.4 21 | - h11=0.16.0 22 | - httpcore=1.0.9 23 | - httpx=0.28.1 24 | - idna=3.7 25 | - jiter=0.6.1 26 | - joblib=1.5.2 27 | - jsonpatch=1.33 28 | - 
jsonpointer=3.0.0 29 | - langchain=0.3.25 30 | - langchain-core=0.3.58 31 | - langchain-openai=0.3.16 32 | - langchain-text-splitters=0.3.8 33 | - langsmith=0.3.39 34 | - libcxx=20.1.8 35 | - libffi=3.4.4 36 | - libgfortran5=11.3.0 37 | - libopenblas=0.3.30 38 | - libzlib=1.3.1 39 | - llvm-openmp=20.1.8 40 | - loguru=0.7.2 41 | - lz4-c=1.9.4 42 | - ncurses=6.5 43 | - numexpr=2.11.0 44 | - openai=1.77.0 45 | - openssl=3.5.3 46 | - orjson=3.10.14 47 | - pandas=2.3.2 48 | - pycparser=2.23 49 | - pydantic=2.11.9 50 | - pydantic-core=2.33.2 51 | - pysocks=1.7.1 52 | - python=3.10.18 53 | - pip 54 | - python-dateutil=2.9.0post0 55 | - python-tzdata=2025.2 56 | - pytz=2025.2 57 | - pyyaml=6.0.2 58 | - readline=8.3 59 | - regex=2024.11.6 60 | - requests=2.32.5 61 | - requests-toolbelt=1.0.0 62 | - scikit-learn=1.7.2 63 | - scipy=1.15.3 64 | - six=1.17.0 65 | - soupsieve=2.5 66 | - sqlalchemy=2.0.43 67 | - sqlite=3.50.2 68 | - tenacity=9.0.0 69 | - threadpoolctl=3.5.0 70 | - tiktoken=0.9.0 71 | - tk=8.6.15 72 | - tqdm=4.67.1 73 | - typing-inspection=0.4.0 74 | - typing_extensions=4.15.0 75 | - tzdata=2025b 76 | - urllib3=2.5.0 77 | - wheel=0.45.1 78 | - xz=5.6.4 79 | - yaml=0.2.5 80 | - zlib=1.3.1 81 | - zstandard=0.23.0 82 | - zstd=1.5.6 83 | - pip: 84 | - appnope==0.1.4 85 | - asttokens==3.0.0 86 | - attrs==25.3.0 87 | - backcall==0.2.0 88 | - bleach==6.2.0 89 | - cloudscraper==1.2.71 90 | - contourpy==1.3.2 91 | - cycler==0.12.1 92 | - decorator==5.2.1 93 | - defusedxml==0.7.1 94 | - docopt==0.6.2 95 | - exceptiongroup==1.3.0 96 | - executing==2.2.1 97 | - fastjsonschema==2.21.2 98 | - fonttools==4.60.1 99 | - hdbscan==0.8.40 100 | - ipython==8.12.3 101 | - jedi==0.19.2 102 | - jinja2==3.1.6 103 | - jsonschema==4.25.1 104 | - jsonschema-specifications==2025.9.1 105 | - jupyter-client==8.6.3 106 | - jupyter-core==5.8.1 107 | - jupyterlab-pygments==0.3.0 108 | - kiwisolver==1.4.9 109 | - markupsafe==3.0.3 110 | - matplotlib==3.10.7 111 | - matplotlib-inline==0.1.7 112 | - mistune==3.1.4 113 | - nbclient==0.10.2 114 | - nbconvert==7.16.6 115 | - nbformat==5.10.4 116 | - numpy==2.2.6 117 | - outcome==1.3.0.post0 118 | - packaging==25.0 119 | - pandocfilters==1.5.1 120 | - parso==0.8.5 121 | - pexpect==4.9.0 122 | - pickleshare==0.7.5 123 | - pillow==11.3.0 124 | - pip==25.2 125 | - pipreqs==0.5.0 126 | - platformdirs==4.5.0 127 | - playwright==1.55.0 128 | - prompt-toolkit==3.0.52 129 | - ptyprocess==0.7.0 130 | - pure-eval==0.2.3 131 | - pyee==13.0.0 132 | - pygments==2.19.2 133 | - pyparsing==3.2.5 134 | - python-dotenv==1.1.1 135 | - pyzmq==27.1.0 136 | - referencing==0.36.2 137 | - rpds-py==0.27.1 138 | - selenium==4.35.0 139 | - setuptools==80.9.0 140 | - sniffio==1.3.1 141 | - sortedcontainers==2.4.0 142 | - stack-data==0.6.3 143 | - tinycss2==1.4.0 144 | - tornado==6.5.2 145 | - traitlets==5.14.3 146 | - trio==0.30.0 147 | - trio-websocket==0.12.2 148 | - typing-extensions==4.14.1 149 | - undetected-chromedriver==3.5.5 150 | - wcwidth==0.2.14 151 | - webdriver-manager==4.0.2 152 | - webencodings==0.5.1 153 | - websocket-client==1.8.0 154 | - websockets==15.0.1 155 | - wsproto==1.2.0 156 | - yarg==0.1.9 157 | prefix: /Users/panzhuoran/miniconda3/envs/alphaspire 158 | -------------------------------------------------------------------------------- /researcher/generate_alpha.py: -------------------------------------------------------------------------------- 1 | # generate_alpha.py 2 | import json 3 | import re 4 | import csv 5 | from itertools import product 6 | from pathlib import Path 7 | 8 | 
BASE_DIR = Path(__file__).resolve().parents[1]
9 | OPERATORS_FILE = BASE_DIR / "data" / "wq_template_operators" / "template_operators.csv"
10 | FIELDS_FILE = BASE_DIR / "data" / "wq_template_fields" / "template_fields.json"
11 | ALPHA_DB = BASE_DIR / "data" / "alpha_db_v2" / "all_alphas"
12 | ALPHA_DB.mkdir(parents=True, exist_ok=True)
13 | 
14 | # === Maximum number of alphas generated in a single run ===
15 | MAX_ALPHAS = 100000
16 | 
17 | 
18 | def load_operator_type_map():
19 |     """Read template_operators.csv and return {type: [name, ...]}"""
20 |     operator_map = {}
21 |     with open(OPERATORS_FILE, "r", encoding="utf-8") as f:
22 |         reader = csv.DictReader(f)
23 |         for row in reader:
24 |             op_type = row["type"].strip()
25 |             name = row["name"].strip()
26 |             operator_map.setdefault(op_type, []).append(name)
27 |     return operator_map
28 | 
29 | 
30 | def load_field_type_map():
31 |     """Read template_fields.json and return {final_category: [id, ...]}"""
32 |     handled_field_map = {}
33 |     with open(FIELDS_FILE, "r", encoding="utf-8") as f:
34 |         field_map = json.load(f)
35 |         for raw_type, ids in field_map.items():
36 |             # Strip the surrounding angle brackets from the key (keys are assumed to be written as "<...>")
37 |             clean_type = re.sub(r"^<|>$", "", raw_type).strip()
38 |             handled_field_map[clean_type] = ids
39 |     return handled_field_map
40 | 
41 | 
42 | 
43 | def extract_placeholders(expression):
44 |     """Extract the "<...>" placeholders from a template expression"""
45 |     return re.findall(r"<([^<>]+)>", expression)
46 | 
47 | 
48 | def generate_alphas_from_template(template_path):
49 |     """Generate all concrete alphas from an alpha_template.json file"""
50 |     # === Load template ===
51 |     with open(template_path, "r", encoding="utf-8") as f:
52 |         template_json = json.load(f)
53 | 
54 |     template_expr = template_json["TemplateExpression"]
55 |     template_name = Path(template_path).stem
56 | 
57 |     # === Load mappings ===
58 |     operator_map = load_operator_type_map()
59 |     field_map = load_field_type_map()
60 | 
61 |     # === Extract placeholders ===
62 |     placeholders = extract_placeholders(template_expr)
63 |     if not placeholders:
64 |         print("❌ No placeholders found in the template, nothing to expand; using the template itself as the alpha")
65 |         all_alphas = []
66 |         all_alphas.append({"alpha": template_expr, "fields_or_ops_used": []})
67 |         out_file = ALPHA_DB / f"{template_name}_alphas.json"
68 |         with open(out_file, "w", encoding="utf-8") as f:
69 |             json.dump({
70 |                 "Template": template_expr,
71 |                 "GeneratedAlphas": all_alphas
72 |             }, f, indent=2, ensure_ascii=False)
73 |         print(f"✅ Generated 1 alpha saved to {out_file}")
74 |         return out_file
75 | 
76 |     # === Build the list of replacements for each placeholder ===
77 |     replacements_list = []
78 |     for ph in placeholders:
79 |         # Decide whether the placeholder is an operator type or a field type
80 |         if ph in operator_map:
81 |             replacements_list.append(operator_map[ph])
82 |         elif ph in field_map:
83 |             replacements_list.append(field_map[ph])
84 |         else:
85 |             print(f"❌ Placeholder type not found in the field or operator maps: {ph}")
86 |             return None
87 | 
88 |     # === Cartesian-product substitution ===
89 |     all_alphas = []
90 |     count = 0
91 |     total_combinations = 1
92 |     for lst in replacements_list:
93 |         total_combinations *= len(lst)
94 |     if total_combinations > MAX_ALPHAS:
95 |         print(f"⚠️ Warning: Total possible alphas {total_combinations} exceeds MAX_ALPHAS={MAX_ALPHAS}. "
96 |               f"Only generating the first {MAX_ALPHAS} combinations.")
97 | 
98 |     for combo in product(*replacements_list):
99 |         expr = template_expr
100 |         # Replace placeholders one by one (each placeholder is assumed to appear as "<name>" in the expression)
101 |         for ph, val in zip(placeholders, combo):
102 |             expr = re.sub(rf"<{re.escape(ph)}>", val, expr, count=1)
103 |         all_alphas.append({
104 |             "alpha": expr,
105 |             "fields_or_ops_used": combo
106 |         })
107 |         count += 1
108 |         if count >= MAX_ALPHAS:  # exit early once the limit is reached
109 |             break
110 | 
111 |     # === Save ===
112 |     out_file = ALPHA_DB / f"{template_name}_alphas.json"
113 |     with open(out_file, "w", encoding="utf-8") as f:
114 |         json.dump({
115 |             "Template": template_expr,
116 |             "GeneratedAlphas": all_alphas
117 |         }, f, indent=2, ensure_ascii=False)
118 | 
119 |     print(f"✅ Generated {len(all_alphas)} alphas saved to {out_file}")
120 |     return out_file
121 | 
122 | 
123 | if __name__ == "__main__":
124 |     test_template = BASE_DIR / "data" / "template_db" / "your_alpha_template.json"
125 |     generate_alphas_from_template(test_template)
126 | 
--------------------------------------------------------------------------------
/utils/template_op_gener.py:
--------------------------------------------------------------------------------
1 | import csv, json
2 | from pathlib import Path
3 | 
4 | BASE = Path(__file__).resolve().parents[1]
5 | OP_DIR = BASE / "data" / "wq_operators"
6 | IN_CSV = OP_DIR / "operators.csv"
7 | 
8 | TEMP_DIR = BASE / "data" / "wq_template_operators"
9 | TEMP_DIR.mkdir(parents=True, exist_ok=True)
10 | OUT_CSV = TEMP_DIR / "template_operators.csv"
11 | OUT_JSON = TEMP_DIR / "template_operators.json"
12 | 
13 | TYPE_MAP = {
14 |     "Arithmetic:NAry": {"add","multiply","max","min"},
15 |     "Arithmetic:Binary": {"subtract","divide","power","signed_power"},
16 |     "Arithmetic:Unary": {"sign","abs","log","sqrt","inverse","reverse","tanh","sigmoid"},
17 |     "Logical:Unary": {"is_nan","not"},
18 |     "Logical:Binary": {"and","or","less","equal","greater","not_equal","less_equal","greater_equal"},
19 |     "Conditional": {"if_else","trade_when"},
20 |     "TS:Aggregation": {"ts_mean","ts_sum","ts_std_dev","ts_product","ts_av_diff","ts_min_diff","ts_zscore","ts_skewness","ts_entropy","ts_count_nans"},
21 |     "TS:WindowIndex": {"ts_arg_max","ts_arg_min","kth_element","last_diff_value","days_from_last_change","ts_step"},
22 |     "TS:CorrelationRegression": {"ts_corr","ts_covariance","ts_regression"},
23 |     "TS:Transform": {"ts_backfill","ts_delay","ts_decay_linear","ts_target_tvr_decay","ts_scale","ts_quantile","ts_rank","ts_delta","ts_min_max_cps","ts_min_max_diff","hump"},
24 |     "CrossSection:Standardize": {"winsorize","rank","zscore","scale","normalize","quantile"},
25 |     "CrossSection:RegressionProj": {"regression_proj"},
26 |     "Group:Aggregation": {"bucket","densify","group_mean","group_rank","group_extra","group_backfill","group_scale","group_zscore","group_neutralize","group_cartesian_product"},
27 |     "Vector:LinearAlgebra": {"vector_proj","vector_neut","vec_sum","vec_avg"},
28 |     "Special:Domain": {"inst_pnl"},
29 | }
30 | 
31 | 
32 | def generate_template_ops():
33 |     # invert for lookups
34 |     op_to_type = {}
35 |     for t,s in TYPE_MAP.items():
36 |         for op in s:
37 |             op_to_type[op] = t
38 | 
39 |     rows = []
40 |     unknown = set()
41 |     if not IN_CSV.exists():
42 |         raise FileNotFoundError(f"{IN_CSV} not found.")
43 | 
44 |     with open(IN_CSV,newline="",encoding="utf-8") as f:
45 |         reader = csv.DictReader(f)
46 |         for r in reader:
47 |             name = (r.get("name") or "").strip()
48 |             t = op_to_type.get(name, "UNKNOWN")
49 |             # assign signature templates by type
50 |             signature = ""
51 |             if t == "Arithmetic:NAry":
52 | 
signature = "NAry(x1,x2,...[,options])" 53 | elif t == "Arithmetic:Binary": 54 | signature = "Binary(x,y[,options])" 55 | elif t == "Arithmetic:Unary": 56 | signature = "Unary(x[,options])" 57 | elif t == "Logical:Unary": 58 | signature = "LogicalUnary(x)" 59 | elif t == "Logical:Binary": 60 | signature = "LogicalBinary(a,b)" 61 | elif t == "Conditional": 62 | signature = "Conditional(cond, a, b) or Conditional(cond, value)" 63 | elif t == "TS:Aggregation": 64 | signature = "TS_Agg(x,d[,options])" 65 | elif t == "TS:WindowIndex": 66 | signature = "TS_Index(x,d[,k])" 67 | elif t == "TS:CorrelationRegression": 68 | signature = "TS_Bivariate(y,x,d[,options])" 69 | elif t == "TS:Transform": 70 | signature = "TS_Transform(x,d[,params])" 71 | elif t == "CrossSection:Standardize": 72 | signature = "CS_Std(x[,params])" 73 | elif t == "CrossSection:RegressionProj": 74 | signature = "regression_proj(y,x)" 75 | elif t == "Group:Aggregation": 76 | signature = "GroupOp(x, group[,params])" 77 | elif t == "Vector:LinearAlgebra": 78 | signature = "VectorOp(x[,y])" 79 | elif t == "Special:Domain": 80 | signature = "Special(...)" 81 | else: 82 | signature = "" 83 | 84 | if t == "UNKNOWN": 85 | unknown.add(name) 86 | r_out = dict(r) 87 | r_out["type"] = t 88 | r_out["signature_template"] = signature 89 | rows.append(r_out) 90 | 91 | # write CSV 92 | if rows: 93 | fieldnames = list(rows[0].keys()) 94 | with open(OUT_CSV,"w",newline="",encoding="utf-8") as f: 95 | writer = csv.DictWriter(f,fieldnames=fieldnames) 96 | writer.writeheader() 97 | writer.writerows(rows) 98 | with open(OUT_JSON,"w",encoding="utf-8") as f: 99 | json.dump(rows,f,ensure_ascii=False,indent=2) 100 | 101 | print("Wrote", OUT_CSV, OUT_JSON) 102 | if unknown: 103 | print("Unknown operators to classify manually:", sorted(unknown)) 104 | 105 | 106 | if __name__ == "__main__": 107 | generate_template_ops() -------------------------------------------------------------------------------- /scraper/preprocess_texts.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import unicodedata 4 | from pathlib import Path 5 | from bs4 import BeautifulSoup 6 | from langchain_openai import ChatOpenAI 7 | from loguru import logger 8 | 9 | from researcher.construct_prompts import build_check_if_blog_helpful 10 | from utils.config_loader import ConfigLoader 11 | 12 | BASE_DIR = Path(__file__).resolve().parents[1] 13 | RAW_DIR = BASE_DIR / "data" / "wq_posts" / "raw_posts" 14 | RAW_DIR.mkdir(parents=True, exist_ok=True) 15 | PROCESSED_DIR = BASE_DIR / "data" / "wq_posts" / "processed_posts" 16 | PROCESSED_DIR.mkdir(parents=True, exist_ok=True) 17 | HELPFUL_DIR = BASE_DIR / "data" / "wq_posts" / "helpful_posts" 18 | HELPFUL_DIR.mkdir(parents=True, exist_ok=True) 19 | 20 | 21 | def clean_text(text: str) -> str: 22 | """ 23 | 清洗文本中非utf-8字符,统一为 NFC 格式 24 | """ 25 | if not text: 26 | return "" 27 | # 先 encode/decode 丢弃非法字符 28 | cleaned = text.encode("utf-8", errors="ignore").decode("utf-8", errors="ignore") 29 | # 再做 Unicode 归一化,避免奇怪的变体 30 | cleaned = unicodedata.normalize("NFC", cleaned) 31 | return cleaned.strip() 32 | 33 | def extract_post_info(html_content: str) -> dict: 34 | """从单个HTML中抽取 description, title, post-body, post-comments""" 35 | soup = BeautifulSoup(html_content, "html.parser") 36 | 37 | # description 38 | description = "" 39 | meta_desc = soup.find("meta", attrs={"name": "description"}) 40 | if meta_desc and meta_desc.get("content"): 41 | description = 
meta_desc.get("content").strip() 42 | if not description: # 备用 43 | og_desc = soup.find("meta", property="og:description") 44 | if og_desc and og_desc.get("content"): 45 | description = og_desc.get("content").strip() 46 | 47 | # title 48 | title = "" 49 | og_title = soup.find("meta", property="og:title") 50 | if og_title and og_title.get("content"): 51 | title = og_title.get("content").strip() 52 | if not title and soup.title: 53 | title = soup.title.string.strip() 54 | 55 | # post-body 56 | post_body = "" 57 | body_div = soup.find("div", class_="post-body") 58 | if body_div: 59 | post_body = body_div.get_text("\n", strip=True) 60 | 61 | # comments(section.comment-body) 62 | comments = [] 63 | for section in soup.select("section.comment-body"): 64 | text = section.get_text("\n", strip=True) 65 | if text: 66 | comments.append(text) 67 | 68 | return { 69 | "title": title, 70 | "description": description, 71 | "post_body": post_body, 72 | "post_comments": comments, 73 | } 74 | 75 | def preprocess_all_html_posts() -> None: 76 | """批量处理RAW_DIR下所有未处理的html文件""" 77 | raw_files = list(RAW_DIR.glob("*.html")) 78 | logger.info(f"Found {len(raw_files)} raw html files.") 79 | processed_count = 0 80 | 81 | for raw_file in raw_files: 82 | post_id = raw_file.stem # 文件名不带后缀 83 | out_file = PROCESSED_DIR / f"{post_id}.json" 84 | if out_file.exists(): 85 | continue # 已处理 86 | 87 | logger.info(f"Processing {raw_file.name}...") 88 | html_content = raw_file.read_text(encoding="utf-8", errors="ignore") 89 | 90 | # 再做一层clean,避免HTML内的非法字符 91 | html_content = clean_text(html_content) 92 | 93 | post_info = extract_post_info(html_content) 94 | 95 | with open(out_file, "w", encoding="utf-8") as f: 96 | json.dump(post_info, f, ensure_ascii=False, indent=2) 97 | 98 | processed_count += 1 99 | logger.info(f"Saved processed JSON to {out_file}") 100 | 101 | if check_if_post_helpful(out_file): 102 | helpful_file = HELPFUL_DIR / f"{post_id}.json" 103 | with open(helpful_file, "w", encoding="utf-8") as f: 104 | json.dump(post_info, f, ensure_ascii=False, indent=2) 105 | 106 | logger.info(f"Total processed new files: {processed_count}") 107 | 108 | 109 | def check_if_post_helpful(post_file): 110 | base_url = ConfigLoader.get("openai_base_url") 111 | api_key = ConfigLoader.get("openai_api_key") 112 | model_name = ConfigLoader.get("openai_model_name") 113 | 114 | llm = ChatOpenAI( 115 | base_url=base_url, 116 | api_key=api_key, 117 | model=model_name, 118 | temperature=0.2, 119 | ) 120 | formatted = build_check_if_blog_helpful(post_file) 121 | 122 | try: 123 | resp = llm.invoke(formatted) 124 | if hasattr(resp, "content"): 125 | answer = resp.content.strip() 126 | else: 127 | answer = str(resp).strip() 128 | 129 | print(f"🔎 Model output for if {post_file} helpful: {answer}") 130 | 131 | if answer.upper().startswith("Y"): 132 | return True 133 | else: 134 | return False 135 | except Exception as e: 136 | print(f"⚠️ check_if_post_helpful error: {e}") 137 | return False 138 | 139 | 140 | if __name__ == "__main__": 141 | preprocess_all_html_posts() -------------------------------------------------------------------------------- /researcher/generate_template.py: -------------------------------------------------------------------------------- 1 | import random 2 | import json 3 | from pathlib import Path 4 | 5 | from researcher.construct_prompts import build_wq_knowledge_prompt, build_blog_to_hypothesis, \ 6 | build_hypothesis_to_template, build_check_if_blog_helpful 7 | from utils.config_loader import ConfigLoader 8 | 9 | from 
langchain_openai import ChatOpenAI 10 | from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder 11 | from langchain.memory import ConversationBufferMemory 12 | from langchain.chains import LLMChain 13 | 14 | from utils.json_dealer import extract_json 15 | 16 | # --- 路径 --- 17 | BASE_DIR = Path(__file__).resolve().parents[1] 18 | POSTS_DIR = BASE_DIR / "data" / "wq_posts" / "helpful_posts" 19 | HYPOTHESIS_DB = BASE_DIR / "data" / "hypothesis_db_v2" 20 | TEMPLATE_DB = BASE_DIR / "data" / "template_db_v2" 21 | PROMPT_FILE = BASE_DIR / "prompts" / "template_generating.yaml" 22 | 23 | HYPOTHESIS_DB.mkdir(parents=True, exist_ok=True) 24 | TEMPLATE_DB.mkdir(parents=True, exist_ok=True) 25 | 26 | 27 | # === LangChain Agent 初始化 === 28 | def init_agent(system_prompt): 29 | """初始化长时运行的Agent并注入System Prompt""" 30 | base_url = ConfigLoader.get("openai_base_url") 31 | api_key = ConfigLoader.get("openai_api_key") 32 | model_name = ConfigLoader.get("openai_model_name") 33 | 34 | llm = ChatOpenAI( 35 | base_url=base_url, 36 | api_key=api_key, 37 | model=model_name, 38 | temperature=0.2, 39 | ) 40 | 41 | # memory 保存上下文 42 | memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True) 43 | 44 | # PromptTemplate — system prompt + user placeholder 45 | prompt = ChatPromptTemplate.from_messages([ 46 | ("system", system_prompt), 47 | MessagesPlaceholder(variable_name="chat_history"), 48 | ("user", "{input}") # 后续只传 input 49 | ]) 50 | 51 | chain = LLMChain( 52 | llm=llm, 53 | prompt=prompt, 54 | memory=memory 55 | ) 56 | 57 | return chain 58 | 59 | 60 | # === 随机选择有用的 Blog Post === 61 | def select_valid_post(chain): 62 | post_files = list(POSTS_DIR.glob("*.json")) 63 | if not post_files: 64 | raise FileNotFoundError("❌ No blog post found in processed_posts folder") 65 | 66 | # while True: 67 | # post_file = random.choice(post_files) 68 | # formatted = build_check_if_blog_helpful(post_file) 69 | # output = chain.run(input=formatted).strip() 70 | # 71 | # if output == "Y": 72 | # print(f"✅ Selected blog post: {post_file}") 73 | # return post_file 74 | # else: 75 | # print(f"⚠️ Skipping blog post: {post_file} (not helpful)") 76 | return random.choice(post_files) 77 | 78 | 79 | def check_if_post_helpful(chain, post_file): 80 | formatted = build_check_if_blog_helpful(post_file) 81 | output = chain.run(input=formatted).strip() 82 | if output.upper().startswith("Y"): 83 | return True 84 | else: 85 | return False 86 | 87 | 88 | # === 生成 Hypotheses === 89 | def generate_hypotheses(chain, post_file): 90 | formatted = build_blog_to_hypothesis(post_file) 91 | output = chain.run(input=formatted).strip() 92 | 93 | try: 94 | hypotheses = extract_json(output) 95 | except Exception: 96 | raise ValueError(f"❌ Hypotheses output not valid JSON: {output}") 97 | 98 | out_file = HYPOTHESIS_DB / f"{Path(post_file).stem}_hypotheses.json" 99 | with open(out_file, "w", encoding="utf-8") as f: 100 | json.dump(hypotheses, f, indent=2, ensure_ascii=False) 101 | 102 | print(f"✅ Hypotheses saved: {out_file}") 103 | return out_file 104 | 105 | 106 | # === 生成 Template === 107 | def generate_template(chain, hypotheses_file): 108 | formatted = build_hypothesis_to_template(hypotheses_file) 109 | output = chain.run(input=formatted).strip() 110 | 111 | try: 112 | template_json = extract_json(output) 113 | except Exception: 114 | print(f"❌ Template output not valid JSON: {output}") 115 | return None 116 | 117 | out_file = TEMPLATE_DB / f"{Path(hypotheses_file).stem}_template.json" 118 | with open(out_file, "w", 
encoding="utf-8") as f: 119 | json.dump(template_json, f, indent=2, ensure_ascii=False) 120 | 121 | print(f"✅ Template saved: {out_file}") 122 | return out_file 123 | 124 | 125 | # === 主流程 === 126 | def from_post_to_template(post_file: str=None): 127 | system_prompt = build_wq_knowledge_prompt() 128 | 129 | chain = init_agent(system_prompt) 130 | 131 | # Step 1: 选择 blog 132 | if post_file: 133 | post_stem = Path(post_file).stem 134 | existing_template = TEMPLATE_DB / f"{post_stem}_hypotheses_template.json" 135 | if existing_template.exists(): 136 | print(f"✅ Template already exists for {post_file}, skipping template and alpha generation.") 137 | return None 138 | blog_file = post_file 139 | 140 | # if check_if_post_helpful(chain, post_file): 141 | # blog_file = post_file 142 | # else: 143 | # print(f"⚠️ Skipping blog post: {post_file} (not helpful)") 144 | # return None 145 | else: 146 | blog_file = select_valid_post(chain) 147 | 148 | # Step 2: 生成 Hypotheses 149 | hypotheses_file = generate_hypotheses(chain, blog_file) 150 | 151 | # Step 3: 生成 Template 152 | template_file = generate_template(chain, hypotheses_file) 153 | 154 | print(f"🎯 Finished: Template generated from {blog_file} successfully.") 155 | return template_file 156 | 157 | 158 | if __name__ == "__main__": 159 | from_post_to_template() 160 | -------------------------------------------------------------------------------- /scraper/scrap_posts_from_wq.py: -------------------------------------------------------------------------------- 1 | """ 2 | scrap_posts_from_wq.py — Playwright版 3 | 打开浏览器,手动登录后抓取 WorldQuant Consultant 帖子,支持多页。 4 | """ 5 | 6 | import time 7 | import datetime 8 | import csv 9 | from pathlib import Path 10 | from bs4 import BeautifulSoup 11 | from loguru import logger 12 | from playwright.sync_api import sync_playwright 13 | 14 | from utils.config_loader import ConfigLoader 15 | 16 | # --- 目录 --- 17 | BASE_DIR = Path(__file__).resolve().parents[1] 18 | RAW_DIR = BASE_DIR / "data" / "wq_posts" / "raw_posts" 19 | RAW_DIR.mkdir(parents=True, exist_ok=True) 20 | INDEX_FILE = RAW_DIR / "index.csv" 21 | COOKIES_FILE = RAW_DIR / "cookies.json" 22 | 23 | 24 | def _load_existing_ids(): 25 | """加载已抓取帖子ID""" 26 | if not INDEX_FILE.exists(): 27 | return set() 28 | with open(INDEX_FILE, "r", encoding="utf-8") as f: 29 | reader = csv.DictReader(f) 30 | return {row["id"] for row in reader} 31 | 32 | 33 | def _save_index_row(post_meta: dict): 34 | """追加写入index.csv""" 35 | file_exists = INDEX_FILE.exists() 36 | with open(INDEX_FILE, "a", newline="", encoding="utf-8") as f: 37 | fieldnames = ["id", "title", "url", "time"] 38 | writer = csv.DictWriter(f, fieldnames=fieldnames) 39 | if not file_exists: 40 | writer.writeheader() 41 | writer.writerow(post_meta) 42 | 43 | 44 | def _save_raw_html(post_id: str, html_content: str): 45 | """保存原始HTML文件""" 46 | now_str = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") 47 | file_path = RAW_DIR / f"{now_str}_{post_id}.html" 48 | with open(file_path, "w", encoding="utf-8") as f: 49 | f.write(html_content) 50 | logger.info(f"Saved raw HTML to {file_path}") 51 | 52 | 53 | def scrape_new_posts(limit: int = 20): 54 | """ 55 | Playwright抓取WorldQuant Consultant新帖子 56 | """ 57 | topic_url = ConfigLoader.get("worldquant_consultant_posts_url") 58 | existing_ids = _load_existing_ids() 59 | new_posts_meta = [] 60 | 61 | with sync_playwright() as p: 62 | browser = p.chromium.launch(headless=False) 63 | context = browser.new_context() 64 | page = context.new_page() 65 | 66 | logger.info(f"Navigating to 
topic page: {topic_url}") 67 | page.goto(topic_url) 68 | 69 | logger.info("请在浏览器中完成登录,如需。完成后在终端按回车继续...") 70 | input("已登录并看到帖子列表后按回车...") 71 | 72 | fetched_count = 0 73 | has_next = True 74 | while has_next and fetched_count < limit: 75 | # 等主文档加载 76 | try: 77 | page.wait_for_load_state("load", timeout=60000) 78 | except: 79 | logger.warning("load_state timeout, continue anyway") 80 | time.sleep(5) 81 | 82 | html = page.content() 83 | soup = BeautifulSoup(html, "html.parser") 84 | 85 | # 抽取帖子链接 86 | post_links = soup.select("a[href*='/community/posts/']") 87 | logger.info(f"Found {len(post_links)} post links on this page.") 88 | 89 | for link in post_links: 90 | if fetched_count >= limit: 91 | break 92 | post_url = link.get("href") 93 | if not post_url: 94 | continue 95 | full_url = ( 96 | post_url 97 | if post_url.startswith("http") 98 | else "https://support.worldquantbrain.com" + post_url 99 | ) 100 | 101 | import re 102 | m = re.search(r"/posts/(\d+)", full_url) 103 | if not m: 104 | continue 105 | post_id = m.group(1) 106 | 107 | if post_id in existing_ids: 108 | context.storage_state(path=str(COOKIES_FILE)) 109 | logger.info(f"Saved cookies to {COOKIES_FILE}") 110 | browser.close() 111 | logger.info(f"Total new posts scraped: {len(new_posts_meta)}") 112 | return new_posts_meta 113 | 114 | 115 | # 抓帖子详情页 116 | page.goto(full_url) 117 | try: 118 | page.wait_for_load_state("load", timeout=60000) 119 | except: 120 | logger.warning("Timeout loading post page, continue anyway") 121 | time.sleep(3) 122 | html_content = page.content() 123 | _save_raw_html(post_id, html_content) 124 | 125 | title = link.get_text(strip=True) 126 | post_meta = { 127 | "id": post_id, 128 | "title": title, 129 | "url": full_url, 130 | "time": "", 131 | } 132 | _save_index_row(post_meta) 133 | new_posts_meta.append(post_meta) 134 | fetched_count += 1 135 | logger.info(f"New post scraped: {post_meta}") 136 | 137 | # 回到列表页 138 | page.goto(topic_url) 139 | try: 140 | page.wait_for_load_state("load", timeout=60000) 141 | except: 142 | logger.warning("Timeout loading topic page, continue anyway") 143 | time.sleep(5) 144 | 145 | # 翻页:找“Next”按钮 146 | next_button = page.locator("a:has-text('Next')") 147 | if next_button.count() > 0 and fetched_count < limit: 148 | logger.info("Clicking Next button...") 149 | next_button.first.click() 150 | try: 151 | page.wait_for_load_state("load", timeout=60000) 152 | except: 153 | logger.warning("Timeout after clicking Next, continue anyway") 154 | time.sleep(5) 155 | else: 156 | has_next = False 157 | 158 | # 保存cookies 159 | context.storage_state(path=str(COOKIES_FILE)) 160 | logger.info(f"Saved cookies to {COOKIES_FILE}") 161 | 162 | browser.close() 163 | 164 | logger.info(f"Total new posts scraped: {len(new_posts_meta)}") 165 | return new_posts_meta 166 | 167 | 168 | if __name__ == "__main__": 169 | new_posts = scrape_new_posts() 170 | print(f"Scraped {len(new_posts)} new posts.") 171 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # AlphaSpire 2 | 3 | AI agent-driven fully automated Alpha mining. 4 | 5 | ----- 6 | 7 | ## ⚠️ Attention 8 | 9 | Please note that this tool only performs backtest simulations on the platform (Simulate) and does not execute any submission actions (Submit). **Excessive simulations without valid submissions may result in your account being locked.** Please make sure to limit the number of backtests when using this tool. 
10 | 11 | ## Work Pipeline 12 | 13 | ```scss 14 | ┌────────────────────────────┐ 15 | │ Data Acquisition Module │ 16 | │ (Scraper & Local Storage) │ 17 | └─────────────┬──────────────┘ 18 | │ 19 | ┌─────────────▼──────────────┐ 20 | │ Preprocessing & Parsing │ 21 | │ (Text cleaning, Metadata) │ 22 | └─────────────┬──────────────┘ 23 | │ 24 | ┌─────────────▼──────────────┐ 25 | │ Researcher (LLM) │ 26 | │ (LangChain / Prompting) │ 27 | └─────────────┬──────────────┘ 28 | │ 29 | ┌─────────────▼──────────────┐ 30 | │ Alpha Template Generating │ 31 | │ (Factor/Signal Extraction) │ 32 | └─────────────┬──────────────┘ 33 | │ 34 | ┌─────────────▼──────────────┐ 35 | │ Evaluator │ 36 | │ (Submit to WQ or local sim)│ 37 | └─────────────┬──────────────┘ 38 | │ 39 | ┌─────────────▼──────────────┐ 40 | │ Results Collecting │ 41 | │ (Store Scores, Rank Alphas)│ 42 | └─────────────┬──────────────┘ 43 | │ 44 | ┌─────────────▼──────────────┐ 45 | │Results analyzing &Iteration│ 46 | │ │ 47 | └────────────────────────────┘ 48 | ``` 49 | 50 | ## Architecture 51 | ### data 52 | The data folder stores posts, operators, fields, operator types, field types, generated alpha templates, generated alphas, and backtesting results information. 53 | 54 | ### prompts 55 | Stores prompt templates used in workflows. 56 | 57 | ### utils 58 | 1. Get the operator set of Fast Expression and save it to ./data/wq_operators 59 | 2. Get the field collection of Fast Expression and save it to ./data/wq_fields 60 | 3. Manual rules classify operators and generate operator types required by templates. (save to ./data/wq_template_operators) 61 | 4. Clustering and large language models are used to assist in generating the field types used in templates. (save to ./data/wq_template_fields) 62 | 5. Various text processing and other miscellaneous. 63 | 64 | ### scraper 65 | 1. Scrape posts from the WorldQuant forum and store them in ./data/wq_posts/raw_posts. 66 | 2. Extract the main information from the original html text of the post into a json file and store it in ./data/wq_posts/processed_posts 67 | 3. Use a large language model to determine whether the post has the potential to generate alpha, and if so, save it to ./data/wq_posts/helpful_posts 68 | 69 | ### researcher 70 | 1. Use LLM to summarize hypothesis from blog and save it to ./data/hypothesis_db_v2 71 | 2. Use LLM to generate template from hypothesis and save it to ./data/template_db_v2 72 | 3. Multiple loops generate alphas based on template, field type, and operator type and save them to ./data/alpha_db_v2/all_alphas 73 | 74 | ### evaluator 75 | Use the WorldQuant backtesting API to evaluate alpha performance and save the results to data/alpha_db_v2/backtest_result 76 | 77 | 78 | ## Deployment 79 | 80 | 1. Create a conda environment 81 | ```bash 82 | conda env create -f environment.yml 83 | conda activate alphaspire 84 | ``` 85 | 2. Fill in the configuration file (config.yaml) 86 | ```yaml 87 | # =============================== 88 | # Global Configuration File 89 | # =============================== 90 | 91 | # --- OpenAI API Settings --- 92 | openai_base_url: "todo" # e.g. https://api.deepseek.com 93 | openai_api_key: "todo" # e.g. sk-... 94 | openai_model_name: "todo" # e.g. deepseek-chat 95 | reasoner_model_name: "todo" # e.g. 
deepseek-reasoner 96 | 97 | # --- WorldQuant Platform Credentials --- 98 | worldquant_account: "todo" 99 | worldquant_password: "todo" 100 | 101 | worldquant_login_url: "https://platform.worldquantbrain.com/sign-in" 102 | worldquant_api_auth: "https://api.worldquantbrain.com/authentication" 103 | worldquant_consultant_posts_url: "https://support.worldquantbrain.com/hc/en-us/community/topics/18910956638743-顾问专属中文论坛" 104 | # You can also choose any other WorldQuant Forum URL you have access to. 105 | 106 | # --- Dataset from WorldQuant Brain 107 | enabled_field_datasets: # Select the field database you want to use to build alphas. 108 | - pv1 # Database name reference ./data/wq_fields 109 | - fundamental6 110 | - analyst4 111 | - model16 112 | - news12 113 | ``` 114 | 115 | 3. One-click operation 116 | * Full process operation 117 | ```bash 118 | python3 main.py 119 | ``` 120 | * Only run post crawl 121 | ```bash 122 | python3 main_scraper.py 123 | ``` 124 | * Only run Alpha generation 125 | ```bash 126 | python3 main_researcher.py 127 | ``` 128 | * Only run Alpha backtests 129 | ```bash 130 | ./test_script.sh 131 | ``` 132 | OR 133 | ```bash 134 | python3 main_evaluator.py 135 | ``` 136 | 137 | ## Notice 138 | 139 | Regarding WorldQuant backtest parameter settings, currently only manual modification of the parameters in the following code section in evaluator/backtest_with_wq_mul.py is supported. 140 | These will be moved to a separate config file in future versions. 141 | 142 | ``` 143 | payload = { 144 | "type": "REGULAR", 145 | "settings": { 146 | "instrumentType": "EQUITY", 147 | "region": "ASI", 148 | "universe": "MINVOL1M", 149 | "delay": 1, 150 | "decay": 6, 151 | "neutralization": "SUBINDUSTRY", 152 | "truncation": 0.01, 153 | "pasteurization": "ON", 154 | "unitHandling": "VERIFY", 155 | "nanHandling": "ON", 156 | "maxTrade": "ON", 157 | "language": "FASTEXPR", 158 | "visualization": False, 159 | }, 160 | "regular": fixed_expr 161 | } 162 | ``` 163 | -------------------------------------------------------------------------------- /utils/template_field_gener.py: -------------------------------------------------------------------------------- 1 | import json 2 | import csv 3 | import logging 4 | from pathlib import Path 5 | from typing import List, Dict 6 | import pandas as pd 7 | from sklearn.feature_extraction.text import TfidfVectorizer 8 | from openai import OpenAI 9 | import hdbscan 10 | import warnings 11 | warnings.filterwarnings( 12 | "ignore", 13 | message="'force_all_finite' was renamed to 'ensure_all_finite'", 14 | category=FutureWarning, 15 | ) 16 | 17 | from utils.config_loader import ConfigLoader 18 | 19 | BASE_DIR = Path(__file__).resolve().parents[1] 20 | FIELDS_DIR = BASE_DIR / "data" / "wq_fields" 21 | OUTPUT_DIR = BASE_DIR / "data" / "wq_template_fields" 22 | OUTPUT_DIR.mkdir(parents=True, exist_ok=True) 23 | OUT_JSON = OUTPUT_DIR / "template_fields.json" 24 | 25 | logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s") 26 | 27 | 28 | # ========================= 29 | # Step 1. 
加载所有 csv 文件 30 | # ========================= 31 | def load_all_fields() -> pd.DataFrame: 32 | """读取 data/wq_fields 下所有有效 CSV 文件并合并""" 33 | dfs = [] 34 | for file in FIELDS_DIR.glob("*.csv"): 35 | if file.stat().st_size == 0: 36 | logging.warning(f"⚠️ Skipping empty file (0 bytes): {file.name}") 37 | continue 38 | try: 39 | df = pd.read_csv(file, dtype=str, keep_default_na=False) 40 | if df.empty or len(df.columns) == 0: 41 | logging.warning(f"⚠️ Skipping file with no valid columns: {file.name}") 42 | continue 43 | df["__dataset__"] = file.stem 44 | dfs.append(df) 45 | 46 | except pd.errors.EmptyDataError: 47 | logging.warning(f"⚠️ Skipping malformed file (EmptyDataError): {file.name}") 48 | continue 49 | except pd.errors.ParserError as e: 50 | logging.warning(f"⚠️ Skipping broken CSV (ParserError): {file.name} ({e})") 51 | continue 52 | except Exception as e: 53 | logging.error(f"❌ Unexpected error reading {file.name}: {e}") 54 | continue 55 | if not dfs: 56 | raise RuntimeError(f"❌ No valid CSV files found in {FIELDS_DIR}") 57 | 58 | return pd.concat(dfs, ignore_index=True) 59 | 60 | 61 | # ========================= 62 | # Step 2. 聚类逻辑 63 | # ========================= 64 | def cluster_fields_by_semantics_auto(df: pd.DataFrame, 65 | min_cluster_size: int = 3, 66 | min_samples: int = 2) -> Dict[int, List[str]]: 67 | """ 68 | 使用 HDBSCAN 自动确定聚类数量的语义聚类方法。 69 | 基于 id + description 文本表示。 70 | """ 71 | if len(df) <= min_cluster_size: 72 | # 数据太少,不聚类 73 | return {0: df["id"].tolist()} 74 | 75 | texts = (df["id"].astype(str) + " " + df["description"].astype(str)).tolist() 76 | 77 | # Step 1. TF-IDF 向量化 78 | tfidf = TfidfVectorizer(max_features=2000) 79 | X = tfidf.fit_transform(texts) 80 | 81 | # Step 2. HDBSCAN 聚类 82 | clusterer = hdbscan.HDBSCAN( 83 | min_cluster_size=min_cluster_size, 84 | min_samples=min_samples, 85 | metric='euclidean', 86 | cluster_selection_method='eom' 87 | ) 88 | labels = clusterer.fit_predict(X.toarray()) 89 | 90 | # Step 3. 聚类结果收集 91 | clusters: Dict[int, List[str]] = {} 92 | for idx, label in enumerate(labels): 93 | if label == -1: 94 | # -1 表示噪声,可选择丢弃或单独归类 95 | label = 9999 # 归入“噪声”类 96 | clusters.setdefault(label, []).append(df.iloc[idx]["id"]) 97 | 98 | # 可选:按簇大小排序 99 | clusters = dict(sorted(clusters.items(), key=lambda x: -len(x[1]))) 100 | return clusters 101 | 102 | 103 | # ========================= 104 | # Step 3. 调用 LLM 命名类别 105 | # ========================= 106 | def get_llm_client(): 107 | return OpenAI( 108 | base_url=ConfigLoader.get("openai_base_url"), 109 | api_key=ConfigLoader.get("openai_api_key"), 110 | ) 111 | 112 | 113 | def name_cluster_with_llm(client, type_name: str, dataset: str, sample_texts: List[str]) -> str: 114 | """调用 LLM 生成聚类名称""" 115 | joined = "\n".join(sample_texts) # 取字段描述 116 | prompt = f""" 117 | You are classifying quantitative finance data fields. 118 | Given the dataset = {dataset} and field type = {type_name}. 119 | Below are some field examples: 120 | {joined} 121 | 122 | Please propose a short, meaningful lowercase name (1-3 words) for this group, 123 | like "momentum", "valuation_ratio", "sentiment_score", etc. 124 | Return only the name. 
125 | """ 126 | resp = client.chat.completions.create( 127 | model=ConfigLoader.get("openai_model_name"), 128 | messages=[{"role": "system", "content": "You are a finance data classifier."}, 129 | {"role": "user", "content": prompt}], 130 | temperature=0.3, 131 | ) 132 | name = resp.choices[0].message.content.strip() 133 | # 清理非法字符 134 | name = name.replace(" ", "_").replace("-", "_").lower() 135 | return name 136 | 137 | 138 | # ========================= 139 | # Step 4. 主生成逻辑 140 | # ========================= 141 | def generate_template_fields_v2(): 142 | logging.info("📥 Loading all field csvs...") 143 | df = load_all_fields() 144 | 145 | # 自动检测必要列 146 | expected_cols = {"id", "description", "type"} 147 | if not expected_cols.issubset(df.columns): 148 | raise ValueError(f"Missing required columns in input: {expected_cols - set(df.columns)}") 149 | 150 | client = get_llm_client() 151 | all_mappings = {} 152 | 153 | # 按 dataset + type 分组 154 | grouped = df.groupby(["__dataset__", "type"]) 155 | for (dataset, dtype), subdf in grouped: 156 | if len(subdf) < 3: 157 | print(f"Skipping small group: {dataset}:{dtype} ({len(subdf)})") 158 | continue 159 | 160 | print(f"🧩 Processing dataset={dataset}, type={dtype}, size={len(subdf)}") 161 | clusters = cluster_fields_by_semantics_auto(subdf) 162 | 163 | for cluster_id, field_ids in clusters.items(): 164 | sample_df = subdf[subdf["id"].isin(field_ids)] 165 | sample_texts = (sample_df["id"] + " " + sample_df["description"]).tolist() 166 | type_name = name_cluster_with_llm(client, dtype, dataset, sample_texts) 167 | 168 | key = f"" 169 | all_mappings[key] = field_ids 170 | print(f"✅ Generated type: {key} ({len(field_ids)} fields)") 171 | 172 | # 保存结果 173 | with open(OUT_JSON, "w", encoding="utf-8") as f: 174 | json.dump(all_mappings, f, ensure_ascii=False, indent=2) 175 | print(f"🎯 Saved {len(all_mappings)} template field types to {OUT_JSON}") 176 | 177 | 178 | if __name__ == "__main__": 179 | generate_template_fields_v2() 180 | -------------------------------------------------------------------------------- /evaluator/backtest_with_wq.py: -------------------------------------------------------------------------------- 1 | import json 2 | import csv 3 | import logging 4 | from pathlib import Path 5 | from time import sleep 6 | import requests 7 | from requests.auth import HTTPBasicAuth 8 | 9 | from utils.config_loader import ConfigLoader 10 | 11 | BASE_DIR = Path(__file__).resolve().parents[1] 12 | BACKTEST_DIR = BASE_DIR / "data" / "alpha_db_v2" / "backtest_result" 13 | BACKTEST_DIR.mkdir(parents=True, exist_ok=True) 14 | 15 | logging.basicConfig(filename='backtest_with_wq.log', level=logging.INFO, 16 | format='%(asctime)s - %(levelname)s - %(message)s') 17 | 18 | # ====== 登录并保持 Session ====== 19 | def sign_in(): 20 | """登录 WQ Brain 并返回 session""" 21 | username = ConfigLoader.get('worldquant_account') 22 | password = ConfigLoader.get('worldquant_password') 23 | 24 | sess = requests.Session() 25 | sess.auth = HTTPBasicAuth(username, password) 26 | response = sess.post(ConfigLoader.get('worldquant_api_auth')) 27 | print(f"Login status: {response.status_code}") 28 | return sess 29 | 30 | 31 | def run_backtest_by_wq_api(alphas_json_file): 32 | """回测指定 alphas json 文件""" 33 | sess = sign_in() 34 | 35 | # === 1. 
读 alpha JSON === 36 | with open(alphas_json_file, "r", encoding="utf-8") as f: 37 | data = json.load(f) 38 | 39 | print(f"🔬 Start backtest for {alphas_json_file}") 40 | # 支持两种结构 41 | alphas = [] 42 | if "GeneratedAlphas" in data: 43 | for item in data["GeneratedAlphas"]: 44 | alphas.append(item["alpha"]) 45 | elif isinstance(data, list): 46 | for item in data: 47 | alphas.append(item["alpha"]) 48 | else: 49 | print("❌ 不识别的 alpha JSON 格式") 50 | return None 51 | 52 | template_name = Path(alphas_json_file).stem 53 | out_csv = BACKTEST_DIR / f"{template_name}_backtest.csv" 54 | 55 | # === 2. 读取已存在CSV,跳过已回测 === 56 | finished_alphas = set() 57 | if out_csv.exists(): 58 | with open(out_csv, "r", encoding="utf-8") as f: 59 | reader = csv.DictReader(f) 60 | for row in reader: 61 | finished_alphas.add(row["alpha"]) 62 | print(f"⚠️ 已有 {len(finished_alphas)} 条回测结果,将跳过这些 alpha") 63 | 64 | # === 3. CSV准备写入 === 65 | fieldnames = ["alpha", "sharpe", "turnover", "fitness", "returns", "drawdown", "margin"] 66 | csv_file = open(out_csv, "a", newline="", encoding="utf-8") 67 | writer = csv.DictWriter(csv_file, fieldnames=fieldnames) 68 | if csv_file.tell() == 0: # 空文件时写表头 69 | writer.writeheader() 70 | 71 | # === 4. 循环回测 === 72 | alpha_fail_attempt_tolerance = 15 73 | for index, alpha_expr in enumerate(alphas, start=1): 74 | if alpha_expr in finished_alphas: 75 | print(f"✅ 跳过已回测 alpha: {alpha_expr[:40]}...") 76 | continue 77 | 78 | # 组装模拟参数 79 | alpha_payload = { 80 | "type": "REGULAR", 81 | "settings": { 82 | "instrumentType": "EQUITY", 83 | "region": "USA", 84 | "universe": "TOP3000", 85 | "delay": 1, 86 | "decay": 0, 87 | "neutralization": "SUBINDUSTRY", 88 | "truncation": 0.01, 89 | "pasteurization": "ON", 90 | "unitHandling": "VERIFY", 91 | "nanHandling": "OFF", 92 | "language": "FASTEXPR", 93 | "visualization": False, 94 | }, 95 | "regular": alpha_expr 96 | } 97 | 98 | print(f"[{index}/{len(alphas)}] 回测 alpha: {alpha_expr[:60]}...") 99 | keep_trying = True 100 | failure_count = 0 101 | 102 | # === 4.1 提交 Simulation === 103 | while keep_trying: 104 | try: 105 | sim_resp = sess.post( 106 | 'https://api.worldquantbrain.com/simulations', 107 | json=alpha_payload 108 | ) 109 | if sim_resp.status_code not in (200, 201): 110 | raise RuntimeError(f"Simulation submit failed {sim_resp.status_code}: {sim_resp.text}") 111 | 112 | sim_progress_url = sim_resp.headers.get('Location') 113 | if not sim_progress_url: 114 | raise RuntimeError("❌ No Location header in response") 115 | 116 | print(f"🔎 Alpha simulation location: {sim_progress_url}") 117 | keep_trying = False 118 | except Exception as e: 119 | failure_count += 1 120 | print(f"⚠️ No Location, sleep 15 and retry: {e}") 121 | logging.error(f"No Location, sleep 15 and retry: {e}") 122 | sleep(15) 123 | if failure_count >= alpha_fail_attempt_tolerance: 124 | sess = sign_in() # 重新登录 125 | failure_count = 0 126 | logging.error(f"❌ Too many failures,跳过当前 alpha {alpha_expr}") 127 | break 128 | 129 | # === 4.2 轮询 Simulation 结果 === 130 | if not sim_progress_url: 131 | continue 132 | # 等待完成 133 | finished = False 134 | for _ in range(240): # 最多轮询 240 次 * 15s = 60 分钟 135 | status_resp = sess.get(sim_progress_url) 136 | status_json = status_resp.json() 137 | status = status_json.get("status") 138 | if status == "COMPLETE": 139 | alpha_id = status_json.get("alpha") 140 | finished = True 141 | break 142 | elif status == "ERROR": 143 | print(f"❌ Simulation failed for {alpha_expr}") 144 | finished = False 145 | break 146 | else: 147 | print(f"⏳ Status: {status}, sleep 10s") 
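                # NOTE: the fixed 10-second sleep below is a simple heuristic. If the
                # progress response exposes a suggested wait (for example a "Retry-After"
                # header, which is an assumption about the API), it could be used instead:
                #     sleep(float(status_resp.headers.get("Retry-After", 10)))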
148 |                 sleep(10)
149 |         if not finished:
150 |             continue
151 | 
152 |         # === 4.3 Fetch the alpha metrics ===
153 |         # alpha_resp = sess.get(f'https://api.worldquantbrain.com/alphas/{alpha_id}')
154 |         for attempt in range(20):
155 |             alpha_resp = sess.get(f'https://api.worldquantbrain.com/alphas/{alpha_id}')
156 |             if alpha_resp.status_code == 200:
157 |                 alpha_data = alpha_resp.json()
158 |                 break
159 |             else:
160 |                 print(f"⏳ Alpha {alpha_id} not ready yet, status={alpha_resp.status_code}, retry {attempt + 1}")
161 |                 sleep(5)
162 |         else:
163 |             print(f"❌ Failed to fetch alpha result after retries for alphaId={alpha_id}")
164 |             continue  # or raise instead
165 | 
166 |         is_data = alpha_data.get("is", {})
167 |         result_row = {
168 |             "alpha": alpha_expr,
169 |             "sharpe": is_data.get("sharpe"),
170 |             "turnover": is_data.get("turnover"),
171 |             "fitness": is_data.get("fitness"),
172 |             "returns": is_data.get("returns"),
173 |             "drawdown": is_data.get("drawdown"),
174 |             "margin": is_data.get("margin"),
175 |         }
176 |         writer.writerow(result_row)
177 |         csv_file.flush()
178 |         print(f"✅ Wrote backtest result: sharpe={result_row['sharpe']}, fitness={result_row['fitness']}")
179 | 
180 | 
181 |     csv_file.close()
182 |     print(f"🎯 All backtests finished; results saved to {out_csv}")
183 |     return str(out_csv)
184 | 
185 | 
186 | if __name__ == "__main__":
187 |     test_file = BASE_DIR / "data" / "alpha_db" / "all_alphas" / "your_template_alphas.json"
188 |     run_backtest_by_wq_api(test_file)
--------------------------------------------------------------------------------
/utils/wq_info_loader.py:
--------------------------------------------------------------------------------
1 | import json
2 | import logging
3 | from pathlib import Path
4 | from typing import List, Dict
5 | 
6 | import requests
7 | import pandas as pd
8 | from requests.auth import HTTPBasicAuth
9 | from utils.config_loader import ConfigLoader
10 | 
11 | # --- Directories ---
12 | BASE_DIR = Path(__file__).resolve().parents[1]
13 | WQ_FIELD_DIR = BASE_DIR / "data" / "wq_fields"
14 | WQ_FIELD_DIR.mkdir(parents=True, exist_ok=True)
15 | WQ_OPERATOR_DIR = BASE_DIR / "data" / "wq_operators"
16 | WQ_OPERATOR_DIR.mkdir(parents=True, exist_ok=True)
17 | 
18 | FIELDS_CSV = WQ_FIELD_DIR  # directory that holds one fields CSV per dataset
19 | OPERATORS_CSV = WQ_OPERATOR_DIR / "operators.csv"
20 | 
21 | 
22 | class OpAndFeature:
23 |     def __init__(self):
24 |         self.sess = requests.Session()
25 |         username = ConfigLoader.get("worldquant_account")
26 |         password = ConfigLoader.get("worldquant_password")
27 |         self.setup_auth(username, password)
28 | 
29 |     def setup_auth(self, username, password) -> None:
30 |         """Set up authentication with WorldQuant Brain."""
31 |         self.sess.auth = HTTPBasicAuth(username, password)
32 | 
33 |         print("Authenticating with WorldQuant Brain...")
34 |         response = self.sess.post('https://api.worldquantbrain.com/authentication')
35 |         print(f"Authentication response status: {response.status_code}")
36 |         logging.debug(f"Authentication response: {response.text[:500]}...")
37 | 
38 |         if response.status_code != 201:
39 |             raise Exception(f"Authentication failed: {response.text}")
40 | 
41 |     def get_data_fields(self):
42 |         """Fetch available data fields from WorldQuant Brain across multiple datasets."""
43 | 
44 |         # datasets = ['pv1', 'fundamental6', 'analyst4', 'model16', 'news12']
45 | 
46 |         datasets = ['analyst4',
47 |                     'analyst10',
48 |                     'analyst11',
49 |                     'analyst14',
50 |                     'analyst15',
51 |                     'analyst16',
52 |                     'analyst35',
53 |                     'analyst40',
54 |                     'analyst69',
55 |                     'earnings3',
56 |                     'earnings5',
57 |                     'fundamental17',
58 |                     'fundamental22',
59 |                     'fundamental23',
60 |
'fundamental28', 61 | 'fundamental31', 62 | 'fundamental44', 63 | 'fundamental6', 64 | 'fundamental7', 65 | 'fundamental72', 66 | 'model109', 67 | 'model110', 68 | 'model138', 69 | 'model16', 70 | 'model176', 71 | 'model219', 72 | 'model238', 73 | 'model244', 74 | 'model26', 75 | 'model262', 76 | 'model264', 77 | 'model29', 78 | 'model30', 79 | 'model307', 80 | 'model32', 81 | 'model38', 82 | 'model53', 83 | 'model77', 84 | 'news12', 85 | 'news20', 86 | 'news23', 87 | 'news52', 88 | 'news66', 89 | 'other128', 90 | 'other432', 91 | 'other450', 92 | 'other455', 93 | 'other460', 94 | 'other496', 95 | 'other551', 96 | 'other553', 97 | 'other699', 98 | 'other83', 99 | 'pv1', 100 | 'pv13', 101 | 'pv173', 102 | 'pv29', 103 | 'pv37', 104 | 'pv53', 105 | 'pv72', 106 | 'pv73', 107 | 'pv96', 108 | 'risk60', 109 | 'risk66', 110 | 'risk70', 111 | 'sentiment21', 112 | 'sentiment22', 113 | 'sentiment26', 114 | 'shortinterest6', 115 | 'univ1'] 116 | 117 | base_params = { 118 | 'delay': 1, 119 | 'instrumentType': 'EQUITY', 120 | 'limit': 50, 121 | 'region': 'USA', 122 | 'universe': 'TOP3000' 123 | } 124 | 125 | try: 126 | print("Requesting data fields from multiple datasets...") 127 | for dataset in datasets: 128 | print("------------" + dataset + "--------------\n") 129 | des = str(FIELDS_CSV) + "/" + dataset + ".csv" 130 | if Path(des).exists(): 131 | print(f"Fields CSV already exists at {des}, skipping download.") 132 | continue 133 | 134 | all_fields = [] 135 | params = base_params.copy() 136 | params['dataset.id'] = dataset 137 | 138 | print(f"Getting field count for dataset: {dataset}") 139 | count_response = self.sess.get('https://api.worldquantbrain.com/data-fields', params=params) 140 | 141 | if count_response.status_code == 200: 142 | count_data = count_response.json() 143 | total_fields = count_data.get('count', 0) 144 | print(f"Total fields in {dataset}: {total_fields}") 145 | 146 | params['limit'] = 50 147 | 148 | for offset in range(0, total_fields, params['limit']): 149 | params['offset'] = offset 150 | response = self.sess.get('https://api.worldquantbrain.com/data-fields', params=params) 151 | 152 | if response.status_code == 200: 153 | data = response.json() 154 | fields = data.get('results', []) 155 | print(f"Fetched {len(fields)} fields at offset={offset}") 156 | all_fields.extend(fields) 157 | else: 158 | print(f"Failed to fetch fields for {dataset} at offset={offset}: {response.text[:500]}") 159 | else: 160 | print(f"Failed to get count for {dataset}: {count_response.text[:500]}") 161 | 162 | # 去重 163 | unique_fields = {field['id']: field for field in all_fields}.values() 164 | unique_fields = list(unique_fields) 165 | 166 | # 保存到 CSV 167 | df = pd.DataFrame(unique_fields) 168 | df.to_csv(des, index=False, encoding='utf-8') 169 | print(f"✅ Saved fields CSV to {des}") 170 | 171 | return 172 | 173 | except Exception as e: 174 | print(f"Failed to fetch data fields: {e}") 175 | return 176 | 177 | def get_operators(self) -> List[Dict]: 178 | """Fetch available operators from WorldQuant Brain.""" 179 | if OPERATORS_CSV.exists(): 180 | print(f"Operators CSV already exists at {OPERATORS_CSV}, skipping download.") 181 | return pd.read_csv(OPERATORS_CSV).to_dict(orient='records') 182 | 183 | print("Requesting operators...") 184 | response = self.sess.get('https://api.worldquantbrain.com/operators') 185 | print(f"Operators response status: {response.status_code}") 186 | logging.debug(f"Operators response: {response.text[:500]}...") # Print first 500 chars 187 | 188 | if response.status_code != 
200:
189 |             raise Exception(f"Failed to get operators: {response.text}")
190 | 
191 |         data = response.json()
192 |         if isinstance(data, list):
193 |             operators = data
194 |         elif 'results' in data:
195 |             operators = data['results']
196 |         else:
197 |             raise Exception(f"Unexpected operators response format. Response: {data}")
198 | 
199 |         df = pd.DataFrame(operators)
200 |         df.to_csv(OPERATORS_CSV, index=False, encoding='utf-8')
201 |         print(f"✅ Saved operators CSV to {OPERATORS_CSV}")
202 | 
203 |         return operators
204 | 
--------------------------------------------------------------------------------
/prompts/template_generating.yaml:
--------------------------------------------------------------------------------
1 | inject_wq_knowledge: |-
2 |   # Role
3 |   You are a professional quantitative researcher at a hedge fund.
4 | 
5 |   # Task
6 |   You are going to generate alpha templates for alpha mining in the form of WorldQuant Fast Expression.
7 |   Please take note of the following WorldQuant Fast Expression knowledge base:
8 | 
9 |   ## Fields and Definitions:
10 |   {{ fields_and_definitions }}
11 | 
12 |   ## Operators and Definitions:
13 |   {{ operators_and_definitions }}
14 | 
15 |   ## Field Types with Included Fields:
16 |   {{ field_types }}
17 | 
18 |   ## Operator Types with Included Operators (operator types follow the format):
19 |   {{ operator_types }}
20 | 
21 |   This knowledge represents all the allowed building blocks for alpha template construction.
22 |   - Each field and operator listed above can be used directly.
23 |   - Each field type and operator type (e.g., , ) can also be used as a placeholder.
24 |   - When generating alpha templates, you **must** use only the fields, operators, and types listed above, plus numeric literals.
25 |   - Please remember these fields and operators well and do not create undefined symbols in your future work.
26 |   - Do not use scientific notation for decimals (such as 1e-9), as WorldQuant Brain does not support it.
27 | 
28 | 
29 | check_if_blog_helpful: |-
30 |   # Role
31 |   You are a quantitative researcher at a hedge fund.
32 | 
33 |   # Task
34 |   You are going to mine alphas based on posts from WorldQuant Brain.
35 |   You are given a blog post:
36 |   -------------------blog start-------------------
37 |   {{ blog_post }}
38 |   --------------------blog end--------------------
39 |   Please determine whether it is possible to generate alphas or alpha templates from this blog. Output a 'Y' if possible, otherwise output an 'N', and be careful not to output anything else.
40 | 
41 | blog_to_hypothesis: |-
42 |   # Role
43 |   Now you are a quantitative researcher at a hedge fund.
44 | 
45 |   # Task
46 |   Your task is to extract **quantitative trading hypotheses** from a blog post.
47 | 
48 |   # Guideline
49 |   Follow these steps carefully:
50 |   1. Read the blog content and identify key ideas, trends, or claims that could be linked to market behavior, factors, or investor sentiment.
51 |   2. Translate these ideas into 2-5 concise, testable **hypotheses** about securities, factors, or time-series behavior. Each hypothesis should be phrased as a cause-and-effect statement or a relationship between variables, suitable for an alpha research project.
52 |   3. When possible, highlight which types of data fields (, , , , , etc.) and which categories of operators (, , , , , etc.) might be relevant to test the hypothesis. Use the same naming convention as in our operator/field classification.
53 |   4. Output a numbered list of hypotheses. For each hypothesis provide:
54 |   - **Hypothesis**: A short sentence stating the relationship or effect.
55 | - **Rationale**: Why this relationship may hold according to the blog’s content. 56 | - **Potential Fields/Operators**: Which **field types or operator types** may be relevant (e.g., , ). 57 | 5. The output should conform to the json format. DO NOT output anything other than the json structure.(just to illustrate, do not rely on fields or operators in it) 58 | Here is an output Json example: 59 | [ 60 | { 61 | "Hypothesis": "During periods of heightened market volatility, technology sector stocks tend to experience increased trading volume relative to other sectors.", 62 | "Rationale": "The blog argues that institutional investors seek growth opportunities in tech during uncertainty, driving abnormal trading volume compared to the broader market.", 63 | "Potential Fields": ["", ""], 64 | "Potential Operators": ["", ""] 65 | }, 66 | { 67 | "Hypothesis": "Stocks with stronger analyst earnings revisions outperform their peers in the following quarter.", 68 | "Rationale": "The blog notes that upward revisions by analysts tend to precede positive stock price movements and excess returns.", 69 | "Potential Fields": ["", ""], 70 | "Potential Operators": ["", "", ""] 71 | } 72 | ] 73 | 74 | Constraints: 75 | - Hypotheses should be specific enough to inspire a factor or alpha template but not yet in expression form. 76 | - Do not output any code or alpha template at this stage. 77 | - Do not output any comments within the json structure 78 | 79 | Now you are given a **blog post**: 80 | -------------------blog start------------------- 81 | {{ blog_post }} 82 | --------------------blog end-------------------- 83 | **Please extract quantitative trading hypotheses from it.** 84 | 85 | 86 | hypothesis_to_template: |- 87 | # Role 88 | You are a quantitative researcher at a hedge fund. 89 | You have already been provided the full WorldQuant Fast Expression knowledge base 90 | (fields, operators, field types, operator types) in the system prompt. 91 | To make it clear what "type" includes, we reiterate the operator types and field types in the knowledge base, as well as the specific operators and fields they contain: 92 | ## Field Types with Included Fields: 93 | {{ field_types }} 94 | 95 | ## Operator Types with Included Operators: 96 | {{ operator_types }} 97 | 98 | # Task 99 | You are given **multiple hypotheses**. 100 | Your task: 101 | 1. Review all hypotheses and either synthesize them into **one combined alpha template** 102 | or select the **single most promising hypothesis** to convert into a template. 103 | 2. Using the WorldQuant Fast Expression knowledge base from the system prompt, 104 | generate one valid alpha template expression. 105 | 106 | # Guideline 107 | ## Output format (JSON): 108 | { 109 | "SelectedHypothesis": "string", 110 | "TemplateExpression": "string", 111 | "Description": "string", 112 | "ExpectedBehavior": "string" 113 | } 114 | 115 | ## IMPORTANT: 116 | - You MUST include **between 1 and 3 field/operator types** (placeholders like or ) in the final template expression. 117 | - Templates **without placeholders** are invalid and must not be produced. 118 | - Prefer using types over concrete fields/operators, but concrete fields/operators cannot be ignored when generating template. 119 | - You **must** include at least **1** and at most **4** types(field types or operator types) in template expression. 120 | - Do **not** generate an alpha without using types (types num < 1)!!! 121 | - Do **not** generate a template that contains too many types (types num > 4)!!! 
122 |   - You need to fully consider all the field types and operator types provided to you, and do not rely on the ones given in the examples.
123 | 
124 |   ## Rules:
125 |   - The template must follow the WorldQuant Fast Expression syntax.
126 |   - You may use:
127 |     * Concrete fields and operators from the knowledge base.
128 |     * Field types and operator types from the knowledge base.
129 |     * Field types and operator types from the hypotheses' "Potential Fields" and "Potential Operators".
130 |   - Do **not** use scientific notation for decimals (write 0.0001 instead of 1e-4).
131 |   - The expression must be plausible, concise, and syntactically valid.
132 |   - Do **not** output any comments inside the JSON structure.
133 | 
134 |   ## Good Placeholder Examples (just to illustrate, do not rely on any fields or operators in it):
135 |   - ts_rank(, 20)
136 |   - divide(, (, 10))
137 |   - ()
138 | 
139 |   ## Output Example (just to illustrate, do not rely on fields or operators in it):
140 |   ```
141 |   {
142 |     "SelectedHypothesis": "Alpha signals with very low turnover (below 5%) can be transformed into more useful signals by applying ts_target_tvr_decay with expanded lambda_max parameter to increase turnover towards optimal target ranges.",
143 |     "TemplateExpression": "(ts_delta(, 5), 0, 5, 0.15)",
144 |     "Description": "Applies TS:Transform optimization to a simple 5-day price momentum signal to normalize turnover towards 15% target",
145 |     "ExpectedBehavior": "The expression will adaptively adjust the decay parameter to bring the turnover of the underlying momentum signal closer to the target 15% level, making low-turnover signals more viable while preserving the core alpha signal"
146 |   }
147 |   ```
148 | 
149 |   Now you are given the following **hypotheses**:
150 |   {{ hypotheses }}
151 |   **Please generate your JSON output from it. (DO NOT output any other contents.)**
152 | 
153 | 
--------------------------------------------------------------------------------
/researcher/construct_prompts.py:
--------------------------------------------------------------------------------
1 | import json
2 | import re
3 | import pandas as pd
4 | import yaml
5 | from pathlib import Path
6 | 
7 | from utils.config_loader import ConfigLoader
8 | from utils.text_dealer import truncate_text
9 | 
10 | # --- Paths ---
11 | BASE_DIR = Path(__file__).resolve().parents[1]
12 | PROMPT_FILE = BASE_DIR / "prompts" / "template_generating.yaml"
13 | FIELDS_DIR = BASE_DIR / "data" / "wq_fields"
14 | TEMPLATE_FIELDS_FILE = BASE_DIR / "data" / "wq_template_fields" / "template_fields.json"
15 | OPERATORS_FILE = BASE_DIR / "data" / "wq_template_operators" / "template_operators.csv"
16 | 
17 | 
18 | def build_wq_knowledge_prompt():
19 |     """
20 |     Load the YAML prompt template, build the field / field-type / operator information
21 |     from the datasets enabled in the config, and render the inject_wq_knowledge prompt.
22 |     """
23 | 
24 |     # Load the prompt template
25 |     with open(PROMPT_FILE, "r", encoding="utf-8") as f:
26 |         prompt_yaml = yaml.safe_load(f)
27 |     template_str = prompt_yaml.get("inject_wq_knowledge", "")
28 |     if not template_str:
29 |         raise ValueError("inject_wq_knowledge not found in template_generating.yaml")
30 | 
31 |     # Read the config: enabled datasets
32 |     enabled_datasets = ConfigLoader.get("enabled_field_datasets", [])
33 |     print(f"🔧 Enabled datasets from config: {enabled_datasets}")
34 | 
35 |     # =========================================================
36 |     # Load the field CSVs (only the datasets enabled in config)
37 |     # =========================================================
38 |     field_dfs = []
39 |     for file in FIELDS_DIR.glob("*.csv"):
40 |         dataset_name = file.stem
41 |         if enabled_datasets and dataset_name not in enabled_datasets:
42 |             continue
43 |         if file.stat().st_size == 0:
44 |             print(f"⚠️ Skipping empty file: {file.name}")
45 |             continue
46 | 
47 |         try:
48 |             df = pd.read_csv(file, dtype=str, keep_default_na=False)
49 |             df["__dataset__"] = dataset_name
50 |             field_dfs.append(df)
51 |         except Exception as e:
52 |             print(f"❌ Failed to load {file.name}: {e}")
53 | 
54 |     if not field_dfs:
55 |         raise ValueError("❌ No valid field CSVs loaded. Check config.enabled_field_datasets.")
56 | 
57 |     fields_df = pd.concat(field_dfs, ignore_index=True)
58 | 
59 |     # Build the field definition lines
60 |     fields_info = []
61 |     for _, row in fields_df.iterrows():
62 |         desc = row.get("description", "")
63 |         dtype = row.get("type", "")
64 |         dataset = row.get("__dataset__", "")
65 |         field_str = f"- **{row['id']}** ({dtype}, {dataset}): {desc}"
66 |         fields_info.append(field_str)
67 | 
68 |     fields_and_definitions = "\n".join(fields_info)
69 | 
70 |     # =========================================================
71 |     # Load the field-type mapping (from template_fields.json)
72 |     # =========================================================
73 |     if not TEMPLATE_FIELDS_FILE.exists():
74 |         raise FileNotFoundError(f"❌ template_fields.json not found at {TEMPLATE_FIELDS_FILE}")
75 | 
76 |     with open(TEMPLATE_FIELDS_FILE, "r", encoding="utf-8") as f:
77 |         template_field_data = json.load(f)
78 | 
79 |     # template_fields.json format: { "field_type_name": [list of field ids], ... }
80 |     # Keep only the field types that belong to an enabled dataset
81 |     filtered_field_types = {}
82 | 
83 |     for ftype_full, ids in template_field_data.items():
84 |         # Extract dataset_name from a type key that ends with ":<dataset>/>", e.g. "pv1"
85 |         match = re.search(r":([\w\-]+)\/>$", ftype_full)
86 |         if not match:
87 |             continue
88 |         dataset_name = match.group(1)
89 |         # Keep it only if the dataset is in the enabled list
90 |         if enabled_datasets and dataset_name not in enabled_datasets:
91 |             continue
92 |         filtered_field_types[ftype_full] = ids
93 | 
94 |     # Render field types
95 |     field_types_str = []
96 |     for ftype, fields in filtered_field_types.items():
97 |         field_types_str.append(f"- **{ftype}**: {', '.join(fields)}")
98 |     field_types = "\n".join(field_types_str)
99 | 
100 |     # =========================================================
101 |     # Load the operators file
102 |     # =========================================================
103 |     ops_df = pd.read_csv(OPERATORS_FILE)
104 |     ops_info = []
105 |     op_types_map = {}
106 | 
107 |     for _, row in ops_df.iterrows():
108 |         op_str = f"- **{row['name']}**: {row['definition']} — {row['description']}"
109 |         ops_info.append(op_str)
110 |         op_types_map.setdefault(row['type'], []).append(row['name'])
111 | 
112 |     operators_and_definitions = "\n".join(ops_info)
113 | 
114 |     op_types_str = []
115 |     for otype, ops in op_types_map.items():
116 |         op_types_str.append(f"- **<{otype}/>**: {', '.join(ops)}")  # NOTE: the "<{otype}/>" placeholder tag is an assumed reconstruction; the original tag format was lost
117 |     operator_types = "\n".join(op_types_str)
118 | 
119 |     # =========================================================
120 |     # Render the template
121 |     # =========================================================
122 |     prompt_filled = (
123 |         template_str
124 |         .replace("{{ fields_and_definitions }}", fields_and_definitions)
125 |         .replace("{{ operators_and_definitions }}", operators_and_definitions)
126 |         .replace("{{ field_types }}", field_types)
127 |         .replace("{{ operator_types }}", operator_types)
128 |     )
129 | 
130 |     print("✅ WQ knowledge prompt built successfully.")
131 |     return prompt_filled
132 | 
133 | 
134 | def build_check_if_blog_helpful(blog_json_path: str):
135 |     """
136 |     Load the check_if_blog_helpful template from the YAML file and render it with the blog JSON.
137 |     """
138 |     # 1.
读取yaml 139 | with open(PROMPT_FILE, "r", encoding="utf-8") as f: 140 | prompt_yaml = yaml.safe_load(f) 141 | 142 | template_str = prompt_yaml.get("check_if_blog_helpful", "") 143 | if not template_str: 144 | raise ValueError("check_if_blog_helpful not found in template_generating.yaml") 145 | 146 | # 2. 读取json 147 | with open(blog_json_path, "r", encoding="utf-8") as f: 148 | blog_data = json.load(f) 149 | 150 | # 3. 拼接blog_post文本 151 | # 可以按需求拼接(title+description+post_body+comments) 152 | post_text = f"Title: {blog_data.get('title','')}\n\nDescription: {blog_data.get('description','')}\n\nPost Body: {blog_data.get('post_body','')}\n\nComments:\n" 153 | if blog_data.get("post_comments"): 154 | for i, c in enumerate(blog_data["post_comments"], 1): 155 | post_text += f"[{i}] {c}\n" 156 | 157 | # 4. 替换模板 158 | prompt_filled = template_str.replace("{{ blog_post }}", truncate_text(post_text)) 159 | 160 | return prompt_filled 161 | 162 | 163 | def build_blog_to_hypothesis(blog_json_path: str): 164 | """ 165 | 从yaml读取blog_to_hypothesis模板并用blog_json渲染 166 | """ 167 | # 1. 读取yaml 168 | with open(PROMPT_FILE, "r", encoding="utf-8") as f: 169 | prompt_yaml = yaml.safe_load(f) 170 | 171 | template_str = prompt_yaml.get("blog_to_hypothesis", "") 172 | if not template_str: 173 | raise ValueError("blog_to_hypothesis not found in template_generating.yaml") 174 | 175 | # 2. 读取json 176 | with open(blog_json_path, "r", encoding="utf-8") as f: 177 | blog_data = json.load(f) 178 | 179 | # 3. 拼接blog_post文本 180 | # 可以按需求拼接(title+description+post_body+comments) 181 | post_text = f"Title: {blog_data.get('title','')}\n\nDescription: {blog_data.get('description','')}\n\nPost Body: {blog_data.get('post_body','')}\n\nComments:\n" 182 | if blog_data.get("post_comments"): 183 | for i, c in enumerate(blog_data["post_comments"], 1): 184 | post_text += f"[{i}] {c}\n" 185 | 186 | # 4. 替换模板 187 | prompt_filled = template_str.replace("{{ blog_post }}", truncate_text(post_text)) 188 | 189 | return prompt_filled 190 | 191 | 192 | def build_hypothesis_to_template(hypotheses_json_path: str): 193 | """ 194 | 从yaml读取hypothesis_to_template模板并用hypotheses_json渲染 195 | - 优先使用 template_fields.json(映射 field types -> [field ids]) 196 | - 根据 config 中的 enabled_datasets 过滤 field types (从 中解析 dataset) 197 | - 为防止 token 爆炸,展示每个类型的前 N 个示例并标注总数 198 | """ 199 | import re 200 | from utils.config_loader import ConfigLoader 201 | 202 | # 1. 读取yaml 203 | with open(PROMPT_FILE, "r", encoding="utf-8") as f: 204 | prompt_yaml = yaml.safe_load(f) 205 | 206 | template_str = prompt_yaml.get("hypothesis_to_template", "") 207 | if not template_str: 208 | raise ValueError("hypothesis_to_template not found in template_generating.yaml") 209 | 210 | # 2. 
Read the hypotheses JSON
211 |     with open(hypotheses_json_path, "r", encoding="utf-8") as f:
212 |         hypotheses_data = json.load(f)
213 | 
214 |     hypotheses_str = json.dumps(hypotheses_data, indent=2, ensure_ascii=False)
215 | 
216 |     # --- Load and filter field types (template_fields.json takes priority) ---
217 |     field_types_map = {}
218 | 
219 |     # Datasets the user enabled in the config (may be a comma-separated string or a list)
220 |     enabled = ConfigLoader.get("enabled_field_datasets")
221 |     enabled_datasets = None
222 |     if enabled:
223 |         if isinstance(enabled, str):
224 |             enabled_datasets = [s.strip() for s in enabled.split(",") if s.strip()]
225 |         elif isinstance(enabled, (list, tuple)):
226 |             enabled_datasets = list(enabled)
227 |         else:
228 |             enabled_datasets = None
229 | 
230 |     if TEMPLATE_FIELDS_FILE.exists():
231 |         with open(TEMPLATE_FIELDS_FILE, "r", encoding="utf-8") as f:
232 |             template_field_data = json.load(f)
233 | 
234 |         # Parse each key, extract its dataset, and filter by enabled_datasets
235 |         for ftype_full, ids in template_field_data.items():
236 |             # Extract the dataset name between the last colon and the trailing '/>'
237 |             m = re.search(r":([^/>]+)\/>$", ftype_full)
238 |             dataset_name = m.group(1) if m else None
239 | 
240 |             # If enabled_datasets is set, keep only matching datasets
241 |             if enabled_datasets and dataset_name and dataset_name not in enabled_datasets:
242 |                 continue
243 | 
244 |             # Keep the original id list; it is truncated later when rendered
245 |             field_types_map[ftype_full] = list(ids)
246 |     else:
247 |         raise FileNotFoundError("❌ template_fields.json not found.")
248 | 
249 |     # --- Load the operator-type mapping (kept as-is, but truncated for display) ---
250 |     ops_df = pd.read_csv(OPERATORS_FILE, dtype=str, keep_default_na=False)
251 |     op_types_map = {}
252 |     for _, row in ops_df.iterrows():
253 |         typ = row.get("type", "Other")
254 |         name = row.get("name")
255 |         if name:
256 |             op_types_map.setdefault(typ, []).append(name)
257 | 
258 |     # --- Build readable strings for the prompt: show at most N examples per type to limit tokens ---
259 |     MAX_EXAMPLES_PER_TYPE = 10000  # max examples (fields or operators) shown per type; currently large enough to show everything
260 |     field_types_str_lines = []
261 |     for ftype, ids in field_types_map.items():
262 |         total = len(ids)
263 |         display_ids = ids[:MAX_EXAMPLES_PER_TYPE]
264 |         suffix = "" if total <= MAX_EXAMPLES_PER_TYPE else f", ... (+{total - MAX_EXAMPLES_PER_TYPE} more)"
265 |         field_types_str_lines.append(f"- **{ftype}** ({total} fields): {', '.join(display_ids)}{suffix}")
266 |     field_types = "\n".join(field_types_str_lines)
267 | 
268 |     op_types_str_lines = []
269 |     for otype, ops in op_types_map.items():
270 |         total = len(ops)
271 |         display_ops = ops[:MAX_EXAMPLES_PER_TYPE]
272 |         suffix = "" if total <= MAX_EXAMPLES_PER_TYPE else f", ... (+{total - MAX_EXAMPLES_PER_TYPE} more)"
273 |         op_types_str_lines.append(f"- **<{otype}/>** ({total} ops): {', '.join(display_ops)}{suffix}")  # NOTE: "<{otype}/>" tag format is an assumed reconstruction
274 |     operator_types = "\n".join(op_types_str_lines)
275 | 
276 |     # 4.
替换模板 277 | prompt_filled = ( 278 | template_str 279 | .replace("{{ hypotheses }}", hypotheses_str) 280 | .replace("{{ field_types }}", field_types) 281 | .replace("{{ operator_types }}", operator_types) 282 | ) 283 | 284 | return prompt_filled 285 | 286 | 287 | 288 | 289 | if __name__ == "__main__": 290 | print(build_wq_knowledge_prompt()) 291 | -------------------------------------------------------------------------------- /evaluator/backtest_with_wq_mul.py: -------------------------------------------------------------------------------- 1 | # backtest_with_wq.py 2 | import json 3 | import csv 4 | import logging 5 | from pathlib import Path 6 | from time import sleep 7 | import requests 8 | from openai import OpenAI 9 | from requests.auth import HTTPBasicAuth 10 | 11 | from evaluator.construct_prompts import build_fix_fast_expression_prompt 12 | from utils.config_loader import ConfigLoader 13 | 14 | BASE_DIR = Path(__file__).resolve().parents[1] 15 | BACKTEST_DIR = BASE_DIR / "data" / "alpha_db_v2" / "backtest_result" 16 | BACKTEST_DIR.mkdir(parents=True, exist_ok=True) 17 | 18 | logging.basicConfig(filename='backtest_with_wq.log', level=logging.INFO, 19 | format='%(asctime)s - %(levelname)s - %(message)s') 20 | 21 | 22 | def sign_in(): 23 | """登录 WQ Brain 并返回 session""" 24 | username = ConfigLoader.get('worldquant_account') 25 | password = ConfigLoader.get('worldquant_password') 26 | 27 | sess = requests.Session() 28 | sess.auth = HTTPBasicAuth(username, password) 29 | resp = sess.post(ConfigLoader.get('worldquant_api_auth')) 30 | print(f"Login status: {resp.status_code}") 31 | return sess 32 | 33 | 34 | def run_backtest_mul_by_wq_api(alphas_json_file, batch_size=15): 35 | """批量回测指定 alphas json 文件,采用等待队列方式提升效率""" 36 | sess = sign_in() 37 | 38 | # === 1. 读 alpha JSON === 39 | with open(alphas_json_file, "r", encoding="utf-8") as f: 40 | data = json.load(f) 41 | 42 | print(f"🔬 Start backtest for {alphas_json_file}") 43 | if "GeneratedAlphas" in data: 44 | alphas = [item["alpha"] for item in data["GeneratedAlphas"]] 45 | elif isinstance(data, list): 46 | alphas = [item["alpha"] for item in data] 47 | else: 48 | print("❌ 不识别的 alpha JSON 格式") 49 | return None 50 | 51 | template_name = Path(alphas_json_file).stem 52 | out_csv = BACKTEST_DIR / f"{template_name}_backtest.csv" 53 | 54 | # === 2. 已有结果,跳过 === 55 | finished_alphas = set() 56 | if out_csv.exists(): 57 | with open(out_csv, "r", encoding="utf-8") as f: 58 | reader = csv.DictReader(f) 59 | for row in reader: 60 | finished_alphas.add(row["alpha"]) 61 | print(f"⚠️ 已有 {len(finished_alphas)} 条回测结果,将跳过这些 alpha") 62 | 63 | # === 3. 准备写入 === 64 | fieldnames = ["alpha", "sharpe", "turnover", "fitness", "returns", "drawdown", "margin"] 65 | csv_file = open(out_csv, "a", newline="", encoding="utf-8") 66 | writer = csv.DictWriter(csv_file, fieldnames=fieldnames) 67 | if csv_file.tell() == 0: 68 | writer.writeheader() 69 | 70 | # === 4. 构造 payload 模板 === 71 | def make_payload(expr): 72 | return { 73 | "type": "REGULAR", 74 | "settings": { 75 | "instrumentType": "EQUITY", 76 | "region": "USA", 77 | "universe": "TOP3000", 78 | "delay": 1, 79 | "decay": 0, 80 | "neutralization": "SUBINDUSTRY", 81 | "truncation": 0.01, 82 | "pasteurization": "ON", 83 | "unitHandling": "VERIFY", 84 | "nanHandling": "OFF", 85 | "language": "FASTEXPR", 86 | "visualization": False, 87 | }, 88 | "regular": expr 89 | } 90 | 91 | # === 5. 
提交 & 管理 pending 队列 === 92 | pending = {} # sim_id -> {"alpha": expr, "progress_url": url} 93 | retry_queue = [] 94 | 95 | for i, alpha_expr in enumerate(alphas, 1): 96 | if alpha_expr in finished_alphas: 97 | continue 98 | 99 | # 提交 alpha 100 | try: 101 | resp = sess.post("https://api.worldquantbrain.com/simulations", json=make_payload(alpha_expr)) 102 | if resp.status_code not in (200, 201): 103 | if "SIMULATION_LIMIT_EXCEEDED" in resp.text: 104 | retry_queue.append(alpha_expr) 105 | continue 106 | print(f"❌ 提交失败: {resp.status_code}, {resp.text}") 107 | continue 108 | 109 | sim_url = resp.headers.get("Location") 110 | if not sim_url: 111 | retry_queue.append(alpha_expr) 112 | continue 113 | 114 | sim_id = sim_url.split("/")[-1] 115 | pending[sim_id] = {"alpha": alpha_expr, "progress_url": sim_url, "first_time": True} 116 | 117 | print(f"📩 提交成功: {i}/{len(alphas)} -> {alpha_expr[:50]}...") 118 | 119 | # 控制批量大小 120 | if len(pending) >= batch_size: 121 | monitor_pending(sess, pending, writer, alphas_json_file) 122 | except Exception as e: 123 | logging.error(f"提交 {alpha_expr} 出错: {e}") 124 | retry_queue.append(alpha_expr) 125 | 126 | # 处理剩余的 127 | if pending: 128 | monitor_pending(sess, pending, writer, alphas_json_file) 129 | 130 | csv_file.close() 131 | print(f"🎯 回测完成,结果已保存 {out_csv}") 132 | return str(out_csv) 133 | 134 | 135 | def monitor_pending(sess, pending, writer, alphas_json_file): 136 | """监控 pending 队列直到全部完成""" 137 | client = OpenAI( 138 | base_url=ConfigLoader.get("openai_base_url"), 139 | api_key=ConfigLoader.get("openai_api_key"), 140 | ) 141 | 142 | while pending: 143 | finished_ids = [] 144 | for sim_id, info in list(pending.items()): 145 | try: 146 | status_resp = sess.get(info["progress_url"]) 147 | if status_resp.status_code == 429: 148 | continue 149 | 150 | status_json = status_resp.json() 151 | status = status_json.get("status") 152 | 153 | if status in ("COMPLETE", "WARNING"): 154 | alpha_id = status_json.get("alpha") 155 | if not alpha_id: 156 | finished_ids.append(sim_id) 157 | continue 158 | 159 | # 获取结果 160 | alpha_data = None 161 | for _ in range(10): 162 | alpha_resp = sess.get(f"https://api.worldquantbrain.com/alphas/{alpha_id}") 163 | if alpha_resp.status_code == 200: 164 | alpha_data = alpha_resp.json() 165 | break 166 | sleep(3) 167 | 168 | if not alpha_data: 169 | finished_ids.append(sim_id) 170 | continue 171 | 172 | is_data = alpha_data.get("is", {}) 173 | writer.writerow({ 174 | "alpha": info["alpha"], 175 | "sharpe": is_data.get("sharpe"), 176 | "turnover": is_data.get("turnover"), 177 | "fitness": is_data.get("fitness"), 178 | "returns": is_data.get("returns"), 179 | "drawdown": is_data.get("drawdown"), 180 | "margin": is_data.get("margin"), 181 | }) 182 | finished_ids.append(sim_id) 183 | print(f"✅ 完成: {info['alpha']}... 
fitness={is_data.get('fitness')}") 184 | 185 | elif status == "ERROR": 186 | if info["first_time"]: # 失败直接退出,修复带来的收益过低,时间损耗过高 TODO 187 | # 二次失败,写入 None 188 | writer.writerow({ 189 | "alpha": info["alpha"], 190 | "sharpe": None, 191 | "turnover": None, 192 | "fitness": None, 193 | "returns": None, 194 | "drawdown": None, 195 | "margin": f"FAILED:{status}" 196 | }) 197 | print(f"❌ 二次失败: {info['alpha'][:60]}...") 198 | finished_ids.append(sim_id) 199 | else: 200 | # === 使用 LLM 修复表达式 === 201 | print(f"❌ 模拟失败: {info['alpha'][:60]}...") 202 | fix_exp_prompt = build_fix_fast_expression_prompt(info["alpha"], str(status_json)) 203 | try: 204 | resp = client.chat.completions.create( 205 | model=ConfigLoader.get("reasoner_model_name"), 206 | messages=[ 207 | {"role": "system", "content": "You are an expert in Fast Expression syntax repair."}, 208 | {"role": "user", "content": fix_exp_prompt} 209 | ], 210 | temperature=0.2, 211 | ) 212 | fixed_expr = resp.choices[0].message.content.strip() 213 | print(f"🧩 修复后的表达式: {fixed_expr}") 214 | 215 | # === 替换 alphas_json_file 文件中的旧 alpha 216 | try: 217 | with open(alphas_json_file, "r", encoding="utf-8") as f: 218 | text = f.read() 219 | if info["alpha"] not in text: 220 | print("⚠️ 原始表达式未在文件中找到,跳过替换") 221 | else: 222 | new_text = text.replace(info["alpha"], fixed_expr, 1) # 仅替换第一次出现 223 | with open(alphas_json_file, "w", encoding="utf-8") as f: 224 | f.write(new_text) 225 | print(f"💾 已在 {alphas_json_file} 中替换修复后的表达式") 226 | except Exception as e: 227 | print(f"❌ 替换 {alphas_json_file} 中表达式失败: {e}") 228 | 229 | # === 再次提交修复后的表达式 === 230 | payload = { 231 | "type": "REGULAR", 232 | "settings": { 233 | "instrumentType": "EQUITY", 234 | "region": "ASI", 235 | "universe": "MINVOL1M", 236 | "delay": 1, 237 | "decay": 6, 238 | "neutralization": "SUBINDUSTRY", 239 | "truncation": 0.01, 240 | "pasteurization": "ON", 241 | "unitHandling": "VERIFY", 242 | "nanHandling": "ON", 243 | "maxTrade": "ON", 244 | "language": "FASTEXPR", 245 | "visualization": False, 246 | }, 247 | "regular": fixed_expr 248 | } 249 | 250 | new_resp = sess.post("https://api.worldquantbrain.com/simulations", json=payload) 251 | if new_resp.status_code not in (200, 201): 252 | print(f"⚠️ 修复后提交失败 {new_resp.status_code}: {new_resp.text}") 253 | writer.writerow({ 254 | "alpha": info["alpha"], 255 | "sharpe": None, 256 | "turnover": None, 257 | "fitness": None, 258 | "returns": None, 259 | "drawdown": None, 260 | "margin": "FIX_FAIL_SUBMIT" 261 | }) 262 | finished_ids.append(sim_id) 263 | continue 264 | 265 | new_url = new_resp.headers.get("Location") 266 | if not new_url: 267 | print("⚠️ 修复后提交未返回Location,跳过") 268 | finished_ids.append(sim_id) 269 | continue 270 | 271 | # 替换原 pending 任务为新任务 272 | new_id = new_url.split("/")[-1] 273 | pending[new_id] = { 274 | "alpha": fixed_expr, 275 | "progress_url": new_url, 276 | "first_time": False # 标记为已修复 277 | } 278 | finished_ids.append(sim_id) 279 | print(f"🔁 已重新提交修复后的表达式 {new_id}") 280 | 281 | except Exception as e: 282 | logging.error(f"修复表达式失败: {e}") 283 | writer.writerow({ 284 | "alpha": info["alpha"], 285 | "sharpe": None, 286 | "turnover": None, 287 | "fitness": None, 288 | "returns": None, 289 | "drawdown": None, 290 | "margin": "FIX_FAIL_LLM" 291 | }) 292 | finished_ids.append(sim_id) 293 | 294 | else: 295 | print(f"⏳ {info['alpha'][:40]}... 
simulation status: {status}")
296 | 
297 |             except Exception as e:
298 |                 logging.error(f"Error while checking {sim_id}: {e}")
299 | 
300 |         for fid in finished_ids:
301 |             pending.pop(fid, None)
302 | 
303 |         sleep(5)
304 | 
305 | 
306 | if __name__ == "__main__":
307 |     test_file = BASE_DIR / "data" / "alpha_db" / "all_alphas" / "your_template_alphas.json"
308 |     run_backtest_mul_by_wq_api(test_file)
309 | 
--------------------------------------------------------------------------------
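For reference, a minimal usage sketch (not part of the repository) of how the batch evaluator entry point above can be driven. The file name demo_template_alphas.json and the example Fast Expressions are hypothetical; the JSON layout is one of the two forms that run_backtest_by_wq_api / run_backtest_mul_by_wq_api accept.

import json
from pathlib import Path

from evaluator.backtest_with_wq_mul import run_backtest_mul_by_wq_api

# Hypothetical input file; any path works as long as the JSON layout matches.
demo_file = Path("data/alpha_db/all_alphas/demo_template_alphas.json")
demo_file.parent.mkdir(parents=True, exist_ok=True)
demo_file.write_text(json.dumps({
    "GeneratedAlphas": [
        {"alpha": "rank(ts_delta(close, 5))"},   # example expressions only
        {"alpha": "-ts_rank(volume, 20)"},
    ]
}, indent=2), encoding="utf-8")

# Submits up to batch_size simulations at a time and appends the results to
# data/alpha_db_v2/backtest_result/demo_template_alphas_backtest.csv
run_backtest_mul_by_wq_api(demo_file, batch_size=15)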