├── utils
│   ├── text_dealer.py
│   ├── json_dealer.py
│   ├── config_loader.py
│   ├── template_op_gener.py
│   ├── template_field_gener.py
│   └── wq_info_loader.py
├── test_script.sh
├── .gitignore
├── prompts
│   ├── template_evaluating.yaml
│   └── template_generating.yaml
├── evaluator
│   ├── construct_prompts.py
│   ├── backtest_with_wq.py
│   └── backtest_with_wq_mul.py
├── main_scraper.py
├── config.yaml
├── main_evaluator.py
├── main_researcher.py
├── main.py
├── environment.yml
├── researcher
│   ├── generate_alpha.py
│   ├── generate_template.py
│   └── construct_prompts.py
├── scraper
│   ├── preprocess_texts.py
│   └── scrap_posts_from_wq.py
└── README.md

/utils/text_dealer.py:
--------------------------------------------------------------------------------
1 | def truncate_text(text, max_chars=5000):
2 |     """Truncate the string if it is too long (preferably at sentence/paragraph separators)."""
3 |     if len(text) <= max_chars:
4 |         return text
5 |     # Simple approach: truncate by character count
6 |     return text[:max_chars] + "... [TRUNCATED]"
--------------------------------------------------------------------------------
/test_script.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Run backtests in an automatic loop: 57 minutes per round, then pause for 3 minutes
4 | 
5 | # Infinite loop
6 | while true
7 | do
8 |   echo "🚀 Starting main_evaluator.py ..."
9 |   python main_evaluator.py &
10 |   PID=$!
11 | 
12 |   # Run for 57 minutes (57*60 seconds)
13 |   sleep $((57 * 60))
14 | 
15 |   echo "⏹ Stopping main_evaluator.py (PID=$PID)..."
16 |   kill $PID
17 | 
18 |   # Wait 3 minutes
19 |   echo "🕒 Waiting 3 minutes before restarting..."
20 |   sleep $((3 * 60))
21 | done
22 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Python caches and virtual environments
2 | __pycache__/
3 | *.py[cod]
4 | *.so
5 | *.egg
6 | *.egg-info/
7 | .eggs/
8 | *.pyo
9 | *.pyd
10 | .Python
11 | venv/
12 | env/
13 | .venv/
14 | ENV/
15 | *.sqlite3
16 | *.db
17 | 
18 | # IDE/editor files
19 | .vscode/
20 | .idea/
21 | *.swp
22 | *.swo
23 | 
24 | # Operating system
25 | .DS_Store
26 | Thumbs.db
27 | 
28 | # Logs/temporary files
29 | *.log
30 | *.tmp
31 | 
32 | # Jupyter/experiment data
33 | .ipynb_checkpoints/
34 | 
35 | # Project drafts and data
36 | data/alpha_db*
37 | data/hypothesis_db*
38 | data/template_db*
39 | test/
40 | 
41 | # Data auto-generated by Playwright/browser
42 | playwright/.cache/
43 | 
44 | # Other executable or temporary files
45 | *.bak
--------------------------------------------------------------------------------
/prompts/template_evaluating.yaml:
--------------------------------------------------------------------------------
1 | fix_fast_expression: |-
2 |   # Role
3 |   You are an expert in writing WorldQuant Brain Fast Expressions.
4 | 
5 |   # Task
6 |   Your task is to correct a syntactically invalid Fast Expression based on the provided simulation error message.
7 |   You should preserve the original intent of the expression as much as possible and output only the corrected expression (do not include any additional text).
8 | 
9 |   Here is an invalid WorldQuant Brain Fast Expression:
10 |   {{ fast_expression }}
11 | 
12 |   And the simulation error message is:
13 |   {{ error_mes }}
14 | 
15 |   Now please generate the corrected expression below (do not include any additional text):
--------------------------------------------------------------------------------
/evaluator/construct_prompts.py:
--------------------------------------------------------------------------------
1 | import yaml
2 | from pathlib import Path
3 | 
4 | # --- Paths ---
5 | BASE_DIR = Path(__file__).resolve().parents[1]
6 | PROMPT_FILE = BASE_DIR / "prompts" / "template_evaluating.yaml"
7 | 
8 | 
9 | def build_fix_fast_expression_prompt(alpha_expression: str, error_mes: str):
10 |     with open(PROMPT_FILE, "r", encoding="utf-8") as f:
11 |         prompt_yaml = yaml.safe_load(f)
12 |     template_str = prompt_yaml.get("fix_fast_expression", "")
13 |     if not template_str:
14 |         raise ValueError("fix_fast_expression not found in template_evaluating.yaml")
15 | 
16 |     prompt_filled = (
17 |         template_str
18 |         .replace("{{ fast_expression }}", alpha_expression)
19 |         .replace("{{ error_mes }}", error_mes)
20 |     )
21 |     return prompt_filled
--------------------------------------------------------------------------------
/main_scraper.py:
--------------------------------------------------------------------------------
1 | import random
2 | from pathlib import Path
3 | 
4 | from evaluator.backtest_with_wq import run_backtest_by_wq_api
5 | from evaluator.backtest_with_wq_mul import run_backtest_mul_by_wq_api
6 | from researcher.construct_prompts import build_wq_knowledge_prompt, build_check_if_blog_helpful, \
7 |     build_blog_to_hypothesis
8 | from researcher.generate_alpha import generate_alphas_from_template
9 | from researcher.generate_template import from_post_to_template
10 | from scraper.preprocess_texts import preprocess_all_html_posts
11 | from scraper.scrap_posts_from_wq import scrape_new_posts
12 | from utils.template_field_gener import generate_template_fields_v2
13 | from utils.template_op_gener import generate_template_ops
14 | from utils.wq_info_loader import OpAndFeature
15 | 
16 | if __name__ == "__main__":
17 |     # data scraper ------------------------------------
18 |     scrape_new_posts(limit=200)
19 |     preprocess_all_html_posts()
20 | 
--------------------------------------------------------------------------------
/config.yaml:
--------------------------------------------------------------------------------
1 | # ===============================
2 | # Global Configuration File
3 | # ===============================
4 | 
5 | # --- OpenAI API Settings ---
6 | openai_base_url: "todo" # e.g. https://api.deepseek.com
7 | openai_api_key: "todo" # e.g. sk-...
8 | openai_model_name: "todo" # e.g. deepseek-chat
9 | reasoner_model_name: "todo" # e.g. deepseek-reasoner
10 | 
11 | # --- WorldQuant Platform Credentials ---
12 | worldquant_account: "todo"
13 | worldquant_password: "todo"
14 | 
15 | worldquant_login_url: "https://platform.worldquantbrain.com/sign-in"
16 | worldquant_api_auth: "https://api.worldquantbrain.com/authentication"
17 | worldquant_consultant_posts_url: "https://support.worldquantbrain.com/hc/en-us/community/topics/18910956638743-顾问专属中文论坛"
18 | # You can also choose any other WorldQuant Forum URL you have access to.
19 | 
20 | # --- Dataset from WorldQuant Brain
21 | enabled_field_datasets: # Select the field database you want to use to build alphas.
22 | - pv1 # Database name reference ./data/wq_fields 23 | - fundamental6 24 | - analyst4 25 | - model16 26 | - news12 -------------------------------------------------------------------------------- /main_evaluator.py: -------------------------------------------------------------------------------- 1 | import random 2 | from pathlib import Path 3 | 4 | from evaluator.backtest_with_wq import run_backtest_by_wq_api 5 | from evaluator.backtest_with_wq_mul import run_backtest_mul_by_wq_api 6 | from researcher.construct_prompts import build_wq_knowledge_prompt, build_check_if_blog_helpful, \ 7 | build_blog_to_hypothesis 8 | from researcher.generate_alpha import generate_alphas_from_template 9 | from researcher.generate_template import from_post_to_template 10 | from scraper.preprocess_texts import preprocess_all_html_posts 11 | from scraper.scrap_posts_from_wq import scrape_new_posts 12 | from utils.template_field_gener import generate_template_fields_v2 13 | from utils.template_op_gener import generate_template_ops 14 | from utils.wq_info_loader import OpAndFeature 15 | 16 | if __name__ == "__main__": 17 | 18 | # alpha evaluator ---------------------------------- 19 | ALPHA_DIR = Path("data/alpha_db_v2/all_alphas") 20 | json_files = list(ALPHA_DIR.glob("*.json")) 21 | random.shuffle(json_files) 22 | for json_file in json_files: 23 | backtest_result = run_backtest_mul_by_wq_api(json_file) 24 | -------------------------------------------------------------------------------- /utils/json_dealer.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | 4 | def extract_json(text: str): 5 | """ 6 | 尝试从大模型返回的 text 中提取并解析 JSON。 7 | - 自动去掉 Markdown 代码块、注释、解释文字 8 | - 尝试匹配第一个 {...} 或 [...] 的完整 JSON 9 | 返回 Python 对象 10 | """ 11 | if not text: 12 | raise ValueError("❌ Empty text, cannot parse JSON") 13 | 14 | # 1. 去掉Markdown代码块 ```json ... ``` 15 | cleaned = re.sub(r"```(?:json)?", "", text, flags=re.IGNORECASE).strip() 16 | 17 | # 2. 在字符串中找到第一个 { 或 [ 18 | start = min( 19 | (cleaned.find("{") if "{" in cleaned else float("inf")), 20 | (cleaned.find("[") if "[" in cleaned else float("inf")), 21 | ) 22 | if start == float("inf"): 23 | raise ValueError(f"❌ No JSON start symbol found in: {text[:200]}") 24 | 25 | # 3. 截取可能的JSON部分 26 | candidate = cleaned[start:] 27 | 28 | # 4. 尝试从后往前找到匹配的 } 或 ] 29 | end_brace = candidate.rfind("}") 30 | end_bracket = candidate.rfind("]") 31 | end = max(end_brace, end_bracket) 32 | if end == -1: 33 | raise ValueError(f"❌ No JSON end symbol found in: {text[:200]}") 34 | 35 | candidate = candidate[:end + 1] 36 | 37 | # 5. 
尝试解析 38 | try: 39 | return json.loads(candidate) 40 | except json.JSONDecodeError as e: 41 | raise ValueError(f"❌ JSON decode error: {e}\nExtracted candidate:\n{candidate[:500]}") 42 | -------------------------------------------------------------------------------- /main_researcher.py: -------------------------------------------------------------------------------- 1 | import random 2 | from pathlib import Path 3 | 4 | from evaluator.backtest_with_wq import run_backtest_by_wq_api 5 | from evaluator.backtest_with_wq_mul import run_backtest_mul_by_wq_api 6 | from researcher.construct_prompts import build_wq_knowledge_prompt, build_check_if_blog_helpful, \ 7 | build_blog_to_hypothesis 8 | from researcher.generate_alpha import generate_alphas_from_template 9 | from researcher.generate_template import from_post_to_template 10 | from scraper.preprocess_texts import preprocess_all_html_posts 11 | from scraper.scrap_posts_from_wq import scrape_new_posts 12 | from utils.template_field_gener import generate_template_fields_v2 13 | from utils.template_op_gener import generate_template_ops 14 | from utils.wq_info_loader import OpAndFeature 15 | 16 | if __name__ == "__main__": 17 | 18 | # alpha researcher -------------------------------- 19 | opAndFeature = OpAndFeature() 20 | opAndFeature.get_operators() 21 | opAndFeature.get_data_fields() 22 | 23 | generate_template_ops() 24 | generate_template_fields_v2() 25 | 26 | POSTS_DIR = Path("data/wq_posts/helpful_posts") 27 | for json_file in POSTS_DIR.glob("*.json"): 28 | 29 | template_file = from_post_to_template(str(json_file)) 30 | if template_file is None: 31 | continue 32 | alphas_file = generate_alphas_from_template(template_file) 33 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import random 2 | from pathlib import Path 3 | 4 | from evaluator.backtest_with_wq import run_backtest_by_wq_api 5 | from evaluator.backtest_with_wq_mul import run_backtest_mul_by_wq_api 6 | from researcher.construct_prompts import build_wq_knowledge_prompt, build_check_if_blog_helpful, \ 7 | build_blog_to_hypothesis 8 | from researcher.generate_alpha import generate_alphas_from_template 9 | from researcher.generate_template import from_post_to_template 10 | from scraper.preprocess_texts import preprocess_all_html_posts 11 | from scraper.scrap_posts_from_wq import scrape_new_posts 12 | from utils.template_field_gener import generate_template_fields_v2 13 | from utils.template_op_gener import generate_template_ops 14 | from utils.wq_info_loader import OpAndFeature 15 | 16 | 17 | if __name__ == "__main__": 18 | # data scraper ------------------------------------ 19 | scrape_new_posts(limit=200) 20 | preprocess_all_html_posts() 21 | 22 | 23 | # alpha researcher -------------------------------- 24 | opAndFeature = OpAndFeature() 25 | opAndFeature.get_operators() 26 | opAndFeature.get_data_fields() 27 | 28 | generate_template_ops() 29 | generate_template_fields_v2() 30 | 31 | POSTS_DIR = Path("data/wq_posts/helpful_posts") 32 | for json_file in POSTS_DIR.glob("*.json"): 33 | 34 | template_file = from_post_to_template(str(json_file)) 35 | if template_file is None: 36 | continue 37 | alphas_file = generate_alphas_from_template(template_file) 38 | 39 | 40 | # alpha evaluator ---------------------------------- 41 | ALPHA_DIR = Path("data/alpha_db_v2/all_alphas") 42 | json_files = list(ALPHA_DIR.glob("*.json")) 43 | random.shuffle(json_files) 44 | for 
json_file in json_files: 45 | backtest_result = run_backtest_mul_by_wq_api(json_file) 46 | -------------------------------------------------------------------------------- /utils/config_loader.py: -------------------------------------------------------------------------------- 1 | import os 2 | import yaml 3 | from pathlib import Path 4 | from threading import Lock 5 | 6 | 7 | class ConfigLoader: 8 | """ 9 | A singleton configuration loader for the entire project. 10 | Priority: Environment Variables > config.yaml 11 | """ 12 | _instance = None 13 | _config = {} 14 | _lock = Lock() 15 | 16 | def __new__(cls, config_path: str = "config.yaml"): 17 | with cls._lock: 18 | if cls._instance is None: 19 | cls._instance = super().__new__(cls) 20 | cls._instance._load_config(config_path) 21 | return cls._instance 22 | 23 | def _load_config(self, config_path: str): 24 | config_file = Path(config_path) 25 | if not config_file.exists(): 26 | raise FileNotFoundError(f"Config file not found at {config_path}") 27 | 28 | with open(config_file, "r", encoding="utf-8") as f: 29 | yaml_config = yaml.safe_load(f) or {} 30 | 31 | # 环境变量优先(如不在环境变量中则用 config.yaml 值) 32 | self._config = { 33 | "openai_base_url": os.getenv("OPENAI_BASE_URL", yaml_config.get("openai_base_url")), 34 | "openai_api_key": os.getenv("OPENAI_API_KEY", yaml_config.get("openai_api_key")), 35 | "openai_model_name": os.getenv("OPENAI_MODEL_NAME", yaml_config.get("openai_model_name")), 36 | "reasoner_model_name": os.getenv("REASONER_MODEL_NAME", yaml_config.get("reasoner_model_name")), 37 | 38 | "worldquant_account": os.getenv("WORLDQUANT_ACCOUNT", yaml_config.get("worldquant_account")), 39 | "worldquant_password": os.getenv("WORLDQUANT_PASSWORD", yaml_config.get("worldquant_password")), 40 | "worldquant_login_url": os.getenv("WORLDQUAN_LOGIN_URL", yaml_config.get("worldquant_login_url")), 41 | "worldquant_api_auth": os.getenv("WORLDQUAN_API_AUTH", yaml_config.get("worldquant_api_auth")), 42 | "worldquant_consultant_posts_url": os.getenv("WORLDQUANT_CONSULTANT_POSTS_URL", 43 | yaml_config.get("worldquant_consultant_posts_url")), 44 | 45 | "enabled_field_datasets": yaml_config.get("enabled_field_datasets", []) 46 | } 47 | 48 | # 确保是列表格式 49 | if not isinstance(self._config["enabled_field_datasets"], list): 50 | self._config["enabled_field_datasets"] = [self._config["enabled_field_datasets"]] 51 | 52 | @classmethod 53 | def get(cls, key: str, default=None): 54 | """ 55 | 获取配置值。 56 | 使用方法:ConfigLoader.get("openai_api_key") 57 | """ 58 | if cls._instance is None: 59 | cls() # 初始化 60 | return cls._instance._config.get(key, default) 61 | 62 | @classmethod 63 | def all(cls) -> dict: 64 | """ 65 | 获取完整配置字典。 66 | """ 67 | if cls._instance is None: 68 | cls() 69 | return cls._instance._config.copy() 70 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: alphaspire 2 | channels: 3 | - defaults 4 | - conda-forge 5 | dependencies: 6 | - annotated-types=0.6.0 7 | - anyio=4.10.0 8 | - async-timeout=4.0.3 9 | - beautifulsoup4=4.13.5 10 | - blas=1.0 11 | - bottleneck=1.4.2 12 | - brotlicffi=1.0.9.2 13 | - bzip2=1.0.8 14 | - ca-certificates=2025.9.9 15 | - certifi=2025.8.3 16 | - cffi=1.17.1 17 | - charset-normalizer=3.3.2 18 | - distro=1.9.0 19 | - expat=2.7.1 20 | - greenlet=3.2.4 21 | - h11=0.16.0 22 | - httpcore=1.0.9 23 | - httpx=0.28.1 24 | - idna=3.7 25 | - jiter=0.6.1 26 | - joblib=1.5.2 27 | - jsonpatch=1.33 28 | - 
jsonpointer=3.0.0 29 | - langchain=0.3.25 30 | - langchain-core=0.3.58 31 | - langchain-openai=0.3.16 32 | - langchain-text-splitters=0.3.8 33 | - langsmith=0.3.39 34 | - libcxx=20.1.8 35 | - libffi=3.4.4 36 | - libgfortran5=11.3.0 37 | - libopenblas=0.3.30 38 | - libzlib=1.3.1 39 | - llvm-openmp=20.1.8 40 | - loguru=0.7.2 41 | - lz4-c=1.9.4 42 | - ncurses=6.5 43 | - numexpr=2.11.0 44 | - openai=1.77.0 45 | - openssl=3.5.3 46 | - orjson=3.10.14 47 | - pandas=2.3.2 48 | - pycparser=2.23 49 | - pydantic=2.11.9 50 | - pydantic-core=2.33.2 51 | - pysocks=1.7.1 52 | - python=3.10.18 53 | - pip 54 | - python-dateutil=2.9.0post0 55 | - python-tzdata=2025.2 56 | - pytz=2025.2 57 | - pyyaml=6.0.2 58 | - readline=8.3 59 | - regex=2024.11.6 60 | - requests=2.32.5 61 | - requests-toolbelt=1.0.0 62 | - scikit-learn=1.7.2 63 | - scipy=1.15.3 64 | - six=1.17.0 65 | - soupsieve=2.5 66 | - sqlalchemy=2.0.43 67 | - sqlite=3.50.2 68 | - tenacity=9.0.0 69 | - threadpoolctl=3.5.0 70 | - tiktoken=0.9.0 71 | - tk=8.6.15 72 | - tqdm=4.67.1 73 | - typing-inspection=0.4.0 74 | - typing_extensions=4.15.0 75 | - tzdata=2025b 76 | - urllib3=2.5.0 77 | - wheel=0.45.1 78 | - xz=5.6.4 79 | - yaml=0.2.5 80 | - zlib=1.3.1 81 | - zstandard=0.23.0 82 | - zstd=1.5.6 83 | - pip: 84 | - appnope==0.1.4 85 | - asttokens==3.0.0 86 | - attrs==25.3.0 87 | - backcall==0.2.0 88 | - bleach==6.2.0 89 | - cloudscraper==1.2.71 90 | - contourpy==1.3.2 91 | - cycler==0.12.1 92 | - decorator==5.2.1 93 | - defusedxml==0.7.1 94 | - docopt==0.6.2 95 | - exceptiongroup==1.3.0 96 | - executing==2.2.1 97 | - fastjsonschema==2.21.2 98 | - fonttools==4.60.1 99 | - hdbscan==0.8.40 100 | - ipython==8.12.3 101 | - jedi==0.19.2 102 | - jinja2==3.1.6 103 | - jsonschema==4.25.1 104 | - jsonschema-specifications==2025.9.1 105 | - jupyter-client==8.6.3 106 | - jupyter-core==5.8.1 107 | - jupyterlab-pygments==0.3.0 108 | - kiwisolver==1.4.9 109 | - markupsafe==3.0.3 110 | - matplotlib==3.10.7 111 | - matplotlib-inline==0.1.7 112 | - mistune==3.1.4 113 | - nbclient==0.10.2 114 | - nbconvert==7.16.6 115 | - nbformat==5.10.4 116 | - numpy==2.2.6 117 | - outcome==1.3.0.post0 118 | - packaging==25.0 119 | - pandocfilters==1.5.1 120 | - parso==0.8.5 121 | - pexpect==4.9.0 122 | - pickleshare==0.7.5 123 | - pillow==11.3.0 124 | - pip==25.2 125 | - pipreqs==0.5.0 126 | - platformdirs==4.5.0 127 | - playwright==1.55.0 128 | - prompt-toolkit==3.0.52 129 | - ptyprocess==0.7.0 130 | - pure-eval==0.2.3 131 | - pyee==13.0.0 132 | - pygments==2.19.2 133 | - pyparsing==3.2.5 134 | - python-dotenv==1.1.1 135 | - pyzmq==27.1.0 136 | - referencing==0.36.2 137 | - rpds-py==0.27.1 138 | - selenium==4.35.0 139 | - setuptools==80.9.0 140 | - sniffio==1.3.1 141 | - sortedcontainers==2.4.0 142 | - stack-data==0.6.3 143 | - tinycss2==1.4.0 144 | - tornado==6.5.2 145 | - traitlets==5.14.3 146 | - trio==0.30.0 147 | - trio-websocket==0.12.2 148 | - typing-extensions==4.14.1 149 | - undetected-chromedriver==3.5.5 150 | - wcwidth==0.2.14 151 | - webdriver-manager==4.0.2 152 | - webencodings==0.5.1 153 | - websocket-client==1.8.0 154 | - websockets==15.0.1 155 | - wsproto==1.2.0 156 | - yarg==0.1.9 157 | prefix: /Users/panzhuoran/miniconda3/envs/alphaspire 158 | -------------------------------------------------------------------------------- /researcher/generate_alpha.py: -------------------------------------------------------------------------------- 1 | # generate_alpha.py 2 | import json 3 | import re 4 | import csv 5 | from itertools import product 6 | from pathlib import Path 7 | 8 | 
BASE_DIR = Path(__file__).resolve().parents[1]
9 | OPERATORS_FILE = BASE_DIR / "data" / "wq_template_operators" / "template_operators.csv"
10 | FIELDS_FILE = BASE_DIR / "data" / "wq_template_fields" / "template_fields.json"
11 | ALPHA_DB = BASE_DIR / "data" / "alpha_db_v2" / "all_alphas"
12 | ALPHA_DB.mkdir(parents=True, exist_ok=True)
13 | 
14 | # === Maximum number of alphas generated in a single run ===
15 | MAX_ALPHAS = 100000
16 | 
17 | 
18 | def load_operator_type_map():
19 |     """Read template_operators.csv and return {type: [name, ...]}"""
20 |     operator_map = {}
21 |     with open(OPERATORS_FILE, "r", encoding="utf-8") as f:
22 |         reader = csv.DictReader(f)
23 |         for row in reader:
24 |             op_type = row["type"].strip()
25 |             name = row["name"].strip()
26 |             operator_map.setdefault(op_type, []).append(name)
27 |     return operator_map
28 | 
29 | 
30 | def load_field_type_map():
31 |     """Read template_fields.json and return {final_category: [id, ...]}"""
32 |     handled_field_map = {}
33 |     with open(FIELDS_FILE, "r", encoding="utf-8") as f:
34 |         field_map = json.load(f)
35 |         for raw_type, ids in field_map.items():
36 |             # Strip the surrounding angle brackets from the key (keys are assumed to be written as "<...>")
37 |             clean_type = re.sub(r"^<|>$", "", raw_type).strip()
38 |             handled_field_map[clean_type] = ids
39 |     return handled_field_map
40 | 
41 | 
42 | 
43 | def extract_placeholders(expression):
44 |     """Extract the "<...>" placeholders from a template expression"""
45 |     return re.findall(r"<([^<>]+)>", expression)
46 | 
47 | 
48 | def generate_alphas_from_template(template_path):
49 |     """Generate all concrete alphas from an alpha_template.json file"""
50 |     # === Load template ===
51 |     with open(template_path, "r", encoding="utf-8") as f:
52 |         template_json = json.load(f)
53 | 
54 |     template_expr = template_json["TemplateExpression"]
55 |     template_name = Path(template_path).stem
56 | 
57 |     # === Load mappings ===
58 |     operator_map = load_operator_type_map()
59 |     field_map = load_field_type_map()
60 | 
61 |     # === Extract placeholders ===
62 |     placeholders = extract_placeholders(template_expr)
63 |     if not placeholders:
64 |         print("❌ No placeholders found in the template, nothing to expand; using the template itself as the alpha")
65 |         all_alphas = []
66 |         all_alphas.append({"alpha": template_expr, "fields_or_ops_used": []})
67 |         out_file = ALPHA_DB / f"{template_name}_alphas.json"
68 |         with open(out_file, "w", encoding="utf-8") as f:
69 |             json.dump({
70 |                 "Template": template_expr,
71 |                 "GeneratedAlphas": all_alphas
72 |             }, f, indent=2, ensure_ascii=False)
73 |         print(f"✅ Generated 1 alpha saved to {out_file}")
74 |         return out_file
75 | 
76 |     # === Build the list of replacements for each placeholder ===
77 |     replacements_list = []
78 |     for ph in placeholders:
79 |         # Decide whether the placeholder is an operator type or a field type
80 |         if ph in operator_map:
81 |             replacements_list.append(operator_map[ph])
82 |         elif ph in field_map:
83 |             replacements_list.append(field_map[ph])
84 |         else:
85 |             print(f"❌ Placeholder type not found in the field or operator maps: {ph}")
86 |             return None
87 | 
88 |     # === Cartesian-product substitution ===
89 |     all_alphas = []
90 |     count = 0
91 |     total_combinations = 1
92 |     for lst in replacements_list:
93 |         total_combinations *= len(lst)
94 |     if total_combinations > MAX_ALPHAS:
95 |         print(f"⚠️ Warning: Total possible alphas {total_combinations} exceeds MAX_ALPHAS={MAX_ALPHAS}. "
96 |               f"Only generating the first {MAX_ALPHAS} combinations.")
97 | 
98 |     for combo in product(*replacements_list):
99 |         expr = template_expr
100 |         # Replace placeholders one by one (each placeholder is assumed to appear as "<name>" in the expression)
101 |         for ph, val in zip(placeholders, combo):
102 |             expr = re.sub(rf"<{re.escape(ph)}>", val, expr, count=1)
103 |         all_alphas.append({
104 |             "alpha": expr,
105 |             "fields_or_ops_used": combo
106 |         })
107 |         count += 1
108 |         if count >= MAX_ALPHAS:  # exit early once the limit is reached
109 |             break
110 | 
111 |     # === Save ===
112 |     out_file = ALPHA_DB / f"{template_name}_alphas.json"
113 |     with open(out_file, "w", encoding="utf-8") as f:
114 |         json.dump({
115 |             "Template": template_expr,
116 |             "GeneratedAlphas": all_alphas
117 |         }, f, indent=2, ensure_ascii=False)
118 | 
119 |     print(f"✅ Generated {len(all_alphas)} alphas saved to {out_file}")
120 |     return out_file
121 | 
122 | 
123 | if __name__ == "__main__":
124 |     test_template = BASE_DIR / "data" / "template_db" / "your_alpha_template.json"
125 |     generate_alphas_from_template(test_template)
126 | 
--------------------------------------------------------------------------------
/utils/template_op_gener.py:
--------------------------------------------------------------------------------
1 | import csv, json
2 | from pathlib import Path
3 | 
4 | BASE = Path(__file__).resolve().parents[1]
5 | OP_DIR = BASE / "data" / "wq_operators"
6 | IN_CSV = OP_DIR / "operators.csv"
7 | 
8 | TEMP_DIR = BASE / "data" / "wq_template_operators"
9 | TEMP_DIR.mkdir(parents=True, exist_ok=True)
10 | OUT_CSV = TEMP_DIR / "template_operators.csv"
11 | OUT_JSON = TEMP_DIR / "template_operators.json"
12 | 
13 | TYPE_MAP = {
14 |     "Arithmetic:NAry": {"add","multiply","max","min"},
15 |     "Arithmetic:Binary": {"subtract","divide","power","signed_power"},
16 |     "Arithmetic:Unary": {"sign","abs","log","sqrt","inverse","reverse","tanh","sigmoid"},
17 |     "Logical:Unary": {"is_nan","not"},
18 |     "Logical:Binary": {"and","or","less","equal","greater","not_equal","less_equal","greater_equal"},
19 |     "Conditional": {"if_else","trade_when"},
20 |     "TS:Aggregation": {"ts_mean","ts_sum","ts_std_dev","ts_product","ts_av_diff","ts_min_diff","ts_zscore","ts_skewness","ts_entropy","ts_count_nans"},
21 |     "TS:WindowIndex": {"ts_arg_max","ts_arg_min","kth_element","last_diff_value","days_from_last_change","ts_step"},
22 |     "TS:CorrelationRegression": {"ts_corr","ts_covariance","ts_regression"},
23 |     "TS:Transform": {"ts_backfill","ts_delay","ts_decay_linear","ts_target_tvr_decay","ts_scale","ts_quantile","ts_rank","ts_delta","ts_min_max_cps","ts_min_max_diff","hump"},
24 |     "CrossSection:Standardize": {"winsorize","rank","zscore","scale","normalize","quantile"},
25 |     "CrossSection:RegressionProj": {"regression_proj"},
26 |     "Group:Aggregation": {"bucket","densify","group_mean","group_rank","group_extra","group_backfill","group_scale","group_zscore","group_neutralize","group_cartesian_product"},
27 |     "Vector:LinearAlgebra": {"vector_proj","vector_neut","vec_sum","vec_avg"},
28 |     "Special:Domain": {"inst_pnl"},
29 | }
30 | 
31 | 
32 | def generate_template_ops():
33 |     # invert for lookups
34 |     op_to_type = {}
35 |     for t,s in TYPE_MAP.items():
36 |         for op in s:
37 |             op_to_type[op] = t
38 | 
39 |     rows = []
40 |     unknown = set()
41 |     if not IN_CSV.exists():
42 |         raise FileNotFoundError(f"{IN_CSV} not found.")
43 | 
44 |     with open(IN_CSV,newline="",encoding="utf-8") as f:
45 |         reader = csv.DictReader(f)
46 |         for r in reader:
47 |             name = (r.get("name") or "").strip()
48 |             t = op_to_type.get(name, "UNKNOWN")
49 |             # assign signature templates by type
50 |             signature = ""
51 |             if t == "Arithmetic:NAry":
52 | 
signature = "NAry(x1,x2,...[,options])" 53 | elif t == "Arithmetic:Binary": 54 | signature = "Binary(x,y[,options])" 55 | elif t == "Arithmetic:Unary": 56 | signature = "Unary(x[,options])" 57 | elif t == "Logical:Unary": 58 | signature = "LogicalUnary(x)" 59 | elif t == "Logical:Binary": 60 | signature = "LogicalBinary(a,b)" 61 | elif t == "Conditional": 62 | signature = "Conditional(cond, a, b) or Conditional(cond, value)" 63 | elif t == "TS:Aggregation": 64 | signature = "TS_Agg(x,d[,options])" 65 | elif t == "TS:WindowIndex": 66 | signature = "TS_Index(x,d[,k])" 67 | elif t == "TS:CorrelationRegression": 68 | signature = "TS_Bivariate(y,x,d[,options])" 69 | elif t == "TS:Transform": 70 | signature = "TS_Transform(x,d[,params])" 71 | elif t == "CrossSection:Standardize": 72 | signature = "CS_Std(x[,params])" 73 | elif t == "CrossSection:RegressionProj": 74 | signature = "regression_proj(y,x)" 75 | elif t == "Group:Aggregation": 76 | signature = "GroupOp(x, group[,params])" 77 | elif t == "Vector:LinearAlgebra": 78 | signature = "VectorOp(x[,y])" 79 | elif t == "Special:Domain": 80 | signature = "Special(...)" 81 | else: 82 | signature = "" 83 | 84 | if t == "UNKNOWN": 85 | unknown.add(name) 86 | r_out = dict(r) 87 | r_out["type"] = t 88 | r_out["signature_template"] = signature 89 | rows.append(r_out) 90 | 91 | # write CSV 92 | if rows: 93 | fieldnames = list(rows[0].keys()) 94 | with open(OUT_CSV,"w",newline="",encoding="utf-8") as f: 95 | writer = csv.DictWriter(f,fieldnames=fieldnames) 96 | writer.writeheader() 97 | writer.writerows(rows) 98 | with open(OUT_JSON,"w",encoding="utf-8") as f: 99 | json.dump(rows,f,ensure_ascii=False,indent=2) 100 | 101 | print("Wrote", OUT_CSV, OUT_JSON) 102 | if unknown: 103 | print("Unknown operators to classify manually:", sorted(unknown)) 104 | 105 | 106 | if __name__ == "__main__": 107 | generate_template_ops() -------------------------------------------------------------------------------- /scraper/preprocess_texts.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import unicodedata 4 | from pathlib import Path 5 | from bs4 import BeautifulSoup 6 | from langchain_openai import ChatOpenAI 7 | from loguru import logger 8 | 9 | from researcher.construct_prompts import build_check_if_blog_helpful 10 | from utils.config_loader import ConfigLoader 11 | 12 | BASE_DIR = Path(__file__).resolve().parents[1] 13 | RAW_DIR = BASE_DIR / "data" / "wq_posts" / "raw_posts" 14 | RAW_DIR.mkdir(parents=True, exist_ok=True) 15 | PROCESSED_DIR = BASE_DIR / "data" / "wq_posts" / "processed_posts" 16 | PROCESSED_DIR.mkdir(parents=True, exist_ok=True) 17 | HELPFUL_DIR = BASE_DIR / "data" / "wq_posts" / "helpful_posts" 18 | HELPFUL_DIR.mkdir(parents=True, exist_ok=True) 19 | 20 | 21 | def clean_text(text: str) -> str: 22 | """ 23 | 清洗文本中非utf-8字符,统一为 NFC 格式 24 | """ 25 | if not text: 26 | return "" 27 | # 先 encode/decode 丢弃非法字符 28 | cleaned = text.encode("utf-8", errors="ignore").decode("utf-8", errors="ignore") 29 | # 再做 Unicode 归一化,避免奇怪的变体 30 | cleaned = unicodedata.normalize("NFC", cleaned) 31 | return cleaned.strip() 32 | 33 | def extract_post_info(html_content: str) -> dict: 34 | """从单个HTML中抽取 description, title, post-body, post-comments""" 35 | soup = BeautifulSoup(html_content, "html.parser") 36 | 37 | # description 38 | description = "" 39 | meta_desc = soup.find("meta", attrs={"name": "description"}) 40 | if meta_desc and meta_desc.get("content"): 41 | description = 
meta_desc.get("content").strip() 42 | if not description: # 备用 43 | og_desc = soup.find("meta", property="og:description") 44 | if og_desc and og_desc.get("content"): 45 | description = og_desc.get("content").strip() 46 | 47 | # title 48 | title = "" 49 | og_title = soup.find("meta", property="og:title") 50 | if og_title and og_title.get("content"): 51 | title = og_title.get("content").strip() 52 | if not title and soup.title: 53 | title = soup.title.string.strip() 54 | 55 | # post-body 56 | post_body = "" 57 | body_div = soup.find("div", class_="post-body") 58 | if body_div: 59 | post_body = body_div.get_text("\n", strip=True) 60 | 61 | # comments(section.comment-body) 62 | comments = [] 63 | for section in soup.select("section.comment-body"): 64 | text = section.get_text("\n", strip=True) 65 | if text: 66 | comments.append(text) 67 | 68 | return { 69 | "title": title, 70 | "description": description, 71 | "post_body": post_body, 72 | "post_comments": comments, 73 | } 74 | 75 | def preprocess_all_html_posts() -> None: 76 | """批量处理RAW_DIR下所有未处理的html文件""" 77 | raw_files = list(RAW_DIR.glob("*.html")) 78 | logger.info(f"Found {len(raw_files)} raw html files.") 79 | processed_count = 0 80 | 81 | for raw_file in raw_files: 82 | post_id = raw_file.stem # 文件名不带后缀 83 | out_file = PROCESSED_DIR / f"{post_id}.json" 84 | if out_file.exists(): 85 | continue # 已处理 86 | 87 | logger.info(f"Processing {raw_file.name}...") 88 | html_content = raw_file.read_text(encoding="utf-8", errors="ignore") 89 | 90 | # 再做一层clean,避免HTML内的非法字符 91 | html_content = clean_text(html_content) 92 | 93 | post_info = extract_post_info(html_content) 94 | 95 | with open(out_file, "w", encoding="utf-8") as f: 96 | json.dump(post_info, f, ensure_ascii=False, indent=2) 97 | 98 | processed_count += 1 99 | logger.info(f"Saved processed JSON to {out_file}") 100 | 101 | if check_if_post_helpful(out_file): 102 | helpful_file = HELPFUL_DIR / f"{post_id}.json" 103 | with open(helpful_file, "w", encoding="utf-8") as f: 104 | json.dump(post_info, f, ensure_ascii=False, indent=2) 105 | 106 | logger.info(f"Total processed new files: {processed_count}") 107 | 108 | 109 | def check_if_post_helpful(post_file): 110 | base_url = ConfigLoader.get("openai_base_url") 111 | api_key = ConfigLoader.get("openai_api_key") 112 | model_name = ConfigLoader.get("openai_model_name") 113 | 114 | llm = ChatOpenAI( 115 | base_url=base_url, 116 | api_key=api_key, 117 | model=model_name, 118 | temperature=0.2, 119 | ) 120 | formatted = build_check_if_blog_helpful(post_file) 121 | 122 | try: 123 | resp = llm.invoke(formatted) 124 | if hasattr(resp, "content"): 125 | answer = resp.content.strip() 126 | else: 127 | answer = str(resp).strip() 128 | 129 | print(f"🔎 Model output for if {post_file} helpful: {answer}") 130 | 131 | if answer.upper().startswith("Y"): 132 | return True 133 | else: 134 | return False 135 | except Exception as e: 136 | print(f"⚠️ check_if_post_helpful error: {e}") 137 | return False 138 | 139 | 140 | if __name__ == "__main__": 141 | preprocess_all_html_posts() -------------------------------------------------------------------------------- /researcher/generate_template.py: -------------------------------------------------------------------------------- 1 | import random 2 | import json 3 | from pathlib import Path 4 | 5 | from researcher.construct_prompts import build_wq_knowledge_prompt, build_blog_to_hypothesis, \ 6 | build_hypothesis_to_template, build_check_if_blog_helpful 7 | from utils.config_loader import ConfigLoader 8 | 9 | from 
langchain_openai import ChatOpenAI 10 | from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder 11 | from langchain.memory import ConversationBufferMemory 12 | from langchain.chains import LLMChain 13 | 14 | from utils.json_dealer import extract_json 15 | 16 | # --- 路径 --- 17 | BASE_DIR = Path(__file__).resolve().parents[1] 18 | POSTS_DIR = BASE_DIR / "data" / "wq_posts" / "helpful_posts" 19 | HYPOTHESIS_DB = BASE_DIR / "data" / "hypothesis_db_v2" 20 | TEMPLATE_DB = BASE_DIR / "data" / "template_db_v2" 21 | PROMPT_FILE = BASE_DIR / "prompts" / "template_generating.yaml" 22 | 23 | HYPOTHESIS_DB.mkdir(parents=True, exist_ok=True) 24 | TEMPLATE_DB.mkdir(parents=True, exist_ok=True) 25 | 26 | 27 | # === LangChain Agent 初始化 === 28 | def init_agent(system_prompt): 29 | """初始化长时运行的Agent并注入System Prompt""" 30 | base_url = ConfigLoader.get("openai_base_url") 31 | api_key = ConfigLoader.get("openai_api_key") 32 | model_name = ConfigLoader.get("openai_model_name") 33 | 34 | llm = ChatOpenAI( 35 | base_url=base_url, 36 | api_key=api_key, 37 | model=model_name, 38 | temperature=0.2, 39 | ) 40 | 41 | # memory 保存上下文 42 | memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True) 43 | 44 | # PromptTemplate — system prompt + user placeholder 45 | prompt = ChatPromptTemplate.from_messages([ 46 | ("system", system_prompt), 47 | MessagesPlaceholder(variable_name="chat_history"), 48 | ("user", "{input}") # 后续只传 input 49 | ]) 50 | 51 | chain = LLMChain( 52 | llm=llm, 53 | prompt=prompt, 54 | memory=memory 55 | ) 56 | 57 | return chain 58 | 59 | 60 | # === 随机选择有用的 Blog Post === 61 | def select_valid_post(chain): 62 | post_files = list(POSTS_DIR.glob("*.json")) 63 | if not post_files: 64 | raise FileNotFoundError("❌ No blog post found in processed_posts folder") 65 | 66 | # while True: 67 | # post_file = random.choice(post_files) 68 | # formatted = build_check_if_blog_helpful(post_file) 69 | # output = chain.run(input=formatted).strip() 70 | # 71 | # if output == "Y": 72 | # print(f"✅ Selected blog post: {post_file}") 73 | # return post_file 74 | # else: 75 | # print(f"⚠️ Skipping blog post: {post_file} (not helpful)") 76 | return random.choice(post_files) 77 | 78 | 79 | def check_if_post_helpful(chain, post_file): 80 | formatted = build_check_if_blog_helpful(post_file) 81 | output = chain.run(input=formatted).strip() 82 | if output.upper().startswith("Y"): 83 | return True 84 | else: 85 | return False 86 | 87 | 88 | # === 生成 Hypotheses === 89 | def generate_hypotheses(chain, post_file): 90 | formatted = build_blog_to_hypothesis(post_file) 91 | output = chain.run(input=formatted).strip() 92 | 93 | try: 94 | hypotheses = extract_json(output) 95 | except Exception: 96 | raise ValueError(f"❌ Hypotheses output not valid JSON: {output}") 97 | 98 | out_file = HYPOTHESIS_DB / f"{Path(post_file).stem}_hypotheses.json" 99 | with open(out_file, "w", encoding="utf-8") as f: 100 | json.dump(hypotheses, f, indent=2, ensure_ascii=False) 101 | 102 | print(f"✅ Hypotheses saved: {out_file}") 103 | return out_file 104 | 105 | 106 | # === 生成 Template === 107 | def generate_template(chain, hypotheses_file): 108 | formatted = build_hypothesis_to_template(hypotheses_file) 109 | output = chain.run(input=formatted).strip() 110 | 111 | try: 112 | template_json = extract_json(output) 113 | except Exception: 114 | print(f"❌ Template output not valid JSON: {output}") 115 | return None 116 | 117 | out_file = TEMPLATE_DB / f"{Path(hypotheses_file).stem}_template.json" 118 | with open(out_file, "w", 
encoding="utf-8") as f: 119 | json.dump(template_json, f, indent=2, ensure_ascii=False) 120 | 121 | print(f"✅ Template saved: {out_file}") 122 | return out_file 123 | 124 | 125 | # === 主流程 === 126 | def from_post_to_template(post_file: str=None): 127 | system_prompt = build_wq_knowledge_prompt() 128 | 129 | chain = init_agent(system_prompt) 130 | 131 | # Step 1: 选择 blog 132 | if post_file: 133 | post_stem = Path(post_file).stem 134 | existing_template = TEMPLATE_DB / f"{post_stem}_hypotheses_template.json" 135 | if existing_template.exists(): 136 | print(f"✅ Template already exists for {post_file}, skipping template and alpha generation.") 137 | return None 138 | blog_file = post_file 139 | 140 | # if check_if_post_helpful(chain, post_file): 141 | # blog_file = post_file 142 | # else: 143 | # print(f"⚠️ Skipping blog post: {post_file} (not helpful)") 144 | # return None 145 | else: 146 | blog_file = select_valid_post(chain) 147 | 148 | # Step 2: 生成 Hypotheses 149 | hypotheses_file = generate_hypotheses(chain, blog_file) 150 | 151 | # Step 3: 生成 Template 152 | template_file = generate_template(chain, hypotheses_file) 153 | 154 | print(f"🎯 Finished: Template generated from {blog_file} successfully.") 155 | return template_file 156 | 157 | 158 | if __name__ == "__main__": 159 | from_post_to_template() 160 | -------------------------------------------------------------------------------- /scraper/scrap_posts_from_wq.py: -------------------------------------------------------------------------------- 1 | """ 2 | scrap_posts_from_wq.py — Playwright版 3 | 打开浏览器,手动登录后抓取 WorldQuant Consultant 帖子,支持多页。 4 | """ 5 | 6 | import time 7 | import datetime 8 | import csv 9 | from pathlib import Path 10 | from bs4 import BeautifulSoup 11 | from loguru import logger 12 | from playwright.sync_api import sync_playwright 13 | 14 | from utils.config_loader import ConfigLoader 15 | 16 | # --- 目录 --- 17 | BASE_DIR = Path(__file__).resolve().parents[1] 18 | RAW_DIR = BASE_DIR / "data" / "wq_posts" / "raw_posts" 19 | RAW_DIR.mkdir(parents=True, exist_ok=True) 20 | INDEX_FILE = RAW_DIR / "index.csv" 21 | COOKIES_FILE = RAW_DIR / "cookies.json" 22 | 23 | 24 | def _load_existing_ids(): 25 | """加载已抓取帖子ID""" 26 | if not INDEX_FILE.exists(): 27 | return set() 28 | with open(INDEX_FILE, "r", encoding="utf-8") as f: 29 | reader = csv.DictReader(f) 30 | return {row["id"] for row in reader} 31 | 32 | 33 | def _save_index_row(post_meta: dict): 34 | """追加写入index.csv""" 35 | file_exists = INDEX_FILE.exists() 36 | with open(INDEX_FILE, "a", newline="", encoding="utf-8") as f: 37 | fieldnames = ["id", "title", "url", "time"] 38 | writer = csv.DictWriter(f, fieldnames=fieldnames) 39 | if not file_exists: 40 | writer.writeheader() 41 | writer.writerow(post_meta) 42 | 43 | 44 | def _save_raw_html(post_id: str, html_content: str): 45 | """保存原始HTML文件""" 46 | now_str = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") 47 | file_path = RAW_DIR / f"{now_str}_{post_id}.html" 48 | with open(file_path, "w", encoding="utf-8") as f: 49 | f.write(html_content) 50 | logger.info(f"Saved raw HTML to {file_path}") 51 | 52 | 53 | def scrape_new_posts(limit: int = 20): 54 | """ 55 | Playwright抓取WorldQuant Consultant新帖子 56 | """ 57 | topic_url = ConfigLoader.get("worldquant_consultant_posts_url") 58 | existing_ids = _load_existing_ids() 59 | new_posts_meta = [] 60 | 61 | with sync_playwright() as p: 62 | browser = p.chromium.launch(headless=False) 63 | context = browser.new_context() 64 | page = context.new_page() 65 | 66 | logger.info(f"Navigating to 
topic page: {topic_url}") 67 | page.goto(topic_url) 68 | 69 | logger.info("请在浏览器中完成登录,如需。完成后在终端按回车继续...") 70 | input("已登录并看到帖子列表后按回车...") 71 | 72 | fetched_count = 0 73 | has_next = True 74 | while has_next and fetched_count < limit: 75 | # 等主文档加载 76 | try: 77 | page.wait_for_load_state("load", timeout=60000) 78 | except: 79 | logger.warning("load_state timeout, continue anyway") 80 | time.sleep(5) 81 | 82 | html = page.content() 83 | soup = BeautifulSoup(html, "html.parser") 84 | 85 | # 抽取帖子链接 86 | post_links = soup.select("a[href*='/community/posts/']") 87 | logger.info(f"Found {len(post_links)} post links on this page.") 88 | 89 | for link in post_links: 90 | if fetched_count >= limit: 91 | break 92 | post_url = link.get("href") 93 | if not post_url: 94 | continue 95 | full_url = ( 96 | post_url 97 | if post_url.startswith("http") 98 | else "https://support.worldquantbrain.com" + post_url 99 | ) 100 | 101 | import re 102 | m = re.search(r"/posts/(\d+)", full_url) 103 | if not m: 104 | continue 105 | post_id = m.group(1) 106 | 107 | if post_id in existing_ids: 108 | context.storage_state(path=str(COOKIES_FILE)) 109 | logger.info(f"Saved cookies to {COOKIES_FILE}") 110 | browser.close() 111 | logger.info(f"Total new posts scraped: {len(new_posts_meta)}") 112 | return new_posts_meta 113 | 114 | 115 | # 抓帖子详情页 116 | page.goto(full_url) 117 | try: 118 | page.wait_for_load_state("load", timeout=60000) 119 | except: 120 | logger.warning("Timeout loading post page, continue anyway") 121 | time.sleep(3) 122 | html_content = page.content() 123 | _save_raw_html(post_id, html_content) 124 | 125 | title = link.get_text(strip=True) 126 | post_meta = { 127 | "id": post_id, 128 | "title": title, 129 | "url": full_url, 130 | "time": "", 131 | } 132 | _save_index_row(post_meta) 133 | new_posts_meta.append(post_meta) 134 | fetched_count += 1 135 | logger.info(f"New post scraped: {post_meta}") 136 | 137 | # 回到列表页 138 | page.goto(topic_url) 139 | try: 140 | page.wait_for_load_state("load", timeout=60000) 141 | except: 142 | logger.warning("Timeout loading topic page, continue anyway") 143 | time.sleep(5) 144 | 145 | # 翻页:找“Next”按钮 146 | next_button = page.locator("a:has-text('Next')") 147 | if next_button.count() > 0 and fetched_count < limit: 148 | logger.info("Clicking Next button...") 149 | next_button.first.click() 150 | try: 151 | page.wait_for_load_state("load", timeout=60000) 152 | except: 153 | logger.warning("Timeout after clicking Next, continue anyway") 154 | time.sleep(5) 155 | else: 156 | has_next = False 157 | 158 | # 保存cookies 159 | context.storage_state(path=str(COOKIES_FILE)) 160 | logger.info(f"Saved cookies to {COOKIES_FILE}") 161 | 162 | browser.close() 163 | 164 | logger.info(f"Total new posts scraped: {len(new_posts_meta)}") 165 | return new_posts_meta 166 | 167 | 168 | if __name__ == "__main__": 169 | new_posts = scrape_new_posts() 170 | print(f"Scraped {len(new_posts)} new posts.") 171 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # AlphaSpire 2 | 3 | AI agent-driven fully automated Alpha mining. 4 | 5 | ----- 6 | 7 | ## ⚠️ Attention 8 | 9 | Please note that this tool only performs backtest simulations on the platform (Simulate) and does not execute any submission actions (Submit). **Excessive simulations without valid submissions may result in your account being locked.** Please make sure to limit the number of backtests when using this tool. 
10 | 11 | ## Work Pipeline 12 | 13 | ```scss 14 | ┌────────────────────────────┐ 15 | │ Data Acquisition Module │ 16 | │ (Scraper & Local Storage) │ 17 | └─────────────┬──────────────┘ 18 | │ 19 | ┌─────────────▼──────────────┐ 20 | │ Preprocessing & Parsing │ 21 | │ (Text cleaning, Metadata) │ 22 | └─────────────┬──────────────┘ 23 | │ 24 | ┌─────────────▼──────────────┐ 25 | │ Researcher (LLM) │ 26 | │ (LangChain / Prompting) │ 27 | └─────────────┬──────────────┘ 28 | │ 29 | ┌─────────────▼──────────────┐ 30 | │ Alpha Template Generating │ 31 | │ (Factor/Signal Extraction) │ 32 | └─────────────┬──────────────┘ 33 | │ 34 | ┌─────────────▼──────────────┐ 35 | │ Evaluator │ 36 | │ (Submit to WQ or local sim)│ 37 | └─────────────┬──────────────┘ 38 | │ 39 | ┌─────────────▼──────────────┐ 40 | │ Results Collecting │ 41 | │ (Store Scores, Rank Alphas)│ 42 | └─────────────┬──────────────┘ 43 | │ 44 | ┌─────────────▼──────────────┐ 45 | │Results analyzing &Iteration│ 46 | │ │ 47 | └────────────────────────────┘ 48 | ``` 49 | 50 | ## Architecture 51 | ### data 52 | The data folder stores posts, operators, fields, operator types, field types, generated alpha templates, generated alphas, and backtesting results information. 53 | 54 | ### prompts 55 | Stores prompt templates used in workflows. 56 | 57 | ### utils 58 | 1. Get the operator set of Fast Expression and save it to ./data/wq_operators 59 | 2. Get the field collection of Fast Expression and save it to ./data/wq_fields 60 | 3. Manual rules classify operators and generate operator types required by templates. (save to ./data/wq_template_operators) 61 | 4. Clustering and large language models are used to assist in generating the field types used in templates. (save to ./data/wq_template_fields) 62 | 5. Various text processing and other miscellaneous. 63 | 64 | ### scraper 65 | 1. Scrape posts from the WorldQuant forum and store them in ./data/wq_posts/raw_posts. 66 | 2. Extract the main information from the original html text of the post into a json file and store it in ./data/wq_posts/processed_posts 67 | 3. Use a large language model to determine whether the post has the potential to generate alpha, and if so, save it to ./data/wq_posts/helpful_posts 68 | 69 | ### researcher 70 | 1. Use LLM to summarize hypothesis from blog and save it to ./data/hypothesis_db_v2 71 | 2. Use LLM to generate template from hypothesis and save it to ./data/template_db_v2 72 | 3. Multiple loops generate alphas based on template, field type, and operator type and save them to ./data/alpha_db_v2/all_alphas 73 | 74 | ### evaluator 75 | Use the WorldQuant backtesting API to evaluate alpha performance and save the results to data/alpha_db_v2/backtest_result 76 | 77 | 78 | ## Deployment 79 | 80 | 1. Create a conda environment 81 | ```bash 82 | conda env create -f environment.yml 83 | conda activate alphaspire 84 | ``` 85 | 2. Fill in the configuration file (config.yaml) 86 | ```yaml 87 | # =============================== 88 | # Global Configuration File 89 | # =============================== 90 | 91 | # --- OpenAI API Settings --- 92 | openai_base_url: "todo" # e.g. https://api.deepseek.com 93 | openai_api_key: "todo" # e.g. sk-... 94 | openai_model_name: "todo" # e.g. deepseek-chat 95 | reasoner_model_name: "todo" # e.g. 
deepseek-reasoner 96 | 97 | # --- WorldQuant Platform Credentials --- 98 | worldquant_account: "todo" 99 | worldquant_password: "todo" 100 | 101 | worldquant_login_url: "https://platform.worldquantbrain.com/sign-in" 102 | worldquant_api_auth: "https://api.worldquantbrain.com/authentication" 103 | worldquant_consultant_posts_url: "https://support.worldquantbrain.com/hc/en-us/community/topics/18910956638743-顾问专属中文论坛" 104 | # You can also choose any other WorldQuant Forum URL you have access to. 105 | 106 | # --- Dataset from WorldQuant Brain 107 | enabled_field_datasets: # Select the field database you want to use to build alphas. 108 | - pv1 # Database name reference ./data/wq_fields 109 | - fundamental6 110 | - analyst4 111 | - model16 112 | - news12 113 | ``` 114 | 115 | 3. One-click operation 116 | * Full process operation 117 | ```bash 118 | python3 main.py 119 | ``` 120 | * Only run post crawl 121 | ```bash 122 | python3 main_scraper.py 123 | ``` 124 | * Only run Alpha generation 125 | ```bash 126 | python3 main_researcher.py 127 | ``` 128 | * Only run Alpha backtests 129 | ```bash 130 | ./test_script.sh 131 | ``` 132 | OR 133 | ```bash 134 | python3 main_evaluator.py 135 | ``` 136 | 137 | ## Notice 138 | 139 | Regarding WorldQuant backtest parameter settings, currently only manual modification of the parameters in the following code section in evaluator/backtest_with_wq_mul.py is supported. 140 | These will be moved to a separate config file in future versions. 141 | 142 | ``` 143 | payload = { 144 | "type": "REGULAR", 145 | "settings": { 146 | "instrumentType": "EQUITY", 147 | "region": "ASI", 148 | "universe": "MINVOL1M", 149 | "delay": 1, 150 | "decay": 6, 151 | "neutralization": "SUBINDUSTRY", 152 | "truncation": 0.01, 153 | "pasteurization": "ON", 154 | "unitHandling": "VERIFY", 155 | "nanHandling": "ON", 156 | "maxTrade": "ON", 157 | "language": "FASTEXPR", 158 | "visualization": False, 159 | }, 160 | "regular": fixed_expr 161 | } 162 | ``` 163 | -------------------------------------------------------------------------------- /utils/template_field_gener.py: -------------------------------------------------------------------------------- 1 | import json 2 | import csv 3 | import logging 4 | from pathlib import Path 5 | from typing import List, Dict 6 | import pandas as pd 7 | from sklearn.feature_extraction.text import TfidfVectorizer 8 | from openai import OpenAI 9 | import hdbscan 10 | import warnings 11 | warnings.filterwarnings( 12 | "ignore", 13 | message="'force_all_finite' was renamed to 'ensure_all_finite'", 14 | category=FutureWarning, 15 | ) 16 | 17 | from utils.config_loader import ConfigLoader 18 | 19 | BASE_DIR = Path(__file__).resolve().parents[1] 20 | FIELDS_DIR = BASE_DIR / "data" / "wq_fields" 21 | OUTPUT_DIR = BASE_DIR / "data" / "wq_template_fields" 22 | OUTPUT_DIR.mkdir(parents=True, exist_ok=True) 23 | OUT_JSON = OUTPUT_DIR / "template_fields.json" 24 | 25 | logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s") 26 | 27 | 28 | # ========================= 29 | # Step 1. 
加载所有 csv 文件 30 | # ========================= 31 | def load_all_fields() -> pd.DataFrame: 32 | """读取 data/wq_fields 下所有有效 CSV 文件并合并""" 33 | dfs = [] 34 | for file in FIELDS_DIR.glob("*.csv"): 35 | if file.stat().st_size == 0: 36 | logging.warning(f"⚠️ Skipping empty file (0 bytes): {file.name}") 37 | continue 38 | try: 39 | df = pd.read_csv(file, dtype=str, keep_default_na=False) 40 | if df.empty or len(df.columns) == 0: 41 | logging.warning(f"⚠️ Skipping file with no valid columns: {file.name}") 42 | continue 43 | df["__dataset__"] = file.stem 44 | dfs.append(df) 45 | 46 | except pd.errors.EmptyDataError: 47 | logging.warning(f"⚠️ Skipping malformed file (EmptyDataError): {file.name}") 48 | continue 49 | except pd.errors.ParserError as e: 50 | logging.warning(f"⚠️ Skipping broken CSV (ParserError): {file.name} ({e})") 51 | continue 52 | except Exception as e: 53 | logging.error(f"❌ Unexpected error reading {file.name}: {e}") 54 | continue 55 | if not dfs: 56 | raise RuntimeError(f"❌ No valid CSV files found in {FIELDS_DIR}") 57 | 58 | return pd.concat(dfs, ignore_index=True) 59 | 60 | 61 | # ========================= 62 | # Step 2. 聚类逻辑 63 | # ========================= 64 | def cluster_fields_by_semantics_auto(df: pd.DataFrame, 65 | min_cluster_size: int = 3, 66 | min_samples: int = 2) -> Dict[int, List[str]]: 67 | """ 68 | 使用 HDBSCAN 自动确定聚类数量的语义聚类方法。 69 | 基于 id + description 文本表示。 70 | """ 71 | if len(df) <= min_cluster_size: 72 | # 数据太少,不聚类 73 | return {0: df["id"].tolist()} 74 | 75 | texts = (df["id"].astype(str) + " " + df["description"].astype(str)).tolist() 76 | 77 | # Step 1. TF-IDF 向量化 78 | tfidf = TfidfVectorizer(max_features=2000) 79 | X = tfidf.fit_transform(texts) 80 | 81 | # Step 2. HDBSCAN 聚类 82 | clusterer = hdbscan.HDBSCAN( 83 | min_cluster_size=min_cluster_size, 84 | min_samples=min_samples, 85 | metric='euclidean', 86 | cluster_selection_method='eom' 87 | ) 88 | labels = clusterer.fit_predict(X.toarray()) 89 | 90 | # Step 3. 聚类结果收集 91 | clusters: Dict[int, List[str]] = {} 92 | for idx, label in enumerate(labels): 93 | if label == -1: 94 | # -1 表示噪声,可选择丢弃或单独归类 95 | label = 9999 # 归入“噪声”类 96 | clusters.setdefault(label, []).append(df.iloc[idx]["id"]) 97 | 98 | # 可选:按簇大小排序 99 | clusters = dict(sorted(clusters.items(), key=lambda x: -len(x[1]))) 100 | return clusters 101 | 102 | 103 | # ========================= 104 | # Step 3. 调用 LLM 命名类别 105 | # ========================= 106 | def get_llm_client(): 107 | return OpenAI( 108 | base_url=ConfigLoader.get("openai_base_url"), 109 | api_key=ConfigLoader.get("openai_api_key"), 110 | ) 111 | 112 | 113 | def name_cluster_with_llm(client, type_name: str, dataset: str, sample_texts: List[str]) -> str: 114 | """调用 LLM 生成聚类名称""" 115 | joined = "\n".join(sample_texts) # 取字段描述 116 | prompt = f""" 117 | You are classifying quantitative finance data fields. 118 | Given the dataset = {dataset} and field type = {type_name}. 119 | Below are some field examples: 120 | {joined} 121 | 122 | Please propose a short, meaningful lowercase name (1-3 words) for this group, 123 | like "momentum", "valuation_ratio", "sentiment_score", etc. 124 | Return only the name. 
125 | """ 126 | resp = client.chat.completions.create( 127 | model=ConfigLoader.get("openai_model_name"), 128 | messages=[{"role": "system", "content": "You are a finance data classifier."}, 129 | {"role": "user", "content": prompt}], 130 | temperature=0.3, 131 | ) 132 | name = resp.choices[0].message.content.strip() 133 | # 清理非法字符 134 | name = name.replace(" ", "_").replace("-", "_").lower() 135 | return name 136 | 137 | 138 | # ========================= 139 | # Step 4. 主生成逻辑 140 | # ========================= 141 | def generate_template_fields_v2(): 142 | logging.info("📥 Loading all field csvs...") 143 | df = load_all_fields() 144 | 145 | # 自动检测必要列 146 | expected_cols = {"id", "description", "type"} 147 | if not expected_cols.issubset(df.columns): 148 | raise ValueError(f"Missing required columns in input: {expected_cols - set(df.columns)}") 149 | 150 | client = get_llm_client() 151 | all_mappings = {} 152 | 153 | # 按 dataset + type 分组 154 | grouped = df.groupby(["__dataset__", "type"]) 155 | for (dataset, dtype), subdf in grouped: 156 | if len(subdf) < 3: 157 | print(f"Skipping small group: {dataset}:{dtype} ({len(subdf)})") 158 | continue 159 | 160 | print(f"🧩 Processing dataset={dataset}, type={dtype}, size={len(subdf)}") 161 | clusters = cluster_fields_by_semantics_auto(subdf) 162 | 163 | for cluster_id, field_ids in clusters.items(): 164 | sample_df = subdf[subdf["id"].isin(field_ids)] 165 | sample_texts = (sample_df["id"] + " " + sample_df["description"]).tolist() 166 | type_name = name_cluster_with_llm(client, dtype, dataset, sample_texts) 167 | 168 | key = f"" 169 | all_mappings[key] = field_ids 170 | print(f"✅ Generated type: {key} ({len(field_ids)} fields)") 171 | 172 | # 保存结果 173 | with open(OUT_JSON, "w", encoding="utf-8") as f: 174 | json.dump(all_mappings, f, ensure_ascii=False, indent=2) 175 | print(f"🎯 Saved {len(all_mappings)} template field types to {OUT_JSON}") 176 | 177 | 178 | if __name__ == "__main__": 179 | generate_template_fields_v2() 180 | -------------------------------------------------------------------------------- /evaluator/backtest_with_wq.py: -------------------------------------------------------------------------------- 1 | import json 2 | import csv 3 | import logging 4 | from pathlib import Path 5 | from time import sleep 6 | import requests 7 | from requests.auth import HTTPBasicAuth 8 | 9 | from utils.config_loader import ConfigLoader 10 | 11 | BASE_DIR = Path(__file__).resolve().parents[1] 12 | BACKTEST_DIR = BASE_DIR / "data" / "alpha_db_v2" / "backtest_result" 13 | BACKTEST_DIR.mkdir(parents=True, exist_ok=True) 14 | 15 | logging.basicConfig(filename='backtest_with_wq.log', level=logging.INFO, 16 | format='%(asctime)s - %(levelname)s - %(message)s') 17 | 18 | # ====== 登录并保持 Session ====== 19 | def sign_in(): 20 | """登录 WQ Brain 并返回 session""" 21 | username = ConfigLoader.get('worldquant_account') 22 | password = ConfigLoader.get('worldquant_password') 23 | 24 | sess = requests.Session() 25 | sess.auth = HTTPBasicAuth(username, password) 26 | response = sess.post(ConfigLoader.get('worldquant_api_auth')) 27 | print(f"Login status: {response.status_code}") 28 | return sess 29 | 30 | 31 | def run_backtest_by_wq_api(alphas_json_file): 32 | """回测指定 alphas json 文件""" 33 | sess = sign_in() 34 | 35 | # === 1. 
读 alpha JSON === 36 | with open(alphas_json_file, "r", encoding="utf-8") as f: 37 | data = json.load(f) 38 | 39 | print(f"🔬 Start backtest for {alphas_json_file}") 40 | # 支持两种结构 41 | alphas = [] 42 | if "GeneratedAlphas" in data: 43 | for item in data["GeneratedAlphas"]: 44 | alphas.append(item["alpha"]) 45 | elif isinstance(data, list): 46 | for item in data: 47 | alphas.append(item["alpha"]) 48 | else: 49 | print("❌ 不识别的 alpha JSON 格式") 50 | return None 51 | 52 | template_name = Path(alphas_json_file).stem 53 | out_csv = BACKTEST_DIR / f"{template_name}_backtest.csv" 54 | 55 | # === 2. 读取已存在CSV,跳过已回测 === 56 | finished_alphas = set() 57 | if out_csv.exists(): 58 | with open(out_csv, "r", encoding="utf-8") as f: 59 | reader = csv.DictReader(f) 60 | for row in reader: 61 | finished_alphas.add(row["alpha"]) 62 | print(f"⚠️ 已有 {len(finished_alphas)} 条回测结果,将跳过这些 alpha") 63 | 64 | # === 3. CSV准备写入 === 65 | fieldnames = ["alpha", "sharpe", "turnover", "fitness", "returns", "drawdown", "margin"] 66 | csv_file = open(out_csv, "a", newline="", encoding="utf-8") 67 | writer = csv.DictWriter(csv_file, fieldnames=fieldnames) 68 | if csv_file.tell() == 0: # 空文件时写表头 69 | writer.writeheader() 70 | 71 | # === 4. 循环回测 === 72 | alpha_fail_attempt_tolerance = 15 73 | for index, alpha_expr in enumerate(alphas, start=1): 74 | if alpha_expr in finished_alphas: 75 | print(f"✅ 跳过已回测 alpha: {alpha_expr[:40]}...") 76 | continue 77 | 78 | # 组装模拟参数 79 | alpha_payload = { 80 | "type": "REGULAR", 81 | "settings": { 82 | "instrumentType": "EQUITY", 83 | "region": "USA", 84 | "universe": "TOP3000", 85 | "delay": 1, 86 | "decay": 0, 87 | "neutralization": "SUBINDUSTRY", 88 | "truncation": 0.01, 89 | "pasteurization": "ON", 90 | "unitHandling": "VERIFY", 91 | "nanHandling": "OFF", 92 | "language": "FASTEXPR", 93 | "visualization": False, 94 | }, 95 | "regular": alpha_expr 96 | } 97 | 98 | print(f"[{index}/{len(alphas)}] 回测 alpha: {alpha_expr[:60]}...") 99 | keep_trying = True 100 | failure_count = 0 101 | 102 | # === 4.1 提交 Simulation === 103 | while keep_trying: 104 | try: 105 | sim_resp = sess.post( 106 | 'https://api.worldquantbrain.com/simulations', 107 | json=alpha_payload 108 | ) 109 | if sim_resp.status_code not in (200, 201): 110 | raise RuntimeError(f"Simulation submit failed {sim_resp.status_code}: {sim_resp.text}") 111 | 112 | sim_progress_url = sim_resp.headers.get('Location') 113 | if not sim_progress_url: 114 | raise RuntimeError("❌ No Location header in response") 115 | 116 | print(f"🔎 Alpha simulation location: {sim_progress_url}") 117 | keep_trying = False 118 | except Exception as e: 119 | failure_count += 1 120 | print(f"⚠️ No Location, sleep 15 and retry: {e}") 121 | logging.error(f"No Location, sleep 15 and retry: {e}") 122 | sleep(15) 123 | if failure_count >= alpha_fail_attempt_tolerance: 124 | sess = sign_in() # 重新登录 125 | failure_count = 0 126 | logging.error(f"❌ Too many failures,跳过当前 alpha {alpha_expr}") 127 | break 128 | 129 | # === 4.2 轮询 Simulation 结果 === 130 | if not sim_progress_url: 131 | continue 132 | # 等待完成 133 | finished = False 134 | for _ in range(240): # 最多轮询 240 次 * 15s = 60 分钟 135 | status_resp = sess.get(sim_progress_url) 136 | status_json = status_resp.json() 137 | status = status_json.get("status") 138 | if status == "COMPLETE": 139 | alpha_id = status_json.get("alpha") 140 | finished = True 141 | break 142 | elif status == "ERROR": 143 | print(f"❌ Simulation failed for {alpha_expr}") 144 | finished = False 145 | break 146 | else: 147 | print(f"⏳ Status: {status}, sleep 10s") 
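                # NOTE: the fixed 10-second sleep below is a simple heuristic. If the
                # progress response exposes a suggested wait (for example a "Retry-After"
                # header, which is an assumption about the API), it could be used instead:
                #     sleep(float(status_resp.headers.get("Retry-After", 10)))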
148 |                 sleep(10)
149 |         if not finished:
150 |             continue
151 | 
152 |         # === 4.3 Fetch the alpha metrics ===
153 |         # alpha_resp = sess.get(f'https://api.worldquantbrain.com/alphas/{alpha_id}')
154 |         for attempt in range(20):
155 |             alpha_resp = sess.get(f'https://api.worldquantbrain.com/alphas/{alpha_id}')
156 |             if alpha_resp.status_code == 200:
157 |                 alpha_data = alpha_resp.json()
158 |                 break
159 |             else:
160 |                 print(f"⏳ Alpha {alpha_id} not ready yet, status={alpha_resp.status_code}, retry {attempt + 1}")
161 |                 sleep(5)
162 |         else:
163 |             print(f"❌ Failed to fetch alpha result after retries for alphaId={alpha_id}")
164 |             continue  # or raise instead
165 | 
166 |         is_data = alpha_data.get("is", {})
167 |         result_row = {
168 |             "alpha": alpha_expr,
169 |             "sharpe": is_data.get("sharpe"),
170 |             "turnover": is_data.get("turnover"),
171 |             "fitness": is_data.get("fitness"),
172 |             "returns": is_data.get("returns"),
173 |             "drawdown": is_data.get("drawdown"),
174 |             "margin": is_data.get("margin"),
175 |         }
176 |         writer.writerow(result_row)
177 |         csv_file.flush()
178 |         print(f"✅ Wrote backtest result: sharpe={result_row['sharpe']}, fitness={result_row['fitness']}")
179 | 
180 | 
181 |     csv_file.close()
182 |     print(f"🎯 All backtests finished; results saved to {out_csv}")
183 |     return str(out_csv)
184 | 
185 | 
186 | if __name__ == "__main__":
187 |     test_file = BASE_DIR / "data" / "alpha_db" / "all_alphas" / "your_template_alphas.json"
188 |     run_backtest_by_wq_api(test_file)
--------------------------------------------------------------------------------
/utils/wq_info_loader.py:
--------------------------------------------------------------------------------
1 | import json
2 | import logging
3 | from pathlib import Path
4 | from typing import List, Dict
5 | 
6 | import requests
7 | import pandas as pd
8 | from requests.auth import HTTPBasicAuth
9 | from utils.config_loader import ConfigLoader
10 | 
11 | # --- Directories ---
12 | BASE_DIR = Path(__file__).resolve().parents[1]
13 | WQ_FIELD_DIR = BASE_DIR / "data" / "wq_fields"
14 | WQ_FIELD_DIR.mkdir(parents=True, exist_ok=True)
15 | WQ_OPERATOR_DIR = BASE_DIR / "data" / "wq_operators"
16 | WQ_OPERATOR_DIR.mkdir(parents=True, exist_ok=True)
17 | 
18 | FIELDS_CSV = WQ_FIELD_DIR  # directory that holds one fields CSV per dataset
19 | OPERATORS_CSV = WQ_OPERATOR_DIR / "operators.csv"
20 | 
21 | 
22 | class OpAndFeature:
23 |     def __init__(self):
24 |         self.sess = requests.Session()
25 |         username = ConfigLoader.get("worldquant_account")
26 |         password = ConfigLoader.get("worldquant_password")
27 |         self.setup_auth(username, password)
28 | 
29 |     def setup_auth(self, username, password) -> None:
30 |         """Set up authentication with WorldQuant Brain."""
31 |         self.sess.auth = HTTPBasicAuth(username, password)
32 | 
33 |         print("Authenticating with WorldQuant Brain...")
34 |         response = self.sess.post('https://api.worldquantbrain.com/authentication')
35 |         print(f"Authentication response status: {response.status_code}")
36 |         logging.debug(f"Authentication response: {response.text[:500]}...")
37 | 
38 |         if response.status_code != 201:
39 |             raise Exception(f"Authentication failed: {response.text}")
40 | 
41 |     def get_data_fields(self):
42 |         """Fetch available data fields from WorldQuant Brain across multiple datasets."""
43 | 
44 |         # datasets = ['pv1', 'fundamental6', 'analyst4', 'model16', 'news12']
45 | 
46 |         datasets = ['analyst4',
47 |                     'analyst10',
48 |                     'analyst11',
49 |                     'analyst14',
50 |                     'analyst15',
51 |                     'analyst16',
52 |                     'analyst35',
53 |                     'analyst40',
54 |                     'analyst69',
55 |                     'earnings3',
56 |                     'earnings5',
57 |                     'fundamental17',
58 |                     'fundamental22',
59 |                     'fundamental23',
60 |
'fundamental28', 61 | 'fundamental31', 62 | 'fundamental44', 63 | 'fundamental6', 64 | 'fundamental7', 65 | 'fundamental72', 66 | 'model109', 67 | 'model110', 68 | 'model138', 69 | 'model16', 70 | 'model176', 71 | 'model219', 72 | 'model238', 73 | 'model244', 74 | 'model26', 75 | 'model262', 76 | 'model264', 77 | 'model29', 78 | 'model30', 79 | 'model307', 80 | 'model32', 81 | 'model38', 82 | 'model53', 83 | 'model77', 84 | 'news12', 85 | 'news20', 86 | 'news23', 87 | 'news52', 88 | 'news66', 89 | 'other128', 90 | 'other432', 91 | 'other450', 92 | 'other455', 93 | 'other460', 94 | 'other496', 95 | 'other551', 96 | 'other553', 97 | 'other699', 98 | 'other83', 99 | 'pv1', 100 | 'pv13', 101 | 'pv173', 102 | 'pv29', 103 | 'pv37', 104 | 'pv53', 105 | 'pv72', 106 | 'pv73', 107 | 'pv96', 108 | 'risk60', 109 | 'risk66', 110 | 'risk70', 111 | 'sentiment21', 112 | 'sentiment22', 113 | 'sentiment26', 114 | 'shortinterest6', 115 | 'univ1'] 116 | 117 | base_params = { 118 | 'delay': 1, 119 | 'instrumentType': 'EQUITY', 120 | 'limit': 50, 121 | 'region': 'USA', 122 | 'universe': 'TOP3000' 123 | } 124 | 125 | try: 126 | print("Requesting data fields from multiple datasets...") 127 | for dataset in datasets: 128 | print("------------" + dataset + "--------------\n") 129 | des = str(FIELDS_CSV) + "/" + dataset + ".csv" 130 | if Path(des).exists(): 131 | print(f"Fields CSV already exists at {des}, skipping download.") 132 | continue 133 | 134 | all_fields = [] 135 | params = base_params.copy() 136 | params['dataset.id'] = dataset 137 | 138 | print(f"Getting field count for dataset: {dataset}") 139 | count_response = self.sess.get('https://api.worldquantbrain.com/data-fields', params=params) 140 | 141 | if count_response.status_code == 200: 142 | count_data = count_response.json() 143 | total_fields = count_data.get('count', 0) 144 | print(f"Total fields in {dataset}: {total_fields}") 145 | 146 | params['limit'] = 50 147 | 148 | for offset in range(0, total_fields, params['limit']): 149 | params['offset'] = offset 150 | response = self.sess.get('https://api.worldquantbrain.com/data-fields', params=params) 151 | 152 | if response.status_code == 200: 153 | data = response.json() 154 | fields = data.get('results', []) 155 | print(f"Fetched {len(fields)} fields at offset={offset}") 156 | all_fields.extend(fields) 157 | else: 158 | print(f"Failed to fetch fields for {dataset} at offset={offset}: {response.text[:500]}") 159 | else: 160 | print(f"Failed to get count for {dataset}: {count_response.text[:500]}") 161 | 162 | # 去重 163 | unique_fields = {field['id']: field for field in all_fields}.values() 164 | unique_fields = list(unique_fields) 165 | 166 | # 保存到 CSV 167 | df = pd.DataFrame(unique_fields) 168 | df.to_csv(des, index=False, encoding='utf-8') 169 | print(f"✅ Saved fields CSV to {des}") 170 | 171 | return 172 | 173 | except Exception as e: 174 | print(f"Failed to fetch data fields: {e}") 175 | return 176 | 177 | def get_operators(self) -> List[Dict]: 178 | """Fetch available operators from WorldQuant Brain.""" 179 | if OPERATORS_CSV.exists(): 180 | print(f"Operators CSV already exists at {OPERATORS_CSV}, skipping download.") 181 | return pd.read_csv(OPERATORS_CSV).to_dict(orient='records') 182 | 183 | print("Requesting operators...") 184 | response = self.sess.get('https://api.worldquantbrain.com/operators') 185 | print(f"Operators response status: {response.status_code}") 186 | logging.debug(f"Operators response: {response.text[:500]}...") # Print first 500 chars 187 | 188 | if response.status_code != 
200:
189 |             raise Exception(f"Failed to get operators: {response.text}")
190 | 
191 |         data = response.json()
192 |         if isinstance(data, list):
193 |             operators = data
194 |         elif 'results' in data:
195 |             operators = data['results']
196 |         else:
197 |             raise Exception(f"Unexpected operators response format. Response: {data}")
198 | 
199 |         df = pd.DataFrame(operators)
200 |         df.to_csv(OPERATORS_CSV, index=False, encoding='utf-8')
201 |         print(f"✅ Saved operators CSV to {OPERATORS_CSV}")
202 | 
203 |         return operators
204 | 
--------------------------------------------------------------------------------
/prompts/template_generating.yaml:
--------------------------------------------------------------------------------
1 | inject_wq_knowledge: |-
2 |   # Role
3 |   You are a professional quantitative researcher at a hedge fund.
4 | 
5 |   # Task
6 |   You are going to generate alpha templates for alpha mining in the form of WorldQuant Fast Expression.
7 |   Please take note of the following WorldQuant Fast Expression knowledge base:
8 | 
9 |   ## Fields and Definitions:
10 |   {{ fields_and_definitions }}
11 | 
12 |   ## Operators and Definitions:
13 |   {{ operators_and_definitions }}
14 | 
15 |   ## Field Types with Included Fields:
16 |   {{ field_types }}
17 | 
18 |   ## Operator Types with Included Operators (operator types follow the format):
19 |   {{ operator_types }}
20 | 
21 |   This knowledge represents all the allowed building blocks for alpha template construction.
22 |   - Each field and operator listed above can be used directly.
23 |   - Each field type and operator type (e.g., , ) can also be used as a placeholder.
24 |   - When generating alpha templates, you **must** use only the fields, operators, and types listed above, plus numeric literals.
25 |   - Please remember these fields and operators well and do not create undefined symbols in your future work.
26 |   - Do not use scientific notation for decimals (such as 1e-9), as WorldQuant Brain does not support it.
27 | 
28 | 
29 | check_if_blog_helpful: |-
30 |   # Role
31 |   You are a quantitative researcher at a hedge fund.
32 | 
33 |   # Task
34 |   You are going to mine alphas based on posts from WorldQuant Brain.
35 |   You are given a blog post:
36 |   -------------------blog start-------------------
37 |   {{ blog_post }}
38 |   --------------------blog end--------------------
39 |   Please determine whether it is possible to generate alphas or alpha templates from this blog. Output a 'Y' if possible, otherwise output an 'N', and be careful not to output anything else.
40 | 
41 | blog_to_hypothesis: |-
42 |   # Role
43 |   Now you are a quantitative researcher at a hedge fund.
44 | 
45 |   # Task
46 |   Your task is to extract **quantitative trading hypotheses** from a blog post.
47 | 
48 |   # Guideline
49 |   Follow these steps carefully:
50 |   1. Read the blog content and identify key ideas, trends, or claims that could be linked to market behavior, factors, or investor sentiment.
51 |   2. Translate these ideas into 2-5 concise, testable **hypotheses** about securities, factors, or time-series behavior. Each hypothesis should be phrased as a cause-and-effect statement or a relationship between variables, suitable for an alpha research project.
52 |   3. When possible, highlight which types of data fields (, , , , , etc.) and which categories of operators (, , , , , etc.) might be relevant to test the hypothesis. Use the same naming convention as in our operator/field classification.
53 |   4. Output a numbered list of hypotheses. For each hypothesis provide:
54 |   - **Hypothesis**: A short sentence stating the relationship or effect.
55 | - **Rationale**: Why this relationship may hold according to the blog’s content. 56 | - **Potential Fields/Operators**: Which **field types or operator types** may be relevant (e.g., , ). 57 | 5. The output should conform to the json format. DO NOT output anything other than the json structure.(just to illustrate, do not rely on fields or operators in it) 58 | Here is an output Json example: 59 | [ 60 | { 61 | "Hypothesis": "During periods of heightened market volatility, technology sector stocks tend to experience increased trading volume relative to other sectors.", 62 | "Rationale": "The blog argues that institutional investors seek growth opportunities in tech during uncertainty, driving abnormal trading volume compared to the broader market.", 63 | "Potential Fields": ["", ""], 64 | "Potential Operators": ["", ""] 65 | }, 66 | { 67 | "Hypothesis": "Stocks with stronger analyst earnings revisions outperform their peers in the following quarter.", 68 | "Rationale": "The blog notes that upward revisions by analysts tend to precede positive stock price movements and excess returns.", 69 | "Potential Fields": ["", ""], 70 | "Potential Operators": ["", "", ""] 71 | } 72 | ] 73 | 74 | Constraints: 75 | - Hypotheses should be specific enough to inspire a factor or alpha template but not yet in expression form. 76 | - Do not output any code or alpha template at this stage. 77 | - Do not output any comments within the json structure 78 | 79 | Now you are given a **blog post**: 80 | -------------------blog start------------------- 81 | {{ blog_post }} 82 | --------------------blog end-------------------- 83 | **Please extract quantitative trading hypotheses from it.** 84 | 85 | 86 | hypothesis_to_template: |- 87 | # Role 88 | You are a quantitative researcher at a hedge fund. 89 | You have already been provided the full WorldQuant Fast Expression knowledge base 90 | (fields, operators, field types, operator types) in the system prompt. 91 | To make it clear what "type" includes, we reiterate the operator types and field types in the knowledge base, as well as the specific operators and fields they contain: 92 | ## Field Types with Included Fields: 93 | {{ field_types }} 94 | 95 | ## Operator Types with Included Operators: 96 | {{ operator_types }} 97 | 98 | # Task 99 | You are given **multiple hypotheses**. 100 | Your task: 101 | 1. Review all hypotheses and either synthesize them into **one combined alpha template** 102 | or select the **single most promising hypothesis** to convert into a template. 103 | 2. Using the WorldQuant Fast Expression knowledge base from the system prompt, 104 | generate one valid alpha template expression. 105 | 106 | # Guideline 107 | ## Output format (JSON): 108 | { 109 | "SelectedHypothesis": "string", 110 | "TemplateExpression": "string", 111 | "Description": "string", 112 | "ExpectedBehavior": "string" 113 | } 114 | 115 | ## IMPORTANT: 116 | - You MUST include **between 1 and 3 field/operator types** (placeholders like or ) in the final template expression. 117 | - Templates **without placeholders** are invalid and must not be produced. 118 | - Prefer using types over concrete fields/operators, but concrete fields/operators cannot be ignored when generating template. 119 | - You **must** include at least **1** and at most **4** types(field types or operator types) in template expression. 120 | - Do **not** generate an alpha without using types (types num < 1)!!! 121 | - Do **not** generate a template that contains too many types (types num > 4)!!! 
122 |   - You need to fully consider all the field types and operator types provided to you, and do not rely on the ones given in the examples.
123 | 
124 |   ## Rules:
125 |   - The template must follow the WorldQuant Fast Expression syntax.
126 |   - You may use:
127 |     * Concrete fields and operators from the knowledge base.
128 |     * Field types and operator types from the knowledge base.
129 |     * Field types and operator types from the hypotheses' "Potential Fields" and "Potential Operators".
130 |   - Do **not** use scientific notation for decimals (write 0.0001 instead of 1e-4).
131 |   - The expression must be plausible, concise, and syntactically valid.
132 |   - Do **not** output any comments inside the JSON structure.
133 | 
134 |   ## Good Placeholder Examples (just to illustrate, do not rely on any fields or operators in it):
135 |   - ts_rank(, 20)
136 |   - divide(, (, 10))
137 |   - ()
138 | 
139 |   ## Output Example (just to illustrate, do not rely on fields or operators in it):
140 |   ```
141 |   {
142 |     "SelectedHypothesis": "Alpha signals with very low turnover (below 5%) can be transformed into more useful signals by applying ts_target_tvr_decay with expanded lambda_max parameter to increase turnover towards optimal target ranges.",
143 |     "TemplateExpression": "(ts_delta(, 5), 0, 5, 0.15)",
144 |     "Description": "Applies TS:Transform optimization to a simple 5-day price momentum signal to normalize turnover towards 15% target",
145 |     "ExpectedBehavior": "The expression will adaptively adjust the decay parameter to bring the turnover of the underlying momentum signal closer to the target 15% level, making low-turnover signals more viable while preserving the core alpha signal"
146 |   }
147 |   ```
148 | 
149 |   Now you are given the following **hypotheses**:
150 |   {{ hypotheses }}
151 |   **Please generate your JSON output from it. (DO NOT output any other contents.)**
152 | 
153 | 
--------------------------------------------------------------------------------
/researcher/construct_prompts.py:
--------------------------------------------------------------------------------
1 | import json
2 | import re
3 | import pandas as pd
4 | import yaml
5 | from pathlib import Path
6 | 
7 | from utils.config_loader import ConfigLoader
8 | from utils.text_dealer import truncate_text
9 | 
10 | # --- Paths ---
11 | BASE_DIR = Path(__file__).resolve().parents[1]
12 | PROMPT_FILE = BASE_DIR / "prompts" / "template_generating.yaml"
13 | FIELDS_DIR = BASE_DIR / "data" / "wq_fields"
14 | TEMPLATE_FIELDS_FILE = BASE_DIR / "data" / "wq_template_fields" / "template_fields.json"
15 | OPERATORS_FILE = BASE_DIR / "data" / "wq_template_operators" / "template_operators.csv"
16 | 
17 | 
18 | def build_wq_knowledge_prompt():
19 |     """
20 |     Load the YAML prompt template, build the field / field-type / operator information
21 |     from the datasets enabled in the config, and render the inject_wq_knowledge prompt.
22 |     """
23 | 
24 |     # Load the prompt template
25 |     with open(PROMPT_FILE, "r", encoding="utf-8") as f:
26 |         prompt_yaml = yaml.safe_load(f)
27 |     template_str = prompt_yaml.get("inject_wq_knowledge", "")
28 |     if not template_str:
29 |         raise ValueError("inject_wq_knowledge not found in template_generating.yaml")
30 | 
31 |     # Read the config: enabled datasets
32 |     enabled_datasets = ConfigLoader.get("enabled_field_datasets", [])
33 |     print(f"🔧 Enabled datasets from config: {enabled_datasets}")
34 | 
35 |     # =========================================================
36 |     # Load the field CSVs (only the datasets enabled in config)
37 |     # =========================================================
38 |     field_dfs = []
39 |     for file in FIELDS_DIR.glob("*.csv"):
40 |         dataset_name = file.stem
41 |         if enabled_datasets and dataset_name not in enabled_datasets:
42 |             continue
43 |         if file.stat().st_size == 0:
44 |             print(f"⚠️ Skipping empty file: {file.name}")
45 |             continue
46 | 
47 |         try:
48 |             df = pd.read_csv(file, dtype=str, keep_default_na=False)
49 |             df["__dataset__"] = dataset_name
50 |             field_dfs.append(df)
51 |         except Exception as e:
52 |             print(f"❌ Failed to load {file.name}: {e}")
53 | 
54 |     if not field_dfs:
55 |         raise ValueError("❌ No valid field CSVs loaded. Check config.enabled_field_datasets.")
56 | 
57 |     fields_df = pd.concat(field_dfs, ignore_index=True)
58 | 
59 |     # Build the field definition lines
60 |     fields_info = []
61 |     for _, row in fields_df.iterrows():
62 |         desc = row.get("description", "")
63 |         dtype = row.get("type", "")
64 |         dataset = row.get("__dataset__", "")
65 |         field_str = f"- **{row['id']}** ({dtype}, {dataset}): {desc}"
66 |         fields_info.append(field_str)
67 | 
68 |     fields_and_definitions = "\n".join(fields_info)
69 | 
70 |     # =========================================================
71 |     # Load the field-type mapping (from template_fields.json)
72 |     # =========================================================
73 |     if not TEMPLATE_FIELDS_FILE.exists():
74 |         raise FileNotFoundError(f"❌ template_fields.json not found at {TEMPLATE_FIELDS_FILE}")
75 | 
76 |     with open(TEMPLATE_FIELDS_FILE, "r", encoding="utf-8") as f:
77 |         template_field_data = json.load(f)
78 | 
79 |     # template_fields.json format: { "field_type_name": [list of field ids], ... }
80 |     # Keep only the field types that belong to an enabled dataset
81 |     filtered_field_types = {}
82 | 
83 |     for ftype_full, ids in template_field_data.items():
84 |         # Extract dataset_name from a type key that ends with ":<dataset>/>", e.g. "pv1"
85 |         match = re.search(r":([\w\-]+)\/>$", ftype_full)
86 |         if not match:
87 |             continue
88 |         dataset_name = match.group(1)
89 |         # Keep it only if the dataset is in the enabled list
90 |         if enabled_datasets and dataset_name not in enabled_datasets:
91 |             continue
92 |         filtered_field_types[ftype_full] = ids
93 | 
94 |     # Render field types
95 |     field_types_str = []
96 |     for ftype, fields in filtered_field_types.items():
97 |         field_types_str.append(f"- **{ftype}**: {', '.join(fields)}")
98 |     field_types = "\n".join(field_types_str)
99 | 
100 |     # =========================================================
101 |     # Load the operators file
102 |     # =========================================================
103 |     ops_df = pd.read_csv(OPERATORS_FILE)
104 |     ops_info = []
105 |     op_types_map = {}
106 | 
107 |     for _, row in ops_df.iterrows():
108 |         op_str = f"- **{row['name']}**: {row['definition']} — {row['description']}"
109 |         ops_info.append(op_str)
110 |         op_types_map.setdefault(row['type'], []).append(row['name'])
111 | 
112 |     operators_and_definitions = "\n".join(ops_info)
113 | 
114 |     op_types_str = []
115 |     for otype, ops in op_types_map.items():
116 |         op_types_str.append(f"- **<{otype}/>**: {', '.join(ops)}")  # NOTE: the "<{otype}/>" placeholder tag is an assumed reconstruction; the original tag format was lost
117 |     operator_types = "\n".join(op_types_str)
118 | 
119 |     # =========================================================
120 |     # Render the template
121 |     # =========================================================
122 |     prompt_filled = (
123 |         template_str
124 |         .replace("{{ fields_and_definitions }}", fields_and_definitions)
125 |         .replace("{{ operators_and_definitions }}", operators_and_definitions)
126 |         .replace("{{ field_types }}", field_types)
127 |         .replace("{{ operator_types }}", operator_types)
128 |     )
129 | 
130 |     print("✅ WQ knowledge prompt built successfully.")
131 |     return prompt_filled
132 | 
133 | 
134 | def build_check_if_blog_helpful(blog_json_path: str):
135 |     """
136 |     Load the check_if_blog_helpful template from the YAML file and render it with the blog JSON.
137 |     """
138 |     # 1.
读取yaml 139 | with open(PROMPT_FILE, "r", encoding="utf-8") as f: 140 | prompt_yaml = yaml.safe_load(f) 141 | 142 | template_str = prompt_yaml.get("check_if_blog_helpful", "") 143 | if not template_str: 144 | raise ValueError("check_if_blog_helpful not found in template_generating.yaml") 145 | 146 | # 2. 读取json 147 | with open(blog_json_path, "r", encoding="utf-8") as f: 148 | blog_data = json.load(f) 149 | 150 | # 3. 拼接blog_post文本 151 | # 可以按需求拼接(title+description+post_body+comments) 152 | post_text = f"Title: {blog_data.get('title','')}\n\nDescription: {blog_data.get('description','')}\n\nPost Body: {blog_data.get('post_body','')}\n\nComments:\n" 153 | if blog_data.get("post_comments"): 154 | for i, c in enumerate(blog_data["post_comments"], 1): 155 | post_text += f"[{i}] {c}\n" 156 | 157 | # 4. 替换模板 158 | prompt_filled = template_str.replace("{{ blog_post }}", truncate_text(post_text)) 159 | 160 | return prompt_filled 161 | 162 | 163 | def build_blog_to_hypothesis(blog_json_path: str): 164 | """ 165 | 从yaml读取blog_to_hypothesis模板并用blog_json渲染 166 | """ 167 | # 1. 读取yaml 168 | with open(PROMPT_FILE, "r", encoding="utf-8") as f: 169 | prompt_yaml = yaml.safe_load(f) 170 | 171 | template_str = prompt_yaml.get("blog_to_hypothesis", "") 172 | if not template_str: 173 | raise ValueError("blog_to_hypothesis not found in template_generating.yaml") 174 | 175 | # 2. 读取json 176 | with open(blog_json_path, "r", encoding="utf-8") as f: 177 | blog_data = json.load(f) 178 | 179 | # 3. 拼接blog_post文本 180 | # 可以按需求拼接(title+description+post_body+comments) 181 | post_text = f"Title: {blog_data.get('title','')}\n\nDescription: {blog_data.get('description','')}\n\nPost Body: {blog_data.get('post_body','')}\n\nComments:\n" 182 | if blog_data.get("post_comments"): 183 | for i, c in enumerate(blog_data["post_comments"], 1): 184 | post_text += f"[{i}] {c}\n" 185 | 186 | # 4. 替换模板 187 | prompt_filled = template_str.replace("{{ blog_post }}", truncate_text(post_text)) 188 | 189 | return prompt_filled 190 | 191 | 192 | def build_hypothesis_to_template(hypotheses_json_path: str): 193 | """ 194 | 从yaml读取hypothesis_to_template模板并用hypotheses_json渲染 195 | - 优先使用 template_fields.json(映射 field types -> [field ids]) 196 | - 根据 config 中的 enabled_datasets 过滤 field types (从 中解析 dataset) 197 | - 为防止 token 爆炸,展示每个类型的前 N 个示例并标注总数 198 | """ 199 | import re 200 | from utils.config_loader import ConfigLoader 201 | 202 | # 1. 读取yaml 203 | with open(PROMPT_FILE, "r", encoding="utf-8") as f: 204 | prompt_yaml = yaml.safe_load(f) 205 | 206 | template_str = prompt_yaml.get("hypothesis_to_template", "") 207 | if not template_str: 208 | raise ValueError("hypothesis_to_template not found in template_generating.yaml") 209 | 210 | # 2. 
Read the hypotheses JSON
211 |     with open(hypotheses_json_path, "r", encoding="utf-8") as f:
212 |         hypotheses_data = json.load(f)
213 | 
214 |     hypotheses_str = json.dumps(hypotheses_data, indent=2, ensure_ascii=False)
215 | 
216 |     # --- Load and filter field types (template_fields.json takes priority) ---
217 |     field_types_map = {}
218 | 
219 |     # Datasets the user enabled in the config (may be a comma-separated string or a list)
220 |     enabled = ConfigLoader.get("enabled_field_datasets")
221 |     enabled_datasets = None
222 |     if enabled:
223 |         if isinstance(enabled, str):
224 |             enabled_datasets = [s.strip() for s in enabled.split(",") if s.strip()]
225 |         elif isinstance(enabled, (list, tuple)):
226 |             enabled_datasets = list(enabled)
227 |         else:
228 |             enabled_datasets = None
229 | 
230 |     if TEMPLATE_FIELDS_FILE.exists():
231 |         with open(TEMPLATE_FIELDS_FILE, "r", encoding="utf-8") as f:
232 |             template_field_data = json.load(f)
233 | 
234 |         # Parse each key, extract its dataset, and filter by enabled_datasets
235 |         for ftype_full, ids in template_field_data.items():
236 |             # Extract the dataset name between the last colon and the trailing '/>'
237 |             m = re.search(r":([^/>]+)\/>$", ftype_full)
238 |             dataset_name = m.group(1) if m else None
239 | 
240 |             # If enabled_datasets is set, keep only matching datasets
241 |             if enabled_datasets and dataset_name and dataset_name not in enabled_datasets:
242 |                 continue
243 | 
244 |             # Keep the original id list; it is truncated later when rendered
245 |             field_types_map[ftype_full] = list(ids)
246 |     else:
247 |         raise FileNotFoundError("❌ template_fields.json not found.")
248 | 
249 |     # --- Load the operator-type mapping (kept as-is, but truncated for display) ---
250 |     ops_df = pd.read_csv(OPERATORS_FILE, dtype=str, keep_default_na=False)
251 |     op_types_map = {}
252 |     for _, row in ops_df.iterrows():
253 |         typ = row.get("type", "Other")
254 |         name = row.get("name")
255 |         if name:
256 |             op_types_map.setdefault(typ, []).append(name)
257 | 
258 |     # --- Build readable strings for the prompt: show at most N examples per type to limit tokens ---
259 |     MAX_EXAMPLES_PER_TYPE = 10000  # max examples (fields or operators) shown per type; currently large enough to show everything
260 |     field_types_str_lines = []
261 |     for ftype, ids in field_types_map.items():
262 |         total = len(ids)
263 |         display_ids = ids[:MAX_EXAMPLES_PER_TYPE]
264 |         suffix = "" if total <= MAX_EXAMPLES_PER_TYPE else f", ... (+{total - MAX_EXAMPLES_PER_TYPE} more)"
265 |         field_types_str_lines.append(f"- **{ftype}** ({total} fields): {', '.join(display_ids)}{suffix}")
266 |     field_types = "\n".join(field_types_str_lines)
267 | 
268 |     op_types_str_lines = []
269 |     for otype, ops in op_types_map.items():
270 |         total = len(ops)
271 |         display_ops = ops[:MAX_EXAMPLES_PER_TYPE]
272 |         suffix = "" if total <= MAX_EXAMPLES_PER_TYPE else f", ... (+{total - MAX_EXAMPLES_PER_TYPE} more)"
273 |         op_types_str_lines.append(f"- **<{otype}/>** ({total} ops): {', '.join(display_ops)}{suffix}")  # NOTE: "<{otype}/>" tag format is an assumed reconstruction
274 |     operator_types = "\n".join(op_types_str_lines)
275 | 
276 |     # 4.
替换模板 277 | prompt_filled = ( 278 | template_str 279 | .replace("{{ hypotheses }}", hypotheses_str) 280 | .replace("{{ field_types }}", field_types) 281 | .replace("{{ operator_types }}", operator_types) 282 | ) 283 | 284 | return prompt_filled 285 | 286 | 287 | 288 | 289 | if __name__ == "__main__": 290 | print(build_wq_knowledge_prompt()) 291 | -------------------------------------------------------------------------------- /evaluator/backtest_with_wq_mul.py: -------------------------------------------------------------------------------- 1 | # backtest_with_wq.py 2 | import json 3 | import csv 4 | import logging 5 | from pathlib import Path 6 | from time import sleep 7 | import requests 8 | from openai import OpenAI 9 | from requests.auth import HTTPBasicAuth 10 | 11 | from evaluator.construct_prompts import build_fix_fast_expression_prompt 12 | from utils.config_loader import ConfigLoader 13 | 14 | BASE_DIR = Path(__file__).resolve().parents[1] 15 | BACKTEST_DIR = BASE_DIR / "data" / "alpha_db_v2" / "backtest_result" 16 | BACKTEST_DIR.mkdir(parents=True, exist_ok=True) 17 | 18 | logging.basicConfig(filename='backtest_with_wq.log', level=logging.INFO, 19 | format='%(asctime)s - %(levelname)s - %(message)s') 20 | 21 | 22 | def sign_in(): 23 | """登录 WQ Brain 并返回 session""" 24 | username = ConfigLoader.get('worldquant_account') 25 | password = ConfigLoader.get('worldquant_password') 26 | 27 | sess = requests.Session() 28 | sess.auth = HTTPBasicAuth(username, password) 29 | resp = sess.post(ConfigLoader.get('worldquant_api_auth')) 30 | print(f"Login status: {resp.status_code}") 31 | return sess 32 | 33 | 34 | def run_backtest_mul_by_wq_api(alphas_json_file, batch_size=15): 35 | """批量回测指定 alphas json 文件,采用等待队列方式提升效率""" 36 | sess = sign_in() 37 | 38 | # === 1. 读 alpha JSON === 39 | with open(alphas_json_file, "r", encoding="utf-8") as f: 40 | data = json.load(f) 41 | 42 | print(f"🔬 Start backtest for {alphas_json_file}") 43 | if "GeneratedAlphas" in data: 44 | alphas = [item["alpha"] for item in data["GeneratedAlphas"]] 45 | elif isinstance(data, list): 46 | alphas = [item["alpha"] for item in data] 47 | else: 48 | print("❌ 不识别的 alpha JSON 格式") 49 | return None 50 | 51 | template_name = Path(alphas_json_file).stem 52 | out_csv = BACKTEST_DIR / f"{template_name}_backtest.csv" 53 | 54 | # === 2. 已有结果,跳过 === 55 | finished_alphas = set() 56 | if out_csv.exists(): 57 | with open(out_csv, "r", encoding="utf-8") as f: 58 | reader = csv.DictReader(f) 59 | for row in reader: 60 | finished_alphas.add(row["alpha"]) 61 | print(f"⚠️ 已有 {len(finished_alphas)} 条回测结果,将跳过这些 alpha") 62 | 63 | # === 3. 准备写入 === 64 | fieldnames = ["alpha", "sharpe", "turnover", "fitness", "returns", "drawdown", "margin"] 65 | csv_file = open(out_csv, "a", newline="", encoding="utf-8") 66 | writer = csv.DictWriter(csv_file, fieldnames=fieldnames) 67 | if csv_file.tell() == 0: 68 | writer.writeheader() 69 | 70 | # === 4. 构造 payload 模板 === 71 | def make_payload(expr): 72 | return { 73 | "type": "REGULAR", 74 | "settings": { 75 | "instrumentType": "EQUITY", 76 | "region": "USA", 77 | "universe": "TOP3000", 78 | "delay": 1, 79 | "decay": 0, 80 | "neutralization": "SUBINDUSTRY", 81 | "truncation": 0.01, 82 | "pasteurization": "ON", 83 | "unitHandling": "VERIFY", 84 | "nanHandling": "OFF", 85 | "language": "FASTEXPR", 86 | "visualization": False, 87 | }, 88 | "regular": expr 89 | } 90 | 91 | # === 5. 
提交 & 管理 pending 队列 === 92 | pending = {} # sim_id -> {"alpha": expr, "progress_url": url} 93 | retry_queue = [] 94 | 95 | for i, alpha_expr in enumerate(alphas, 1): 96 | if alpha_expr in finished_alphas: 97 | continue 98 | 99 | # 提交 alpha 100 | try: 101 | resp = sess.post("https://api.worldquantbrain.com/simulations", json=make_payload(alpha_expr)) 102 | if resp.status_code not in (200, 201): 103 | if "SIMULATION_LIMIT_EXCEEDED" in resp.text: 104 | retry_queue.append(alpha_expr) 105 | continue 106 | print(f"❌ 提交失败: {resp.status_code}, {resp.text}") 107 | continue 108 | 109 | sim_url = resp.headers.get("Location") 110 | if not sim_url: 111 | retry_queue.append(alpha_expr) 112 | continue 113 | 114 | sim_id = sim_url.split("/")[-1] 115 | pending[sim_id] = {"alpha": alpha_expr, "progress_url": sim_url, "first_time": True} 116 | 117 | print(f"📩 提交成功: {i}/{len(alphas)} -> {alpha_expr[:50]}...") 118 | 119 | # 控制批量大小 120 | if len(pending) >= batch_size: 121 | monitor_pending(sess, pending, writer, alphas_json_file) 122 | except Exception as e: 123 | logging.error(f"提交 {alpha_expr} 出错: {e}") 124 | retry_queue.append(alpha_expr) 125 | 126 | # 处理剩余的 127 | if pending: 128 | monitor_pending(sess, pending, writer, alphas_json_file) 129 | 130 | csv_file.close() 131 | print(f"🎯 回测完成,结果已保存 {out_csv}") 132 | return str(out_csv) 133 | 134 | 135 | def monitor_pending(sess, pending, writer, alphas_json_file): 136 | """监控 pending 队列直到全部完成""" 137 | client = OpenAI( 138 | base_url=ConfigLoader.get("openai_base_url"), 139 | api_key=ConfigLoader.get("openai_api_key"), 140 | ) 141 | 142 | while pending: 143 | finished_ids = [] 144 | for sim_id, info in list(pending.items()): 145 | try: 146 | status_resp = sess.get(info["progress_url"]) 147 | if status_resp.status_code == 429: 148 | continue 149 | 150 | status_json = status_resp.json() 151 | status = status_json.get("status") 152 | 153 | if status in ("COMPLETE", "WARNING"): 154 | alpha_id = status_json.get("alpha") 155 | if not alpha_id: 156 | finished_ids.append(sim_id) 157 | continue 158 | 159 | # 获取结果 160 | alpha_data = None 161 | for _ in range(10): 162 | alpha_resp = sess.get(f"https://api.worldquantbrain.com/alphas/{alpha_id}") 163 | if alpha_resp.status_code == 200: 164 | alpha_data = alpha_resp.json() 165 | break 166 | sleep(3) 167 | 168 | if not alpha_data: 169 | finished_ids.append(sim_id) 170 | continue 171 | 172 | is_data = alpha_data.get("is", {}) 173 | writer.writerow({ 174 | "alpha": info["alpha"], 175 | "sharpe": is_data.get("sharpe"), 176 | "turnover": is_data.get("turnover"), 177 | "fitness": is_data.get("fitness"), 178 | "returns": is_data.get("returns"), 179 | "drawdown": is_data.get("drawdown"), 180 | "margin": is_data.get("margin"), 181 | }) 182 | finished_ids.append(sim_id) 183 | print(f"✅ 完成: {info['alpha']}... 
fitness={is_data.get('fitness')}") 184 | 185 | elif status == "ERROR": 186 | if info["first_time"]: # 失败直接退出,修复带来的收益过低,时间损耗过高 TODO 187 | # 二次失败,写入 None 188 | writer.writerow({ 189 | "alpha": info["alpha"], 190 | "sharpe": None, 191 | "turnover": None, 192 | "fitness": None, 193 | "returns": None, 194 | "drawdown": None, 195 | "margin": f"FAILED:{status}" 196 | }) 197 | print(f"❌ 二次失败: {info['alpha'][:60]}...") 198 | finished_ids.append(sim_id) 199 | else: 200 | # === 使用 LLM 修复表达式 === 201 | print(f"❌ 模拟失败: {info['alpha'][:60]}...") 202 | fix_exp_prompt = build_fix_fast_expression_prompt(info["alpha"], str(status_json)) 203 | try: 204 | resp = client.chat.completions.create( 205 | model=ConfigLoader.get("reasoner_model_name"), 206 | messages=[ 207 | {"role": "system", "content": "You are an expert in Fast Expression syntax repair."}, 208 | {"role": "user", "content": fix_exp_prompt} 209 | ], 210 | temperature=0.2, 211 | ) 212 | fixed_expr = resp.choices[0].message.content.strip() 213 | print(f"🧩 修复后的表达式: {fixed_expr}") 214 | 215 | # === 替换 alphas_json_file 文件中的旧 alpha 216 | try: 217 | with open(alphas_json_file, "r", encoding="utf-8") as f: 218 | text = f.read() 219 | if info["alpha"] not in text: 220 | print("⚠️ 原始表达式未在文件中找到,跳过替换") 221 | else: 222 | new_text = text.replace(info["alpha"], fixed_expr, 1) # 仅替换第一次出现 223 | with open(alphas_json_file, "w", encoding="utf-8") as f: 224 | f.write(new_text) 225 | print(f"💾 已在 {alphas_json_file} 中替换修复后的表达式") 226 | except Exception as e: 227 | print(f"❌ 替换 {alphas_json_file} 中表达式失败: {e}") 228 | 229 | # === 再次提交修复后的表达式 === 230 | payload = { 231 | "type": "REGULAR", 232 | "settings": { 233 | "instrumentType": "EQUITY", 234 | "region": "ASI", 235 | "universe": "MINVOL1M", 236 | "delay": 1, 237 | "decay": 6, 238 | "neutralization": "SUBINDUSTRY", 239 | "truncation": 0.01, 240 | "pasteurization": "ON", 241 | "unitHandling": "VERIFY", 242 | "nanHandling": "ON", 243 | "maxTrade": "ON", 244 | "language": "FASTEXPR", 245 | "visualization": False, 246 | }, 247 | "regular": fixed_expr 248 | } 249 | 250 | new_resp = sess.post("https://api.worldquantbrain.com/simulations", json=payload) 251 | if new_resp.status_code not in (200, 201): 252 | print(f"⚠️ 修复后提交失败 {new_resp.status_code}: {new_resp.text}") 253 | writer.writerow({ 254 | "alpha": info["alpha"], 255 | "sharpe": None, 256 | "turnover": None, 257 | "fitness": None, 258 | "returns": None, 259 | "drawdown": None, 260 | "margin": "FIX_FAIL_SUBMIT" 261 | }) 262 | finished_ids.append(sim_id) 263 | continue 264 | 265 | new_url = new_resp.headers.get("Location") 266 | if not new_url: 267 | print("⚠️ 修复后提交未返回Location,跳过") 268 | finished_ids.append(sim_id) 269 | continue 270 | 271 | # 替换原 pending 任务为新任务 272 | new_id = new_url.split("/")[-1] 273 | pending[new_id] = { 274 | "alpha": fixed_expr, 275 | "progress_url": new_url, 276 | "first_time": False # 标记为已修复 277 | } 278 | finished_ids.append(sim_id) 279 | print(f"🔁 已重新提交修复后的表达式 {new_id}") 280 | 281 | except Exception as e: 282 | logging.error(f"修复表达式失败: {e}") 283 | writer.writerow({ 284 | "alpha": info["alpha"], 285 | "sharpe": None, 286 | "turnover": None, 287 | "fitness": None, 288 | "returns": None, 289 | "drawdown": None, 290 | "margin": "FIX_FAIL_LLM" 291 | }) 292 | finished_ids.append(sim_id) 293 | 294 | else: 295 | print(f"⏳ {info['alpha'][:40]}... 
simulation status: {status}")
296 | 
297 |             except Exception as e:
298 |                 logging.error(f"Error while checking {sim_id}: {e}")
299 | 
300 |         for fid in finished_ids:
301 |             pending.pop(fid, None)
302 | 
303 |         sleep(5)
304 | 
305 | 
306 | if __name__ == "__main__":
307 |     test_file = BASE_DIR / "data" / "alpha_db" / "all_alphas" / "your_template_alphas.json"
308 |     run_backtest_mul_by_wq_api(test_file)
309 | 
--------------------------------------------------------------------------------
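For reference, a minimal usage sketch (not part of the repository) of how the batch evaluator entry point above can be driven. The file name demo_template_alphas.json and the example Fast Expressions are hypothetical; the JSON layout is one of the two forms that run_backtest_by_wq_api / run_backtest_mul_by_wq_api accept.

import json
from pathlib import Path

from evaluator.backtest_with_wq_mul import run_backtest_mul_by_wq_api

# Hypothetical input file; any path works as long as the JSON layout matches.
demo_file = Path("data/alpha_db/all_alphas/demo_template_alphas.json")
demo_file.parent.mkdir(parents=True, exist_ok=True)
demo_file.write_text(json.dumps({
    "GeneratedAlphas": [
        {"alpha": "rank(ts_delta(close, 5))"},   # example expressions only
        {"alpha": "-ts_rank(volume, 20)"},
    ]
}, indent=2), encoding="utf-8")

# Submits up to batch_size simulations at a time and appends the results to
# data/alpha_db_v2/backtest_result/demo_template_alphas_backtest.csv
run_backtest_mul_by_wq_api(demo_file, batch_size=15)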