├── finnlp ├── data_sources │ ├── __init__.py │ ├── news │ │ ├── __init__.py │ │ ├── _base.py │ │ ├── fmp_streaming.py │ │ ├── akshare_cctv.py │ │ ├── tushare_major_news.py │ │ ├── tipranks_streaming.py │ │ ├── yicai_streaming.py │ │ ├── cnbc_streaming.py │ │ ├── reuters_streaming.py │ │ ├── gurufocus_streaming.py │ │ ├── alliancenews_streaming.py │ │ ├── marketwatch_date_range.py │ │ ├── investorplace_streaming.py │ │ ├── eastmoney_streaming.py │ │ ├── pennystocks_streaming.py │ │ ├── sina_finance_date_range.py │ │ ├── thefly_streaming.py │ │ ├── talkmarkets_streaming.py │ │ ├── seekingalpha_date_range.py │ │ └── marketwatch_streaming.py │ ├── trends │ │ ├── __init__.py │ │ ├── baidu.py │ │ ├── _base.py │ │ └── google.py │ ├── social_media │ │ ├── __init__.py │ │ ├── _base.py │ │ ├── eastmoney_streaming.py │ │ ├── stocktwits_streaming.py │ │ ├── xueqiu_streaming.py │ │ ├── finnhub_sentiment.py │ │ ├── weibo_streaming.py │ │ ├── reddit_streaming.py │ │ ├── twitter_date_range.py │ │ ├── facebook_streaming.py │ │ └── weibo_date_range.py │ ├── company_announcement │ │ ├── __init__.py │ │ ├── _base.py │ │ ├── sina.py │ │ ├── juchao.py │ │ └── sec.py │ ├── sec_filings │ │ ├── prepline_sec_filings │ │ │ ├── api │ │ │ │ ├── __init__.py │ │ │ │ └── app.py │ │ │ └── sections.py │ │ ├── __init__.py │ │ ├── README.md │ │ └── main.py │ ├── datasets │ │ ├── __init__.py │ │ └── load_dataset.py │ ├── earning_calls │ │ ├── __init__.py │ │ ├── main.py │ │ └── utils.py │ └── _base.py ├── large_language_models │ ├── __init__.py │ ├── embeddings │ │ ├── bert.py │ │ ├── __init__.py │ │ └── finbert.py │ ├── openai │ │ ├── __init__.py │ │ ├── openai_chat_agent.py │ │ └── app4gpt_chat_agent.py │ └── sentiment │ │ ├── gpt3.py │ │ ├── paml.py │ │ └── __init__.py ├── benchmarks │ ├── tfns.py │ ├── fpb.py │ ├── nwgi.py │ └── fiqa.py ├── data_engineering │ └── data_cleaning.py └── utils │ └── get_proxy.py ├── docs └── FinNLP │ ├── site │ ├── assets │ │ ├── javascripts │ │ │ └── lunr │ │ │ │ └── min │ │ │ │ ├── lunr.jp.min.js │ │ │ │ ├── lunr.vi.min.js │ │ │ │ ├── lunr.multi.min.js │ │ │ │ ├── lunr.th.min.js │ │ │ │ ├── lunr.ta.min.js │ │ │ │ ├── lunr.zh.min.js │ │ │ │ ├── lunr.ja.min.js │ │ │ │ ├── lunr.hi.min.js │ │ │ │ ├── lunr.stemmer.support.min.js │ │ │ │ ├── lunr.ko.min.js │ │ │ │ ├── lunr.sv.min.js │ │ │ │ ├── lunr.da.min.js │ │ │ │ ├── lunr.no.min.js │ │ │ │ ├── lunr.nl.min.js │ │ │ │ ├── lunr.de.min.js │ │ │ │ └── lunr.du.min.js │ │ ├── images │ │ │ └── favicon.png │ │ └── stylesheets │ │ │ └── palette.a0c5b2b5.min.css.map │ ├── sitemap.xml.gz │ └── sitemap.xml │ ├── mkdocs.yml │ └── docs │ └── zh │ └── index.md ├── requirements.txt ├── .gitignore ├── .gitmodules ├── demo └── README.md ├── LICENSE ├── setup.py ├── markdowns └── codes.md └── test └── en /finnlp/data_sources/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /finnlp/data_sources/news/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /finnlp/data_sources/trends/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /finnlp/data_sources/trends/baidu.py: -------------------------------------------------------------------------------- 1 | 
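The tree above defines the package layout: raw-data collectors live under `finnlp/data_sources` (news, social_media, trends, company_announcement, sec_filings, earning_calls, datasets), model-related code under `finnlp/large_language_models`, and evaluation scripts under `finnlp/benchmarks`. As a rough sketch (assuming the package has been installed from this repo, e.g. with `pip install -e .`, and that extra third-party dependencies such as `pytrends` and `lxml` are available), typical imports follow these module paths directly:

```python
# Import paths implied by the directory layout above; the class names are the
# ones defined in the modules shown later in this listing.
from finnlp.data_sources.news.eastmoney_streaming import Eastmoney_Streaming
from finnlp.data_sources.social_media.weibo_streaming import Weibo_Streaming
from finnlp.data_sources.trends.google import Google_Trends        # needs pytrends
from finnlp.data_sources.earning_calls import EarningCallTranscripts
from finnlp.data_sources.sec_filings import SECFilingsLoader
```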
-------------------------------------------------------------------------------- /finnlp/large_language_models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /finnlp/data_sources/social_media/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /finnlp/large_language_models/embeddings/bert.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /finnlp/large_language_models/openai/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /finnlp/large_language_models/sentiment/gpt3.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /finnlp/large_language_models/sentiment/paml.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /finnlp/data_sources/company_announcement/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /finnlp/large_language_models/embeddings/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /finnlp/large_language_models/embeddings/finbert.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /finnlp/large_language_models/sentiment/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /finnlp/data_sources/sec_filings/prepline_sec_filings/api/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /finnlp/data_sources/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from load_dataset import load_dataset -------------------------------------------------------------------------------- /docs/FinNLP/site/assets/javascripts/lunr/min/lunr.jp.min.js: -------------------------------------------------------------------------------- 1 | module.exports=require("./lunr.ja"); -------------------------------------------------------------------------------- /finnlp/data_sources/earning_calls/__init__.py: -------------------------------------------------------------------------------- 1 | from finnlp.data_sources.earning_calls.main import EarningCallTranscripts 2 | -------------------------------------------------------------------------------- /docs/FinNLP/site/sitemap.xml.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AI4Finance-Foundation/FinNLP/HEAD/docs/FinNLP/site/sitemap.xml.gz 
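Since `finnlp/data_sources/earning_calls/__init__.py` above re-exports `EarningCallTranscripts`, the loader can be used directly from the subpackage. A minimal usage sketch follows; the ticker, year and quarter are illustrative, and `load_data()` makes a network request to the transcript API used in `earning_calls/main.py`:

```python
from finnlp.data_sources.earning_calls import EarningCallTranscripts

# Illustrative values; any valid ticker and a past year/quarter should work.
loader = EarningCallTranscripts(year=2023, ticker="AAPL", quarter="Q3")
doc = loader.load_data()                      # {"text": ..., "metadata": {...}}
print(doc["metadata"]["ticker"], doc["metadata"]["date_time"])
print(doc["text"][:200])                      # start of the transcript text
```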
-------------------------------------------------------------------------------- /finnlp/data_sources/sec_filings/__init__.py: -------------------------------------------------------------------------------- 1 | from finnlp.data_sources.sec_filings.main import SECFilingsLoader 2 | 3 | 4 | 5 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | 2 | akshare 3 | tushare 4 | finnhub-python 5 | parsel 6 | requests 7 | pandas 8 | tqdm 9 | pytz 10 | -------------------------------------------------------------------------------- /docs/FinNLP/site/assets/images/favicon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AI4Finance-Foundation/FinNLP/HEAD/docs/FinNLP/site/assets/images/favicon.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /demo/chatgpt-trading/token_.py 2 | demo/chatgpt-trading/token_.py 3 | */token_.py 4 | *token_.py 5 | 6 | */__pycache__/* 7 | *__pycache__* -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "stocknet-dataset"] 2 | path = stocknet-dataset 3 | url = https://github.com/yumoxu/stocknet-dataset.git 4 | [submodule "Astock"] 5 | path = Astock 6 | url = https://github.com/JinanZou/Astock.git 7 | -------------------------------------------------------------------------------- /finnlp/data_sources/trends/_base.py: -------------------------------------------------------------------------------- 1 | class Trend_Downloader: 2 | 3 | def __init__(self, args = {}): 4 | pass 5 | 6 | def download(self, start_date, end_date, stock = "all"): 7 | pass 8 | 9 | def clean_data(self): 10 | pass 11 | 12 | def gather_one_day(self,date,stock = "all",delay = 0.1): 13 | pass 14 | 15 | def transfer_standard_date_to_nonstandard(self,date): 16 | pass -------------------------------------------------------------------------------- /finnlp/data_sources/social_media/_base.py: -------------------------------------------------------------------------------- 1 | from finnlp.data_sources._base import FinNLP_Downloader 2 | 3 | class Social_Media_Downloader(FinNLP_Downloader): 4 | 5 | def __init__(self, args = {}): 6 | super().__init__(args) 7 | pass 8 | 9 | def download(self, start_date, end_date, stock = "all"): 10 | pass 11 | 12 | def clean_data(self): 13 | pass 14 | 15 | def gather_one_day_news(self,date,stock = "all",delay = 0.1): 16 | pass 17 | 18 | def transfer_standard_date_to_nonstandard(self,date): 19 | pass -------------------------------------------------------------------------------- /finnlp/data_sources/news/_base.py: -------------------------------------------------------------------------------- 1 | from finnlp.data_sources._base import FinNLP_Downloader 2 | 3 | class News_Downloader(FinNLP_Downloader): 4 | 5 | def __init__(self, args = {}): 6 | super().__init__(args) 7 | pass 8 | 9 | def download_date_range(self, start_date, end_date, stock = None): 10 | pass 11 | 12 | def download_streaming(self, stock = None): 13 | pass 14 | 15 | def clean_data(self): 16 | pass 17 | 18 | def _gather_one_part(self, date, stock = None, delay = 0.1): 19 | pass 20 | 21 | def _gather_content(self): 22 | pass 23 | 
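The `_base.py` modules above only define the downloader interfaces; concrete sources subclass them and fill in the download methods. Below is a minimal sketch of the pattern, mirroring the streaming news downloaders later in this listing; the class name, URL, query parameters and `results` key are placeholders, not a real source:

```python
import time
import requests
import pandas as pd
from finnlp.data_sources.news._base import News_Downloader

class Example_Streaming(News_Downloader):
    def __init__(self, args={}):
        super().__init__(args)
        self.dataframe = pd.DataFrame()

    def download_streaming_search(self, keyword="apple", rounds=3, delay=0.5):
        for page in range(rounds):
            # Hypothetical endpoint, shown only to illustrate the interface.
            res = requests.get("https://example.com/api/news",
                               params={"q": keyword, "page": page})
            if res.status_code != 200:
                break
            tmp = pd.DataFrame(res.json().get("results", []))
            self.dataframe = pd.concat([self.dataframe, tmp])
            time.sleep(delay)
        self.dataframe = self.dataframe.reset_index(drop=True)
```

Results are accumulated in `self.dataframe`, which is the convention most concrete downloaders in this repository follow.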
-------------------------------------------------------------------------------- /finnlp/data_sources/company_announcement/_base.py: -------------------------------------------------------------------------------- 1 | from finnlp.data_sources._base import FinNLP_Downloader 2 | 3 | class Company_Announcement_Downloader(FinNLP_Downloader): 4 | 5 | def __init__(self, args = {}): 6 | super().__init__(args) 7 | pass 8 | 9 | def download_date_range_all(self, start_date, end_date): 10 | pass 11 | 12 | def download_date_range_stock(self, start_date, end_date, stock = "AAPL"): 13 | pass 14 | 15 | def download_streaming_all(self, rounds = 3): 16 | pass 17 | 18 | def download_streaming_stock(self, stock = None, rounds = 3): 19 | pass 20 | 21 | def clean_data(self): 22 | pass -------------------------------------------------------------------------------- /finnlp/data_sources/trends/google.py: -------------------------------------------------------------------------------- 1 | from pytrends.request import TrendReq 2 | import pandas as pd 3 | 4 | class Google_Trends: 5 | def __init__(self,args = {}): 6 | # https://github.com/GeneralMills/pytrends 7 | self.pytrends = TrendReq(hl='en-US', tz=360) 8 | 9 | def download(self, start_date, end_date, stock = 'apple' ): 10 | self.date_list = pd.date_range(start_date,end_date) 11 | timeframe = [f"{start_date} {end_date}"] 12 | kw_list = [stock] 13 | self.pytrends.build_payload(kw_list=kw_list, timeframe=timeframe) 14 | res = self.pytrends.interest_over_time() 15 | # res.columns = ["date","value"] 16 | return res 17 | -------------------------------------------------------------------------------- /docs/FinNLP/site/assets/javascripts/lunr/min/lunr.vi.min.js: -------------------------------------------------------------------------------- 1 | !function(e,r){"function"==typeof define&&define.amd?define(r):"object"==typeof exports?module.exports=r():r()(e.lunr)}(this,function(){return function(e){if(void 0===e)throw new Error("Lunr is not present. Please include / require Lunr before this script.");if(void 0===e.stemmerSupport)throw new Error("Lunr stemmer support is not present. 
Please include / require Lunr stemmer support before this script.");e.vi=function(){this.pipeline.reset(),this.pipeline.add(e.vi.stopWordFilter,e.vi.trimmer)},e.vi.wordCharacters="[A-Za-ẓ̀͐́͑̉̃̓ÂâÊêÔôĂ-ăĐ-đƠ-ơƯ-ư]",e.vi.trimmer=e.trimmerSupport.generateTrimmer(e.vi.wordCharacters),e.Pipeline.registerFunction(e.vi.trimmer,"trimmer-vi"),e.vi.stopWordFilter=e.generateStopWordFilter("là cái nhưng mà".split(" "))}}); -------------------------------------------------------------------------------- /docs/FinNLP/site/assets/javascripts/lunr/min/lunr.multi.min.js: -------------------------------------------------------------------------------- 1 | !function(e,t){"function"==typeof define&&define.amd?define(t):"object"==typeof exports?module.exports=t():t()(e.lunr)}(this,function(){return function(e){e.multiLanguage=function(){for(var t=Array.prototype.slice.call(arguments),i=t.join("-"),r="",n=[],s=[],p=0;p 2 | 3 | 4 | None 5 | 2023-08-24 6 | daily 7 | 8 | 9 | None 10 | 2023-08-24 11 | daily 12 | 13 | 14 | None 15 | 2023-08-24 16 | daily 17 | 18 | 19 | None 20 | 2023-08-24 21 | daily 22 | 23 | 24 | None 25 | 2023-08-24 26 | daily 27 | 28 | -------------------------------------------------------------------------------- /finnlp/data_sources/news/fmp_streaming.py: -------------------------------------------------------------------------------- 1 | import json 2 | import requests 3 | import pandas as pd 4 | from tqdm.notebook import tqdm 5 | 6 | df = pd.read_csv("NAS.csv", index_col=0) 7 | stock_list = df.index.to_list() 8 | 9 | api_key = YOUR_API_KEY # You may find your api key here https://site.financialmodelingprep.com/developer/docs/api-keys 10 | 11 | all = pd.DataFrame() 12 | for stock in tqdm(stock_list): 13 | for page in tqdm(range(500)): 14 | url = f"https://financialmodelingprep.com/api/v3/stock_news?tickers={stock}&page={page+1}&apikey={api_key}" 15 | res = requests.get(url) 16 | res = json.loads(res.text) 17 | if len(res) == 0: 18 | break 19 | else: 20 | res = pd.DataFrame(res) 21 | all = pd.concat([all, res]) 22 | 23 | all = all.reset_index(drop=True) 24 | all.to_csv("dataset_more.csv") -------------------------------------------------------------------------------- /finnlp/data_sources/news/akshare_cctv.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import akshare as ak 3 | from tqdm.notebook import tqdm 4 | from finnlp.data_sources.news._base import News_Downloader 5 | 6 | 7 | class Akshare_cctv(News_Downloader): 8 | 9 | def __init__(self, args={}): 10 | pass 11 | 12 | def download_news(self, start_date, end_date, stock="all"): 13 | self.date_list = pd.date_range(start_date, end_date) 14 | res = pd.DataFrame() 15 | for date in tqdm(self.date_list): 16 | tmp = self.gather_one_day_news(date) 17 | res = pd.concat([res, tmp]) 18 | self.dataframe = res 19 | 20 | def clean_data(self): 21 | pass 22 | 23 | def gather_one_day_news(self, date, stock="all", delay=0.1): 24 | date = self.transfer_standard_date_to_nonstandard(date) 25 | res = ak.news_cctv(date=date) 26 | return res 27 | 28 | def transfer_standard_date_to_nonstandard(self, date): 29 | return date.strftime("%Y%m%d") -------------------------------------------------------------------------------- /docs/FinNLP/site/assets/javascripts/lunr/min/lunr.th.min.js: -------------------------------------------------------------------------------- 1 | !function(e,r){"function"==typeof define&&define.amd?define(r):"object"==typeof 
exports?module.exports=r():r()(e.lunr)}(this,function(){return function(e){if(void 0===e)throw new Error("Lunr is not present. Please include / require Lunr before this script.");if(void 0===e.stemmerSupport)throw new Error("Lunr stemmer support is not present. Please include / require Lunr stemmer support before this script.");var r="2"==e.version[0];e.th=function(){this.pipeline.reset(),this.pipeline.add(e.th.trimmer),r?this.tokenizer=e.th.tokenizer:(e.tokenizer&&(e.tokenizer=e.th.tokenizer),this.tokenizerFn&&(this.tokenizerFn=e.th.tokenizer))},e.th.wordCharacters="[฀-๿]",e.th.trimmer=e.trimmerSupport.generateTrimmer(e.th.wordCharacters),e.Pipeline.registerFunction(e.th.trimmer,"trimmer-th");var t=e.wordcut;t.init(),e.th.tokenizer=function(i){if(!arguments.length||null==i||void 0==i)return[];if(Array.isArray(i))return i.map(function(t){return r?new e.Token(t):t});var n=i.toString().replace(/^\s+/,"");return t.cut(n).split("|")}}}); -------------------------------------------------------------------------------- /demo/README.md: -------------------------------------------------------------------------------- 1 | ## Demos: 2 | 3 | ### Ⅰ. ChatGPT Tradings 4 | 5 | 1. [Trade with ChatGPT](https://github.com/AI4Finance-Foundation/ChatGPT-for-FinTech/tree/master/demo/chatgpt-trading-v1) 6 | * Using the ChatGPT to give us trading suggestions. 7 | * On [Ashare (News)](https://github.com/JinanZou/Astock) and A share Market ( `Maotai (贵州茅台 600519)` ) 8 | ![image-20230220011335859](https://cdn.jsdelivr.net/gh/oliverwang15/imgbed@main/img/202302200113884.png) 9 | 2. [Trade like ChatGPT](https://github.com/AI4Finance-Foundation/ChatGPT-for-FinTech/tree/master/demo/chatgpt-trading-v2) 10 | * Using ChatGPT's language model, GPT-3 to create an FinRL agent that trades as smartly as ChatGPT 11 | * On [stocknet-dataset (Tweets)](https://github.com/yumoxu/stocknet-dataset) and US Stocks Market (`AAPL`) 12 | ![image-20230216004801458](https://cdn.jsdelivr.net/gh/oliverwang15/imgbed@main/img/202302181558796.png) 13 | ### Ⅱ. Sentiment Classify 14 | 15 | 1. [Shares News Sentiment Classify.](https://github.com/AI4Finance-Foundation/ChatGPT-for-FinTech/blob/master/demo/shares_news_sentiment_classify.py) -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 AI4Finance Foundation Inc. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /finnlp/data_sources/sec_filings/README.md: -------------------------------------------------------------------------------- 1 | # SEC DATA DOWNLOADER 2 | 3 | Please checkout this repo that I am building on SEC Question Answering Agent [SEC-QA](https://github.com/Athe-kunal/SEC-QA-Agent) 4 | 5 | This repository downloads all the texts from SEC documents (10-K and 10-Q). Currently, it is not supporting documents that are amended, but that will be added in the near futures. 6 | 7 | Install the required dependencies 8 | 9 | ``` 10 | python install -r requirements.txt 11 | ``` 12 | 13 | The SEC Downloader expects 5 attributes 14 | 15 | * tickers: It is a list of valid tickers 16 | * amount: Number of documents that you want to download 17 | * filing_type: 10-K or 10-Q filing type 18 | * num_workers: It is for multithreading and multiprocessing. We have multi-threading at the ticker level and multi-processing at the year level for a given ticker 19 | * include_amends: To include amendments or not. 20 | 21 | 22 | ## REFERENCES 23 | 1. Unstructured SEC Filings API: [repo link](https://github.com/Unstructured-IO/pipeline-sec-filings/tree/main) 24 | 2. SEC Edgar Downloader: [repo link](https://github.com/jadchaar/sec-edgar-downloader) 25 | 26 | -------------------------------------------------------------------------------- /finnlp/data_sources/datasets/load_dataset.py: -------------------------------------------------------------------------------- 1 | import datasets 2 | import pandas as pd 3 | from tqdm.notebook import tqdm 4 | import json 5 | import os 6 | 7 | def load_dataset(dataset_name, **kwargs): 8 | if dataset_name == "Stocknet": 9 | root_path = r"../../../stocknet-dataset/tweet/raw" 10 | stock_lists = os.listdir(root_path) 11 | all = pd.DataFrame() 12 | for stock in tqdm(stock_lists, desc="Loading Stocknet dataset..."): 13 | stock_path = os.path.join(root_path, stock) 14 | date_files = os.listdir(stock_path) 15 | for date in date_files: 16 | with open(os.path.join(stock_path, date_files[0])) as f: 17 | json_list = f.readlines() 18 | tmp_json = [] 19 | for json_str in json_list: 20 | tmp_json.append(json.loads(json_str)) 21 | tmp_json = pd.DataFrame(tmp_json) 22 | all = pd.concat([all, tmp_json], axis=0) 23 | all = all.reset_index(drop=True) 24 | all = datasets.Dataset.from_pandas(all) 25 | return all 26 | 27 | else: 28 | raise NotImplementedError("Only support Stocknet dataset for now") 29 | 30 | -------------------------------------------------------------------------------- /finnlp/data_sources/news/tushare_major_news.py: -------------------------------------------------------------------------------- 1 | import tushare as ts 2 | import pandas as pd 3 | from tqdm.notebook import tqdm 4 | from finnlp.data_sources.news._base import News_Downloader 5 | import time 6 | 7 | class Tushare_Major_News(News_Downloader): 8 | 9 | def __init__(self, args = {}): 10 | token = args["token"] if "token" in args.keys() else "27080ec403c0218f96f388bca1b1d85329d563c91a43672239619ef5" 11 | ts.set_token(token) 12 | self.pro = ts.pro_api() 13 | 14 | def download_news(self, start_date, end_date, stock = "all"): 15 | self.date_list = 
pd.date_range(start_date,end_date) 16 | res = pd.DataFrame() 17 | for date in tqdm(self.date_list): 18 | tmp = self.gather_one_day_news(date) 19 | res = pd.concat([res,tmp]) 20 | self.dataframe = res 21 | 22 | def gather_one_day_news(self,date,stock = "all",delay = 0.1): 23 | date = self.transfer_standard_date_to_nonstandard(date) 24 | res = self.pro.major_news(start_date = date,end_date = date) 25 | time.sleep(delay) 26 | return res 27 | 28 | def clean_data(self): 29 | pass 30 | 31 | def transfer_standard_date_to_nonstandard(self,date): 32 | return date.strftime("%Y-%m0%d 00:00:00") -------------------------------------------------------------------------------- /finnlp/data_sources/news/tipranks_streaming.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from lxml import etree 3 | from tqdm import tqdm 4 | import pandas as pd 5 | import json 6 | import time 7 | from finnlp.data_sources.news._base import News_Downloader 8 | 9 | # TODO: 10 | # 1. Contents 11 | 12 | class TipRanks_Streaming(News_Downloader): 13 | 14 | def __init__(self, args={}): 15 | super().__init__(args) 16 | self.dataframe = pd.DataFrame() 17 | 18 | def download_streaming_search(self, keyword = "apple", rounds = 10000, delay = 0.5): 19 | url = "https://www.tipranks.com/api/news/posts" 20 | headers = { 21 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36', 22 | } 23 | print("Downloading:", end = " ") 24 | for r in range(rounds): 25 | params = { 26 | 'page': r, 27 | 'per_page': '50', 28 | 'search': keyword, 29 | } 30 | res = requests.get(url = url, headers= headers, params=params) 31 | if res.status_code != 200: 32 | break 33 | try: 34 | res = json.loads(res.text) 35 | tmp = pd.DataFrame(res['data']) 36 | self.dataframe = pd.concat([self.dataframe, tmp]) 37 | except: 38 | print(res.text) 39 | # sleep 40 | time.sleep(delay) 41 | print(r, end = " ") -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | # Read requirements.txt, ignore comments 4 | try: 5 | with open("requirements.txt", "r") as f: 6 | REQUIRES = [line.split('#', 1)[0].strip() for line in f if line.strip()] 7 | except: 8 | print("'requirements.txt' not found!") 9 | REQUIRES = list() 10 | 11 | setup( 12 | name="FinNLP", 13 | version="0.0.1", 14 | include_package_data=True, 15 | author="AI4Finance Foundation", 16 | author_email="contact@ai4finance.org", 17 | url="https://github.com/AI4Finance-Foundation/FinNLP", 18 | license="MIT", 19 | packages=find_packages(), 20 | install_requires=REQUIRES, 21 | description="FinNLP", 22 | long_description="""FinNLP""", 23 | classifiers=[ 24 | # Trove classifiers 25 | # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers 26 | "License :: OSI Approved :: MIT License", 27 | "Programming Language :: Python", 28 | "Programming Language :: Python :: 3", 29 | "Programming Language :: Python :: 3.6", 30 | "Programming Language :: Python :: 3.7", 31 | "Programming Language :: Python :: 3.8", 32 | "Programming Language :: Python :: 3.9", 33 | "Programming Language :: Python :: Implementation :: CPython", 34 | "Programming Language :: Python :: Implementation :: PyPy", 35 | ], 36 | keywords="Financial Large Language Models", 37 | platforms=["any"], 38 | python_requires=">=3.6", 39 | ) 40 | 
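`setup.py` above reads `requirements.txt` and packages the repo, so after an editable install (`pip install -e .`) the downloaders can be used directly. For example, a short usage sketch for the `TipRanks_Streaming` class defined above; the keyword, round count and delay are illustrative, and the endpoint is a public API that may throttle or change:

```python
from finnlp.data_sources.news.tipranks_streaming import TipRanks_Streaming

downloader = TipRanks_Streaming()
downloader.download_streaming_search(keyword="apple", rounds=5, delay=0.5)
news = downloader.dataframe.reset_index(drop=True)   # results accumulate here
print(news.shape)
print(news.head())
```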
-------------------------------------------------------------------------------- /finnlp/data_sources/earning_calls/main.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from typing import List 3 | 4 | try: 5 | from finnlp.data_sources.earning_calls.utils import get_earning_transcripts 6 | except ImportError: 7 | from utils import get_earning_transcripts 8 | 9 | 10 | class EarningCallTranscripts(): 11 | def __init__(self, year: int, ticker: str, quarter: str): 12 | """Get the earning call transcripts for a given company, in a given year and quarter 13 | 14 | Args: 15 | year (int): Year of the transcript 16 | ticker (str): ticker symbol of the stock 17 | quarter (str): quarter 18 | """ 19 | curr_year = datetime.now().year 20 | assert year <= curr_year, "The year should be less than current year" 21 | 22 | assert quarter in [ 23 | "Q1", 24 | "Q2", 25 | "Q3", 26 | "Q4", 27 | ], 'The quarter should from the list ["Q1","Q2","Q3","Q4"]' 28 | self.year = year 29 | self.ticker = ticker 30 | self.quarter = quarter 31 | 32 | def load_data(self): 33 | resp_dict, speakers_list = get_earning_transcripts( 34 | self.quarter, self.ticker, self.year 35 | ) 36 | return { 37 | "text":resp_dict["content"], 38 | "metadata":{ 39 | "ticker": resp_dict["symbol"], 40 | "quarter": "Q" + str(resp_dict["quarter"]), 41 | "date_time": resp_dict["date"], 42 | "speakers_list": speakers_list, 43 | }, 44 | } 45 | 46 | -------------------------------------------------------------------------------- /finnlp/data_sources/sec_filings/prepline_sec_filings/api/app.py: -------------------------------------------------------------------------------- 1 | ##################################################################### 2 | # THIS FILE IS AUTOMATICALLY GENERATED BY UNSTRUCTURED API TOOLS. 
3 | # DO NOT MODIFY DIRECTLY 4 | ##################################################################### 5 | 6 | 7 | import logging 8 | import os 9 | 10 | from fastapi import FastAPI, Request, status 11 | 12 | from .section import router as section_router 13 | 14 | app = FastAPI( 15 | title="Unstructured Pipeline API", 16 | description="""""", 17 | version="1.0.0", 18 | docs_url="/sec-filings/docs", 19 | openapi_url="/sec-filings/openapi.json", 20 | ) 21 | 22 | allowed_origins = os.environ.get("ALLOWED_ORIGINS", None) 23 | if allowed_origins: 24 | from fastapi.middleware.cors import CORSMiddleware 25 | 26 | app.add_middleware( 27 | CORSMiddleware, 28 | allow_origins=allowed_origins.split(","), 29 | allow_methods=["OPTIONS", "POST"], 30 | allow_headers=["Content-Type"], 31 | ) 32 | 33 | app.include_router(section_router) 34 | 35 | 36 | # Filter out /healthcheck noise 37 | class HealthCheckFilter(logging.Filter): 38 | def filter(self, record: logging.LogRecord) -> bool: 39 | return record.getMessage().find("/healthcheck") == -1 40 | 41 | 42 | logging.getLogger("uvicorn.access").addFilter(HealthCheckFilter()) 43 | 44 | 45 | @app.get("/healthcheck", status_code=status.HTTP_200_OK, include_in_schema=False) 46 | def healthcheck(request: Request): 47 | return {"healthcheck": "HEALTHCHECK STATUS: EVERYTHING OK!"} 48 | -------------------------------------------------------------------------------- /finnlp/data_sources/news/yicai_streaming.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | warnings.filterwarnings("ignore") 3 | import requests 4 | from lxml import etree 5 | from tqdm import tqdm 6 | import pandas as pd 7 | import json 8 | import time 9 | from finnlp.data_sources.news._base import News_Downloader 10 | 11 | # TODO: 12 | # 1. Contents 13 | 14 | class Yicai_Streaming(News_Downloader): 15 | 16 | def __init__(self, args={}): 17 | super().__init__(args) 18 | self.dataframe = pd.DataFrame() 19 | 20 | def download_streaming_search(self, keyword = "茅台", rounds = 3, delay = 0.5): 21 | url = "https://www.yicai.com/api/ajax/getSearchResult" 22 | 23 | headers = { 24 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36', 25 | 'Referer':'https://www.yicai.com/search?keys=%E8%8C%85%E5%8F%B0', 26 | 'X-Requested-With': 'XMLHttpRequest', 27 | } 28 | 29 | print("Downloading ...", end = ' ') 30 | for page in range(rounds): 31 | params = { 32 | 'page': page, 33 | 'pagesize': '20', 34 | 'keys': keyword, 35 | 'type': '0', 36 | } 37 | res = requests.get(url = url, headers = headers, params = params) 38 | if res.status_code != 200: 39 | break 40 | res = json.loads(res.text) 41 | res = res['results'] 42 | tmp = pd.DataFrame(res["docs"]) 43 | self.dataframe = pd.concat([self.dataframe, tmp]) 44 | 45 | print(page, end = ' ') 46 | 47 | time.sleep(delay) -------------------------------------------------------------------------------- /finnlp/data_sources/social_media/eastmoney_streaming.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | warnings.filterwarnings("ignore") 3 | import requests 4 | from lxml import etree 5 | from tqdm import tqdm 6 | import pandas as pd 7 | import json 8 | import time 9 | from finnlp.data_sources.social_media._base import Social_Media_Downloader 10 | 11 | # TODO: 12 | # 1. 
Contents 13 | 14 | class Eastmoney_Streaming(Social_Media_Downloader): 15 | def __init__(self, args = {}): 16 | super().__init__(args) 17 | self.dataframe = pd.DataFrame() 18 | 19 | def download_streaming_stock(self, keyword = "600519", rounds = 3, delay = 0.5): 20 | headers = { 21 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36', 22 | } 23 | print('Downloading ...', end =' ') 24 | for page in range(rounds): 25 | url = f"https://guba.eastmoney.com/list,{keyword}_{page+1}.html" 26 | res = requests.get(url=url, headers=headers) 27 | if res.status_code != 200: 28 | break 29 | 30 | res = etree.HTML(res.text) 31 | res = res.xpath("//script")[3].xpath("text()")[0] 32 | article_list, other_list = res.split('var article_list=')[1].strip(";").split('; var other_list=') 33 | article_list = json.loads(article_list) 34 | tmp = pd.DataFrame(article_list['re']) 35 | self.dataframe = pd.concat([self.dataframe, tmp]) 36 | 37 | print(page, end =' ') 38 | time.sleep(delay) 39 | 40 | self.dataframe = self.dataframe.reset_index(drop= True) 41 | 42 | 43 | -------------------------------------------------------------------------------- /docs/FinNLP/site/assets/javascripts/lunr/min/lunr.ta.min.js: -------------------------------------------------------------------------------- 1 | !function(e,t){"function"==typeof define&&define.amd?define(t):"object"==typeof exports?module.exports=t():t()(e.lunr)}(this,function(){return function(e){if(void 0===e)throw new Error("Lunr is not present. Please include / require Lunr before this script.");if(void 0===e.stemmerSupport)throw new Error("Lunr stemmer support is not present. Please include / require Lunr stemmer support before this script.");e.ta=function(){this.pipeline.reset(),this.pipeline.add(e.ta.trimmer,e.ta.stopWordFilter,e.ta.stemmer),this.searchPipeline&&(this.searchPipeline.reset(),this.searchPipeline.add(e.ta.stemmer))},e.ta.wordCharacters="஀-உஊ-ஏஐ-ஙச-ட஠-னப-யர-ஹ஺-ிீ-௉ொ-௏ௐ-௙௚-௟௠-௩௪-௯௰-௹௺-௿a-zA-Za-zA-Z0-90-9",e.ta.trimmer=e.trimmerSupport.generateTrimmer(e.ta.wordCharacters),e.Pipeline.registerFunction(e.ta.trimmer,"trimmer-ta"),e.ta.stopWordFilter=e.generateStopWordFilter("அங்கு அங்கே அது அதை அந்த அவர் அவர்கள் அவள் அவன் அவை ஆக ஆகவே ஆகையால் ஆதலால் ஆதலினால் ஆனாலும் ஆனால் இங்கு இங்கே இது இதை இந்த இப்படி இவர் இவர்கள் இவள் இவன் இவை இவ்வளவு உனக்கு உனது உன் உன்னால் எங்கு எங்கே எது எதை எந்த எப்படி எவர் எவர்கள் எவள் எவன் எவை எவ்வளவு எனக்கு எனது எனவே என் என்ன என்னால் ஏது ஏன் தனது தன்னால் தானே தான் நாங்கள் நாம் நான் நீ நீங்கள்".split(" ")),e.ta.stemmer=function(){return function(e){return"function"==typeof e.update?e.update(function(e){return e}):e}}();var t=e.wordcut;t.init(),e.ta.tokenizer=function(r){if(!arguments.length||null==r||void 0==r)return[];if(Array.isArray(r))return r.map(function(t){return isLunr2?new e.Token(t.toLowerCase()):t.toLowerCase()});var i=r.toString().toLowerCase().replace(/^\s+/,"");return t.cut(i).split("|")},e.Pipeline.registerFunction(e.ta.stemmer,"stemmer-ta"),e.Pipeline.registerFunction(e.ta.stopWordFilter,"stopWordFilter-ta")}}); -------------------------------------------------------------------------------- /finnlp/data_sources/earning_calls/utils.py: -------------------------------------------------------------------------------- 1 | from tenacity import retry, stop_after_attempt, wait_random_exponential 2 | import requests 3 | import json 4 | from datetime import datetime 5 | import re 6 | from typing import List 7 | 8 | 9 | def correct_date(yr, dt): 10 | 
"""Some transcripts have incorrect date, correcting it 11 | 12 | Args: 13 | yr (int): actual 14 | dt (datetime): given date 15 | 16 | Returns: 17 | datetime: corrected date 18 | """ 19 | dt = datetime.strptime(dt, "%Y-%m-%d %H:%M:%S") 20 | if dt.year != yr: 21 | dt = dt.replace(year=yr) 22 | return dt.strftime("%Y-%m-%d %H:%M:%S") 23 | 24 | 25 | def extract_speakers(cont: str) -> List[str]: 26 | """Extract the list of speakers 27 | 28 | Args: 29 | cont (str): transcript content 30 | 31 | Returns: 32 | List[str]: list of speakers 33 | """ 34 | pattern = re.compile(r"\n(.*?):") 35 | matches = pattern.findall(cont) 36 | 37 | return list(set(matches)) 38 | 39 | 40 | @retry(wait=wait_random_exponential(min=1, max=5), stop=stop_after_attempt(2)) 41 | def get_earning_transcripts(quarter: str, ticker: str, year: int): 42 | """Get the earnings transcripts 43 | 44 | Args: 45 | quarter (str) 46 | ticker (str) 47 | year (int) 48 | """ 49 | response = requests.get( 50 | f"https://discountingcashflows.com/api/transcript/{ticker}/{quarter}/{year}/", 51 | auth=("user", "pass"), 52 | ) 53 | 54 | resp_text = json.loads(response.text) 55 | speakers_list = extract_speakers(resp_text[0]["content"]) 56 | corrected_date = correct_date(resp_text[0]["year"], resp_text[0]["date"]) 57 | resp_text[0]["date"] = corrected_date 58 | return resp_text[0], speakers_list 59 | -------------------------------------------------------------------------------- /docs/FinNLP/site/assets/javascripts/lunr/min/lunr.zh.min.js: -------------------------------------------------------------------------------- 1 | !function(e,r){"function"==typeof define&&define.amd?define(r):"object"==typeof exports?module.exports=r(require("@node-rs/jieba")):r()(e.lunr)}(this,function(e){return function(r,t){if(void 0===r)throw new Error("Lunr is not present. Please include / require Lunr before this script.");if(void 0===r.stemmerSupport)throw new Error("Lunr stemmer support is not present. 
Please include / require Lunr stemmer support before this script.");var i="2"==r.version[0];r.zh=function(){this.pipeline.reset(),this.pipeline.add(r.zh.trimmer,r.zh.stopWordFilter,r.zh.stemmer),i?this.tokenizer=r.zh.tokenizer:(r.tokenizer&&(r.tokenizer=r.zh.tokenizer),this.tokenizerFn&&(this.tokenizerFn=r.zh.tokenizer))},r.zh.tokenizer=function(n){if(!arguments.length||null==n||void 0==n)return[];if(Array.isArray(n))return n.map(function(e){return i?new r.Token(e.toLowerCase()):e.toLowerCase()});t&&e.load(t);var o=n.toString().trim().toLowerCase(),s=[];e.cut(o,!0).forEach(function(e){s=s.concat(e.split(" "))}),s=s.filter(function(e){return!!e});var u=0;return s.map(function(e,t){if(i){var n=o.indexOf(e,u),s={};return s.position=[n,e.length],s.index=t,u=n,new r.Token(e,s)}return e})},r.zh.wordCharacters="\\w一-龥",r.zh.trimmer=r.trimmerSupport.generateTrimmer(r.zh.wordCharacters),r.Pipeline.registerFunction(r.zh.trimmer,"trimmer-zh"),r.zh.stemmer=function(){return function(e){return e}}(),r.Pipeline.registerFunction(r.zh.stemmer,"stemmer-zh"),r.zh.stopWordFilter=r.generateStopWordFilter("的 一 不 在 人 有 是 为 以 于 上 他 而 后 之 来 及 了 因 下 可 到 由 这 与 也 此 但 并 个 其 已 无 小 我 们 起 最 再 今 去 好 只 又 或 很 亦 某 把 那 你 乃 它 吧 被 比 别 趁 当 从 到 得 打 凡 儿 尔 该 各 给 跟 和 何 还 即 几 既 看 据 距 靠 啦 了 另 么 每 们 嘛 拿 哪 那 您 凭 且 却 让 仍 啥 如 若 使 谁 虽 随 同 所 她 哇 嗡 往 哪 些 向 沿 哟 用 于 咱 则 怎 曾 至 致 着 诸 自".split(" ")),r.Pipeline.registerFunction(r.zh.stopWordFilter,"stopWordFilter-zh")}}); -------------------------------------------------------------------------------- /docs/FinNLP/mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: FinGPT & FinNLP 2 | site_author: Oliver Wang, Xiao-yang Liu 3 | 4 | nav: 5 | - Hello World: 6 | - About the project: 'index.md' 7 | 8 | - FinGPT Models: 9 | - FinGPT-v1: 'https://github.com/AI4Finance-Foundation/FinGPT/tree/master/fingpt/FinGPT-v1' 10 | - FinGPT-v2: 'https://github.com/AI4Finance-Foundation/FinGPT/tree/master/fingpt/FinGPT-v2' 11 | - FinGPT-v3: 'https://github.com/AI4Finance-Foundation/FinGPT/tree/master/fingpt/FinGPT-v3' 12 | 13 | - Robo Advisor: 14 | - chatgpt-robo-advisor-v1: 'https://github.com/AI4Finance-Foundation/FinGPT/tree/master/fingpt/chatgpt-robo-advisor-v1' 15 | - chatgpt-robo-advisor-v2: 'https://github.com/AI4Finance-Foundation/FinGPT/tree/master/fingpt/chatgpt-robo-advisor-v2' 16 | 17 | - Quantitative Trading: 18 | - chatgpt-trading-v1: 'https://github.com/AI4Finance-Foundation/FinGPT/tree/master/fingpt/chatgpt-trading-v1' 19 | - chatgpt-trading-v2: 'https://github.com/AI4Finance-Foundation/FinGPT/tree/master/fingpt/chatgpt-trading-v2' 20 | 21 | - Low code development: 22 | - chatgpt-low-code-development-v1: 'https://github.com/AI4Finance-Foundation/FinGPT/tree/master/fingpt/chatgpt-low-code-development-v1' 23 | - chatgpt-low-code-development-v2: 'https://github.com/AI4Finance-Foundation/FinGPT/tree/master/fingpt/chatgpt-low-code-development-v2' 24 | 25 | - Data Sources: 26 | - News: jupyter/Data_Sources_News.ipynb 27 | - Social Media: jupyter/Data_Sources_Social_Media.ipynb 28 | - Company Announcement: jupyter/Data_Sources_Company_Announcement.ipynb 29 | 30 | theme: 31 | name: material 32 | 33 | plugins: 34 | - mkdocs-jupyter: 35 | execute: false 36 | 37 | extra: 38 | alternate: 39 | - name: English 40 | link: / 41 | lang: en 42 | - name: 中文 43 | link: /zh/ 44 | lang: zh -------------------------------------------------------------------------------- /markdowns/codes.md: 
-------------------------------------------------------------------------------- 1 | # FinNLP 2 | 3 | ## Codes 4 | 5 | ### Data Sources 6 | 7 | #### News (Finnhub, Sina) 8 | 9 | ``` python 10 | class News_Downloader: 11 | 12 | def __init__(self, args = {}): 13 | pass 14 | 15 | def download_date_range_all(self, start_date, end_date): 16 | pass 17 | 18 | def download_date_range_stock(self, start_date, end_date, stock = "AAPL"): 19 | pass 20 | 21 | def download_streaming_all(self, rounds = 3): 22 | pass 23 | 24 | def download_streaming_stock(self, stock = None, rounds = 3): 25 | pass 26 | 27 | def clean_data(self): 28 | pass 29 | 30 | def gather_content(self, delay = 0.01): 31 | pass 32 | ``` 33 | 34 | 35 | 36 | #### Social Media (Twitter, Stocktwits, Reddit, Weibo) 37 | 38 | ``` python 39 | class Social_Media_Downloader: 40 | 41 | def __init__(self, args = {}): 42 | pass 43 | 44 | def download_date_range_all(self, start_date, end_date): 45 | pass 46 | 47 | def download_date_range_stock(self, start_date, end_date, stock = "AAPL"): 48 | pass 49 | 50 | def download_streaming_all(self, rounds = 3): 51 | pass 52 | 53 | def download_streaming_stock(self, stock = None, rounds = 3): 54 | pass 55 | 56 | def clean_data(self): 57 | pass 58 | ``` 59 | 60 | #### Company Announcement (Juchao, SEC) 61 | 62 | ``` python 63 | class company_announcement_Downloader: 64 | 65 | def __init__(self, args = {}): 66 | pass 67 | 68 | def download_date_range_all(self, start_date, end_date): 69 | pass 70 | 71 | def download_date_range_stock(self, start_date, end_date, stock = "AAPL"): 72 | pass 73 | 74 | def download_streaming_all(self, rounds = 3): 75 | pass 76 | 77 | def download_streaming_stock(self, stock = None, rounds = 3): 78 | pass 79 | 80 | def clean_data(self): 81 | pass 82 | ``` -------------------------------------------------------------------------------- /finnlp/data_sources/news/cnbc_streaming.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | warnings.filterwarnings("ignore") 3 | import requests 4 | from lxml import etree 5 | from tqdm import tqdm 6 | import pandas as pd 7 | import json 8 | import time 9 | from finnlp.data_sources.news._base import News_Downloader 10 | 11 | # TODO: 12 | # 1. 
Contents 13 | 14 | class CNBC_Streaming(News_Downloader): 15 | 16 | def __init__(self, args={}): 17 | super().__init__(args) 18 | self.dataframe = pd.DataFrame() 19 | 20 | def download_streaming_search(self, keyword = "apple", rounds = 3, delay = 0.5): 21 | url = "https://api.queryly.com/cnbc/json.aspx" 22 | headers = { 23 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36', 24 | 'Referer':'https://www.cnbc.com/', 25 | } 26 | print("Downloading ...", end = ' ') 27 | for page in range(rounds): 28 | params = { 29 | 'queryly_key': '31a35d40a9a64ab3', 30 | 'query': keyword, 31 | 'endindex': page * 10, 32 | 'batchsize': '10', 33 | 'callback': '', 34 | 'showfaceted': 'false', 35 | 'timezoneoffset': '-480', 36 | 'facetedfields': 'formats', 37 | 'facetedkey': 'formats|', 38 | 'facetedvalue': '!Press Release|', 39 | 'sort': 'date', 40 | 'additionalindexes': '4cd6f71fbf22424d,937d600b0d0d4e23,3bfbe40caee7443e,626fdfcd96444f28', 41 | } 42 | res = requests.get(url = url, headers = headers, params = params) 43 | if res.status_code != 200: 44 | break 45 | res = json.loads(res.text) 46 | tmp = pd.DataFrame(res['results']) 47 | self.dataframe = pd.concat([self.dataframe, tmp]) 48 | 49 | print(page, end = ' ') 50 | 51 | time.sleep(delay) 52 | -------------------------------------------------------------------------------- /docs/FinNLP/site/assets/javascripts/lunr/min/lunr.ja.min.js: -------------------------------------------------------------------------------- 1 | !function(e,r){"function"==typeof define&&define.amd?define(r):"object"==typeof exports?module.exports=r():r()(e.lunr)}(this,function(){return function(e){if(void 0===e)throw new Error("Lunr is not present. Please include / require Lunr before this script.");if(void 0===e.stemmerSupport)throw new Error("Lunr stemmer support is not present. 
Please include / require Lunr stemmer support before this script.");var r="2"==e.version[0];e.ja=function(){this.pipeline.reset(),this.pipeline.add(e.ja.trimmer,e.ja.stopWordFilter,e.ja.stemmer),r?this.tokenizer=e.ja.tokenizer:(e.tokenizer&&(e.tokenizer=e.ja.tokenizer),this.tokenizerFn&&(this.tokenizerFn=e.ja.tokenizer))};var t=new e.TinySegmenter;e.ja.tokenizer=function(i){var n,o,s,p,a,u,m,l,c,f;if(!arguments.length||null==i||void 0==i)return[];if(Array.isArray(i))return i.map(function(t){return r?new e.Token(t.toLowerCase()):t.toLowerCase()});for(o=i.toString().toLowerCase().replace(/^\s+/,""),n=o.length-1;n>=0;n--)if(/\S/.test(o.charAt(n))){o=o.substring(0,n+1);break}for(a=[],s=o.length,c=0,l=0;c<=s;c++)if(u=o.charAt(c),m=c-l,u.match(/\s/)||c==s){if(m>0)for(p=t.segment(o.slice(l,c)).filter(function(e){return!!e}),f=l,n=0;n0: 59 | titles.append(' '.join(title).replace("\n","").strip(" ")) 60 | times.append(' '.join(time_)) 61 | authors.append(' '.join(author)) 62 | 63 | # concat results 64 | tmp = pd.DataFrame([titles, times, authors]).T 65 | tmp.columns = ["title", "time", "author"] 66 | self.dataframe = pd.concat([self.dataframe, tmp]) 67 | 68 | # sleep 69 | time.sleep(delay) 70 | -------------------------------------------------------------------------------- /finnlp/data_sources/news/investorplace_streaming.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | warnings.filterwarnings("ignore") 3 | import requests 4 | from lxml import etree 5 | from tqdm import tqdm 6 | import pandas as pd 7 | import json 8 | import time 9 | from finnlp.data_sources.news._base import News_Downloader 10 | 11 | # TODO: 12 | # 1. Contents 13 | 14 | class InvestorPlace_Streaming(News_Downloader): 15 | 16 | def __init__(self, args={}): 17 | super().__init__(args) 18 | self.dataframe = pd.DataFrame() 19 | 20 | def download_streaming_search(self, keyword = "apple", rounds = 3, delay = 0.5): 21 | url = 'https://investorplace.com/search/' 22 | 23 | headers = { 24 | "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36" 25 | } 26 | print("Downloading ...", end = ' ') 27 | for page in range(rounds): 28 | params = { 29 | 'q': keyword, 30 | "pg": page, 31 | } 32 | res = requests.get(url = url, params=params, headers=headers) 33 | if res.status_code != 200: 34 | break 35 | 36 | res = etree.HTML(res.text) 37 | div_list = res.xpath("/html/body/main/section/div/div/div/div[2]/div[1]/div[1]/div") 38 | divs = [] 39 | 40 | for div in div_list: 41 | divs += div.xpath("./div") 42 | 43 | titles = [] 44 | times = [] 45 | authors = [] 46 | summaries = [] 47 | 48 | for div in divs: 49 | try: 50 | title = div.xpath('./h2/a//text()')[0] 51 | except: 52 | title = '' 53 | try: 54 | time_ = div.xpath('div/time//text()')[0].replace('\n','').replace('\t','') 55 | except: 56 | time_ = '' 57 | try: 58 | author = div.xpath('div/span/a/text()')[0].replace('\n','').replace('\t','') 59 | except: 60 | author = '' 61 | try: 62 | summary = div.xpath('p/text()')[0].replace('\n','').replace('\t','') 63 | except: 64 | summary = '' 65 | 66 | titles.append(title) 67 | times.append(time_) 68 | authors.append(author) 69 | summaries.append(summary) 70 | 71 | titles.append(title) 72 | 73 | tmp = pd.DataFrame([titles, times, authors, summaries]).T 74 | tmp.columns = ['title', 'time', 'author', 'summary'] 75 | self.dataframe = pd.concat([self.dataframe, tmp]) 76 | 77 | print(page, end = ' ') 78 | 79 | time.sleep(delay) 80 | 
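A short usage sketch for the `InvestorPlace_Streaming` class above; the keyword and rounds are illustrative, and since the scraper parses the public search pages, the resulting `title`/`time`/`author`/`summary` columns depend on the page markup staying stable:

```python
from finnlp.data_sources.news.investorplace_streaming import InvestorPlace_Streaming

ip = InvestorPlace_Streaming()
ip.download_streaming_search(keyword="apple", rounds=3, delay=0.5)
articles = ip.dataframe.reset_index(drop=True)
print(articles.head())   # columns: title, time, author, summary (when pages were fetched)
```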
-------------------------------------------------------------------------------- /finnlp/data_sources/news/eastmoney_streaming.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from lxml import etree 3 | from tqdm import tqdm 4 | import pandas as pd 5 | from finnlp.data_sources.news._base import News_Downloader 6 | 7 | 8 | class Eastmoney_Streaming(News_Downloader): 9 | 10 | def __init__(self, args={}): 11 | super().__init__(args) 12 | self.dataframe = pd.DataFrame() 13 | 14 | def download_streaming_stock(self, stock = "600519", rounds = 3): 15 | print( "Geting pages: ", end = "") 16 | if rounds > 0: 17 | for r in range(rounds): 18 | br = self._gather_pages(stock, r) 19 | if br == "break": 20 | break 21 | else: 22 | r = 1 23 | error_count = 0 24 | while 1: 25 | br = self._gather_pages(stock, r) 26 | if br == "break": 27 | break 28 | elif br == "Error": 29 | error_count +=1 30 | if error_count>10: 31 | print("Connection Error") 32 | r += 1 33 | print( f"Get total {r+1} pages.") 34 | self.dataframe = self.dataframe.reset_index(drop = True) 35 | 36 | def _gather_pages(self, stock, page): 37 | print( page, end = " ") 38 | url = f"https://guba.eastmoney.com/list,{stock},1,f_{page}.html" 39 | headers = { 40 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36", 41 | } 42 | 43 | requests.DEFAULT_RETRIES = 5 # 增加重试连接次数 44 | s = requests.session() 45 | s.keep_alive = False # 关闭多余连接 46 | 47 | response = self._request_get(url, headers=headers) 48 | if response.status_code != 200: 49 | return "Error" 50 | 51 | # gather the comtent of the first page 52 | page = etree.HTML(response.text) 53 | trs = page.xpath('//*[@id="mainlist"]/div/ul/li[1]/table/tbody/tr') 54 | have_one = False 55 | for item in trs: 56 | have_one = True 57 | read_amount = item.xpath("./td[1]//text()")[0] 58 | comments = item.xpath("./td[2]//text()")[0] 59 | title = item.xpath("./td[3]/div/a//text()")[0] 60 | content_link = item.xpath("./td[3]/div/a/@href")[0] 61 | author = item.xpath("./td[4]//text()")[0] 62 | time = item.xpath("./td[5]//text()")[0] 63 | tmp = pd.DataFrame([read_amount, comments, title, content_link, author, time]).T 64 | columns = [ "read amount", "comments", "title", "content link", "author", "create time" ] 65 | tmp.columns = columns 66 | self.dataframe = pd.concat([self.dataframe, tmp]) 67 | #print(title) 68 | if have_one == False: 69 | return "break" 70 | -------------------------------------------------------------------------------- /finnlp/data_sources/social_media/weibo_streaming.py: -------------------------------------------------------------------------------- 1 | from finnlp.data_sources.social_media._base import Social_Media_Downloader 2 | 3 | from tqdm import tqdm 4 | from lxml import etree 5 | import pandas as pd 6 | import requests 7 | import time 8 | import json 9 | import re 10 | 11 | class Weibo_Streaming(Social_Media_Downloader): 12 | def __init__(self, args = {}): 13 | super().__init__(args) 14 | self.dataframe = pd.DataFrame() 15 | 16 | def download_streaming_stock(self, stock = "茅台", rounds = 3): 17 | for r in tqdm(range(rounds), desc="Downloading by page.."): 18 | page = r+1 19 | self._gather_one_page(page, stock) 20 | 21 | def _gather_one_page(self,page, stock = "茅台", delay = 0.01): 22 | headers = { 23 | "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0" 24 | } 25 | params = { 26 | "containerid": 
f"100103type=61&q={stock}&t=", 27 | "page_type": "searchall", 28 | "page":page 29 | } 30 | url = f"https://m.weibo.cn/api/container/getIndex" 31 | resp = self._request_get(url, headers=headers, params = params) 32 | 33 | if resp is None: 34 | return "Error" 35 | 36 | res = json.loads(resp.text) 37 | res = res["data"]["cards"] 38 | res = pd.DataFrame(res) 39 | 40 | pbar = tqdm(total = res.shape[0], desc = "Processing the text content and downloading the full passage...") 41 | res[["content_short","content"]] = res.apply(lambda x:self._process_text(x, pbar, delay), axis= 1, result_type= "expand") 42 | 43 | self.dataframe = pd.concat([self.dataframe, res]) 44 | 45 | def _process_text(self,x, pbar, delay = 0.01): 46 | text = x["mblog"]["text"] 47 | text = etree.HTML(text) 48 | content_short = text.xpath(".//text()") 49 | content_short = ''.join(content_short) 50 | 51 | link = text.xpath('.//a/@href') 52 | link = [l for l in link if "status" in l ] 53 | if len(link) >0: 54 | base_url = "https://m.weibo.cn/" 55 | url_new = base_url + link[0] 56 | headers = { 57 | "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0" 58 | } 59 | resp = self._request_get(url_new, headers= headers) 60 | if resp is None: 61 | content = content_short 62 | else: 63 | res = etree.HTML(resp.content) 64 | scripts = res.xpath('//script') 65 | content = scripts[2].xpath("text()") 66 | pattern=re.compile('"text": "(.+),\n') 67 | result = pattern.findall(content[0]) 68 | content = etree.HTML(result[0]) 69 | content = content.xpath("//text()") 70 | content = ''.join(content) 71 | else: 72 | content = content_short 73 | 74 | pbar.update(1) 75 | time.sleep(delay) 76 | 77 | return content_short, content 78 | 79 | -------------------------------------------------------------------------------- /finnlp/benchmarks/tfns.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | warnings.filterwarnings("ignore") 3 | 4 | from sklearn.metrics import accuracy_score,f1_score 5 | from datasets import load_dataset 6 | from tqdm import tqdm 7 | import datasets 8 | import torch 9 | 10 | dic = { 11 | 0:"negative", 12 | 1:'positive', 13 | 2:'neutral', 14 | } 15 | 16 | def format_example(example: dict) -> dict: 17 | context = f"Instruction: {example['instruction']}\n" 18 | if example.get("input"): 19 | context += f"Input: {example['input']}\n" 20 | context += "Answer: " 21 | target = example["output"] 22 | return {"context": context, "target": target} 23 | 24 | def change_target(x): 25 | if 'positive' in x or 'Positive' in x: 26 | return 'positive' 27 | elif 'negative' in x or 'Negative' in x: 28 | return 'negative' 29 | else: 30 | return 'neutral' 31 | 32 | def test_tfns(model, tokenizer, batch_size = 8, prompt_fun = None ): 33 | dataset = load_dataset('zeroshot/twitter-financial-news-sentiment') 34 | dataset = dataset['validation'] 35 | dataset = dataset.to_pandas() 36 | dataset['label'] = dataset['label'].apply(lambda x:dic[x]) 37 | 38 | if prompt_fun is None: 39 | dataset["instruction"] = 'What is the sentiment of this tweet? Please choose an answer from {negative/neutral/positive}.' 
40 | else: 41 | dataset["instruction"] = dataset.apply(prompt_fun, axis = 1) 42 | 43 | dataset.columns = ['input', 'output', 'instruction'] 44 | dataset[["context","target"]] = dataset.apply(format_example, axis = 1, result_type="expand") 45 | 46 | # print example 47 | print(f"\n\nPrompt example:\n{dataset['context'][0]}\n\n") 48 | 49 | context = dataset['context'].tolist() 50 | 51 | total_steps = dataset.shape[0]//batch_size + 1 52 | print(f"Total len: {len(context)}. Batchsize: {batch_size}. Total steps: {total_steps}") 53 | 54 | 55 | out_text_list = [] 56 | for i in tqdm(range(total_steps)): 57 | tmp_context = context[i* batch_size:(i+1)* batch_size] 58 | tokens = tokenizer(tmp_context, return_tensors='pt', padding=True, max_length=512) 59 | # tokens.pop('token_type_ids') 60 | for k in tokens.keys(): 61 | tokens[k] = tokens[k].cuda() 62 | res = model.generate(**tokens, max_length=512) 63 | res_sentences = [tokenizer.decode(i) for i in res] 64 | out_text = [o.split("Answer: ")[1] for o in res_sentences] 65 | out_text_list += out_text 66 | torch.cuda.empty_cache() 67 | 68 | dataset["out_text"] = out_text_list 69 | dataset["new_target"] = dataset["target"].apply(change_target) 70 | dataset["new_out"] = dataset["out_text"].apply(change_target) 71 | 72 | acc = accuracy_score(dataset["new_target"], dataset["new_out"]) 73 | f1_macro = f1_score(dataset["new_target"], dataset["new_out"], average = "macro") 74 | f1_micro = f1_score(dataset["new_target"], dataset["new_out"], average = "micro") 75 | f1_weighted = f1_score(dataset["new_target"], dataset["new_out"], average = "weighted") 76 | 77 | print(f"Acc: {acc}. F1 macro: {f1_macro}. F1 micro: {f1_micro}. F1 weighted (BloombergGPT): {f1_weighted}. ") 78 | 79 | return dataset -------------------------------------------------------------------------------- /finnlp/benchmarks/fpb.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | warnings.filterwarnings("ignore") 3 | 4 | from sklearn.metrics import accuracy_score,f1_score 5 | from datasets import load_dataset 6 | from tqdm import tqdm 7 | import datasets 8 | import torch 9 | 10 | dic = { 11 | 0:"negative", 12 | 1:'neutral', 13 | 2:'positive', 14 | } 15 | 16 | def format_example(example: dict) -> dict: 17 | context = f"Instruction: {example['instruction']}\n" 18 | if example.get("input"): 19 | context += f"Input: {example['input']}\n" 20 | context += "Answer: " 21 | target = example["output"] 22 | return {"context": context, "target": target} 23 | 24 | def change_target(x): 25 | if 'positive' in x or 'Positive' in x: 26 | return 'positive' 27 | elif 'negative' in x or 'Negative' in x: 28 | return 'negative' 29 | else: 30 | return 'neutral' 31 | 32 | def test_fpb(model, tokenizer, batch_size = 8, prompt_fun = None ): 33 | instructions = load_dataset("financial_phrasebank", "sentences_50agree") 34 | instructions = instructions["train"] 35 | instructions = instructions.train_test_split(seed = 42)['test'] 36 | instructions = instructions.to_pandas() 37 | instructions.columns = ["input", "output"] 38 | instructions["output"] = instructions["output"].apply(lambda x:dic[x]) 39 | 40 | if prompt_fun is None: 41 | instructions["instruction"] = "What is the sentiment of this news? Please choose an answer from {negative/neutral/positive}." 
42 | else: 43 | instructions["instruction"] = instructions.apply(prompt_fun, axis = 1) 44 | 45 | instructions[["context","target"]] = instructions.apply(format_example, axis = 1, result_type="expand") 46 | 47 | # print example 48 | print(f"\n\nPrompt example:\n{instructions['context'][0]}\n\n") 49 | 50 | 51 | context = instructions['context'].tolist() 52 | 53 | total_steps = instructions.shape[0]//batch_size + 1 54 | print(f"Total len: {len(context)}. Batchsize: {batch_size}. Total steps: {total_steps}") 55 | 56 | 57 | out_text_list = [] 58 | for i in tqdm(range(total_steps)): 59 | tmp_context = context[i* batch_size:(i+1)* batch_size] 60 | tokens = tokenizer(tmp_context, return_tensors='pt', padding=True, max_length=512) 61 | for k in tokens.keys(): 62 | tokens[k] = tokens[k].cuda() 63 | res = model.generate(**tokens, max_length=512) 64 | res_sentences = [tokenizer.decode(i) for i in res] 65 | out_text = [o.split("Answer: ")[1] for o in res_sentences] 66 | out_text_list += out_text 67 | torch.cuda.empty_cache() 68 | 69 | instructions["out_text"] = out_text_list 70 | instructions["new_target"] = instructions["target"].apply(change_target) 71 | instructions["new_out"] = instructions["out_text"].apply(change_target) 72 | 73 | acc = accuracy_score(instructions["new_target"], instructions["new_out"]) 74 | f1_macro = f1_score(instructions["new_target"], instructions["new_out"], average = "macro") 75 | f1_micro = f1_score(instructions["new_target"], instructions["new_out"], average = "micro") 76 | f1_weighted = f1_score(instructions["new_target"], instructions["new_out"], average = "weighted") 77 | 78 | print(f"Acc: {acc}. F1 macro: {f1_macro}. F1 micro: {f1_micro}. F1 weighted (BloombergGPT): {f1_weighted}. ") 79 | 80 | return instructions -------------------------------------------------------------------------------- /finnlp/benchmarks/nwgi.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | warnings.filterwarnings("ignore") 3 | 4 | from sklearn.metrics import accuracy_score,f1_score 5 | from datasets import load_dataset 6 | from tqdm import tqdm 7 | import datasets 8 | import torch 9 | 10 | dic = { 11 | 'strong negative':"negative", 12 | 'moderately negative':"negative", 13 | 'mildly negative':"neutral", 14 | 'strong positive':"positive", 15 | 'moderately positive':"positive", 16 | 'mildly positive':'neutral', 17 | 'neutral':'neutral', 18 | } 19 | 20 | def format_example(example: dict) -> dict: 21 | context = f"Instruction: {example['instruction']}\n" 22 | if example.get("input"): 23 | context += f"Input: {example['input']}\n" 24 | context += "Answer: " 25 | target = example["output"] 26 | return {"context": context, "target": target} 27 | 28 | def change_target(x): 29 | if 'positive' in x or 'Positive' in x: 30 | return 'positive' 31 | elif 'negative' in x or 'Negative' in x: 32 | return 'negative' 33 | else: 34 | return 'neutral' 35 | 36 | def test_nwgi(model, tokenizer, batch_size = 8, prompt_fun = None ): 37 | dataset = datasets.load_dataset('oliverwang15/news_with_gpt_instructions') 38 | dataset = dataset['test'].to_pandas() 39 | dataset['output'] = dataset['label'].apply(lambda x:dic[x]) 40 | 41 | if prompt_fun is None: 42 | dataset["instruction"] = "What is the sentiment of this news? Please choose an answer from {negative/neutral/positive}." 
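        # the seven fine-grained GPT labels were already collapsed to negative/neutral/positive
        # via the `dic` mapping above, so the default instruction matches the mapped targets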
43 | else: 44 | dataset["instruction"] = dataset.apply(prompt_fun, axis = 1) 45 | dataset["input"] = dataset["news"] 46 | 47 | dataset = dataset[['input', 'output', 'instruction']] 48 | dataset[["context","target"]] = dataset.apply(format_example, axis = 1, result_type="expand") 49 | 50 | # print example 51 | print(f"\n\nPrompt example:\n{dataset['context'][0]}\n\n") 52 | 53 | context = dataset['context'].tolist() 54 | 55 | total_steps = dataset.shape[0]//batch_size + 1 56 | print(f"Total len: {len(context)}. Batchsize: {batch_size}. Total steps: {total_steps}") 57 | 58 | 59 | out_text_list = [] 60 | for i in tqdm(range(total_steps)): 61 | tmp_context = context[i* batch_size:(i+1)* batch_size] 62 | tokens = tokenizer(tmp_context, return_tensors='pt', padding=True, max_length=512) 63 | # tokens.pop('token_type_ids') 64 | for k in tokens.keys(): 65 | tokens[k] = tokens[k].cuda() 66 | res = model.generate(**tokens, max_length=512) 67 | res_sentences = [tokenizer.decode(i) for i in res] 68 | out_text = [o.split("Answer: ")[1] for o in res_sentences] 69 | out_text_list += out_text 70 | torch.cuda.empty_cache() 71 | 72 | dataset["out_text"] = out_text_list 73 | dataset["new_target"] = dataset["target"].apply(change_target) 74 | dataset["new_out"] = dataset["out_text"].apply(change_target) 75 | 76 | acc = accuracy_score(dataset["new_target"], dataset["new_out"]) 77 | f1_macro = f1_score(dataset["new_target"], dataset["new_out"], average = "macro") 78 | f1_micro = f1_score(dataset["new_target"], dataset["new_out"], average = "micro") 79 | f1_weighted = f1_score(dataset["new_target"], dataset["new_out"], average = "weighted") 80 | 81 | print(f"Acc: {acc}. F1 macro: {f1_macro}. F1 micro: {f1_micro}. F1 weighted (BloombergGPT): {f1_weighted}. ") 82 | 83 | return dataset 84 | -------------------------------------------------------------------------------- /finnlp/data_sources/_base.py: -------------------------------------------------------------------------------- 1 | from finnlp.utils.get_proxy import get_china_free_proxy, get_us_free_proxy, Kuaidaili 2 | import requests 3 | 4 | class FinNLP_Downloader: 5 | def __init__(self, args = {}): 6 | self.use_proxy = True if "use_proxy" in args.keys() else False 7 | if self.use_proxy: 8 | self.country = args["use_proxy"] 9 | else: 10 | self.country = None 11 | self.max_retry = args["max_retry"] if "max_retry" in args.keys() else 1 12 | self.proxy_pages = args["proxy_pages"] if "proxy_pages" in args.keys() else 5 13 | if self.use_proxy: 14 | if "kuaidaili" in self.country: 15 | # tunnel, username, password 16 | assert "tunnel" in args.keys(), "Please make sure \'tunnel\' in your keys" 17 | assert "username" in args.keys(), "Please make sure \'username\' in your keys" 18 | assert "password" in args.keys(), "Please make sure \'password\' in your keys" 19 | self.proxy_list = Kuaidaili(args["tunnel"], args["username"], args["password"]) 20 | else: 21 | self.proxy_id = 0 22 | self.proxy_list = self._update_proxy() 23 | else: 24 | self.proxy_list = [] 25 | 26 | def _get_proxy(self): 27 | if self.use_proxy: 28 | if "kuaidaili" in self.country: 29 | proxy = self.proxy_list.get_kuaidaili_tunnel_proxy() 30 | return proxy 31 | elif len(self.proxy_list) >0: 32 | proxy = self.proxy_list[self.proxy_id] 33 | self.proxy_id += 1 34 | if self.proxy_id == len(self.proxy_list): 35 | self.proxy_id = 0 36 | return proxy 37 | else: 38 | return None 39 | 40 | def _update_proxy(self): 41 | if "china" in self.country or "China" in self.country: 42 | return 
get_china_free_proxy(self.proxy_pages)
43 |         else:
44 |             return get_us_free_proxy(self.proxy_pages)
45 | 
46 |     def _request_get(self, url, headers = None, verify = None, params = None):
47 |         if headers is None:
48 |             headers = {
49 |                 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0"
50 |             }
51 |         max_retry = self.max_retry
52 |         proxies = self._get_proxy()
53 |         for _ in range(max_retry):
54 |             try:
55 |                 response = requests.get(url = url, proxies = proxies, headers = headers, verify = verify, params = params)
56 |                 if response.status_code == 200:
57 |                     break
58 |             except:
59 |                 response = None
60 | 
61 |         if response is not None and response.status_code != 200:
62 |             response = None
63 | 
64 |         return response
65 | 
66 |     def _request_post(self, url, headers, json):
67 |         max_retry = self.max_retry
68 |         proxies = self._get_proxy()
69 |         for _ in range(max_retry):
70 |             try:
71 |                 response = requests.post(url = url, headers = headers, json = json, proxies = proxies)
72 |                 if response.status_code == 200:
73 |                     break
74 |             except:
75 |                 response = None
76 | 
77 |         if response is not None and response.status_code != 200:
78 |             response = None
79 | 
80 |         return response
81 | 
--------------------------------------------------------------------------------
/finnlp/data_sources/news/pennystocks_streaming.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from lxml import etree
3 | from tqdm import tqdm
4 | import pandas as pd
5 | import json
6 | import time as time
7 | from finnlp.data_sources.news._base import News_Downloader
8 | 
9 | # TODO:
10 | # 1. More Pages
11 | # 2. Contents
12 | 
13 | class PennyStocks_Streaming(News_Downloader):
14 | 
15 |     def __init__(self, args={}):
16 |         super().__init__(args)
17 |         self.dataframe = pd.DataFrame()
18 | 
19 |     def download_streaming_search(self, keyword = "apple", rounds = 3, delay = 2):
20 |         # establish session
21 |         self._connect_session()
22 | 
23 |         # download first page
24 |         self._download_first_page(keyword, delay = delay)
25 | 
26 |         # download the following pages
27 |         # self._download_other_pages(keyword)
28 |         print("Only support the first page now!")
29 | 
30 | 
31 |     def _connect_session(self):
32 |         # since the server will check cookies, we first need to
33 |         # request the main site without cookies, then finish
34 |         # searching for the stock information we want.
35 |         self.session = requests.session()
36 |         first_url = "https://pennystocks.com/"
37 |         headers = {
38 |             "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
39 |         }
40 |         print("Requesting https://pennystocks.com ...", end = " ")
41 |         res = self.session.get(headers = headers, url = first_url)
42 |         if res.status_code !=200:
43 |             raise ConnectionError("Can't request https://pennystocks.com.
Please check your connection or report this issue on Github") 44 | 45 | print("succeed!") 46 | 47 | def _download_first_page(self, keyword = "apple", max_retry = 5, delay = 2): 48 | url = f"https://pennystocks.com/?s={keyword}" 49 | headers = { 50 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36", 51 | } 52 | res = self.session.get(url = url, headers = headers) 53 | res = etree.HTML(res.text) 54 | articles = res.xpath("/html/body/div[3]/div/div[1]/div/div/div/main/div/div[1]/div/article") 55 | # not sure why but this really works 56 | 57 | while max_retry and len(articles) == 0: 58 | import time 59 | time.sleep(delay) 60 | print("Gathering again ..", end = ' ') 61 | res = requests.get(url = url, headers = headers, cookies=self.session.cookies) 62 | res = etree.HTML(res.text) 63 | articles = res.xpath("/html/body/div[3]/div/div[1]/div/div/div/main/div/div[1]/div/article") 64 | max_retry -= 1 65 | print(f"Remaining Retry: {max_retry}") 66 | 67 | 68 | for a in articles: 69 | title = a.xpath("./header/h2/a//text()")[0] 70 | time = a.xpath("./div[3]/div/div/ul/li[1]/text()")[0] 71 | brief = a.xpath("./div[3]/div/div/text()")[0] 72 | reading_time = a.xpath("./div[3]/div/div/ul/li[2]/text()")[0] 73 | columns = ["title", "time", "brief", "reading_time"] 74 | tmp = pd.DataFrame([[title, time, brief, reading_time]], columns=columns) 75 | self.dataframe = pd.concat([self.dataframe, tmp]) 76 | 77 | 78 | def _download_other_pages(self, keyword = "apple"): 79 | pass 80 | 81 | 82 | -------------------------------------------------------------------------------- /finnlp/data_sources/news/sina_finance_date_range.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pytz 3 | import time 4 | import requests 5 | import pandas as pd 6 | import numpy as np 7 | from lxml import etree 8 | from tqdm import tqdm 9 | from finnlp.data_sources.news._base import News_Downloader 10 | 11 | class Sina_Finance_Date_Range(News_Downloader): 12 | 13 | def __init__(self, args={}): 14 | super().__init__(args) 15 | self.dataframe = pd.DataFrame() 16 | 17 | def download_date_range_all(self, start_date, end_date): 18 | self.date_list = pd.date_range(start_date, end_date) 19 | for date in tqdm(self.date_list, desc= "Downloading Titles..."): 20 | tmp = self._gather_one_day(date) 21 | self.dataframe = pd.concat([self.dataframe, tmp]) 22 | self.dataframe = self.dataframe.reset_index(drop = True) 23 | 24 | def _gather_one_day(self, date, delay = 0.1): 25 | end_timestamp = pd.to_datetime(f"{date} 16:00:00").timestamp() 26 | start_timestamp = end_timestamp - 60 * 60 * 24 27 | 28 | res = pd.DataFrame() 29 | for page in range(100): 30 | url = f"https://feed.mix.sina.com.cn/api/roll/get?pageid=153&lid=2516&etime={start_timestamp}&stime={end_timestamp}&ctime={end_timestamp}&date={date}&k=&num=50&page={page}" 31 | response = self._request_get(url = url) 32 | if response is not None: 33 | response.encoding = 'unicode' 34 | text = response.text 35 | text = json.loads(text, strict=True) 36 | text = text["result"] 37 | text = text["data"] 38 | if len(text) == 0: 39 | break 40 | 41 | for i in text: 42 | for ii in i.keys(): 43 | i[ii] = [i[ii]] 44 | tmp = pd.DataFrame(i) 45 | res = pd.concat([res, tmp]) 46 | time.sleep(delay) 47 | 48 | if res.shape[0] != 0: 49 | res.ctime = pd.to_datetime(res.ctime, unit="s", utc=True) 50 | res.mtime = pd.to_datetime(res.mtime, unit="s", utc=True) 51 | res.intime = 
pd.to_datetime(res.intime, unit="s", utc=True) 52 | 53 | tz = pytz.timezone("Asia/Shanghai") 54 | res.ctime = [t.astimezone(tz) for t in res.ctime] 55 | res.mtime = [t.astimezone(tz) for t in res.mtime] 56 | res.intime = [t.astimezone(tz) for t in res.intime] 57 | 58 | return res 59 | 60 | def gather_content(self, delay = 0.01): 61 | pbar = tqdm(total = self.dataframe.shape[0], desc= "Gathering news contents") 62 | self.dataframe["content"] = self.dataframe.apply(lambda x:self._gather_content_apply(x, pbar, delay), axis = 1) 63 | 64 | def _gather_content_apply(self,x, pbar, delay = 0.01): 65 | url = x.url 66 | response = self._request_get(url=url) 67 | 68 | if response is not None: 69 | # process 70 | response.encoding = 'unicode' 71 | text = response.text 72 | page = etree.HTML(text) 73 | page = page.xpath("//*[@id='artibody']/p") 74 | page = [p.xpath(".//text()") for p in page] 75 | page = [''.join(p) for p in page] 76 | content = "\n".join(page) 77 | content = content.replace("\u3000","") 78 | else: 79 | content = np.nan 80 | 81 | # update 82 | pbar.update(1) 83 | time.sleep(delay) 84 | 85 | return content 86 | -------------------------------------------------------------------------------- /docs/FinNLP/site/assets/stylesheets/palette.a0c5b2b5.min.css.map: -------------------------------------------------------------------------------- 1 | {"version":3,"sources":["src/assets/stylesheets/palette/_scheme.scss","../../../src/assets/stylesheets/palette.scss","src/assets/stylesheets/palette/_accent.scss","src/assets/stylesheets/palette/_primary.scss","src/assets/stylesheets/utilities/_break.scss"],"names":[],"mappings":"AA2BA,cAGE,6BAKE,YAAA,CAGA,mDAAA,CACA,6DAAA,CACA,+DAAA,CACA,gEAAA,CACA,mDAAA,CACA,6DAAA,CACA,+DAAA,CACA,gEAAA,CAGA,gDAAA,CACA,gDAAA,CAGA,4BAAA,CACA,iCAAA,CACA,kCAAA,CACA,mCAAA,CACA,mCAAA,CACA,kCAAA,CACA,iCAAA,CACA,+CAAA,CACA,6DAAA,CACA,gEAAA,CACA,4DAAA,CACA,4DAAA,CACA,6DAAA,CAGA,6CAAA,CAGA,+CAAA,CAGA,iCAAA,CAGA,uDAAA,CACA,6DAAA,CACA,2DAAA,CAGA,yDAAA,CACA,iEAAA,CAGA,mDAAA,CACA,mDAAA,CAGA,qDAAA,CACA,wDAAA,CAGA,0DAAA,CAKA,8DAAA,CAKA,0DCxDF,CD6DE,kHAEE,YC3DJ,CD+DE,gHAEE,eC7DJ,CDoFE,yDACE,4BClFJ,CDiFE,2DACE,4BC/EJ,CD8EE,gEACE,4BC5EJ,CD2EE,2DACE,4BCzEJ,CDwEE,yDACE,4BCtEJ,CDqEE,0DACE,4BCnEJ,CDkEE,gEACE,4BChEJ,CD+DE,0DACE,4BC7DJ,CD4DE,2OACE,4BCjDJ,CDwDA,+FAGE,iCCtDF,CACF,CClDE,2BACE,4BAAA,CACA,2CAAA,CAOE,yBAAA,CACA,qCD8CN,CCxDE,4BACE,4BAAA,CACA,2CAAA,CAOE,yBAAA,CACA,qCDqDN,CC/DE,8BACE,4BAAA,CACA,2CAAA,CAOE,yBAAA,CACA,qCD4DN,CCtEE,mCACE,4BAAA,CACA,2CAAA,CAOE,yBAAA,CACA,qCDmEN,CC7EE,8BACE,4BAAA,CACA,2CAAA,CAOE,yBAAA,CACA,qCD0EN,CCpFE,4BACE,4BAAA,CACA,2CAAA,CAOE,yBAAA,CACA,qCDiFN,CC3FE,kCACE,4BAAA,CACA,2CAAA,CAOE,yBAAA,CACA,qCDwFN,CClGE,4BACE,4BAAA,CACA,2CAAA,CAOE,yBAAA,CACA,qCD+FN,CCzGE,4BACE,4BAAA,CACA,2CAAA,CAOE,yBAAA,CACA,qCDsGN,CChHE,6BACE,4BAAA,CACA,2CAAA,CAOE,yBAAA,CACA,qCD6GN,CCvHE,mCACE,4BAAA,CACA,2CAAA,CAOE,yBAAA,CACA,qCDoHN,CC9HE,4BACE,4BAAA,CACA,2CAAA,CAIE,8BAAA,CACA,qCD8HN,CCrIE,8BACE,4BAAA,CACA,2CAAA,CAIE,8BAAA,CACA,qCDqIN,CC5IE,6BACE,yBAAA,CACA,2CAAA,CAIE,8BAAA,CACA,qCD4IN,CCnJE,8BACE,4BAAA,CACA,2CAAA,CAIE,8BAAA,CACA,qCDmJN,CC1JE,mCACE,4BAAA,CACA,2CAAA,CAOE,yBAAA,CACA,qCDuJN,CE5JE,4BACE,6BAAA,CACA,oCAAA,CACA,mCAAA,CAOE,0BAAA,CACA,sCFyJN,CEpKE,6BACE,6BAAA,CACA,oCAAA,CACA,mCAAA,CAOE,0BAAA,CACA,sCFiKN,CE5KE,+BACE,6BAAA,CACA,oCAAA,CACA,mCAAA,CAOE,0BAAA,CACA,sCFyKN,CEpLE,oCACE,6BAAA,CACA,oCAAA,CACA,mCAAA,CAOE,0BAAA,CACA,sCFiLN,CE5LE,+BACE,6BAAA,CACA,oCAAA,CACA,mCAAA,CAOE,0BAAA,CACA,sCFyLN,CEpME,6BACE,6BAAA,CACA,oCAAA,CACA,mCAAA,CAOE,0BAAA,CACA,sCFiMN,CE5ME,mCACE,6BAAA,CACA,oCAA
A,CACA,mCAAA,CAOE,0BAAA,CACA,sCFyMN,CEpNE,6BACE,6BAAA,CACA,oCAAA,CACA,mCAAA,CAOE,0BAAA,CACA,sCFiNN,CE5NE,6BACE,6BAAA,CACA,oCAAA,CACA,mCAAA,CAOE,0BAAA,CACA,sCFyNN,CEpOE,8BACE,6BAAA,CACA,oCAAA,CACA,mCAAA,CAOE,0BAAA,CACA,sCFiON,CE5OE,oCACE,6BAAA,CACA,oCAAA,CACA,mCAAA,CAOE,0BAAA,CACA,sCFyON,CEpPE,6BACE,6BAAA,CACA,oCAAA,CACA,mCAAA,CAIE,+BAAA,CACA,sCFoPN,CE5PE,+BACE,6BAAA,CACA,oCAAA,CACA,mCAAA,CAIE,+BAAA,CACA,sCF4PN,CEpQE,8BACE,6BAAA,CACA,oCAAA,CACA,mCAAA,CAIE,+BAAA,CACA,sCFoQN,CE5QE,+BACE,6BAAA,CACA,oCAAA,CACA,mCAAA,CAIE,+BAAA,CACA,sCF4QN,CEpRE,oCACE,6BAAA,CACA,oCAAA,CACA,mCAAA,CAOE,0BAAA,CACA,sCFiRN,CE5RE,8BACE,6BAAA,CACA,oCAAA,CACA,mCAAA,CAOE,0BAAA,CACA,sCFyRN,CEpSE,6BACE,6BAAA,CACA,oCAAA,CACA,mCAAA,CAOE,0BAAA,CACA,sCAAA,CAKA,4BF6RN,CE7SE,kCACE,6BAAA,CACA,oCAAA,CACA,mCAAA,CAOE,0BAAA,CACA,sCAAA,CAKA,4BFsSN,CEvRE,sEACE,4BF0RJ,CE3RE,+DACE,4BF8RJ,CE/RE,iEACE,4BFkSJ,CEnSE,gEACE,4BFsSJ,CEvSE,iEACE,4BF0SJ,CEjSA,8BACE,0BAAA,CACA,sCAAA,CACA,qCAAA,CACA,+BAAA,CACA,sCAAA,CAGA,4BFkSF,CE/RE,yCACE,+BFiSJ,CE9RI,kDAEE,0CAAA,CACA,sCAAA,CAFA,UFkSN,CG9MI,mCD1EA,+CACE,0BF2RJ,CExRI,qDACE,0BF0RN,CErRE,iEACE,eFuRJ,CACF,CGzNI,sCDvDA,uCACE,oCFmRJ,CACF,CE1QA,8BACE,0BAAA,CACA,sCAAA,CACA,gCAAA,CACA,0BAAA,CACA,sCAAA,CAGA,4BF2QF,CExQE,yCACE,+BF0QJ,CEvQI,kDAEE,0CAAA,CACA,sCAAA,CAFA,UF2QN,CEpQE,yCACE,qBFsQJ,CG/NI,wCDhCA,8CACE,0BFkQJ,CACF,CGvPI,mCDJA,+CACE,0BF8PJ,CE3PI,qDACE,0BF6PN,CACF,CG5OI,wCDTA,iFACE,qBFwPJ,CACF,CGpQI,sCDmBA,uCACE,qBFoPJ,CACF","file":"palette.css"} -------------------------------------------------------------------------------- /docs/FinNLP/site/assets/javascripts/lunr/min/lunr.stemmer.support.min.js: -------------------------------------------------------------------------------- 1 | !function(r,t){"function"==typeof define&&define.amd?define(t):"object"==typeof exports?module.exports=t():t()(r.lunr)}(this,function(){return function(r){r.stemmerSupport={Among:function(r,t,i,s){if(this.toCharArray=function(r){for(var t=r.length,i=new Array(t),s=0;s=i&&(e-=i,t[e>>3]&1<<(7&e)))return this.cursor++,!0}return!1},in_grouping_b:function(t,i,s){if(this.cursor>this.limit_backward){var e=r.charCodeAt(this.cursor-1);if(e<=s&&e>=i&&(e-=i,t[e>>3]&1<<(7&e)))return this.cursor--,!0}return!1},out_grouping:function(t,i,s){if(this.cursors||e>3]&1<<(7&e)))return this.cursor++,!0}return!1},out_grouping_b:function(t,i,s){if(this.cursor>this.limit_backward){var e=r.charCodeAt(this.cursor-1);if(e>s||e>3]&1<<(7&e)))return this.cursor--,!0}return!1},eq_s:function(t,i){if(this.limit-this.cursor>1),f=0,l=o0||e==s||c)break;c=!0}}for(;;){var _=t[s];if(o>=_.s_size){if(this.cursor=n+_.s_size,!_.method)return _.result;var b=_.method();if(this.cursor=n+_.s_size,b)return _.result}if((s=_.substring_i)<0)return 0}},find_among_b:function(t,i){for(var s=0,e=i,n=this.cursor,u=this.limit_backward,o=0,h=0,c=!1;;){for(var a=s+(e-s>>1),f=0,l=o=0;m--){if(n-l==u){f=-1;break}if(f=r.charCodeAt(n-1-l)-_.s[m])break;l++}if(f<0?(e=a,h=l):(s=a,o=l),e-s<=1){if(s>0||e==s||c)break;c=!0}}for(;;){var _=t[s];if(o>=_.s_size){if(this.cursor=n-_.s_size,!_.method)return _.result;var b=_.method();if(this.cursor=n-_.s_size,b)return _.result}if((s=_.substring_i)<0)return 0}},replace_s:function(t,i,s){var e=s.length-(i-t),n=r.substring(0,t),u=r.substring(i);return r=n+s+u,this.limit+=e,this.cursor>=i?this.cursor+=e:this.cursor>t&&(this.cursor=t),e},slice_check:function(){if(this.bra<0||this.bra>this.ket||this.ket>this.limit||this.limit>r.length)throw"faulty slice 
operation"},slice_from:function(r){this.slice_check(),this.replace_s(this.bra,this.ket,r)},slice_del:function(){this.slice_from("")},insert:function(r,t,i){var s=this.replace_s(r,t,i);r<=this.bra&&(this.bra+=s),r<=this.ket&&(this.ket+=s)},slice_to:function(){return this.slice_check(),r.substring(this.bra,this.ket)},eq_v_b:function(r){return this.eq_s_b(r.length,r)}}}},r.trimmerSupport={generateTrimmer:function(r){var t=new RegExp("^[^"+r+"]+"),i=new RegExp("[^"+r+"]+$");return function(r){return"function"==typeof r.update?r.update(function(r){return r.replace(t,"").replace(i,"")}):r.replace(t,"").replace(i,"")}}}}}); -------------------------------------------------------------------------------- /finnlp/data_sources/company_announcement/sina.py: -------------------------------------------------------------------------------- 1 | import time 2 | import requests 3 | from lxml import etree 4 | from tqdm.notebook import tqdm 5 | import pandas as pd 6 | 7 | class Sina_Announcement_Downloader: 8 | 9 | def __init__(self, args = {}): 10 | pass 11 | 12 | def download(self, stock = "all",max_page = 100): 13 | page = 0 14 | df = pd.DataFrame() 15 | print(f"Getting page: ",end = "") 16 | while page < max_page: 17 | print(page, end = " ") 18 | headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0", 19 | 'Accept-Encoding':'gzip, deflate, br',} 20 | url = f"https://vip.stock.finance.sina.com.cn/corp/view/vCB_AllBulletin.php?stockid={stock}&Page={page}" 21 | response = requests.get(url = url,headers=headers) 22 | # response.encoding = "GBK" 23 | # print(response.content.decode('GBK')) 24 | text = response.content.decode('GBK') 25 | html = etree.HTML(text) 26 | 27 | # get announcement date 28 | date_list = html.xpath("/html/body/div[6]/div[2]/div[2]/table[2]/tr/td[2]/div[1]/ul/text()") 29 | if len(date_list) <= 0: 30 | break 31 | date_list = [date.strip('.\r').strip('.\n').strip('.\xa0').strip(' ') for date in date_list] 32 | date_list = [date for date in date_list if len(date) == 10] 33 | 34 | 35 | # get headlines and urls 36 | url_root = "https://vip.stock.finance.sina.com.cn" 37 | a_list = html.xpath("/html/body/div[6]/div[2]/div[2]/table[2]/tr/td[2]/div[1]/ul/a") 38 | headline_list = [a.xpath("./text()")[0] for a in a_list ] 39 | url_list = [url_root + a.xpath("./@href")[0] for a in a_list ] 40 | 41 | tmp_df = { 42 | "date": date_list, 43 | "headline": headline_list, 44 | "url": url_list, 45 | } 46 | tmp_df = pd.DataFrame(tmp_df) 47 | df = pd.concat([df,tmp_df]) 48 | page += 1 49 | 50 | 51 | with tqdm(total = df.shape[0],desc = "Getting Announcement content" ) as pbar: 52 | df["content"] = df.apply(lambda x: self.get_content(x,pbar), axis=1 ) 53 | 54 | df = df.reset_index(drop=True) 55 | 56 | return df 57 | 58 | def get_content(self,x,pbar,delay = 0.1): 59 | time.sleep(delay) 60 | url = x.url 61 | headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0", 62 | 'Accept-Encoding':'gzip, deflate, br',} 63 | response = requests.get(url = url,headers=headers) 64 | if response.status_code == 200: 65 | try: 66 | text = response.content.decode('GBK') 67 | html = etree.HTML(text) 68 | 69 | # clean content 70 | content_list = html.xpath("//*[@id='content']//text()") 71 | content_list = [content.strip('.\t').strip('.\n').strip('.\r') for content in content_list] 72 | content_list = [content for content in content_list if len(content) != 0] 73 | content = "".join(content_list) 74 | except: 75 | return "can't get 
content" 76 | else: 77 | return "can't get content" 78 | 79 | pbar.update(1) 80 | 81 | return content 82 | 83 | def clean_data(self): 84 | pass 85 | 86 | def transfer_standard_date_to_nonstandard(self,date): 87 | pass -------------------------------------------------------------------------------- /finnlp/data_sources/news/thefly_streaming.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | warnings.filterwarnings("ignore") 3 | import requests 4 | from lxml import etree 5 | from tqdm import tqdm 6 | import pandas as pd 7 | import json 8 | import time 9 | from finnlp.data_sources.news._base import News_Downloader 10 | 11 | # TODO: 12 | # 1. Contents 13 | # 2. More pages 14 | 15 | class TheFly_Streaming(News_Downloader): 16 | 17 | def __init__(self, args={}): 18 | super().__init__(args) 19 | self.dataframe = pd.DataFrame() 20 | 21 | def download_streaming_search(self, keyword = "AAPL",end_date = None, rounds = 3, delay = 0.5): 22 | # download first page 23 | self._download_first_page(keyword, delay = delay, end_date = end_date) 24 | 25 | # download the following pages 26 | # self._download_other_pages(keyword) 27 | print("Only support the first page now!") 28 | 29 | def _download_first_page(self, keyword = "AAPL", delay = 0.5, end_date = None): 30 | url = "https://thefly.com/news.php" 31 | headers = { 32 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36', 33 | } 34 | params = { 35 | 'fecha': end_date, 36 | 'market_stories': 'on', 37 | 'hot_stocks_filter': 'on', 38 | 'rumors_filter': 'on', 39 | 'general_news_filter': 'on', 40 | 'periodicals_filter': 'on', 41 | 'earnings_filter': 'on', 42 | 'technical_analysis_filter': 'on', 43 | 'options_filter': 'on', 44 | 'syndicates_filter': 'on', 45 | 'onthefly': 'on', 46 | 'insight_filter': 'on', 47 | 'market_mover_filter': 'on', 48 | 'e_inter_filter': 'on', 49 | 'mid_wrap_filter': 'on', 50 | 'sec_wrap_filter': 'on', 51 | 'analyst_wrap_filter': 'on', 52 | 'analyst_recommendations': 'on', 53 | 'upgrade_filter': 'on', 54 | 'downgrade_filter': 'on', 55 | 'initiate_filter': 'on', 56 | 'no_change_filter': 'on', 57 | 'events': 'on', 58 | 'symbol': keyword, 59 | } 60 | res = requests.get(url = url, headers= headers, params = params, verify=False) 61 | if res.status_code != 200: 62 | print(f'Connection Error: {res.status_code}') 63 | return f'Connection Error: {res.status_code}' 64 | 65 | res = etree.HTML(res.text) 66 | tables = res.xpath("/html/body/div[2]/div/div/div[1]/table")[1:] 67 | titles = [] 68 | stocks = [] 69 | abstracts = [] 70 | dates = [] 71 | times = [] 72 | for table in tables: 73 | trs = table.xpath("./tr") 74 | for tr in trs: 75 | title = tr.xpath("./td[2]/div[1]/a/span//text()") 76 | if len(title) > 0: 77 | titles.append(' '.join(title)) 78 | stocks.append(' '.join(tr.xpath("./td[2]/div[1]/div/span/text()"))) 79 | abstracts.append(' '.join(tr.xpath("./td[2]/div[2]/dd/p[1]/text()"))) 80 | dates.append(' '.join(tr.xpath("./td[2]/div[1]/span[2]/small/span[3]/text()"))) 81 | times.append(' '.join(tr.xpath("./td[2]/div[1]/span[2]/small/span[3]/div/text()"))) 82 | 83 | tmp = pd.DataFrame([titles, stocks, abstracts, dates, times]).T 84 | tmp.columns = ["title", "stock", "abstract", "date", "time"] 85 | self.dataframe = pd.concat([self.dataframe, tmp]) 86 | 87 | time.sleep(delay) 88 | -------------------------------------------------------------------------------- /finnlp/data_sources/news/talkmarkets_streaming.py: 
--------------------------------------------------------------------------------
1 | import warnings
2 | warnings.filterwarnings("ignore")
3 | import requests
4 | from lxml import etree
5 | from tqdm import tqdm
6 | import pandas as pd
7 | import json
8 | import time
9 | from finnlp.data_sources.news._base import News_Downloader
10 | 
11 | # TODO:
12 | # 1. Contents
13 | 
14 | class TalkMarkets_Streaming(News_Downloader):
15 | 
16 |     def __init__(self, args={}):
17 |         super().__init__(args)
18 |         self.dataframe = pd.DataFrame()
19 | 
20 |     def download_streaming_search(self, keyword = "apple", rounds = 3, delay = 0.5):
21 |         # 1. obtain cx
22 |         cx = self._obtain_cx(keyword)
23 | 
24 |         # 2. obtain cse token
25 |         cse_token = self._obtain_cse_token(cx)
26 | 
27 |         # 3. get content (Due to the limit of the platform, the max round is 10, about 100 news)
28 |         print("Downloading...", end = ' ')
29 |         for i in range(rounds):
30 |             url = "https://cse.google.com/cse/element/v1"
31 |             headers = {
32 |                 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36"
33 |             }
34 |             params = {
35 |                 'rsz': 'filtered_cse',
36 |                 'num': '20',
37 |                 'hl': 'en',
38 |                 'source': 'gcsc',
39 |                 'gss': '.com',
40 |                 'start': i*20,
41 |                 'cselibv': '827890a761694e44',
42 |                 'cx': cx,
43 |                 'q': keyword,
44 |                 'safe': 'off',
45 |                 'cse_tok': cse_token,
46 |                 'sort': 'date',
47 |                 'exp': 'csqr,cc',
48 |                 'callback': 'google.search.cse.api1861',
49 |             }
50 |             res = requests.get(url = url, headers= headers, params = params)
51 |             if res.status_code != 200:
52 |                 break
53 | 
54 |             res = eval(res.text[34:-2])
55 |             tmp = pd.DataFrame(res["results"])
56 |             self.dataframe = pd.concat([self.dataframe, tmp])
57 | 
58 |             time.sleep(delay)
59 |             print(i, end = ' ')
60 | 
61 |     def _obtain_cx(self, keyword):
62 |         url = "https://talkmarkets.com/search"
63 |         headers = {
64 |             "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36"
65 |         }
66 |         params = {
67 |             "tab": "General",
68 |             "searchQuery": keyword,
69 |         }
70 |         res = requests.get(url = url, headers= headers, params = params)
71 |         if res.status_code != 200:
72 |             print(f"Connection Error: {res.status_code}")
73 |             return f"Connection Error: {res.status_code}"
74 | 
75 |         res = etree.HTML(res.text)
76 |         cx = res.xpath('.//script[@type="text/javascript"][1]/text()')[1][40:73]
77 |         return cx
78 | 
79 |     def _obtain_cse_token(self, cx, ):
80 |         url = "https://cse.google.com/cse.js"
81 |         headers = {
82 |             "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36"
83 |         }
84 |         params = {
85 |             "cx": cx,
86 |         }
87 |         res = requests.get(url = url, headers= headers, params = params)
88 |         if res.status_code != 200:
89 |             print(f"Connection Error: {res.status_code}")
90 |             return f"Connection Error: {res.status_code}"
91 | 
92 |         text = res.text
93 |         cse_token = text[5744:5786]
94 |         return cse_token
95 | 
96 | 
--------------------------------------------------------------------------------
/docs/FinNLP/site/assets/javascripts/lunr/min/lunr.ko.min.js:
--------------------------------------------------------------------------------
1 | !function(e,r){"function"==typeof define&&define.amd?define(r):"object"==typeof exports?module.exports=r():r()(e.lunr)}(this,function(){return function(e){if(void 0===e)throw new Error("Lunr is not present.
Please include / require Lunr before this script.");if(void 0===e.stemmerSupport)throw new Error("Lunr stemmer support is not present. Please include / require Lunr stemmer support before this script.");e.ko=function(){this.pipeline.reset(),this.pipeline.add(e.ko.trimmer,e.ko.stopWordFilter)},e.ko.wordCharacters="[A-Za-z가-힯a]",e.ko.trimmer=e.trimmerSupport.generateTrimmer(e.ko.wordCharacters),e.Pipeline.registerFunction(e.ko.trimmer,"trimmer-ko"),e.ko.stopWordFilter=e.generateStopWordFilter("아 휴 아이구 아이쿠 아이고 어 나 우리 저희 따라 의해 을 를 에 의 가 으로 로 에게 뿐이다 의거하여 근거하여 입각하여 기준으로 예하면 예를 들면 예를 들자면 저 소인 소생 저희 지말고 하지마 하지마라 다른 물론 또한 그리고 비길수 없다 해서는 안된다 뿐만 아니라 만이 아니다 만은 아니다 막론하고 관계없이 그치지 않다 그러나 그런데 하지만 든간에 논하지 않다 따지지 않다 설사 비록 더라도 아니면 만 못하다 하는 편이 낫다 불문하고 향하여 향해서 향하다 쪽으로 틈타 이용하여 타다 오르다 제외하고 이 외에 이 밖에 하여야 비로소 한다면 몰라도 외에도 이곳 여기 부터 기점으로 따라서 할 생각이다 하려고하다 이리하여 그리하여 그렇게 함으로써 하지만 일때 할때 앞에서 중에서 보는데서 으로써 로써 까지 해야한다 일것이다 반드시 할줄알다 할수있다 할수있어 임에 틀림없다 한다면 등 등등 제 겨우 단지 다만 할뿐 딩동 댕그 대해서 대하여 대하면 훨씬 얼마나 얼마만큼 얼마큼 남짓 여 얼마간 약간 다소 좀 조금 다수 몇 얼마 지만 하물며 또한 그러나 그렇지만 하지만 이외에도 대해 말하자면 뿐이다 다음에 반대로 반대로 말하자면 이와 반대로 바꾸어서 말하면 바꾸어서 한다면 만약 그렇지않으면 까악 툭 딱 삐걱거리다 보드득 비걱거리다 꽈당 응당 해야한다 에 가서 각 각각 여러분 각종 각자 제각기 하도록하다 와 과 그러므로 그래서 고로 한 까닭에 하기 때문에 거니와 이지만 대하여 관하여 관한 과연 실로 아니나다를가 생각한대로 진짜로 한적이있다 하곤하였다 하 하하 허허 아하 거바 와 오 왜 어째서 무엇때문에 어찌 하겠는가 무슨 어디 어느곳 더군다나 하물며 더욱이는 어느때 언제 야 이봐 어이 여보시오 흐흐 흥 휴 헉헉 헐떡헐떡 영차 여차 어기여차 끙끙 아야 앗 아야 콸콸 졸졸 좍좍 뚝뚝 주룩주룩 솨 우르르 그래도 또 그리고 바꾸어말하면 바꾸어말하자면 혹은 혹시 답다 및 그에 따르는 때가 되어 즉 지든지 설령 가령 하더라도 할지라도 일지라도 지든지 몇 거의 하마터면 인젠 이젠 된바에야 된이상 만큼\t어찌됏든 그위에 게다가 점에서 보아 비추어 보아 고려하면 하게될것이다 일것이다 비교적 좀 보다더 비하면 시키다 하게하다 할만하다 의해서 연이서 이어서 잇따라 뒤따라 뒤이어 결국 의지하여 기대여 통하여 자마자 더욱더 불구하고 얼마든지 마음대로 주저하지 않고 곧 즉시 바로 당장 하자마자 밖에 안된다 하면된다 그래 그렇지 요컨대 다시 말하자면 바꿔 말하면 즉 구체적으로 말하자면 시작하여 시초에 이상 허 헉 허걱 바와같이 해도좋다 해도된다 게다가 더구나 하물며 와르르 팍 퍽 펄렁 동안 이래 하고있었다 이었다 에서 로부터 까지 예하면 했어요 해요 함께 같이 더불어 마저 마저도 양자 모두 습니다 가까스로 하려고하다 즈음하여 다른 다른 방면으로 해봐요 습니까 했어요 말할것도 없고 무릎쓰고 개의치않고 하는것만 못하다 하는것이 낫다 매 매번 들 모 어느것 어느 로써 갖고말하자면 어디 어느쪽 어느것 어느해 어느 년도 라 해도 언젠가 어떤것 어느것 저기 저쪽 저것 그때 그럼 그러면 요만한걸 그래 그때 저것만큼 그저 이르기까지 할 줄 안다 할 힘이 있다 너 너희 당신 어찌 설마 차라리 할지언정 할지라도 할망정 할지언정 구토하다 게우다 토하다 메쓰겁다 옆사람 퉤 쳇 의거하여 근거하여 의해 따라 힘입어 그 다음 버금 두번째로 기타 첫번째로 나머지는 그중에서 견지에서 형식으로 쓰여 입장에서 위해서 단지 의해되다 하도록시키다 뿐만아니라 반대로 전후 전자 앞의것 잠시 잠깐 하면서 그렇지만 다음에 그러한즉 그런즉 남들 아무거나 어찌하든지 같다 비슷하다 예컨대 이럴정도로 어떻게 만약 만일 위에서 서술한바와같이 인 듯하다 하지 않는다면 만약에 무엇 무슨 어느 어떤 아래윗 조차 한데 그럼에도 불구하고 여전히 심지어 까지도 조차도 하지 않도록 않기 위하여 때 시각 무렵 시간 동안 어때 어떠한 하여금 네 예 우선 누구 누가 알겠는가 아무도 줄은모른다 줄은 몰랏다 하는 김에 겸사겸사 하는바 그런 까닭에 한 이유는 그러니 그러니까 때문에 그 너희 그들 너희들 타인 것 것들 너 위하여 공동으로 동시에 하기 위하여 어찌하여 무엇때문에 붕붕 윙윙 나 우리 엉엉 휘익 윙윙 오호 아하 어쨋든 만 못하다\t하기보다는 차라리 하는 편이 낫다 흐흐 놀라다 상대적으로 말하자면 마치 아니라면 쉿 그렇지 않으면 그렇지 않다면 안 그러면 아니었다면 하든지 아니면 이라면 좋아 알았어 하는것도 그만이다 어쩔수 없다 하나 일 일반적으로 일단 한켠으로는 오자마자 이렇게되면 이와같다면 전부 한마디 한항목 근거로 하기에 아울러 하지 않도록 않기 위해서 이르기까지 이 되다 로 인하여 까닭으로 이유만으로 이로 인하여 그래서 이 때문에 그러므로 그런 까닭에 알 수 있다 결론을 낼 수 있다 으로 인하여 있다 어떤것 관계가 있다 관련이 있다 연관되다 어떤것들 에 대해 이리하여 그리하여 여부 하기보다는 하느니 하면 할수록 운운 이러이러하다 하구나 하도다 다시말하면 다음으로 에 있다 에 달려 있다 우리 우리들 오히려 하기는한데 어떻게 어떻해 어찌됏어 어때 어째서 본대로 자 이 이쪽 여기 이것 이번 이렇게말하자면 이런 이러한 이와 같은 요만큼 요만한 것 얼마 안 되는 것 이만큼 이 정도의 이렇게 많은 것 이와 같다 이때 이렇구나 것과 같이 끼익 삐걱 따위 와 같은 사람들 부류의 사람들 왜냐하면 중의하나 오직 오로지 에 한하다 하기만 하면 도착하다 까지 미치다 도달하다 정도에 이르다 할 지경이다 결과에 이르다 관해서는 여러분 하고 있다 한 후 혼자 자기 자기집 자신 우에 종합한것과같이 총적으로 보면 총적으로 말하면 총적으로 대로 하다 으로서 참 그만이다 할 따름이다 쿵 탕탕 쾅쾅 둥둥 봐 봐라 아이야 아니 와아 응 아이 참나 년 월 일 령 영 일 이 삼 사 오 육 륙 칠 팔 구 이천육 이천칠 이천팔 이천구 하나 둘 셋 넷 다섯 여섯 일곱 여덟 아홉 령 영".split(" ")),e.Pipeline.registerFunction(e.ko.stopWordFilter,"stopWordFilter-ko"),e.ko.stemmer=function(){return function(e){return"function"==typeof e.update?e.update(function(e){return 
e}):e}}(),e.Pipeline.registerFunction(e.ko.stemmer,"stemmer-ko")}}); -------------------------------------------------------------------------------- /finnlp/benchmarks/fiqa.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | warnings.filterwarnings("ignore") 3 | 4 | from sklearn.metrics import accuracy_score,f1_score 5 | from datasets import load_dataset 6 | from tqdm import tqdm 7 | import datasets 8 | import torch 9 | 10 | def format_example(example: dict) -> dict: 11 | context = f"Instruction: {example['instruction']}\n" 12 | if example.get("input"): 13 | context += f"Input: {example['input']}\n" 14 | context += "Answer: " 15 | target = example["output"] 16 | return {"context": context, "target": target} 17 | 18 | def add_instructions(x): 19 | if x.format == "post": 20 | return "What is the sentiment of this tweet? Please choose an answer from {negative/neutral/positive}." 21 | else: 22 | return "What is the sentiment of this news? Please choose an answer from {negative/neutral/positive}." 23 | 24 | def make_label(x): 25 | if x < - 0.1: return "negative" 26 | elif x >=-0.1 and x < 0.1: return "neutral" 27 | elif x >= 0.1: return "positive" 28 | 29 | def change_target(x): 30 | if 'positive' in x or 'Positive' in x: 31 | return 'positive' 32 | elif 'negative' in x or 'Negative' in x: 33 | return 'negative' 34 | else: 35 | return 'neutral' 36 | 37 | def test_fiqa(model, tokenizer, batch_size = 8, prompt_fun = None ): 38 | dataset = load_dataset('pauri32/fiqa-2018') 39 | dataset = datasets.concatenate_datasets([dataset["train"], dataset["validation"] ,dataset["test"] ]) 40 | dataset = dataset.train_test_split(0.226, seed = 42)['test'] 41 | dataset = dataset.to_pandas() 42 | dataset["output"] = dataset.sentiment_score.apply(make_label) 43 | if prompt_fun is None: 44 | dataset["instruction"] = "What is the sentiment of this news? Please choose an answer from {negative/neutral/positive}." 45 | else: 46 | dataset["instruction"] = dataset.apply(prompt_fun, axis = 1) 47 | 48 | dataset = dataset[['sentence', 'output',"instruction"]] 49 | dataset.columns = ["input", "output","instruction"] 50 | dataset[["context","target"]] = dataset.apply(format_example, axis = 1, result_type="expand") 51 | 52 | # print example 53 | print(f"\n\nPrompt example:\n{dataset['context'][0]}\n\n") 54 | 55 | context = dataset['context'].tolist() 56 | total_steps = dataset.shape[0]//batch_size + 1 57 | print(f"Total len: {len(context)}. Batchsize: {batch_size}. 
Total steps: {total_steps}") 58 | 59 | out_text_list = [] 60 | 61 | for i in tqdm(range(total_steps)): 62 | tmp_context = context[i* batch_size:(i+1)* batch_size] 63 | tokens = tokenizer(tmp_context, return_tensors='pt', padding=True, max_length=512) 64 | # tokens.pop('token_type_ids') 65 | for k in tokens.keys(): 66 | tokens[k] = tokens[k].cuda() 67 | 68 | res = model.generate(**tokens, max_length=512) 69 | res_sentences = [tokenizer.decode(i) for i in res] 70 | out_text = [o.split("Answer: ")[1] for o in res_sentences] 71 | out_text_list += out_text 72 | torch.cuda.empty_cache() 73 | 74 | dataset["out_text"] = out_text_list 75 | dataset["new_target"] = dataset["target"].apply(change_target) 76 | dataset["new_out"] = dataset["out_text"].apply(change_target) 77 | 78 | acc = accuracy_score(dataset["new_target"], dataset["new_out"]) 79 | f1_macro = f1_score(dataset["new_target"], dataset["new_out"], average = "macro") 80 | f1_micro = f1_score(dataset["new_target"], dataset["new_out"], average = "micro") 81 | f1_weighted = f1_score(dataset["new_target"], dataset["new_out"], average = "weighted") 82 | 83 | print(f"Acc: {acc}. F1 macro: {f1_macro}. F1 micro: {f1_micro}. F1 weighted (BloombergGPT): {f1_weighted}. ") 84 | 85 | return dataset -------------------------------------------------------------------------------- /finnlp/data_sources/social_media/reddit_streaming.py: -------------------------------------------------------------------------------- 1 | from finnlp.data_sources.social_media._base import Social_Media_Downloader 2 | 3 | from tqdm import tqdm 4 | from lxml import etree 5 | import requests 6 | import pandas as pd 7 | import json 8 | import base64 9 | 10 | class Reddit_Streaming(Social_Media_Downloader): 11 | 12 | def __init__(self, args = {}): 13 | super().__init__(args) 14 | self.dataframe = pd.DataFrame() 15 | 16 | def download_streaming_all(self, rounds = 3): 17 | # Download the first page by url 18 | base_url = "https://www.reddit.com/r/wallstreetbets/new/" 19 | pbar = tqdm(total= rounds, desc= "Downloading by pages...") 20 | res = self._request_get(base_url) 21 | if res is None: 22 | raise ConnectionError 23 | 24 | # get the info from init page 25 | html = etree.HTML(res.text) 26 | init = html.xpath("//*[@id='data']/text()")[0] 27 | init = json.loads(init[14:][:-1]) 28 | init = init["posts"]["models"] 29 | tmp_df = pd.DataFrame(init).T.reset_index(drop = True) 30 | self.dataframe = tmp_df 31 | init = [i for i in init if len(i)< 12] 32 | last_id = init[-1] 33 | last_id = self._encode_base64(last_id) 34 | 35 | pbar.update(1) 36 | 37 | # fetch other pages 38 | if rounds > 1: 39 | for _ in range(1,rounds): 40 | last_id = self._fatch_other_pages(last_id, pbar) 41 | 42 | def _fatch_other_pages(self, last_page, pbar): 43 | url = 'https://gql.reddit.com/' 44 | headers = { 45 | "referer":"https://www.reddit.com/", 46 | "authorization": "Bearer -twjFZkBAlpR8gZnZqsGHvz-G5c49PA", 47 | "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36" 48 | } 49 | data = { 50 | "id": "02e3b6d0d0d7", 51 | "variables": { 52 | "name": "wallstreetbets", 53 | "includeIdentity": False, 54 | "adContext": { 55 | "layout": "CARD", 56 | "clientSignalSessionData": { 57 | "adsSeenCount": 4, 58 | "totalPostsSeenCount": 79, 59 | "sessionStartTime": "2023-04-07T15:32:13.933Z", 60 | } 61 | }, 62 | "isFake": False, 63 | "includeAppliedFlair": False, 64 | "includeDevPlatformMetadata": True, 65 | "includeRecents": False, 66 | "includeTrending": 
False, 67 | "includeSubredditRankings": True, 68 | "includeSubredditChannels": False, 69 | "isAdHocMulti": False, 70 | "isAll": False, 71 | "isLoggedOutGatedOptedin": False, 72 | "isLoggedOutQuarantineOptedin": False, 73 | "isPopular": False, 74 | "recentPostIds": [], 75 | "subredditNames": [], 76 | "sort": "NEW", 77 | "pageSize": 25, 78 | "after": last_page 79 | } 80 | } 81 | response = self._request_post(url = url, headers= headers, json = data) 82 | data = json.loads(response.text) 83 | data = data["data"]["subredditInfoByName"]["elements"]["edges"] 84 | for d in data: 85 | if d["node"]["__typename"] == "SubredditPost": 86 | tmp = pd.DataFrame(d).T 87 | self.dataframe = pd.concat([self.dataframe, tmp]) 88 | last_id = tmp.id.values[0] 89 | 90 | last_id = self._encode_base64(last_id) 91 | pbar.update(1) 92 | 93 | return last_id 94 | 95 | def _encode_base64(self,id): 96 | return base64.b64encode(id.encode('utf-8')).decode() -------------------------------------------------------------------------------- /finnlp/data_sources/social_media/twitter_date_range.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | warnings.filterwarnings("ignore") 3 | 4 | from finnlp.data_sources.social_media._base import Social_Media_Downloader 5 | 6 | import requests 7 | from urllib import parse 8 | from tqdm import tqdm 9 | from datetime import datetime,timedelta 10 | import pandas as pd 11 | import json 12 | import time 13 | 14 | class Twitter_Date_Range(Social_Media_Downloader): 15 | 16 | def __init__(self, args = {}): 17 | super().__init__(args) 18 | self.dataframe = pd.DataFrame() 19 | 20 | def download_date_range_stock(self, start_date, end_date, stock = "AAPL"): 21 | self.date_list = pd.date_range(start_date,end_date) 22 | res = pd.DataFrame() 23 | for date in tqdm(self.date_list, desc= "Downloading by day... 
"): 24 | tmp = self._gather_one_day(date,stock) 25 | res = pd.concat([res,tmp]) 26 | 27 | res.created_at = pd.to_datetime(res.created_at) 28 | res = res.sort_values("created_at") 29 | res = res.reset_index(drop=True) 30 | # res = res.query(f"created_at >= @start_date & created_at <= @end_date") 31 | res = res[res.created_at >= start_date][res.created_at <= end_date] 32 | res = res.reset_index(drop=True) 33 | self.dataframe = res 34 | 35 | def _gather_one_day(self, date, stock = "AAPL", pbar = None ,delay = 0.01): 36 | time.sleep(delay) 37 | next_date = date + timedelta(days=1) 38 | date = datetime.strftime(date, "%Y-%m-%d") 39 | next_date = datetime.strftime(next_date, "%Y-%m-%d") 40 | 41 | url = "https://twitter.com/i/api/2/search/adaptive.json?include_profile_interstitial_type=1&include_blocking=1&include_blocked_by=1&include_followed_by=1&include_want_retweets=1&include_mute_edge=1&include_can_dm=1&include_can_media_tag=1&skip_status=1&cards_platform=Web-12&include_cards=1&include_ext_alt_text=true&include_quote_count=true&include_reply_count=1&tweet_mode=extended&include_entities=true&include_user_entities=true&include_ext_media_color=true&include_ext_media_availability=true&send_error_codes=true&simple_quoted_tweet=true&q={}&count=20&query_source=typed_query&pc=1&spelling_corrections=1&ext=mediaStats%2ChighlightedLabel%2CvoiceInfo" 42 | url_token = 'https://api.twitter.com/1.1/guest/activate.json' 43 | headers = { 44 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36', 45 | 'Accept': '*/*', 46 | 'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2', 47 | 'x-guest-token': '', 48 | 'x-twitter-client-language': 'zh-cn', 49 | 'x-twitter-active-user': 'yes', 50 | 'x-csrf-token': '25ea9d09196a6ba850201d47d7e75733', 51 | 'Sec-Fetch-Dest': 'empty', 52 | 'Sec-Fetch-Mode': 'cors', 53 | 'Sec-Fetch-Site': 'same-origin', 54 | 'authorization': 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA', 55 | 'Referer': 'https://twitter.com/', 56 | 'Connection': 'keep-alive', 57 | } 58 | 59 | q = f'{stock} until:{next_date} since:{date}' 60 | token = json.loads(requests.post(url_token, headers = headers).text)['guest_token'] 61 | print(token) 62 | headers['x-guest-token'] = token 63 | url = url.format(parse.quote(q)) 64 | print(url) 65 | res = self._request_get(url, headers = headers) 66 | print(res) 67 | if res is not None: 68 | try: 69 | res = json.loads(res.text) 70 | res = pd.DataFrame(res["globalObjects"]["tweets"]).T.sort_values("created_at") 71 | except: 72 | res = pd.DataFrame() 73 | else: 74 | res = pd.DataFrame() 75 | 76 | return res 77 | -------------------------------------------------------------------------------- /finnlp/data_sources/social_media/facebook_streaming.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | warnings.filterwarnings("ignore") 3 | import requests 4 | from lxml import etree 5 | from tqdm import tqdm 6 | import pandas as pd 7 | import json 8 | import time 9 | from finnlp.data_sources.social_media._base import Social_Media_Downloader 10 | 11 | # TODO: 12 | # 1. 
Better performance 13 | 14 | import json 15 | import time 16 | import numpy as np 17 | 18 | from selenium import webdriver 19 | from selenium.webdriver.common.by import By 20 | 21 | class Facebook_Streaming(Social_Media_Downloader): 22 | def __init__(self, args = {}): 23 | super().__init__(args) 24 | self.dataframe = pd.DataFrame() 25 | self.cookies = args["cookies"] 26 | self.stealth_path = args["stealth_path"] 27 | self.headless = args["headless"] if "headless" in args.keys() else True 28 | 29 | def download_streaming_stock(self, keyword = "AAPL", rounds = 3, delay = 0.5): 30 | # init 31 | self._init_opt() 32 | 33 | # search for the keyword 34 | search_url = "https://m.facebook.com/search_results/?q=" + keyword 35 | self.browser.get(search_url) 36 | 37 | # click on the posts 38 | post_element = self.browser.find_elements(By.XPATH, "/html/body/div[2]/div/div[2]/div[3]/div[1]")[0] 39 | post_element.click() 40 | time.sleep(5) 41 | 42 | # click on recent posts 43 | post_element = self.browser.find_elements(By.XPATH, "/html/body/div[2]/div/div[2]/div[3]/div[1]")[0] 44 | post_element.click() 45 | time.sleep(5) 46 | 47 | # get data 48 | all = [] 49 | title_divs = self.browser.find_elements(By.XPATH, "/html/body/div[2]/div/div[2]/div") 50 | for title_div in tqdm(title_divs): 51 | # title 52 | try: 53 | title = title_div.find_elements(By.XPATH,"./div[2]/div/div/div[2]/div/div/div/div") 54 | if len(title)>0: 55 | title = title[0].text 56 | else: 57 | title = np.nan 58 | except Exception as e: 59 | print(e) 60 | title = np.nan 61 | 62 | # time 63 | try: 64 | time_element = title_div.find_elements(By.XPATH, './div[2]/div/div/div[1]/div/div/div/div[2]/div[2]/div/span') 65 | if len(time_element)>0: 66 | time_ = time_element[0].text 67 | else: 68 | time_ = np.nan 69 | except: 70 | time_ = np.nan 71 | all.append((title, time_)) 72 | 73 | # close browser 74 | self.browser.close() 75 | 76 | tmp = pd.DataFrame(all, columns=["content", "date"]) 77 | self.dataframe = pd.concat([self.dataframe, tmp]) 78 | self.dataframe = self.dataframe.dropna(how="all") 79 | 80 | print("Only support the first page now!") 81 | 82 | 83 | def _init_opt(self): 84 | self.chromeOptions = webdriver.ChromeOptions() 85 | if self.headless: 86 | self.chromeOptions.add_argument('--headless') 87 | self.chromeOptions.add_argument('--disable-blink-features=AutomationControlled') 88 | self.chromeOptions.add_argument("--user-agent=Mozilla/5.0 (iPhone; CPU iPhone OS 14_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1") 89 | 90 | self.chromeOptions.add_experimental_option('excludeSwitches', ['enable-automation']) 91 | self.browser = webdriver.Chrome(options=self.chromeOptions) 92 | with open(self.stealth_path) as f: 93 | js = f.read() 94 | self.browser.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", { 95 | "source": js 96 | }) 97 | self.browser.get('https://m.facebook.com/') 98 | self.browser.delete_all_cookies() 99 | for i in self.cookies: 100 | self.browser.add_cookie(i) 101 | 102 | self.browser.implicitly_wait(2) 103 | 104 | -------------------------------------------------------------------------------- /finnlp/data_sources/sec_filings/main.py: -------------------------------------------------------------------------------- 1 | from finnlp.data_sources.sec_filings.sec_filings import SECExtractor 2 | import concurrent.futures 3 | import json 4 | import os 5 | import time 6 | from collections import defaultdict 7 | from typing import List 8 | 9 | class SECFilingsLoader(): 10 | """ 11 
| SEC Filings loader 12 | Get the SEC filings of multiple tickers 13 | """ 14 | 15 | def __init__( 16 | self, 17 | tickers: List[str], 18 | amount: int, 19 | filing_type: str = "10-K", 20 | num_workers: int = 2, 21 | include_amends: bool = False, 22 | folder_name:str = "data" 23 | ): 24 | assert filing_type in [ 25 | "10-K", 26 | "10-Q", 27 | ], "The supported document types are 10-K and 10-Q" 28 | 29 | self.tickers = tickers 30 | self.amount = amount 31 | self.filing_type = filing_type 32 | self.num_workers = num_workers 33 | self.include_amends = include_amends 34 | 35 | self.se = SECExtractor( 36 | tickers, amount, filing_type, include_amends=include_amends 37 | ) 38 | self.folder_name = folder_name 39 | os.makedirs(self.folder_name, exist_ok=True) 40 | 41 | def multiprocess_run(self, tic): 42 | tic_dict = self.se.get_accession_numbers(tic) 43 | text_dict = defaultdict(list) 44 | for tic, fields in tic_dict.items(): 45 | os.makedirs(f"{self.folder_name}/{tic}", exist_ok=True) 46 | print(f"Started for {tic}") 47 | 48 | field_urls = [field["url"] for field in fields] 49 | years = [field["year"] for field in fields] 50 | with concurrent.futures.ProcessPoolExecutor( 51 | max_workers=self.num_workers 52 | ) as executor: 53 | results = executor.map(self.se.get_text_from_url, field_urls) 54 | for idx, res in enumerate(results): 55 | all_text, filing_type = res 56 | text_dict[tic].append( 57 | { 58 | "year": years[idx], 59 | "ticker": tic, 60 | "all_texts": all_text, 61 | "filing_type": filing_type, 62 | } 63 | ) 64 | return text_dict 65 | 66 | def load_data(self): 67 | start = time.time() 68 | thread_workers = min(len(self.tickers), self.num_workers) 69 | with concurrent.futures.ThreadPoolExecutor( 70 | max_workers=thread_workers 71 | ) as executor: 72 | results = executor.map(self.multiprocess_run, self.tickers) 73 | 74 | for res in results: 75 | curr_tic = list(res.keys())[0] 76 | for data in res[curr_tic]: 77 | curr_year = data["year"] 78 | curr_filing_type = data["filing_type"] 79 | if curr_filing_type in ["10-K/A", "10-Q/A"]: 80 | curr_filing_type = curr_filing_type.replace("/", "") 81 | if curr_filing_type in ["10-K", "10-KA"]: 82 | os.makedirs(f"{self.folder_name}/{curr_tic}/{curr_year}", exist_ok=True) 83 | with open( 84 | f"{self.folder_name}/{curr_tic}/{curr_year}/{curr_filing_type}.json", "w" 85 | ) as f: 86 | json.dump(data, f, indent=4) 87 | elif curr_filing_type in ["10-Q", "10-QA"]: 88 | os.makedirs(f"{self.folder_name}/{curr_tic}/{curr_year[:-2]}", exist_ok=True) 89 | with open( 90 | f"{self.folder_name}/{curr_tic}/{curr_year[:-2]}/{curr_filing_type}_{curr_year[-2:]}.json", 91 | "w", 92 | ) as f: 93 | json.dump(data, f, indent=4) 94 | print( 95 | f"Done for {curr_tic} for document {curr_filing_type} and year" 96 | f" {curr_year}" 97 | ) 98 | 99 | print(f"It took {round(time.time()-start,2)} seconds") 100 | -------------------------------------------------------------------------------- /finnlp/data_engineering/data_cleaning.py: -------------------------------------------------------------------------------- 1 | import re 2 | import os 3 | from transformers import BertTokenizer 4 | from datasketch import MinHash, MinHashLSH 5 | from nltk import ngrams 6 | 7 | # junk data 8 | def junk_eliminate(df, re_expression = r'[&#<>{}\[\]\\]', threshold=0.01, min_len=10): 9 | RE_SUSPICIOUS = re.compile(re_expression) 10 | def impurity(text, min_len=min_len): 11 | """returns the share of suspicious characters in a text""" 12 | if text == None or len(text) < min_len: 13 | return 0 14 | else: 
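            # impurity = number of characters matching the suspicious pattern divided by
            # the total number of characters in the text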
15 |             return len(RE_SUSPICIOUS.findall(text))/len(text)
16 |     df['impurity'] = df['text'].apply(impurity, min_len=min_len)
17 |     total_num_docs = len(df)
18 |     impurity_num_docs = len(df[df['impurity'] >= threshold])
19 |     impurity_ratio = impurity_num_docs / total_num_docs
20 |     purity_df = df[df['impurity'] < threshold]
21 |     return purity_df, impurity_ratio
22 | 
23 | # Biased Content
24 | def toxic_eliminate(df, l_kind='en'):
25 |     '''
26 |     l_kind = ['en', 'zh']
27 |     '''
28 |     os.system(f"wget https://raw.githubusercontent.com/LDNOOBW/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words/master/{l_kind}")
29 |     with open(f'./{l_kind}', 'r') as f:
30 |         lines = f.readlines()
31 |     banned_words = set([line.rstrip('\n') for line in lines])
32 |     df['banned_words_in_text'] = df['text'].apply(lambda text: [word for word in banned_words if word in text.lower().split()])
33 |     df['matches'] = df['banned_words_in_text'].apply(lambda words: len(words) > 0)
34 |     total_num_docs = len(df)
35 |     biased_num_docs = df['matches'].sum()
36 |     biased_content_ratio = biased_num_docs / total_num_docs
37 |     non_toxic_df = df[df['matches'] == 0]
38 |     return non_toxic_df, biased_content_ratio
39 | 
40 | # Too Short Document
41 | def short_eliminate(df, tokenizer = BertTokenizer.from_pretrained('bert-base-uncased'), min_len=100):
42 |     # Create a new column with the number of tokens for each text
43 |     df['text_length'] = df['text'].apply(lambda text: len(tokenizer.tokenize(text)))
44 |     total_num_docs = len(df)
45 |     too_short_docs = len(df[df['text_length'] <= min_len])
46 |     too_short_doc_ratio = too_short_docs / total_num_docs
47 |     not_short_df = df[df['text_length'] > min_len]
48 |     return not_short_df, too_short_doc_ratio
49 | 
50 | # Contamination
51 | def process_data(df):
52 |     minhashes = {}
53 |     for idx, text in enumerate(df['text']):
54 |         minhash = MinHash(num_perm=128)
55 |         for d in ngrams(text, 13):
56 |             s = "".join(d).encode('utf-8')
57 |             minhash.update(s)
58 |         minhashes[idx] = minhash
59 |     return minhashes
60 | 
61 | def contamination_eliminate(train_dataset, test_dataset):
62 |     train_minhashes = process_data(train_dataset)
63 |     test_minhashes = process_data(test_dataset)
64 | 
65 | 
66 |     lsh = MinHashLSH(threshold=0.8, num_perm=128)
67 | 
68 |     for idx, minhash in train_minhashes.items():
69 |         lsh.insert(idx, minhash)
70 | 
71 |     duplicates_count = 0
72 |     for idx, minhash in test_minhashes.items():
73 |         result = lsh.query(minhash)
74 |         if len(result) > 0:
75 |             duplicates_count += 1
76 |     contamination_ratio = duplicates_count / len(test_dataset)
77 |     return contamination_ratio
78 | 
79 | # Duplication
80 | def duplication_eliminate(df):
81 |     lsh = MinHashLSH(threshold=0.85, num_perm=128)
82 |     for i, text in enumerate(df['text']):
83 |         minhash = MinHash(num_perm=128)
84 |         for word in text.split():
85 |             minhash.update(word.encode('utf-8'))
86 |         lsh.insert(str(i), minhash)
87 | 
88 |     unique_documents = set()
89 | 
90 |     for i, text in enumerate(df['text']):
91 |         query_minhash = MinHash(num_perm=128)
92 |         for word in text.split():
93 |             query_minhash.update(word.encode('utf-8'))
94 |         results = lsh.query(query_minhash)
95 |         try:
96 |             unique_documents.add(results[0])
97 |         except Exception as e:
98 |             print(f'error: {e}')
99 |     total_unique_documents = len(unique_documents)
100 |     total_documents = len(df)
101 |     duplication_ratio = (total_documents - total_unique_documents) / total_documents
102 |     return unique_documents, duplication_ratio
103 | 
104 | 
105 | 
--------------------------------------------------------------------------------
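A minimal usage sketch for the cleaning helpers above, assuming the module is importable as `finnlp.data_engineering.data_cleaning` and that the corpora are pandas DataFrames with a `text` column; the toy data below is purely illustrative, and `toxic_eliminate` is left out because it fetches an external word list at call time.

import pandas as pd
from finnlp.data_engineering.data_cleaning import (
    junk_eliminate, short_eliminate, contamination_eliminate, duplication_eliminate
)

# toy corpora with the 'text' column the helpers expect
train_df = pd.DataFrame({"text": [
    "Apple shares rose three percent after the earnings call on Thursday.",
    "{}&#<<>>&& broken scrape output &&&&{}{}",
    "Markets were mixed on Friday as investors weighed the latest inflation data.",
]})
test_df = pd.DataFrame({"text": [
    "Apple shares rose three percent after the earnings call on Thursday.",
]})

purity_df, impurity_ratio = junk_eliminate(train_df)                  # drop rows dominated by suspicious characters
not_short_df, short_ratio = short_eliminate(purity_df, min_len=5)     # drop very short docs (the default BERT tokenizer is downloaded on first use)
contamination_ratio = contamination_eliminate(not_short_df, test_df)  # share of test docs that near-duplicate a training doc
unique_docs, duplication_ratio = duplication_eliminate(not_short_df)  # near-duplicate share within the training set

print(impurity_ratio, short_ratio, contamination_ratio, duplication_ratio)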
/docs/FinNLP/site/assets/javascripts/lunr/min/lunr.sv.min.js: -------------------------------------------------------------------------------- 1 | /*! 2 | * Lunr languages, `Swedish` language 3 | * https://github.com/MihaiValentin/lunr-languages 4 | * 5 | * Copyright 2014, Mihai Valentin 6 | * http://www.mozilla.org/MPL/ 7 | */ 8 | /*! 9 | * based on 10 | * Snowball JavaScript Library v0.3 11 | * http://code.google.com/p/urim/ 12 | * http://snowball.tartarus.org/ 13 | * 14 | * Copyright 2010, Oleg Mazko 15 | * http://www.mozilla.org/MPL/ 16 | */ 17 | 18 | !function(e,r){"function"==typeof define&&define.amd?define(r):"object"==typeof exports?module.exports=r():r()(e.lunr)}(this,function(){return function(e){if(void 0===e)throw new Error("Lunr is not present. Please include / require Lunr before this script.");if(void 0===e.stemmerSupport)throw new Error("Lunr stemmer support is not present. Please include / require Lunr stemmer support before this script.");e.sv=function(){this.pipeline.reset(),this.pipeline.add(e.sv.trimmer,e.sv.stopWordFilter,e.sv.stemmer),this.searchPipeline&&(this.searchPipeline.reset(),this.searchPipeline.add(e.sv.stemmer))},e.sv.wordCharacters="A-Za-zªºÀ-ÖØ-öø-ʸˠ-ˤᴀ-ᴥᴬ-ᵜᵢ-ᵥᵫ-ᵷᵹ-ᶾḀ-ỿⁱⁿₐ-ₜKÅℲⅎⅠ-ↈⱠ-ⱿꜢ-ꞇꞋ-ꞭꞰ-ꞷꟷ-ꟿꬰ-ꭚꭜ-ꭤff-stA-Za-z",e.sv.trimmer=e.trimmerSupport.generateTrimmer(e.sv.wordCharacters),e.Pipeline.registerFunction(e.sv.trimmer,"trimmer-sv"),e.sv.stemmer=function(){var r=e.stemmerSupport.Among,n=e.stemmerSupport.SnowballProgram,t=new function(){function e(){var e,r=w.cursor+3;if(o=w.limit,0<=r||r<=w.limit){for(a=r;;){if(e=w.cursor,w.in_grouping(l,97,246)){w.cursor=e;break}if(w.cursor=e,w.cursor>=w.limit)return;w.cursor++}for(;!w.out_grouping(l,97,246);){if(w.cursor>=w.limit)return;w.cursor++}o=w.cursor,o=o&&(w.limit_backward=o,w.cursor=w.limit,w.ket=w.cursor,e=w.find_among_b(u,37),w.limit_backward=r,e))switch(w.bra=w.cursor,e){case 1:w.slice_del();break;case 2:w.in_grouping_b(d,98,121)&&w.slice_del()}}function i(){var e=w.limit_backward;w.cursor>=o&&(w.limit_backward=o,w.cursor=w.limit,w.find_among_b(c,7)&&(w.cursor=w.limit,w.ket=w.cursor,w.cursor>w.limit_backward&&(w.bra=--w.cursor,w.slice_del())),w.limit_backward=e)}function s(){var e,r;if(w.cursor>=o){if(r=w.limit_backward,w.limit_backward=o,w.cursor=w.limit,w.ket=w.cursor,e=w.find_among_b(m,5))switch(w.bra=w.cursor,e){case 1:w.slice_del();break;case 2:w.slice_from("lös");break;case 3:w.slice_from("full")}w.limit_backward=r}}var a,o,u=[new r("a",-1,1),new r("arna",0,1),new r("erna",0,1),new r("heterna",2,1),new r("orna",0,1),new r("ad",-1,1),new r("e",-1,1),new r("ade",6,1),new r("ande",6,1),new r("arne",6,1),new r("are",6,1),new r("aste",6,1),new r("en",-1,1),new r("anden",12,1),new r("aren",12,1),new r("heten",12,1),new r("ern",-1,1),new r("ar",-1,1),new r("er",-1,1),new r("heter",18,1),new r("or",-1,1),new r("s",-1,2),new r("as",21,1),new r("arnas",22,1),new r("ernas",22,1),new r("ornas",22,1),new r("es",21,1),new r("ades",26,1),new r("andes",26,1),new r("ens",21,1),new r("arens",29,1),new r("hetens",29,1),new r("erns",21,1),new r("at",-1,1),new r("andet",-1,1),new r("het",-1,1),new r("ast",-1,1)],c=[new r("dd",-1,-1),new r("gd",-1,-1),new r("nn",-1,-1),new r("dt",-1,-1),new r("gt",-1,-1),new r("kt",-1,-1),new r("tt",-1,-1)],m=[new r("ig",-1,1),new r("lig",0,1),new r("els",-1,1),new r("fullt",-1,3),new r("löst",-1,2)],l=[17,65,16,1,0,0,0,0,0,0,0,0,0,0,0,0,24,0,32],d=[119,127,149],w=new n;this.setCurrent=function(e){w.setCurrent(e)},this.getCurrent=function(){return w.getCurrent()},this.stem=function(){var 
r=w.cursor;return e(),w.limit_backward=r,w.cursor=w.limit,t(),w.cursor=w.limit,i(),w.cursor=w.limit,s(),!0}};return function(e){return"function"==typeof e.update?e.update(function(e){return t.setCurrent(e),t.stem(),t.getCurrent()}):(t.setCurrent(e),t.stem(),t.getCurrent())}}(),e.Pipeline.registerFunction(e.sv.stemmer,"stemmer-sv"),e.sv.stopWordFilter=e.generateStopWordFilter("alla allt att av blev bli blir blivit de dem den denna deras dess dessa det detta dig din dina ditt du där då efter ej eller en er era ert ett från för ha hade han hans har henne hennes hon honom hur här i icke ingen inom inte jag ju kan kunde man med mellan men mig min mina mitt mot mycket ni nu när någon något några och om oss på samma sedan sig sin sina sitta själv skulle som så sådan sådana sådant till under upp ut utan vad var vara varför varit varje vars vart vem vi vid vilka vilkas vilken vilket vår våra vårt än är åt över".split(" ")),e.Pipeline.registerFunction(e.sv.stopWordFilter,"stopWordFilter-sv")}}); -------------------------------------------------------------------------------- /docs/FinNLP/site/assets/javascripts/lunr/min/lunr.da.min.js: -------------------------------------------------------------------------------- 1 | /*! 2 | * Lunr languages, `Danish` language 3 | * https://github.com/MihaiValentin/lunr-languages 4 | * 5 | * Copyright 2014, Mihai Valentin 6 | * http://www.mozilla.org/MPL/ 7 | */ 8 | /*! 9 | * based on 10 | * Snowball JavaScript Library v0.3 11 | * http://code.google.com/p/urim/ 12 | * http://snowball.tartarus.org/ 13 | * 14 | * Copyright 2010, Oleg Mazko 15 | * http://www.mozilla.org/MPL/ 16 | */ 17 | 18 | !function(e,r){"function"==typeof define&&define.amd?define(r):"object"==typeof exports?module.exports=r():r()(e.lunr)}(this,function(){return function(e){if(void 0===e)throw new Error("Lunr is not present. Please include / require Lunr before this script.");if(void 0===e.stemmerSupport)throw new Error("Lunr stemmer support is not present. 
Please include / require Lunr stemmer support before this script.");e.da=function(){this.pipeline.reset(),this.pipeline.add(e.da.trimmer,e.da.stopWordFilter,e.da.stemmer),this.searchPipeline&&(this.searchPipeline.reset(),this.searchPipeline.add(e.da.stemmer))},e.da.wordCharacters="A-Za-zªºÀ-ÖØ-öø-ʸˠ-ˤᴀ-ᴥᴬ-ᵜᵢ-ᵥᵫ-ᵷᵹ-ᶾḀ-ỿⁱⁿₐ-ₜKÅℲⅎⅠ-ↈⱠ-ⱿꜢ-ꞇꞋ-ꞭꞰ-ꞷꟷ-ꟿꬰ-ꭚꭜ-ꭤff-stA-Za-z",e.da.trimmer=e.trimmerSupport.generateTrimmer(e.da.wordCharacters),e.Pipeline.registerFunction(e.da.trimmer,"trimmer-da"),e.da.stemmer=function(){var r=e.stemmerSupport.Among,i=e.stemmerSupport.SnowballProgram,n=new function(){function e(){var e,r=f.cursor+3;if(d=f.limit,0<=r&&r<=f.limit){for(a=r;;){if(e=f.cursor,f.in_grouping(w,97,248)){f.cursor=e;break}if(f.cursor=e,e>=f.limit)return;f.cursor++}for(;!f.out_grouping(w,97,248);){if(f.cursor>=f.limit)return;f.cursor++}d=f.cursor,d=d&&(r=f.limit_backward,f.limit_backward=d,f.ket=f.cursor,e=f.find_among_b(c,32),f.limit_backward=r,e))switch(f.bra=f.cursor,e){case 1:f.slice_del();break;case 2:f.in_grouping_b(p,97,229)&&f.slice_del()}}function t(){var e,r=f.limit-f.cursor;f.cursor>=d&&(e=f.limit_backward,f.limit_backward=d,f.ket=f.cursor,f.find_among_b(l,4)?(f.bra=f.cursor,f.limit_backward=e,f.cursor=f.limit-r,f.cursor>f.limit_backward&&(f.cursor--,f.bra=f.cursor,f.slice_del())):f.limit_backward=e)}function s(){var e,r,i,n=f.limit-f.cursor;if(f.ket=f.cursor,f.eq_s_b(2,"st")&&(f.bra=f.cursor,f.eq_s_b(2,"ig")&&f.slice_del()),f.cursor=f.limit-n,f.cursor>=d&&(r=f.limit_backward,f.limit_backward=d,f.ket=f.cursor,e=f.find_among_b(m,5),f.limit_backward=r,e))switch(f.bra=f.cursor,e){case 1:f.slice_del(),i=f.limit-f.cursor,t(),f.cursor=f.limit-i;break;case 2:f.slice_from("løs")}}function o(){var e;f.cursor>=d&&(e=f.limit_backward,f.limit_backward=d,f.ket=f.cursor,f.out_grouping_b(w,97,248)?(f.bra=f.cursor,u=f.slice_to(u),f.limit_backward=e,f.eq_v_b(u)&&f.slice_del()):f.limit_backward=e)}var a,d,u,c=[new r("hed",-1,1),new r("ethed",0,1),new r("ered",-1,1),new r("e",-1,1),new r("erede",3,1),new r("ende",3,1),new r("erende",5,1),new r("ene",3,1),new r("erne",3,1),new r("ere",3,1),new r("en",-1,1),new r("heden",10,1),new r("eren",10,1),new r("er",-1,1),new r("heder",13,1),new r("erer",13,1),new r("s",-1,2),new r("heds",16,1),new r("es",16,1),new r("endes",18,1),new r("erendes",19,1),new r("enes",18,1),new r("ernes",18,1),new r("eres",18,1),new r("ens",16,1),new r("hedens",24,1),new r("erens",24,1),new r("ers",16,1),new r("ets",16,1),new r("erets",28,1),new r("et",-1,1),new r("eret",30,1)],l=[new r("gd",-1,-1),new r("dt",-1,-1),new r("gt",-1,-1),new r("kt",-1,-1)],m=[new r("ig",-1,1),new r("lig",0,1),new r("elig",1,1),new r("els",-1,1),new r("løst",-1,2)],w=[17,65,16,1,0,0,0,0,0,0,0,0,0,0,0,0,48,0,128],p=[239,254,42,3,0,0,0,0,0,0,0,0,0,0,0,0,16],f=new i;this.setCurrent=function(e){f.setCurrent(e)},this.getCurrent=function(){return f.getCurrent()},this.stem=function(){var r=f.cursor;return e(),f.limit_backward=r,f.cursor=f.limit,n(),f.cursor=f.limit,t(),f.cursor=f.limit,s(),f.cursor=f.limit,o(),!0}};return function(e){return"function"==typeof e.update?e.update(function(e){return n.setCurrent(e),n.stem(),n.getCurrent()}):(n.setCurrent(e),n.stem(),n.getCurrent())}}(),e.Pipeline.registerFunction(e.da.stemmer,"stemmer-da"),e.da.stopWordFilter=e.generateStopWordFilter("ad af alle alt anden at blev blive bliver da de dem den denne der deres det dette dig din disse dog du efter eller en end er et for fra ham han hans har havde have hende hendes her hos hun hvad hvis hvor i ikke ind jeg jer jo kunne man mange 
med meget men mig min mine mit mod ned noget nogle nu når og også om op os over på selv sig sin sine sit skal skulle som sådan thi til ud under var vi vil ville vor være været".split(" ")),e.Pipeline.registerFunction(e.da.stopWordFilter,"stopWordFilter-da")}}); -------------------------------------------------------------------------------- /docs/FinNLP/site/assets/javascripts/lunr/min/lunr.no.min.js: -------------------------------------------------------------------------------- 1 | /*! 2 | * Lunr languages, `Norwegian` language 3 | * https://github.com/MihaiValentin/lunr-languages 4 | * 5 | * Copyright 2014, Mihai Valentin 6 | * http://www.mozilla.org/MPL/ 7 | */ 8 | /*! 9 | * based on 10 | * Snowball JavaScript Library v0.3 11 | * http://code.google.com/p/urim/ 12 | * http://snowball.tartarus.org/ 13 | * 14 | * Copyright 2010, Oleg Mazko 15 | * http://www.mozilla.org/MPL/ 16 | */ 17 | 18 | !function(e,r){"function"==typeof define&&define.amd?define(r):"object"==typeof exports?module.exports=r():r()(e.lunr)}(this,function(){return function(e){if(void 0===e)throw new Error("Lunr is not present. Please include / require Lunr before this script.");if(void 0===e.stemmerSupport)throw new Error("Lunr stemmer support is not present. Please include / require Lunr stemmer support before this script.");e.no=function(){this.pipeline.reset(),this.pipeline.add(e.no.trimmer,e.no.stopWordFilter,e.no.stemmer),this.searchPipeline&&(this.searchPipeline.reset(),this.searchPipeline.add(e.no.stemmer))},e.no.wordCharacters="A-Za-zªºÀ-ÖØ-öø-ʸˠ-ˤᴀ-ᴥᴬ-ᵜᵢ-ᵥᵫ-ᵷᵹ-ᶾḀ-ỿⁱⁿₐ-ₜKÅℲⅎⅠ-ↈⱠ-ⱿꜢ-ꞇꞋ-ꞭꞰ-ꞷꟷ-ꟿꬰ-ꭚꭜ-ꭤff-stA-Za-z",e.no.trimmer=e.trimmerSupport.generateTrimmer(e.no.wordCharacters),e.Pipeline.registerFunction(e.no.trimmer,"trimmer-no"),e.no.stemmer=function(){var r=e.stemmerSupport.Among,n=e.stemmerSupport.SnowballProgram,i=new function(){function e(){var e,r=w.cursor+3;if(a=w.limit,0<=r||r<=w.limit){for(s=r;;){if(e=w.cursor,w.in_grouping(d,97,248)){w.cursor=e;break}if(e>=w.limit)return;w.cursor=e+1}for(;!w.out_grouping(d,97,248);){if(w.cursor>=w.limit)return;w.cursor++}a=w.cursor,a=a&&(r=w.limit_backward,w.limit_backward=a,w.ket=w.cursor,e=w.find_among_b(m,29),w.limit_backward=r,e))switch(w.bra=w.cursor,e){case 1:w.slice_del();break;case 2:n=w.limit-w.cursor,w.in_grouping_b(c,98,122)?w.slice_del():(w.cursor=w.limit-n,w.eq_s_b(1,"k")&&w.out_grouping_b(d,97,248)&&w.slice_del());break;case 3:w.slice_from("er")}}function t(){var e,r=w.limit-w.cursor;w.cursor>=a&&(e=w.limit_backward,w.limit_backward=a,w.ket=w.cursor,w.find_among_b(u,2)?(w.bra=w.cursor,w.limit_backward=e,w.cursor=w.limit-r,w.cursor>w.limit_backward&&(w.cursor--,w.bra=w.cursor,w.slice_del())):w.limit_backward=e)}function o(){var e,r;w.cursor>=a&&(r=w.limit_backward,w.limit_backward=a,w.ket=w.cursor,e=w.find_among_b(l,11),e?(w.bra=w.cursor,w.limit_backward=r,1==e&&w.slice_del()):w.limit_backward=r)}var s,a,m=[new r("a",-1,1),new r("e",-1,1),new r("ede",1,1),new r("ande",1,1),new r("ende",1,1),new r("ane",1,1),new r("ene",1,1),new r("hetene",6,1),new r("erte",1,3),new r("en",-1,1),new r("heten",9,1),new r("ar",-1,1),new r("er",-1,1),new r("heter",12,1),new r("s",-1,2),new r("as",14,1),new r("es",14,1),new r("edes",16,1),new r("endes",16,1),new r("enes",16,1),new r("hetenes",19,1),new r("ens",14,1),new r("hetens",21,1),new r("ers",14,1),new r("ets",14,1),new r("et",-1,1),new r("het",25,1),new r("ert",-1,3),new r("ast",-1,1)],u=[new r("dt",-1,-1),new r("vt",-1,-1)],l=[new r("leg",-1,1),new r("eleg",0,1),new r("ig",-1,1),new r("eig",2,1),new r("lig",2,1),new 
r("elig",4,1),new r("els",-1,1),new r("lov",-1,1),new r("elov",7,1),new r("slov",7,1),new r("hetslov",9,1)],d=[17,65,16,1,0,0,0,0,0,0,0,0,0,0,0,0,48,0,128],c=[119,125,149,1],w=new n;this.setCurrent=function(e){w.setCurrent(e)},this.getCurrent=function(){return w.getCurrent()},this.stem=function(){var r=w.cursor;return e(),w.limit_backward=r,w.cursor=w.limit,i(),w.cursor=w.limit,t(),w.cursor=w.limit,o(),!0}};return function(e){return"function"==typeof e.update?e.update(function(e){return i.setCurrent(e),i.stem(),i.getCurrent()}):(i.setCurrent(e),i.stem(),i.getCurrent())}}(),e.Pipeline.registerFunction(e.no.stemmer,"stemmer-no"),e.no.stopWordFilter=e.generateStopWordFilter("alle at av bare begge ble blei bli blir blitt både båe da de deg dei deim deira deires dem den denne der dere deres det dette di din disse ditt du dykk dykkar då eg ein eit eitt eller elles en enn er et ett etter for fordi fra før ha hadde han hans har hennar henne hennes her hjå ho hoe honom hoss hossen hun hva hvem hver hvilke hvilken hvis hvor hvordan hvorfor i ikke ikkje ikkje ingen ingi inkje inn inni ja jeg kan kom korleis korso kun kunne kva kvar kvarhelst kven kvi kvifor man mange me med medan meg meget mellom men mi min mine mitt mot mykje ned no noe noen noka noko nokon nokor nokre nå når og også om opp oss over på samme seg selv si si sia sidan siden sin sine sitt sjøl skal skulle slik so som som somme somt så sånn til um upp ut uten var vart varte ved vere verte vi vil ville vore vors vort vår være være vært å".split(" ")),e.Pipeline.registerFunction(e.no.stopWordFilter,"stopWordFilter-no")}}); -------------------------------------------------------------------------------- /finnlp/data_sources/news/seekingalpha_date_range.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | warnings.filterwarnings("ignore") 3 | 4 | import json 5 | import requests 6 | import pandas as pd 7 | from lxml import etree 8 | from tqdm import tqdm 9 | from datetime import datetime 10 | 11 | from finnlp.data_sources.news._base import News_Downloader 12 | 13 | class SeekingAlpha_Date_Range(News_Downloader): 14 | def __init__(self, args = {}): 15 | super().__init__(args) 16 | 17 | def download_date_range_stock(self, start_date, end_date, stock = "AAPL", proxies = None): 18 | self.dataframe = pd.DataFrame() 19 | start_timestamp = int(datetime.strptime(start_date+'-13', "%Y-%m-%d-%H").timestamp()) 20 | end_timestamp = int(datetime.strptime(end_date+'-13', "%Y-%m-%d-%H").timestamp()) 21 | # Downloading First Page 22 | data, totalpages = self._gather_by_page(start_timestamp, end_timestamp, stock, 1, proxies) 23 | self.dataframe = pd.concat([self.dataframe, data]) 24 | 25 | # Downloading Other Pages 26 | with tqdm(total=totalpages, desc= "Downloading Titles") as bar: 27 | bar.update(1) 28 | for page in range(2, totalpages+1): 29 | data,_ = self._gather_by_page(start_timestamp, end_timestamp, stock, page, proxies) 30 | self.dataframe = pd.concat([self.dataframe, data]) 31 | bar.update(1) 32 | self.dataframe = self.dataframe.reset_index(drop = True) 33 | 34 | def _gather_by_page(self, start_timestamp, end_timestamp, stock, page = 1, proxies = None): 35 | url = f"https://seekingalpha.com/api/v3/symbols/{stock}/news?filter[since]={start_timestamp}&filter[until]={end_timestamp}&id={stock}&include=author%2CprimaryTickers%2CsecondaryTickers%2Csentiments&isMounting=true&page[size]=40&page[number]={page}" 36 | headers = { 37 | 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) 
Gecko/20100101 Firefox/113.0', 38 | 'Referer':f'https://seekingalpha.com/symbol/aapl/news?from=2009-12-31T16%3A00%3A00.000Z&to=2022-01-01T15%3A59%3A59.999Z' 39 | } 40 | response = requests.get(url, headers=headers, proxies=proxies) 41 | if response.status_code != 200: 42 | print(f"stock: {stock}, page: {page} request failed!") 43 | return pd.DataFrame(), 1 44 | else: 45 | res = json.loads(response.text) 46 | data = pd.DataFrame(res["data"]) 47 | # make new features 48 | new_columns = ["publishOn", "isLockedPro", "commentCount", "gettyImageUrl", "videoPreviewUrl", "themes", "title", "isPaywalled"] 49 | data[new_columns] = data.apply(lambda x:list(x.attributes.values()), axis = 1,result_type ="expand" ) 50 | new_columns = ["author", "sentiments", "primaryTickers", "secondaryTickers", "otherTags"] 51 | data[new_columns] = data.apply(lambda x:list(x.relationships.values()), axis = 1,result_type ="expand" ) 52 | 53 | # total pages 54 | totalpages = res["meta"]["page"]["totalPages"] 55 | return data, totalpages 56 | 57 | 58 | def obtain_content(self, parallel = False, proxies = None): 59 | if parallel: 60 | import os 61 | from pandarallel import pandarallel 62 | pandarallel.initialize(nb_workers=os.cpu_count()) 63 | self.dataframe['content'] = self.dataframe.parallel_apply(lambda x: self._obtain_content(x, proxies = proxies), axis = 1) 64 | else: 65 | self.dataframe['content'] = self.dataframe.apply(lambda x: self._obtain_content(x, proxies = proxies), axis = 1) 66 | 67 | 68 | def _obtain_content(self, x, proxies = None): 69 | url = x['links']['self'] 70 | url = f"https://seekingalpha.com{url}" 71 | headers = { 72 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/118.0' 73 | } 74 | res = requests.get(url, headers=headers, proxies=proxies) 75 | if res.status_code != 200: 76 | return '' 77 | else: 78 | resp = etree.HTML(res.text) 79 | resp = resp.xpath('//script[5]//text()') 80 | resp = resp[0].split('window.SSR_DATA = ')[1] 81 | resp = resp[:-1] 82 | resp = json.loads(resp) 83 | content = resp['article']['response']['data']['attributes']['content'] 84 | content = etree.HTML(content) 85 | content = content.xpath('//text()') 86 | content = [c if c!= ' ' else '\n' for c in content] 87 | content = ''.join(content) 88 | content = content.strip() 89 | return content 90 | 91 | 92 | -------------------------------------------------------------------------------- /finnlp/utils/get_proxy.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import parsel 3 | from lxml import etree 4 | from tqdm import tqdm 5 | import time 6 | import re 7 | 8 | def check_china_ips(proxies_list): 9 | """Check which of the given proxy IPs are usable (China)""" 10 | headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36'} 11 | 12 | can_use = [] 13 | for proxy in tqdm(proxies_list, desc = "Checking ips"): 14 | try: 15 | response = requests.get('http://www.baidu.com', headers=headers, proxies=proxy, timeout=1) # a timeout raises an exception 16 | if response.status_code == 200: 17 | can_use.append(proxy) 18 | except Exception as error: 19 | # print(error) 20 | pass 21 | return can_use 22 | 23 | def check_us_ips(proxies_list): 24 | """Check which of the given proxy IPs are usable (US)""" 25 | headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36'} 26 | 27 | can_use = [] 28 | for proxy in tqdm(proxies_list, desc = "Checking ips"): 29 | try: 30 | response = 
requests.get('http://www.google.com', headers=headers, proxies=proxy, timeout=1) # a timeout raises an exception 31 | if response.status_code == 200: 32 | can_use.append(proxy) 33 | except Exception as error: 34 | # print(error) 35 | pass 36 | return can_use 37 | 38 | def get_china_free_proxy(pages = 10): 39 | proxies_list = [] 40 | for page in tqdm(range(1, pages+1), desc = "Gathering free ips by pages..."): 41 | 42 | base_url = f'https://www.kuaidaili.com/free/inha/{page}' 43 | headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36'} 44 | success = False 45 | while not success: 46 | try: 47 | response = requests.get(base_url, headers=headers) 48 | data = response.text 49 | res = etree.HTML(data) 50 | trs = res.xpath('//table/tbody/tr') 51 | if len(trs)!=0: 52 | success = True 53 | for tr in trs: 54 | proxies_dict = {} 55 | http_type = tr.xpath('./td[4]/text()')[0] 56 | ip_num = tr.xpath('./td[1]/text()')[0] 57 | port_num = tr.xpath('./td[2]/text()')[0] 58 | proxies_dict[http_type] = ip_num + ':' + port_num 59 | proxies_list.append(proxies_dict) 60 | else: 61 | time.sleep(0.01) 62 | 63 | except: 64 | pass 65 | 66 | can_use = check_china_ips(proxies_list) 67 | 68 | print(f'Get proxy ips: {len(proxies_list)}.') 69 | print(f'Usable proxy ips: {len(can_use)}.') 70 | 71 | return can_use 72 | 73 | def get_us_free_proxy(pages = 10): 74 | url = "https://openproxy.space/list/http" 75 | headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36'} 76 | response = requests.get(url, headers=headers) 77 | if response.status_code != 200: 78 | print("Connection Error. Please make sure that your computer currently has access to Google.com") 79 | res = etree.HTML(response.text) 80 | http_type = "HTTP" 81 | proxies_list = [] 82 | 83 | scripts = res.xpath("//script") 84 | content = scripts[3].xpath(".//text()") 85 | pattern = re.compile('LIST",data:(.+),added:') 86 | result_list = pattern.findall(content[0]) 87 | result_list = result_list[0].strip("[{").strip("}]").split("},{") 88 | 89 | for result in result_list: 90 | pattern = re.compile(r'\[(.+)\]') 91 | result = pattern.findall(result) 92 | result = result[0].split(",") 93 | result = [r.strip("\"") for r in result] 94 | for ip in result: 95 | proxies_list.append( 96 | {http_type: ip} 97 | ) 98 | total = pages* 15 99 | proxies_list = proxies_list[:total] 100 | can_use = check_us_ips(proxies_list) 101 | print(f'Get proxy ips: {len(proxies_list)}.') 102 | print(f'Usable proxy ips: {len(can_use)}.' 
) 103 | 104 | return can_use 105 | 106 | class Kuaidaili: 107 | def __init__(self, tunnel, username, password): 108 | self.tunnel = tunnel 109 | self.username = username 110 | self.password = password 111 | 112 | def get_kuaidaili_tunnel_proxy(self): 113 | proxies = { 114 | "http": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": self.username, "pwd": self.password, "proxy": self.tunnel}, 115 | "https": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": self.username, "pwd": self.password, "proxy": self.tunnel} 116 | } 117 | return proxies -------------------------------------------------------------------------------- /finnlp/data_sources/news/marketwatch_streaming.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from lxml import etree 3 | from tqdm import tqdm 4 | import pandas as pd 5 | import json 6 | import time 7 | from finnlp.data_sources.news._base import News_Downloader 8 | 9 | # TODO: 10 | # 1. More pages 11 | # 2. Contents 12 | 13 | 14 | class MarketWatch_Streaming(News_Downloader): 15 | 16 | def __init__(self, args={}): 17 | super().__init__(args) 18 | self.dataframe = pd.DataFrame() 19 | 20 | def download_streaming_search(self, keyword = "apple", rounds = 3, delay = 0.5): 21 | # download first page 22 | self._download_first_page(keyword, delay = delay) 23 | 24 | # download the following pages 25 | # self._download_other_pages(keyword) 26 | print("Only support the first page now!") 27 | 28 | def download_date_range_search(self, start_date , end_date, keyword = "apple", rounds = 1000, delay = 0.5): 29 | # download first page 30 | self._download_first_page(keyword, delay = delay, start_date = start_date, end_date = end_date) 31 | 32 | # download the following pages 33 | # self._download_other_pages(keyword) 34 | print("Only support the first page now!") 35 | 36 | def _download_first_page(self, keyword = "apple", delay = 0.5, start_date = None, end_date = None): 37 | url = "https://www.marketwatch.com/search" 38 | params = { 39 | 'q': keyword, 40 | 'ts': '0', 41 | 'tab': 'All News', 42 | 'sd': start_date, 43 | 'ed': end_date, 44 | } 45 | headers = { 46 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36", 47 | } 48 | 49 | res = requests.get(url = url, headers= headers, params=params) 50 | if res.status_code != 200: 51 | print(f'Connection Error: {res.status_code}') 52 | return f'Connection Error: {res.status_code}' 53 | 54 | res = etree.HTML(res.text) 55 | divs = res.xpath("body/main/div/div[2]/div[2]/div[2]/div[2]/mw-tabs/div[2]/div[1]/div/div[1]/div") 56 | titles = [] 57 | times = [] 58 | authors = [] 59 | for div in divs: 60 | # title 61 | title = div.xpath("./div/h3/a/text()") 62 | # time 63 | time_ = div.xpath("./div/div/span[1]/text()") 64 | # author 65 | author = div.xpath("./div/div/span[2]/text()") 66 | 67 | if len(title)>0: 68 | titles.append(' '.join(title).replace("\n","").strip(" ")) 69 | times.append(' '.join(time_)) 70 | authors.append(' '.join(author)) 71 | 72 | # concat results 73 | tmp = pd.DataFrame([titles, times, authors]).T 74 | tmp.columns = ["title", "time", "author"] 75 | self.dataframe = pd.concat([self.dataframe, tmp]) 76 | 77 | # sleep 78 | time.sleep(delay) 79 | 80 | 81 | 82 | 83 | class MarketWatch_Date_Range(News_Downloader): 84 | 85 | def __init__(self, args={}): 86 | super().__init__(args) 87 | self.dataframe = pd.DataFrame() 88 | 89 | def download_date_range_search(self, start_date , end_date, keyword = "apple", 
delay = 0.5): 90 | # download first page 91 | self._download_first_page(keyword, delay = delay, start_date = start_date, end_date = end_date) 92 | 93 | # download the following pages 94 | # self._download_other_pages(keyword) 95 | print("Only support the first page now!") 96 | 97 | def _download_first_page(self, keyword = "apple", delay = 0.5, start_date = None, end_date = None): 98 | url = "https://www.marketwatch.com/search" 99 | params = { 100 | 'q': keyword, 101 | 'ts': '0', 102 | 'tab': 'All News', 103 | 'sd': start_date, 104 | 'ed': end_date, 105 | } 106 | headers = { 107 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36", 108 | } 109 | 110 | res = requests.get(url = url, headers= headers, params=params) 111 | if res.status_code != 200: 112 | print(f'Connection Error: {res.status_code}') 113 | return f'Connection Error: {res.status_code}' 114 | 115 | res = etree.HTML(res.text) 116 | divs = res.xpath("body/main/div/div[2]/div[2]/div[2]/div[2]/mw-tabs/div[2]/div[1]/div/div[1]/div") 117 | titles = [] 118 | times = [] 119 | authors = [] 120 | for div in divs: 121 | # title 122 | title = div.xpath("./div/h3/a/text()") 123 | # time 124 | time_ = div.xpath("./div/div/span[1]/text()") 125 | # author 126 | author = div.xpath("./div/div/span[2]/text()") 127 | 128 | if len(title)>0: 129 | titles.append(' '.join(title).replace("\n","").strip(" ")) 130 | times.append(' '.join(time_)) 131 | authors.append(' '.join(author)) 132 | 133 | # concat results 134 | tmp = pd.DataFrame([titles, times, authors]).T 135 | tmp.columns = ["title", "time", "author"] 136 | self.dataframe = pd.concat([self.dataframe, tmp]) 137 | 138 | # sleep 139 | time.sleep(delay) 140 | -------------------------------------------------------------------------------- /docs/FinNLP/site/assets/javascripts/lunr/min/lunr.nl.min.js: -------------------------------------------------------------------------------- 1 | /*! 2 | * Lunr languages, `Dutch` language 3 | * https://github.com/MihaiValentin/lunr-languages 4 | * 5 | * Copyright 2014, Mihai Valentin 6 | * http://www.mozilla.org/MPL/ 7 | */ 8 | /*! 9 | * based on 10 | * Snowball JavaScript Library v0.3 11 | * http://code.google.com/p/urim/ 12 | * http://snowball.tartarus.org/ 13 | * 14 | * Copyright 2010, Oleg Mazko 15 | * http://www.mozilla.org/MPL/ 16 | */ 17 | 18 | !function(r,e){"function"==typeof define&&define.amd?define(e):"object"==typeof exports?module.exports=e():e()(r.lunr)}(this,function(){return function(r){if(void 0===r)throw new Error("Lunr is not present. Please include / require Lunr before this script.");if(void 0===r.stemmerSupport)throw new Error("Lunr stemmer support is not present. 
Please include / require Lunr stemmer support before this script.");r.nl=function(){this.pipeline.reset(),this.pipeline.add(r.nl.trimmer,r.nl.stopWordFilter,r.nl.stemmer),this.searchPipeline&&(this.searchPipeline.reset(),this.searchPipeline.add(r.nl.stemmer))},r.nl.wordCharacters="A-Za-zªºÀ-ÖØ-öø-ʸˠ-ˤᴀ-ᴥᴬ-ᵜᵢ-ᵥᵫ-ᵷᵹ-ᶾḀ-ỿⁱⁿₐ-ₜKÅℲⅎⅠ-ↈⱠ-ⱿꜢ-ꞇꞋ-ꞭꞰ-ꞷꟷ-ꟿꬰ-ꭚꭜ-ꭤff-stA-Za-z",r.nl.trimmer=r.trimmerSupport.generateTrimmer(r.nl.wordCharacters),r.Pipeline.registerFunction(r.nl.trimmer,"trimmer-nl"),r.nl.stemmer=function(){var e=r.stemmerSupport.Among,i=r.stemmerSupport.SnowballProgram,n=new function(){function r(){for(var r,e,i,o=C.cursor;;){if(C.bra=C.cursor,r=C.find_among(b,11))switch(C.ket=C.cursor,r){case 1:C.slice_from("a");continue;case 2:C.slice_from("e");continue;case 3:C.slice_from("i");continue;case 4:C.slice_from("o");continue;case 5:C.slice_from("u");continue;case 6:if(C.cursor>=C.limit)break;C.cursor++;continue}break}for(C.cursor=o,C.bra=o,C.eq_s(1,"y")?(C.ket=C.cursor,C.slice_from("Y")):C.cursor=o;;)if(e=C.cursor,C.in_grouping(q,97,232)){if(i=C.cursor,C.bra=i,C.eq_s(1,"i"))C.ket=C.cursor,C.in_grouping(q,97,232)&&(C.slice_from("I"),C.cursor=e);else if(C.cursor=i,C.eq_s(1,"y"))C.ket=C.cursor,C.slice_from("Y"),C.cursor=e;else if(n(e))break}else if(n(e))break}function n(r){return C.cursor=r,r>=C.limit||(C.cursor++,!1)}function o(){_=C.limit,d=_,t()||(_=C.cursor,_<3&&(_=3),t()||(d=C.cursor))}function t(){for(;!C.in_grouping(q,97,232);){if(C.cursor>=C.limit)return!0;C.cursor++}for(;!C.out_grouping(q,97,232);){if(C.cursor>=C.limit)return!0;C.cursor++}return!1}function s(){for(var r;;)if(C.bra=C.cursor,r=C.find_among(p,3))switch(C.ket=C.cursor,r){case 1:C.slice_from("y");break;case 2:C.slice_from("i");break;case 3:if(C.cursor>=C.limit)return;C.cursor++}}function u(){return _<=C.cursor}function c(){return d<=C.cursor}function a(){var r=C.limit-C.cursor;C.find_among_b(g,3)&&(C.cursor=C.limit-r,C.ket=C.cursor,C.cursor>C.limit_backward&&(C.cursor--,C.bra=C.cursor,C.slice_del()))}function l(){var r;w=!1,C.ket=C.cursor,C.eq_s_b(1,"e")&&(C.bra=C.cursor,u()&&(r=C.limit-C.cursor,C.out_grouping_b(q,97,232)&&(C.cursor=C.limit-r,C.slice_del(),w=!0,a())))}function m(){var r;u()&&(r=C.limit-C.cursor,C.out_grouping_b(q,97,232)&&(C.cursor=C.limit-r,C.eq_s_b(3,"gem")||(C.cursor=C.limit-r,C.slice_del(),a())))}function f(){var r,e,i,n,o,t,s=C.limit-C.cursor;if(C.ket=C.cursor,r=C.find_among_b(h,5))switch(C.bra=C.cursor,r){case 1:u()&&C.slice_from("heid");break;case 2:m();break;case 3:u()&&C.out_grouping_b(j,97,232)&&C.slice_del()}if(C.cursor=C.limit-s,l(),C.cursor=C.limit-s,C.ket=C.cursor,C.eq_s_b(4,"heid")&&(C.bra=C.cursor,c()&&(e=C.limit-C.cursor,C.eq_s_b(1,"c")||(C.cursor=C.limit-e,C.slice_del(),C.ket=C.cursor,C.eq_s_b(2,"en")&&(C.bra=C.cursor,m())))),C.cursor=C.limit-s,C.ket=C.cursor,r=C.find_among_b(k,6))switch(C.bra=C.cursor,r){case 1:if(c()){if(C.slice_del(),i=C.limit-C.cursor,C.ket=C.cursor,C.eq_s_b(2,"ig")&&(C.bra=C.cursor,c()&&(n=C.limit-C.cursor,!C.eq_s_b(1,"e")))){C.cursor=C.limit-n,C.slice_del();break}C.cursor=C.limit-i,a()}break;case 2:c()&&(o=C.limit-C.cursor,C.eq_s_b(1,"e")||(C.cursor=C.limit-o,C.slice_del()));break;case 3:c()&&(C.slice_del(),l());break;case 4:c()&&C.slice_del();break;case 5:c()&&w&&C.slice_del()}C.cursor=C.limit-s,C.out_grouping_b(z,73,232)&&(t=C.limit-C.cursor,C.find_among_b(v,4)&&C.out_grouping_b(q,97,232)&&(C.cursor=C.limit-t,C.ket=C.cursor,C.cursor>C.limit_backward&&(C.cursor--,C.bra=C.cursor,C.slice_del())))}var d,_,w,b=[new e("",-1,6),new e("á",0,1),new e("ä",0,1),new 
e("é",0,2),new e("ë",0,2),new e("í",0,3),new e("ï",0,3),new e("ó",0,4),new e("ö",0,4),new e("ú",0,5),new e("ü",0,5)],p=[new e("",-1,3),new e("I",0,2),new e("Y",0,1)],g=[new e("dd",-1,-1),new e("kk",-1,-1),new e("tt",-1,-1)],h=[new e("ene",-1,2),new e("se",-1,3),new e("en",-1,2),new e("heden",2,1),new e("s",-1,3)],k=[new e("end",-1,1),new e("ig",-1,2),new e("ing",-1,1),new e("lijk",-1,3),new e("baar",-1,4),new e("bar",-1,5)],v=[new e("aa",-1,-1),new e("ee",-1,-1),new e("oo",-1,-1),new e("uu",-1,-1)],q=[17,65,16,1,0,0,0,0,0,0,0,0,0,0,0,0,128],z=[1,0,0,17,65,16,1,0,0,0,0,0,0,0,0,0,0,0,0,128],j=[17,67,16,1,0,0,0,0,0,0,0,0,0,0,0,0,128],C=new i;this.setCurrent=function(r){C.setCurrent(r)},this.getCurrent=function(){return C.getCurrent()},this.stem=function(){var e=C.cursor;return r(),C.cursor=e,o(),C.limit_backward=e,C.cursor=C.limit,f(),C.cursor=C.limit_backward,s(),!0}};return function(r){return"function"==typeof r.update?r.update(function(r){return n.setCurrent(r),n.stem(),n.getCurrent()}):(n.setCurrent(r),n.stem(),n.getCurrent())}}(),r.Pipeline.registerFunction(r.nl.stemmer,"stemmer-nl"),r.nl.stopWordFilter=r.generateStopWordFilter(" aan al alles als altijd andere ben bij daar dan dat de der deze die dit doch doen door dus een eens en er ge geen geweest haar had heb hebben heeft hem het hier hij hoe hun iemand iets ik in is ja je kan kon kunnen maar me meer men met mij mijn moet na naar niet niets nog nu of om omdat onder ons ook op over reeds te tegen toch toen tot u uit uw van veel voor want waren was wat werd wezen wie wil worden wordt zal ze zelf zich zij zijn zo zonder zou".split(" ")),r.Pipeline.registerFunction(r.nl.stopWordFilter,"stopWordFilter-nl")}}); -------------------------------------------------------------------------------- /finnlp/data_sources/company_announcement/juchao.py: -------------------------------------------------------------------------------- 1 | from finnlp.data_sources.company_announcement._base import Company_Announcement_Downloader 2 | 3 | import requests 4 | import time 5 | import json 6 | import os 7 | import pandas as pd 8 | from tqdm import tqdm 9 | from PyPDF2 import PdfReader 10 | 11 | class Juchao_Announcement(Company_Announcement_Downloader): 12 | 13 | def __init__(self, args = {}): 14 | super().__init__(args) 15 | self.dataframe = pd.DataFrame() 16 | 17 | def download_date_range_stock(self,start_date, end_date, stock = "000001",max_page = 100, searchkey= "", get_content = False, save_dir = "./tmp/" , delate_pdf = False): 18 | self.org_dict = self._get_orgid() 19 | 20 | # download the first page 21 | res = self._get_open_page(start_date, end_date, stock, 1, searchkey) 22 | total_pages = res["totalpages"]+1 23 | 24 | if res["announcements"] is None: 25 | print(f"Nothing related to your searchkey({searchkey}) is found, you may try another one or just leave it blank") 26 | else: 27 | tmp_df = self._process_data(res) 28 | self.dataframe = pd.concat([self.dataframe, tmp_df]) 29 | 30 | page = 2 31 | # download other page 32 | pbar = tqdm(total=total_pages,desc="Downloading by page...") 33 | 34 | for _ in range(max_page): 35 | res = self._get_open_page(start_date, end_date, stock, page, searchkey) 36 | if res["announcements"] is None: 37 | break 38 | tmp_df = self._process_data(res) 39 | self.dataframe = pd.concat([self.dataframe, tmp_df]) 40 | pbar.update(1) 41 | page += 1 42 | pbar.update(1) 43 | # Convert Time 44 | self.dataframe.announcementTime = self.dataframe.announcementTime.apply(lambda x:time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(x/1000))) 
45 | self.dataframe.announcementTime = pd.to_datetime(self.dataframe.announcementTime) 46 | 47 | if get_content: 48 | pbar = tqdm(total=self.dataframe.shape[0], desc="Getting the text data...") 49 | self.dataframe[["PDF_path","Content"]] = self.dataframe.apply(lambda x: self._get_pdfs(x,save_dir, delate_pdf, pbar),axis= 1,result_type = "expand") 50 | if delate_pdf: 51 | os.removedirs(save_dir) 52 | 53 | self.dataframe = self.dataframe.reset_index(drop = True) 54 | 55 | def _get_open_page(self,start_date,end_date, stock,page, searchkey): 56 | url = "http://www.cninfo.com.cn/new/hisAnnouncement/query?" 57 | headers = { 58 | "Referer": "http://www.cninfo.com.cn/new/commonUrl/pageOfSearch?url=disclosure/list/search&lastPage=index", 59 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36", 60 | } 61 | data = { 62 | "pageNum": page, 63 | "pageSize": "30", 64 | "column": "szse", 65 | "tabName": "fulltext", 66 | "plate":"", 67 | "stock":stock + "," + self.org_dict[stock] , 68 | "searchkey": searchkey, 69 | "secid":"", 70 | "category":"", 71 | "trade":"", 72 | "seDate": f"{start_date}~{end_date}", 73 | "sortName": "", 74 | "sortType": "", 75 | "isHLtitle": "true", 76 | } 77 | res = requests.post(url = url, headers = headers, data = data) 78 | if res.status_code != 200: 79 | raise ConnectionError 80 | 81 | res = json.loads(res.text) 82 | return res 83 | 84 | def _process_data(self,res): 85 | if res is None: 86 | return res 87 | else: 88 | return pd.DataFrame(res["announcements"]) 89 | 90 | def _get_pdfs(self,x, save_dir, delate_pdf,pbar): 91 | os.makedirs(save_dir, exist_ok= True) 92 | adjunctUrl = x.adjunctUrl 93 | pdf_base_url = "http://static.cninfo.com.cn/" 94 | pdf_url = pdf_base_url + adjunctUrl 95 | responsepdf = self._request_get(pdf_url) 96 | 97 | 98 | if responsepdf is None: 99 | pbar.update(1) 100 | return ("Failed Download","Failed Download") 101 | 102 | else: 103 | # make preparations 104 | file_name = x.announcementTitle 105 | file_name = "".join(file_name.split("<em>")) # strip the <em> highlight tags around matched keywords 106 | file_name = "".join(file_name.split("</em>")) 107 | file_name 108 | file_name = f"{x.secCode}_{x.secName}_{file_name}.pdf" 109 | file_path = os.path.join(save_dir, file_name) 110 | 111 | # save pdf 112 | with open(file_path, "wb") as f: 113 | f.write(responsepdf.content) 114 | 115 | # analyze pdf 116 | with open(file_path, "rb") as filehandle: 117 | pdf = PdfReader(filehandle) 118 | text_all = "" 119 | for page in pdf.pages: 120 | text = page.extract_text() 121 | text = "".join(text.split("\n")) 122 | text_all += text 123 | pbar.update(1) 124 | 125 | if delate_pdf: 126 | os.remove(file_path) 127 | return ("removed", text_all) 128 | else: 129 | return (file_path, text_all) 130 | 131 | def _get_orgid(self): 132 | org_dict = {} 133 | org_json = self._request_get("http://www.cninfo.com.cn/new/data/szse_stock.json").json()["stockList"] 134 | 135 | for i in range(len(org_json)): 136 | org_dict[org_json[i]["code"]] = org_json[i]["orgId"] 137 | 138 | return org_dict -------------------------------------------------------------------------------- /docs/FinNLP/site/assets/javascripts/lunr/min/lunr.de.min.js: -------------------------------------------------------------------------------- 1 | /*! 2 | * Lunr languages, `German` language 3 | * https://github.com/MihaiValentin/lunr-languages 4 | * 5 | * Copyright 2014, Mihai Valentin 6 | * http://www.mozilla.org/MPL/ 7 | */ 8 | /*! 
9 | * based on 10 | * Snowball JavaScript Library v0.3 11 | * http://code.google.com/p/urim/ 12 | * http://snowball.tartarus.org/ 13 | * 14 | * Copyright 2010, Oleg Mazko 15 | * http://www.mozilla.org/MPL/ 16 | */ 17 | 18 | !function(e,r){"function"==typeof define&&define.amd?define(r):"object"==typeof exports?module.exports=r():r()(e.lunr)}(this,function(){return function(e){if(void 0===e)throw new Error("Lunr is not present. Please include / require Lunr before this script.");if(void 0===e.stemmerSupport)throw new Error("Lunr stemmer support is not present. Please include / require Lunr stemmer support before this script.");e.de=function(){this.pipeline.reset(),this.pipeline.add(e.de.trimmer,e.de.stopWordFilter,e.de.stemmer),this.searchPipeline&&(this.searchPipeline.reset(),this.searchPipeline.add(e.de.stemmer))},e.de.wordCharacters="A-Za-zªºÀ-ÖØ-öø-ʸˠ-ˤᴀ-ᴥᴬ-ᵜᵢ-ᵥᵫ-ᵷᵹ-ᶾḀ-ỿⁱⁿₐ-ₜKÅℲⅎⅠ-ↈⱠ-ⱿꜢ-ꞇꞋ-ꞭꞰ-ꞷꟷ-ꟿꬰ-ꭚꭜ-ꭤff-stA-Za-z",e.de.trimmer=e.trimmerSupport.generateTrimmer(e.de.wordCharacters),e.Pipeline.registerFunction(e.de.trimmer,"trimmer-de"),e.de.stemmer=function(){var r=e.stemmerSupport.Among,n=e.stemmerSupport.SnowballProgram,i=new function(){function e(e,r,n){return!(!v.eq_s(1,e)||(v.ket=v.cursor,!v.in_grouping(p,97,252)))&&(v.slice_from(r),v.cursor=n,!0)}function i(){for(var r,n,i,s,t=v.cursor;;)if(r=v.cursor,v.bra=r,v.eq_s(1,"ß"))v.ket=v.cursor,v.slice_from("ss");else{if(r>=v.limit)break;v.cursor=r+1}for(v.cursor=t;;)for(n=v.cursor;;){if(i=v.cursor,v.in_grouping(p,97,252)){if(s=v.cursor,v.bra=s,e("u","U",i))break;if(v.cursor=s,e("y","Y",i))break}if(i>=v.limit)return void(v.cursor=n);v.cursor=i+1}}function s(){for(;!v.in_grouping(p,97,252);){if(v.cursor>=v.limit)return!0;v.cursor++}for(;!v.out_grouping(p,97,252);){if(v.cursor>=v.limit)return!0;v.cursor++}return!1}function t(){m=v.limit,l=m;var e=v.cursor+3;0<=e&&e<=v.limit&&(d=e,s()||(m=v.cursor,m=v.limit)return;v.cursor++}}}function c(){return m<=v.cursor}function u(){return l<=v.cursor}function a(){var e,r,n,i,s=v.limit-v.cursor;if(v.ket=v.cursor,(e=v.find_among_b(w,7))&&(v.bra=v.cursor,c()))switch(e){case 1:v.slice_del();break;case 2:v.slice_del(),v.ket=v.cursor,v.eq_s_b(1,"s")&&(v.bra=v.cursor,v.eq_s_b(3,"nis")&&v.slice_del());break;case 3:v.in_grouping_b(g,98,116)&&v.slice_del()}if(v.cursor=v.limit-s,v.ket=v.cursor,(e=v.find_among_b(f,4))&&(v.bra=v.cursor,c()))switch(e){case 1:v.slice_del();break;case 2:if(v.in_grouping_b(k,98,116)){var t=v.cursor-3;v.limit_backward<=t&&t<=v.limit&&(v.cursor=t,v.slice_del())}}if(v.cursor=v.limit-s,v.ket=v.cursor,(e=v.find_among_b(_,8))&&(v.bra=v.cursor,u()))switch(e){case 1:v.slice_del(),v.ket=v.cursor,v.eq_s_b(2,"ig")&&(v.bra=v.cursor,r=v.limit-v.cursor,v.eq_s_b(1,"e")||(v.cursor=v.limit-r,u()&&v.slice_del()));break;case 2:n=v.limit-v.cursor,v.eq_s_b(1,"e")||(v.cursor=v.limit-n,v.slice_del());break;case 3:if(v.slice_del(),v.ket=v.cursor,i=v.limit-v.cursor,!v.eq_s_b(2,"er")&&(v.cursor=v.limit-i,!v.eq_s_b(2,"en")))break;v.bra=v.cursor,c()&&v.slice_del();break;case 4:v.slice_del(),v.ket=v.cursor,e=v.find_among_b(b,2),e&&(v.bra=v.cursor,u()&&1==e&&v.slice_del())}}var d,l,m,h=[new r("",-1,6),new r("U",0,2),new r("Y",0,1),new r("ä",0,3),new r("ö",0,4),new r("ü",0,5)],w=[new r("e",-1,2),new r("em",-1,1),new r("en",-1,2),new r("ern",-1,1),new r("er",-1,1),new r("s",-1,3),new r("es",5,2)],f=[new r("en",-1,1),new r("er",-1,1),new r("st",-1,2),new r("est",2,1)],b=[new r("ig",-1,1),new r("lich",-1,1)],_=[new r("end",-1,1),new r("ig",-1,2),new r("ung",-1,1),new r("lich",-1,3),new r("isch",-1,2),new 
r("ik",-1,2),new r("heit",-1,3),new r("keit",-1,4)],p=[17,65,16,1,0,0,0,0,0,0,0,0,0,0,0,0,8,0,32,8],g=[117,30,5],k=[117,30,4],v=new n;this.setCurrent=function(e){v.setCurrent(e)},this.getCurrent=function(){return v.getCurrent()},this.stem=function(){var e=v.cursor;return i(),v.cursor=e,t(),v.limit_backward=e,v.cursor=v.limit,a(),v.cursor=v.limit_backward,o(),!0}};return function(e){return"function"==typeof e.update?e.update(function(e){return i.setCurrent(e),i.stem(),i.getCurrent()}):(i.setCurrent(e),i.stem(),i.getCurrent())}}(),e.Pipeline.registerFunction(e.de.stemmer,"stemmer-de"),e.de.stopWordFilter=e.generateStopWordFilter("aber alle allem allen aller alles als also am an ander andere anderem anderen anderer anderes anderm andern anderr anders auch auf aus bei bin bis bist da damit dann das dasselbe dazu daß dein deine deinem deinen deiner deines dem demselben den denn denselben der derer derselbe derselben des desselben dessen dich die dies diese dieselbe dieselben diesem diesen dieser dieses dir doch dort du durch ein eine einem einen einer eines einig einige einigem einigen einiger einiges einmal er es etwas euch euer eure eurem euren eurer eures für gegen gewesen hab habe haben hat hatte hatten hier hin hinter ich ihm ihn ihnen ihr ihre ihrem ihren ihrer ihres im in indem ins ist jede jedem jeden jeder jedes jene jenem jenen jener jenes jetzt kann kein keine keinem keinen keiner keines können könnte machen man manche manchem manchen mancher manches mein meine meinem meinen meiner meines mich mir mit muss musste nach nicht nichts noch nun nur ob oder ohne sehr sein seine seinem seinen seiner seines selbst sich sie sind so solche solchem solchen solcher solches soll sollte sondern sonst um und uns unse unsem unsen unser unses unter viel vom von vor war waren warst was weg weil weiter welche welchem welchen welcher welches wenn werde werden wie wieder will wir wird wirst wo wollen wollte während würde würden zu zum zur zwar zwischen über".split(" ")),e.Pipeline.registerFunction(e.de.stopWordFilter,"stopWordFilter-de")}}); -------------------------------------------------------------------------------- /test/en: -------------------------------------------------------------------------------- 1 | 2g1c 2 | 2 girls 1 cup 3 | acrotomophilia 4 | alabama hot pocket 5 | alaskan pipeline 6 | anal 7 | anilingus 8 | anus 9 | apeshit 10 | arsehole 11 | ass 12 | asshole 13 | assmunch 14 | auto erotic 15 | autoerotic 16 | babeland 17 | baby batter 18 | baby juice 19 | ball gag 20 | ball gravy 21 | ball kicking 22 | ball licking 23 | ball sack 24 | ball sucking 25 | bangbros 26 | bangbus 27 | bareback 28 | barely legal 29 | barenaked 30 | bastard 31 | bastardo 32 | bastinado 33 | bbw 34 | bdsm 35 | beaner 36 | beaners 37 | beaver cleaver 38 | beaver lips 39 | beastiality 40 | bestiality 41 | big black 42 | big breasts 43 | big knockers 44 | big tits 45 | bimbos 46 | birdlock 47 | bitch 48 | bitches 49 | black cock 50 | blonde action 51 | blonde on blonde action 52 | blowjob 53 | blow job 54 | blow your load 55 | blue waffle 56 | blumpkin 57 | bollocks 58 | bondage 59 | boner 60 | boob 61 | boobs 62 | booty call 63 | brown showers 64 | brunette action 65 | bukkake 66 | bulldyke 67 | bullet vibe 68 | bullshit 69 | bung hole 70 | bunghole 71 | busty 72 | butt 73 | buttcheeks 74 | butthole 75 | camel toe 76 | camgirl 77 | camslut 78 | camwhore 79 | carpet muncher 80 | carpetmuncher 81 | chocolate rosebuds 82 | cialis 83 | circlejerk 84 | cleveland steamer 85 | clit 86 | clitoris 87 | clover clamps 88 | 
clusterfuck 89 | cock 90 | cocks 91 | coprolagnia 92 | coprophilia 93 | cornhole 94 | coon 95 | coons 96 | creampie 97 | cum 98 | cumming 99 | cumshot 100 | cumshots 101 | cunnilingus 102 | cunt 103 | darkie 104 | date rape 105 | daterape 106 | deep throat 107 | deepthroat 108 | dendrophilia 109 | dick 110 | dildo 111 | dingleberry 112 | dingleberries 113 | dirty pillows 114 | dirty sanchez 115 | doggie style 116 | doggiestyle 117 | doggy style 118 | doggystyle 119 | dog style 120 | dolcett 121 | domination 122 | dominatrix 123 | dommes 124 | donkey punch 125 | double dong 126 | double penetration 127 | dp action 128 | dry hump 129 | dvda 130 | eat my ass 131 | ecchi 132 | ejaculation 133 | erotic 134 | erotism 135 | escort 136 | eunuch 137 | fag 138 | faggot 139 | fecal 140 | felch 141 | fellatio 142 | feltch 143 | female squirting 144 | femdom 145 | figging 146 | fingerbang 147 | fingering 148 | fisting 149 | foot fetish 150 | footjob 151 | frotting 152 | fuck 153 | fuck buttons 154 | fuckin 155 | fucking 156 | fucktards 157 | fudge packer 158 | fudgepacker 159 | futanari 160 | gangbang 161 | gang bang 162 | gay sex 163 | genitals 164 | giant cock 165 | girl on 166 | girl on top 167 | girls gone wild 168 | goatcx 169 | goatse 170 | god damn 171 | gokkun 172 | golden shower 173 | goodpoop 174 | goo girl 175 | goregasm 176 | grope 177 | group sex 178 | g-spot 179 | guro 180 | hand job 181 | handjob 182 | hard core 183 | hardcore 184 | hentai 185 | homoerotic 186 | honkey 187 | hooker 188 | horny 189 | hot carl 190 | hot chick 191 | how to kill 192 | how to murder 193 | huge fat 194 | humping 195 | incest 196 | intercourse 197 | jack off 198 | jail bait 199 | jailbait 200 | jelly donut 201 | jerk off 202 | jigaboo 203 | jiggaboo 204 | jiggerboo 205 | jizz 206 | juggs 207 | kike 208 | kinbaku 209 | kinkster 210 | kinky 211 | knobbing 212 | leather restraint 213 | leather straight jacket 214 | lemon party 215 | livesex 216 | lolita 217 | lovemaking 218 | make me come 219 | male squirting 220 | masturbate 221 | masturbating 222 | masturbation 223 | menage a trois 224 | milf 225 | missionary position 226 | mong 227 | motherfucker 228 | mound of venus 229 | mr hands 230 | muff diver 231 | muffdiving 232 | nambla 233 | nawashi 234 | negro 235 | neonazi 236 | nigga 237 | nigger 238 | nig nog 239 | nimphomania 240 | nipple 241 | nipples 242 | nsfw 243 | nsfw images 244 | nude 245 | nudity 246 | nutten 247 | nympho 248 | nymphomania 249 | octopussy 250 | omorashi 251 | one cup two girls 252 | one guy one jar 253 | orgasm 254 | orgy 255 | paedophile 256 | paki 257 | panties 258 | panty 259 | pedobear 260 | pedophile 261 | pegging 262 | penis 263 | phone sex 264 | piece of shit 265 | pikey 266 | pissing 267 | piss pig 268 | pisspig 269 | playboy 270 | pleasure chest 271 | pole smoker 272 | ponyplay 273 | poof 274 | poon 275 | poontang 276 | punany 277 | poop chute 278 | poopchute 279 | porn 280 | porno 281 | pornography 282 | prince albert piercing 283 | pthc 284 | pubes 285 | pussy 286 | queaf 287 | queef 288 | quim 289 | raghead 290 | raging boner 291 | rape 292 | raping 293 | rapist 294 | rectum 295 | reverse cowgirl 296 | rimjob 297 | rimming 298 | rosy palm 299 | rosy palm and her 5 sisters 300 | rusty trombone 301 | sadism 302 | santorum 303 | scat 304 | schlong 305 | scissoring 306 | semen 307 | sex 308 | sexcam 309 | sexo 310 | sexy 311 | sexual 312 | sexually 313 | sexuality 314 | shaved beaver 315 | shaved pussy 316 | shemale 317 | shibari 318 | shit 319 | shitblimp 320 | shitty 321 | shota 
322 | shrimping 323 | skeet 324 | slanteye 325 | slut 326 | s&m 327 | smut 328 | snatch 329 | snowballing 330 | sodomize 331 | sodomy 332 | spastic 333 | spic 334 | splooge 335 | splooge moose 336 | spooge 337 | spread legs 338 | spunk 339 | strap on 340 | strapon 341 | strappado 342 | strip club 343 | style doggy 344 | suck 345 | sucks 346 | suicide girls 347 | sultry women 348 | swastika 349 | swinger 350 | tainted love 351 | taste my 352 | tea bagging 353 | threesome 354 | throating 355 | thumbzilla 356 | tied up 357 | tight white 358 | tit 359 | tits 360 | titties 361 | titty 362 | tongue in a 363 | topless 364 | tosser 365 | towelhead 366 | tranny 367 | tribadism 368 | tub girl 369 | tubgirl 370 | tushy 371 | twat 372 | twink 373 | twinkie 374 | two girls one cup 375 | undressing 376 | upskirt 377 | urethra play 378 | urophilia 379 | vagina 380 | venus mound 381 | viagra 382 | vibrator 383 | violet wand 384 | vorarephilia 385 | voyeur 386 | voyeurweb 387 | voyuer 388 | vulva 389 | wank 390 | wetback 391 | wet dream 392 | white power 393 | whore 394 | worldsex 395 | wrapping men 396 | wrinkled starfish 397 | xx 398 | xxx 399 | yaoi 400 | yellow showers 401 | yiffy 402 | zoophilia 403 | 🖕 404 | -------------------------------------------------------------------------------- /docs/FinNLP/site/assets/javascripts/lunr/min/lunr.du.min.js: -------------------------------------------------------------------------------- 1 | /*! 2 | * Lunr languages, `Dutch` language 3 | * https://github.com/MihaiValentin/lunr-languages 4 | * 5 | * Copyright 2014, Mihai Valentin 6 | * http://www.mozilla.org/MPL/ 7 | */ 8 | /*! 9 | * based on 10 | * Snowball JavaScript Library v0.3 11 | * http://code.google.com/p/urim/ 12 | * http://snowball.tartarus.org/ 13 | * 14 | * Copyright 2010, Oleg Mazko 15 | * http://www.mozilla.org/MPL/ 16 | */ 17 | 18 | !function(e,r){"function"==typeof define&&define.amd?define(r):"object"==typeof exports?module.exports=r():r()(e.lunr)}(this,function(){return function(e){if(void 0===e)throw new Error("Lunr is not present. Please include / require Lunr before this script.");if(void 0===e.stemmerSupport)throw new Error("Lunr stemmer support is not present. Please include / require Lunr stemmer support before this script.");console.warn('[Lunr Languages] Please use the "nl" instead of the "du". 
The "nl" code is the standard code for Dutch language, and "du" will be removed in the next major versions.'),e.du=function(){this.pipeline.reset(),this.pipeline.add(e.du.trimmer,e.du.stopWordFilter,e.du.stemmer),this.searchPipeline&&(this.searchPipeline.reset(),this.searchPipeline.add(e.du.stemmer))},e.du.wordCharacters="A-Za-zªºÀ-ÖØ-öø-ʸˠ-ˤᴀ-ᴥᴬ-ᵜᵢ-ᵥᵫ-ᵷᵹ-ᶾḀ-ỿⁱⁿₐ-ₜKÅℲⅎⅠ-ↈⱠ-ⱿꜢ-ꞇꞋ-ꞭꞰ-ꞷꟷ-ꟿꬰ-ꭚꭜ-ꭤff-stA-Za-z",e.du.trimmer=e.trimmerSupport.generateTrimmer(e.du.wordCharacters),e.Pipeline.registerFunction(e.du.trimmer,"trimmer-du"),e.du.stemmer=function(){var r=e.stemmerSupport.Among,i=e.stemmerSupport.SnowballProgram,n=new function(){function e(){for(var e,r,i,o=C.cursor;;){if(C.bra=C.cursor,e=C.find_among(b,11))switch(C.ket=C.cursor,e){case 1:C.slice_from("a");continue;case 2:C.slice_from("e");continue;case 3:C.slice_from("i");continue;case 4:C.slice_from("o");continue;case 5:C.slice_from("u");continue;case 6:if(C.cursor>=C.limit)break;C.cursor++;continue}break}for(C.cursor=o,C.bra=o,C.eq_s(1,"y")?(C.ket=C.cursor,C.slice_from("Y")):C.cursor=o;;)if(r=C.cursor,C.in_grouping(q,97,232)){if(i=C.cursor,C.bra=i,C.eq_s(1,"i"))C.ket=C.cursor,C.in_grouping(q,97,232)&&(C.slice_from("I"),C.cursor=r);else if(C.cursor=i,C.eq_s(1,"y"))C.ket=C.cursor,C.slice_from("Y"),C.cursor=r;else if(n(r))break}else if(n(r))break}function n(e){return C.cursor=e,e>=C.limit||(C.cursor++,!1)}function o(){_=C.limit,f=_,t()||(_=C.cursor,_<3&&(_=3),t()||(f=C.cursor))}function t(){for(;!C.in_grouping(q,97,232);){if(C.cursor>=C.limit)return!0;C.cursor++}for(;!C.out_grouping(q,97,232);){if(C.cursor>=C.limit)return!0;C.cursor++}return!1}function s(){for(var e;;)if(C.bra=C.cursor,e=C.find_among(p,3))switch(C.ket=C.cursor,e){case 1:C.slice_from("y");break;case 2:C.slice_from("i");break;case 3:if(C.cursor>=C.limit)return;C.cursor++}}function u(){return _<=C.cursor}function c(){return f<=C.cursor}function a(){var e=C.limit-C.cursor;C.find_among_b(g,3)&&(C.cursor=C.limit-e,C.ket=C.cursor,C.cursor>C.limit_backward&&(C.cursor--,C.bra=C.cursor,C.slice_del()))}function l(){var e;w=!1,C.ket=C.cursor,C.eq_s_b(1,"e")&&(C.bra=C.cursor,u()&&(e=C.limit-C.cursor,C.out_grouping_b(q,97,232)&&(C.cursor=C.limit-e,C.slice_del(),w=!0,a())))}function m(){var e;u()&&(e=C.limit-C.cursor,C.out_grouping_b(q,97,232)&&(C.cursor=C.limit-e,C.eq_s_b(3,"gem")||(C.cursor=C.limit-e,C.slice_del(),a())))}function d(){var e,r,i,n,o,t,s=C.limit-C.cursor;if(C.ket=C.cursor,e=C.find_among_b(h,5))switch(C.bra=C.cursor,e){case 1:u()&&C.slice_from("heid");break;case 2:m();break;case 3:u()&&C.out_grouping_b(z,97,232)&&C.slice_del()}if(C.cursor=C.limit-s,l(),C.cursor=C.limit-s,C.ket=C.cursor,C.eq_s_b(4,"heid")&&(C.bra=C.cursor,c()&&(r=C.limit-C.cursor,C.eq_s_b(1,"c")||(C.cursor=C.limit-r,C.slice_del(),C.ket=C.cursor,C.eq_s_b(2,"en")&&(C.bra=C.cursor,m())))),C.cursor=C.limit-s,C.ket=C.cursor,e=C.find_among_b(k,6))switch(C.bra=C.cursor,e){case 1:if(c()){if(C.slice_del(),i=C.limit-C.cursor,C.ket=C.cursor,C.eq_s_b(2,"ig")&&(C.bra=C.cursor,c()&&(n=C.limit-C.cursor,!C.eq_s_b(1,"e")))){C.cursor=C.limit-n,C.slice_del();break}C.cursor=C.limit-i,a()}break;case 2:c()&&(o=C.limit-C.cursor,C.eq_s_b(1,"e")||(C.cursor=C.limit-o,C.slice_del()));break;case 3:c()&&(C.slice_del(),l());break;case 4:c()&&C.slice_del();break;case 5:c()&&w&&C.slice_del()}C.cursor=C.limit-s,C.out_grouping_b(j,73,232)&&(t=C.limit-C.cursor,C.find_among_b(v,4)&&C.out_grouping_b(q,97,232)&&(C.cursor=C.limit-t,C.ket=C.cursor,C.cursor>C.limit_backward&&(C.cursor--,C.bra=C.cursor,C.slice_del())))}var f,_,w,b=[new r("",-1,6),new 
r("á",0,1),new r("ä",0,1),new r("é",0,2),new r("ë",0,2),new r("í",0,3),new r("ï",0,3),new r("ó",0,4),new r("ö",0,4),new r("ú",0,5),new r("ü",0,5)],p=[new r("",-1,3),new r("I",0,2),new r("Y",0,1)],g=[new r("dd",-1,-1),new r("kk",-1,-1),new r("tt",-1,-1)],h=[new r("ene",-1,2),new r("se",-1,3),new r("en",-1,2),new r("heden",2,1),new r("s",-1,3)],k=[new r("end",-1,1),new r("ig",-1,2),new r("ing",-1,1),new r("lijk",-1,3),new r("baar",-1,4),new r("bar",-1,5)],v=[new r("aa",-1,-1),new r("ee",-1,-1),new r("oo",-1,-1),new r("uu",-1,-1)],q=[17,65,16,1,0,0,0,0,0,0,0,0,0,0,0,0,128],j=[1,0,0,17,65,16,1,0,0,0,0,0,0,0,0,0,0,0,0,128],z=[17,67,16,1,0,0,0,0,0,0,0,0,0,0,0,0,128],C=new i;this.setCurrent=function(e){C.setCurrent(e)},this.getCurrent=function(){return C.getCurrent()},this.stem=function(){var r=C.cursor;return e(),C.cursor=r,o(),C.limit_backward=r,C.cursor=C.limit,d(),C.cursor=C.limit_backward,s(),!0}};return function(e){return"function"==typeof e.update?e.update(function(e){return n.setCurrent(e),n.stem(),n.getCurrent()}):(n.setCurrent(e),n.stem(),n.getCurrent())}}(),e.Pipeline.registerFunction(e.du.stemmer,"stemmer-du"),e.du.stopWordFilter=e.generateStopWordFilter(" aan al alles als altijd andere ben bij daar dan dat de der deze die dit doch doen door dus een eens en er ge geen geweest haar had heb hebben heeft hem het hier hij hoe hun iemand iets ik in is ja je kan kon kunnen maar me meer men met mij mijn moet na naar niet niets nog nu of om omdat onder ons ook op over reeds te tegen toch toen tot u uit uw van veel voor want waren was wat werd wezen wie wil worden wordt zal ze zelf zich zij zijn zo zonder zou".split(" ")),e.Pipeline.registerFunction(e.du.stopWordFilter,"stopWordFilter-du")}}); -------------------------------------------------------------------------------- /finnlp/data_sources/company_announcement/sec.py: -------------------------------------------------------------------------------- 1 | from finnlp.data_sources.company_announcement._base import Company_Announcement_Downloader 2 | 3 | from tqdm import tqdm 4 | from lxml import etree 5 | import pandas as pd 6 | import requests 7 | import json 8 | import time 9 | 10 | class SEC_Announcement(Company_Announcement_Downloader): 11 | 12 | def __init__(self, args = {}): 13 | super().__init__(args) 14 | self.dataframe = pd.DataFrame() 15 | 16 | def download_date_range_stock(self, start_date, end_date, stock = "AAPL", delay = 0.1): 17 | entityName = self._get_entity_name(stock) 18 | # first page 19 | total_pages = self._gather_one_page(start_date, end_date, 1, entityName, delay) 20 | # other pages 21 | if total_pages>1: 22 | for page in tqdm(range(1, total_pages), desc="Downloading other page..."): 23 | self._gather_one_page(start_date, end_date, page + 1, entityName, delay ) 24 | 25 | self.dataframe = self.dataframe.reset_index(drop = True) 26 | 27 | def _get_entity_name(self, stock = "AAPL"): 28 | url = "https://efts.sec.gov/LATEST/search-index" 29 | headers = { 30 | "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36" 31 | } 32 | params = { 33 | "keysTyped":stock 34 | } 35 | resp = self._request_get(url = url, headers= headers, params= params) 36 | if resp is None: 37 | raise ConnectionError("Can't get entity name") 38 | 39 | res = json.loads(resp.text) 40 | item_list = res["hits"]["hits"] 41 | entityName_list = [] 42 | for item in item_list: 43 | c_name_one = item["_source"]["entity_words"] 44 | c_name_two = item["_id"].zfill(10) 45 | entityName = 
f"{c_name_one} (CIK {c_name_two})" 46 | entityName_list.append(entityName) 47 | 48 | entityName = entityName_list[0] 49 | 50 | return entityName 51 | 52 | def _gather_one_page(self, start_date, end_date, page, entityName = "Apple Inc. (AAPL) (CIK 0000320193)", delay = 0.01): 53 | from_ = (page-1)*100 54 | url = "https://efts.sec.gov/LATEST/search-index" 55 | headers = { 56 | "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36" 57 | } 58 | params = { 59 | "dateRange": "all", 60 | "entityName": entityName, 61 | "startdt": start_date, 62 | "enddt": end_date, 63 | "from" : from_, 64 | "page" : page, 65 | } 66 | 67 | resp = self._request_get(url = url, headers= headers, params= params) 68 | 69 | if resp is None: 70 | return 'Error' 71 | res = json.loads(resp.text) 72 | 73 | # total 74 | total_items = res["hits"]["total"]["value"] 75 | if total_items % 100 == 0: 76 | total_pages = total_items // 100 77 | else: 78 | total_pages = total_items // 100 + 1 79 | 80 | items = res["hits"]["hits"] 81 | 82 | url_base = "https://www.sec.gov/Archives/edgar/data" 83 | 84 | for item in tqdm(items, desc="Downloading by item..." ): 85 | url_third = item["_source"]["xsl"] 86 | url_second, url_fourth = item["_id"].split(":") 87 | url_second = url_second.split("-") 88 | url_first = url_second[0] 89 | url_first = url_first.strip("0") 90 | url_second = ''.join(url_second) 91 | url_first, url_second, url_fourth 92 | 93 | if url_third is not None: 94 | url_new = f"{url_base}/{url_first}/{url_second}/{url_third}/{url_fourth}" 95 | else: 96 | url_new = f"{url_base}/{url_first}/{url_second}/{url_fourth}" 97 | respn = self._request_get(url = url_new, headers= headers) 98 | if respn is None: 99 | continue 100 | try: 101 | res = etree.HTML(respn.text) 102 | content = res.xpath("/html/body//text()") 103 | content = [c for c in content if c != "\n"] 104 | content = "".join(content) 105 | 106 | _id = item["_id"] 107 | ciks = item["_source"]["ciks"] 108 | period_ending = item["_source"]["period_ending"] 109 | root_form = item["_source"]["root_form"] 110 | file_num = item["_source"]["file_num"] 111 | display_names = item["_source"]["display_names"] 112 | xsl = item["_source"]["xsl"] 113 | sequence = item["_source"]["sequence"] 114 | file_date = item["_source"]["file_date"] 115 | biz_states = item["_source"]["biz_states"] 116 | sics = item["_source"]["sics"] 117 | form = item["_source"]["form"] 118 | adsh = item["_source"]["adsh"] 119 | film_num = item["_source"]["film_num"] 120 | biz_locations = item["_source"]["biz_locations"] 121 | file_type = item["_source"]["file_type"] 122 | file_description = item["_source"]["file_description"] 123 | inc_states = item["_source"]["inc_states"] 124 | ite = item["_source"]["items"] 125 | 126 | data = [ 127 | _id, ciks, period_ending, root_form, file_num, display_names, xsl, sequence, 128 | file_date, biz_states, sics, form, adsh, film_num, biz_locations, file_type, 129 | file_description, inc_states, ite, content 130 | ] 131 | columns = [ 132 | "_id", "ciks", "period_ending", "root_form", "file_num", "display_names", "xsl", "sequence", 133 | "file_date", "biz_states", "sics", "form", "adsh", "film_num", "biz_locations", "file_type", 134 | "file_description", "inc_states", "ite", "content" 135 | ] 136 | tmp = pd.DataFrame(data = data).T 137 | tmp.columns = columns 138 | 139 | self.dataframe = pd.concat([self.dataframe, tmp]) 140 | time.sleep(delay) 141 | except: 142 | continue 143 | 144 | return total_pages 145 | 146 | 
--------------------------------------------------------------------------------
/finnlp/data_sources/social_media/weibo_date_range.py:
--------------------------------------------------------------------------------
1 | from finnlp.data_sources.social_media._base import Social_Media_Downloader
2 | 
3 | from tqdm import tqdm
4 | from lxml import etree
5 | import pandas as pd
6 | import numpy as np
7 | import requests
8 | import datetime
9 | import time
10 | import json
11 | import re
12 | 
13 | class Weibo_Date_Range(Social_Media_Downloader):
14 |     def __init__(self, args = {}):
15 |         super().__init__(args)
16 |         if "cookies" not in args.keys():
17 |             raise ValueError("You need to first log in at https://weibo.com/, then copy your cookies and pass them as the value of the 'cookies' key in args.")
18 |         self.cookies = args["cookies"]
19 |         self.dataframe = pd.DataFrame()
20 | 
21 |     def download_date_range_stock(self, start_date, end_date, start_hour = 0, end_hour = 0, stock = "茅台", delay = 0.01):
22 |         self.date_list = pd.date_range(start_date, end_date)
23 |         for date in tqdm(self.date_list, desc = "Downloading by dates..."):
24 |             date = date.strftime("%Y-%m-%d")
25 |             self._gather_one_day(date, start_hour, end_hour, stock, delay)
26 |         self.dataframe = self.dataframe.reset_index(drop = True)
27 | 
28 |     def _gather_one_day(self, date, start_hour, end_hour, stock = "茅台", delay = 0.01):
29 |         if start_hour == 0 and end_hour == 0:
30 |             start_date = datetime.datetime.strptime(date, "%Y-%m-%d")
31 |             end_date = start_date + datetime.timedelta(days=1)
32 |             start_date = start_date.strftime("%Y-%m-%d")
33 |             end_date = end_date.strftime("%Y-%m-%d")
34 |         else:
35 |             start_date, end_date = date, date  # same day for both bounds when explicit hours are given
36 | 
37 |         # first page
38 |         all_urls = self._gather_first_page(start_date, end_date, start_hour, end_hour, stock, delay)
39 |         # other pages
40 |         if len(all_urls) > 1:
41 |             base_url = "https://s.weibo.com/"
42 |             for url_new in all_urls:
43 |                 url_new = base_url + url_new
44 |                 self._gather_other_pages(date, url_new, delay)
45 | 
46 |     def _gather_first_page(self, start_date, end_date, start_hour, end_hour, stock = "茅台", delay = 0.01):
47 | 
48 |         headers = {
49 |             "cookie": self.cookies,
50 |             "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0",
51 |         }
52 | 
53 |         params = {
54 |             "q": stock,
55 |             "typeall": "1",
56 |             "suball": "1",
57 |             "timescope": f"custom:{start_date}-{start_hour}:{end_date}-{end_hour}",
58 |             "Refer": "g",
59 |             "page": "1"
60 |         }
61 | 
62 |         url = "https://s.weibo.com/weibo"
63 |         resp = self._request_get(url, headers=headers, params = params)
64 | 
65 |         if resp is None:
66 |             return "Error"
67 | 
68 |         if "passport.weibo.com" in resp.url:
69 |             raise ValueError("Your cookies are invalid. 
Please first log in at https://weibo.com/ and then copy you cookies and use it as the [value] of [key] \'cookies\' ") 70 | 71 | res = etree.HTML(resp.content) 72 | # get all pages 73 | all_pages = res.xpath('//*[@id="pl_feedlist_index"]/div[3]/div[1]/span/ul/li//@href') 74 | items = res.xpath('//div[@class="card-wrap"]') 75 | for i in items: 76 | ps = i.xpath('.//div[@class="content"]//p') 77 | try: 78 | content = ps[0].xpath(".//text()") 79 | content = ''.join(content) 80 | content = content.replace('\n',"") 81 | content = content.replace(' ',"") 82 | content = content.replace('\u200b',"") 83 | except: 84 | continue 85 | 86 | info = ps[1].xpath(".//text()") 87 | try: 88 | date_content = info[1] 89 | date_content = date_content.replace('\n',"") 90 | date_content = date_content.replace(' ',"") 91 | except: 92 | date_content = np.nan 93 | 94 | try: 95 | source = info[3] 96 | except: 97 | source = np.nan 98 | 99 | tmp = pd.DataFrame([start_date, date_content, source, content]).T 100 | tmp.columns = ["date","date_content", "source", "content"] 101 | self.dataframe = pd.concat([self.dataframe, tmp]) 102 | 103 | time.sleep(delay) 104 | 105 | return all_pages 106 | 107 | def _gather_other_pages(self, date, url, delay = 0.01): 108 | 109 | headers = { 110 | "cookie": self.cookies, 111 | "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0", 112 | } 113 | 114 | resp = self._request_get(url, headers=headers) 115 | 116 | if resp is None: 117 | return "Error" 118 | 119 | if "passport.weibo.com" in resp.url: 120 | raise ValueError("Your cookies is useless. Please first log in at https://weibo.com/ and then copy you cookies and use it as the [value] of [key] \'cookies\' ") 121 | 122 | res = etree.HTML(resp.content) 123 | # get all pages 124 | all_pages = res.xpath('//*[@id="pl_feedlist_index"]/div[3]/div[1]/span/ul/li//@href') 125 | items = res.xpath('//div[@class="card-wrap"]') 126 | for i in items: 127 | ps = i.xpath('.//div[@class="content"]//p') 128 | try: 129 | content = ps[0].xpath(".//text()") 130 | content = ''.join(content) 131 | content = content.replace('\n',"") 132 | content = content.replace(' ',"") 133 | content = content.replace('\u200b',"") 134 | except: 135 | continue 136 | 137 | info = ps[1].xpath(".//text()") 138 | try: 139 | date_content = info[1] 140 | date_content = date_content.replace('\n',"") 141 | date_content = date_content.replace(' ',"") 142 | except: 143 | date_content = np.nan 144 | 145 | try: 146 | source = info[3] 147 | except: 148 | source = np.nan 149 | 150 | tmp = pd.DataFrame([date, date_content, source, content]).T 151 | tmp.columns = ["date", "date_content", "source", "content"] 152 | self.dataframe = pd.concat([self.dataframe, tmp]) 153 | 154 | time.sleep(delay) 155 | -------------------------------------------------------------------------------- /finnlp/data_sources/sec_filings/prepline_sec_filings/sections.py: -------------------------------------------------------------------------------- 1 | """Module for defining/enumerating the common sections from SEC forms""" 2 | import re 3 | from enum import Enum 4 | from typing import List 5 | 6 | 7 | class SECSection(Enum): 8 | PROSPECTUS_SUMMARY = re.compile(r"^(?:prospectus )?summary$") 9 | ABOUT_PROSPECTUS = re.compile(r"about this prospectus") 10 | FORWARD_LOOKING_STATEMENTS = re.compile(r"forward[ -]looking statements") 11 | RISK_FACTORS = re.compile(r"risk factors") 12 | USE_OF_PROCEEDS = re.compile(r"use of proceeds") 13 | DIVIDEND_POLICY = re.compile(r"^dividend 
policy") 14 | CAPITALIZATION = re.compile(r"^capitalization$") 15 | DILUTION = re.compile(r"^dilution$") 16 | MANAGEMENT_DISCUSSION = re.compile(r"^management(?:[\u2019']s)? discussion") 17 | BUSINESS = re.compile(r"^business$") 18 | MANAGEMENT = re.compile(r"^(?:(?:our )?management)|(?:executive officers)$") 19 | COMPENSATION = re.compile(r"compensation") 20 | RELATED_PARTY_TRANSACTIONS = re.compile(r"(?:relationships|related).*transactions") 21 | PRINCIPAL_STOCKHOLDERS = re.compile( 22 | r"(?:principal.*(?:stockholder|shareholder)s?)|(?:(security|stock|share) " 23 | r"ownership .*certain)" 24 | ) 25 | DESCRIPTION_OF_STOCK = re.compile( 26 | r"^description of (?:capital stock|share capital|securities)" 27 | ) 28 | DESCRIPTION_OF_DEBT = re.compile(r"^description of .*debt") 29 | FUTURE_SALE = re.compile(r"(?:shares|stock) eligible for future sale") 30 | US_TAX = re.compile( 31 | r"(?:us|u\.s\.|united states|material federal).* tax" 32 | r" (?:consideration|consequence)" 33 | ) 34 | UNDERWRITING = re.compile(r"underwrit") 35 | LEGAL_MATTERS = re.compile(r"legal matters") 36 | EXPERTS = re.compile(r"^experts$") 37 | MORE_INFORMATION = re.compile(r"(?:additional|more) information") 38 | FINANCIAL_STATEMENTS = r"financial statements" 39 | MARKET_RISK_DISCLOSURES = ( 40 | r"(?:quantitative|qualitative) disclosures? about market risk" 41 | ) 42 | CONTROLS_AND_PROCEDURES = r"controls and procedures" 43 | LEGAL_PROCEEDINGS = r"legal proceedings" 44 | DEFAULTS = r"defaults (?:up)?on .*securities" 45 | MINE_SAFETY = r"mine safety disclosures?" 46 | OTHER_INFORMATION = r"other information" 47 | UNRESOLVED_STAFF_COMMENTS = r"unresolved staff comments" 48 | PROPERTIES = r"^properties$" 49 | MARKET_FOR_REGISTRANT_COMMON_EQUITY = ( 50 | r"market for(?: the)? (?:registrant|company)(?:['\u2019]s)? common equity" 51 | ) 52 | ACCOUNTING_DISAGREEMENTS = r"disagreements with accountants" 53 | FOREIGN_JURISDICTIONS = r"diclosure .*foreign jurisdictions .*inspection" 54 | EXECUTIVE_OFFICERS = r"executive officers" 55 | ACCOUNTING_FEES = r"accounting fees" 56 | EXHIBITS = r"^exhibits?(.*financial statement schedules)?$" 57 | FORM_SUMMARY = r"^form .*summary$" 58 | # NOTE(yuming): Additional section titles used in test_real_examples.py, 59 | # maybe change this when custom regex string param is allowed. 
60 | CERTAIN_TRADEMARKS = r"certain trademarks" 61 | OFFER_PRICE = r"(?:determination of )offering price" 62 | 63 | @property 64 | def pattern(self): 65 | return self.value 66 | 67 | 68 | ALL_SECTIONS = "_ALL" 69 | 70 | section_string_to_enum = {enum.name: enum for enum in SECSection} 71 | 72 | # NOTE(robinson) - Sections are listed in the following document from SEC 73 | # ref: https://www.sec.gov/files/form10-k.pdf 74 | SECTIONS_10K = ( 75 | SECSection.BUSINESS, # ITEM 1 76 | SECSection.RISK_FACTORS, # ITEM 1A 77 | SECSection.UNRESOLVED_STAFF_COMMENTS, # ITEM 1B 78 | SECSection.PROPERTIES, # ITEM 2 79 | SECSection.LEGAL_PROCEEDINGS, # ITEM 3 80 | SECSection.MINE_SAFETY, # ITEM 4 81 | SECSection.MARKET_FOR_REGISTRANT_COMMON_EQUITY, # ITEM 5 82 | # NOTE(robinson) - ITEM 6 is "RESERVED" 83 | SECSection.MANAGEMENT_DISCUSSION, # ITEM 7 84 | SECSection.MARKET_RISK_DISCLOSURES, # ITEM 7A 85 | SECSection.FINANCIAL_STATEMENTS, # ITEM 8 86 | SECSection.ACCOUNTING_DISAGREEMENTS, # ITEM 9 87 | SECSection.CONTROLS_AND_PROCEDURES, # ITEM 9A 88 | # NOTE(robinson) - ITEM 9B is other information 89 | SECSection.FOREIGN_JURISDICTIONS, # ITEM 9C 90 | SECSection.MANAGEMENT, # ITEM 10 91 | SECSection.COMPENSATION, # ITEM 11 92 | SECSection.PRINCIPAL_STOCKHOLDERS, # ITEM 12 93 | SECSection.RELATED_PARTY_TRANSACTIONS, # ITEM 13 94 | SECSection.ACCOUNTING_FEES, # ITEM 14 95 | SECSection.EXHIBITS, # ITEM 15 96 | SECSection.FORM_SUMMARY, # ITEM 16 97 | ) 98 | 99 | # NOTE(robinson) - Sections are listed in the following document from SEC 100 | # ref: https://www.sec.gov/files/form10-q.pdf 101 | SECTIONS_10Q = ( 102 | # Part I - Financial information 103 | SECSection.FINANCIAL_STATEMENTS, # ITEM 1 104 | SECSection.MANAGEMENT_DISCUSSION, # ITEM 2 105 | SECSection.MARKET_RISK_DISCLOSURES, # ITEM 3 106 | SECSection.CONTROLS_AND_PROCEDURES, # ITEM 4 107 | # Part II - Other information 108 | SECSection.LEGAL_PROCEEDINGS, # ITEM 1 109 | SECSection.RISK_FACTORS, # ITEM 1A 110 | SECSection.USE_OF_PROCEEDS, # ITEM 2 111 | SECSection.DEFAULTS, # ITEM 3 112 | SECSection.MINE_SAFETY, # ITEM 4 113 | SECSection.OTHER_INFORMATION, # ITEM 5 114 | ) 115 | 116 | SECTIONS_S1 = ( 117 | SECSection.PROSPECTUS_SUMMARY, 118 | SECSection.ABOUT_PROSPECTUS, 119 | SECSection.FORWARD_LOOKING_STATEMENTS, 120 | SECSection.RISK_FACTORS, 121 | SECSection.USE_OF_PROCEEDS, 122 | SECSection.DIVIDEND_POLICY, 123 | SECSection.CAPITALIZATION, 124 | SECSection.DILUTION, 125 | SECSection.MANAGEMENT_DISCUSSION, 126 | SECSection.BUSINESS, 127 | SECSection.MANAGEMENT, 128 | SECSection.COMPENSATION, 129 | SECSection.RELATED_PARTY_TRANSACTIONS, 130 | SECSection.PRINCIPAL_STOCKHOLDERS, 131 | SECSection.DESCRIPTION_OF_STOCK, 132 | SECSection.DESCRIPTION_OF_DEBT, 133 | SECSection.FUTURE_SALE, 134 | SECSection.US_TAX, 135 | SECSection.UNDERWRITING, 136 | SECSection.LEGAL_MATTERS, 137 | SECSection.EXPERTS, 138 | SECSection.MORE_INFORMATION, 139 | ) 140 | 141 | 142 | def validate_section_names(section_names: List[str]): 143 | """Return section names that don't correspond to a defined enum.""" 144 | if len(section_names) == 1 and section_names[0] == ALL_SECTIONS: 145 | return None 146 | elif len(section_names) > 1 and ALL_SECTIONS in section_names: 147 | raise ValueError(f"{ALL_SECTIONS} may not be specified with other sections") 148 | 149 | invalid_names = [ 150 | name for name in section_names if name not in section_string_to_enum 151 | ] 152 | if invalid_names: 153 | raise ValueError(f"The following section names are not valid: {invalid_names}") 154 | return None 
155 | 
--------------------------------------------------------------------------------
/docs/FinNLP/docs/zh/index.md:
--------------------------------------------------------------------------------
1 | # Financial Data from the Internet
2 | 
3 | For the demo, please refer to [FinGPT](https://github.com/AI4Finance-Foundation/ChatGPT-for-FinTech)
4 | 
5 | **Disclaimer: We share the code under the MIT Education License for academic research purposes. Nothing herein is financial advice, nor a recommendation to trade real money. Please use common sense and consult a professional before trading or investing.**
6 | 
7 | ## Ⅰ. Architecture
8 | 
9 | ![image-20230505200244043](https://cdn.jsdelivr.net/gh/oliverwang15/imgbed@main/img/202305052002139.png)
10 | 
11 | * The whole project consists of 4 parts:
12 | 
13 | * The first part is the **data sources**, where we collect historical and streaming data from the Internet.
14 | 
15 | * Next, the data is pushed to the **data engineering** part, where we clean it, tokenize it, and do prompt engineering.
16 | 
17 | * Then the data is pushed to the **large language models (LLMs)**. Here we can use LLMs in different ways: we can not only train our own **lightweight fine-tuned models** on the collected data, but also use the data together with **trained models** or **LLM APIs** to support our applications.
18 | 
19 | * The last part is the **applications** part, where we can use the data and LLMs to build many interesting applications.
20 | 
21 | ## Ⅱ. Data Sources
22 | 
23 | ![image-20230505200446477](https://cdn.jsdelivr.net/gh/oliverwang15/imgbed@main/img/202305052004539.png)
24 | 
25 | * Due to space limitations, we only show part of the sources here.
26 | 
27 | ### 1. [News](jupyter/Data_Sources_News.ipynb)
28 | 
29 | | Platform | Data Type | Related Market | Specified Company | Range Type | Source Type | Limits | Docs (1e4) | Support |
30 | | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: |
31 | | Yahoo | Financial news | US stocks | √ | Date range | Official | N/A | 1,500+ | √ |
32 | | Reuters | Financial news | US stocks | × | Date range | Official | N/A | 1,500+ | √ |
33 | | Sina | Financial news | CN stocks | × | Date range | Official | N/A | 2,000+ | √ |
34 | | Eastmoney | Financial news | CN stocks | √ | Date range | Official | N/A | 1,000+ | √ |
35 | | Yicai | Financial news | CN stocks | √ | Date range | Official | N/A | 500+ | Soon |
36 | | CCTV | Government news | CN stocks | × | Date range | Third party | N/A | 4 | √ |
37 | | US mainstream media | Financial news | US stocks | √ | Date range | Third party | Account (free) | 3,200+ | √ |
38 | | CN mainstream media | Financial news | CN stocks | × | Date range | Third party | ¥500/year | 3,000+ | √ |
39 | 
40 | * FinGPT may have fewer documents than BloombergGPT, but they are on the same order of magnitude.
41 | 
42 | ### 2. [Social Media](jupyter/Data_Sources_Social_Media.ipynb)
43 | 
44 | | Platform | Data Type | Related Market | Specified Company | Range Type | Source Type | Limits | Docs (1e4) | Support |
45 | | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: |
46 | | Twitter | Tweets | US stocks | √ | Date range | Official | N/A | 18,000+ | √ |
47 | | StockTwits | Tweets | US stocks | √ | Latest | Official | N/A | 160,000+ | √ |
48 | | Reddit (wallstreetbets) | Posts | US stocks | × | Latest | Official | N/A | 9+ | √ |
49 | | Weibo | Tweets | CN stocks | √ | Date range | Official | Cookies | 1,400,000+ | √ |
50 | | Weibo | Tweets | CN stocks | √ | Latest | Official | N/A | 1,400,000+ | √ |
51 | 
52 | * **BloombergGPT** does **not collect social media data**, but we believe that **public opinion is one of the most important factors affecting the stock market** (a minimal usage sketch of the Weibo downloader from this repository follows below).
53 | 
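The Weibo date-range rows above are served by the `Weibo_Date_Range` downloader shown earlier in this dump (`finnlp/data_sources/social_media/weibo_date_range.py`). A minimal usage sketch; the cookie string, dates, and keyword below are placeholders you must supply yourself:

```python
from finnlp.data_sources.social_media.weibo_date_range import Weibo_Date_Range

# Placeholder: paste the cookie string copied from a logged-in https://weibo.com/ session.
weibo_cookies = "SUB=...; SUBP=...; ..."

downloader = Weibo_Date_Range(args={"cookies": weibo_cookies})
# Download posts mentioning a keyword (here the stock name "茅台"), one day at a time.
downloader.download_date_range_stock("2023-01-01", "2023-01-03", stock="茅台")

# Results accumulate in a DataFrame with date / date_content / source / content columns.
print(downloader.dataframe.head())
```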
54 | ### 3. [Company Announcements](jupyter/Data_Sources_Company_Announcement.ipynb)
55 | 
56 | | Platform | Data Type | Related Market | Specified Company | Range Type | Source Type | Limits | Docs (1e4) | Support |
57 | | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: |
58 | | Juchao (official) | Text | CN stocks | √ | Date range | Official | N/A | 2,790+ | √ |
59 | | SEC (official) | Text | US stocks | √ | Date range | Official | N/A | 1,440+ | √ |
60 | 
61 | * Since we collect data from different stock markets, we have more filing documents than BloombergGPT.
62 | 
63 | ### 4. Trends
64 | 
65 | | Platform | Data Type | Related Market | Data Source | Specified Company | Range Type | Source Type | Limits |
66 | | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: |
67 | | [Google Trends](https://trends.google.com/trends/explore) | Index | US stocks | [Google Trends](./finnlp/data_sources/trends/google.py) | √ | Date range | Official | N/A |
68 | | [Baidu Index](https://index.baidu.com/v2/index.html#/) | Index | CN stocks | Coming soon | - | - | - | - |
69 | 
70 | 
71 | ### 5. Datasets
72 | | Data Source | Type | Stocks | Dates | Available |
73 | | :---: | :---: | :---: | :---: | :---: |
74 | | [AShare](https://github.com/JinanZou/Astock) | News | 3680 | 2018-07-01 to 2021-11-30 | √ |
75 | | [stocknet-dataset](https://github.com/yumoxu/stocknet-dataset) | Tweets | 87 | 2014-01-02 to 2015-12-30 | √ |
76 | | [CHRNN](https://github.com/wuhuizhe/CHRNN) | Tweets | 38 | 2017-01-03 to 2017-12-28 | √ |
77 | 
78 | ## Ⅲ. Models
79 | 
80 | ![image-20230505200618504](https://cdn.jsdelivr.net/gh/oliverwang15/imgbed@main/img/202305052006541.png)
81 | 
82 | * In data-centric NLP, we do not need to train models from scratch; we only need to call APIs and do lightweight fine-tuning.
83 | * On the left are some LLM APIs that we may use, in the middle are the models we may fine-tune, and on the right are some fine-tuning methods.
84 | 
85 | ### 1. Fine-tuning: Tensor Layers (LoRA)
86 | 
87 | ![image-20230505200944411](https://cdn.jsdelivr.net/gh/oliverwang15/imgbed@main/img/202305052009480.png)
88 | 
89 | * In FinGPT, we fine-tune a pre-trained LLM on new financial datasets. High-quality labeled data is one of the most important keys to many successful LLMs, including ChatGPT.
90 | * However, such high-quality labeled data is usually expensive and time-consuming to obtain, and we may need help from financial experts.
91 | * If our goal is to use LLMs to analyze finance-related text and help with quantitative trading, why not let the market do the labeling for us?
92 | * So here we use the percentage change of the related stock price after each news item as the output label, apply thresholds to split the labels into three groups (positive, negative, and neutral), and use them as sentiment labels for the news (a minimal labeling sketch is given at the end of this document).
93 | * Correspondingly, in the prompt engineering part we also ask the model to answer with one of positive, negative, or neutral, so that we make full use of the pre-trained information.
94 | * By using LoRA, the number of trainable parameters is reduced from 6.17B to 3.67M.
95 | * As shown in the table, FinGPT achieves large improvements over ChatGLM on several metrics. However, using the model directly for quantitative trading may still be inappropriate: since most news titles are neutral, most of the raw LLM outputs are neutral, so the LLM performs poorly on the positive and negative labels, which are the ones useful for quantitative trading.
96 | * After fine-tuning, however, we see a large improvement in predicting the positive and negative labels.
97 | * That is also why the model can achieve positive trading results.
98 | 
99 | ### 2. Fine-tuning: Reinforcement Learning on Stock Prices (RLSP)
100 | 
101 | ![image-20230505201209946](https://cdn.jsdelivr.net/gh/oliverwang15/imgbed@main/img/202305052012996.png)
102 | 
103 | * Similarly, we can use Reinforcement Learning on Stock Prices (RLSP) in place of the Reinforcement Learning from Human Feedback used by ChatGPT.
104 | 
105 | ## Ⅳ. Applications
106 | 
107 | ### 1. Robo-advisor
108 | 
109 | ![image-20230505201913233](https://cdn.jsdelivr.net/gh/oliverwang15/imgbed@main/img/202305052019296.png)
110 | 
111 | * **ChatGPT can give investment advice like a professional.**
112 | * In this example, the **rise of Apple's stock price** matches ChatGPT's **prediction based on the news**.
113 | 
114 | ### 2. Quantitative Trading
115 | 
116 | ![image-20230505201841001](https://cdn.jsdelivr.net/gh/oliverwang15/imgbed@main/img/202305052018035.png)
117 | 
118 | * We can also use news, social media posts, or company announcements to **build sentiment factors**. The right part shows the trading results driven by Twitter tweets and ChatGPT signals, on data from the [stocknet-dataset](https://link.zhihu.com/?target=https%3A//github.com/yumoxu/stocknet-dataset).
119 | * As you can see from the picture, the trading signals generated by ChatGPT are **excellent**, and we can even **obtain good results by trading on the Twitter sentiment factor alone**.
120 | * So we may obtain even better results by **combining it with price factors**.
121 | 
122 | ### 3. Low-code Development
123 | 
124 | ![image-20230505202028292](https://cdn.jsdelivr.net/gh/oliverwang15/imgbed@main/img/202305052020363.png)
125 | 
126 | * We can write code with the help of LLMs.
127 | * The right side shows how we can develop our factors and other code **quickly and efficiently**.
--------------------------------------------------------------------------------
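Returning to the market-labeled fine-tuning data described in Section Ⅲ.1 above: the sketch below shows one way to turn forward price changes into positive/neutral/negative labels. The `forward_return` column name, the ±2% threshold, and the toy rows are illustrative assumptions, not the exact FinGPT pipeline.

```python
import pandas as pd

def label_news_by_return(news: pd.DataFrame, threshold: float = 0.02) -> pd.DataFrame:
    """Attach positive/neutral/negative labels based on the related stock's forward return.

    Assumes `news` has a `forward_return` column: the percentage change of the
    related stock's price over the window following each news item.
    """
    def to_label(r: float) -> str:
        if r > threshold:
            return "positive"
        elif r < -threshold:
            return "negative"
        return "neutral"

    news = news.copy()
    news["label"] = news["forward_return"].apply(to_label)
    return news

# Illustrative usage with made-up numbers.
sample = pd.DataFrame({
    "title": ["Earnings beat expectations", "Routine filing", "Guidance cut"],
    "forward_return": [0.034, 0.001, -0.051],
})
print(label_news_by_return(sample)[["title", "label"]])
```

The same three labels can then be requested from the model at inference time, as described in the prompt engineering bullet of Section Ⅲ.1.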