├── finnlp ├── data_sources │ ├── __init__.py │ ├── news │ │ ├── __init__.py │ │ ├── _base.py │ │ ├── fmp_streaming.py │ │ ├── akshare_cctv.py │ │ ├── tushare_major_news.py │ │ ├── tipranks_streaming.py │ │ ├── yicai_streaming.py │ │ ├── cnbc_streaming.py │ │ ├── reuters_streaming.py │ │ ├── gurufocus_streaming.py │ │ ├── alliancenews_streaming.py │ │ ├── marketwatch_date_range.py │ │ ├── investorplace_streaming.py │ │ ├── eastmoney_streaming.py │ │ ├── pennystocks_streaming.py │ │ ├── sina_finance_date_range.py │ │ ├── thefly_streaming.py │ │ ├── talkmarkets_streaming.py │ │ ├── seekingalpha_date_range.py │ │ └── marketwatch_streaming.py │ ├── trends │ │ ├── __init__.py │ │ ├── baidu.py │ │ ├── _base.py │ │ └── google.py │ ├── social_media │ │ ├── __init__.py │ │ ├── _base.py │ │ ├── eastmoney_streaming.py │ │ ├── stocktwits_streaming.py │ │ ├── xueqiu_streaming.py │ │ ├── finnhub_sentiment.py │ │ ├── weibo_streaming.py │ │ ├── reddit_streaming.py │ │ ├── twitter_date_range.py │ │ ├── facebook_streaming.py │ │ └── weibo_date_range.py │ ├── company_announcement │ │ ├── __init__.py │ │ ├── _base.py │ │ ├── sina.py │ │ ├── juchao.py │ │ └── sec.py │ ├── sec_filings │ │ ├── prepline_sec_filings │ │ │ ├── api │ │ │ │ ├── __init__.py │ │ │ │ └── app.py │ │ │ └── sections.py │ │ ├── __init__.py │ │ ├── README.md │ │ └── main.py │ ├── datasets │ │ ├── __init__.py │ │ └── load_dataset.py │ ├── earning_calls │ │ ├── __init__.py │ │ ├── main.py │ │ └── utils.py │ └── _base.py ├── large_language_models │ ├── __init__.py │ ├── embeddings │ │ ├── bert.py │ │ ├── __init__.py │ │ └── finbert.py │ ├── openai │ │ ├── __init__.py │ │ ├── openai_chat_agent.py │ │ └── app4gpt_chat_agent.py │ └── sentiment │ │ ├── gpt3.py │ │ ├── paml.py │ │ └── __init__.py ├── benchmarks │ ├── tfns.py │ ├── fpb.py │ ├── nwgi.py │ └── fiqa.py ├── data_engineering │ └── data_cleaning.py └── utils │ └── get_proxy.py ├── docs └── FinNLP │ ├── site │ ├── assets │ │ ├── javascripts │ │ │ └── lunr │ │ │ │ └── min │ │ │ │ ├── lunr.jp.min.js │ │ │ │ ├── lunr.vi.min.js │ │ │ │ ├── lunr.multi.min.js │ │ │ │ ├── lunr.th.min.js │ │ │ │ ├── lunr.ta.min.js │ │ │ │ ├── lunr.zh.min.js │ │ │ │ ├── lunr.ja.min.js │ │ │ │ ├── lunr.hi.min.js │ │ │ │ ├── lunr.stemmer.support.min.js │ │ │ │ ├── lunr.ko.min.js │ │ │ │ ├── lunr.sv.min.js │ │ │ │ ├── lunr.da.min.js │ │ │ │ ├── lunr.no.min.js │ │ │ │ ├── lunr.nl.min.js │ │ │ │ ├── lunr.de.min.js │ │ │ │ └── lunr.du.min.js │ │ ├── images │ │ │ └── favicon.png │ │ └── stylesheets │ │ │ └── palette.a0c5b2b5.min.css.map │ ├── sitemap.xml.gz │ └── sitemap.xml │ ├── mkdocs.yml │ └── docs │ └── zh │ └── index.md ├── requirements.txt ├── .gitignore ├── .gitmodules ├── demo └── README.md ├── LICENSE ├── setup.py ├── markdowns └── codes.md └── test └── en /finnlp/data_sources/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /finnlp/data_sources/news/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /finnlp/data_sources/trends/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /finnlp/data_sources/trends/baidu.py: -------------------------------------------------------------------------------- 1 | 
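The tree above defines the package layout: raw-data collectors live under `finnlp/data_sources` (news, social_media, trends, company_announcement, sec_filings, earning_calls, datasets), model-related code under `finnlp/large_language_models`, and evaluation scripts under `finnlp/benchmarks`. As a rough sketch (assuming the package has been installed from this repo, e.g. with `pip install -e .`, and that extra third-party dependencies such as `pytrends` and `lxml` are available), typical imports follow these module paths directly:

```python
# Import paths implied by the directory layout above; the class names are the
# ones defined in the modules shown later in this listing.
from finnlp.data_sources.news.eastmoney_streaming import Eastmoney_Streaming
from finnlp.data_sources.social_media.weibo_streaming import Weibo_Streaming
from finnlp.data_sources.trends.google import Google_Trends        # needs pytrends
from finnlp.data_sources.earning_calls import EarningCallTranscripts
from finnlp.data_sources.sec_filings import SECFilingsLoader
```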
-------------------------------------------------------------------------------- /finnlp/large_language_models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /finnlp/data_sources/social_media/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /finnlp/large_language_models/embeddings/bert.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /finnlp/large_language_models/openai/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /finnlp/large_language_models/sentiment/gpt3.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /finnlp/large_language_models/sentiment/paml.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /finnlp/data_sources/company_announcement/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /finnlp/large_language_models/embeddings/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /finnlp/large_language_models/embeddings/finbert.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /finnlp/large_language_models/sentiment/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /finnlp/data_sources/sec_filings/prepline_sec_filings/api/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /finnlp/data_sources/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from load_dataset import load_dataset -------------------------------------------------------------------------------- /docs/FinNLP/site/assets/javascripts/lunr/min/lunr.jp.min.js: -------------------------------------------------------------------------------- 1 | module.exports=require("./lunr.ja"); -------------------------------------------------------------------------------- /finnlp/data_sources/earning_calls/__init__.py: -------------------------------------------------------------------------------- 1 | from finnlp.data_sources.earning_calls.main import EarningCallTranscripts 2 | -------------------------------------------------------------------------------- /docs/FinNLP/site/sitemap.xml.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AI4Finance-Foundation/FinNLP/HEAD/docs/FinNLP/site/sitemap.xml.gz 
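Since `finnlp/data_sources/earning_calls/__init__.py` above re-exports `EarningCallTranscripts`, the loader can be used directly from the subpackage. A minimal usage sketch follows; the ticker, year and quarter are illustrative, and `load_data()` makes a network request to the transcript API used in `earning_calls/main.py`:

```python
from finnlp.data_sources.earning_calls import EarningCallTranscripts

# Illustrative values; any valid ticker and a past year/quarter should work.
loader = EarningCallTranscripts(year=2023, ticker="AAPL", quarter="Q3")
doc = loader.load_data()                      # {"text": ..., "metadata": {...}}
print(doc["metadata"]["ticker"], doc["metadata"]["date_time"])
print(doc["text"][:200])                      # start of the transcript text
```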
-------------------------------------------------------------------------------- /finnlp/data_sources/sec_filings/__init__.py: -------------------------------------------------------------------------------- 1 | from finnlp.data_sources.sec_filings.main import SECFilingsLoader 2 | 3 | 4 | 5 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | 2 | akshare 3 | tushare 4 | finnhub-python 5 | parsel 6 | requests 7 | pandas 8 | tqdm 9 | pytz 10 | -------------------------------------------------------------------------------- /docs/FinNLP/site/assets/images/favicon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AI4Finance-Foundation/FinNLP/HEAD/docs/FinNLP/site/assets/images/favicon.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /demo/chatgpt-trading/token_.py 2 | demo/chatgpt-trading/token_.py 3 | */token_.py 4 | *token_.py 5 | 6 | */__pycache__/* 7 | *__pycache__* -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "stocknet-dataset"] 2 | path = stocknet-dataset 3 | url = https://github.com/yumoxu/stocknet-dataset.git 4 | [submodule "Astock"] 5 | path = Astock 6 | url = https://github.com/JinanZou/Astock.git 7 | -------------------------------------------------------------------------------- /finnlp/data_sources/trends/_base.py: -------------------------------------------------------------------------------- 1 | class Trend_Downloader: 2 | 3 | def __init__(self, args = {}): 4 | pass 5 | 6 | def download(self, start_date, end_date, stock = "all"): 7 | pass 8 | 9 | def clean_data(self): 10 | pass 11 | 12 | def gather_one_day(self,date,stock = "all",delay = 0.1): 13 | pass 14 | 15 | def transfer_standard_date_to_nonstandard(self,date): 16 | pass -------------------------------------------------------------------------------- /finnlp/data_sources/social_media/_base.py: -------------------------------------------------------------------------------- 1 | from finnlp.data_sources._base import FinNLP_Downloader 2 | 3 | class Social_Media_Downloader(FinNLP_Downloader): 4 | 5 | def __init__(self, args = {}): 6 | super().__init__(args) 7 | pass 8 | 9 | def download(self, start_date, end_date, stock = "all"): 10 | pass 11 | 12 | def clean_data(self): 13 | pass 14 | 15 | def gather_one_day_news(self,date,stock = "all",delay = 0.1): 16 | pass 17 | 18 | def transfer_standard_date_to_nonstandard(self,date): 19 | pass -------------------------------------------------------------------------------- /finnlp/data_sources/news/_base.py: -------------------------------------------------------------------------------- 1 | from finnlp.data_sources._base import FinNLP_Downloader 2 | 3 | class News_Downloader(FinNLP_Downloader): 4 | 5 | def __init__(self, args = {}): 6 | super().__init__(args) 7 | pass 8 | 9 | def download_date_range(self, start_date, end_date, stock = None): 10 | pass 11 | 12 | def download_streaming(self, stock = None): 13 | pass 14 | 15 | def clean_data(self): 16 | pass 17 | 18 | def _gather_one_part(self, date, stock = None, delay = 0.1): 19 | pass 20 | 21 | def _gather_content(self): 22 | pass 23 | 
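The `_base.py` modules above only define the downloader interfaces; concrete sources subclass them and fill in the download methods. Below is a minimal sketch of the pattern, mirroring the streaming news downloaders later in this listing; the class name, URL, query parameters and `results` key are placeholders, not a real source:

```python
import time
import requests
import pandas as pd
from finnlp.data_sources.news._base import News_Downloader

class Example_Streaming(News_Downloader):
    def __init__(self, args={}):
        super().__init__(args)
        self.dataframe = pd.DataFrame()

    def download_streaming_search(self, keyword="apple", rounds=3, delay=0.5):
        for page in range(rounds):
            # Hypothetical endpoint, shown only to illustrate the interface.
            res = requests.get("https://example.com/api/news",
                               params={"q": keyword, "page": page})
            if res.status_code != 200:
                break
            tmp = pd.DataFrame(res.json().get("results", []))
            self.dataframe = pd.concat([self.dataframe, tmp])
            time.sleep(delay)
        self.dataframe = self.dataframe.reset_index(drop=True)
```

Results are accumulated in `self.dataframe`, which is the convention most concrete downloaders in this repository follow.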
-------------------------------------------------------------------------------- /finnlp/data_sources/company_announcement/_base.py: -------------------------------------------------------------------------------- 1 | from finnlp.data_sources._base import FinNLP_Downloader 2 | 3 | class Company_Announcement_Downloader(FinNLP_Downloader): 4 | 5 | def __init__(self, args = {}): 6 | super().__init__(args) 7 | pass 8 | 9 | def download_date_range_all(self, start_date, end_date): 10 | pass 11 | 12 | def download_date_range_stock(self, start_date, end_date, stock = "AAPL"): 13 | pass 14 | 15 | def download_streaming_all(self, rounds = 3): 16 | pass 17 | 18 | def download_streaming_stock(self, stock = None, rounds = 3): 19 | pass 20 | 21 | def clean_data(self): 22 | pass -------------------------------------------------------------------------------- /finnlp/data_sources/trends/google.py: -------------------------------------------------------------------------------- 1 | from pytrends.request import TrendReq 2 | import pandas as pd 3 | 4 | class Google_Trends: 5 | def __init__(self,args = {}): 6 | # https://github.com/GeneralMills/pytrends 7 | self.pytrends = TrendReq(hl='en-US', tz=360) 8 | 9 | def download(self, start_date, end_date, stock = 'apple' ): 10 | self.date_list = pd.date_range(start_date,end_date) 11 | timeframe = [f"{start_date} {end_date}"] 12 | kw_list = [stock] 13 | self.pytrends.build_payload(kw_list=kw_list, timeframe=timeframe) 14 | res = self.pytrends.interest_over_time() 15 | # res.columns = ["date","value"] 16 | return res 17 | -------------------------------------------------------------------------------- /docs/FinNLP/site/assets/javascripts/lunr/min/lunr.vi.min.js: -------------------------------------------------------------------------------- 1 | !function(e,r){"function"==typeof define&&define.amd?define(r):"object"==typeof exports?module.exports=r():r()(e.lunr)}(this,function(){return function(e){if(void 0===e)throw new Error("Lunr is not present. Please include / require Lunr before this script.");if(void 0===e.stemmerSupport)throw new Error("Lunr stemmer support is not present. 
Please include / require Lunr stemmer support before this script.");e.vi=function(){this.pipeline.reset(),this.pipeline.add(e.vi.stopWordFilter,e.vi.trimmer)},e.vi.wordCharacters="[A-Za-ẓ̀͐́͑̉̃̓ÂâÊêÔôĂ-ăĐ-đƠ-ơƯ-ư]",e.vi.trimmer=e.trimmerSupport.generateTrimmer(e.vi.wordCharacters),e.Pipeline.registerFunction(e.vi.trimmer,"trimmer-vi"),e.vi.stopWordFilter=e.generateStopWordFilter("là cái nhưng mà".split(" "))}}); -------------------------------------------------------------------------------- /docs/FinNLP/site/assets/javascripts/lunr/min/lunr.multi.min.js: -------------------------------------------------------------------------------- 1 | !function(e,t){"function"==typeof define&&define.amd?define(t):"object"==typeof exports?module.exports=t():t()(e.lunr)}(this,function(){return function(e){e.multiLanguage=function(){for(var t=Array.prototype.slice.call(arguments),i=t.join("-"),r="",n=[],s=[],p=0;p 2 | 3 | 4 | None 5 | 2023-08-24 6 | daily 7 | 8 | 9 | None 10 | 2023-08-24 11 | daily 12 | 13 | 14 | None 15 | 2023-08-24 16 | daily 17 | 18 | 19 | None 20 | 2023-08-24 21 | daily 22 | 23 | 24 | None 25 | 2023-08-24 26 | daily 27 | 28 | -------------------------------------------------------------------------------- /finnlp/data_sources/news/fmp_streaming.py: -------------------------------------------------------------------------------- 1 | import json 2 | import requests 3 | import pandas as pd 4 | from tqdm.notebook import tqdm 5 | 6 | df = pd.read_csv("NAS.csv", index_col=0) 7 | stock_list = df.index.to_list() 8 | 9 | api_key = YOUR_API_KEY # You may find your api key here https://site.financialmodelingprep.com/developer/docs/api-keys 10 | 11 | all = pd.DataFrame() 12 | for stock in tqdm(stock_list): 13 | for page in tqdm(range(500)): 14 | url = f"https://financialmodelingprep.com/api/v3/stock_news?tickers={stock}&page={page+1}&apikey={api_key}" 15 | res = requests.get(url) 16 | res = json.loads(res.text) 17 | if len(res) == 0: 18 | break 19 | else: 20 | res = pd.DataFrame(res) 21 | all = pd.concat([all, res]) 22 | 23 | all = all.reset_index(drop=True) 24 | all.to_csv("dataset_more.csv") -------------------------------------------------------------------------------- /finnlp/data_sources/news/akshare_cctv.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import akshare as ak 3 | from tqdm.notebook import tqdm 4 | from finnlp.data_sources.news._base import News_Downloader 5 | 6 | 7 | class Akshare_cctv(News_Downloader): 8 | 9 | def __init__(self, args={}): 10 | pass 11 | 12 | def download_news(self, start_date, end_date, stock="all"): 13 | self.date_list = pd.date_range(start_date, end_date) 14 | res = pd.DataFrame() 15 | for date in tqdm(self.date_list): 16 | tmp = self.gather_one_day_news(date) 17 | res = pd.concat([res, tmp]) 18 | self.dataframe = res 19 | 20 | def clean_data(self): 21 | pass 22 | 23 | def gather_one_day_news(self, date, stock="all", delay=0.1): 24 | date = self.transfer_standard_date_to_nonstandard(date) 25 | res = ak.news_cctv(date=date) 26 | return res 27 | 28 | def transfer_standard_date_to_nonstandard(self, date): 29 | return date.strftime("%Y%m%d") -------------------------------------------------------------------------------- /docs/FinNLP/site/assets/javascripts/lunr/min/lunr.th.min.js: -------------------------------------------------------------------------------- 1 | !function(e,r){"function"==typeof define&&define.amd?define(r):"object"==typeof 
exports?module.exports=r():r()(e.lunr)}(this,function(){return function(e){if(void 0===e)throw new Error("Lunr is not present. Please include / require Lunr before this script.");if(void 0===e.stemmerSupport)throw new Error("Lunr stemmer support is not present. Please include / require Lunr stemmer support before this script.");var r="2"==e.version[0];e.th=function(){this.pipeline.reset(),this.pipeline.add(e.th.trimmer),r?this.tokenizer=e.th.tokenizer:(e.tokenizer&&(e.tokenizer=e.th.tokenizer),this.tokenizerFn&&(this.tokenizerFn=e.th.tokenizer))},e.th.wordCharacters="[฀-๿]",e.th.trimmer=e.trimmerSupport.generateTrimmer(e.th.wordCharacters),e.Pipeline.registerFunction(e.th.trimmer,"trimmer-th");var t=e.wordcut;t.init(),e.th.tokenizer=function(i){if(!arguments.length||null==i||void 0==i)return[];if(Array.isArray(i))return i.map(function(t){return r?new e.Token(t):t});var n=i.toString().replace(/^\s+/,"");return t.cut(n).split("|")}}}); -------------------------------------------------------------------------------- /demo/README.md: -------------------------------------------------------------------------------- 1 | ## Demos: 2 | 3 | ### Ⅰ. ChatGPT Tradings 4 | 5 | 1. [Trade with ChatGPT](https://github.com/AI4Finance-Foundation/ChatGPT-for-FinTech/tree/master/demo/chatgpt-trading-v1) 6 | * Using the ChatGPT to give us trading suggestions. 7 | * On [Ashare (News)](https://github.com/JinanZou/Astock) and A share Market ( `Maotai (贵州茅台 600519)` ) 8 | ![image-20230220011335859](https://cdn.jsdelivr.net/gh/oliverwang15/imgbed@main/img/202302200113884.png) 9 | 2. [Trade like ChatGPT](https://github.com/AI4Finance-Foundation/ChatGPT-for-FinTech/tree/master/demo/chatgpt-trading-v2) 10 | * Using ChatGPT's language model, GPT-3 to create an FinRL agent that trades as smartly as ChatGPT 11 | * On [stocknet-dataset (Tweets)](https://github.com/yumoxu/stocknet-dataset) and US Stocks Market (`AAPL`) 12 | ![image-20230216004801458](https://cdn.jsdelivr.net/gh/oliverwang15/imgbed@main/img/202302181558796.png) 13 | ### Ⅱ. Sentiment Classify 14 | 15 | 1. [Shares News Sentiment Classify.](https://github.com/AI4Finance-Foundation/ChatGPT-for-FinTech/blob/master/demo/shares_news_sentiment_classify.py) -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 AI4Finance Foundation Inc. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /finnlp/data_sources/sec_filings/README.md: -------------------------------------------------------------------------------- 1 | # SEC DATA DOWNLOADER 2 | 3 | Please checkout this repo that I am building on SEC Question Answering Agent [SEC-QA](https://github.com/Athe-kunal/SEC-QA-Agent) 4 | 5 | This repository downloads all the texts from SEC documents (10-K and 10-Q). Currently, it is not supporting documents that are amended, but that will be added in the near futures. 6 | 7 | Install the required dependencies 8 | 9 | ``` 10 | python install -r requirements.txt 11 | ``` 12 | 13 | The SEC Downloader expects 5 attributes 14 | 15 | * tickers: It is a list of valid tickers 16 | * amount: Number of documents that you want to download 17 | * filing_type: 10-K or 10-Q filing type 18 | * num_workers: It is for multithreading and multiprocessing. We have multi-threading at the ticker level and multi-processing at the year level for a given ticker 19 | * include_amends: To include amendments or not. 20 | 21 | 22 | ## REFERENCES 23 | 1. Unstructured SEC Filings API: [repo link](https://github.com/Unstructured-IO/pipeline-sec-filings/tree/main) 24 | 2. SEC Edgar Downloader: [repo link](https://github.com/jadchaar/sec-edgar-downloader) 25 | 26 | -------------------------------------------------------------------------------- /finnlp/data_sources/datasets/load_dataset.py: -------------------------------------------------------------------------------- 1 | import datasets 2 | import pandas as pd 3 | from tqdm.notebook import tqdm 4 | import json 5 | import os 6 | 7 | def load_dataset(dataset_name, **kwargs): 8 | if dataset_name == "Stocknet": 9 | root_path = r"../../../stocknet-dataset/tweet/raw" 10 | stock_lists = os.listdir(root_path) 11 | all = pd.DataFrame() 12 | for stock in tqdm(stock_lists, desc="Loading Stocknet dataset..."): 13 | stock_path = os.path.join(root_path, stock) 14 | date_files = os.listdir(stock_path) 15 | for date in date_files: 16 | with open(os.path.join(stock_path, date_files[0])) as f: 17 | json_list = f.readlines() 18 | tmp_json = [] 19 | for json_str in json_list: 20 | tmp_json.append(json.loads(json_str)) 21 | tmp_json = pd.DataFrame(tmp_json) 22 | all = pd.concat([all, tmp_json], axis=0) 23 | all = all.reset_index(drop=True) 24 | all = datasets.Dataset.from_pandas(all) 25 | return all 26 | 27 | else: 28 | raise NotImplementedError("Only support Stocknet dataset for now") 29 | 30 | -------------------------------------------------------------------------------- /finnlp/data_sources/news/tushare_major_news.py: -------------------------------------------------------------------------------- 1 | import tushare as ts 2 | import pandas as pd 3 | from tqdm.notebook import tqdm 4 | from finnlp.data_sources.news._base import News_Downloader 5 | import time 6 | 7 | class Tushare_Major_News(News_Downloader): 8 | 9 | def __init__(self, args = {}): 10 | token = args["token"] if "token" in args.keys() else "27080ec403c0218f96f388bca1b1d85329d563c91a43672239619ef5" 11 | ts.set_token(token) 12 | self.pro = ts.pro_api() 13 | 14 | def download_news(self, start_date, end_date, stock = "all"): 15 | self.date_list = 
pd.date_range(start_date,end_date) 16 | res = pd.DataFrame() 17 | for date in tqdm(self.date_list): 18 | tmp = self.gather_one_day_news(date) 19 | res = pd.concat([res,tmp]) 20 | self.dataframe = res 21 | 22 | def gather_one_day_news(self,date,stock = "all",delay = 0.1): 23 | date = self.transfer_standard_date_to_nonstandard(date) 24 | res = self.pro.major_news(start_date = date,end_date = date) 25 | time.sleep(delay) 26 | return res 27 | 28 | def clean_data(self): 29 | pass 30 | 31 | def transfer_standard_date_to_nonstandard(self,date): 32 | return date.strftime("%Y-%m0%d 00:00:00") -------------------------------------------------------------------------------- /finnlp/data_sources/news/tipranks_streaming.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from lxml import etree 3 | from tqdm import tqdm 4 | import pandas as pd 5 | import json 6 | import time 7 | from finnlp.data_sources.news._base import News_Downloader 8 | 9 | # TODO: 10 | # 1. Contents 11 | 12 | class TipRanks_Streaming(News_Downloader): 13 | 14 | def __init__(self, args={}): 15 | super().__init__(args) 16 | self.dataframe = pd.DataFrame() 17 | 18 | def download_streaming_search(self, keyword = "apple", rounds = 10000, delay = 0.5): 19 | url = "https://www.tipranks.com/api/news/posts" 20 | headers = { 21 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36', 22 | } 23 | print("Downloading:", end = " ") 24 | for r in range(rounds): 25 | params = { 26 | 'page': r, 27 | 'per_page': '50', 28 | 'search': keyword, 29 | } 30 | res = requests.get(url = url, headers= headers, params=params) 31 | if res.status_code != 200: 32 | break 33 | try: 34 | res = json.loads(res.text) 35 | tmp = pd.DataFrame(res['data']) 36 | self.dataframe = pd.concat([self.dataframe, tmp]) 37 | except: 38 | print(res.text) 39 | # sleep 40 | time.sleep(delay) 41 | print(r, end = " ") -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | # Read requirements.txt, ignore comments 4 | try: 5 | with open("requirements.txt", "r") as f: 6 | REQUIRES = [line.split('#', 1)[0].strip() for line in f if line.strip()] 7 | except: 8 | print("'requirements.txt' not found!") 9 | REQUIRES = list() 10 | 11 | setup( 12 | name="FinNLP", 13 | version="0.0.1", 14 | include_package_data=True, 15 | author="AI4Finance Foundation", 16 | author_email="contact@ai4finance.org", 17 | url="https://github.com/AI4Finance-Foundation/FinNLP", 18 | license="MIT", 19 | packages=find_packages(), 20 | install_requires=REQUIRES, 21 | description="FinNLP", 22 | long_description="""FinNLP""", 23 | classifiers=[ 24 | # Trove classifiers 25 | # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers 26 | "License :: OSI Approved :: MIT License", 27 | "Programming Language :: Python", 28 | "Programming Language :: Python :: 3", 29 | "Programming Language :: Python :: 3.6", 30 | "Programming Language :: Python :: 3.7", 31 | "Programming Language :: Python :: 3.8", 32 | "Programming Language :: Python :: 3.9", 33 | "Programming Language :: Python :: Implementation :: CPython", 34 | "Programming Language :: Python :: Implementation :: PyPy", 35 | ], 36 | keywords="Financial Large Language Models", 37 | platforms=["any"], 38 | python_requires=">=3.6", 39 | ) 40 | 
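`setup.py` above reads `requirements.txt` and packages the repo, so after an editable install (`pip install -e .`) the downloaders can be used directly. For example, a short usage sketch for the `TipRanks_Streaming` class defined above; the keyword, round count and delay are illustrative, and the endpoint is a public API that may throttle or change:

```python
from finnlp.data_sources.news.tipranks_streaming import TipRanks_Streaming

downloader = TipRanks_Streaming()
downloader.download_streaming_search(keyword="apple", rounds=5, delay=0.5)
news = downloader.dataframe.reset_index(drop=True)   # results accumulate here
print(news.shape)
print(news.head())
```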
-------------------------------------------------------------------------------- /finnlp/data_sources/earning_calls/main.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from typing import List 3 | 4 | try: 5 | from finnlp.data_sources.earning_calls.utils import get_earning_transcripts 6 | except ImportError: 7 | from utils import get_earning_transcripts 8 | 9 | 10 | class EarningCallTranscripts(): 11 | def __init__(self, year: int, ticker: str, quarter: str): 12 | """Get the earning call transcripts for a given company, in a given year and quarter 13 | 14 | Args: 15 | year (int): Year of the transcript 16 | ticker (str): ticker symbol of the stock 17 | quarter (str): quarter 18 | """ 19 | curr_year = datetime.now().year 20 | assert year <= curr_year, "The year should be less than current year" 21 | 22 | assert quarter in [ 23 | "Q1", 24 | "Q2", 25 | "Q3", 26 | "Q4", 27 | ], 'The quarter should from the list ["Q1","Q2","Q3","Q4"]' 28 | self.year = year 29 | self.ticker = ticker 30 | self.quarter = quarter 31 | 32 | def load_data(self): 33 | resp_dict, speakers_list = get_earning_transcripts( 34 | self.quarter, self.ticker, self.year 35 | ) 36 | return { 37 | "text":resp_dict["content"], 38 | "metadata":{ 39 | "ticker": resp_dict["symbol"], 40 | "quarter": "Q" + str(resp_dict["quarter"]), 41 | "date_time": resp_dict["date"], 42 | "speakers_list": speakers_list, 43 | }, 44 | } 45 | 46 | -------------------------------------------------------------------------------- /finnlp/data_sources/sec_filings/prepline_sec_filings/api/app.py: -------------------------------------------------------------------------------- 1 | ##################################################################### 2 | # THIS FILE IS AUTOMATICALLY GENERATED BY UNSTRUCTURED API TOOLS. 
3 | # DO NOT MODIFY DIRECTLY 4 | ##################################################################### 5 | 6 | 7 | import logging 8 | import os 9 | 10 | from fastapi import FastAPI, Request, status 11 | 12 | from .section import router as section_router 13 | 14 | app = FastAPI( 15 | title="Unstructured Pipeline API", 16 | description="""""", 17 | version="1.0.0", 18 | docs_url="/sec-filings/docs", 19 | openapi_url="/sec-filings/openapi.json", 20 | ) 21 | 22 | allowed_origins = os.environ.get("ALLOWED_ORIGINS", None) 23 | if allowed_origins: 24 | from fastapi.middleware.cors import CORSMiddleware 25 | 26 | app.add_middleware( 27 | CORSMiddleware, 28 | allow_origins=allowed_origins.split(","), 29 | allow_methods=["OPTIONS", "POST"], 30 | allow_headers=["Content-Type"], 31 | ) 32 | 33 | app.include_router(section_router) 34 | 35 | 36 | # Filter out /healthcheck noise 37 | class HealthCheckFilter(logging.Filter): 38 | def filter(self, record: logging.LogRecord) -> bool: 39 | return record.getMessage().find("/healthcheck") == -1 40 | 41 | 42 | logging.getLogger("uvicorn.access").addFilter(HealthCheckFilter()) 43 | 44 | 45 | @app.get("/healthcheck", status_code=status.HTTP_200_OK, include_in_schema=False) 46 | def healthcheck(request: Request): 47 | return {"healthcheck": "HEALTHCHECK STATUS: EVERYTHING OK!"} 48 | -------------------------------------------------------------------------------- /finnlp/data_sources/news/yicai_streaming.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | warnings.filterwarnings("ignore") 3 | import requests 4 | from lxml import etree 5 | from tqdm import tqdm 6 | import pandas as pd 7 | import json 8 | import time 9 | from finnlp.data_sources.news._base import News_Downloader 10 | 11 | # TODO: 12 | # 1. Contents 13 | 14 | class Yicai_Streaming(News_Downloader): 15 | 16 | def __init__(self, args={}): 17 | super().__init__(args) 18 | self.dataframe = pd.DataFrame() 19 | 20 | def download_streaming_search(self, keyword = "茅台", rounds = 3, delay = 0.5): 21 | url = "https://www.yicai.com/api/ajax/getSearchResult" 22 | 23 | headers = { 24 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36', 25 | 'Referer':'https://www.yicai.com/search?keys=%E8%8C%85%E5%8F%B0', 26 | 'X-Requested-With': 'XMLHttpRequest', 27 | } 28 | 29 | print("Downloading ...", end = ' ') 30 | for page in range(rounds): 31 | params = { 32 | 'page': page, 33 | 'pagesize': '20', 34 | 'keys': keyword, 35 | 'type': '0', 36 | } 37 | res = requests.get(url = url, headers = headers, params = params) 38 | if res.status_code != 200: 39 | break 40 | res = json.loads(res.text) 41 | res = res['results'] 42 | tmp = pd.DataFrame(res["docs"]) 43 | self.dataframe = pd.concat([self.dataframe, tmp]) 44 | 45 | print(page, end = ' ') 46 | 47 | time.sleep(delay) -------------------------------------------------------------------------------- /finnlp/data_sources/social_media/eastmoney_streaming.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | warnings.filterwarnings("ignore") 3 | import requests 4 | from lxml import etree 5 | from tqdm import tqdm 6 | import pandas as pd 7 | import json 8 | import time 9 | from finnlp.data_sources.social_media._base import Social_Media_Downloader 10 | 11 | # TODO: 12 | # 1. 
Contents 13 | 14 | class Eastmoney_Streaming(Social_Media_Downloader): 15 | def __init__(self, args = {}): 16 | super().__init__(args) 17 | self.dataframe = pd.DataFrame() 18 | 19 | def download_streaming_stock(self, keyword = "600519", rounds = 3, delay = 0.5): 20 | headers = { 21 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36', 22 | } 23 | print('Downloading ...', end =' ') 24 | for page in range(rounds): 25 | url = f"https://guba.eastmoney.com/list,{keyword}_{page+1}.html" 26 | res = requests.get(url=url, headers=headers) 27 | if res.status_code != 200: 28 | break 29 | 30 | res = etree.HTML(res.text) 31 | res = res.xpath("//script")[3].xpath("text()")[0] 32 | article_list, other_list = res.split('var article_list=')[1].strip(";").split('; var other_list=') 33 | article_list = json.loads(article_list) 34 | tmp = pd.DataFrame(article_list['re']) 35 | self.dataframe = pd.concat([self.dataframe, tmp]) 36 | 37 | print(page, end =' ') 38 | time.sleep(delay) 39 | 40 | self.dataframe = self.dataframe.reset_index(drop= True) 41 | 42 | 43 | -------------------------------------------------------------------------------- /docs/FinNLP/site/assets/javascripts/lunr/min/lunr.ta.min.js: -------------------------------------------------------------------------------- 1 | !function(e,t){"function"==typeof define&&define.amd?define(t):"object"==typeof exports?module.exports=t():t()(e.lunr)}(this,function(){return function(e){if(void 0===e)throw new Error("Lunr is not present. Please include / require Lunr before this script.");if(void 0===e.stemmerSupport)throw new Error("Lunr stemmer support is not present. Please include / require Lunr stemmer support before this script.");e.ta=function(){this.pipeline.reset(),this.pipeline.add(e.ta.trimmer,e.ta.stopWordFilter,e.ta.stemmer),this.searchPipeline&&(this.searchPipeline.reset(),this.searchPipeline.add(e.ta.stemmer))},e.ta.wordCharacters="஀-உஊ-ஏஐ-ஙச-ட஠-னப-யர-ஹ஺-ிீ-௉ொ-௏ௐ-௙௚-௟௠-௩௪-௯௰-௹௺-௿a-zA-Za-zA-Z0-90-9",e.ta.trimmer=e.trimmerSupport.generateTrimmer(e.ta.wordCharacters),e.Pipeline.registerFunction(e.ta.trimmer,"trimmer-ta"),e.ta.stopWordFilter=e.generateStopWordFilter("அங்கு அங்கே அது அதை அந்த அவர் அவர்கள் அவள் அவன் அவை ஆக ஆகவே ஆகையால் ஆதலால் ஆதலினால் ஆனாலும் ஆனால் இங்கு இங்கே இது இதை இந்த இப்படி இவர் இவர்கள் இவள் இவன் இவை இவ்வளவு உனக்கு உனது உன் உன்னால் எங்கு எங்கே எது எதை எந்த எப்படி எவர் எவர்கள் எவள் எவன் எவை எவ்வளவு எனக்கு எனது எனவே என் என்ன என்னால் ஏது ஏன் தனது தன்னால் தானே தான் நாங்கள் நாம் நான் நீ நீங்கள்".split(" ")),e.ta.stemmer=function(){return function(e){return"function"==typeof e.update?e.update(function(e){return e}):e}}();var t=e.wordcut;t.init(),e.ta.tokenizer=function(r){if(!arguments.length||null==r||void 0==r)return[];if(Array.isArray(r))return r.map(function(t){return isLunr2?new e.Token(t.toLowerCase()):t.toLowerCase()});var i=r.toString().toLowerCase().replace(/^\s+/,"");return t.cut(i).split("|")},e.Pipeline.registerFunction(e.ta.stemmer,"stemmer-ta"),e.Pipeline.registerFunction(e.ta.stopWordFilter,"stopWordFilter-ta")}}); -------------------------------------------------------------------------------- /finnlp/data_sources/earning_calls/utils.py: -------------------------------------------------------------------------------- 1 | from tenacity import retry, stop_after_attempt, wait_random_exponential 2 | import requests 3 | import json 4 | from datetime import datetime 5 | import re 6 | from typing import List 7 | 8 | 9 | def correct_date(yr, dt): 10 | 
"""Some transcripts have incorrect date, correcting it 11 | 12 | Args: 13 | yr (int): actual 14 | dt (datetime): given date 15 | 16 | Returns: 17 | datetime: corrected date 18 | """ 19 | dt = datetime.strptime(dt, "%Y-%m-%d %H:%M:%S") 20 | if dt.year != yr: 21 | dt = dt.replace(year=yr) 22 | return dt.strftime("%Y-%m-%d %H:%M:%S") 23 | 24 | 25 | def extract_speakers(cont: str) -> List[str]: 26 | """Extract the list of speakers 27 | 28 | Args: 29 | cont (str): transcript content 30 | 31 | Returns: 32 | List[str]: list of speakers 33 | """ 34 | pattern = re.compile(r"\n(.*?):") 35 | matches = pattern.findall(cont) 36 | 37 | return list(set(matches)) 38 | 39 | 40 | @retry(wait=wait_random_exponential(min=1, max=5), stop=stop_after_attempt(2)) 41 | def get_earning_transcripts(quarter: str, ticker: str, year: int): 42 | """Get the earnings transcripts 43 | 44 | Args: 45 | quarter (str) 46 | ticker (str) 47 | year (int) 48 | """ 49 | response = requests.get( 50 | f"https://discountingcashflows.com/api/transcript/{ticker}/{quarter}/{year}/", 51 | auth=("user", "pass"), 52 | ) 53 | 54 | resp_text = json.loads(response.text) 55 | speakers_list = extract_speakers(resp_text[0]["content"]) 56 | corrected_date = correct_date(resp_text[0]["year"], resp_text[0]["date"]) 57 | resp_text[0]["date"] = corrected_date 58 | return resp_text[0], speakers_list 59 | -------------------------------------------------------------------------------- /docs/FinNLP/site/assets/javascripts/lunr/min/lunr.zh.min.js: -------------------------------------------------------------------------------- 1 | !function(e,r){"function"==typeof define&&define.amd?define(r):"object"==typeof exports?module.exports=r(require("@node-rs/jieba")):r()(e.lunr)}(this,function(e){return function(r,t){if(void 0===r)throw new Error("Lunr is not present. Please include / require Lunr before this script.");if(void 0===r.stemmerSupport)throw new Error("Lunr stemmer support is not present. 
Please include / require Lunr stemmer support before this script.");var i="2"==r.version[0];r.zh=function(){this.pipeline.reset(),this.pipeline.add(r.zh.trimmer,r.zh.stopWordFilter,r.zh.stemmer),i?this.tokenizer=r.zh.tokenizer:(r.tokenizer&&(r.tokenizer=r.zh.tokenizer),this.tokenizerFn&&(this.tokenizerFn=r.zh.tokenizer))},r.zh.tokenizer=function(n){if(!arguments.length||null==n||void 0==n)return[];if(Array.isArray(n))return n.map(function(e){return i?new r.Token(e.toLowerCase()):e.toLowerCase()});t&&e.load(t);var o=n.toString().trim().toLowerCase(),s=[];e.cut(o,!0).forEach(function(e){s=s.concat(e.split(" "))}),s=s.filter(function(e){return!!e});var u=0;return s.map(function(e,t){if(i){var n=o.indexOf(e,u),s={};return s.position=[n,e.length],s.index=t,u=n,new r.Token(e,s)}return e})},r.zh.wordCharacters="\\w一-龥",r.zh.trimmer=r.trimmerSupport.generateTrimmer(r.zh.wordCharacters),r.Pipeline.registerFunction(r.zh.trimmer,"trimmer-zh"),r.zh.stemmer=function(){return function(e){return e}}(),r.Pipeline.registerFunction(r.zh.stemmer,"stemmer-zh"),r.zh.stopWordFilter=r.generateStopWordFilter("的 一 不 在 人 有 是 为 以 于 上 他 而 后 之 来 及 了 因 下 可 到 由 这 与 也 此 但 并 个 其 已 无 小 我 们 起 最 再 今 去 好 只 又 或 很 亦 某 把 那 你 乃 它 吧 被 比 别 趁 当 从 到 得 打 凡 儿 尔 该 各 给 跟 和 何 还 即 几 既 看 据 距 靠 啦 了 另 么 每 们 嘛 拿 哪 那 您 凭 且 却 让 仍 啥 如 若 使 谁 虽 随 同 所 她 哇 嗡 往 哪 些 向 沿 哟 用 于 咱 则 怎 曾 至 致 着 诸 自".split(" ")),r.Pipeline.registerFunction(r.zh.stopWordFilter,"stopWordFilter-zh")}}); -------------------------------------------------------------------------------- /docs/FinNLP/mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: FinGPT & FinNLP 2 | site_author: Oliver Wang, Xiao-yang Liu 3 | 4 | nav: 5 | - Hello World: 6 | - About the project: 'index.md' 7 | 8 | - FinGPT Models: 9 | - FinGPT-v1: 'https://github.com/AI4Finance-Foundation/FinGPT/tree/master/fingpt/FinGPT-v1' 10 | - FinGPT-v2: 'https://github.com/AI4Finance-Foundation/FinGPT/tree/master/fingpt/FinGPT-v2' 11 | - FinGPT-v3: 'https://github.com/AI4Finance-Foundation/FinGPT/tree/master/fingpt/FinGPT-v3' 12 | 13 | - Robo Advisor: 14 | - chatgpt-robo-advisor-v1: 'https://github.com/AI4Finance-Foundation/FinGPT/tree/master/fingpt/chatgpt-robo-advisor-v1' 15 | - chatgpt-robo-advisor-v2: 'https://github.com/AI4Finance-Foundation/FinGPT/tree/master/fingpt/chatgpt-robo-advisor-v2' 16 | 17 | - Quantitative Trading: 18 | - chatgpt-trading-v1: 'https://github.com/AI4Finance-Foundation/FinGPT/tree/master/fingpt/chatgpt-trading-v1' 19 | - chatgpt-trading-v2: 'https://github.com/AI4Finance-Foundation/FinGPT/tree/master/fingpt/chatgpt-trading-v2' 20 | 21 | - Low code development: 22 | - chatgpt-low-code-development-v1: 'https://github.com/AI4Finance-Foundation/FinGPT/tree/master/fingpt/chatgpt-low-code-development-v1' 23 | - chatgpt-low-code-development-v2: 'https://github.com/AI4Finance-Foundation/FinGPT/tree/master/fingpt/chatgpt-low-code-development-v2' 24 | 25 | - Data Sources: 26 | - News: jupyter/Data_Sources_News.ipynb 27 | - Social Media: jupyter/Data_Sources_Social_Media.ipynb 28 | - Company Announcement: jupyter/Data_Sources_Company_Announcement.ipynb 29 | 30 | theme: 31 | name: material 32 | 33 | plugins: 34 | - mkdocs-jupyter: 35 | execute: false 36 | 37 | extra: 38 | alternate: 39 | - name: English 40 | link: / 41 | lang: en 42 | - name: 中文 43 | link: /zh/ 44 | lang: zh -------------------------------------------------------------------------------- /markdowns/codes.md: 
-------------------------------------------------------------------------------- 1 | # FinNLP 2 | 3 | ## Codes 4 | 5 | ### Data Sources 6 | 7 | #### News (Finnhub, Sina) 8 | 9 | ``` python 10 | class News_Downloader: 11 | 12 | def __init__(self, args = {}): 13 | pass 14 | 15 | def download_date_range_all(self, start_date, end_date): 16 | pass 17 | 18 | def download_date_range_stock(self, start_date, end_date, stock = "AAPL"): 19 | pass 20 | 21 | def download_streaming_all(self, rounds = 3): 22 | pass 23 | 24 | def download_streaming_stock(self, stock = None, rounds = 3): 25 | pass 26 | 27 | def clean_data(self): 28 | pass 29 | 30 | def gather_content(self, delay = 0.01): 31 | pass 32 | ``` 33 | 34 | 35 | 36 | #### Social Media (Twitter, Stocktwits, Reddit, Weibo) 37 | 38 | ``` python 39 | class Social_Media_Downloader: 40 | 41 | def __init__(self, args = {}): 42 | pass 43 | 44 | def download_date_range_all(self, start_date, end_date): 45 | pass 46 | 47 | def download_date_range_stock(self, start_date, end_date, stock = "AAPL"): 48 | pass 49 | 50 | def download_streaming_all(self, rounds = 3): 51 | pass 52 | 53 | def download_streaming_stock(self, stock = None, rounds = 3): 54 | pass 55 | 56 | def clean_data(self): 57 | pass 58 | ``` 59 | 60 | #### Company Announcement (Juchao, SEC) 61 | 62 | ``` python 63 | class company_announcement_Downloader: 64 | 65 | def __init__(self, args = {}): 66 | pass 67 | 68 | def download_date_range_all(self, start_date, end_date): 69 | pass 70 | 71 | def download_date_range_stock(self, start_date, end_date, stock = "AAPL"): 72 | pass 73 | 74 | def download_streaming_all(self, rounds = 3): 75 | pass 76 | 77 | def download_streaming_stock(self, stock = None, rounds = 3): 78 | pass 79 | 80 | def clean_data(self): 81 | pass 82 | ``` -------------------------------------------------------------------------------- /finnlp/data_sources/news/cnbc_streaming.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | warnings.filterwarnings("ignore") 3 | import requests 4 | from lxml import etree 5 | from tqdm import tqdm 6 | import pandas as pd 7 | import json 8 | import time 9 | from finnlp.data_sources.news._base import News_Downloader 10 | 11 | # TODO: 12 | # 1. 
Contents 13 | 14 | class CNBC_Streaming(News_Downloader): 15 | 16 | def __init__(self, args={}): 17 | super().__init__(args) 18 | self.dataframe = pd.DataFrame() 19 | 20 | def download_streaming_search(self, keyword = "apple", rounds = 3, delay = 0.5): 21 | url = "https://api.queryly.com/cnbc/json.aspx" 22 | headers = { 23 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36', 24 | 'Referer':'https://www.cnbc.com/', 25 | } 26 | print("Downloading ...", end = ' ') 27 | for page in range(rounds): 28 | params = { 29 | 'queryly_key': '31a35d40a9a64ab3', 30 | 'query': keyword, 31 | 'endindex': page * 10, 32 | 'batchsize': '10', 33 | 'callback': '', 34 | 'showfaceted': 'false', 35 | 'timezoneoffset': '-480', 36 | 'facetedfields': 'formats', 37 | 'facetedkey': 'formats|', 38 | 'facetedvalue': '!Press Release|', 39 | 'sort': 'date', 40 | 'additionalindexes': '4cd6f71fbf22424d,937d600b0d0d4e23,3bfbe40caee7443e,626fdfcd96444f28', 41 | } 42 | res = requests.get(url = url, headers = headers, params = params) 43 | if res.status_code != 200: 44 | break 45 | res = json.loads(res.text) 46 | tmp = pd.DataFrame(res['results']) 47 | self.dataframe = pd.concat([self.dataframe, tmp]) 48 | 49 | print(page, end = ' ') 50 | 51 | time.sleep(delay) 52 | -------------------------------------------------------------------------------- /docs/FinNLP/site/assets/javascripts/lunr/min/lunr.ja.min.js: -------------------------------------------------------------------------------- 1 | !function(e,r){"function"==typeof define&&define.amd?define(r):"object"==typeof exports?module.exports=r():r()(e.lunr)}(this,function(){return function(e){if(void 0===e)throw new Error("Lunr is not present. Please include / require Lunr before this script.");if(void 0===e.stemmerSupport)throw new Error("Lunr stemmer support is not present. 
Please include / require Lunr stemmer support before this script.");var r="2"==e.version[0];e.ja=function(){this.pipeline.reset(),this.pipeline.add(e.ja.trimmer,e.ja.stopWordFilter,e.ja.stemmer),r?this.tokenizer=e.ja.tokenizer:(e.tokenizer&&(e.tokenizer=e.ja.tokenizer),this.tokenizerFn&&(this.tokenizerFn=e.ja.tokenizer))};var t=new e.TinySegmenter;e.ja.tokenizer=function(i){var n,o,s,p,a,u,m,l,c,f;if(!arguments.length||null==i||void 0==i)return[];if(Array.isArray(i))return i.map(function(t){return r?new e.Token(t.toLowerCase()):t.toLowerCase()});for(o=i.toString().toLowerCase().replace(/^\s+/,""),n=o.length-1;n>=0;n--)if(/\S/.test(o.charAt(n))){o=o.substring(0,n+1);break}for(a=[],s=o.length,c=0,l=0;c<=s;c++)if(u=o.charAt(c),m=c-l,u.match(/\s/)||c==s){if(m>0)for(p=t.segment(o.slice(l,c)).filter(function(e){return!!e}),f=l,n=0;n0: 59 | titles.append(' '.join(title).replace("\n","").strip(" ")) 60 | times.append(' '.join(time_)) 61 | authors.append(' '.join(author)) 62 | 63 | # concat results 64 | tmp = pd.DataFrame([titles, times, authors]).T 65 | tmp.columns = ["title", "time", "author"] 66 | self.dataframe = pd.concat([self.dataframe, tmp]) 67 | 68 | # sleep 69 | time.sleep(delay) 70 | -------------------------------------------------------------------------------- /finnlp/data_sources/news/investorplace_streaming.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | warnings.filterwarnings("ignore") 3 | import requests 4 | from lxml import etree 5 | from tqdm import tqdm 6 | import pandas as pd 7 | import json 8 | import time 9 | from finnlp.data_sources.news._base import News_Downloader 10 | 11 | # TODO: 12 | # 1. Contents 13 | 14 | class InvestorPlace_Streaming(News_Downloader): 15 | 16 | def __init__(self, args={}): 17 | super().__init__(args) 18 | self.dataframe = pd.DataFrame() 19 | 20 | def download_streaming_search(self, keyword = "apple", rounds = 3, delay = 0.5): 21 | url = 'https://investorplace.com/search/' 22 | 23 | headers = { 24 | "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36" 25 | } 26 | print("Downloading ...", end = ' ') 27 | for page in range(rounds): 28 | params = { 29 | 'q': keyword, 30 | "pg": page, 31 | } 32 | res = requests.get(url = url, params=params, headers=headers) 33 | if res.status_code != 200: 34 | break 35 | 36 | res = etree.HTML(res.text) 37 | div_list = res.xpath("/html/body/main/section/div/div/div/div[2]/div[1]/div[1]/div") 38 | divs = [] 39 | 40 | for div in div_list: 41 | divs += div.xpath("./div") 42 | 43 | titles = [] 44 | times = [] 45 | authors = [] 46 | summaries = [] 47 | 48 | for div in divs: 49 | try: 50 | title = div.xpath('./h2/a//text()')[0] 51 | except: 52 | title = '' 53 | try: 54 | time_ = div.xpath('div/time//text()')[0].replace('\n','').replace('\t','') 55 | except: 56 | time_ = '' 57 | try: 58 | author = div.xpath('div/span/a/text()')[0].replace('\n','').replace('\t','') 59 | except: 60 | author = '' 61 | try: 62 | summary = div.xpath('p/text()')[0].replace('\n','').replace('\t','') 63 | except: 64 | summary = '' 65 | 66 | titles.append(title) 67 | times.append(time_) 68 | authors.append(author) 69 | summaries.append(summary) 70 | 71 | titles.append(title) 72 | 73 | tmp = pd.DataFrame([titles, times, authors, summaries]).T 74 | tmp.columns = ['title', 'time', 'author', 'summary'] 75 | self.dataframe = pd.concat([self.dataframe, tmp]) 76 | 77 | print(page, end = ' ') 78 | 79 | time.sleep(delay) 80 | 
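A short usage sketch for the `InvestorPlace_Streaming` class above; the keyword and rounds are illustrative, and since the scraper parses the public search pages, the resulting `title`/`time`/`author`/`summary` columns depend on the page markup staying stable:

```python
from finnlp.data_sources.news.investorplace_streaming import InvestorPlace_Streaming

ip = InvestorPlace_Streaming()
ip.download_streaming_search(keyword="apple", rounds=3, delay=0.5)
articles = ip.dataframe.reset_index(drop=True)
print(articles.head())   # columns: title, time, author, summary (when pages were fetched)
```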
-------------------------------------------------------------------------------- /finnlp/data_sources/news/eastmoney_streaming.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from lxml import etree 3 | from tqdm import tqdm 4 | import pandas as pd 5 | from finnlp.data_sources.news._base import News_Downloader 6 | 7 | 8 | class Eastmoney_Streaming(News_Downloader): 9 | 10 | def __init__(self, args={}): 11 | super().__init__(args) 12 | self.dataframe = pd.DataFrame() 13 | 14 | def download_streaming_stock(self, stock = "600519", rounds = 3): 15 | print( "Geting pages: ", end = "") 16 | if rounds > 0: 17 | for r in range(rounds): 18 | br = self._gather_pages(stock, r) 19 | if br == "break": 20 | break 21 | else: 22 | r = 1 23 | error_count = 0 24 | while 1: 25 | br = self._gather_pages(stock, r) 26 | if br == "break": 27 | break 28 | elif br == "Error": 29 | error_count +=1 30 | if error_count>10: 31 | print("Connection Error") 32 | r += 1 33 | print( f"Get total {r+1} pages.") 34 | self.dataframe = self.dataframe.reset_index(drop = True) 35 | 36 | def _gather_pages(self, stock, page): 37 | print( page, end = " ") 38 | url = f"https://guba.eastmoney.com/list,{stock},1,f_{page}.html" 39 | headers = { 40 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36", 41 | } 42 | 43 | requests.DEFAULT_RETRIES = 5 # 增加重试连接次数 44 | s = requests.session() 45 | s.keep_alive = False # 关闭多余连接 46 | 47 | response = self._request_get(url, headers=headers) 48 | if response.status_code != 200: 49 | return "Error" 50 | 51 | # gather the comtent of the first page 52 | page = etree.HTML(response.text) 53 | trs = page.xpath('//*[@id="mainlist"]/div/ul/li[1]/table/tbody/tr') 54 | have_one = False 55 | for item in trs: 56 | have_one = True 57 | read_amount = item.xpath("./td[1]//text()")[0] 58 | comments = item.xpath("./td[2]//text()")[0] 59 | title = item.xpath("./td[3]/div/a//text()")[0] 60 | content_link = item.xpath("./td[3]/div/a/@href")[0] 61 | author = item.xpath("./td[4]//text()")[0] 62 | time = item.xpath("./td[5]//text()")[0] 63 | tmp = pd.DataFrame([read_amount, comments, title, content_link, author, time]).T 64 | columns = [ "read amount", "comments", "title", "content link", "author", "create time" ] 65 | tmp.columns = columns 66 | self.dataframe = pd.concat([self.dataframe, tmp]) 67 | #print(title) 68 | if have_one == False: 69 | return "break" 70 | -------------------------------------------------------------------------------- /finnlp/data_sources/social_media/weibo_streaming.py: -------------------------------------------------------------------------------- 1 | from finnlp.data_sources.social_media._base import Social_Media_Downloader 2 | 3 | from tqdm import tqdm 4 | from lxml import etree 5 | import pandas as pd 6 | import requests 7 | import time 8 | import json 9 | import re 10 | 11 | class Weibo_Streaming(Social_Media_Downloader): 12 | def __init__(self, args = {}): 13 | super().__init__(args) 14 | self.dataframe = pd.DataFrame() 15 | 16 | def download_streaming_stock(self, stock = "茅台", rounds = 3): 17 | for r in tqdm(range(rounds), desc="Downloading by page.."): 18 | page = r+1 19 | self._gather_one_page(page, stock) 20 | 21 | def _gather_one_page(self,page, stock = "茅台", delay = 0.01): 22 | headers = { 23 | "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0" 24 | } 25 | params = { 26 | "containerid": 
f"100103type=61&q={stock}&t=", 27 | "page_type": "searchall", 28 | "page":page 29 | } 30 | url = f"https://m.weibo.cn/api/container/getIndex" 31 | resp = self._request_get(url, headers=headers, params = params) 32 | 33 | if resp is None: 34 | return "Error" 35 | 36 | res = json.loads(resp.text) 37 | res = res["data"]["cards"] 38 | res = pd.DataFrame(res) 39 | 40 | pbar = tqdm(total = res.shape[0], desc = "Processing the text content and downloading the full passage...") 41 | res[["content_short","content"]] = res.apply(lambda x:self._process_text(x, pbar, delay), axis= 1, result_type= "expand") 42 | 43 | self.dataframe = pd.concat([self.dataframe, res]) 44 | 45 | def _process_text(self,x, pbar, delay = 0.01): 46 | text = x["mblog"]["text"] 47 | text = etree.HTML(text) 48 | content_short = text.xpath(".//text()") 49 | content_short = ''.join(content_short) 50 | 51 | link = text.xpath('.//a/@href') 52 | link = [l for l in link if "status" in l ] 53 | if len(link) >0: 54 | base_url = "https://m.weibo.cn/" 55 | url_new = base_url + link[0] 56 | headers = { 57 | "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0" 58 | } 59 | resp = self._request_get(url_new, headers= headers) 60 | if resp is None: 61 | content = content_short 62 | else: 63 | res = etree.HTML(resp.content) 64 | scripts = res.xpath('//script') 65 | content = scripts[2].xpath("text()") 66 | pattern=re.compile('"text": "(.+),\n') 67 | result = pattern.findall(content[0]) 68 | content = etree.HTML(result[0]) 69 | content = content.xpath("//text()") 70 | content = ''.join(content) 71 | else: 72 | content = content_short 73 | 74 | pbar.update(1) 75 | time.sleep(delay) 76 | 77 | return content_short, content 78 | 79 | -------------------------------------------------------------------------------- /finnlp/benchmarks/tfns.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | warnings.filterwarnings("ignore") 3 | 4 | from sklearn.metrics import accuracy_score,f1_score 5 | from datasets import load_dataset 6 | from tqdm import tqdm 7 | import datasets 8 | import torch 9 | 10 | dic = { 11 | 0:"negative", 12 | 1:'positive', 13 | 2:'neutral', 14 | } 15 | 16 | def format_example(example: dict) -> dict: 17 | context = f"Instruction: {example['instruction']}\n" 18 | if example.get("input"): 19 | context += f"Input: {example['input']}\n" 20 | context += "Answer: " 21 | target = example["output"] 22 | return {"context": context, "target": target} 23 | 24 | def change_target(x): 25 | if 'positive' in x or 'Positive' in x: 26 | return 'positive' 27 | elif 'negative' in x or 'Negative' in x: 28 | return 'negative' 29 | else: 30 | return 'neutral' 31 | 32 | def test_tfns(model, tokenizer, batch_size = 8, prompt_fun = None ): 33 | dataset = load_dataset('zeroshot/twitter-financial-news-sentiment') 34 | dataset = dataset['validation'] 35 | dataset = dataset.to_pandas() 36 | dataset['label'] = dataset['label'].apply(lambda x:dic[x]) 37 | 38 | if prompt_fun is None: 39 | dataset["instruction"] = 'What is the sentiment of this tweet? Please choose an answer from {negative/neutral/positive}.' 
40 | else: 41 | dataset["instruction"] = dataset.apply(prompt_fun, axis = 1) 42 | 43 | dataset.columns = ['input', 'output', 'instruction'] 44 | dataset[["context","target"]] = dataset.apply(format_example, axis = 1, result_type="expand") 45 | 46 | # print example 47 | print(f"\n\nPrompt example:\n{dataset['context'][0]}\n\n") 48 | 49 | context = dataset['context'].tolist() 50 | 51 | total_steps = dataset.shape[0]//batch_size + 1 52 | print(f"Total len: {len(context)}. Batchsize: {batch_size}. Total steps: {total_steps}") 53 | 54 | 55 | out_text_list = [] 56 | for i in tqdm(range(total_steps)): 57 | tmp_context = context[i* batch_size:(i+1)* batch_size] 58 | tokens = tokenizer(tmp_context, return_tensors='pt', padding=True, max_length=512) 59 | # tokens.pop('token_type_ids') 60 | for k in tokens.keys(): 61 | tokens[k] = tokens[k].cuda() 62 | res = model.generate(**tokens, max_length=512) 63 | res_sentences = [tokenizer.decode(i) for i in res] 64 | out_text = [o.split("Answer: ")[1] for o in res_sentences] 65 | out_text_list += out_text 66 | torch.cuda.empty_cache() 67 | 68 | dataset["out_text"] = out_text_list 69 | dataset["new_target"] = dataset["target"].apply(change_target) 70 | dataset["new_out"] = dataset["out_text"].apply(change_target) 71 | 72 | acc = accuracy_score(dataset["new_target"], dataset["new_out"]) 73 | f1_macro = f1_score(dataset["new_target"], dataset["new_out"], average = "macro") 74 | f1_micro = f1_score(dataset["new_target"], dataset["new_out"], average = "micro") 75 | f1_weighted = f1_score(dataset["new_target"], dataset["new_out"], average = "weighted") 76 | 77 | print(f"Acc: {acc}. F1 macro: {f1_macro}. F1 micro: {f1_micro}. F1 weighted (BloombergGPT): {f1_weighted}. ") 78 | 79 | return dataset -------------------------------------------------------------------------------- /finnlp/benchmarks/fpb.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | warnings.filterwarnings("ignore") 3 | 4 | from sklearn.metrics import accuracy_score,f1_score 5 | from datasets import load_dataset 6 | from tqdm import tqdm 7 | import datasets 8 | import torch 9 | 10 | dic = { 11 | 0:"negative", 12 | 1:'neutral', 13 | 2:'positive', 14 | } 15 | 16 | def format_example(example: dict) -> dict: 17 | context = f"Instruction: {example['instruction']}\n" 18 | if example.get("input"): 19 | context += f"Input: {example['input']}\n" 20 | context += "Answer: " 21 | target = example["output"] 22 | return {"context": context, "target": target} 23 | 24 | def change_target(x): 25 | if 'positive' in x or 'Positive' in x: 26 | return 'positive' 27 | elif 'negative' in x or 'Negative' in x: 28 | return 'negative' 29 | else: 30 | return 'neutral' 31 | 32 | def test_fpb(model, tokenizer, batch_size = 8, prompt_fun = None ): 33 | instructions = load_dataset("financial_phrasebank", "sentences_50agree") 34 | instructions = instructions["train"] 35 | instructions = instructions.train_test_split(seed = 42)['test'] 36 | instructions = instructions.to_pandas() 37 | instructions.columns = ["input", "output"] 38 | instructions["output"] = instructions["output"].apply(lambda x:dic[x]) 39 | 40 | if prompt_fun is None: 41 | instructions["instruction"] = "What is the sentiment of this news? Please choose an answer from {negative/neutral/positive}." 
42 | else: 43 | instructions["instruction"] = instructions.apply(prompt_fun, axis = 1) 44 | 45 | instructions[["context","target"]] = instructions.apply(format_example, axis = 1, result_type="expand") 46 | 47 | # print example 48 | print(f"\n\nPrompt example:\n{instructions['context'][0]}\n\n") 49 | 50 | 51 | context = instructions['context'].tolist() 52 | 53 | total_steps = instructions.shape[0]//batch_size + 1 54 | print(f"Total len: {len(context)}. Batchsize: {batch_size}. Total steps: {total_steps}") 55 | 56 | 57 | out_text_list = [] 58 | for i in tqdm(range(total_steps)): 59 | tmp_context = context[i* batch_size:(i+1)* batch_size] 60 | tokens = tokenizer(tmp_context, return_tensors='pt', padding=True, max_length=512) 61 | for k in tokens.keys(): 62 | tokens[k] = tokens[k].cuda() 63 | res = model.generate(**tokens, max_length=512) 64 | res_sentences = [tokenizer.decode(i) for i in res] 65 | out_text = [o.split("Answer: ")[1] for o in res_sentences] 66 | out_text_list += out_text 67 | torch.cuda.empty_cache() 68 | 69 | instructions["out_text"] = out_text_list 70 | instructions["new_target"] = instructions["target"].apply(change_target) 71 | instructions["new_out"] = instructions["out_text"].apply(change_target) 72 | 73 | acc = accuracy_score(instructions["new_target"], instructions["new_out"]) 74 | f1_macro = f1_score(instructions["new_target"], instructions["new_out"], average = "macro") 75 | f1_micro = f1_score(instructions["new_target"], instructions["new_out"], average = "micro") 76 | f1_weighted = f1_score(instructions["new_target"], instructions["new_out"], average = "weighted") 77 | 78 | print(f"Acc: {acc}. F1 macro: {f1_macro}. F1 micro: {f1_micro}. F1 weighted (BloombergGPT): {f1_weighted}. ") 79 | 80 | return instructions -------------------------------------------------------------------------------- /finnlp/benchmarks/nwgi.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | warnings.filterwarnings("ignore") 3 | 4 | from sklearn.metrics import accuracy_score,f1_score 5 | from datasets import load_dataset 6 | from tqdm import tqdm 7 | import datasets 8 | import torch 9 | 10 | dic = { 11 | 'strong negative':"negative", 12 | 'moderately negative':"negative", 13 | 'mildly negative':"neutral", 14 | 'strong positive':"positive", 15 | 'moderately positive':"positive", 16 | 'mildly positive':'neutral', 17 | 'neutral':'neutral', 18 | } 19 | 20 | def format_example(example: dict) -> dict: 21 | context = f"Instruction: {example['instruction']}\n" 22 | if example.get("input"): 23 | context += f"Input: {example['input']}\n" 24 | context += "Answer: " 25 | target = example["output"] 26 | return {"context": context, "target": target} 27 | 28 | def change_target(x): 29 | if 'positive' in x or 'Positive' in x: 30 | return 'positive' 31 | elif 'negative' in x or 'Negative' in x: 32 | return 'negative' 33 | else: 34 | return 'neutral' 35 | 36 | def test_nwgi(model, tokenizer, batch_size = 8, prompt_fun = None ): 37 | dataset = datasets.load_dataset('oliverwang15/news_with_gpt_instructions') 38 | dataset = dataset['test'].to_pandas() 39 | dataset['output'] = dataset['label'].apply(lambda x:dic[x]) 40 | 41 | if prompt_fun is None: 42 | dataset["instruction"] = "What is the sentiment of this news? Please choose an answer from {negative/neutral/positive}." 
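        # the seven fine-grained GPT labels were already collapsed to negative/neutral/positive
        # via the `dic` mapping above, so the default instruction matches the mapped targets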
43 | else: 44 | dataset["instruction"] = dataset.apply(prompt_fun, axis = 1) 45 | dataset["input"] = dataset["news"] 46 | 47 | dataset = dataset[['input', 'output', 'instruction']] 48 | dataset[["context","target"]] = dataset.apply(format_example, axis = 1, result_type="expand") 49 | 50 | # print example 51 | print(f"\n\nPrompt example:\n{dataset['context'][0]}\n\n") 52 | 53 | context = dataset['context'].tolist() 54 | 55 | total_steps = dataset.shape[0]//batch_size + 1 56 | print(f"Total len: {len(context)}. Batchsize: {batch_size}. Total steps: {total_steps}") 57 | 58 | 59 | out_text_list = [] 60 | for i in tqdm(range(total_steps)): 61 | tmp_context = context[i* batch_size:(i+1)* batch_size] 62 | tokens = tokenizer(tmp_context, return_tensors='pt', padding=True, max_length=512) 63 | # tokens.pop('token_type_ids') 64 | for k in tokens.keys(): 65 | tokens[k] = tokens[k].cuda() 66 | res = model.generate(**tokens, max_length=512) 67 | res_sentences = [tokenizer.decode(i) for i in res] 68 | out_text = [o.split("Answer: ")[1] for o in res_sentences] 69 | out_text_list += out_text 70 | torch.cuda.empty_cache() 71 | 72 | dataset["out_text"] = out_text_list 73 | dataset["new_target"] = dataset["target"].apply(change_target) 74 | dataset["new_out"] = dataset["out_text"].apply(change_target) 75 | 76 | acc = accuracy_score(dataset["new_target"], dataset["new_out"]) 77 | f1_macro = f1_score(dataset["new_target"], dataset["new_out"], average = "macro") 78 | f1_micro = f1_score(dataset["new_target"], dataset["new_out"], average = "micro") 79 | f1_weighted = f1_score(dataset["new_target"], dataset["new_out"], average = "weighted") 80 | 81 | print(f"Acc: {acc}. F1 macro: {f1_macro}. F1 micro: {f1_micro}. F1 weighted (BloombergGPT): {f1_weighted}. ") 82 | 83 | return dataset 84 | -------------------------------------------------------------------------------- /finnlp/data_sources/_base.py: -------------------------------------------------------------------------------- 1 | from finnlp.utils.get_proxy import get_china_free_proxy, get_us_free_proxy, Kuaidaili 2 | import requests 3 | 4 | class FinNLP_Downloader: 5 | def __init__(self, args = {}): 6 | self.use_proxy = True if "use_proxy" in args.keys() else False 7 | if self.use_proxy: 8 | self.country = args["use_proxy"] 9 | else: 10 | self.country = None 11 | self.max_retry = args["max_retry"] if "max_retry" in args.keys() else 1 12 | self.proxy_pages = args["proxy_pages"] if "proxy_pages" in args.keys() else 5 13 | if self.use_proxy: 14 | if "kuaidaili" in self.country: 15 | # tunnel, username, password 16 | assert "tunnel" in args.keys(), "Please make sure \'tunnel\' in your keys" 17 | assert "username" in args.keys(), "Please make sure \'username\' in your keys" 18 | assert "password" in args.keys(), "Please make sure \'password\' in your keys" 19 | self.proxy_list = Kuaidaili(args["tunnel"], args["username"], args["password"]) 20 | else: 21 | self.proxy_id = 0 22 | self.proxy_list = self._update_proxy() 23 | else: 24 | self.proxy_list = [] 25 | 26 | def _get_proxy(self): 27 | if self.use_proxy: 28 | if "kuaidaili" in self.country: 29 | proxy = self.proxy_list.get_kuaidaili_tunnel_proxy() 30 | return proxy 31 | elif len(self.proxy_list) >0: 32 | proxy = self.proxy_list[self.proxy_id] 33 | self.proxy_id += 1 34 | if self.proxy_id == len(self.proxy_list): 35 | self.proxy_id = 0 36 | return proxy 37 | else: 38 | return None 39 | 40 | def _update_proxy(self): 41 | if "china" in self.country or "China" in self.country: 42 | return 
get_china_free_proxy(self.proxy_pages)
43 |         else:
44 |             return get_us_free_proxy(self.proxy_pages)
45 | 
46 |     def _request_get(self, url, headers = None, verify = None, params = None):
47 |         if headers is None:
48 |             headers = {
49 |                 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0"
50 |             }
51 |         max_retry = self.max_retry
52 |         proxies = self._get_proxy()
53 |         for _ in range(max_retry):
54 |             try:
55 |                 response = requests.get(url = url, proxies = proxies, headers = headers, verify = verify, params = params)
56 |                 if response.status_code == 200:
57 |                     break
58 |             except:
59 |                 response = None
60 | 
61 |         if response is not None and response.status_code != 200:
62 |             response = None
63 | 
64 |         return response
65 | 
66 |     def _request_post(self, url, headers, json):
67 |         max_retry = self.max_retry
68 |         proxies = self._get_proxy()
69 |         for _ in range(max_retry):
70 |             try:
71 |                 response = requests.post(url = url, headers = headers, json = json, proxies = proxies)
72 |                 if response.status_code == 200:
73 |                     break
74 |             except:
75 |                 response = None
76 | 
77 |         if response is not None and response.status_code != 200:
78 |             response = None
79 | 
80 |         return response
81 | 
--------------------------------------------------------------------------------
/finnlp/data_sources/news/pennystocks_streaming.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from lxml import etree
3 | from tqdm import tqdm
4 | import pandas as pd
5 | import json
6 | import time as time
7 | from finnlp.data_sources.news._base import News_Downloader
8 | 
9 | # TODO:
10 | # 1. More Pages
11 | # 2. Contents
12 | 
13 | class PennyStocks_Streaming(News_Downloader):
14 | 
15 |     def __init__(self, args={}):
16 |         super().__init__(args)
17 |         self.dataframe = pd.DataFrame()
18 | 
19 |     def download_streaming_search(self, keyword = "apple", rounds = 3, delay = 2):
20 |         # establish session
21 |         self._connect_session()
22 | 
23 |         # download first page
24 |         self._download_first_page(keyword, delay = delay)
25 | 
26 |         # download the following pages
27 |         # self._download_other_pages(keyword)
28 |         print("Only support the first page now!")
29 | 
30 | 
31 |     def _connect_session(self):
32 |         # since the server will check cookies, we first need to
33 |         # request the main site without cookies, then finish
34 |         # searching for the stock information we want.
35 |         self.session = requests.session()
36 |         first_url = "https://pennystocks.com/"
37 |         headers = {
38 |             "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
39 |         }
40 |         print("Requesting https://pennystocks.com ...", end = " ")
41 |         res = self.session.get(headers = headers, url = first_url)
42 |         if res.status_code !=200:
43 |             raise ConnectionError("Can't request https://pennystocks.com.
Please check your connection or report this issue on Github") 44 | 45 | print("succeed!") 46 | 47 | def _download_first_page(self, keyword = "apple", max_retry = 5, delay = 2): 48 | url = f"https://pennystocks.com/?s={keyword}" 49 | headers = { 50 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36", 51 | } 52 | res = self.session.get(url = url, headers = headers) 53 | res = etree.HTML(res.text) 54 | articles = res.xpath("/html/body/div[3]/div/div[1]/div/div/div/main/div/div[1]/div/article") 55 | # not sure why but this really works 56 | 57 | while max_retry and len(articles) == 0: 58 | import time 59 | time.sleep(delay) 60 | print("Gathering again ..", end = ' ') 61 | res = requests.get(url = url, headers = headers, cookies=self.session.cookies) 62 | res = etree.HTML(res.text) 63 | articles = res.xpath("/html/body/div[3]/div/div[1]/div/div/div/main/div/div[1]/div/article") 64 | max_retry -= 1 65 | print(f"Remaining Retry: {max_retry}") 66 | 67 | 68 | for a in articles: 69 | title = a.xpath("./header/h2/a//text()")[0] 70 | time = a.xpath("./div[3]/div/div/ul/li[1]/text()")[0] 71 | brief = a.xpath("./div[3]/div/div/text()")[0] 72 | reading_time = a.xpath("./div[3]/div/div/ul/li[2]/text()")[0] 73 | columns = ["title", "time", "brief", "reading_time"] 74 | tmp = pd.DataFrame([[title, time, brief, reading_time]], columns=columns) 75 | self.dataframe = pd.concat([self.dataframe, tmp]) 76 | 77 | 78 | def _download_other_pages(self, keyword = "apple"): 79 | pass 80 | 81 | 82 | -------------------------------------------------------------------------------- /finnlp/data_sources/news/sina_finance_date_range.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pytz 3 | import time 4 | import requests 5 | import pandas as pd 6 | import numpy as np 7 | from lxml import etree 8 | from tqdm import tqdm 9 | from finnlp.data_sources.news._base import News_Downloader 10 | 11 | class Sina_Finance_Date_Range(News_Downloader): 12 | 13 | def __init__(self, args={}): 14 | super().__init__(args) 15 | self.dataframe = pd.DataFrame() 16 | 17 | def download_date_range_all(self, start_date, end_date): 18 | self.date_list = pd.date_range(start_date, end_date) 19 | for date in tqdm(self.date_list, desc= "Downloading Titles..."): 20 | tmp = self._gather_one_day(date) 21 | self.dataframe = pd.concat([self.dataframe, tmp]) 22 | self.dataframe = self.dataframe.reset_index(drop = True) 23 | 24 | def _gather_one_day(self, date, delay = 0.1): 25 | end_timestamp = pd.to_datetime(f"{date} 16:00:00").timestamp() 26 | start_timestamp = end_timestamp - 60 * 60 * 24 27 | 28 | res = pd.DataFrame() 29 | for page in range(100): 30 | url = f"https://feed.mix.sina.com.cn/api/roll/get?pageid=153&lid=2516&etime={start_timestamp}&stime={end_timestamp}&ctime={end_timestamp}&date={date}&k=&num=50&page={page}" 31 | response = self._request_get(url = url) 32 | if response is not None: 33 | response.encoding = 'unicode' 34 | text = response.text 35 | text = json.loads(text, strict=True) 36 | text = text["result"] 37 | text = text["data"] 38 | if len(text) == 0: 39 | break 40 | 41 | for i in text: 42 | for ii in i.keys(): 43 | i[ii] = [i[ii]] 44 | tmp = pd.DataFrame(i) 45 | res = pd.concat([res, tmp]) 46 | time.sleep(delay) 47 | 48 | if res.shape[0] != 0: 49 | res.ctime = pd.to_datetime(res.ctime, unit="s", utc=True) 50 | res.mtime = pd.to_datetime(res.mtime, unit="s", utc=True) 51 | res.intime = 
pd.to_datetime(res.intime, unit="s", utc=True) 52 | 53 | tz = pytz.timezone("Asia/Shanghai") 54 | res.ctime = [t.astimezone(tz) for t in res.ctime] 55 | res.mtime = [t.astimezone(tz) for t in res.mtime] 56 | res.intime = [t.astimezone(tz) for t in res.intime] 57 | 58 | return res 59 | 60 | def gather_content(self, delay = 0.01): 61 | pbar = tqdm(total = self.dataframe.shape[0], desc= "Gathering news contents") 62 | self.dataframe["content"] = self.dataframe.apply(lambda x:self._gather_content_apply(x, pbar, delay), axis = 1) 63 | 64 | def _gather_content_apply(self,x, pbar, delay = 0.01): 65 | url = x.url 66 | response = self._request_get(url=url) 67 | 68 | if response is not None: 69 | # process 70 | response.encoding = 'unicode' 71 | text = response.text 72 | page = etree.HTML(text) 73 | page = page.xpath("//*[@id='artibody']/p") 74 | page = [p.xpath(".//text()") for p in page] 75 | page = [''.join(p) for p in page] 76 | content = "\n".join(page) 77 | content = content.replace("\u3000","") 78 | else: 79 | content = np.nan 80 | 81 | # update 82 | pbar.update(1) 83 | time.sleep(delay) 84 | 85 | return content 86 | -------------------------------------------------------------------------------- /docs/FinNLP/site/assets/stylesheets/palette.a0c5b2b5.min.css.map: -------------------------------------------------------------------------------- 1 | {"version":3,"sources":["src/assets/stylesheets/palette/_scheme.scss","../../../src/assets/stylesheets/palette.scss","src/assets/stylesheets/palette/_accent.scss","src/assets/stylesheets/palette/_primary.scss","src/assets/stylesheets/utilities/_break.scss"],"names":[],"mappings":"AA2BA,cAGE,6BAKE,YAAA,CAGA,mDAAA,CACA,6DAAA,CACA,+DAAA,CACA,gEAAA,CACA,mDAAA,CACA,6DAAA,CACA,+DAAA,CACA,gEAAA,CAGA,gDAAA,CACA,gDAAA,CAGA,4BAAA,CACA,iCAAA,CACA,kCAAA,CACA,mCAAA,CACA,mCAAA,CACA,kCAAA,CACA,iCAAA,CACA,+CAAA,CACA,6DAAA,CACA,gEAAA,CACA,4DAAA,CACA,4DAAA,CACA,6DAAA,CAGA,6CAAA,CAGA,+CAAA,CAGA,iCAAA,CAGA,uDAAA,CACA,6DAAA,CACA,2DAAA,CAGA,yDAAA,CACA,iEAAA,CAGA,mDAAA,CACA,mDAAA,CAGA,qDAAA,CACA,wDAAA,CAGA,0DAAA,CAKA,8DAAA,CAKA,0DCxDF,CD6DE,kHAEE,YC3DJ,CD+DE,gHAEE,eC7DJ,CDoFE,yDACE,4BClFJ,CDiFE,2DACE,4BC/EJ,CD8EE,gEACE,4BC5EJ,CD2EE,2DACE,4BCzEJ,CDwEE,yDACE,4BCtEJ,CDqEE,0DACE,4BCnEJ,CDkEE,gEACE,4BChEJ,CD+DE,0DACE,4BC7DJ,CD4DE,2OACE,4BCjDJ,CDwDA,+FAGE,iCCtDF,CACF,CClDE,2BACE,4BAAA,CACA,2CAAA,CAOE,yBAAA,CACA,qCD8CN,CCxDE,4BACE,4BAAA,CACA,2CAAA,CAOE,yBAAA,CACA,qCDqDN,CC/DE,8BACE,4BAAA,CACA,2CAAA,CAOE,yBAAA,CACA,qCD4DN,CCtEE,mCACE,4BAAA,CACA,2CAAA,CAOE,yBAAA,CACA,qCDmEN,CC7EE,8BACE,4BAAA,CACA,2CAAA,CAOE,yBAAA,CACA,qCD0EN,CCpFE,4BACE,4BAAA,CACA,2CAAA,CAOE,yBAAA,CACA,qCDiFN,CC3FE,kCACE,4BAAA,CACA,2CAAA,CAOE,yBAAA,CACA,qCDwFN,CClGE,4BACE,4BAAA,CACA,2CAAA,CAOE,yBAAA,CACA,qCD+FN,CCzGE,4BACE,4BAAA,CACA,2CAAA,CAOE,yBAAA,CACA,qCDsGN,CChHE,6BACE,4BAAA,CACA,2CAAA,CAOE,yBAAA,CACA,qCD6GN,CCvHE,mCACE,4BAAA,CACA,2CAAA,CAOE,yBAAA,CACA,qCDoHN,CC9HE,4BACE,4BAAA,CACA,2CAAA,CAIE,8BAAA,CACA,qCD8HN,CCrIE,8BACE,4BAAA,CACA,2CAAA,CAIE,8BAAA,CACA,qCDqIN,CC5IE,6BACE,yBAAA,CACA,2CAAA,CAIE,8BAAA,CACA,qCD4IN,CCnJE,8BACE,4BAAA,CACA,2CAAA,CAIE,8BAAA,CACA,qCDmJN,CC1JE,mCACE,4BAAA,CACA,2CAAA,CAOE,yBAAA,CACA,qCDuJN,CE5JE,4BACE,6BAAA,CACA,oCAAA,CACA,mCAAA,CAOE,0BAAA,CACA,sCFyJN,CEpKE,6BACE,6BAAA,CACA,oCAAA,CACA,mCAAA,CAOE,0BAAA,CACA,sCFiKN,CE5KE,+BACE,6BAAA,CACA,oCAAA,CACA,mCAAA,CAOE,0BAAA,CACA,sCFyKN,CEpLE,oCACE,6BAAA,CACA,oCAAA,CACA,mCAAA,CAOE,0BAAA,CACA,sCFiLN,CE5LE,+BACE,6BAAA,CACA,oCAAA,CACA,mCAAA,CAOE,0BAAA,CACA,sCFyLN,CEpME,6BACE,6BAAA,CACA,oCAAA,CACA,mCAAA,CAOE,0BAAA,CACA,sCFiMN,CE5ME,mCACE,6BAAA,CACA,oCAA
A,CACA,mCAAA,CAOE,0BAAA,CACA,sCFyMN,CEpNE,6BACE,6BAAA,CACA,oCAAA,CACA,mCAAA,CAOE,0BAAA,CACA,sCFiNN,CE5NE,6BACE,6BAAA,CACA,oCAAA,CACA,mCAAA,CAOE,0BAAA,CACA,sCFyNN,CEpOE,8BACE,6BAAA,CACA,oCAAA,CACA,mCAAA,CAOE,0BAAA,CACA,sCFiON,CE5OE,oCACE,6BAAA,CACA,oCAAA,CACA,mCAAA,CAOE,0BAAA,CACA,sCFyON,CEpPE,6BACE,6BAAA,CACA,oCAAA,CACA,mCAAA,CAIE,+BAAA,CACA,sCFoPN,CE5PE,+BACE,6BAAA,CACA,oCAAA,CACA,mCAAA,CAIE,+BAAA,CACA,sCF4PN,CEpQE,8BACE,6BAAA,CACA,oCAAA,CACA,mCAAA,CAIE,+BAAA,CACA,sCFoQN,CE5QE,+BACE,6BAAA,CACA,oCAAA,CACA,mCAAA,CAIE,+BAAA,CACA,sCF4QN,CEpRE,oCACE,6BAAA,CACA,oCAAA,CACA,mCAAA,CAOE,0BAAA,CACA,sCFiRN,CE5RE,8BACE,6BAAA,CACA,oCAAA,CACA,mCAAA,CAOE,0BAAA,CACA,sCFyRN,CEpSE,6BACE,6BAAA,CACA,oCAAA,CACA,mCAAA,CAOE,0BAAA,CACA,sCAAA,CAKA,4BF6RN,CE7SE,kCACE,6BAAA,CACA,oCAAA,CACA,mCAAA,CAOE,0BAAA,CACA,sCAAA,CAKA,4BFsSN,CEvRE,sEACE,4BF0RJ,CE3RE,+DACE,4BF8RJ,CE/RE,iEACE,4BFkSJ,CEnSE,gEACE,4BFsSJ,CEvSE,iEACE,4BF0SJ,CEjSA,8BACE,0BAAA,CACA,sCAAA,CACA,qCAAA,CACA,+BAAA,CACA,sCAAA,CAGA,4BFkSF,CE/RE,yCACE,+BFiSJ,CE9RI,kDAEE,0CAAA,CACA,sCAAA,CAFA,UFkSN,CG9MI,mCD1EA,+CACE,0BF2RJ,CExRI,qDACE,0BF0RN,CErRE,iEACE,eFuRJ,CACF,CGzNI,sCDvDA,uCACE,oCFmRJ,CACF,CE1QA,8BACE,0BAAA,CACA,sCAAA,CACA,gCAAA,CACA,0BAAA,CACA,sCAAA,CAGA,4BF2QF,CExQE,yCACE,+BF0QJ,CEvQI,kDAEE,0CAAA,CACA,sCAAA,CAFA,UF2QN,CEpQE,yCACE,qBFsQJ,CG/NI,wCDhCA,8CACE,0BFkQJ,CACF,CGvPI,mCDJA,+CACE,0BF8PJ,CE3PI,qDACE,0BF6PN,CACF,CG5OI,wCDTA,iFACE,qBFwPJ,CACF,CGpQI,sCDmBA,uCACE,qBFoPJ,CACF","file":"palette.css"} -------------------------------------------------------------------------------- /docs/FinNLP/site/assets/javascripts/lunr/min/lunr.stemmer.support.min.js: -------------------------------------------------------------------------------- 1 | !function(r,t){"function"==typeof define&&define.amd?define(t):"object"==typeof exports?module.exports=t():t()(r.lunr)}(this,function(){return function(r){r.stemmerSupport={Among:function(r,t,i,s){if(this.toCharArray=function(r){for(var t=r.length,i=new Array(t),s=0;s=i&&(e-=i,t[e>>3]&1<<(7&e)))return this.cursor++,!0}return!1},in_grouping_b:function(t,i,s){if(this.cursor>this.limit_backward){var e=r.charCodeAt(this.cursor-1);if(e<=s&&e>=i&&(e-=i,t[e>>3]&1<<(7&e)))return this.cursor--,!0}return!1},out_grouping:function(t,i,s){if(this.cursors||e>3]&1<<(7&e)))return this.cursor++,!0}return!1},out_grouping_b:function(t,i,s){if(this.cursor>this.limit_backward){var e=r.charCodeAt(this.cursor-1);if(e>s||e>3]&1<<(7&e)))return this.cursor--,!0}return!1},eq_s:function(t,i){if(this.limit-this.cursor>1),f=0,l=o0||e==s||c)break;c=!0}}for(;;){var _=t[s];if(o>=_.s_size){if(this.cursor=n+_.s_size,!_.method)return _.result;var b=_.method();if(this.cursor=n+_.s_size,b)return _.result}if((s=_.substring_i)<0)return 0}},find_among_b:function(t,i){for(var s=0,e=i,n=this.cursor,u=this.limit_backward,o=0,h=0,c=!1;;){for(var a=s+(e-s>>1),f=0,l=o=0;m--){if(n-l==u){f=-1;break}if(f=r.charCodeAt(n-1-l)-_.s[m])break;l++}if(f<0?(e=a,h=l):(s=a,o=l),e-s<=1){if(s>0||e==s||c)break;c=!0}}for(;;){var _=t[s];if(o>=_.s_size){if(this.cursor=n-_.s_size,!_.method)return _.result;var b=_.method();if(this.cursor=n-_.s_size,b)return _.result}if((s=_.substring_i)<0)return 0}},replace_s:function(t,i,s){var e=s.length-(i-t),n=r.substring(0,t),u=r.substring(i);return r=n+s+u,this.limit+=e,this.cursor>=i?this.cursor+=e:this.cursor>t&&(this.cursor=t),e},slice_check:function(){if(this.bra<0||this.bra>this.ket||this.ket>this.limit||this.limit>r.length)throw"faulty slice 
operation"},slice_from:function(r){this.slice_check(),this.replace_s(this.bra,this.ket,r)},slice_del:function(){this.slice_from("")},insert:function(r,t,i){var s=this.replace_s(r,t,i);r<=this.bra&&(this.bra+=s),r<=this.ket&&(this.ket+=s)},slice_to:function(){return this.slice_check(),r.substring(this.bra,this.ket)},eq_v_b:function(r){return this.eq_s_b(r.length,r)}}}},r.trimmerSupport={generateTrimmer:function(r){var t=new RegExp("^[^"+r+"]+"),i=new RegExp("[^"+r+"]+$");return function(r){return"function"==typeof r.update?r.update(function(r){return r.replace(t,"").replace(i,"")}):r.replace(t,"").replace(i,"")}}}}}); -------------------------------------------------------------------------------- /finnlp/data_sources/company_announcement/sina.py: -------------------------------------------------------------------------------- 1 | import time 2 | import requests 3 | from lxml import etree 4 | from tqdm.notebook import tqdm 5 | import pandas as pd 6 | 7 | class Sina_Announcement_Downloader: 8 | 9 | def __init__(self, args = {}): 10 | pass 11 | 12 | def download(self, stock = "all",max_page = 100): 13 | page = 0 14 | df = pd.DataFrame() 15 | print(f"Getting page: ",end = "") 16 | while page < max_page: 17 | print(page, end = " ") 18 | headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0", 19 | 'Accept-Encoding':'gzip, deflate, br',} 20 | url = f"https://vip.stock.finance.sina.com.cn/corp/view/vCB_AllBulletin.php?stockid={stock}&Page={page}" 21 | response = requests.get(url = url,headers=headers) 22 | # response.encoding = "GBK" 23 | # print(response.content.decode('GBK')) 24 | text = response.content.decode('GBK') 25 | html = etree.HTML(text) 26 | 27 | # get announcement date 28 | date_list = html.xpath("/html/body/div[6]/div[2]/div[2]/table[2]/tr/td[2]/div[1]/ul/text()") 29 | if len(date_list) <= 0: 30 | break 31 | date_list = [date.strip('.\r').strip('.\n').strip('.\xa0').strip(' ') for date in date_list] 32 | date_list = [date for date in date_list if len(date) == 10] 33 | 34 | 35 | # get headlines and urls 36 | url_root = "https://vip.stock.finance.sina.com.cn" 37 | a_list = html.xpath("/html/body/div[6]/div[2]/div[2]/table[2]/tr/td[2]/div[1]/ul/a") 38 | headline_list = [a.xpath("./text()")[0] for a in a_list ] 39 | url_list = [url_root + a.xpath("./@href")[0] for a in a_list ] 40 | 41 | tmp_df = { 42 | "date": date_list, 43 | "headline": headline_list, 44 | "url": url_list, 45 | } 46 | tmp_df = pd.DataFrame(tmp_df) 47 | df = pd.concat([df,tmp_df]) 48 | page += 1 49 | 50 | 51 | with tqdm(total = df.shape[0],desc = "Getting Announcement content" ) as pbar: 52 | df["content"] = df.apply(lambda x: self.get_content(x,pbar), axis=1 ) 53 | 54 | df = df.reset_index(drop=True) 55 | 56 | return df 57 | 58 | def get_content(self,x,pbar,delay = 0.1): 59 | time.sleep(delay) 60 | url = x.url 61 | headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0", 62 | 'Accept-Encoding':'gzip, deflate, br',} 63 | response = requests.get(url = url,headers=headers) 64 | if response.status_code == 200: 65 | try: 66 | text = response.content.decode('GBK') 67 | html = etree.HTML(text) 68 | 69 | # clean content 70 | content_list = html.xpath("//*[@id='content']//text()") 71 | content_list = [content.strip('.\t').strip('.\n').strip('.\r') for content in content_list] 72 | content_list = [content for content in content_list if len(content) != 0] 73 | content = "".join(content_list) 74 | except: 75 | return "can't get 
content" 76 | else: 77 | return "can't get content" 78 | 79 | pbar.update(1) 80 | 81 | return content 82 | 83 | def clean_data(self): 84 | pass 85 | 86 | def transfer_standard_date_to_nonstandard(self,date): 87 | pass -------------------------------------------------------------------------------- /finnlp/data_sources/news/thefly_streaming.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | warnings.filterwarnings("ignore") 3 | import requests 4 | from lxml import etree 5 | from tqdm import tqdm 6 | import pandas as pd 7 | import json 8 | import time 9 | from finnlp.data_sources.news._base import News_Downloader 10 | 11 | # TODO: 12 | # 1. Contents 13 | # 2. More pages 14 | 15 | class TheFly_Streaming(News_Downloader): 16 | 17 | def __init__(self, args={}): 18 | super().__init__(args) 19 | self.dataframe = pd.DataFrame() 20 | 21 | def download_streaming_search(self, keyword = "AAPL",end_date = None, rounds = 3, delay = 0.5): 22 | # download first page 23 | self._download_first_page(keyword, delay = delay, end_date = end_date) 24 | 25 | # download the following pages 26 | # self._download_other_pages(keyword) 27 | print("Only support the first page now!") 28 | 29 | def _download_first_page(self, keyword = "AAPL", delay = 0.5, end_date = None): 30 | url = "https://thefly.com/news.php" 31 | headers = { 32 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36', 33 | } 34 | params = { 35 | 'fecha': end_date, 36 | 'market_stories': 'on', 37 | 'hot_stocks_filter': 'on', 38 | 'rumors_filter': 'on', 39 | 'general_news_filter': 'on', 40 | 'periodicals_filter': 'on', 41 | 'earnings_filter': 'on', 42 | 'technical_analysis_filter': 'on', 43 | 'options_filter': 'on', 44 | 'syndicates_filter': 'on', 45 | 'onthefly': 'on', 46 | 'insight_filter': 'on', 47 | 'market_mover_filter': 'on', 48 | 'e_inter_filter': 'on', 49 | 'mid_wrap_filter': 'on', 50 | 'sec_wrap_filter': 'on', 51 | 'analyst_wrap_filter': 'on', 52 | 'analyst_recommendations': 'on', 53 | 'upgrade_filter': 'on', 54 | 'downgrade_filter': 'on', 55 | 'initiate_filter': 'on', 56 | 'no_change_filter': 'on', 57 | 'events': 'on', 58 | 'symbol': keyword, 59 | } 60 | res = requests.get(url = url, headers= headers, params = params, verify=False) 61 | if res.status_code != 200: 62 | print(f'Connection Error: {res.status_code}') 63 | return f'Connection Error: {res.status_code}' 64 | 65 | res = etree.HTML(res.text) 66 | tables = res.xpath("/html/body/div[2]/div/div/div[1]/table")[1:] 67 | titles = [] 68 | stocks = [] 69 | abstracts = [] 70 | dates = [] 71 | times = [] 72 | for table in tables: 73 | trs = table.xpath("./tr") 74 | for tr in trs: 75 | title = tr.xpath("./td[2]/div[1]/a/span//text()") 76 | if len(title) > 0: 77 | titles.append(' '.join(title)) 78 | stocks.append(' '.join(tr.xpath("./td[2]/div[1]/div/span/text()"))) 79 | abstracts.append(' '.join(tr.xpath("./td[2]/div[2]/dd/p[1]/text()"))) 80 | dates.append(' '.join(tr.xpath("./td[2]/div[1]/span[2]/small/span[3]/text()"))) 81 | times.append(' '.join(tr.xpath("./td[2]/div[1]/span[2]/small/span[3]/div/text()"))) 82 | 83 | tmp = pd.DataFrame([titles, stocks, abstracts, dates, times]).T 84 | tmp.columns = ["title", "stock", "abstract", "date", "time"] 85 | self.dataframe = pd.concat([self.dataframe, tmp]) 86 | 87 | time.sleep(delay) 88 | -------------------------------------------------------------------------------- /finnlp/data_sources/news/talkmarkets_streaming.py: 
--------------------------------------------------------------------------------
1 | import warnings
2 | warnings.filterwarnings("ignore")
3 | import requests
4 | from lxml import etree
5 | from tqdm import tqdm
6 | import pandas as pd
7 | import json
8 | import time
9 | from finnlp.data_sources.news._base import News_Downloader
10 | 
11 | # TODO:
12 | # 1. Contents
13 | 
14 | class TalkMarkets_Streaming(News_Downloader):
15 | 
16 |     def __init__(self, args={}):
17 |         super().__init__(args)
18 |         self.dataframe = pd.DataFrame()
19 | 
20 |     def download_streaming_search(self, keyword = "apple", rounds = 3, delay = 0.5):
21 |         # 1. obtain cx
22 |         cx = self._obtain_cx(keyword)
23 | 
24 |         # 2. obtain cse token
25 |         cse_token = self._obtain_cse_token(cx)
26 | 
27 |         # 3. get content (Due to the limit of the platform, the max round is 10, about 100 news)
28 |         print("Downloading...", end = ' ')
29 |         for i in range(rounds):
30 |             url = "https://cse.google.com/cse/element/v1"
31 |             headers = {
32 |                 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36"
33 |             }
34 |             params = {
35 |                 'rsz': 'filtered_cse',
36 |                 'num': '20',
37 |                 'hl': 'en',
38 |                 'source': 'gcsc',
39 |                 'gss': '.com',
40 |                 'start': i*20,
41 |                 'cselibv': '827890a761694e44',
42 |                 'cx': cx,
43 |                 'q': keyword,
44 |                 'safe': 'off',
45 |                 'cse_tok': cse_token,
46 |                 'sort': 'date',
47 |                 'exp': 'csqr,cc',
48 |                 'callback': 'google.search.cse.api1861',
49 |             }
50 |             res = requests.get(url = url, headers= headers, params = params)
51 |             if res.status_code != 200:
52 |                 break
53 | 
54 |             res = eval(res.text[34:-2])
55 |             tmp = pd.DataFrame(res["results"])
56 |             self.dataframe = pd.concat([self.dataframe, tmp])
57 | 
58 |             time.sleep(delay)
59 |             print(i, end = ' ')
60 | 
61 |     def _obtain_cx(self, keyword):
62 |         url = "https://talkmarkets.com/search"
63 |         headers = {
64 |             "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36"
65 |         }
66 |         params = {
67 |             "tab": "General",
68 |             "searchQuery": keyword,
69 |         }
70 |         res = requests.get(url = url, headers= headers, params = params)
71 |         if res.status_code != 200:
72 |             print(f"Connection Error: {res.status_code}")
73 |             return f"Connection Error: {res.status_code}"
74 | 
75 |         res = etree.HTML(res.text)
76 |         cx = res.xpath('.//script[@type="text/javascript"][1]/text()')[1][40:73]
77 |         return cx
78 | 
79 |     def _obtain_cse_token(self, cx, ):
80 |         url = "https://cse.google.com/cse.js"
81 |         headers = {
82 |             "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36"
83 |         }
84 |         params = {
85 |             "cx": cx,
86 |         }
87 |         res = requests.get(url = url, headers= headers, params = params)
88 |         if res.status_code != 200:
89 |             print(f"Connection Error: {res.status_code}")
90 |             return f"Connection Error: {res.status_code}"
91 | 
92 |         text = res.text
93 |         cse_token = text[5744:5786]
94 |         return cse_token
95 | 
96 | 
--------------------------------------------------------------------------------
/docs/FinNLP/site/assets/javascripts/lunr/min/lunr.ko.min.js:
--------------------------------------------------------------------------------
1 | !function(e,r){"function"==typeof define&&define.amd?define(r):"object"==typeof exports?module.exports=r():r()(e.lunr)}(this,function(){return function(e){if(void 0===e)throw new Error("Lunr is not present.
Please include / require Lunr before this script.");if(void 0===e.stemmerSupport)throw new Error("Lunr stemmer support is not present. Please include / require Lunr stemmer support before this script.");e.ko=function(){this.pipeline.reset(),this.pipeline.add(e.ko.trimmer,e.ko.stopWordFilter)},e.ko.wordCharacters="[A-Za-z가-힯a]",e.ko.trimmer=e.trimmerSupport.generateTrimmer(e.ko.wordCharacters),e.Pipeline.registerFunction(e.ko.trimmer,"trimmer-ko"),e.ko.stopWordFilter=e.generateStopWordFilter("아 휴 아이구 아이쿠 아이고 어 나 우리 저희 따라 의해 을 를 에 의 가 으로 로 에게 뿐이다 의거하여 근거하여 입각하여 기준으로 예하면 예를 들면 예를 들자면 저 소인 소생 저희 지말고 하지마 하지마라 다른 물론 또한 그리고 비길수 없다 해서는 안된다 뿐만 아니라 만이 아니다 만은 아니다 막론하고 관계없이 그치지 않다 그러나 그런데 하지만 든간에 논하지 않다 따지지 않다 설사 비록 더라도 아니면 만 못하다 하는 편이 낫다 불문하고 향하여 향해서 향하다 쪽으로 틈타 이용하여 타다 오르다 제외하고 이 외에 이 밖에 하여야 비로소 한다면 몰라도 외에도 이곳 여기 부터 기점으로 따라서 할 생각이다 하려고하다 이리하여 그리하여 그렇게 함으로써 하지만 일때 할때 앞에서 중에서 보는데서 으로써 로써 까지 해야한다 일것이다 반드시 할줄알다 할수있다 할수있어 임에 틀림없다 한다면 등 등등 제 겨우 단지 다만 할뿐 딩동 댕그 대해서 대하여 대하면 훨씬 얼마나 얼마만큼 얼마큼 남짓 여 얼마간 약간 다소 좀 조금 다수 몇 얼마 지만 하물며 또한 그러나 그렇지만 하지만 이외에도 대해 말하자면 뿐이다 다음에 반대로 반대로 말하자면 이와 반대로 바꾸어서 말하면 바꾸어서 한다면 만약 그렇지않으면 까악 툭 딱 삐걱거리다 보드득 비걱거리다 꽈당 응당 해야한다 에 가서 각 각각 여러분 각종 각자 제각기 하도록하다 와 과 그러므로 그래서 고로 한 까닭에 하기 때문에 거니와 이지만 대하여 관하여 관한 과연 실로 아니나다를가 생각한대로 진짜로 한적이있다 하곤하였다 하 하하 허허 아하 거바 와 오 왜 어째서 무엇때문에 어찌 하겠는가 무슨 어디 어느곳 더군다나 하물며 더욱이는 어느때 언제 야 이봐 어이 여보시오 흐흐 흥 휴 헉헉 헐떡헐떡 영차 여차 어기여차 끙끙 아야 앗 아야 콸콸 졸졸 좍좍 뚝뚝 주룩주룩 솨 우르르 그래도 또 그리고 바꾸어말하면 바꾸어말하자면 혹은 혹시 답다 및 그에 따르는 때가 되어 즉 지든지 설령 가령 하더라도 할지라도 일지라도 지든지 몇 거의 하마터면 인젠 이젠 된바에야 된이상 만큼\t어찌됏든 그위에 게다가 점에서 보아 비추어 보아 고려하면 하게될것이다 일것이다 비교적 좀 보다더 비하면 시키다 하게하다 할만하다 의해서 연이서 이어서 잇따라 뒤따라 뒤이어 결국 의지하여 기대여 통하여 자마자 더욱더 불구하고 얼마든지 마음대로 주저하지 않고 곧 즉시 바로 당장 하자마자 밖에 안된다 하면된다 그래 그렇지 요컨대 다시 말하자면 바꿔 말하면 즉 구체적으로 말하자면 시작하여 시초에 이상 허 헉 허걱 바와같이 해도좋다 해도된다 게다가 더구나 하물며 와르르 팍 퍽 펄렁 동안 이래 하고있었다 이었다 에서 로부터 까지 예하면 했어요 해요 함께 같이 더불어 마저 마저도 양자 모두 습니다 가까스로 하려고하다 즈음하여 다른 다른 방면으로 해봐요 습니까 했어요 말할것도 없고 무릎쓰고 개의치않고 하는것만 못하다 하는것이 낫다 매 매번 들 모 어느것 어느 로써 갖고말하자면 어디 어느쪽 어느것 어느해 어느 년도 라 해도 언젠가 어떤것 어느것 저기 저쪽 저것 그때 그럼 그러면 요만한걸 그래 그때 저것만큼 그저 이르기까지 할 줄 안다 할 힘이 있다 너 너희 당신 어찌 설마 차라리 할지언정 할지라도 할망정 할지언정 구토하다 게우다 토하다 메쓰겁다 옆사람 퉤 쳇 의거하여 근거하여 의해 따라 힘입어 그 다음 버금 두번째로 기타 첫번째로 나머지는 그중에서 견지에서 형식으로 쓰여 입장에서 위해서 단지 의해되다 하도록시키다 뿐만아니라 반대로 전후 전자 앞의것 잠시 잠깐 하면서 그렇지만 다음에 그러한즉 그런즉 남들 아무거나 어찌하든지 같다 비슷하다 예컨대 이럴정도로 어떻게 만약 만일 위에서 서술한바와같이 인 듯하다 하지 않는다면 만약에 무엇 무슨 어느 어떤 아래윗 조차 한데 그럼에도 불구하고 여전히 심지어 까지도 조차도 하지 않도록 않기 위하여 때 시각 무렵 시간 동안 어때 어떠한 하여금 네 예 우선 누구 누가 알겠는가 아무도 줄은모른다 줄은 몰랏다 하는 김에 겸사겸사 하는바 그런 까닭에 한 이유는 그러니 그러니까 때문에 그 너희 그들 너희들 타인 것 것들 너 위하여 공동으로 동시에 하기 위하여 어찌하여 무엇때문에 붕붕 윙윙 나 우리 엉엉 휘익 윙윙 오호 아하 어쨋든 만 못하다\t하기보다는 차라리 하는 편이 낫다 흐흐 놀라다 상대적으로 말하자면 마치 아니라면 쉿 그렇지 않으면 그렇지 않다면 안 그러면 아니었다면 하든지 아니면 이라면 좋아 알았어 하는것도 그만이다 어쩔수 없다 하나 일 일반적으로 일단 한켠으로는 오자마자 이렇게되면 이와같다면 전부 한마디 한항목 근거로 하기에 아울러 하지 않도록 않기 위해서 이르기까지 이 되다 로 인하여 까닭으로 이유만으로 이로 인하여 그래서 이 때문에 그러므로 그런 까닭에 알 수 있다 결론을 낼 수 있다 으로 인하여 있다 어떤것 관계가 있다 관련이 있다 연관되다 어떤것들 에 대해 이리하여 그리하여 여부 하기보다는 하느니 하면 할수록 운운 이러이러하다 하구나 하도다 다시말하면 다음으로 에 있다 에 달려 있다 우리 우리들 오히려 하기는한데 어떻게 어떻해 어찌됏어 어때 어째서 본대로 자 이 이쪽 여기 이것 이번 이렇게말하자면 이런 이러한 이와 같은 요만큼 요만한 것 얼마 안 되는 것 이만큼 이 정도의 이렇게 많은 것 이와 같다 이때 이렇구나 것과 같이 끼익 삐걱 따위 와 같은 사람들 부류의 사람들 왜냐하면 중의하나 오직 오로지 에 한하다 하기만 하면 도착하다 까지 미치다 도달하다 정도에 이르다 할 지경이다 결과에 이르다 관해서는 여러분 하고 있다 한 후 혼자 자기 자기집 자신 우에 종합한것과같이 총적으로 보면 총적으로 말하면 총적으로 대로 하다 으로서 참 그만이다 할 따름이다 쿵 탕탕 쾅쾅 둥둥 봐 봐라 아이야 아니 와아 응 아이 참나 년 월 일 령 영 일 이 삼 사 오 육 륙 칠 팔 구 이천육 이천칠 이천팔 이천구 하나 둘 셋 넷 다섯 여섯 일곱 여덟 아홉 령 영".split(" ")),e.Pipeline.registerFunction(e.ko.stopWordFilter,"stopWordFilter-ko"),e.ko.stemmer=function(){return function(e){return"function"==typeof e.update?e.update(function(e){return 
e}):e}}(),e.Pipeline.registerFunction(e.ko.stemmer,"stemmer-ko")}}); -------------------------------------------------------------------------------- /finnlp/benchmarks/fiqa.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | warnings.filterwarnings("ignore") 3 | 4 | from sklearn.metrics import accuracy_score,f1_score 5 | from datasets import load_dataset 6 | from tqdm import tqdm 7 | import datasets 8 | import torch 9 | 10 | def format_example(example: dict) -> dict: 11 | context = f"Instruction: {example['instruction']}\n" 12 | if example.get("input"): 13 | context += f"Input: {example['input']}\n" 14 | context += "Answer: " 15 | target = example["output"] 16 | return {"context": context, "target": target} 17 | 18 | def add_instructions(x): 19 | if x.format == "post": 20 | return "What is the sentiment of this tweet? Please choose an answer from {negative/neutral/positive}." 21 | else: 22 | return "What is the sentiment of this news? Please choose an answer from {negative/neutral/positive}." 23 | 24 | def make_label(x): 25 | if x < - 0.1: return "negative" 26 | elif x >=-0.1 and x < 0.1: return "neutral" 27 | elif x >= 0.1: return "positive" 28 | 29 | def change_target(x): 30 | if 'positive' in x or 'Positive' in x: 31 | return 'positive' 32 | elif 'negative' in x or 'Negative' in x: 33 | return 'negative' 34 | else: 35 | return 'neutral' 36 | 37 | def test_fiqa(model, tokenizer, batch_size = 8, prompt_fun = None ): 38 | dataset = load_dataset('pauri32/fiqa-2018') 39 | dataset = datasets.concatenate_datasets([dataset["train"], dataset["validation"] ,dataset["test"] ]) 40 | dataset = dataset.train_test_split(0.226, seed = 42)['test'] 41 | dataset = dataset.to_pandas() 42 | dataset["output"] = dataset.sentiment_score.apply(make_label) 43 | if prompt_fun is None: 44 | dataset["instruction"] = "What is the sentiment of this news? Please choose an answer from {negative/neutral/positive}." 45 | else: 46 | dataset["instruction"] = dataset.apply(prompt_fun, axis = 1) 47 | 48 | dataset = dataset[['sentence', 'output',"instruction"]] 49 | dataset.columns = ["input", "output","instruction"] 50 | dataset[["context","target"]] = dataset.apply(format_example, axis = 1, result_type="expand") 51 | 52 | # print example 53 | print(f"\n\nPrompt example:\n{dataset['context'][0]}\n\n") 54 | 55 | context = dataset['context'].tolist() 56 | total_steps = dataset.shape[0]//batch_size + 1 57 | print(f"Total len: {len(context)}. Batchsize: {batch_size}. 
Total steps: {total_steps}") 58 | 59 | out_text_list = [] 60 | 61 | for i in tqdm(range(total_steps)): 62 | tmp_context = context[i* batch_size:(i+1)* batch_size] 63 | tokens = tokenizer(tmp_context, return_tensors='pt', padding=True, max_length=512) 64 | # tokens.pop('token_type_ids') 65 | for k in tokens.keys(): 66 | tokens[k] = tokens[k].cuda() 67 | 68 | res = model.generate(**tokens, max_length=512) 69 | res_sentences = [tokenizer.decode(i) for i in res] 70 | out_text = [o.split("Answer: ")[1] for o in res_sentences] 71 | out_text_list += out_text 72 | torch.cuda.empty_cache() 73 | 74 | dataset["out_text"] = out_text_list 75 | dataset["new_target"] = dataset["target"].apply(change_target) 76 | dataset["new_out"] = dataset["out_text"].apply(change_target) 77 | 78 | acc = accuracy_score(dataset["new_target"], dataset["new_out"]) 79 | f1_macro = f1_score(dataset["new_target"], dataset["new_out"], average = "macro") 80 | f1_micro = f1_score(dataset["new_target"], dataset["new_out"], average = "micro") 81 | f1_weighted = f1_score(dataset["new_target"], dataset["new_out"], average = "weighted") 82 | 83 | print(f"Acc: {acc}. F1 macro: {f1_macro}. F1 micro: {f1_micro}. F1 weighted (BloombergGPT): {f1_weighted}. ") 84 | 85 | return dataset -------------------------------------------------------------------------------- /finnlp/data_sources/social_media/reddit_streaming.py: -------------------------------------------------------------------------------- 1 | from finnlp.data_sources.social_media._base import Social_Media_Downloader 2 | 3 | from tqdm import tqdm 4 | from lxml import etree 5 | import requests 6 | import pandas as pd 7 | import json 8 | import base64 9 | 10 | class Reddit_Streaming(Social_Media_Downloader): 11 | 12 | def __init__(self, args = {}): 13 | super().__init__(args) 14 | self.dataframe = pd.DataFrame() 15 | 16 | def download_streaming_all(self, rounds = 3): 17 | # Download the first page by url 18 | base_url = "https://www.reddit.com/r/wallstreetbets/new/" 19 | pbar = tqdm(total= rounds, desc= "Downloading by pages...") 20 | res = self._request_get(base_url) 21 | if res is None: 22 | raise ConnectionError 23 | 24 | # get the info from init page 25 | html = etree.HTML(res.text) 26 | init = html.xpath("//*[@id='data']/text()")[0] 27 | init = json.loads(init[14:][:-1]) 28 | init = init["posts"]["models"] 29 | tmp_df = pd.DataFrame(init).T.reset_index(drop = True) 30 | self.dataframe = tmp_df 31 | init = [i for i in init if len(i)< 12] 32 | last_id = init[-1] 33 | last_id = self._encode_base64(last_id) 34 | 35 | pbar.update(1) 36 | 37 | # fetch other pages 38 | if rounds > 1: 39 | for _ in range(1,rounds): 40 | last_id = self._fatch_other_pages(last_id, pbar) 41 | 42 | def _fatch_other_pages(self, last_page, pbar): 43 | url = 'https://gql.reddit.com/' 44 | headers = { 45 | "referer":"https://www.reddit.com/", 46 | "authorization": "Bearer -twjFZkBAlpR8gZnZqsGHvz-G5c49PA", 47 | "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36" 48 | } 49 | data = { 50 | "id": "02e3b6d0d0d7", 51 | "variables": { 52 | "name": "wallstreetbets", 53 | "includeIdentity": False, 54 | "adContext": { 55 | "layout": "CARD", 56 | "clientSignalSessionData": { 57 | "adsSeenCount": 4, 58 | "totalPostsSeenCount": 79, 59 | "sessionStartTime": "2023-04-07T15:32:13.933Z", 60 | } 61 | }, 62 | "isFake": False, 63 | "includeAppliedFlair": False, 64 | "includeDevPlatformMetadata": True, 65 | "includeRecents": False, 66 | "includeTrending": 
False, 67 | "includeSubredditRankings": True, 68 | "includeSubredditChannels": False, 69 | "isAdHocMulti": False, 70 | "isAll": False, 71 | "isLoggedOutGatedOptedin": False, 72 | "isLoggedOutQuarantineOptedin": False, 73 | "isPopular": False, 74 | "recentPostIds": [], 75 | "subredditNames": [], 76 | "sort": "NEW", 77 | "pageSize": 25, 78 | "after": last_page 79 | } 80 | } 81 | response = self._request_post(url = url, headers= headers, json = data) 82 | data = json.loads(response.text) 83 | data = data["data"]["subredditInfoByName"]["elements"]["edges"] 84 | for d in data: 85 | if d["node"]["__typename"] == "SubredditPost": 86 | tmp = pd.DataFrame(d).T 87 | self.dataframe = pd.concat([self.dataframe, tmp]) 88 | last_id = tmp.id.values[0] 89 | 90 | last_id = self._encode_base64(last_id) 91 | pbar.update(1) 92 | 93 | return last_id 94 | 95 | def _encode_base64(self,id): 96 | return base64.b64encode(id.encode('utf-8')).decode() -------------------------------------------------------------------------------- /finnlp/data_sources/social_media/twitter_date_range.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | warnings.filterwarnings("ignore") 3 | 4 | from finnlp.data_sources.social_media._base import Social_Media_Downloader 5 | 6 | import requests 7 | from urllib import parse 8 | from tqdm import tqdm 9 | from datetime import datetime,timedelta 10 | import pandas as pd 11 | import json 12 | import time 13 | 14 | class Twitter_Date_Range(Social_Media_Downloader): 15 | 16 | def __init__(self, args = {}): 17 | super().__init__(args) 18 | self.dataframe = pd.DataFrame() 19 | 20 | def download_date_range_stock(self, start_date, end_date, stock = "AAPL"): 21 | self.date_list = pd.date_range(start_date,end_date) 22 | res = pd.DataFrame() 23 | for date in tqdm(self.date_list, desc= "Downloading by day... 
"): 24 | tmp = self._gather_one_day(date,stock) 25 | res = pd.concat([res,tmp]) 26 | 27 | res.created_at = pd.to_datetime(res.created_at) 28 | res = res.sort_values("created_at") 29 | res = res.reset_index(drop=True) 30 | # res = res.query(f"created_at >= @start_date & created_at <= @end_date") 31 | res = res[res.created_at >= start_date][res.created_at <= end_date] 32 | res = res.reset_index(drop=True) 33 | self.dataframe = res 34 | 35 | def _gather_one_day(self, date, stock = "AAPL", pbar = None ,delay = 0.01): 36 | time.sleep(delay) 37 | next_date = date + timedelta(days=1) 38 | date = datetime.strftime(date, "%Y-%m-%d") 39 | next_date = datetime.strftime(next_date, "%Y-%m-%d") 40 | 41 | url = "https://twitter.com/i/api/2/search/adaptive.json?include_profile_interstitial_type=1&include_blocking=1&include_blocked_by=1&include_followed_by=1&include_want_retweets=1&include_mute_edge=1&include_can_dm=1&include_can_media_tag=1&skip_status=1&cards_platform=Web-12&include_cards=1&include_ext_alt_text=true&include_quote_count=true&include_reply_count=1&tweet_mode=extended&include_entities=true&include_user_entities=true&include_ext_media_color=true&include_ext_media_availability=true&send_error_codes=true&simple_quoted_tweet=true&q={}&count=20&query_source=typed_query&pc=1&spelling_corrections=1&ext=mediaStats%2ChighlightedLabel%2CvoiceInfo" 42 | url_token = 'https://api.twitter.com/1.1/guest/activate.json' 43 | headers = { 44 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36', 45 | 'Accept': '*/*', 46 | 'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2', 47 | 'x-guest-token': '', 48 | 'x-twitter-client-language': 'zh-cn', 49 | 'x-twitter-active-user': 'yes', 50 | 'x-csrf-token': '25ea9d09196a6ba850201d47d7e75733', 51 | 'Sec-Fetch-Dest': 'empty', 52 | 'Sec-Fetch-Mode': 'cors', 53 | 'Sec-Fetch-Site': 'same-origin', 54 | 'authorization': 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA', 55 | 'Referer': 'https://twitter.com/', 56 | 'Connection': 'keep-alive', 57 | } 58 | 59 | q = f'{stock} until:{next_date} since:{date}' 60 | token = json.loads(requests.post(url_token, headers = headers).text)['guest_token'] 61 | print(token) 62 | headers['x-guest-token'] = token 63 | url = url.format(parse.quote(q)) 64 | print(url) 65 | res = self._request_get(url, headers = headers) 66 | print(res) 67 | if res is not None: 68 | try: 69 | res = json.loads(res.text) 70 | res = pd.DataFrame(res["globalObjects"]["tweets"]).T.sort_values("created_at") 71 | except: 72 | res = pd.DataFrame() 73 | else: 74 | res = pd.DataFrame() 75 | 76 | return res 77 | -------------------------------------------------------------------------------- /finnlp/data_sources/social_media/facebook_streaming.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | warnings.filterwarnings("ignore") 3 | import requests 4 | from lxml import etree 5 | from tqdm import tqdm 6 | import pandas as pd 7 | import json 8 | import time 9 | from finnlp.data_sources.social_media._base import Social_Media_Downloader 10 | 11 | # TODO: 12 | # 1. 
Better performance 13 | 14 | import json 15 | import time 16 | import numpy as np 17 | 18 | from selenium import webdriver 19 | from selenium.webdriver.common.by import By 20 | 21 | class Facebook_Streaming(Social_Media_Downloader): 22 | def __init__(self, args = {}): 23 | super().__init__(args) 24 | self.dataframe = pd.DataFrame() 25 | self.cookies = args["cookies"] 26 | self.stealth_path = args["stealth_path"] 27 | self.headless = args["headless"] if "headless" in args.keys() else True 28 | 29 | def download_streaming_stock(self, keyword = "AAPL", rounds = 3, delay = 0.5): 30 | # init 31 | self._init_opt() 32 | 33 | # search for the keyword 34 | search_url = "https://m.facebook.com/search_results/?q=" + keyword 35 | self.browser.get(search_url) 36 | 37 | # click on the posts 38 | post_element = self.browser.find_elements(By.XPATH, "/html/body/div[2]/div/div[2]/div[3]/div[1]")[0] 39 | post_element.click() 40 | time.sleep(5) 41 | 42 | # click on recent posts 43 | post_element = self.browser.find_elements(By.XPATH, "/html/body/div[2]/div/div[2]/div[3]/div[1]")[0] 44 | post_element.click() 45 | time.sleep(5) 46 | 47 | # get data 48 | all = [] 49 | title_divs = self.browser.find_elements(By.XPATH, "/html/body/div[2]/div/div[2]/div") 50 | for title_div in tqdm(title_divs): 51 | # title 52 | try: 53 | title = title_div.find_elements(By.XPATH,"./div[2]/div/div/div[2]/div/div/div/div") 54 | if len(title)>0: 55 | title = title[0].text 56 | else: 57 | title = np.nan 58 | except Exception as e: 59 | print(e) 60 | title = np.nan 61 | 62 | # time 63 | try: 64 | time_element = title_div.find_elements(By.XPATH, './div[2]/div/div/div[1]/div/div/div/div[2]/div[2]/div/span') 65 | if len(time_element)>0: 66 | time_ = time_element[0].text 67 | else: 68 | time_ = np.nan 69 | except: 70 | time_ = np.nan 71 | all.append((title, time_)) 72 | 73 | # close browser 74 | self.browser.close() 75 | 76 | tmp = pd.DataFrame(all, columns=["content", "date"]) 77 | self.dataframe = pd.concat([self.dataframe, tmp]) 78 | self.dataframe = self.dataframe.dropna(how="all") 79 | 80 | print("Only support the first page now!") 81 | 82 | 83 | def _init_opt(self): 84 | self.chromeOptions = webdriver.ChromeOptions() 85 | if self.headless: 86 | self.chromeOptions.add_argument('--headless') 87 | self.chromeOptions.add_argument('--disable-blink-features=AutomationControlled') 88 | self.chromeOptions.add_argument("--user-agent=Mozilla/5.0 (iPhone; CPU iPhone OS 14_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1") 89 | 90 | self.chromeOptions.add_experimental_option('excludeSwitches', ['enable-automation']) 91 | self.browser = webdriver.Chrome(options=self.chromeOptions) 92 | with open(self.stealth_path) as f: 93 | js = f.read() 94 | self.browser.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", { 95 | "source": js 96 | }) 97 | self.browser.get('https://m.facebook.com/') 98 | self.browser.delete_all_cookies() 99 | for i in self.cookies: 100 | self.browser.add_cookie(i) 101 | 102 | self.browser.implicitly_wait(2) 103 | 104 | -------------------------------------------------------------------------------- /finnlp/data_sources/sec_filings/main.py: -------------------------------------------------------------------------------- 1 | from finnlp.data_sources.sec_filings.sec_filings import SECExtractor 2 | import concurrent.futures 3 | import json 4 | import os 5 | import time 6 | from collections import defaultdict 7 | from typing import List 8 | 9 | class SECFilingsLoader(): 10 | """ 11 
| SEC Filings loader 12 | Get the SEC filings of multiple tickers 13 | """ 14 | 15 | def __init__( 16 | self, 17 | tickers: List[str], 18 | amount: int, 19 | filing_type: str = "10-K", 20 | num_workers: int = 2, 21 | include_amends: bool = False, 22 | folder_name:str = "data" 23 | ): 24 | assert filing_type in [ 25 | "10-K", 26 | "10-Q", 27 | ], "The supported document types are 10-K and 10-Q" 28 | 29 | self.tickers = tickers 30 | self.amount = amount 31 | self.filing_type = filing_type 32 | self.num_workers = num_workers 33 | self.include_amends = include_amends 34 | 35 | self.se = SECExtractor( 36 | tickers, amount, filing_type, include_amends=include_amends 37 | ) 38 | self.folder_name = folder_name 39 | os.makedirs(self.folder_name, exist_ok=True) 40 | 41 | def multiprocess_run(self, tic): 42 | tic_dict = self.se.get_accession_numbers(tic) 43 | text_dict = defaultdict(list) 44 | for tic, fields in tic_dict.items(): 45 | os.makedirs(f"{self.folder_name}/{tic}", exist_ok=True) 46 | print(f"Started for {tic}") 47 | 48 | field_urls = [field["url"] for field in fields] 49 | years = [field["year"] for field in fields] 50 | with concurrent.futures.ProcessPoolExecutor( 51 | max_workers=self.num_workers 52 | ) as executor: 53 | results = executor.map(self.se.get_text_from_url, field_urls) 54 | for idx, res in enumerate(results): 55 | all_text, filing_type = res 56 | text_dict[tic].append( 57 | { 58 | "year": years[idx], 59 | "ticker": tic, 60 | "all_texts": all_text, 61 | "filing_type": filing_type, 62 | } 63 | ) 64 | return text_dict 65 | 66 | def load_data(self): 67 | start = time.time() 68 | thread_workers = min(len(self.tickers), self.num_workers) 69 | with concurrent.futures.ThreadPoolExecutor( 70 | max_workers=thread_workers 71 | ) as executor: 72 | results = executor.map(self.multiprocess_run, self.tickers) 73 | 74 | for res in results: 75 | curr_tic = list(res.keys())[0] 76 | for data in res[curr_tic]: 77 | curr_year = data["year"] 78 | curr_filing_type = data["filing_type"] 79 | if curr_filing_type in ["10-K/A", "10-Q/A"]: 80 | curr_filing_type = curr_filing_type.replace("/", "") 81 | if curr_filing_type in ["10-K", "10-KA"]: 82 | os.makedirs(f"{self.folder_name}/{curr_tic}/{curr_year}", exist_ok=True) 83 | with open( 84 | f"{self.folder_name}/{curr_tic}/{curr_year}/{curr_filing_type}.json", "w" 85 | ) as f: 86 | json.dump(data, f, indent=4) 87 | elif curr_filing_type in ["10-Q", "10-QA"]: 88 | os.makedirs(f"{self.folder_name}/{curr_tic}/{curr_year[:-2]}", exist_ok=True) 89 | with open( 90 | f"{self.folder_name}/{curr_tic}/{curr_year[:-2]}/{curr_filing_type}_{curr_year[-2:]}.json", 91 | "w", 92 | ) as f: 93 | json.dump(data, f, indent=4) 94 | print( 95 | f"Done for {curr_tic} for document {curr_filing_type} and year" 96 | f" {curr_year}" 97 | ) 98 | 99 | print(f"It took {round(time.time()-start,2)} seconds") 100 | -------------------------------------------------------------------------------- /finnlp/data_engineering/data_cleaning.py: -------------------------------------------------------------------------------- 1 | import re 2 | import os 3 | from transformers import BertTokenizer 4 | from datasketch import MinHash, MinHashLSH 5 | from nltk import ngrams 6 | 7 | # junk data 8 | def junk_eliminate(df, re_expression = r'[&#<>{}\[\]\\]', threshold=0.01, min_len=10): 9 | RE_SUSPICIOUS = re.compile(re_expression) 10 | def impurity(text, min_len=min_len): 11 | """returns the share of suspicious characters in a text""" 12 | if text == None or len(text) < min_len: 13 | return 0 14 | else: 
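            # impurity = number of characters matching the suspicious pattern divided by
            # the total number of characters in the text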
15 |             return len(RE_SUSPICIOUS.findall(text))/len(text)
16 |     df['impurity'] = df['text'].apply(impurity, min_len=min_len)
17 |     total_num_docs = len(df)
18 |     impurity_num_docs = len(df[df['impurity'] >= threshold])
19 |     impurity_ratio = impurity_num_docs / total_num_docs
20 |     purity_df = df[df['impurity'] < threshold]
21 |     return purity_df, impurity_ratio
22 | 
23 | # Biased Content
24 | def toxic_eliminate(df, l_kind='en'):
25 |     '''
26 |     l_kind = ['en', 'zh']
27 |     '''
28 |     os.system(f"wget https://raw.githubusercontent.com/LDNOOBW/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words/master/{l_kind}")
29 |     with open(f'./{l_kind}', 'r') as f:
30 |         lines = f.readlines()
31 |     banned_words = set([line.rstrip('\n') for line in lines])
32 |     df['banned_words_in_text'] = df['text'].apply(lambda text: [word for word in banned_words if word in text.lower().split()])
33 |     df['matches'] = df['banned_words_in_text'].apply(lambda words: len(words) > 0)
34 |     total_num_docs = len(df)
35 |     biased_num_docs = df['matches'].sum()
36 |     biased_content_ratio = biased_num_docs / total_num_docs
37 |     non_toxic_df = df[df['matches'] == 0]
38 |     return non_toxic_df, biased_content_ratio
39 | 
40 | # Too Short Document
41 | def short_eliminate(df, tokenizer = BertTokenizer.from_pretrained('bert-base-uncased'), min_len=100):
42 |     # Create a new column with the number of tokens for each text
43 |     df['text_length'] = df['text'].apply(lambda text: len(tokenizer.tokenize(text)))
44 |     total_num_docs = len(df)
45 |     too_short_docs = len(df[df['text_length'] <= min_len])
46 |     too_short_doc_ratio = too_short_docs / total_num_docs
47 |     not_short_df = df[df['text_length'] > min_len]
48 |     return not_short_df, too_short_doc_ratio
49 | 
50 | # Contamination
51 | def process_data(df):
52 |     minhashes = {}
53 |     for idx, text in enumerate(df['text']):
54 |         minhash = MinHash(num_perm=128)
55 |         for d in ngrams(text, 13):
56 |             s = "".join(d).encode('utf-8')
57 |             minhash.update(s)
58 |         minhashes[idx] = minhash
59 |     return minhashes
60 | 
61 | def contamination_eliminate(train_dataset, test_dataset):
62 |     train_minhashes = process_data(train_dataset)
63 |     test_minhashes = process_data(test_dataset)
64 | 
65 | 
66 |     lsh = MinHashLSH(threshold=0.8, num_perm=128)
67 | 
68 |     for idx, minhash in train_minhashes.items():
69 |         lsh.insert(idx, minhash)
70 | 
71 |     duplicates_count = 0
72 |     for idx, minhash in test_minhashes.items():
73 |         result = lsh.query(minhash)
74 |         if len(result) > 0:
75 |             duplicates_count += 1
76 |     contamination_ratio = duplicates_count / len(test_dataset)
77 |     return contamination_ratio
78 | 
79 | # Duplication
80 | def duplication_eliminate(df):
81 |     lsh = MinHashLSH(threshold=0.85, num_perm=128)
82 |     for i, text in enumerate(df['text']):
83 |         minhash = MinHash(num_perm=128)
84 |         for word in text.split():
85 |             minhash.update(word.encode('utf-8'))
86 |         lsh.insert(str(i), minhash)
87 | 
88 |     unique_documents = set()
89 | 
90 |     for i, text in enumerate(df['text']):
91 |         query_minhash = MinHash(num_perm=128)
92 |         for word in text.split():
93 |             query_minhash.update(word.encode('utf-8'))
94 |         results = lsh.query(query_minhash)
95 |         try:
96 |             unique_documents.add(results[0])
97 |         except Exception as e:
98 |             print(f'error: {e}')
99 |     total_unique_documents = len(unique_documents)
100 |     total_documents = len(df)
101 |     duplication_ratio = (total_documents - total_unique_documents) / total_documents
102 |     return unique_documents, duplication_ratio
103 | 
104 | 
105 | 
--------------------------------------------------------------------------------
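A minimal usage sketch for the cleaning helpers above, assuming the module is importable as `finnlp.data_engineering.data_cleaning` and that the corpora are pandas DataFrames with a `text` column; the toy data below is purely illustrative, and `toxic_eliminate` is left out because it fetches an external word list at call time.

import pandas as pd
from finnlp.data_engineering.data_cleaning import (
    junk_eliminate, short_eliminate, contamination_eliminate, duplication_eliminate
)

# toy corpora with the 'text' column the helpers expect
train_df = pd.DataFrame({"text": [
    "Apple shares rose three percent after the earnings call on Thursday.",
    "{}&#<<>>&& broken scrape output &&&&{}{}",
    "Markets were mixed on Friday as investors weighed the latest inflation data.",
]})
test_df = pd.DataFrame({"text": [
    "Apple shares rose three percent after the earnings call on Thursday.",
]})

purity_df, impurity_ratio = junk_eliminate(train_df)                  # drop rows dominated by suspicious characters
not_short_df, short_ratio = short_eliminate(purity_df, min_len=5)     # drop very short docs (the default BERT tokenizer is downloaded on first use)
contamination_ratio = contamination_eliminate(not_short_df, test_df)  # share of test docs that near-duplicate a training doc
unique_docs, duplication_ratio = duplication_eliminate(not_short_df)  # near-duplicate share within the training set

print(impurity_ratio, short_ratio, contamination_ratio, duplication_ratio)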
/docs/FinNLP/site/assets/javascripts/lunr/min/lunr.sv.min.js: -------------------------------------------------------------------------------- 1 | /*! 2 | * Lunr languages, `Swedish` language 3 | * https://github.com/MihaiValentin/lunr-languages 4 | * 5 | * Copyright 2014, Mihai Valentin 6 | * http://www.mozilla.org/MPL/ 7 | */ 8 | /*! 9 | * based on 10 | * Snowball JavaScript Library v0.3 11 | * http://code.google.com/p/urim/ 12 | * http://snowball.tartarus.org/ 13 | * 14 | * Copyright 2010, Oleg Mazko 15 | * http://www.mozilla.org/MPL/ 16 | */ 17 | 18 | !function(e,r){"function"==typeof define&&define.amd?define(r):"object"==typeof exports?module.exports=r():r()(e.lunr)}(this,function(){return function(e){if(void 0===e)throw new Error("Lunr is not present. Please include / require Lunr before this script.");if(void 0===e.stemmerSupport)throw new Error("Lunr stemmer support is not present. Please include / require Lunr stemmer support before this script.");e.sv=function(){this.pipeline.reset(),this.pipeline.add(e.sv.trimmer,e.sv.stopWordFilter,e.sv.stemmer),this.searchPipeline&&(this.searchPipeline.reset(),this.searchPipeline.add(e.sv.stemmer))},e.sv.wordCharacters="A-Za-zªºÀ-ÖØ-öø-ʸˠ-ˤᴀ-ᴥᴬ-ᵜᵢ-ᵥᵫ-ᵷᵹ-ᶾḀ-ỿⁱⁿₐ-ₜKÅℲⅎⅠ-ↈⱠ-ⱿꜢ-ꞇꞋ-ꞭꞰ-ꞷꟷ-ꟿꬰ-ꭚꭜ-ꭤff-stA-Za-z",e.sv.trimmer=e.trimmerSupport.generateTrimmer(e.sv.wordCharacters),e.Pipeline.registerFunction(e.sv.trimmer,"trimmer-sv"),e.sv.stemmer=function(){var r=e.stemmerSupport.Among,n=e.stemmerSupport.SnowballProgram,t=new function(){function e(){var e,r=w.cursor+3;if(o=w.limit,0<=r||r<=w.limit){for(a=r;;){if(e=w.cursor,w.in_grouping(l,97,246)){w.cursor=e;break}if(w.cursor=e,w.cursor>=w.limit)return;w.cursor++}for(;!w.out_grouping(l,97,246);){if(w.cursor>=w.limit)return;w.cursor++}o=w.cursor,o=o&&(w.limit_backward=o,w.cursor=w.limit,w.ket=w.cursor,e=w.find_among_b(u,37),w.limit_backward=r,e))switch(w.bra=w.cursor,e){case 1:w.slice_del();break;case 2:w.in_grouping_b(d,98,121)&&w.slice_del()}}function i(){var e=w.limit_backward;w.cursor>=o&&(w.limit_backward=o,w.cursor=w.limit,w.find_among_b(c,7)&&(w.cursor=w.limit,w.ket=w.cursor,w.cursor>w.limit_backward&&(w.bra=--w.cursor,w.slice_del())),w.limit_backward=e)}function s(){var e,r;if(w.cursor>=o){if(r=w.limit_backward,w.limit_backward=o,w.cursor=w.limit,w.ket=w.cursor,e=w.find_among_b(m,5))switch(w.bra=w.cursor,e){case 1:w.slice_del();break;case 2:w.slice_from("lös");break;case 3:w.slice_from("full")}w.limit_backward=r}}var a,o,u=[new r("a",-1,1),new r("arna",0,1),new r("erna",0,1),new r("heterna",2,1),new r("orna",0,1),new r("ad",-1,1),new r("e",-1,1),new r("ade",6,1),new r("ande",6,1),new r("arne",6,1),new r("are",6,1),new r("aste",6,1),new r("en",-1,1),new r("anden",12,1),new r("aren",12,1),new r("heten",12,1),new r("ern",-1,1),new r("ar",-1,1),new r("er",-1,1),new r("heter",18,1),new r("or",-1,1),new r("s",-1,2),new r("as",21,1),new r("arnas",22,1),new r("ernas",22,1),new r("ornas",22,1),new r("es",21,1),new r("ades",26,1),new r("andes",26,1),new r("ens",21,1),new r("arens",29,1),new r("hetens",29,1),new r("erns",21,1),new r("at",-1,1),new r("andet",-1,1),new r("het",-1,1),new r("ast",-1,1)],c=[new r("dd",-1,-1),new r("gd",-1,-1),new r("nn",-1,-1),new r("dt",-1,-1),new r("gt",-1,-1),new r("kt",-1,-1),new r("tt",-1,-1)],m=[new r("ig",-1,1),new r("lig",0,1),new r("els",-1,1),new r("fullt",-1,3),new r("löst",-1,2)],l=[17,65,16,1,0,0,0,0,0,0,0,0,0,0,0,0,24,0,32],d=[119,127,149],w=new n;this.setCurrent=function(e){w.setCurrent(e)},this.getCurrent=function(){return w.getCurrent()},this.stem=function(){var 
r=w.cursor;return e(),w.limit_backward=r,w.cursor=w.limit,t(),w.cursor=w.limit,i(),w.cursor=w.limit,s(),!0}};return function(e){return"function"==typeof e.update?e.update(function(e){return t.setCurrent(e),t.stem(),t.getCurrent()}):(t.setCurrent(e),t.stem(),t.getCurrent())}}(),e.Pipeline.registerFunction(e.sv.stemmer,"stemmer-sv"),e.sv.stopWordFilter=e.generateStopWordFilter("alla allt att av blev bli blir blivit de dem den denna deras dess dessa det detta dig din dina ditt du där då efter ej eller en er era ert ett från för ha hade han hans har henne hennes hon honom hur här i icke ingen inom inte jag ju kan kunde man med mellan men mig min mina mitt mot mycket ni nu när någon något några och om oss på samma sedan sig sin sina sitta själv skulle som så sådan sådana sådant till under upp ut utan vad var vara varför varit varje vars vart vem vi vid vilka vilkas vilken vilket vår våra vårt än är åt över".split(" ")),e.Pipeline.registerFunction(e.sv.stopWordFilter,"stopWordFilter-sv")}}); -------------------------------------------------------------------------------- /docs/FinNLP/site/assets/javascripts/lunr/min/lunr.da.min.js: -------------------------------------------------------------------------------- 1 | /*! 2 | * Lunr languages, `Danish` language 3 | * https://github.com/MihaiValentin/lunr-languages 4 | * 5 | * Copyright 2014, Mihai Valentin 6 | * http://www.mozilla.org/MPL/ 7 | */ 8 | /*! 9 | * based on 10 | * Snowball JavaScript Library v0.3 11 | * http://code.google.com/p/urim/ 12 | * http://snowball.tartarus.org/ 13 | * 14 | * Copyright 2010, Oleg Mazko 15 | * http://www.mozilla.org/MPL/ 16 | */ 17 | 18 | !function(e,r){"function"==typeof define&&define.amd?define(r):"object"==typeof exports?module.exports=r():r()(e.lunr)}(this,function(){return function(e){if(void 0===e)throw new Error("Lunr is not present. Please include / require Lunr before this script.");if(void 0===e.stemmerSupport)throw new Error("Lunr stemmer support is not present. 
Please include / require Lunr stemmer support before this script.");e.da=function(){this.pipeline.reset(),this.pipeline.add(e.da.trimmer,e.da.stopWordFilter,e.da.stemmer),this.searchPipeline&&(this.searchPipeline.reset(),this.searchPipeline.add(e.da.stemmer))},e.da.wordCharacters="A-Za-zªºÀ-ÖØ-öø-ʸˠ-ˤᴀ-ᴥᴬ-ᵜᵢ-ᵥᵫ-ᵷᵹ-ᶾḀ-ỿⁱⁿₐ-ₜKÅℲⅎⅠ-ↈⱠ-ⱿꜢ-ꞇꞋ-ꞭꞰ-ꞷꟷ-ꟿꬰ-ꭚꭜ-ꭤff-stA-Za-z",e.da.trimmer=e.trimmerSupport.generateTrimmer(e.da.wordCharacters),e.Pipeline.registerFunction(e.da.trimmer,"trimmer-da"),e.da.stemmer=function(){var r=e.stemmerSupport.Among,i=e.stemmerSupport.SnowballProgram,n=new function(){function e(){var e,r=f.cursor+3;if(d=f.limit,0<=r&&r<=f.limit){for(a=r;;){if(e=f.cursor,f.in_grouping(w,97,248)){f.cursor=e;break}if(f.cursor=e,e>=f.limit)return;f.cursor++}for(;!f.out_grouping(w,97,248);){if(f.cursor>=f.limit)return;f.cursor++}d=f.cursor,d=d&&(r=f.limit_backward,f.limit_backward=d,f.ket=f.cursor,e=f.find_among_b(c,32),f.limit_backward=r,e))switch(f.bra=f.cursor,e){case 1:f.slice_del();break;case 2:f.in_grouping_b(p,97,229)&&f.slice_del()}}function t(){var e,r=f.limit-f.cursor;f.cursor>=d&&(e=f.limit_backward,f.limit_backward=d,f.ket=f.cursor,f.find_among_b(l,4)?(f.bra=f.cursor,f.limit_backward=e,f.cursor=f.limit-r,f.cursor>f.limit_backward&&(f.cursor--,f.bra=f.cursor,f.slice_del())):f.limit_backward=e)}function s(){var e,r,i,n=f.limit-f.cursor;if(f.ket=f.cursor,f.eq_s_b(2,"st")&&(f.bra=f.cursor,f.eq_s_b(2,"ig")&&f.slice_del()),f.cursor=f.limit-n,f.cursor>=d&&(r=f.limit_backward,f.limit_backward=d,f.ket=f.cursor,e=f.find_among_b(m,5),f.limit_backward=r,e))switch(f.bra=f.cursor,e){case 1:f.slice_del(),i=f.limit-f.cursor,t(),f.cursor=f.limit-i;break;case 2:f.slice_from("løs")}}function o(){var e;f.cursor>=d&&(e=f.limit_backward,f.limit_backward=d,f.ket=f.cursor,f.out_grouping_b(w,97,248)?(f.bra=f.cursor,u=f.slice_to(u),f.limit_backward=e,f.eq_v_b(u)&&f.slice_del()):f.limit_backward=e)}var a,d,u,c=[new r("hed",-1,1),new r("ethed",0,1),new r("ered",-1,1),new r("e",-1,1),new r("erede",3,1),new r("ende",3,1),new r("erende",5,1),new r("ene",3,1),new r("erne",3,1),new r("ere",3,1),new r("en",-1,1),new r("heden",10,1),new r("eren",10,1),new r("er",-1,1),new r("heder",13,1),new r("erer",13,1),new r("s",-1,2),new r("heds",16,1),new r("es",16,1),new r("endes",18,1),new r("erendes",19,1),new r("enes",18,1),new r("ernes",18,1),new r("eres",18,1),new r("ens",16,1),new r("hedens",24,1),new r("erens",24,1),new r("ers",16,1),new r("ets",16,1),new r("erets",28,1),new r("et",-1,1),new r("eret",30,1)],l=[new r("gd",-1,-1),new r("dt",-1,-1),new r("gt",-1,-1),new r("kt",-1,-1)],m=[new r("ig",-1,1),new r("lig",0,1),new r("elig",1,1),new r("els",-1,1),new r("løst",-1,2)],w=[17,65,16,1,0,0,0,0,0,0,0,0,0,0,0,0,48,0,128],p=[239,254,42,3,0,0,0,0,0,0,0,0,0,0,0,0,16],f=new i;this.setCurrent=function(e){f.setCurrent(e)},this.getCurrent=function(){return f.getCurrent()},this.stem=function(){var r=f.cursor;return e(),f.limit_backward=r,f.cursor=f.limit,n(),f.cursor=f.limit,t(),f.cursor=f.limit,s(),f.cursor=f.limit,o(),!0}};return function(e){return"function"==typeof e.update?e.update(function(e){return n.setCurrent(e),n.stem(),n.getCurrent()}):(n.setCurrent(e),n.stem(),n.getCurrent())}}(),e.Pipeline.registerFunction(e.da.stemmer,"stemmer-da"),e.da.stopWordFilter=e.generateStopWordFilter("ad af alle alt anden at blev blive bliver da de dem den denne der deres det dette dig din disse dog du efter eller en end er et for fra ham han hans har havde have hende hendes her hos hun hvad hvis hvor i ikke ind jeg jer jo kunne man mange 
med meget men mig min mine mit mod ned noget nogle nu når og også om op os over på selv sig sin sine sit skal skulle som sådan thi til ud under var vi vil ville vor være været".split(" ")),e.Pipeline.registerFunction(e.da.stopWordFilter,"stopWordFilter-da")}}); -------------------------------------------------------------------------------- /docs/FinNLP/site/assets/javascripts/lunr/min/lunr.no.min.js: -------------------------------------------------------------------------------- 1 | /*! 2 | * Lunr languages, `Norwegian` language 3 | * https://github.com/MihaiValentin/lunr-languages 4 | * 5 | * Copyright 2014, Mihai Valentin 6 | * http://www.mozilla.org/MPL/ 7 | */ 8 | /*! 9 | * based on 10 | * Snowball JavaScript Library v0.3 11 | * http://code.google.com/p/urim/ 12 | * http://snowball.tartarus.org/ 13 | * 14 | * Copyright 2010, Oleg Mazko 15 | * http://www.mozilla.org/MPL/ 16 | */ 17 | 18 | !function(e,r){"function"==typeof define&&define.amd?define(r):"object"==typeof exports?module.exports=r():r()(e.lunr)}(this,function(){return function(e){if(void 0===e)throw new Error("Lunr is not present. Please include / require Lunr before this script.");if(void 0===e.stemmerSupport)throw new Error("Lunr stemmer support is not present. Please include / require Lunr stemmer support before this script.");e.no=function(){this.pipeline.reset(),this.pipeline.add(e.no.trimmer,e.no.stopWordFilter,e.no.stemmer),this.searchPipeline&&(this.searchPipeline.reset(),this.searchPipeline.add(e.no.stemmer))},e.no.wordCharacters="A-Za-zªºÀ-ÖØ-öø-ʸˠ-ˤᴀ-ᴥᴬ-ᵜᵢ-ᵥᵫ-ᵷᵹ-ᶾḀ-ỿⁱⁿₐ-ₜKÅℲⅎⅠ-ↈⱠ-ⱿꜢ-ꞇꞋ-ꞭꞰ-ꞷꟷ-ꟿꬰ-ꭚꭜ-ꭤff-stA-Za-z",e.no.trimmer=e.trimmerSupport.generateTrimmer(e.no.wordCharacters),e.Pipeline.registerFunction(e.no.trimmer,"trimmer-no"),e.no.stemmer=function(){var r=e.stemmerSupport.Among,n=e.stemmerSupport.SnowballProgram,i=new function(){function e(){var e,r=w.cursor+3;if(a=w.limit,0<=r||r<=w.limit){for(s=r;;){if(e=w.cursor,w.in_grouping(d,97,248)){w.cursor=e;break}if(e>=w.limit)return;w.cursor=e+1}for(;!w.out_grouping(d,97,248);){if(w.cursor>=w.limit)return;w.cursor++}a=w.cursor,a=a&&(r=w.limit_backward,w.limit_backward=a,w.ket=w.cursor,e=w.find_among_b(m,29),w.limit_backward=r,e))switch(w.bra=w.cursor,e){case 1:w.slice_del();break;case 2:n=w.limit-w.cursor,w.in_grouping_b(c,98,122)?w.slice_del():(w.cursor=w.limit-n,w.eq_s_b(1,"k")&&w.out_grouping_b(d,97,248)&&w.slice_del());break;case 3:w.slice_from("er")}}function t(){var e,r=w.limit-w.cursor;w.cursor>=a&&(e=w.limit_backward,w.limit_backward=a,w.ket=w.cursor,w.find_among_b(u,2)?(w.bra=w.cursor,w.limit_backward=e,w.cursor=w.limit-r,w.cursor>w.limit_backward&&(w.cursor--,w.bra=w.cursor,w.slice_del())):w.limit_backward=e)}function o(){var e,r;w.cursor>=a&&(r=w.limit_backward,w.limit_backward=a,w.ket=w.cursor,e=w.find_among_b(l,11),e?(w.bra=w.cursor,w.limit_backward=r,1==e&&w.slice_del()):w.limit_backward=r)}var s,a,m=[new r("a",-1,1),new r("e",-1,1),new r("ede",1,1),new r("ande",1,1),new r("ende",1,1),new r("ane",1,1),new r("ene",1,1),new r("hetene",6,1),new r("erte",1,3),new r("en",-1,1),new r("heten",9,1),new r("ar",-1,1),new r("er",-1,1),new r("heter",12,1),new r("s",-1,2),new r("as",14,1),new r("es",14,1),new r("edes",16,1),new r("endes",16,1),new r("enes",16,1),new r("hetenes",19,1),new r("ens",14,1),new r("hetens",21,1),new r("ers",14,1),new r("ets",14,1),new r("et",-1,1),new r("het",25,1),new r("ert",-1,3),new r("ast",-1,1)],u=[new r("dt",-1,-1),new r("vt",-1,-1)],l=[new r("leg",-1,1),new r("eleg",0,1),new r("ig",-1,1),new r("eig",2,1),new r("lig",2,1),new 
r("elig",4,1),new r("els",-1,1),new r("lov",-1,1),new r("elov",7,1),new r("slov",7,1),new r("hetslov",9,1)],d=[17,65,16,1,0,0,0,0,0,0,0,0,0,0,0,0,48,0,128],c=[119,125,149,1],w=new n;this.setCurrent=function(e){w.setCurrent(e)},this.getCurrent=function(){return w.getCurrent()},this.stem=function(){var r=w.cursor;return e(),w.limit_backward=r,w.cursor=w.limit,i(),w.cursor=w.limit,t(),w.cursor=w.limit,o(),!0}};return function(e){return"function"==typeof e.update?e.update(function(e){return i.setCurrent(e),i.stem(),i.getCurrent()}):(i.setCurrent(e),i.stem(),i.getCurrent())}}(),e.Pipeline.registerFunction(e.no.stemmer,"stemmer-no"),e.no.stopWordFilter=e.generateStopWordFilter("alle at av bare begge ble blei bli blir blitt både båe da de deg dei deim deira deires dem den denne der dere deres det dette di din disse ditt du dykk dykkar då eg ein eit eitt eller elles en enn er et ett etter for fordi fra før ha hadde han hans har hennar henne hennes her hjå ho hoe honom hoss hossen hun hva hvem hver hvilke hvilken hvis hvor hvordan hvorfor i ikke ikkje ikkje ingen ingi inkje inn inni ja jeg kan kom korleis korso kun kunne kva kvar kvarhelst kven kvi kvifor man mange me med medan meg meget mellom men mi min mine mitt mot mykje ned no noe noen noka noko nokon nokor nokre nå når og også om opp oss over på samme seg selv si si sia sidan siden sin sine sitt sjøl skal skulle slik so som som somme somt så sånn til um upp ut uten var vart varte ved vere verte vi vil ville vore vors vort vår være være vært å".split(" ")),e.Pipeline.registerFunction(e.no.stopWordFilter,"stopWordFilter-no")}}); -------------------------------------------------------------------------------- /finnlp/data_sources/news/seekingalpha_date_range.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | warnings.filterwarnings("ignore") 3 | 4 | import json 5 | import requests 6 | import pandas as pd 7 | from lxml import etree 8 | from tqdm import tqdm 9 | from datetime import datetime 10 | 11 | from finnlp.data_sources.news._base import News_Downloader 12 | 13 | class SeekingAlpha_Date_Range(News_Downloader): 14 | def __init__(self, args = {}): 15 | super().__init__(args) 16 | 17 | def download_date_range_stock(self, start_date, end_date, stock = "AAPL", proxies = None): 18 | self.dataframe = pd.DataFrame() 19 | start_timestamp = int(datetime.strptime(start_date+'-13', "%Y-%m-%d-%H").timestamp()) 20 | end_timestamp = int(datetime.strptime(end_date+'-13', "%Y-%m-%d-%H").timestamp()) 21 | # Downloading First Page 22 | data, totalpages = self._gather_by_page(start_timestamp, end_timestamp, stock, 1, proxies) 23 | self.dataframe = pd.concat([self.dataframe, data]) 24 | 25 | # Downloading Other Pages 26 | with tqdm(total=totalpages, desc= "Downloading Titles") as bar: 27 | bar.update(1) 28 | for page in range(2, totalpages+1): 29 | data,_ = self._gather_by_page(start_timestamp, end_timestamp, stock, page, proxies) 30 | self.dataframe = pd.concat([self.dataframe, data]) 31 | bar.update(1) 32 | self.dataframe = self.dataframe.reset_index(drop = True) 33 | 34 | def _gather_by_page(self, start_timestamp, end_timestamp, stock, page = 1, proxies = None): 35 | url = f"https://seekingalpha.com/api/v3/symbols/{stock}/news?filter[since]={start_timestamp}&filter[until]={end_timestamp}&id={stock}&include=author%2CprimaryTickers%2CsecondaryTickers%2Csentiments&isMounting=true&page[size]=40&page[number]={page}" 36 | headers = { 37 | 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) 
Gecko/20100101 Firefox/113.0', 38 | 'Referer':f'https://seekingalpha.com/symbol/aapl/news?from=2009-12-31T16%3A00%3A00.000Z&to=2022-01-01T15%3A59%3A59.999Z' 39 | } 40 | response = requests.get(url, headers=headers, proxies=proxies) 41 | if response.status_code != 200: 42 | print(f"stock: {stock}, page: {page} request failed!") 43 | return pd.DataFrame(), 1 44 | else: 45 | res = json.loads(response.text) 46 | data = pd.DataFrame(res["data"]) 47 | # make new features 48 | new_columns = ["publishOn", "isLockedPro", "commentCount", "gettyImageUrl", "videoPreviewUrl", "themes", "title", "isPaywalled"] 49 | data[new_columns] = data.apply(lambda x:list(x.attributes.values()), axis = 1,result_type ="expand" ) 50 | new_columns = ["author", "sentiments", "primaryTickers", "secondaryTickers", "otherTags"] 51 | data[new_columns] = data.apply(lambda x:list(x.relationships.values()), axis = 1,result_type ="expand" ) 52 | 53 | # total pages 54 | totalpages = res["meta"]["page"]["totalPages"] 55 | return data, totalpages 56 | 57 | 58 | def obtain_content(self, parallel = False, proxies = None): 59 | if parallel: 60 | import os 61 | from pandarallel import pandarallel 62 | pandarallel.initialize(nb_workers=os.cpu_count()) 63 | self.dataframe['content'] = self.dataframe.parallel_apply(lambda x: self._obtain_content(x, proxies = proxies), axis = 1) 64 | else: 65 | self.dataframe['content'] = self.dataframe.apply(lambda x: self._obtain_content(x, proxies = proxies), axis = 1) 66 | 67 | 68 | def _obtain_content(self, x, proxies = None): 69 | url = x['links']['self'] 70 | url = f"https://seekingalpha.com{url}" 71 | headers = { 72 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/118.0' 73 | } 74 | res = requests.get(url, headers=headers, proxies=proxies) 75 | if res.status_code != 200: 76 | return '' 77 | else: 78 | resp = etree.HTML(res.text) 79 | resp = resp.xpath('//script[5]//text()') 80 | resp = resp[0].split('window.SSR_DATA = ')[1] 81 | resp = resp[:-1] 82 | resp = json.loads(resp) 83 | content = resp['article']['response']['data']['attributes']['content'] 84 | content = etree.HTML(content) 85 | content = content.xpath('//text()') 86 | content = [c if c!= ' ' else '\n' for c in content] 87 | content = ''.join(content) 88 | content = content.strip() 89 | return content 90 | 91 | 92 | -------------------------------------------------------------------------------- /finnlp/utils/get_proxy.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import parsel 3 | from lxml import etree 4 | from tqdm import tqdm 5 | import time 6 | import re 7 | 8 | def check_china_ips(proxies_list): 9 | """Check which of the given proxy IPs are usable (China)""" 10 | headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36'} 11 | 12 | can_use = [] 13 | for proxy in tqdm(proxies_list, desc = "Checking ips"): 14 | try: 15 | response = requests.get('http://www.baidu.com', headers=headers, proxies=proxy, timeout=1) # a timeout raises an exception 16 | if response.status_code == 200: 17 | can_use.append(proxy) 18 | except Exception as error: 19 | # print(error) 20 | pass 21 | return can_use 22 | 23 | def check_us_ips(proxies_list): 24 | """Check which of the given proxy IPs are usable (US)""" 25 | headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36'} 26 | 27 | can_use = [] 28 | for proxy in tqdm(proxies_list, desc = "Checking ips"): 29 | try: 30 | response = 
requests.get('http://www.google.com', headers=headers, proxies=proxy, timeout=1) # a timeout raises an exception 31 | if response.status_code == 200: 32 | can_use.append(proxy) 33 | except Exception as error: 34 | # print(error) 35 | pass 36 | return can_use 37 | 38 | def get_china_free_proxy(pages = 10): 39 | proxies_list = [] 40 | for page in tqdm(range(1, pages+1), desc = "Gathering free ips by pages..."): 41 | 42 | base_url = f'https://www.kuaidaili.com/free/inha/{page}' 43 | headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36'} 44 | success = False 45 | while not success: 46 | try: 47 | response = requests.get(base_url, headers=headers) 48 | data = response.text 49 | res = etree.HTML(data) 50 | trs = res.xpath('//table/tbody/tr') 51 | if len(trs)!=0: 52 | success = True 53 | for tr in trs: 54 | proxies_dict = {} 55 | http_type = tr.xpath('./td[4]/text()')[0] 56 | ip_num = tr.xpath('./td[1]/text()')[0] 57 | port_num = tr.xpath('./td[2]/text()')[0] 58 | proxies_dict[http_type] = ip_num + ':' + port_num 59 | proxies_list.append(proxies_dict) 60 | else: 61 | time.sleep(0.01) 62 | 63 | except: 64 | pass 65 | 66 | can_use = check_china_ips(proxies_list) 67 | 68 | print(f'Get proxy ips: {len(proxies_list)}.') 69 | print(f'Usable proxy ips: {len(can_use)}.') 70 | 71 | return can_use 72 | 73 | def get_us_free_proxy(pages = 10): 74 | url = "https://openproxy.space/list/http" 75 | headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36'} 76 | response = requests.get(url, headers=headers) 77 | if response.status_code != 200: 78 | print("Connection Error. Please make sure that your computer currently has access to Google.com") 79 | res = etree.HTML(response.text) 80 | http_type = "HTTP" 81 | proxies_list = [] 82 | 83 | scripts = res.xpath("//script") 84 | content = scripts[3].xpath(".//text()") 85 | pattern = re.compile('LIST",data:(.+),added:') 86 | result_list = pattern.findall(content[0]) 87 | result_list = result_list[0].strip("[{").strip("}]").split("},{") 88 | 89 | for result in result_list: 90 | pattern = re.compile(r'\[(.+)\]') 91 | result = pattern.findall(result) 92 | result = result[0].split(",") 93 | result = [r.strip("\"") for r in result] 94 | for ip in result: 95 | proxies_list.append( 96 | {http_type: ip} 97 | ) 98 | total = pages* 15 99 | proxies_list = proxies_list[:total] 100 | can_use = check_us_ips(proxies_list) 101 | print(f'Get proxy ips: {len(proxies_list)}.') 102 | print(f'Usable proxy ips: {len(can_use)}.' 
) 103 | 104 | return can_use 105 | 106 | class Kuaidaili: 107 | def __init__(self, tunnel, username, password): 108 | self.tunnel = tunnel 109 | self.username = username 110 | self.password = password 111 | 112 | def get_kuaidaili_tunnel_proxy(self): 113 | proxies = { 114 | "http": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": self.username, "pwd": self.password, "proxy": self.tunnel}, 115 | "https": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": self.username, "pwd": self.password, "proxy": self.tunnel} 116 | } 117 | return proxies -------------------------------------------------------------------------------- /finnlp/data_sources/news/marketwatch_streaming.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from lxml import etree 3 | from tqdm import tqdm 4 | import pandas as pd 5 | import json 6 | import time 7 | from finnlp.data_sources.news._base import News_Downloader 8 | 9 | # TODO: 10 | # 1. More pages 11 | # 2. Contents 12 | 13 | 14 | class MarketWatch_Streaming(News_Downloader): 15 | 16 | def __init__(self, args={}): 17 | super().__init__(args) 18 | self.dataframe = pd.DataFrame() 19 | 20 | def download_streaming_search(self, keyword = "apple", rounds = 3, delay = 0.5): 21 | # download first page 22 | self._download_first_page(keyword, delay = delay) 23 | 24 | # download the following pages 25 | # self._download_other_pages(keyword) 26 | print("Only support the first page now!") 27 | 28 | def download_date_range_search(self, start_date , end_date, keyword = "apple", rounds = 1000, delay = 0.5): 29 | # download first page 30 | self._download_first_page(keyword, delay = delay, start_date = start_date, end_date = end_date) 31 | 32 | # download the following pages 33 | # self._download_other_pages(keyword) 34 | print("Only support the first page now!") 35 | 36 | def _download_first_page(self, keyword = "apple", delay = 0.5, start_date = None, end_date = None): 37 | url = "https://www.marketwatch.com/search" 38 | params = { 39 | 'q': keyword, 40 | 'ts': '0', 41 | 'tab': 'All News', 42 | 'sd': start_date, 43 | 'ed': end_date, 44 | } 45 | headers = { 46 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36", 47 | } 48 | 49 | res = requests.get(url = url, headers= headers, params=params) 50 | if res.status_code != 200: 51 | print(f'Connection Error: {res.status_code}') 52 | return f'Connection Error: {res.status_code}' 53 | 54 | res = etree.HTML(res.text) 55 | divs = res.xpath("body/main/div/div[2]/div[2]/div[2]/div[2]/mw-tabs/div[2]/div[1]/div/div[1]/div") 56 | titles = [] 57 | times = [] 58 | authors = [] 59 | for div in divs: 60 | # title 61 | title = div.xpath("./div/h3/a/text()") 62 | # time 63 | time_ = div.xpath("./div/div/span[1]/text()") 64 | # author 65 | author = div.xpath("./div/div/span[2]/text()") 66 | 67 | if len(title)>0: 68 | titles.append(' '.join(title).replace("\n","").strip(" ")) 69 | times.append(' '.join(time_)) 70 | authors.append(' '.join(author)) 71 | 72 | # concat results 73 | tmp = pd.DataFrame([titles, times, authors]).T 74 | tmp.columns = ["title", "time", "author"] 75 | self.dataframe = pd.concat([self.dataframe, tmp]) 76 | 77 | # sleep 78 | time.sleep(delay) 79 | 80 | 81 | 82 | 83 | class MarketWatch_Date_Range(News_Downloader): 84 | 85 | def __init__(self, args={}): 86 | super().__init__(args) 87 | self.dataframe = pd.DataFrame() 88 | 89 | def download_date_range_search(self, start_date , end_date, keyword = "apple", 
delay = 0.5): 90 | # download first page 91 | self._download_first_page(keyword, delay = delay, start_date = start_date, end_date = end_date) 92 | 93 | # download the following pages 94 | # self._download_other_pages(keyword) 95 | print("Only support the first page now!") 96 | 97 | def _download_first_page(self, keyword = "apple", delay = 0.5, start_date = None, end_date = None): 98 | url = "https://www.marketwatch.com/search" 99 | params = { 100 | 'q': keyword, 101 | 'ts': '0', 102 | 'tab': 'All News', 103 | 'sd': start_date, 104 | 'ed': end_date, 105 | } 106 | headers = { 107 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36", 108 | } 109 | 110 | res = requests.get(url = url, headers= headers, params=params) 111 | if res.status_code != 200: 112 | print(f'Connection Error: {res.status_code}') 113 | return f'Connection Error: {res.status_code}' 114 | 115 | res = etree.HTML(res.text) 116 | divs = res.xpath("body/main/div/div[2]/div[2]/div[2]/div[2]/mw-tabs/div[2]/div[1]/div/div[1]/div") 117 | titles = [] 118 | times = [] 119 | authors = [] 120 | for div in divs: 121 | # title 122 | title = div.xpath("./div/h3/a/text()") 123 | # time 124 | time_ = div.xpath("./div/div/span[1]/text()") 125 | # author 126 | author = div.xpath("./div/div/span[2]/text()") 127 | 128 | if len(title)>0: 129 | titles.append(' '.join(title).replace("\n","").strip(" ")) 130 | times.append(' '.join(time_)) 131 | authors.append(' '.join(author)) 132 | 133 | # concat results 134 | tmp = pd.DataFrame([titles, times, authors]).T 135 | tmp.columns = ["title", "time", "author"] 136 | self.dataframe = pd.concat([self.dataframe, tmp]) 137 | 138 | # sleep 139 | time.sleep(delay) 140 | -------------------------------------------------------------------------------- /docs/FinNLP/site/assets/javascripts/lunr/min/lunr.nl.min.js: -------------------------------------------------------------------------------- 1 | /*! 2 | * Lunr languages, `Dutch` language 3 | * https://github.com/MihaiValentin/lunr-languages 4 | * 5 | * Copyright 2014, Mihai Valentin 6 | * http://www.mozilla.org/MPL/ 7 | */ 8 | /*! 9 | * based on 10 | * Snowball JavaScript Library v0.3 11 | * http://code.google.com/p/urim/ 12 | * http://snowball.tartarus.org/ 13 | * 14 | * Copyright 2010, Oleg Mazko 15 | * http://www.mozilla.org/MPL/ 16 | */ 17 | 18 | !function(r,e){"function"==typeof define&&define.amd?define(e):"object"==typeof exports?module.exports=e():e()(r.lunr)}(this,function(){return function(r){if(void 0===r)throw new Error("Lunr is not present. Please include / require Lunr before this script.");if(void 0===r.stemmerSupport)throw new Error("Lunr stemmer support is not present. 
Please include / require Lunr stemmer support before this script.");r.nl=function(){this.pipeline.reset(),this.pipeline.add(r.nl.trimmer,r.nl.stopWordFilter,r.nl.stemmer),this.searchPipeline&&(this.searchPipeline.reset(),this.searchPipeline.add(r.nl.stemmer))},r.nl.wordCharacters="A-Za-zªºÀ-ÖØ-öø-ʸˠ-ˤᴀ-ᴥᴬ-ᵜᵢ-ᵥᵫ-ᵷᵹ-ᶾḀ-ỿⁱⁿₐ-ₜKÅℲⅎⅠ-ↈⱠ-ⱿꜢ-ꞇꞋ-ꞭꞰ-ꞷꟷ-ꟿꬰ-ꭚꭜ-ꭤff-stA-Za-z",r.nl.trimmer=r.trimmerSupport.generateTrimmer(r.nl.wordCharacters),r.Pipeline.registerFunction(r.nl.trimmer,"trimmer-nl"),r.nl.stemmer=function(){var e=r.stemmerSupport.Among,i=r.stemmerSupport.SnowballProgram,n=new function(){function r(){for(var r,e,i,o=C.cursor;;){if(C.bra=C.cursor,r=C.find_among(b,11))switch(C.ket=C.cursor,r){case 1:C.slice_from("a");continue;case 2:C.slice_from("e");continue;case 3:C.slice_from("i");continue;case 4:C.slice_from("o");continue;case 5:C.slice_from("u");continue;case 6:if(C.cursor>=C.limit)break;C.cursor++;continue}break}for(C.cursor=o,C.bra=o,C.eq_s(1,"y")?(C.ket=C.cursor,C.slice_from("Y")):C.cursor=o;;)if(e=C.cursor,C.in_grouping(q,97,232)){if(i=C.cursor,C.bra=i,C.eq_s(1,"i"))C.ket=C.cursor,C.in_grouping(q,97,232)&&(C.slice_from("I"),C.cursor=e);else if(C.cursor=i,C.eq_s(1,"y"))C.ket=C.cursor,C.slice_from("Y"),C.cursor=e;else if(n(e))break}else if(n(e))break}function n(r){return C.cursor=r,r>=C.limit||(C.cursor++,!1)}function o(){_=C.limit,d=_,t()||(_=C.cursor,_<3&&(_=3),t()||(d=C.cursor))}function t(){for(;!C.in_grouping(q,97,232);){if(C.cursor>=C.limit)return!0;C.cursor++}for(;!C.out_grouping(q,97,232);){if(C.cursor>=C.limit)return!0;C.cursor++}return!1}function s(){for(var r;;)if(C.bra=C.cursor,r=C.find_among(p,3))switch(C.ket=C.cursor,r){case 1:C.slice_from("y");break;case 2:C.slice_from("i");break;case 3:if(C.cursor>=C.limit)return;C.cursor++}}function u(){return _<=C.cursor}function c(){return d<=C.cursor}function a(){var r=C.limit-C.cursor;C.find_among_b(g,3)&&(C.cursor=C.limit-r,C.ket=C.cursor,C.cursor>C.limit_backward&&(C.cursor--,C.bra=C.cursor,C.slice_del()))}function l(){var r;w=!1,C.ket=C.cursor,C.eq_s_b(1,"e")&&(C.bra=C.cursor,u()&&(r=C.limit-C.cursor,C.out_grouping_b(q,97,232)&&(C.cursor=C.limit-r,C.slice_del(),w=!0,a())))}function m(){var r;u()&&(r=C.limit-C.cursor,C.out_grouping_b(q,97,232)&&(C.cursor=C.limit-r,C.eq_s_b(3,"gem")||(C.cursor=C.limit-r,C.slice_del(),a())))}function f(){var r,e,i,n,o,t,s=C.limit-C.cursor;if(C.ket=C.cursor,r=C.find_among_b(h,5))switch(C.bra=C.cursor,r){case 1:u()&&C.slice_from("heid");break;case 2:m();break;case 3:u()&&C.out_grouping_b(j,97,232)&&C.slice_del()}if(C.cursor=C.limit-s,l(),C.cursor=C.limit-s,C.ket=C.cursor,C.eq_s_b(4,"heid")&&(C.bra=C.cursor,c()&&(e=C.limit-C.cursor,C.eq_s_b(1,"c")||(C.cursor=C.limit-e,C.slice_del(),C.ket=C.cursor,C.eq_s_b(2,"en")&&(C.bra=C.cursor,m())))),C.cursor=C.limit-s,C.ket=C.cursor,r=C.find_among_b(k,6))switch(C.bra=C.cursor,r){case 1:if(c()){if(C.slice_del(),i=C.limit-C.cursor,C.ket=C.cursor,C.eq_s_b(2,"ig")&&(C.bra=C.cursor,c()&&(n=C.limit-C.cursor,!C.eq_s_b(1,"e")))){C.cursor=C.limit-n,C.slice_del();break}C.cursor=C.limit-i,a()}break;case 2:c()&&(o=C.limit-C.cursor,C.eq_s_b(1,"e")||(C.cursor=C.limit-o,C.slice_del()));break;case 3:c()&&(C.slice_del(),l());break;case 4:c()&&C.slice_del();break;case 5:c()&&w&&C.slice_del()}C.cursor=C.limit-s,C.out_grouping_b(z,73,232)&&(t=C.limit-C.cursor,C.find_among_b(v,4)&&C.out_grouping_b(q,97,232)&&(C.cursor=C.limit-t,C.ket=C.cursor,C.cursor>C.limit_backward&&(C.cursor--,C.bra=C.cursor,C.slice_del())))}var d,_,w,b=[new e("",-1,6),new e("á",0,1),new e("ä",0,1),new 
e("é",0,2),new e("ë",0,2),new e("í",0,3),new e("ï",0,3),new e("ó",0,4),new e("ö",0,4),new e("ú",0,5),new e("ü",0,5)],p=[new e("",-1,3),new e("I",0,2),new e("Y",0,1)],g=[new e("dd",-1,-1),new e("kk",-1,-1),new e("tt",-1,-1)],h=[new e("ene",-1,2),new e("se",-1,3),new e("en",-1,2),new e("heden",2,1),new e("s",-1,3)],k=[new e("end",-1,1),new e("ig",-1,2),new e("ing",-1,1),new e("lijk",-1,3),new e("baar",-1,4),new e("bar",-1,5)],v=[new e("aa",-1,-1),new e("ee",-1,-1),new e("oo",-1,-1),new e("uu",-1,-1)],q=[17,65,16,1,0,0,0,0,0,0,0,0,0,0,0,0,128],z=[1,0,0,17,65,16,1,0,0,0,0,0,0,0,0,0,0,0,0,128],j=[17,67,16,1,0,0,0,0,0,0,0,0,0,0,0,0,128],C=new i;this.setCurrent=function(r){C.setCurrent(r)},this.getCurrent=function(){return C.getCurrent()},this.stem=function(){var e=C.cursor;return r(),C.cursor=e,o(),C.limit_backward=e,C.cursor=C.limit,f(),C.cursor=C.limit_backward,s(),!0}};return function(r){return"function"==typeof r.update?r.update(function(r){return n.setCurrent(r),n.stem(),n.getCurrent()}):(n.setCurrent(r),n.stem(),n.getCurrent())}}(),r.Pipeline.registerFunction(r.nl.stemmer,"stemmer-nl"),r.nl.stopWordFilter=r.generateStopWordFilter(" aan al alles als altijd andere ben bij daar dan dat de der deze die dit doch doen door dus een eens en er ge geen geweest haar had heb hebben heeft hem het hier hij hoe hun iemand iets ik in is ja je kan kon kunnen maar me meer men met mij mijn moet na naar niet niets nog nu of om omdat onder ons ook op over reeds te tegen toch toen tot u uit uw van veel voor want waren was wat werd wezen wie wil worden wordt zal ze zelf zich zij zijn zo zonder zou".split(" ")),r.Pipeline.registerFunction(r.nl.stopWordFilter,"stopWordFilter-nl")}}); -------------------------------------------------------------------------------- /finnlp/data_sources/company_announcement/juchao.py: -------------------------------------------------------------------------------- 1 | from finnlp.data_sources.company_announcement._base import Company_Announcement_Downloader 2 | 3 | import requests 4 | import time 5 | import json 6 | import os 7 | import pandas as pd 8 | from tqdm import tqdm 9 | from PyPDF2 import PdfReader 10 | 11 | class Juchao_Announcement(Company_Announcement_Downloader): 12 | 13 | def __init__(self, args = {}): 14 | super().__init__(args) 15 | self.dataframe = pd.DataFrame() 16 | 17 | def download_date_range_stock(self,start_date, end_date, stock = "000001",max_page = 100, searchkey= "", get_content = False, save_dir = "./tmp/" , delate_pdf = False): 18 | self.org_dict = self._get_orgid() 19 | 20 | # download the first page 21 | res = self._get_open_page(start_date, end_date, stock, 1, searchkey) 22 | total_pages = res["totalpages"]+1 23 | 24 | if res["announcements"] is None: 25 | print(f"Nothing related to your searchkey({searchkey}) is found, you may try another one or just leave it blank") 26 | else: 27 | tmp_df = self._process_data(res) 28 | self.dataframe = pd.concat([self.dataframe, tmp_df]) 29 | 30 | page = 2 31 | # download other page 32 | pbar = tqdm(total=total_pages,desc="Downloading by page...") 33 | 34 | for _ in range(max_page): 35 | res = self._get_open_page(start_date, end_date, stock, page, searchkey) 36 | if res["announcements"] is None: 37 | break 38 | tmp_df = self._process_data(res) 39 | self.dataframe = pd.concat([self.dataframe, tmp_df]) 40 | pbar.update(1) 41 | page += 1 42 | pbar.update(1) 43 | # Convert Time 44 | self.dataframe.announcementTime = self.dataframe.announcementTime.apply(lambda x:time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(x/1000))) 
45 | self.dataframe.announcementTime = pd.to_datetime(self.dataframe.announcementTime) 46 | 47 | if get_content: 48 | pbar = tqdm(total=self.dataframe.shape[0], desc="Getting the text data...") 49 | self.dataframe[["PDF_path","Content"]] = self.dataframe.apply(lambda x: self._get_pdfs(x,save_dir, delate_pdf, pbar),axis= 1,result_type = "expand") 50 | if delate_pdf: 51 | os.removedirs(save_dir) 52 | 53 | self.dataframe = self.dataframe.reset_index(drop = True) 54 | 55 | def _get_open_page(self,start_date,end_date, stock,page, searchkey): 56 | url = "http://www.cninfo.com.cn/new/hisAnnouncement/query?" 57 | headers = { 58 | "Referer": "http://www.cninfo.com.cn/new/commonUrl/pageOfSearch?url=disclosure/list/search&lastPage=index", 59 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36", 60 | } 61 | data = { 62 | "pageNum": page, 63 | "pageSize": "30", 64 | "column": "szse", 65 | "tabName": "fulltext", 66 | "plate":"", 67 | "stock":stock + "," + self.org_dict[stock] , 68 | "searchkey": searchkey, 69 | "secid":"", 70 | "category":"", 71 | "trade":"", 72 | "seDate": f"{start_date}~{end_date}", 73 | "sortName": "", 74 | "sortType": "", 75 | "isHLtitle": "true", 76 | } 77 | res = requests.post(url = url, headers = headers, data = data) 78 | if res.status_code != 200: 79 | raise ConnectionError 80 | 81 | res = json.loads(res.text) 82 | return res 83 | 84 | def _process_data(self,res): 85 | if res is None: 86 | return res 87 | else: 88 | return pd.DataFrame(res["announcements"]) 89 | 90 | def _get_pdfs(self,x, save_dir, delate_pdf,pbar): 91 | os.makedirs(save_dir, exist_ok= True) 92 | adjunctUrl = x.adjunctUrl 93 | pdf_base_url = "http://static.cninfo.com.cn/" 94 | pdf_url = pdf_base_url + adjunctUrl 95 | responsepdf = self._request_get(pdf_url) 96 | 97 | 98 | if responsepdf is None: 99 | pbar.update(1) 100 | return ("Failed Download","Failed Download") 101 | 102 | else: 103 | # make preparations 104 | file_name = x.announcementTitle 105 | file_name = "".join(file_name.split("<em>")) # strip the <em> highlight tags around matched keywords 106 | file_name = "".join(file_name.split("</em>")) 107 | file_name 108 | file_name = f"{x.secCode}_{x.secName}_{file_name}.pdf" 109 | file_path = os.path.join(save_dir, file_name) 110 | 111 | # save pdf 112 | with open(file_path, "wb") as f: 113 | f.write(responsepdf.content) 114 | 115 | # analyze pdf 116 | with open(file_path, "rb") as filehandle: 117 | pdf = PdfReader(filehandle) 118 | text_all = "" 119 | for page in pdf.pages: 120 | text = page.extract_text() 121 | text = "".join(text.split("\n")) 122 | text_all += text 123 | pbar.update(1) 124 | 125 | if delate_pdf: 126 | os.remove(file_path) 127 | return ("removed", text_all) 128 | else: 129 | return (file_path, text_all) 130 | 131 | def _get_orgid(self): 132 | org_dict = {} 133 | org_json = self._request_get("http://www.cninfo.com.cn/new/data/szse_stock.json").json()["stockList"] 134 | 135 | for i in range(len(org_json)): 136 | org_dict[org_json[i]["code"]] = org_json[i]["orgId"] 137 | 138 | return org_dict -------------------------------------------------------------------------------- /docs/FinNLP/site/assets/javascripts/lunr/min/lunr.de.min.js: -------------------------------------------------------------------------------- 1 | /*! 2 | * Lunr languages, `German` language 3 | * https://github.com/MihaiValentin/lunr-languages 4 | * 5 | * Copyright 2014, Mihai Valentin 6 | * http://www.mozilla.org/MPL/ 7 | */ 8 | /*! 
9 | * based on 10 | * Snowball JavaScript Library v0.3 11 | * http://code.google.com/p/urim/ 12 | * http://snowball.tartarus.org/ 13 | * 14 | * Copyright 2010, Oleg Mazko 15 | * http://www.mozilla.org/MPL/ 16 | */ 17 | 18 | !function(e,r){"function"==typeof define&&define.amd?define(r):"object"==typeof exports?module.exports=r():r()(e.lunr)}(this,function(){return function(e){if(void 0===e)throw new Error("Lunr is not present. Please include / require Lunr before this script.");if(void 0===e.stemmerSupport)throw new Error("Lunr stemmer support is not present. Please include / require Lunr stemmer support before this script.");e.de=function(){this.pipeline.reset(),this.pipeline.add(e.de.trimmer,e.de.stopWordFilter,e.de.stemmer),this.searchPipeline&&(this.searchPipeline.reset(),this.searchPipeline.add(e.de.stemmer))},e.de.wordCharacters="A-Za-zªºÀ-ÖØ-öø-ʸˠ-ˤᴀ-ᴥᴬ-ᵜᵢ-ᵥᵫ-ᵷᵹ-ᶾḀ-ỿⁱⁿₐ-ₜKÅℲⅎⅠ-ↈⱠ-ⱿꜢ-ꞇꞋ-ꞭꞰ-ꞷꟷ-ꟿꬰ-ꭚꭜ-ꭤff-stA-Za-z",e.de.trimmer=e.trimmerSupport.generateTrimmer(e.de.wordCharacters),e.Pipeline.registerFunction(e.de.trimmer,"trimmer-de"),e.de.stemmer=function(){var r=e.stemmerSupport.Among,n=e.stemmerSupport.SnowballProgram,i=new function(){function e(e,r,n){return!(!v.eq_s(1,e)||(v.ket=v.cursor,!v.in_grouping(p,97,252)))&&(v.slice_from(r),v.cursor=n,!0)}function i(){for(var r,n,i,s,t=v.cursor;;)if(r=v.cursor,v.bra=r,v.eq_s(1,"ß"))v.ket=v.cursor,v.slice_from("ss");else{if(r>=v.limit)break;v.cursor=r+1}for(v.cursor=t;;)for(n=v.cursor;;){if(i=v.cursor,v.in_grouping(p,97,252)){if(s=v.cursor,v.bra=s,e("u","U",i))break;if(v.cursor=s,e("y","Y",i))break}if(i>=v.limit)return void(v.cursor=n);v.cursor=i+1}}function s(){for(;!v.in_grouping(p,97,252);){if(v.cursor>=v.limit)return!0;v.cursor++}for(;!v.out_grouping(p,97,252);){if(v.cursor>=v.limit)return!0;v.cursor++}return!1}function t(){m=v.limit,l=m;var e=v.cursor+3;0<=e&&e<=v.limit&&(d=e,s()||(m=v.cursor,m=v.limit)return;v.cursor++}}}function c(){return m<=v.cursor}function u(){return l<=v.cursor}function a(){var e,r,n,i,s=v.limit-v.cursor;if(v.ket=v.cursor,(e=v.find_among_b(w,7))&&(v.bra=v.cursor,c()))switch(e){case 1:v.slice_del();break;case 2:v.slice_del(),v.ket=v.cursor,v.eq_s_b(1,"s")&&(v.bra=v.cursor,v.eq_s_b(3,"nis")&&v.slice_del());break;case 3:v.in_grouping_b(g,98,116)&&v.slice_del()}if(v.cursor=v.limit-s,v.ket=v.cursor,(e=v.find_among_b(f,4))&&(v.bra=v.cursor,c()))switch(e){case 1:v.slice_del();break;case 2:if(v.in_grouping_b(k,98,116)){var t=v.cursor-3;v.limit_backward<=t&&t<=v.limit&&(v.cursor=t,v.slice_del())}}if(v.cursor=v.limit-s,v.ket=v.cursor,(e=v.find_among_b(_,8))&&(v.bra=v.cursor,u()))switch(e){case 1:v.slice_del(),v.ket=v.cursor,v.eq_s_b(2,"ig")&&(v.bra=v.cursor,r=v.limit-v.cursor,v.eq_s_b(1,"e")||(v.cursor=v.limit-r,u()&&v.slice_del()));break;case 2:n=v.limit-v.cursor,v.eq_s_b(1,"e")||(v.cursor=v.limit-n,v.slice_del());break;case 3:if(v.slice_del(),v.ket=v.cursor,i=v.limit-v.cursor,!v.eq_s_b(2,"er")&&(v.cursor=v.limit-i,!v.eq_s_b(2,"en")))break;v.bra=v.cursor,c()&&v.slice_del();break;case 4:v.slice_del(),v.ket=v.cursor,e=v.find_among_b(b,2),e&&(v.bra=v.cursor,u()&&1==e&&v.slice_del())}}var d,l,m,h=[new r("",-1,6),new r("U",0,2),new r("Y",0,1),new r("ä",0,3),new r("ö",0,4),new r("ü",0,5)],w=[new r("e",-1,2),new r("em",-1,1),new r("en",-1,2),new r("ern",-1,1),new r("er",-1,1),new r("s",-1,3),new r("es",5,2)],f=[new r("en",-1,1),new r("er",-1,1),new r("st",-1,2),new r("est",2,1)],b=[new r("ig",-1,1),new r("lich",-1,1)],_=[new r("end",-1,1),new r("ig",-1,2),new r("ung",-1,1),new r("lich",-1,3),new r("isch",-1,2),new 
r("ik",-1,2),new r("heit",-1,3),new r("keit",-1,4)],p=[17,65,16,1,0,0,0,0,0,0,0,0,0,0,0,0,8,0,32,8],g=[117,30,5],k=[117,30,4],v=new n;this.setCurrent=function(e){v.setCurrent(e)},this.getCurrent=function(){return v.getCurrent()},this.stem=function(){var e=v.cursor;return i(),v.cursor=e,t(),v.limit_backward=e,v.cursor=v.limit,a(),v.cursor=v.limit_backward,o(),!0}};return function(e){return"function"==typeof e.update?e.update(function(e){return i.setCurrent(e),i.stem(),i.getCurrent()}):(i.setCurrent(e),i.stem(),i.getCurrent())}}(),e.Pipeline.registerFunction(e.de.stemmer,"stemmer-de"),e.de.stopWordFilter=e.generateStopWordFilter("aber alle allem allen aller alles als also am an ander andere anderem anderen anderer anderes anderm andern anderr anders auch auf aus bei bin bis bist da damit dann das dasselbe dazu daß dein deine deinem deinen deiner deines dem demselben den denn denselben der derer derselbe derselben des desselben dessen dich die dies diese dieselbe dieselben diesem diesen dieser dieses dir doch dort du durch ein eine einem einen einer eines einig einige einigem einigen einiger einiges einmal er es etwas euch euer eure eurem euren eurer eures für gegen gewesen hab habe haben hat hatte hatten hier hin hinter ich ihm ihn ihnen ihr ihre ihrem ihren ihrer ihres im in indem ins ist jede jedem jeden jeder jedes jene jenem jenen jener jenes jetzt kann kein keine keinem keinen keiner keines können könnte machen man manche manchem manchen mancher manches mein meine meinem meinen meiner meines mich mir mit muss musste nach nicht nichts noch nun nur ob oder ohne sehr sein seine seinem seinen seiner seines selbst sich sie sind so solche solchem solchen solcher solches soll sollte sondern sonst um und uns unse unsem unsen unser unses unter viel vom von vor war waren warst was weg weil weiter welche welchem welchen welcher welches wenn werde werden wie wieder will wir wird wirst wo wollen wollte während würde würden zu zum zur zwar zwischen über".split(" ")),e.Pipeline.registerFunction(e.de.stopWordFilter,"stopWordFilter-de")}}); -------------------------------------------------------------------------------- /test/en: -------------------------------------------------------------------------------- 1 | 2g1c 2 | 2 girls 1 cup 3 | acrotomophilia 4 | alabama hot pocket 5 | alaskan pipeline 6 | anal 7 | anilingus 8 | anus 9 | apeshit 10 | arsehole 11 | ass 12 | asshole 13 | assmunch 14 | auto erotic 15 | autoerotic 16 | babeland 17 | baby batter 18 | baby juice 19 | ball gag 20 | ball gravy 21 | ball kicking 22 | ball licking 23 | ball sack 24 | ball sucking 25 | bangbros 26 | bangbus 27 | bareback 28 | barely legal 29 | barenaked 30 | bastard 31 | bastardo 32 | bastinado 33 | bbw 34 | bdsm 35 | beaner 36 | beaners 37 | beaver cleaver 38 | beaver lips 39 | beastiality 40 | bestiality 41 | big black 42 | big breasts 43 | big knockers 44 | big tits 45 | bimbos 46 | birdlock 47 | bitch 48 | bitches 49 | black cock 50 | blonde action 51 | blonde on blonde action 52 | blowjob 53 | blow job 54 | blow your load 55 | blue waffle 56 | blumpkin 57 | bollocks 58 | bondage 59 | boner 60 | boob 61 | boobs 62 | booty call 63 | brown showers 64 | brunette action 65 | bukkake 66 | bulldyke 67 | bullet vibe 68 | bullshit 69 | bung hole 70 | bunghole 71 | busty 72 | butt 73 | buttcheeks 74 | butthole 75 | camel toe 76 | camgirl 77 | camslut 78 | camwhore 79 | carpet muncher 80 | carpetmuncher 81 | chocolate rosebuds 82 | cialis 83 | circlejerk 84 | cleveland steamer 85 | clit 86 | clitoris 87 | clover clamps 88 | 
clusterfuck 89 | cock 90 | cocks 91 | coprolagnia 92 | coprophilia 93 | cornhole 94 | coon 95 | coons 96 | creampie 97 | cum 98 | cumming 99 | cumshot 100 | cumshots 101 | cunnilingus 102 | cunt 103 | darkie 104 | date rape 105 | daterape 106 | deep throat 107 | deepthroat 108 | dendrophilia 109 | dick 110 | dildo 111 | dingleberry 112 | dingleberries 113 | dirty pillows 114 | dirty sanchez 115 | doggie style 116 | doggiestyle 117 | doggy style 118 | doggystyle 119 | dog style 120 | dolcett 121 | domination 122 | dominatrix 123 | dommes 124 | donkey punch 125 | double dong 126 | double penetration 127 | dp action 128 | dry hump 129 | dvda 130 | eat my ass 131 | ecchi 132 | ejaculation 133 | erotic 134 | erotism 135 | escort 136 | eunuch 137 | fag 138 | faggot 139 | fecal 140 | felch 141 | fellatio 142 | feltch 143 | female squirting 144 | femdom 145 | figging 146 | fingerbang 147 | fingering 148 | fisting 149 | foot fetish 150 | footjob 151 | frotting 152 | fuck 153 | fuck buttons 154 | fuckin 155 | fucking 156 | fucktards 157 | fudge packer 158 | fudgepacker 159 | futanari 160 | gangbang 161 | gang bang 162 | gay sex 163 | genitals 164 | giant cock 165 | girl on 166 | girl on top 167 | girls gone wild 168 | goatcx 169 | goatse 170 | god damn 171 | gokkun 172 | golden shower 173 | goodpoop 174 | goo girl 175 | goregasm 176 | grope 177 | group sex 178 | g-spot 179 | guro 180 | hand job 181 | handjob 182 | hard core 183 | hardcore 184 | hentai 185 | homoerotic 186 | honkey 187 | hooker 188 | horny 189 | hot carl 190 | hot chick 191 | how to kill 192 | how to murder 193 | huge fat 194 | humping 195 | incest 196 | intercourse 197 | jack off 198 | jail bait 199 | jailbait 200 | jelly donut 201 | jerk off 202 | jigaboo 203 | jiggaboo 204 | jiggerboo 205 | jizz 206 | juggs 207 | kike 208 | kinbaku 209 | kinkster 210 | kinky 211 | knobbing 212 | leather restraint 213 | leather straight jacket 214 | lemon party 215 | livesex 216 | lolita 217 | lovemaking 218 | make me come 219 | male squirting 220 | masturbate 221 | masturbating 222 | masturbation 223 | menage a trois 224 | milf 225 | missionary position 226 | mong 227 | motherfucker 228 | mound of venus 229 | mr hands 230 | muff diver 231 | muffdiving 232 | nambla 233 | nawashi 234 | negro 235 | neonazi 236 | nigga 237 | nigger 238 | nig nog 239 | nimphomania 240 | nipple 241 | nipples 242 | nsfw 243 | nsfw images 244 | nude 245 | nudity 246 | nutten 247 | nympho 248 | nymphomania 249 | octopussy 250 | omorashi 251 | one cup two girls 252 | one guy one jar 253 | orgasm 254 | orgy 255 | paedophile 256 | paki 257 | panties 258 | panty 259 | pedobear 260 | pedophile 261 | pegging 262 | penis 263 | phone sex 264 | piece of shit 265 | pikey 266 | pissing 267 | piss pig 268 | pisspig 269 | playboy 270 | pleasure chest 271 | pole smoker 272 | ponyplay 273 | poof 274 | poon 275 | poontang 276 | punany 277 | poop chute 278 | poopchute 279 | porn 280 | porno 281 | pornography 282 | prince albert piercing 283 | pthc 284 | pubes 285 | pussy 286 | queaf 287 | queef 288 | quim 289 | raghead 290 | raging boner 291 | rape 292 | raping 293 | rapist 294 | rectum 295 | reverse cowgirl 296 | rimjob 297 | rimming 298 | rosy palm 299 | rosy palm and her 5 sisters 300 | rusty trombone 301 | sadism 302 | santorum 303 | scat 304 | schlong 305 | scissoring 306 | semen 307 | sex 308 | sexcam 309 | sexo 310 | sexy 311 | sexual 312 | sexually 313 | sexuality 314 | shaved beaver 315 | shaved pussy 316 | shemale 317 | shibari 318 | shit 319 | shitblimp 320 | shitty 321 | shota 
322 | shrimping 323 | skeet 324 | slanteye 325 | slut 326 | s&m 327 | smut 328 | snatch 329 | snowballing 330 | sodomize 331 | sodomy 332 | spastic 333 | spic 334 | splooge 335 | splooge moose 336 | spooge 337 | spread legs 338 | spunk 339 | strap on 340 | strapon 341 | strappado 342 | strip club 343 | style doggy 344 | suck 345 | sucks 346 | suicide girls 347 | sultry women 348 | swastika 349 | swinger 350 | tainted love 351 | taste my 352 | tea bagging 353 | threesome 354 | throating 355 | thumbzilla 356 | tied up 357 | tight white 358 | tit 359 | tits 360 | titties 361 | titty 362 | tongue in a 363 | topless 364 | tosser 365 | towelhead 366 | tranny 367 | tribadism 368 | tub girl 369 | tubgirl 370 | tushy 371 | twat 372 | twink 373 | twinkie 374 | two girls one cup 375 | undressing 376 | upskirt 377 | urethra play 378 | urophilia 379 | vagina 380 | venus mound 381 | viagra 382 | vibrator 383 | violet wand 384 | vorarephilia 385 | voyeur 386 | voyeurweb 387 | voyuer 388 | vulva 389 | wank 390 | wetback 391 | wet dream 392 | white power 393 | whore 394 | worldsex 395 | wrapping men 396 | wrinkled starfish 397 | xx 398 | xxx 399 | yaoi 400 | yellow showers 401 | yiffy 402 | zoophilia 403 | 🖕 404 | -------------------------------------------------------------------------------- /docs/FinNLP/site/assets/javascripts/lunr/min/lunr.du.min.js: -------------------------------------------------------------------------------- 1 | /*! 2 | * Lunr languages, `Dutch` language 3 | * https://github.com/MihaiValentin/lunr-languages 4 | * 5 | * Copyright 2014, Mihai Valentin 6 | * http://www.mozilla.org/MPL/ 7 | */ 8 | /*! 9 | * based on 10 | * Snowball JavaScript Library v0.3 11 | * http://code.google.com/p/urim/ 12 | * http://snowball.tartarus.org/ 13 | * 14 | * Copyright 2010, Oleg Mazko 15 | * http://www.mozilla.org/MPL/ 16 | */ 17 | 18 | !function(e,r){"function"==typeof define&&define.amd?define(r):"object"==typeof exports?module.exports=r():r()(e.lunr)}(this,function(){return function(e){if(void 0===e)throw new Error("Lunr is not present. Please include / require Lunr before this script.");if(void 0===e.stemmerSupport)throw new Error("Lunr stemmer support is not present. Please include / require Lunr stemmer support before this script.");console.warn('[Lunr Languages] Please use the "nl" instead of the "du". 
The "nl" code is the standard code for Dutch language, and "du" will be removed in the next major versions.'),e.du=function(){this.pipeline.reset(),this.pipeline.add(e.du.trimmer,e.du.stopWordFilter,e.du.stemmer),this.searchPipeline&&(this.searchPipeline.reset(),this.searchPipeline.add(e.du.stemmer))},e.du.wordCharacters="A-Za-zªºÀ-ÖØ-öø-ʸˠ-ˤᴀ-ᴥᴬ-ᵜᵢ-ᵥᵫ-ᵷᵹ-ᶾḀ-ỿⁱⁿₐ-ₜKÅℲⅎⅠ-ↈⱠ-ⱿꜢ-ꞇꞋ-ꞭꞰ-ꞷꟷ-ꟿꬰ-ꭚꭜ-ꭤff-stA-Za-z",e.du.trimmer=e.trimmerSupport.generateTrimmer(e.du.wordCharacters),e.Pipeline.registerFunction(e.du.trimmer,"trimmer-du"),e.du.stemmer=function(){var r=e.stemmerSupport.Among,i=e.stemmerSupport.SnowballProgram,n=new function(){function e(){for(var e,r,i,o=C.cursor;;){if(C.bra=C.cursor,e=C.find_among(b,11))switch(C.ket=C.cursor,e){case 1:C.slice_from("a");continue;case 2:C.slice_from("e");continue;case 3:C.slice_from("i");continue;case 4:C.slice_from("o");continue;case 5:C.slice_from("u");continue;case 6:if(C.cursor>=C.limit)break;C.cursor++;continue}break}for(C.cursor=o,C.bra=o,C.eq_s(1,"y")?(C.ket=C.cursor,C.slice_from("Y")):C.cursor=o;;)if(r=C.cursor,C.in_grouping(q,97,232)){if(i=C.cursor,C.bra=i,C.eq_s(1,"i"))C.ket=C.cursor,C.in_grouping(q,97,232)&&(C.slice_from("I"),C.cursor=r);else if(C.cursor=i,C.eq_s(1,"y"))C.ket=C.cursor,C.slice_from("Y"),C.cursor=r;else if(n(r))break}else if(n(r))break}function n(e){return C.cursor=e,e>=C.limit||(C.cursor++,!1)}function o(){_=C.limit,f=_,t()||(_=C.cursor,_<3&&(_=3),t()||(f=C.cursor))}function t(){for(;!C.in_grouping(q,97,232);){if(C.cursor>=C.limit)return!0;C.cursor++}for(;!C.out_grouping(q,97,232);){if(C.cursor>=C.limit)return!0;C.cursor++}return!1}function s(){for(var e;;)if(C.bra=C.cursor,e=C.find_among(p,3))switch(C.ket=C.cursor,e){case 1:C.slice_from("y");break;case 2:C.slice_from("i");break;case 3:if(C.cursor>=C.limit)return;C.cursor++}}function u(){return _<=C.cursor}function c(){return f<=C.cursor}function a(){var e=C.limit-C.cursor;C.find_among_b(g,3)&&(C.cursor=C.limit-e,C.ket=C.cursor,C.cursor>C.limit_backward&&(C.cursor--,C.bra=C.cursor,C.slice_del()))}function l(){var e;w=!1,C.ket=C.cursor,C.eq_s_b(1,"e")&&(C.bra=C.cursor,u()&&(e=C.limit-C.cursor,C.out_grouping_b(q,97,232)&&(C.cursor=C.limit-e,C.slice_del(),w=!0,a())))}function m(){var e;u()&&(e=C.limit-C.cursor,C.out_grouping_b(q,97,232)&&(C.cursor=C.limit-e,C.eq_s_b(3,"gem")||(C.cursor=C.limit-e,C.slice_del(),a())))}function d(){var e,r,i,n,o,t,s=C.limit-C.cursor;if(C.ket=C.cursor,e=C.find_among_b(h,5))switch(C.bra=C.cursor,e){case 1:u()&&C.slice_from("heid");break;case 2:m();break;case 3:u()&&C.out_grouping_b(z,97,232)&&C.slice_del()}if(C.cursor=C.limit-s,l(),C.cursor=C.limit-s,C.ket=C.cursor,C.eq_s_b(4,"heid")&&(C.bra=C.cursor,c()&&(r=C.limit-C.cursor,C.eq_s_b(1,"c")||(C.cursor=C.limit-r,C.slice_del(),C.ket=C.cursor,C.eq_s_b(2,"en")&&(C.bra=C.cursor,m())))),C.cursor=C.limit-s,C.ket=C.cursor,e=C.find_among_b(k,6))switch(C.bra=C.cursor,e){case 1:if(c()){if(C.slice_del(),i=C.limit-C.cursor,C.ket=C.cursor,C.eq_s_b(2,"ig")&&(C.bra=C.cursor,c()&&(n=C.limit-C.cursor,!C.eq_s_b(1,"e")))){C.cursor=C.limit-n,C.slice_del();break}C.cursor=C.limit-i,a()}break;case 2:c()&&(o=C.limit-C.cursor,C.eq_s_b(1,"e")||(C.cursor=C.limit-o,C.slice_del()));break;case 3:c()&&(C.slice_del(),l());break;case 4:c()&&C.slice_del();break;case 5:c()&&w&&C.slice_del()}C.cursor=C.limit-s,C.out_grouping_b(j,73,232)&&(t=C.limit-C.cursor,C.find_among_b(v,4)&&C.out_grouping_b(q,97,232)&&(C.cursor=C.limit-t,C.ket=C.cursor,C.cursor>C.limit_backward&&(C.cursor--,C.bra=C.cursor,C.slice_del())))}var f,_,w,b=[new r("",-1,6),new 
r("á",0,1),new r("ä",0,1),new r("é",0,2),new r("ë",0,2),new r("í",0,3),new r("ï",0,3),new r("ó",0,4),new r("ö",0,4),new r("ú",0,5),new r("ü",0,5)],p=[new r("",-1,3),new r("I",0,2),new r("Y",0,1)],g=[new r("dd",-1,-1),new r("kk",-1,-1),new r("tt",-1,-1)],h=[new r("ene",-1,2),new r("se",-1,3),new r("en",-1,2),new r("heden",2,1),new r("s",-1,3)],k=[new r("end",-1,1),new r("ig",-1,2),new r("ing",-1,1),new r("lijk",-1,3),new r("baar",-1,4),new r("bar",-1,5)],v=[new r("aa",-1,-1),new r("ee",-1,-1),new r("oo",-1,-1),new r("uu",-1,-1)],q=[17,65,16,1,0,0,0,0,0,0,0,0,0,0,0,0,128],j=[1,0,0,17,65,16,1,0,0,0,0,0,0,0,0,0,0,0,0,128],z=[17,67,16,1,0,0,0,0,0,0,0,0,0,0,0,0,128],C=new i;this.setCurrent=function(e){C.setCurrent(e)},this.getCurrent=function(){return C.getCurrent()},this.stem=function(){var r=C.cursor;return e(),C.cursor=r,o(),C.limit_backward=r,C.cursor=C.limit,d(),C.cursor=C.limit_backward,s(),!0}};return function(e){return"function"==typeof e.update?e.update(function(e){return n.setCurrent(e),n.stem(),n.getCurrent()}):(n.setCurrent(e),n.stem(),n.getCurrent())}}(),e.Pipeline.registerFunction(e.du.stemmer,"stemmer-du"),e.du.stopWordFilter=e.generateStopWordFilter(" aan al alles als altijd andere ben bij daar dan dat de der deze die dit doch doen door dus een eens en er ge geen geweest haar had heb hebben heeft hem het hier hij hoe hun iemand iets ik in is ja je kan kon kunnen maar me meer men met mij mijn moet na naar niet niets nog nu of om omdat onder ons ook op over reeds te tegen toch toen tot u uit uw van veel voor want waren was wat werd wezen wie wil worden wordt zal ze zelf zich zij zijn zo zonder zou".split(" ")),e.Pipeline.registerFunction(e.du.stopWordFilter,"stopWordFilter-du")}}); -------------------------------------------------------------------------------- /finnlp/data_sources/company_announcement/sec.py: -------------------------------------------------------------------------------- 1 | from finnlp.data_sources.company_announcement._base import Company_Announcement_Downloader 2 | 3 | from tqdm import tqdm 4 | from lxml import etree 5 | import pandas as pd 6 | import requests 7 | import json 8 | import time 9 | 10 | class SEC_Announcement(Company_Announcement_Downloader): 11 | 12 | def __init__(self, args = {}): 13 | super().__init__(args) 14 | self.dataframe = pd.DataFrame() 15 | 16 | def download_date_range_stock(self, start_date, end_date, stock = "AAPL", delay = 0.1): 17 | entityName = self._get_entity_name(stock) 18 | # first page 19 | total_pages = self._gather_one_page(start_date, end_date, 1, entityName, delay) 20 | # other pages 21 | if total_pages>1: 22 | for page in tqdm(range(1, total_pages), desc="Downloading other page..."): 23 | self._gather_one_page(start_date, end_date, page + 1, entityName, delay ) 24 | 25 | self.dataframe = self.dataframe.reset_index(drop = True) 26 | 27 | def _get_entity_name(self, stock = "AAPL"): 28 | url = "https://efts.sec.gov/LATEST/search-index" 29 | headers = { 30 | "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36" 31 | } 32 | params = { 33 | "keysTyped":stock 34 | } 35 | resp = self._request_get(url = url, headers= headers, params= params) 36 | if resp is None: 37 | raise ConnectionError("Can't get entity name") 38 | 39 | res = json.loads(resp.text) 40 | item_list = res["hits"]["hits"] 41 | entityName_list = [] 42 | for item in item_list: 43 | c_name_one = item["_source"]["entity_words"] 44 | c_name_two = item["_id"].zfill(10) 45 | entityName = 
f"{c_name_one} (CIK {c_name_two})" 46 | entityName_list.append(entityName) 47 | 48 | entityName = entityName_list[0] 49 | 50 | return entityName 51 | 52 | def _gather_one_page(self, start_date, end_date, page, entityName = "Apple Inc. (AAPL) (CIK 0000320193)", delay = 0.01): 53 | from_ = (page-1)*100 54 | url = "https://efts.sec.gov/LATEST/search-index" 55 | headers = { 56 | "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36" 57 | } 58 | params = { 59 | "dateRange": "all", 60 | "entityName": entityName, 61 | "startdt": start_date, 62 | "enddt": end_date, 63 | "from" : from_, 64 | "page" : page, 65 | } 66 | 67 | resp = self._request_get(url = url, headers= headers, params= params) 68 | 69 | if resp is None: 70 | return 'Error' 71 | res = json.loads(resp.text) 72 | 73 | # total 74 | total_items = res["hits"]["total"]["value"] 75 | if total_items % 100 == 0: 76 | total_pages = total_items // 100 77 | else: 78 | total_pages = total_items // 100 + 1 79 | 80 | items = res["hits"]["hits"] 81 | 82 | url_base = "https://www.sec.gov/Archives/edgar/data" 83 | 84 | for item in tqdm(items, desc="Downloading by item..." ): 85 | url_third = item["_source"]["xsl"] 86 | url_second, url_fourth = item["_id"].split(":") 87 | url_second = url_second.split("-") 88 | url_first = url_second[0] 89 | url_first = url_first.strip("0") 90 | url_second = ''.join(url_second) 91 | url_first, url_second, url_fourth 92 | 93 | if url_third is not None: 94 | url_new = f"{url_base}/{url_first}/{url_second}/{url_third}/{url_fourth}" 95 | else: 96 | url_new = f"{url_base}/{url_first}/{url_second}/{url_fourth}" 97 | respn = self._request_get(url = url_new, headers= headers) 98 | if respn is None: 99 | continue 100 | try: 101 | res = etree.HTML(respn.text) 102 | content = res.xpath("/html/body//text()") 103 | content = [c for c in content if c != "\n"] 104 | content = "".join(content) 105 | 106 | _id = item["_id"] 107 | ciks = item["_source"]["ciks"] 108 | period_ending = item["_source"]["period_ending"] 109 | root_form = item["_source"]["root_form"] 110 | file_num = item["_source"]["file_num"] 111 | display_names = item["_source"]["display_names"] 112 | xsl = item["_source"]["xsl"] 113 | sequence = item["_source"]["sequence"] 114 | file_date = item["_source"]["file_date"] 115 | biz_states = item["_source"]["biz_states"] 116 | sics = item["_source"]["sics"] 117 | form = item["_source"]["form"] 118 | adsh = item["_source"]["adsh"] 119 | film_num = item["_source"]["film_num"] 120 | biz_locations = item["_source"]["biz_locations"] 121 | file_type = item["_source"]["file_type"] 122 | file_description = item["_source"]["file_description"] 123 | inc_states = item["_source"]["inc_states"] 124 | ite = item["_source"]["items"] 125 | 126 | data = [ 127 | _id, ciks, period_ending, root_form, file_num, display_names, xsl, sequence, 128 | file_date, biz_states, sics, form, adsh, film_num, biz_locations, file_type, 129 | file_description, inc_states, ite, content 130 | ] 131 | columns = [ 132 | "_id", "ciks", "period_ending", "root_form", "file_num", "display_names", "xsl", "sequence", 133 | "file_date", "biz_states", "sics", "form", "adsh", "film_num", "biz_locations", "file_type", 134 | "file_description", "inc_states", "ite", "content" 135 | ] 136 | tmp = pd.DataFrame(data = data).T 137 | tmp.columns = columns 138 | 139 | self.dataframe = pd.concat([self.dataframe, tmp]) 140 | time.sleep(delay) 141 | except: 142 | continue 143 | 144 | return total_pages 145 | 146 | 
--------------------------------------------------------------------------------
/finnlp/data_sources/social_media/weibo_date_range.py:
--------------------------------------------------------------------------------
1 | from finnlp.data_sources.social_media._base import Social_Media_Downloader
2 | 
3 | from tqdm import tqdm
4 | from lxml import etree
5 | import pandas as pd
6 | import numpy as np
7 | import requests
8 | import datetime
9 | import time
10 | import json
11 | import re
12 | 
13 | class Weibo_Date_Range(Social_Media_Downloader):
14 |     def __init__(self, args = {}):
15 |         super().__init__(args)
16 |         if "cookies" not in args.keys():
17 |             raise ValueError("You need to first log in at https://weibo.com/, then copy your cookies and pass them as the value of the 'cookies' key in args.")
18 |         self.cookies = args["cookies"]
19 |         self.dataframe = pd.DataFrame()
20 | 
21 |     def download_date_range_stock(self, start_date, end_date, start_hour = 0, end_hour = 0, stock = "茅台", delay = 0.01):
22 |         self.date_list = pd.date_range(start_date, end_date)
23 |         for date in tqdm(self.date_list, desc = "Downloading by dates..."):
24 |             date = date.strftime("%Y-%m-%d")
25 |             self._gather_one_day(date, start_hour, end_hour, stock, delay)
26 |         self.dataframe = self.dataframe.reset_index(drop = True)
27 | 
28 |     def _gather_one_day(self, date, start_hour, end_hour, stock = "茅台", delay = 0.01):
29 |         if start_hour == 0 and end_hour == 0:
30 |             start_date = datetime.datetime.strptime(date, "%Y-%m-%d")
31 |             end_date = start_date + datetime.timedelta(days=1)
32 |             start_date = start_date.strftime("%Y-%m-%d")
33 |             end_date = end_date.strftime("%Y-%m-%d")
34 |         else:
35 |             start_date, end_date = date, date  # same day for both bounds when explicit hours are given
36 | 
37 |         # first page
38 |         all_urls = self._gather_first_page(start_date, end_date, start_hour, end_hour, stock, delay)
39 |         # other pages
40 |         if len(all_urls) > 1:
41 |             base_url = "https://s.weibo.com/"
42 |             for url_new in all_urls:
43 |                 url_new = base_url + url_new
44 |                 self._gather_other_pages(date, url_new, delay)
45 | 
46 |     def _gather_first_page(self, start_date, end_date, start_hour, end_hour, stock = "茅台", delay = 0.01):
47 | 
48 |         headers = {
49 |             "cookie": self.cookies,
50 |             "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0",
51 |         }
52 | 
53 |         params = {
54 |             "q": stock,
55 |             "typeall": "1",
56 |             "suball": "1",
57 |             "timescope": f"custom:{start_date}-{start_hour}:{end_date}-{end_hour}",
58 |             "Refer": "g",
59 |             "page": "1"
60 |         }
61 | 
62 |         url = "https://s.weibo.com/weibo"
63 |         resp = self._request_get(url, headers=headers, params = params)
64 | 
65 |         if resp is None:
66 |             return "Error"
67 | 
68 |         if "passport.weibo.com" in resp.url:
69 |             raise ValueError("Your cookies are invalid. 
Please first log in at https://weibo.com/ and then copy you cookies and use it as the [value] of [key] \'cookies\' ") 70 | 71 | res = etree.HTML(resp.content) 72 | # get all pages 73 | all_pages = res.xpath('//*[@id="pl_feedlist_index"]/div[3]/div[1]/span/ul/li//@href') 74 | items = res.xpath('//div[@class="card-wrap"]') 75 | for i in items: 76 | ps = i.xpath('.//div[@class="content"]//p') 77 | try: 78 | content = ps[0].xpath(".//text()") 79 | content = ''.join(content) 80 | content = content.replace('\n',"") 81 | content = content.replace(' ',"") 82 | content = content.replace('\u200b',"") 83 | except: 84 | continue 85 | 86 | info = ps[1].xpath(".//text()") 87 | try: 88 | date_content = info[1] 89 | date_content = date_content.replace('\n',"") 90 | date_content = date_content.replace(' ',"") 91 | except: 92 | date_content = np.nan 93 | 94 | try: 95 | source = info[3] 96 | except: 97 | source = np.nan 98 | 99 | tmp = pd.DataFrame([start_date, date_content, source, content]).T 100 | tmp.columns = ["date","date_content", "source", "content"] 101 | self.dataframe = pd.concat([self.dataframe, tmp]) 102 | 103 | time.sleep(delay) 104 | 105 | return all_pages 106 | 107 | def _gather_other_pages(self, date, url, delay = 0.01): 108 | 109 | headers = { 110 | "cookie": self.cookies, 111 | "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0", 112 | } 113 | 114 | resp = self._request_get(url, headers=headers) 115 | 116 | if resp is None: 117 | return "Error" 118 | 119 | if "passport.weibo.com" in resp.url: 120 | raise ValueError("Your cookies is useless. Please first log in at https://weibo.com/ and then copy you cookies and use it as the [value] of [key] \'cookies\' ") 121 | 122 | res = etree.HTML(resp.content) 123 | # get all pages 124 | all_pages = res.xpath('//*[@id="pl_feedlist_index"]/div[3]/div[1]/span/ul/li//@href') 125 | items = res.xpath('//div[@class="card-wrap"]') 126 | for i in items: 127 | ps = i.xpath('.//div[@class="content"]//p') 128 | try: 129 | content = ps[0].xpath(".//text()") 130 | content = ''.join(content) 131 | content = content.replace('\n',"") 132 | content = content.replace(' ',"") 133 | content = content.replace('\u200b',"") 134 | except: 135 | continue 136 | 137 | info = ps[1].xpath(".//text()") 138 | try: 139 | date_content = info[1] 140 | date_content = date_content.replace('\n',"") 141 | date_content = date_content.replace(' ',"") 142 | except: 143 | date_content = np.nan 144 | 145 | try: 146 | source = info[3] 147 | except: 148 | source = np.nan 149 | 150 | tmp = pd.DataFrame([date, date_content, source, content]).T 151 | tmp.columns = ["date", "date_content", "source", "content"] 152 | self.dataframe = pd.concat([self.dataframe, tmp]) 153 | 154 | time.sleep(delay) 155 | -------------------------------------------------------------------------------- /finnlp/data_sources/sec_filings/prepline_sec_filings/sections.py: -------------------------------------------------------------------------------- 1 | """Module for defining/enumerating the common sections from SEC forms""" 2 | import re 3 | from enum import Enum 4 | from typing import List 5 | 6 | 7 | class SECSection(Enum): 8 | PROSPECTUS_SUMMARY = re.compile(r"^(?:prospectus )?summary$") 9 | ABOUT_PROSPECTUS = re.compile(r"about this prospectus") 10 | FORWARD_LOOKING_STATEMENTS = re.compile(r"forward[ -]looking statements") 11 | RISK_FACTORS = re.compile(r"risk factors") 12 | USE_OF_PROCEEDS = re.compile(r"use of proceeds") 13 | DIVIDEND_POLICY = re.compile(r"^dividend 
policy") 14 | CAPITALIZATION = re.compile(r"^capitalization$") 15 | DILUTION = re.compile(r"^dilution$") 16 | MANAGEMENT_DISCUSSION = re.compile(r"^management(?:[\u2019']s)? discussion") 17 | BUSINESS = re.compile(r"^business$") 18 | MANAGEMENT = re.compile(r"^(?:(?:our )?management)|(?:executive officers)$") 19 | COMPENSATION = re.compile(r"compensation") 20 | RELATED_PARTY_TRANSACTIONS = re.compile(r"(?:relationships|related).*transactions") 21 | PRINCIPAL_STOCKHOLDERS = re.compile( 22 | r"(?:principal.*(?:stockholder|shareholder)s?)|(?:(security|stock|share) " 23 | r"ownership .*certain)" 24 | ) 25 | DESCRIPTION_OF_STOCK = re.compile( 26 | r"^description of (?:capital stock|share capital|securities)" 27 | ) 28 | DESCRIPTION_OF_DEBT = re.compile(r"^description of .*debt") 29 | FUTURE_SALE = re.compile(r"(?:shares|stock) eligible for future sale") 30 | US_TAX = re.compile( 31 | r"(?:us|u\.s\.|united states|material federal).* tax" 32 | r" (?:consideration|consequence)" 33 | ) 34 | UNDERWRITING = re.compile(r"underwrit") 35 | LEGAL_MATTERS = re.compile(r"legal matters") 36 | EXPERTS = re.compile(r"^experts$") 37 | MORE_INFORMATION = re.compile(r"(?:additional|more) information") 38 | FINANCIAL_STATEMENTS = r"financial statements" 39 | MARKET_RISK_DISCLOSURES = ( 40 | r"(?:quantitative|qualitative) disclosures? about market risk" 41 | ) 42 | CONTROLS_AND_PROCEDURES = r"controls and procedures" 43 | LEGAL_PROCEEDINGS = r"legal proceedings" 44 | DEFAULTS = r"defaults (?:up)?on .*securities" 45 | MINE_SAFETY = r"mine safety disclosures?" 46 | OTHER_INFORMATION = r"other information" 47 | UNRESOLVED_STAFF_COMMENTS = r"unresolved staff comments" 48 | PROPERTIES = r"^properties$" 49 | MARKET_FOR_REGISTRANT_COMMON_EQUITY = ( 50 | r"market for(?: the)? (?:registrant|company)(?:['\u2019]s)? common equity" 51 | ) 52 | ACCOUNTING_DISAGREEMENTS = r"disagreements with accountants" 53 | FOREIGN_JURISDICTIONS = r"diclosure .*foreign jurisdictions .*inspection" 54 | EXECUTIVE_OFFICERS = r"executive officers" 55 | ACCOUNTING_FEES = r"accounting fees" 56 | EXHIBITS = r"^exhibits?(.*financial statement schedules)?$" 57 | FORM_SUMMARY = r"^form .*summary$" 58 | # NOTE(yuming): Additional section titles used in test_real_examples.py, 59 | # maybe change this when custom regex string param is allowed. 
60 | CERTAIN_TRADEMARKS = r"certain trademarks" 61 | OFFER_PRICE = r"(?:determination of )offering price" 62 | 63 | @property 64 | def pattern(self): 65 | return self.value 66 | 67 | 68 | ALL_SECTIONS = "_ALL" 69 | 70 | section_string_to_enum = {enum.name: enum for enum in SECSection} 71 | 72 | # NOTE(robinson) - Sections are listed in the following document from SEC 73 | # ref: https://www.sec.gov/files/form10-k.pdf 74 | SECTIONS_10K = ( 75 | SECSection.BUSINESS, # ITEM 1 76 | SECSection.RISK_FACTORS, # ITEM 1A 77 | SECSection.UNRESOLVED_STAFF_COMMENTS, # ITEM 1B 78 | SECSection.PROPERTIES, # ITEM 2 79 | SECSection.LEGAL_PROCEEDINGS, # ITEM 3 80 | SECSection.MINE_SAFETY, # ITEM 4 81 | SECSection.MARKET_FOR_REGISTRANT_COMMON_EQUITY, # ITEM 5 82 | # NOTE(robinson) - ITEM 6 is "RESERVED" 83 | SECSection.MANAGEMENT_DISCUSSION, # ITEM 7 84 | SECSection.MARKET_RISK_DISCLOSURES, # ITEM 7A 85 | SECSection.FINANCIAL_STATEMENTS, # ITEM 8 86 | SECSection.ACCOUNTING_DISAGREEMENTS, # ITEM 9 87 | SECSection.CONTROLS_AND_PROCEDURES, # ITEM 9A 88 | # NOTE(robinson) - ITEM 9B is other information 89 | SECSection.FOREIGN_JURISDICTIONS, # ITEM 9C 90 | SECSection.MANAGEMENT, # ITEM 10 91 | SECSection.COMPENSATION, # ITEM 11 92 | SECSection.PRINCIPAL_STOCKHOLDERS, # ITEM 12 93 | SECSection.RELATED_PARTY_TRANSACTIONS, # ITEM 13 94 | SECSection.ACCOUNTING_FEES, # ITEM 14 95 | SECSection.EXHIBITS, # ITEM 15 96 | SECSection.FORM_SUMMARY, # ITEM 16 97 | ) 98 | 99 | # NOTE(robinson) - Sections are listed in the following document from SEC 100 | # ref: https://www.sec.gov/files/form10-q.pdf 101 | SECTIONS_10Q = ( 102 | # Part I - Financial information 103 | SECSection.FINANCIAL_STATEMENTS, # ITEM 1 104 | SECSection.MANAGEMENT_DISCUSSION, # ITEM 2 105 | SECSection.MARKET_RISK_DISCLOSURES, # ITEM 3 106 | SECSection.CONTROLS_AND_PROCEDURES, # ITEM 4 107 | # Part II - Other information 108 | SECSection.LEGAL_PROCEEDINGS, # ITEM 1 109 | SECSection.RISK_FACTORS, # ITEM 1A 110 | SECSection.USE_OF_PROCEEDS, # ITEM 2 111 | SECSection.DEFAULTS, # ITEM 3 112 | SECSection.MINE_SAFETY, # ITEM 4 113 | SECSection.OTHER_INFORMATION, # ITEM 5 114 | ) 115 | 116 | SECTIONS_S1 = ( 117 | SECSection.PROSPECTUS_SUMMARY, 118 | SECSection.ABOUT_PROSPECTUS, 119 | SECSection.FORWARD_LOOKING_STATEMENTS, 120 | SECSection.RISK_FACTORS, 121 | SECSection.USE_OF_PROCEEDS, 122 | SECSection.DIVIDEND_POLICY, 123 | SECSection.CAPITALIZATION, 124 | SECSection.DILUTION, 125 | SECSection.MANAGEMENT_DISCUSSION, 126 | SECSection.BUSINESS, 127 | SECSection.MANAGEMENT, 128 | SECSection.COMPENSATION, 129 | SECSection.RELATED_PARTY_TRANSACTIONS, 130 | SECSection.PRINCIPAL_STOCKHOLDERS, 131 | SECSection.DESCRIPTION_OF_STOCK, 132 | SECSection.DESCRIPTION_OF_DEBT, 133 | SECSection.FUTURE_SALE, 134 | SECSection.US_TAX, 135 | SECSection.UNDERWRITING, 136 | SECSection.LEGAL_MATTERS, 137 | SECSection.EXPERTS, 138 | SECSection.MORE_INFORMATION, 139 | ) 140 | 141 | 142 | def validate_section_names(section_names: List[str]): 143 | """Return section names that don't correspond to a defined enum.""" 144 | if len(section_names) == 1 and section_names[0] == ALL_SECTIONS: 145 | return None 146 | elif len(section_names) > 1 and ALL_SECTIONS in section_names: 147 | raise ValueError(f"{ALL_SECTIONS} may not be specified with other sections") 148 | 149 | invalid_names = [ 150 | name for name in section_names if name not in section_string_to_enum 151 | ] 152 | if invalid_names: 153 | raise ValueError(f"The following section names are not valid: {invalid_names}") 154 | return None 
155 | 
--------------------------------------------------------------------------------
/docs/FinNLP/docs/zh/index.md:
--------------------------------------------------------------------------------
1 | # Financial Data from the Internet
2 | 
3 | For the demo, please refer to [FinGPT](https://github.com/AI4Finance-Foundation/ChatGPT-for-FinTech)
4 | 
5 | **Disclaimer: We share the code under the MIT Education License for academic research purposes. Nothing herein is financial advice, nor a recommendation to trade real money. Please use common sense and consult a professional before trading or investing.**
6 | 
7 | ## Ⅰ. Architecture
8 | 
9 | ![image-20230505200244043](https://cdn.jsdelivr.net/gh/oliverwang15/imgbed@main/img/202305052002139.png)
10 | 
11 | * The whole project consists of 4 parts:
12 | 
13 | * The first part is the **data sources**, where we collect historical and streaming data from the Internet.
14 | 
15 | * Next, the data is pushed to the **data engineering** part, where we clean it, tokenize it, and do prompt engineering.
16 | 
17 | * Then the data is pushed to the **large language models (LLMs)**. Here we can use LLMs in different ways: we can not only train our own **lightweight fine-tuned models** on the collected data, but also use the data together with **trained models** or **LLM APIs** to support our applications.
18 | 
19 | * The last part is the **applications** part, where we can use the data and LLMs to build many interesting applications.
20 | 
21 | ## Ⅱ. Data Sources
22 | 
23 | ![image-20230505200446477](https://cdn.jsdelivr.net/gh/oliverwang15/imgbed@main/img/202305052004539.png)
24 | 
25 | * Due to space limitations, we only show part of the sources here.
26 | 
27 | ### 1. [News](jupyter/Data_Sources_News.ipynb)
28 | 
29 | | Platform | Data Type | Related Market | Specified Company | Range Type | Source Type | Limits | Docs (1e4) | Support |
30 | | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: |
31 | | Yahoo | Financial news | US stocks | √ | Date range | Official | N/A | 1,500+ | √ |
32 | | Reuters | Financial news | US stocks | × | Date range | Official | N/A | 1,500+ | √ |
33 | | Sina | Financial news | CN stocks | × | Date range | Official | N/A | 2,000+ | √ |
34 | | Eastmoney | Financial news | CN stocks | √ | Date range | Official | N/A | 1,000+ | √ |
35 | | Yicai | Financial news | CN stocks | √ | Date range | Official | N/A | 500+ | Soon |
36 | | CCTV | Government news | CN stocks | × | Date range | Third party | N/A | 4 | √ |
37 | | US mainstream media | Financial news | US stocks | √ | Date range | Third party | Account (free) | 3,200+ | √ |
38 | | CN mainstream media | Financial news | CN stocks | × | Date range | Third party | ¥500/year | 3,000+ | √ |
39 | 
40 | * FinGPT may have fewer documents than BloombergGPT, but they are on the same order of magnitude.
41 | 
42 | ### 2. [Social Media](jupyter/Data_Sources_Social_Media.ipynb)
43 | 
44 | | Platform | Data Type | Related Market | Specified Company | Range Type | Source Type | Limits | Docs (1e4) | Support |
45 | | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: |
46 | | Twitter | Tweets | US stocks | √ | Date range | Official | N/A | 18,000+ | √ |
47 | | StockTwits | Tweets | US stocks | √ | Latest | Official | N/A | 160,000+ | √ |
48 | | Reddit (wallstreetbets) | Posts | US stocks | × | Latest | Official | N/A | 9+ | √ |
49 | | Weibo | Tweets | CN stocks | √ | Date range | Official | Cookies | 1,400,000+ | √ |
50 | | Weibo | Tweets | CN stocks | √ | Latest | Official | N/A | 1,400,000+ | √ |
51 | 
52 | * **BloombergGPT** does **not collect social media data**, but we believe that **public opinion is one of the most important factors affecting the stock market** (a minimal usage sketch of the Weibo downloader from this repository follows below).
53 | 
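The Weibo date-range rows above are served by the `Weibo_Date_Range` downloader shown earlier in this dump (`finnlp/data_sources/social_media/weibo_date_range.py`). A minimal usage sketch; the cookie string, dates, and keyword below are placeholders you must supply yourself:

```python
from finnlp.data_sources.social_media.weibo_date_range import Weibo_Date_Range

# Placeholder: paste the cookie string copied from a logged-in https://weibo.com/ session.
weibo_cookies = "SUB=...; SUBP=...; ..."

downloader = Weibo_Date_Range(args={"cookies": weibo_cookies})
# Download posts mentioning a keyword (here the stock name "茅台"), one day at a time.
downloader.download_date_range_stock("2023-01-01", "2023-01-03", stock="茅台")

# Results accumulate in a DataFrame with date / date_content / source / content columns.
print(downloader.dataframe.head())
```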
54 | ### 3. [Company Announcements](jupyter/Data_Sources_Company_Announcement.ipynb)
55 | 
56 | | Platform | Data Type | Related Market | Specified Company | Range Type | Source Type | Limits | Docs (1e4) | Support |
57 | | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: |
58 | | Juchao (official) | Text | CN stocks | √ | Date range | Official | N/A | 2,790+ | √ |
59 | | SEC (official) | Text | US stocks | √ | Date range | Official | N/A | 1,440+ | √ |
60 | 
61 | * Since we collect data from different stock markets, we have more filing documents than BloombergGPT.
62 | 
63 | ### 4. Trends
64 | 
65 | | Platform | Data Type | Related Market | Data Source | Specified Company | Range Type | Source Type | Limits |
66 | | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: |
67 | | [Google Trends](https://trends.google.com/trends/explore) | Index | US stocks | [Google Trends](./finnlp/data_sources/trends/google.py) | √ | Date range | Official | N/A |
68 | | [Baidu Index](https://index.baidu.com/v2/index.html#/) | Index | CN stocks | Coming soon | - | - | - | - |
69 | 
70 | 
71 | ### 5. Datasets
72 | | Data Source | Type | Stocks | Dates | Available |
73 | | :---: | :---: | :---: | :---: | :---: |
74 | | [AShare](https://github.com/JinanZou/Astock) | News | 3680 | 2018-07-01 to 2021-11-30 | √ |
75 | | [stocknet-dataset](https://github.com/yumoxu/stocknet-dataset) | Tweets | 87 | 2014-01-02 to 2015-12-30 | √ |
76 | | [CHRNN](https://github.com/wuhuizhe/CHRNN) | Tweets | 38 | 2017-01-03 to 2017-12-28 | √ |
77 | 
78 | ## Ⅲ. Models
79 | 
80 | ![image-20230505200618504](https://cdn.jsdelivr.net/gh/oliverwang15/imgbed@main/img/202305052006541.png)
81 | 
82 | * In data-centric NLP, we do not need to train models from scratch; we only need to call APIs and do lightweight fine-tuning.
83 | * On the left are some LLM APIs that we may use, in the middle are the models we may fine-tune, and on the right are some fine-tuning methods.
84 | 
85 | ### 1. Fine-tuning: Tensor Layers (LoRA)
86 | 
87 | ![image-20230505200944411](https://cdn.jsdelivr.net/gh/oliverwang15/imgbed@main/img/202305052009480.png)
88 | 
89 | * In FinGPT, we fine-tune a pre-trained LLM on new financial datasets. High-quality labeled data is one of the most important keys to many successful LLMs, including ChatGPT.
90 | * However, such high-quality labeled data is usually expensive and time-consuming to obtain, and we may need help from financial experts.
91 | * If our goal is to use LLMs to analyze finance-related text and help with quantitative trading, why not let the market do the labeling for us?
92 | * So here we use the percentage change of the related stock price after each news item as the output label, apply thresholds to split the labels into three groups (positive, negative, and neutral), and use them as sentiment labels for the news (a minimal labeling sketch is given at the end of this document).
93 | * Correspondingly, in the prompt engineering part we also ask the model to answer with one of positive, negative, or neutral, so that we make full use of the pre-trained information.
94 | * By using LoRA, the number of trainable parameters is reduced from 6.17B to 3.67M.
95 | * As shown in the table, FinGPT achieves large improvements over ChatGLM on several metrics. However, using the model directly for quantitative trading may still be inappropriate: since most news titles are neutral, most of the raw LLM outputs are neutral, so the LLM performs poorly on the positive and negative labels, which are the ones useful for quantitative trading.
96 | * After fine-tuning, however, we see a large improvement in predicting the positive and negative labels.
97 | * That is also why the model can achieve positive trading results.
98 | 
99 | ### 2. Fine-tuning: Reinforcement Learning on Stock Prices (RLSP)
100 | 
101 | ![image-20230505201209946](https://cdn.jsdelivr.net/gh/oliverwang15/imgbed@main/img/202305052012996.png)
102 | 
103 | * Similarly, we can use Reinforcement Learning on Stock Prices (RLSP) in place of the Reinforcement Learning from Human Feedback used by ChatGPT.
104 | 
105 | ## Ⅳ. Applications
106 | 
107 | ### 1. Robo-advisor
108 | 
109 | ![image-20230505201913233](https://cdn.jsdelivr.net/gh/oliverwang15/imgbed@main/img/202305052019296.png)
110 | 
111 | * **ChatGPT can give investment advice like a professional.**
112 | * In this example, the **rise of Apple's stock price** matches ChatGPT's **prediction based on the news**.
113 | 
114 | ### 2. Quantitative Trading
115 | 
116 | ![image-20230505201841001](https://cdn.jsdelivr.net/gh/oliverwang15/imgbed@main/img/202305052018035.png)
117 | 
118 | * We can also use news, social media posts, or company announcements to **build sentiment factors**. The right part shows the trading results driven by Twitter tweets and ChatGPT signals, on data from the [stocknet-dataset](https://link.zhihu.com/?target=https%3A//github.com/yumoxu/stocknet-dataset).
119 | * As you can see from the picture, the trading signals generated by ChatGPT are **excellent**, and we can even **obtain good results by trading on the Twitter sentiment factor alone**.
120 | * So we may obtain even better results by **combining it with price factors**.
121 | 
122 | ### 3. Low-code Development
123 | 
124 | ![image-20230505202028292](https://cdn.jsdelivr.net/gh/oliverwang15/imgbed@main/img/202305052020363.png)
125 | 
126 | * We can write code with the help of LLMs.
127 | * The right side shows how we can develop our factors and other code **quickly and efficiently**.
--------------------------------------------------------------------------------
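Returning to the market-labeled fine-tuning data described in Section Ⅲ.1 above: the sketch below shows one way to turn forward price changes into positive/neutral/negative labels. The `forward_return` column name, the ±2% threshold, and the toy rows are illustrative assumptions, not the exact FinGPT pipeline.

```python
import pandas as pd

def label_news_by_return(news: pd.DataFrame, threshold: float = 0.02) -> pd.DataFrame:
    """Attach positive/neutral/negative labels based on the related stock's forward return.

    Assumes `news` has a `forward_return` column: the percentage change of the
    related stock's price over the window following each news item.
    """
    def to_label(r: float) -> str:
        if r > threshold:
            return "positive"
        elif r < -threshold:
            return "negative"
        return "neutral"

    news = news.copy()
    news["label"] = news["forward_return"].apply(to_label)
    return news

# Illustrative usage with made-up numbers.
sample = pd.DataFrame({
    "title": ["Earnings beat expectations", "Routine filing", "Guidance cut"],
    "forward_return": [0.034, 0.001, -0.051],
})
print(label_news_by_return(sample)[["title", "label"]])
```

The same three labels can then be requested from the model at inference time, as described in the prompt engineering bullet of Section Ⅲ.1.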