├── figs ├── logo.png ├── perplexity.py ├── single.py ├── polar.py ├── winrate.py ├── compare_strings.json └── compare.py ├── requirements.txt ├── .gitignore ├── data ├── monthly_updater │ ├── readme.md │ ├── monthly_wikitext.py │ ├── monthly_image.py │ ├── monthly_math.py │ ├── monthly_code.py │ └── monthly_arxiv.py ├── code_repos.txt ├── push_wiki_alltime.py ├── push_github_dataset.py ├── push_arxiv_dataset.py ├── collect_bbc_months.py ├── push_math_dataset.py ├── doc_info.py ├── analyse_news.py ├── analyse_wikitext.py ├── wiki_dataset.py ├── wikitext_alltime.py ├── bbc_alltime.py ├── maintain_wikitext_latest.py ├── audio_dataset.py ├── bbc_news_image.py ├── math_dataset.py ├── github_dataset.py ├── reddit_crawler.py ├── squad_wiki_title.text ├── arxiv_dataset.py └── wikipedia.py ├── .github └── workflows │ ├── weekly_downloader.yml │ └── monthly_updater.yml ├── readme.md ├── push_to_hf_hub.py ├── github_downloader.py ├── wikitext_downloader.py ├── eval └── contamination.py ├── arxiv_downloader.py └── bbc_downloader.py /figs/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liyucheng09/LatestEval/HEAD/figs/logo.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | arxiv 2 | pylatexenc 3 | datasets 4 | bs4 5 | goose3 6 | configobj 7 | mwparserfromhell 8 | GitPython -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | *.html 3 | eval/*.txt 4 | eval/saves/*.txt 5 | __pycache__/ 6 | .vscode/ 7 | data/*.json 8 | *.log 9 | *.out 10 | *.error 11 | 12 | wmt22-zhen/ 13 | arxiv/ 14 | bbc/ 15 | github/ -------------------------------------------------------------------------------- /data/monthly_updater/readme.md: -------------------------------------------------------------------------------- 1 | # Monthly data collection - where the RealTimeData program is hosted 2 | 3 | ## sources available 4 | 5 | - arxiv 6 | - bbc_news 7 | - code 8 | - bbc_image 9 | - math 10 | - wikitext 11 | 12 | ## Check all data dumps from 2017 to current 13 | 14 | Please find the RealTimeData program here: [RealTimeData](https://huggingface.co/RealTimeData). 15 | 16 | ## Crawl data by your own 17 | 18 | ```python 19 | python monthly_arxiv.py 20 | ``` 21 | 22 | This will crawl the arxiv data this month (from the 1st to the current date) and push to RealTimeData repos (if you have push authority). 23 | 24 | ## Ask for more data source / contribute new data 25 | 26 | This is program welcomes all contributions. Please open an issue or pull request if you have any suggestions or want to contribute new data sources. 
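## Load the collected dumps

If you only want to consume the dumps rather than crawl them, they can be loaded directly from the [RealTimeData](https://huggingface.co/RealTimeData) organization on the Huggingface Hub. Below is a minimal sketch, assuming the `datasets` library is installed and using the wikitext dumps as an example; the other sources follow the same `YYYY-MM` config pattern (check each dataset page for the exact list of available configs).

```python
# Minimal sketch: load one monthly dump from the RealTimeData organization.
# Newer releases of `datasets` may additionally require trust_remote_code=True.
from datasets import load_dataset

wiki = load_dataset("RealTimeData/wikitext_alltime", "2023-11", split="train")
print(wiki)                               # fields: title, pageid, text, time
print(wiki[0]["title"], wiki[0]["time"])
```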
-------------------------------------------------------------------------------- /data/code_repos.txt: -------------------------------------------------------------------------------- 1 | Stirling-Tools/Stirling-PDF 2 | microsoft/PowerToys 3 | veler/DevToys 4 | NationalSecurityAgency/ghidra 5 | Kurento/kurento-media-server 6 | silverwind/droppy 7 | llvm-mirror/clang 8 | facebookarchive/beringei 9 | shadowsocks/shadowsocks-qt5 10 | go-ego/riot 11 | flynn/flynn 12 | lipangit/JiaoZiVideoPlayer 13 | keras-team/keras 14 | aseprite/aseprite 15 | godotengine/godot 16 | lua/lua 17 | musescore/MuseScore 18 | apache/spark 19 | apache/hadoop 20 | scikit-learn/scikit-learn 21 | Leaflet/Leaflet 22 | overleaf/overleaf 23 | pytorch/pytorch 24 | huggingface/transformers 25 | animate-css/animate.css 26 | psf/requests 27 | pandas-dev/pandas 28 | django/django 29 | numpy/numpy 30 | facebook/react 31 | vuejs/core 32 | vuejs/vue 33 | android/architecture-samples 34 | sqlite/sqlite 35 | elastic/elasticsearch 36 | openssl/openssl 37 | gohugoio/hugo 38 | laravel/laravel 39 | WordPress/WordPress 40 | Unity-Technologies/ml-agents 41 | opencv/opencv -------------------------------------------------------------------------------- /data/push_wiki_alltime.py: -------------------------------------------------------------------------------- 1 | import datasets 2 | from glob import glob 3 | import os 4 | import json 5 | 6 | if __name__ == '__main__': 7 | files = glob('/vol/research/lyc/wikitext_alltime/wiki/*.json') 8 | hf_token = os.environ['HF_TOKEN'] 9 | 10 | for file in files: 11 | 12 | all_articles = [] 13 | 14 | time = os.path.basename(file).strip('.json') 15 | year = int(time.split('-')[0]) 16 | month = int(time.split('-')[1]) 17 | 18 | time_stamp = f'{year}-{month:02d}' 19 | if time_stamp not in ['2024-01', '2024-02']: 20 | continue 21 | print(f"Processing {time_stamp}") 22 | 23 | with open(file) as f: 24 | data = json.load(f) 25 | 26 | for title, article in data.items(): 27 | article['time'] = time_stamp 28 | all_articles.append(article) 29 | 30 | ds = datasets.Dataset.from_list(all_articles) 31 | ds.push_to_hub(f"RealTimeData/wikitext_alltime", config_name=time_stamp, token=hf_token) 32 | -------------------------------------------------------------------------------- /data/push_github_dataset.py: -------------------------------------------------------------------------------- 1 | from glob import glob 2 | import datasets 3 | import os 4 | import json 5 | 6 | if __name__ == '__main__': 7 | hf_token = os.environ['HF_TOKEN'] 8 | all_months = [f'{year}-{month:02}' for year in range(2017, 2024) for month in range(1, 13)] 9 | all_months += [f'2024-{month:02}' for month in range(1,3)] 10 | 11 | # try: 12 | # exists_config = datasets.get_dataset_config_names('RealTimeData/code_alltime') 13 | # except datasets.exceptions.DatasetNotFoundError: 14 | # exists_config = [] 15 | # pass 16 | 17 | for month in all_months: 18 | # if month in exists_config: 19 | # continue 20 | code_paths = glob(f'/vol/research/lyc/github_dataset/{month}/*/*.json') 21 | all_codes = [] 22 | for code in code_paths: 23 | with open(code, 'r') as f: 24 | all_codes.append(json.load(f)) 25 | ds = datasets.Dataset.from_list(all_codes) 26 | print('='*20) 27 | print(f'Finished {month}') 28 | print(ds) 29 | ds.push_to_hub(f'RealTimeData/code_alltime', config_name = month, token=hf_token) 30 | print(f'Pushed {month} to hub') 31 | -------------------------------------------------------------------------------- /data/push_arxiv_dataset.py: 
-------------------------------------------------------------------------------- 1 | from glob import glob 2 | import datasets 3 | import os 4 | import json 5 | 6 | if __name__ == '__main__': 7 | hf_token = os.environ['HF_TOKEN'] 8 | all_months = [f'{year}-{month:02}' for year in range(2017, 2024) for month in range(1, 13)] 9 | 10 | # try: 11 | # exists_config = datasets.get_dataset_config_names('RealTimeData/arxiv_alltime') 12 | # except datasets.exceptions.DatasetNotFoundError: 13 | # exists_config = [] 14 | # pass 15 | 16 | # all months before 2021-02 (included) are already pushed, so remove these months from all_months 17 | all_months = all_months[all_months.index('2021-03'):] 18 | 19 | for month in all_months: 20 | # if month in exists_config: 21 | # continue 22 | paper_paths = glob(f'/vol/research/lyc/arxiv_alltime/{month}/*.json') 23 | all_papers = [] 24 | for paper in paper_paths: 25 | with open(paper, 'r') as f: 26 | all_papers.append(json.load(f)) 27 | ds = datasets.Dataset.from_list(all_papers) 28 | print('='*20) 29 | print(f'Finished {month}') 30 | print(ds) 31 | ds.push_to_hub(f'RealTimeData/arxiv_alltime', config_name = month, token=hf_token) 32 | print(f'Pushed {month} to hub') 33 | -------------------------------------------------------------------------------- /data/collect_bbc_months.py: -------------------------------------------------------------------------------- 1 | from glob import glob 2 | import json 3 | 4 | if __name__ == '__main__': 5 | # /vol/research/lyc/bbc/2023/0/articles.1 indicates day 1, month 0, year 2023 6 | docs = glob('/vol/research/lyc/bbc/*/*/articles.*') 7 | 8 | # now group by month 9 | times = {} 10 | for doc in docs: 11 | year = doc.split('/')[-3] 12 | month = doc.split('/')[-2] 13 | month = int(month)%12 + 1 14 | time = f'{year}-{month}' 15 | if time not in times: 16 | times[time] = [] 17 | 18 | with open(doc, 'r') as f: 19 | articles = json.load(f)['articles'] 20 | times[time].extend(articles) 21 | 22 | # now save 23 | # each month should save as a json dict 24 | # target path /vol/research/lyc/bbc/bbc_alltime/articles/2023-{month}.json 25 | for time in times: 26 | articles = times[time] 27 | month = time.split('-')[1] 28 | year = time.split('-')[0] 29 | # now turn list of dicts to dict of lists 30 | articles = { key: [article[key] for article in articles] for key in articles[0] } 31 | with open(f'/vol/research/lyc/bbc/bbc_alltime/articles/{year}-{month}.json', 'w') as f: 32 | json.dump(articles, f, ensure_ascii=False) 33 | print(f'Finished {year} {month}') -------------------------------------------------------------------------------- /data/push_math_dataset.py: -------------------------------------------------------------------------------- 1 | import datasets 2 | import json 3 | from glob import glob 4 | 5 | if __name__ == '__main__': 6 | 7 | files = glob('/vol/research/lyc/math/*.json') 8 | for file in files: 9 | with open(file, 'r') as f: 10 | data = json.load(f) 11 | 12 | time_stamp = file.split('/')[-1].split('.')[0] 13 | if time_stamp not in ['2024-01', '2024-02']: 14 | continue 15 | 16 | all_instances = [] 17 | for qa in data.values(): 18 | instance = {} 19 | instance['question'] = qa['title'] 20 | instance['question_id'] = qa['question_id'] 21 | instance['score'] = qa['score'] 22 | instance['link'] = qa['link'] 23 | instance['body'] = qa['body'] 24 | if 'answers' not in qa: 25 | continue 26 | instance['answers'] = [{'text': a['body'], 'score': a['score'], 'answer_id': a['answer_id']} for a in qa['answers']] 27 | 28 | verbolised = 
f"Question: {instance['question']}\n" 29 | for ans_index, ans in enumerate(instance['answers']): 30 | verbolised += f"Answer {ans_index + 1}: {ans['text']}\n" 31 | instance['verbolised'] = verbolised 32 | 33 | all_instances.append(instance) 34 | 35 | dataset = datasets.Dataset.from_list(all_instances) 36 | print(dataset) 37 | 38 | dataset.push_to_hub('RealTimeData/math_alltime', time_stamp) 39 | print(f"Pushed {time_stamp} to hub") -------------------------------------------------------------------------------- /data/doc_info.py: -------------------------------------------------------------------------------- 1 | import docx 2 | import re 3 | # from wikipedia import gpt3_self_info 4 | import sys 5 | 6 | def getText(filename): 7 | doc = docx.Document(filename) 8 | fullText = [] 9 | for para in doc.paragraphs: 10 | fullText.append(para.text) 11 | return '\n'.join(fullText) 12 | 13 | def beautify_text(text, num_words = 1000): 14 | text = re.sub(r'\n+', '\n', text) 15 | text = re.sub(r'\s+', ' ', text) 16 | 17 | # use first 1000 words 18 | text = ' '.join(text.split(' ')[:num_words]) 19 | return text 20 | 21 | def verbalise_docs(path = '/user/HS502/yl02706/LatestEval/data/mmlu', num_words = 1000): 22 | docs = ['q17-1.docx', 'q18-1.docx', 'q19-1.docx', 'q20-1.docx', 'q22-1.docx', 'q23-1.docx'] 23 | docs = [ path + '/' + doc for doc in docs ] 24 | doc_text = [ getText(doc) for doc in docs ] 25 | 26 | doc_text = [ beautify_text(doc, num_words=num_words) for doc in doc_text ] 27 | 28 | return { 29 | doc: [doc_string] for doc, doc_string in zip(docs, doc_text) 30 | } 31 | 32 | if __name__ == '__main__': 33 | docs = ['data/q17-1.docx', 'data/q18-1.docx', 'data/q19-1.docx', 'data/q20-1.docx', 'data/q22-1.docx', 'data/q23-1.docx'] 34 | doc_text = [ getText(doc) for doc in docs ] 35 | 36 | doc_text = [ beautify_text(doc) for doc in doc_text ] 37 | 38 | for doc, doc_string in zip(docs, doc_text): 39 | print('----------------------') 40 | print(doc) 41 | 42 | _, info = gpt3_self_info(doc_string) 43 | print(sum(info)/len(info)) -------------------------------------------------------------------------------- /figs/perplexity.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | 5 | # Data 6 | df = pd.read_csv('figs/perplexity.tsv', sep='\t').set_index('Model') 7 | df.drop('LatestEval', axis = 1, inplace=True) 8 | df.drop(['opt-350m', 'opt-1.6b'], axis=0, inplace=True) 9 | 10 | data_dict = df.to_dict() 11 | 12 | metrics_data = { 13 | 'QuAC': ('s', 'violet'), 14 | 'BoolQ': ('+', 'violet'), 15 | 'SQuAD': ('x', 'violet'), 16 | 'Wikitext': ('D', 'navy'), 17 | 'NewWiki': ('^', 'navy'), 18 | # 'LatestEval': ('o', 'gold') 19 | } 20 | 21 | fig, ax = plt.subplots(figsize=(8, 2.8), dpi=150) 22 | 23 | # Create a horizontal scatter plot for each metric 24 | for benchmark, numbers in data_dict.items(): 25 | marker_style, color = metrics_data[benchmark] 26 | models, perplexities = list(numbers.keys()), list(numbers.values()) 27 | # perplexities = np.exp(perplexities) # assuming metrics are in log scale 28 | plt.scatter(perplexities, models, label=benchmark, s=20, marker=marker_style, color=color) 29 | 30 | # Adjust plot 31 | plt.ylabel('Models', fontweight='bold') 32 | plt.xlabel('Perplexity', fontweight='bold') 33 | plt.legend( loc='upper right', bbox_to_anchor=(1.05, 1.0), ncol=1, fontsize=8) 34 | plt.grid(True, which='both', linestyle='--', linewidth=0.5) 35 | plt.xlim(left = 1.3) # Adjusting xlim to 
be slightly more than the max value for better visualization 36 | 37 | # plt.gca().xaxis.tick_top() 38 | # plt.gca().xaxis.set_label_position('top') 39 | 40 | ax.spines['top'].set_visible(False) 41 | ax.spines['right'].set_visible(False) 42 | 43 | plt.tight_layout() 44 | 45 | plt.show() -------------------------------------------------------------------------------- /figs/single.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | 5 | # Data 6 | df = pd.read_csv('figs/perplexity.tsv', sep='\t').set_index('Model') 7 | df.drop('LatestEval', axis = 1, inplace=True) 8 | df.drop(['opt-350m', 'opt-1.6b', 'gpt-3', 'llama-7b', 'llama-30b'], axis=0, inplace=True) 9 | 10 | data_dict = df.to_dict() 11 | 12 | metrics_data = { 13 | 'QuAC': ('s', 'violet'), 14 | 'BoolQ': ('+', 'violet'), 15 | 'SQuAD': ('x', 'violet'), 16 | 'memorised': ('D', 'navy'), 17 | 'clean': ('^', 'navy'), 18 | # 'LatestEval': ('o', 'gold') 19 | } 20 | 21 | fig, ax = plt.subplots(figsize=(4, 1), dpi=200) 22 | 23 | # Create a horizontal scatter plot for each metric 24 | for benchmark, numbers in data_dict.items(): 25 | marker_style, color = metrics_data[benchmark] 26 | models, perplexities = list(numbers.keys()), list(numbers.values()) 27 | # perplexities = np.exp(perplexities) # assuming metrics are in log scale 28 | plt.scatter(perplexities, ['perplexity'], label=benchmark, s=20, marker=marker_style, color=color) 29 | ax.annotate(benchmark, xy=(perplexities[-1], 'perplexity'), xytext=(0, 2), textcoords='offset points', va='bottom', fontsize=7, rotation=45) 30 | 31 | # Adjust plot 32 | plt.grid(True, linestyle='--', linewidth=0.5, axis='y') 33 | plt.xlim(left = 1.6) # Adjusting xlim to be slightly more than the max value for better visualization 34 | 35 | ax.xaxis.set_visible(False) 36 | 37 | ax.spines['top'].set_visible(False) 38 | ax.spines['right'].set_visible(False) 39 | ax.spines['left'].set_visible(False) 40 | ax.spines['bottom'].set_visible(False) 41 | 42 | plt.tight_layout() 43 | 44 | plt.show() -------------------------------------------------------------------------------- /figs/polar.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | df_score = pd.read_csv("figs/single_answer_score.tsv", sep='\t') 6 | 7 | # Calculate the number of categories 8 | categories = df_score['category'].unique() 9 | N = len(categories) 10 | 11 | # Calculate angle for each category 12 | theta = np.linspace(0.0, 2 * np.pi, N, endpoint=False) 13 | theta = np.append(theta, theta[0]) 14 | # Set up the polar axis 15 | fig, ax = plt.subplots(subplot_kw={'projection': 'polar'}, figsize=(8, 6), dpi=150) 16 | ax.set_facecolor("#f5f5f5") 17 | 18 | markers = { 19 | 'gpt-3.5-turbo': 'o', 20 | 'gpt-4': '+', 21 | 'llama-13b': 'x', 22 | 'llama-30b': 's', 23 | 'vicuna-13b': 'd', 24 | } 25 | 26 | # Loop through each model and plot on the polar axis 27 | for model in df_score['model'].unique(): 28 | values = df_score[df_score['model'] == model]['score'].values 29 | # Ensure the plot is closed by repeating the first value 30 | values = np.append(values, values[0]) 31 | ax.plot(theta, values, label=model, marker=markers[model], alpha=0.8, markersize=5) 32 | 33 | # Fill the area under the plot for better visualization (optional) 34 | # ax.fill(theta, values, 'b', alpha=0.1) 35 | 36 | # Set the y-ticks (radii) and x-ticks 
(categories) 37 | ax.set_xticks(theta[:-1]) 38 | ax.set_xticklabels(categories, fontsize=14) # Label x-ticks with categories 39 | 40 | ax.set_yticks([0, 2, 4, 6, 8, 10]) 41 | 42 | # Customize the grid and title 43 | ax.grid(True) 44 | 45 | # Display a legend 46 | ax.legend(loc='upper right', bbox_to_anchor=(1.32, 1.1), fontsize=14) 47 | 48 | # Save the figure 49 | fig.tight_layout() 50 | fig.savefig("fig.png", dpi=150) 51 | 52 | # Show the plot 53 | # plt.show() 54 | -------------------------------------------------------------------------------- /data/analyse_news.py: -------------------------------------------------------------------------------- 1 | from difflib import SequenceMatcher 2 | import datasets 3 | import multiprocessing 4 | 5 | def compare_texts(text1, text2): 6 | # Split the texts into words 7 | words1 = text1.split() 8 | words2 = text2.split() 9 | 10 | # Create a SequenceMatcher to compare the two word lists 11 | matcher = SequenceMatcher(None, words1, words2) 12 | 13 | # Calculate the similarity ratio 14 | similarity = matcher.ratio() 15 | 16 | # Calculate the difference ratio 17 | difference = 1 - similarity 18 | 19 | return difference 20 | 21 | def main(month, first_month_articles): 22 | ds = datasets.load_dataset('RealTimeData/bbc_news_alltime', month, split='train') 23 | # compare to first month 24 | content = '\n'.join(['\n'.join(article.splitlines()[:10]) for article in ds['content']]) 25 | difference = compare_texts(content, first_month_articles) 26 | 27 | print(f"Finished {month}, average difference: {difference}") 28 | return (month, difference) 29 | 30 | if __name__ == '__main__': 31 | 32 | months = [f'{year}-{month:02d}' for year in range(2017, 2024) for month in range(1, 13) if not (year == 2023 and month == 12)] 33 | first_month = datasets.load_dataset('RealTimeData/bbc_news_alltime', months[0], split='train') 34 | first_month_articles = '\n'.join(['\n'.join(article.splitlines()[:10]) for article in first_month['content']]) 35 | diffs = {} 36 | 37 | months = months[1:] 38 | # main got two arguments, month and first_month_articles 39 | # pool size 4 40 | with multiprocessing.Pool(8) as pool: 41 | for month, diff in pool.starmap(main, [(month, first_month_articles) for month in months]): 42 | diffs[month] = diff 43 | 44 | print(diffs) -------------------------------------------------------------------------------- /data/analyse_wikitext.py: -------------------------------------------------------------------------------- 1 | from difflib import SequenceMatcher 2 | import datasets 3 | import multiprocessing 4 | 5 | def compare_texts(text1, text2): 6 | # Split the texts into words 7 | words1 = text1.split() 8 | words2 = text2.split() 9 | 10 | # Create a SequenceMatcher to compare the two word lists 11 | matcher = SequenceMatcher(None, words1, words2) 12 | 13 | # Calculate the similarity ratio 14 | similarity = matcher.ratio() 15 | 16 | # Calculate the difference ratio 17 | difference = 1 - similarity 18 | 19 | return difference 20 | 21 | def main(month, first_month_articles): 22 | ds = datasets.load_dataset('RealTimeData/wikitext_alltime', month, split='train') 23 | # compare to first month 24 | diffs = [] 25 | for article in ds: 26 | title = article['title'] 27 | text = article['text'] 28 | if title not in first_month_articles: 29 | print(f"Article {title} not found in first month") 30 | continue 31 | first_month_text = first_month_articles[title] 32 | difference = compare_texts(text, first_month_text) 33 | diffs.append(difference) 34 | 35 | avg_diff = sum(diffs) 
/ len(diffs) 36 | print(f"Finished {month}, average difference: {avg_diff}") 37 | return (month, avg_diff) 38 | 39 | if __name__ == '__main__': 40 | 41 | months = [f'{year}-{month:02d}' for year in range(2017, 2024) for month in range(1, 13) if not (year == 2023 and month == 12)] 42 | first_month = datasets.load_dataset('RealTimeData/wikitext_alltime', months[0], split='train') 43 | first_month_articles = {title: article for title, article in zip(first_month['title'], first_month['text'])} 44 | diffs = {} 45 | 46 | months = months[1:] 47 | # main got two arguments, month and first_month_articles 48 | # pool size 4 49 | with multiprocessing.Pool(8) as pool: 50 | for month, diff in pool.starmap(main, [(month, first_month_articles) for month in months]): 51 | diffs[month] = diff 52 | 53 | print(diffs) -------------------------------------------------------------------------------- /figs/winrate.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import plotly.express as px 3 | 4 | def compute_pairwise_win_fraction(battles): 5 | # Times each model wins as Model A 6 | a_win_ptbl = pd.pivot_table( 7 | battles[battles['winner'] == "model_a"], 8 | index="model_a", columns="model_b", aggfunc="size", fill_value=0) 9 | 10 | # Table counting times each model wins as Model B 11 | b_win_ptbl = pd.pivot_table( 12 | battles[battles['winner'] == "model_b"], 13 | index="model_a", columns="model_b", aggfunc="size", fill_value=0) 14 | 15 | # Table counting number of A-B pairs 16 | num_battles_ptbl = pd.pivot_table(battles, 17 | index="model_a", columns="model_b", aggfunc="size", fill_value=0) 18 | 19 | # Computing the proportion of wins for each model as A and as B 20 | # against all other models 21 | row_beats_col_freq = ( 22 | (a_win_ptbl + b_win_ptbl.T) / 23 | (num_battles_ptbl + num_battles_ptbl.T) 24 | ) 25 | 26 | # Arrange ordering according to proprition of wins 27 | prop_wins = row_beats_col_freq.mean(axis=1).sort_values(ascending=False) 28 | model_names = list(prop_wins.keys()) 29 | row_beats_col = row_beats_col_freq.loc[model_names, model_names] 30 | return row_beats_col 31 | 32 | def visualize_pairwise_win_fraction(battles, title): 33 | row_beats_col = compute_pairwise_win_fraction(battles) 34 | fig = px.imshow(row_beats_col, color_continuous_scale='RdBu', 35 | text_auto=".2f", title=title) 36 | fig.update_layout( 37 | # xaxis_title=" Model B: Loser", 38 | # yaxis_title="Model A: Winner", 39 | xaxis_title=None, 40 | yaxis_title=None, 41 | xaxis_side="top", height=700, width=600, 42 | title_y=0.07, title_x=0.5) 43 | fig.update_traces(hovertemplate= 44 | "Model A: %{y}
<br>Model B: %{x}<br>
Fraction of A Wins: %{z}") 45 | 46 | return fig 47 | 48 | df = pd.read_csv("figs/winrate2.tsv", sep='\t') 49 | df = df[df['winner'].isin(['model_a', 'model_b'])] 50 | df = df[df['model_a']!=df['model_b']] 51 | 52 | fig = visualize_pairwise_win_fraction(df, 53 | title = "Pair-wise Win Rate") 54 | 55 | fig.show() 56 | 57 | fig.update_layout( 58 | font=dict( 59 | size=18, 60 | ), 61 | ) 62 | fig.write_image("fig.png", width=700, height=650, scale=2) -------------------------------------------------------------------------------- /data/wiki_dataset.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import traceback 3 | import mwparserfromhell 4 | import datetime 5 | import os 6 | 7 | import sys 8 | import json 9 | import time 10 | 11 | from tqdm import tqdm 12 | 13 | WIKI_API_ENDPOINT = "https://en.wikipedia.org/w/api.php" 14 | 15 | def parse_to_plain_text(wikitext): 16 | parsed = mwparserfromhell.parse(wikitext) 17 | return parsed.strip_code() 18 | 19 | def fetch_content(title, date=None): 20 | params = { 21 | "action": "query", 22 | "format": "json", 23 | "titles": title, 24 | "prop": "revisions", 25 | "rvprop": "content", 26 | "rvlimit": "1", 27 | } 28 | if date: params["rvstart"] = date 29 | try: 30 | response = requests.get(WIKI_API_ENDPOINT, params=params) 31 | response.raise_for_status() # Will raise an error if the HTTP request returned an unsuccessful status code 32 | data = response.json() 33 | if 'error' in data: 34 | print(f"Error fetching content for {title}: {data['error']['info']}") 35 | return None 36 | 37 | page = next(iter(data['query']['pages'].values())) 38 | if 'revisions' not in page: 39 | print(f"No revisions found for {title}") 40 | return None 41 | content = page['revisions'][0]['*'] 42 | 43 | # Check if the content is a redirect and skip if true 44 | if content.lower().startswith("#redirect"): 45 | print(f"{title} is a redirect page.") 46 | return None 47 | text = parse_to_plain_text(content) 48 | if len(text.split(' ')) < 300: 49 | print(f"{title} is less than 300 words.") 50 | return None 51 | 52 | return { 53 | "title": page['title'], 54 | "text": text, 55 | "pageid": page['pageid'], 56 | }, content 57 | 58 | except Exception as e: 59 | print(f"An error occurred while fetching content for {title}: {str(e)}") 60 | traceback.print_exc() # This will print the full traceback 61 | 62 | return None 63 | 64 | if __name__ == "__main__": 65 | year, month, save_path = sys.argv[1:] 66 | month = int(month)%12 + 1 67 | 68 | start_time = datetime.datetime(int(year), month, 1) 69 | end_time = start_time + datetime.timedelta(days=28) 70 | 71 | print(f'Fetching wiki articles from {start_time.isoformat()} to {end_time.isoformat()}') 72 | 73 | with open('/user/HS502/yl02706/LatestEval/data/squad_wiki_title.text') as f: 74 | titles = [line.strip() for line in f.readlines()] 75 | historical_contents = [fetch_content(title, end_time) for title in tqdm(titles)] 76 | historical_contents = [content[0] for content in historical_contents if content is not None] 77 | historical_to_save = {title: content for title, content in zip(titles, historical_contents)} 78 | 79 | save_file = os.path.join(save_path, f'{year}-{month}.json') 80 | with open(save_file, 'w') as f: 81 | json.dump(historical_to_save, f, ensure_ascii=False) 82 | print(f'Saved {len(historical_contents)} articles to {save_file}') -------------------------------------------------------------------------------- /.github/workflows/weekly_downloader.yml: 
-------------------------------------------------------------------------------- 1 | name: Weekly Downloader 2 | 3 | on: 4 | schedule: 5 | # This cron job initiates the action at 00:00 every Sunday 6 | - cron: '0 0 * * 1' 7 | 8 | jobs: 9 | wiki_downloader: 10 | runs-on: ubuntu-latest 11 | 12 | # Define environment variables for all steps in this job 13 | env: 14 | HF_TOKEN: ${{ secrets.HF_TOKEN }} 15 | Github_Token: ${{ secrets.gh_token }} 16 | 17 | steps: 18 | - name: Checkout repository 19 | uses: actions/checkout@v2 20 | 21 | - name: Set up Python 22 | uses: actions/setup-python@v2 23 | with: 24 | python-version: '3.10' # Choose your desired Python version 25 | 26 | - name: Install dependencies 27 | run: | 28 | python -m pip install --upgrade pip 29 | pip install -r requirements.txt 30 | 31 | - name: Run script 32 | run: python wikitext_downloader.py 33 | 34 | arxiv_downloader: 35 | runs-on: ubuntu-latest 36 | 37 | # Define environment variables for all steps in this job 38 | env: 39 | HF_TOKEN: ${{ secrets.HF_TOKEN }} 40 | Github_Token: ${{ secrets.Github_Token }} 41 | 42 | steps: 43 | - name: Checkout repository 44 | uses: actions/checkout@v2 45 | 46 | - name: Set up Python 47 | uses: actions/setup-python@v2 48 | with: 49 | python-version: '3.10' # Choose your desired Python version 50 | 51 | - name: Install dependencies 52 | run: | 53 | python -m pip install --upgrade pip 54 | pip install -r requirements.txt 55 | 56 | - name: Run script 57 | run: python arxiv_downloader.py 58 | 59 | bbc_downloader: 60 | runs-on: ubuntu-latest 61 | 62 | # Define environment variables for all steps in this job 63 | env: 64 | HF_TOKEN: ${{ secrets.HF_TOKEN }} 65 | Github_Token: ${{ secrets.Github_Token }} 66 | 67 | steps: 68 | - name: Checkout repository 69 | uses: actions/checkout@v2 70 | 71 | - name: Set up Python 72 | uses: actions/setup-python@v2 73 | with: 74 | python-version: '3.10' # Choose your desired Python version 75 | 76 | - name: Install dependencies 77 | run: | 78 | python -m pip install --upgrade pip 79 | pip install -r requirements.txt 80 | 81 | - name: Run script 82 | run: python bbc_downloader.py 83 | 84 | github_downloader: 85 | runs-on: ubuntu-latest 86 | 87 | # Define environment variables for all steps in this job 88 | env: 89 | HF_TOKEN: ${{ secrets.HF_TOKEN }} 90 | Github_Token: ${{ secrets.Github_Token }} 91 | 92 | steps: 93 | - name: Checkout repository 94 | uses: actions/checkout@v2 95 | 96 | - name: Set up Python 97 | uses: actions/setup-python@v2 98 | with: 99 | python-version: '3.10' # Choose your desired Python version 100 | 101 | - name: Install dependencies 102 | run: | 103 | python -m pip install --upgrade pip 104 | pip install -r requirements.txt 105 | 106 | - name: Run script 107 | run: python github_downloader.py 108 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 |

2 | <img src="figs/logo.png" alt="Logo of LatestEval">
3 |
4 | 5 | # "Uncheatable" LLMs Evaluation - LatestEval 6 | 7 | Humans receive new test questions in every exam, but LLMs? They have been evaluated with the same benchmarks for far too long. Why not assess LLMs with fresh tests, just like we test our students? In this project, we introduce LatestEval, which automatically constructs language model benchmarks from the latest materials (e.g., arXiv, BBC, Wikipedia) to prevent "cheating" and data contamination. 8 | 9 | **News!!** 10 | 11 | - **15 Dec, 2023** - This project was accepted to the main track of **AAAI 2024** :partying_face:! Check out the paper here: :point_right: [Dynamic Test Construction with Latest Materials](https://arxiv.org/abs/2312.12343). 12 | 13 | # Key Features 14 | 15 | 1. We maintain a QA benchmark that is updated every half month using the latest online resources (created within the past half month). This approach aims to avoid 1) LLMs being trained on the test set (cheating); and 2) the unintentional inclusion of test questions in the training data (data contamination). 16 | 2. We analyzed real human-AI conversations to ensure the automated benchmark aligns well with real-life applications (see the [paper](https://arxiv.org/abs/2312.12343) for more details). 17 | 18 | 19 | # The Benchmark 20 | 21 | Access the latest benchmark directly at the [Huggingface Hub](https://huggingface.co/LatestEval)! 22 | 23 | - Latest benchmark of GitHub: [HF Hub](https://huggingface.co/datasets/LatestEval/github-latest) 24 | - Latest benchmark of arXiv: [HF Hub](https://huggingface.co/datasets/LatestEval/arxiv-latest) 25 | - Latest benchmark of BBC: [HF Hub](https://huggingface.co/datasets/LatestEval/bbc-latest) 26 | - The full benchmark with all sources: [HF Hub](https://huggingface.co/datasets/LatestEval/full-latest) 27 | 28 | The benchmarks are built from the latest materials; you can find the raw materials/documents at the [Huggingface Hub](https://huggingface.co/RealTimeData). A minimal loading sketch is included below, after the Issue section. 29 | 30 | # Evaluate your LLM on LatestEval 31 | 32 | We will add LatestEval to [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) and [OpenCompass](https://github.com/open-compass/opencompass). Stay tuned. 33 | 34 | # Create benchmarks with your own data 35 | 36 | 1. Put your documents as `.txt` files under `./`. 37 | 2. Set your OpenAI key: 38 | 39 | ``` 40 | export OPENAI_API_KEY=<your_openai_api_key> 41 | ``` 42 | 43 | 3. Simply run: 44 | 45 | ``` 46 | python data_processor.py --source customized --file_path <path_to_your_txt_files> --num_docs 100 47 | ``` 48 | 49 | If you want to reproduce LatestEval on arXiv, BBC, or GitHub: 50 | 51 | ``` 52 | python data_processor.py --source arxiv --num_docs 100 53 | ``` 54 | 55 | # Issue 56 | 57 | Open an issue if you have any problems or would like to discuss.
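For a quick start, the following is a minimal loading sketch. It assumes the `datasets` library is installed; the dataset name and field names follow `push_to_hf_hub.py` in this repository.

```
# Minimal sketch: load the combined LatestEval benchmark from the Huggingface Hub.
from datasets import load_dataset

benchmark = load_dataset("LatestEval/full-latest", split="train")
sample = benchmark[0]
# Each sample pairs a passage (containing a placeholder where key information
# was removed) with a query asking for that information, plus the reference answer.
print(sample["source"], sample["query_category"])
print(sample["passage"])
print(sample["query"], "->", sample["answer"])
```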
58 | 59 | # Citation 60 | 61 | If you find this project useful, consider cite this project: 62 | 63 | ``` 64 | @misc{li2023avoiding, 65 | title={Avoiding Data Contamination in Language Model Evaluation: Dynamic Test Construction with Latest Materials}, 66 | author={Yucheng Li and Frank Guerin and Chenghua Lin}, 67 | year={2023}, 68 | eprint={2312.12343}, 69 | archivePrefix={arXiv}, 70 | primaryClass={cs.CL} 71 | } 72 | ``` -------------------------------------------------------------------------------- /data/monthly_updater/monthly_wikitext.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import traceback 3 | import mwparserfromhell 4 | import datetime 5 | import os 6 | import datasets 7 | 8 | import sys 9 | import json 10 | import time 11 | 12 | from tqdm import tqdm 13 | 14 | WIKI_API_ENDPOINT = "https://en.wikipedia.org/w/api.php" 15 | 16 | def parse_to_plain_text(wikitext): 17 | parsed = mwparserfromhell.parse(wikitext) 18 | return parsed.strip_code() 19 | 20 | def fetch_content(title, date=None): 21 | params = { 22 | "action": "query", 23 | "format": "json", 24 | "titles": title, 25 | "prop": "revisions", 26 | "rvprop": "content", 27 | "rvlimit": "1", 28 | } 29 | if date: params["rvstart"] = date 30 | try: 31 | response = requests.get(WIKI_API_ENDPOINT, params=params) 32 | response.raise_for_status() # Will raise an error if the HTTP request returned an unsuccessful status code 33 | data = response.json() 34 | if 'error' in data: 35 | print(f"Error fetching content for {title}: {data['error']['info']}") 36 | return None 37 | 38 | page = next(iter(data['query']['pages'].values())) 39 | if 'revisions' not in page: 40 | print(f"No revisions found for {title}") 41 | return None 42 | content = page['revisions'][0]['*'] 43 | 44 | # Check if the content is a redirect and skip if true 45 | if content.lower().startswith("#redirect"): 46 | print(f"{title} is a redirect page.") 47 | return None 48 | text = parse_to_plain_text(content) 49 | if len(text.split(' ')) < 300: 50 | print(f"{title} is less than 300 words.") 51 | return None 52 | 53 | return { 54 | "title": page['title'], 55 | "text": text, 56 | "pageid": page['pageid'], 57 | } 58 | 59 | except Exception as e: 60 | print(f"An error occurred while fetching content for {title}: {str(e)}") 61 | traceback.print_exc() # This will print the full traceback 62 | 63 | return None 64 | 65 | if __name__ == "__main__": 66 | today = datetime.datetime.today() 67 | year = today.year 68 | month = today.month 69 | 70 | hf_token = os.environ['HF_TOKEN'] 71 | 72 | start_time = datetime.datetime(year, month, 1) 73 | end_time = today 74 | 75 | print(f'Fetching wiki articles from {start_time.isoformat()} to {end_time.isoformat()}') 76 | 77 | with open('./data/squad_wiki_title.text') as f: 78 | titles = [line.strip() for line in f.readlines()] 79 | historical_contents = [fetch_content(title, end_time) for title in tqdm(titles)] 80 | historical_contents = [content for content in historical_contents if content is not None] 81 | historical_to_save = {title: content for title, content in zip(titles, historical_contents)} 82 | 83 | save_file = f'{year}-{month}.json' 84 | with open(save_file, 'w') as f: 85 | json.dump(historical_to_save, f, ensure_ascii=False) 86 | print(f'Saved {len(historical_contents)} articles to {save_file}') 87 | 88 | from huggingface_hub import hf_hub_download, RepoCard, upload_file 89 | 90 | with open(save_file) as f: 91 | data = json.load(f) 92 | 93 | all_articles = [] 94 | for 
title, article in data.items(): 95 | article['time'] = f'{year}-{month:02d}' 96 | all_articles.append(article) 97 | 98 | ds = datasets.Dataset.from_list(all_articles) 99 | ds.push_to_hub(f"RealTimeData/wikitext_alltime", config_name=f'{year}-{month:02d}', token=hf_token) -------------------------------------------------------------------------------- /data/wikitext_alltime.py: -------------------------------------------------------------------------------- 1 | import datasets 2 | import json 3 | 4 | dl = datasets.DownloadManager() 5 | configs_file = dl.download('https://huggingface.co/datasets/RealTimeData/wikitext_alltime/raw/main/configs.txt') 6 | 7 | with open(configs_file, encoding="utf-8") as f: 8 | _TIMES = f.read().splitlines() 9 | 10 | _TIMES += ['all'] 11 | 12 | _CITATION = """\ 13 | @misc{li2023estimating, 14 | title={Estimating Contamination via Perplexity: Quantifying Memorisation in Language Model Evaluation}, 15 | author={Yucheng Li}, 16 | year={2023}, 17 | eprint={2309.10677}, 18 | archivePrefix={arXiv}, 19 | primaryClass={cs.CL} 20 | } 21 | """ 22 | 23 | _DESCRIPTION = """\ 24 | This dataset contains Wikipedia articles of 419 selected pages every month from 2017-1 to current. The articles are arraged by month. Access the specific month by using the format "YYYY-MM" as config. Such as load_dataset("RealTimeData/wikitext_alltime", "2021-1"). 25 | """ 26 | 27 | _HOMEPAGE = "https://github.com/liyucheng09/Contamination_Detector" 28 | 29 | class Wikitext_alltimes(datasets.GeneratorBasedBuilder): 30 | 31 | BUILDER_CONFIGS = [ 32 | datasets.BuilderConfig( 33 | name=time, version=datasets.Version("1.0.0"), description=f"419 selected wikipedia articles edited in the priod of {time}" 34 | ) 35 | for time in _TIMES 36 | ] 37 | 38 | def _info(self): 39 | features = datasets.Features( 40 | { 41 | "title": datasets.Value("string"), 42 | "pageid": datasets.Value("int64"), 43 | "text": datasets.Value("string"), 44 | "time": datasets.Value("string"), 45 | } 46 | ) 47 | return datasets.DatasetInfo( 48 | description=_DESCRIPTION, 49 | features=features, 50 | homepage=_HOMEPAGE, 51 | citation=_CITATION, 52 | ) 53 | 54 | def _split_generators(self, dl_manager): 55 | """Returns SplitGenerators.""" 56 | if self.config.name == "all": 57 | times = _TIMES[:-1] 58 | files = dl_manager.download([f"wiki/{time}.json" for time in _TIMES ]) 59 | return [ 60 | datasets.SplitGenerator( 61 | name=datasets.Split.TRAIN, 62 | gen_kwargs={"files": files}, 63 | ) 64 | ] 65 | else: 66 | time = self.config.name 67 | _URL = f"wiki/{time}.json" 68 | file = dl_manager.download(_URL) 69 | return [ 70 | datasets.SplitGenerator( 71 | name=datasets.Split.TRAIN, 72 | gen_kwargs={"files": file}, 73 | ) 74 | ] 75 | 76 | def _generate_examples(self, files): 77 | """Yields examples.""" 78 | if self.config.name == "all": 79 | assert isinstance(files, list) 80 | for file in files: 81 | time = file.strip('.json') 82 | with open(file, encoding="utf-8") as f: 83 | data = json.load(f) 84 | for title, article in data.items(): 85 | yield f'{time}-{title}', { 86 | "title": article['title'], 87 | "pageid": article['pageid'], 88 | "text": article['text'], 89 | "time": time, 90 | } 91 | else: 92 | assert isinstance(files, str) 93 | time = self.config.name 94 | with open(files, encoding="utf-8") as f: 95 | data = json.load(f) 96 | for title, article in data.items(): 97 | yield f'{time}-{title}', { 98 | "title": article['title'], 99 | "pageid": article['pageid'], 100 | "text": article['text'], 101 | "time": time, 102 | } 
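# --- Usage sketch (illustrative addition; assumes the `datasets` library) ---
# The builder above is consumed through `datasets.load_dataset` with a month
# config such as "2021-1", or the special "all" config for every month. Newer
# releases of `datasets` may additionally require trust_remote_code=True for
# script-based datasets such as this one.
if __name__ == "__main__":
    one_month = datasets.load_dataset("RealTimeData/wikitext_alltime", "2021-1", split="train")
    print(one_month)                          # fields: title, pageid, text, time
    print(one_month[0]["title"], one_month[0]["time"])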
-------------------------------------------------------------------------------- /push_to_hf_hub.py: -------------------------------------------------------------------------------- 1 | # Merge with RealTimeData/ and push to Huggingface Hub 2 | 3 | from glob import glob 4 | import datasets 5 | import json 6 | from huggingface_hub import RepoCard, create_branch, create_tag 7 | from data_processor import ArxivEval, BBCNewsEval, GithubEval 8 | import datetime 9 | 10 | if __name__ == "__main__": 11 | # Load the dataset 12 | # for example, benchmarks/latest/qa_pairs_arxiv_2023-46.json 13 | 14 | today = datetime.date.today() 15 | 16 | RepoCardText = """ 17 | # LatestEval for {source} 18 | 19 | This benchmark was created with at {year} week {week} with the latest data from {source}. 20 | 21 | check more details at our [github page](https://github.com/liyucheng09/LatestEval).""" 22 | 23 | source2ds = {} 24 | latest_ds = [] 25 | 26 | for file in glob('benchmarks/2023-51/*.json'): 27 | with open(file, 'r') as f: 28 | data = json.load(f) 29 | 30 | if 'arxiv' in file: 31 | source = 'arxiv' 32 | docs = ArxivEval('RealTimeData/arxiv_latest', num_docs='all').docs 33 | elif 'bbc' in file: 34 | source = 'bbc' 35 | docs = BBCNewsEval('RealTimeData/bbc_latest', num_docs='all').docs 36 | elif 'github' in file: 37 | source = 'github' 38 | docs = GithubEval('RealTimeData/github_latest', num_docs='all').docs 39 | 40 | source2ds[source] = data 41 | 42 | time_stamp = file.split('_')[-1].split('.')[0] 43 | year = time_stamp.split('-')[0] 44 | week = time_stamp.split('-')[1] 45 | 46 | test_samples = [] 47 | for doc in data: 48 | doc_id = doc['id'][len(source)+1:] 49 | sents = None 50 | for d in docs: 51 | if d.entry_id == doc_id: 52 | sents = d.original_sentences 53 | assert sents is not None, f'{doc_id} not found in {source} data' 54 | 55 | if isinstance(doc['response'], str): 56 | try: 57 | doc['response'] = eval(doc['response']) 58 | except: 59 | print(doc['response']) 60 | continue 61 | 62 | for example in doc['response']: 63 | sent_index = example['sentence_index'] 64 | passage = '' 65 | for sent_i, sent in enumerate(sents): 66 | if sent_i == sent_index: 67 | passage += example['place_holder'] + ' ' 68 | else: 69 | passage += sent + ' ' 70 | test_samples.append({ 71 | 'source': source, 72 | 'doc_id': doc_id, 73 | 'passage': passage, 74 | 'query': example['query'], 75 | 'answer': example['key_information'], 76 | 'query_category': example['answer_type'], 77 | 'sent_index': sent_index 78 | }) 79 | 80 | latest_ds.extend(test_samples) 81 | 82 | # dataset = datasets.Dataset.from_list(test_samples) 83 | # dataset.push_to_hub(f'LatestEval/{source}-latest', branch='main') 84 | # dataset.push_to_hub(f'LatestEval/{source}-{year}-week{week}') 85 | 86 | # card = RepoCard(RepoCardText.format(source=source, year=year, week=week)) 87 | # card.push_to_hub(f'LatestEval/{source}-latest', repo_type='dataset') 88 | # card.push_to_hub(f'LatestEval/{source}-{year}-week{week}', repo_type='dataset') 89 | 90 | # all three sources together 91 | # flatten the data and add source column 92 | 93 | dataset = datasets.Dataset.from_list(latest_ds) 94 | dataset.push_to_hub(f'LatestEval/full-latest', branch='main') 95 | dataset.push_to_hub(f'LatestEval/full-{year}-week{week}') 96 | 97 | card = RepoCard(RepoCardText.format(source='all', year=year, week=week)) 98 | card.push_to_hub(f'LatestEval/full-latest', repo_type='dataset') 99 | card.push_to_hub(f'LatestEval/full-{year}-week{week}', repo_type='dataset') 100 | 
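# --- Consumption sketch (illustrative addition; not called anywhere above) ---
# Shows how the benchmark pushed by this script is meant to be used downstream.
# `answer_fn` is a hypothetical callable wrapping whatever LLM you want to
# evaluate; it is an assumption, not something provided by this repository.
def preview_benchmark(answer_fn, n=3, repo='LatestEval/full-latest'):
    bench = datasets.load_dataset(repo, split='train')
    for sample in bench.select(range(n)):
        # The passage contains a placeholder where key information was removed;
        # the query asks the model to recover exactly that information.
        prompt = f"{sample['passage']}\n\nQuestion: {sample['query']}\nAnswer:"
        prediction = answer_fn(prompt)
        print(prediction, '| reference:', sample['answer'])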
-------------------------------------------------------------------------------- /figs/compare_strings.json: -------------------------------------------------------------------------------- 1 | { 2 | "prompts": [ 3 | "Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in", 4 | "Persian (/ˈpɜːrʒən, -ʃən/), also known by its endonym Farsi (فارسی fārsi (fɒːɾˈsiː) ( listen)), is one of the Western Iranian languages within the Indo-Iranian branch of the Indo-European language family. It is primarily", 5 | "The number of Malayalam speakers in Lakshadweep is 51,100, which is only 0.15% of the total number, but is as much as about 84% of the population of Lakshadweep. In all, Malayalis made up 3.22% of the total Indian population in 2001. Of the total 33,066,", 6 | "The British have a love-hate relationship with the NHS. According to researchers at the King's Fund, the public gave the NHS its worst rating since records began 40 years ago. Just 29% said they were satisfied with the NHS in 2022. And yet we still love it. A whopping 90% of" 7 | ], 8 | "predictions": [ 9 | " the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles \"Crazy in Love\" and \"Baby Boy\".", 10 | " spoken in Iran, Afghanistan (officially known as Dari since 1958),[3] and Tajikistan (officially known as Tajiki since the Soviet era),[4] and some other regions which historically were Persianate societies and considered part of Greater Iran. It is written right to left in the Persian alphabet, a modified variant of the Arabic script.", 11 | " 392 Malayalam speakers in India in 2001, 33,015,420 spoke the standard dialects, 19,643 spoke the Yerava dialect and 31,329 spoke non-standard regional variations like Eranadan. As per the 1991 census data, 28.85% of all Malayalam speakers in India spoke a second language and 19.64% of the total knew three or more languages.", 12 | " of Britons believe that the NHS is a crucial institution that should be preserved. The juxtaposition between dissatisfaction with the current state and overall reverence for the institution speaks volumes about the complex relationship the British public has with their healthcare system." 13 | ], 14 | "references": [ 15 | " the late 1990s as lead singer of the R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Her debut solo album, Dangerously in Love (2003), debuted at number one on the US Billboard 200 chart and earned her five Grammy Awards, solidifying her as a solo artist as well. Throughout her career, Beyoncé has sold over 100 million records worldwide as a solo artist and a further 60 million records with Destiny's Child, making her one of the best-selling music artists of all time. 
She has won 23 Grammy Awards and is the most", 16 | " spoken in Iran, Afghanistan (officially known as Dari since 1958), and Tajikistan (officially known as Tajiki since the Soviet era), and some other regions which historically were Persianate societies and considered part of Greater Iran. It is written in the Persian alphabet, a modified variant of the Arabic script, which itself evolved from the Aramaic alphabet.", 17 | " 392 Malayalam speakers in India in 2001, 33,015,420 spoke the standard dialects, 19,643 spoke the Yerava dialect and 31,329 spoke non-standard regional variations like Eranadan. As per the 1991 census data, 28.85% of all Malayalam speakers in India spoke a second language and 19.64% of the total knew three or more languages.", 18 | "of the public agrees the service should be free and available to everyone. But with more than seven million people on waiting lists, almost everyone knows someone who isn't getting the care they need. As the NHS approaches its 75th anniversary, politicians are falling over themselves to praise the service." 19 | ] 20 | } -------------------------------------------------------------------------------- /.github/workflows/monthly_updater.yml: -------------------------------------------------------------------------------- 1 | name: Monthly Updater 2 | 3 | on: 4 | schedule: 5 | # This cron job initiates the action at 00:00 on the 28th of every month 6 | - cron: '0 0 28 * *' 7 | 8 | workflow_dispatch: 9 | 10 | jobs: 11 | wiki_downloader: 12 | runs-on: ubuntu-latest 13 | 14 | # Define environment variables for all steps in this job 15 | env: 16 | HF_TOKEN: ${{ secrets.HF_TOKEN }} 17 | Github_Token: ${{ secrets.gh_token }} 18 | Overflow_Token: ${{ secrets.overflow_token }} 19 | 20 | steps: 21 | - name: Checkout repository 22 | uses: actions/checkout@v2 23 | 24 | - name: Set up Python 25 | uses: actions/setup-python@v2 26 | with: 27 | python-version: '3.10' # Choose your desired Python version 28 | 29 | - name: Install dependencies 30 | run: | 31 | python -m pip install --upgrade pip 32 | pip install -r requirements.txt 33 | 34 | - name: Run script 35 | run: python data/monthly_updater/monthly_wikitext.py 36 | 37 | arxiv_downloader: 38 | runs-on: ubuntu-latest 39 | 40 | # Define environment variables for all steps in this job 41 | env: 42 | HF_TOKEN: ${{ secrets.HF_TOKEN }} 43 | Github_Token: ${{ secrets.gh_token }} 44 | Overflow_Token: ${{ secrets.overflow_token }} 45 | 46 | steps: 47 | - name: Checkout repository 48 | uses: actions/checkout@v2 49 | 50 | - name: Set up Python 51 | uses: actions/setup-python@v2 52 | with: 53 | python-version: '3.10' # Choose your desired Python version 54 | 55 | - name: Install dependencies 56 | run: | 57 | python -m pip install --upgrade pip 58 | pip install -r requirements.txt 59 | 60 | - name: Run script 61 | run: python data/monthly_updater/monthly_arxiv.py 62 | 63 | bbc_downloader: 64 | runs-on: ubuntu-latest 65 | 66 | # Define environment variables for all steps in this job 67 | env: 68 | HF_TOKEN: ${{ secrets.HF_TOKEN }} 69 | Github_Token: ${{ secrets.gh_token }} 70 | Overflow_Token: ${{ secrets.overflow_token }} 71 | 72 | steps: 73 | - name: Checkout repository 74 | uses: actions/checkout@v2 75 | 76 | - name: Set up Python 77 | uses: actions/setup-python@v2 78 | with: 79 | python-version: '3.10' # Choose your desired Python version 80 | 81 | - name: Install dependencies 82 | run: | 83 | python -m pip install --upgrade pip 84 | pip install -r requirements.txt 85 | 86 | - name: Run script 87 | run: python 
data/monthly_updater/monthly_bbc_news.py 88 | 89 | math_downloader: 90 | runs-on: ubuntu-latest 91 | 92 | # Define environment variables for all steps in this job 93 | env: 94 | HF_TOKEN: ${{ secrets.HF_TOKEN }} 95 | Github_Token: ${{ secrets.gh_token }} 96 | Overflow_API_KEY: ${{ secrets.overflow_token }} 97 | 98 | steps: 99 | - name: Checkout repository 100 | uses: actions/checkout@v2 101 | 102 | - name: Set up Python 103 | uses: actions/setup-python@v2 104 | with: 105 | python-version: '3.10' # Choose your desired Python version 106 | 107 | - name: Install dependencies 108 | run: | 109 | python -m pip install --upgrade pip 110 | pip install -r requirements.txt 111 | 112 | - name: Run script 113 | run: python data/monthly_updater/monthly_math.py 114 | 115 | code_downloader: 116 | runs-on: ubuntu-latest 117 | 118 | # Define environment variables for all steps in this job 119 | env: 120 | HF_TOKEN: ${{ secrets.HF_TOKEN }} 121 | Github_Token: ${{ secrets.gh_token }} 122 | Overflow_Token: ${{ secrets.overflow_token }} 123 | 124 | steps: 125 | - name: Checkout repository 126 | uses: actions/checkout@v2 127 | 128 | - name: Set up Python 129 | uses: actions/setup-python@v2 130 | with: 131 | python-version: '3.10' # Choose your desired Python version 132 | 133 | - name: Install dependencies 134 | run: | 135 | python -m pip install --upgrade pip 136 | pip install -r requirements.txt 137 | 138 | - name: Run script 139 | run: python data/monthly_updater/monthly_code.py 140 | -------------------------------------------------------------------------------- /github_downloader.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import base64 3 | from datetime import datetime, timedelta 4 | import os 5 | import json 6 | from tqdm import tqdm 7 | from huggingface_hub import create_branch, create_tag, RepoCard 8 | import traceback 9 | 10 | github_token = os.environ['Github_Token'] 11 | headers = {'Authorization': f'token {github_token}'} 12 | 13 | hf_token = os.environ['HF_TOKEN'] 14 | 15 | today = datetime.now() 16 | start_date = today - timedelta(weeks=2) 17 | start_date_str = start_date.strftime("%Y-%m-%d") 18 | 19 | end_date = start_date + timedelta(days=7) 20 | end_date_str = end_date.strftime("%Y-%m-%d") 21 | 22 | out_path = f"dataset/github/{start_date_str}" 23 | if not os.path.exists(out_path): 24 | os.makedirs(out_path) 25 | 26 | def load_checkpoint(): 27 | try: 28 | with open(f'{start_date_str}_checkpoint.json', 'r') as f: 29 | checkpoint = json.load(f) 30 | return checkpoint.get('page', 1), checkpoint.get('last_repo_index', 0) 31 | except FileNotFoundError: 32 | return 1, 0 33 | 34 | def save_checkpoint(page, last_repo_index): 35 | with open(f'{start_date_str}_checkpoint.json', 'w') as f: 36 | json.dump({'page': page, 'last_repo_index': last_repo_index}, f) 37 | 38 | page, last_repo_index = 1, 0 39 | # page, last_repo_index = load_checkpoint() 40 | all_readmes = [] 41 | 42 | while True: 43 | response = requests.get(f'https://api.github.com/search/repositories?q=created:{start_date_str}..{end_date_str}&sort=stars&order=desc&per_page=100&page={page}', headers=headers) 44 | data = response.json() 45 | 46 | if 'items' not in data: 47 | break 48 | if not data['items']: 49 | break 50 | for repo in tqdm(data['items'][last_repo_index:]): 51 | owner = repo['owner']['login'] 52 | repo_name = repo['name'] 53 | 54 | full_name = repo['full_name'] 55 | url = repo['html_url'] 56 | description = repo['description'] 57 | stars = repo['stargazers_count'] 
58 | forks = repo['forks_count'] 59 | 60 | response = requests.get(f'https://api.github.com/repos/{owner}/{repo_name}/readme', headers=headers) 61 | readme_data = response.json() 62 | 63 | if 'content' in readme_data: 64 | readme_content = base64.b64decode(readme_data['content']).decode('utf-8') 65 | # print(f"Repository {repo_name} README content:") 66 | # print(readme_content) 67 | with open(f"{out_path}/{full_name.replace('/', '_')}_README.md", 'w') as f: 68 | readme_obj = {'full_name': full_name, 'url': url, 'description': description, 'readme': readme_content, 'stars': stars, 'forks': forks} 69 | all_readmes.append(readme_obj) 70 | json.dump(readme_obj, f, ensure_ascii=False) 71 | else: 72 | print(f"Repository {repo_name} doesn't have a README.") 73 | 74 | page += 1 75 | 76 | import datasets 77 | 78 | all_readmes = { k: [v[k] for v in all_readmes] for k in all_readmes[0].keys() } 79 | ds = datasets.Dataset.from_dict(all_readmes) 80 | 81 | try: 82 | create_branch("RealTimeData/github_latest", branch=start_date_str, repo_type="dataset", token=hf_token) 83 | except: 84 | traceback.print_exc() 85 | ds.push_to_hub("RealTimeData/github_latest", token=hf_token, branch='main') 86 | ds.push_to_hub("RealTimeData/github_latest", token=hf_token, branch=start_date_str) 87 | 88 | text = f""" 89 | # Latest GitHub Repositories 90 | 91 | You could always access the latest Github repos via this dataset. 92 | 93 | We update the dataset weekly, on every Sunday. So the dataset always provides the latest Github repos from the last week. 94 | 95 | The current dataset on main branch contains the latest Github Repos submitted from {start_date_str} to {end_date_str}. 96 | 97 | The data collection is conducted on {today.date().isoformat()}. 98 | 99 | Use the dataset via: 100 | ``` 101 | ds = datasets.load_dataset('RealTimeData/github_latest') 102 | ``` 103 | 104 | # Previsou versions 105 | 106 | You could access previous versions by requesting different branches. 107 | 108 | For example, you could find the 2023-08-06 version via: 109 | ``` 110 | ds = datasets.load_dataset('RealTimeData/github_latest', revision = '2023-08-06') 111 | ``` 112 | 113 | Check all available versions by clicking the "Files and versions" button on the top bar. 114 | """ 115 | card = RepoCard(text) 116 | card.push_to_hub('RealTimeData/github_latest', repo_type='dataset', token=hf_token) -------------------------------------------------------------------------------- /data/bbc_alltime.py: -------------------------------------------------------------------------------- 1 | import datasets 2 | import json 3 | 4 | dl = datasets.DownloadManager() 5 | configs_file = dl.download('https://huggingface.co/datasets/RealTimeData/bbc_alltime/raw/main/configs.txt') 6 | 7 | with open(configs_file, encoding="utf-8") as f: 8 | _TIMES = f.read().splitlines() 9 | 10 | _CITATION = """\ 11 | @misc{li2023estimating, 12 | title={Estimating Contamination via Perplexity: Quantifying Memorisation in Language Model Evaluation}, 13 | author={Yucheng Li}, 14 | year={2023}, 15 | eprint={2309.10677}, 16 | archivePrefix={arXiv}, 17 | primaryClass={cs.CL} 18 | } 19 | """ 20 | 21 | _DESCRIPTION = """\ 22 | This dataset contains BBC News articles for every month from 2017-1 to current. Access a specific month by using the format "YYYY-MM" as config. Such as load_dataset("RealTimeData/bbc_alltime", "2021-1"). 
23 | """ 24 | 25 | _HOMEPAGE = "https://github.com/liyucheng09/Contamination_Detector" 26 | 27 | class Bbc_alltimes(datasets.GeneratorBasedBuilder): 28 | 29 | BUILDER_CONFIGS = [ 30 | datasets.BuilderConfig( 31 | name=time, version=datasets.Version("1.0.0"), description=f"BBC News articles published in the priod of {time}" 32 | ) 33 | for time in _TIMES 34 | ] 35 | 36 | def _info(self): 37 | features = datasets.Features( 38 | { 39 | "title": datasets.Value("string"), 40 | "published_date": datasets.Value("string"), 41 | "authors": datasets.Value("string"), 42 | "description": datasets.Value("string"), 43 | "section": datasets.Value("string"), 44 | "content": datasets.Value("string"), 45 | "link": datasets.Value("string"), 46 | } 47 | ) 48 | return datasets.DatasetInfo( 49 | description=_DESCRIPTION, 50 | features=features, 51 | homepage=_HOMEPAGE, 52 | citation=_CITATION, 53 | ) 54 | 55 | def _split_generators(self, dl_manager): 56 | """Returns SplitGenerators.""" 57 | if self.config.name == "all": 58 | times = _TIMES[:-1] 59 | files = dl_manager.download([f"articles/{time}.json" for time in _TIMES ]) 60 | return [ 61 | datasets.SplitGenerator( 62 | name=datasets.Split.TRAIN, 63 | gen_kwargs={"files": files}, 64 | ) 65 | ] 66 | else: 67 | time = self.config.name 68 | _URL = f"articles/{time}.json" 69 | file = dl_manager.download(_URL) 70 | return [ 71 | datasets.SplitGenerator( 72 | name=datasets.Split.TRAIN, 73 | gen_kwargs={"files": file}, 74 | ) 75 | ] 76 | 77 | def _generate_examples(self, files): 78 | """Yields examples.""" 79 | if self.config.name == "all": 80 | assert isinstance(files, list) 81 | for file in files: 82 | time = file.strip('.json') 83 | with open(file, encoding="utf-8") as f: 84 | data = json.load(f) 85 | length = len(data['title']) 86 | for i in range(length): 87 | yield f'{time}-{i}', { 88 | "title": data['title'][i], 89 | "published_date": data['published_date'][i], 90 | "authors": data['authors'][i], 91 | "description": data['description'][i], 92 | "section": data['section'][i], 93 | "content": data['content'][i], 94 | "link": data['link'][i], 95 | } 96 | else: 97 | assert isinstance(files, str) 98 | time = self.config.name 99 | with open(files, encoding="utf-8") as f: 100 | data = json.load(f) 101 | length = len(data['title']) 102 | for i in range(length): 103 | yield f'{time}-{i}', { 104 | "title": data['title'][i], 105 | "published_date": data['published_date'][i], 106 | "authors": data['authors'][i], 107 | "description": data['description'][i], 108 | "section": data['section'][i], 109 | "content": data['content'][i], 110 | "link": data['link'][i], 111 | } -------------------------------------------------------------------------------- /data/maintain_wikitext_latest.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import traceback 3 | import mwparserfromhell 4 | import datetime 5 | import os 6 | 7 | import sys 8 | import json 9 | import time 10 | 11 | from tqdm import tqdm 12 | 13 | WIKI_API_ENDPOINT = "https://en.wikipedia.org/w/api.php" 14 | 15 | def parse_to_plain_text(wikitext): 16 | parsed = mwparserfromhell.parse(wikitext) 17 | return parsed.strip_code() 18 | 19 | def fetch_content(title, date=None): 20 | params = { 21 | "action": "query", 22 | "format": "json", 23 | "titles": title, 24 | "prop": "revisions", 25 | "rvprop": "content", 26 | "rvlimit": "1", 27 | } 28 | if date: params["rvstart"] = date 29 | try: 30 | response = requests.get(WIKI_API_ENDPOINT, params=params) 31 | 
response.raise_for_status() # Will raise an error if the HTTP request returned an unsuccessful status code 32 | data = response.json() 33 | if 'error' in data: 34 | print(f"Error fetching content for {title}: {data['error']['info']}") 35 | return None 36 | 37 | page = next(iter(data['query']['pages'].values())) 38 | if 'revisions' not in page: 39 | print(f"No revisions found for {title}") 40 | return None 41 | content = page['revisions'][0]['*'] 42 | 43 | # Check if the content is a redirect and skip if true 44 | if content.lower().startswith("#redirect"): 45 | print(f"{title} is a redirect page.") 46 | return None 47 | text = parse_to_plain_text(content) 48 | if len(text.split(' ')) < 300: 49 | print(f"{title} is less than 300 words.") 50 | return None 51 | 52 | return { 53 | "title": page['title'], 54 | "text": text, 55 | "pageid": page['pageid'], 56 | } 57 | 58 | except Exception as e: 59 | print(f"An error occurred while fetching content for {title}: {str(e)}") 60 | traceback.print_exc() # This will print the full traceback 61 | 62 | return None 63 | 64 | if __name__ == "__main__": 65 | today = datetime.date.today() 66 | year = today.year 67 | month = today.month 68 | 69 | hf_token = os.environ['HF_TOKEN'] 70 | 71 | start_time = datetime.datetime(year, month, 1) 72 | end_time = today 73 | 74 | print(f'Fetching wiki articles from {start_time.isoformat()} to {end_time.isoformat()}') 75 | 76 | with open('./data/squad_wiki_title.text') as f: 77 | titles = [line.strip() for line in f.readlines()] 78 | historical_contents = [fetch_content(title, end_time) for title in tqdm(titles)] 79 | historical_contents = [content for content in historical_contents if content is not None] 80 | historical_to_save = {content['title']: content for content in historical_contents} # key by the fetched title so skipped articles don't shift the title-content pairing 81 | 82 | save_file = f'{year}-{month}.json' 83 | with open(save_file, 'w') as f: 84 | json.dump(historical_to_save, f, ensure_ascii=False) 85 | print(f'Saved {len(historical_contents)} articles to {save_file}') 86 | 87 | from huggingface_hub import hf_hub_download, RepoCard, upload_file 88 | 89 | upload_file( 90 | path_or_fileobj = save_file, 91 | path_in_repo = f'wiki/{year}-{month}.json', 92 | repo_id = 'RealTimeData/wikitext_alltime', 93 | repo_type = 'dataset', 94 | token=hf_token, 95 | ) 96 | 97 | file = hf_hub_download(repo_id="RealTimeData/wikitext_alltime", filename="configs.txt", repo_type='dataset') 98 | with open(file) as f: 99 | times = f.read().splitlines() 100 | times.append(f'{year}-{month}') 101 | 102 | with open('configs.txt', 'w') as f: 103 | f.write('\n'.join(times)) 104 | 105 | upload_file( 106 | path_or_fileobj = 'configs.txt', 107 | path_in_repo = 'configs.txt', 108 | repo_id = 'RealTimeData/wikitext_alltime', 109 | repo_type = 'dataset', 110 | token=hf_token, 111 | ) 112 | 113 | text = f""" 114 | # Wikitext for All Times 115 | 116 | You could find 491 selected wiki articles for every month from 2017-1 to {year}-{month}. 117 | 118 | Use this to download wiki articles during a specific month: 119 | ``` 120 | ds = datasets.load_dataset('RealTimeData/wikitext_alltime', '2017-8') 121 | ``` 122 | 123 | The time stamp follows the format of "YYYY-MM".
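You could also list every available month programmatically (a small sketch, assuming a recent release of the `datasets` library):
```
import datasets
available_months = datasets.get_dataset_config_names('RealTimeData/wikitext_alltime')
```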
124 | 125 | # An example 126 | 127 | ``` 128 | > ds = datasets.load_dataset('RealTimeData/wikitext_alltime', '2023-10', split='train') 129 | > ds[0] 130 | 131 | {'title': 'Queen Victoria', 132 | 'pageid': 47923, 133 | 'text': 'Victoria (Alexa ...', 134 | 'time': '2023-10'} 135 | ``` 136 | """ 137 | card = RepoCard(text) 138 | card.push_to_hub('RealTimeData/wikitext_alltime', repo_type='dataset', token=hf_token) -------------------------------------------------------------------------------- /data/audio_dataset.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import yt_dlp 3 | from googleapiclient.discovery import build 4 | import os 5 | import sys 6 | import re 7 | from glob import glob 8 | import datasets 9 | import time 10 | import random 11 | import soundfile as sf 12 | import struct 13 | import numpy as np 14 | 15 | def get_popular_videos(youtube, start_date, end_date, max_results=30): 16 | published_after = start_date.strftime("%Y-%m-%dT%H:%M:%SZ") 17 | published_before = end_date.strftime("%Y-%m-%dT%H:%M:%SZ") 18 | request = youtube.search().list( 19 | part="snippet", 20 | maxResults=max_results, 21 | order="viewCount", 22 | publishedAfter=published_after, 23 | publishedBefore=published_before, 24 | type="video", 25 | ) 26 | response = request.execute() 27 | videos_ids = [item['id']['videoId'] for item in response['items']] 28 | return videos_ids 29 | 30 | def parse_duration(duration_string): 31 | hours = re.search(r'(\d+)H', duration_string) 32 | minutes = re.search(r'(\d+)M', duration_string) 33 | seconds = re.search(r'(\d+)S', duration_string) 34 | 35 | hours = int(hours.group(1)) if hours else 0 36 | minutes = int(minutes.group(1)) if minutes else 0 37 | seconds = int(seconds.group(1)) if seconds else 0 38 | 39 | return hours * 3600 + minutes * 60 + seconds 40 | 41 | def filter_too_long_video(youtube, video_ids, max_duration=600, max_results=30): 42 | request = youtube.videos().list( 43 | part="contentDetails", 44 | id=','.join(video_ids), 45 | ) 46 | response = request.execute() 47 | 48 | final_videos = [] 49 | for item in response['items']: 50 | duration = parse_duration(item['contentDetails']['duration']) 51 | if duration <= max_duration: 52 | final_videos.append(item['id']) 53 | if len(final_videos) >= max_results: 54 | break 55 | return final_videos 56 | 57 | def download_audio(video_id, save_path): 58 | ydl_opts = { 59 | 'format': 'bestaudio/best', 60 | 'postprocessors': [{ 61 | 'key': 'FFmpegExtractAudio', 62 | 'preferredcodec': 'flac', 63 | 'preferredquality': '192', 64 | }], 65 | 'postprocessor_args': [ 66 | '-ar', '16000' # Set audio sample rate to 16 kHz 67 | ], 68 | 'outtmpl': os.path.join(save_path, '%(id)s.%(ext)s'), 69 | } 70 | with yt_dlp.YoutubeDL(ydl_opts) as ydl: 71 | ydl.download([f'http://www.youtube.com/watch?v={video_id}']) 72 | 73 | if __name__ == '__main__': 74 | 75 | month, save_path, = sys.argv[1:] 76 | month = int(month) + 1 77 | 78 | videos_per_month = 3 79 | api_key = os.environ['YOUTUBE_API_KEY'] 80 | youtube = build('youtube', 'v3', developerKey=api_key) 81 | 82 | # time_stamps = [f'{year}-{month:02d}' for year in range(2017, 2024)] 83 | time_stamps = [f'{year}-{month:02d}' for year in range(2017, 2024) for month in range(1, 13)] 84 | 85 | for time_stamp in time_stamps: 86 | files = glob(os.path.join(save_path, time_stamp, '*.flac')) 87 | print(f"Start {time_stamp}...") 88 | 89 | if not len(files) >= videos_per_month: 90 | year, month = time_stamp.split('-') 91 | 92 | start_date = 
datetime.date(int(year), int(month), 1) 93 | end_of_month = datetime.date(int(year), int(month), 28) 94 | 95 | video_ids = get_popular_videos(youtube, start_date, end_of_month, max_results=50) 96 | video_ids = filter_too_long_video(youtube, video_ids, max_duration=600, max_results=videos_per_month) 97 | for video in video_ids: 98 | download_audio(video, os.path.join(save_path, time_stamp)) 99 | 100 | print(f"Downloaded {len(video_ids)} videos in {time_stamp}") 101 | 102 | files = glob(os.path.join(save_path, time_stamp, '*.flac')) 103 | random.shuffle(files) 104 | files = files[:videos_per_month] 105 | instances = [] 106 | for file in files: 107 | data, samplerate = sf.read(file) 108 | if len(data.shape) > 1: 109 | data = data.mean(axis=1) 110 | denormalized_data = np.int16(data * 32767) 111 | byte_stream = b''.join(struct.pack('') 8 | result.append(f'
{title}
') 9 | result.append(f'
{prompt}
') 10 | s = SequenceMatcher(None, str1, str2) 11 | 12 | for opcode, a0, a1, b0, b1 in s.get_opcodes(): 13 | if opcode == 'equal': 14 | result.append(f'{str1[a0:a1]}') 15 | else: 16 | result.append(str1[a0:a1]) 17 | result.append('') 18 | all_result.append(''.join(result)) 19 | 20 | return ''.join(all_result) 21 | 22 | # Test the function 23 | # prompts = [ 24 | # """Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in""", 25 | # """Persian (/ˈpɜːrʒən, -ʃən/), also known by its endonym Farsi (فارسی fārsi (fɒːɾˈsiː) ( listen)), is one of the Western Iranian languages within the Indo-Iranian branch of the Indo-European language family. It is primarily""", 26 | # """The number of Malayalam speakers in Lakshadweep is 51,100, which is only 0.15% of the total number, but is as much as about 84% of the population of Lakshadweep. In all, Malayalis made up 3.22% of the total Indian population in 2001. Of the total 33,066,""", 27 | # """The British have a love-hate relationship with the NHS. According to researchers at the King's Fund, the public gave the NHS its worst rating since records began 40 years ago. Just 29% said they were satisfied with the NHS in 2022. And yet we still love it. A whopping 90% of""", 28 | # ] 29 | # predictions = [ 30 | # """ the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".""", 31 | # """ spoken in Iran, Afghanistan (officially known as Dari since 1958),[3] and Tajikistan (officially known as Tajiki since the Soviet era),[4] and some other regions which historically were Persianate societies and considered part of Greater Iran. It is written right to left in the Persian alphabet, a modified variant of the Arabic script.""", 32 | # """ 392 Malayalam speakers in India in 2001, 33,015,420 spoke the standard dialects, 19,643 spoke the Yerava dialect and 31,329 spoke non-standard regional variations like Eranadan. As per the 1991 census data, 28.85% of all Malayalam speakers in India spoke a second language and 19.64% of the total knew three or more languages.""", 33 | # """ of Britons believe that the NHS is a crucial institution that should be preserved. The juxtaposition between dissatisfaction with the current state and overall reverence for the institution speaks volumes about the complex relationship the British public has with their healthcare system.""" 34 | 35 | # ] 36 | # references = [ 37 | # """ the late 1990s as lead singer of the R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Her debut solo album, Dangerously in Love (2003), debuted at number one on the US Billboard 200 chart and earned her five Grammy Awards, solidifying her as a solo artist as well. Throughout her career, Beyoncé has sold over 100 million records worldwide as a solo artist and a further 60 million records with Destiny's Child, making her one of the best-selling music artists of all time. 
She has won 23 Grammy Awards and is the most""", 38 | # """ spoken in Iran, Afghanistan (officially known as Dari since 1958), and Tajikistan (officially known as Tajiki since the Soviet era), and some other regions which historically were Persianate societies and considered part of Greater Iran. It is written in the Persian alphabet, a modified variant of the Arabic script, which itself evolved from the Aramaic alphabet.""", 39 | # """ 392 Malayalam speakers in India in 2001, 33,015,420 spoke the standard dialects, 19,643 spoke the Yerava dialect and 31,329 spoke non-standard regional variations like Eranadan. As per the 1991 census data, 28.85% of all Malayalam speakers in India spoke a second language and 19.64% of the total knew three or more languages.""", 40 | # """of the public agrees the service should be free and available to everyone. But with more than seven million people on waiting lists, almost everyone knows someone who isn't getting the care they need. As the NHS approaches its 75th anniversary, politicians are falling over themselves to praise the service.""" 41 | # ] 42 | 43 | import json 44 | with open('figs/kanye.json', 'r') as f: 45 | data = json.load(f) 46 | 47 | prompts = data['prompts'] 48 | predictions = data['predictions'] 49 | references = data['references'] 50 | 51 | # benchmarks = ['squad', 'boolq', 'quac', 'LatestEval'] 52 | benchmarks = ['memorised', 'clean'] 53 | html_output = compare_strings(predictions, references, benchmarks, prompts) 54 | 55 | with open('output.html', 'w') as f: 56 | f.write(f""" 57 | 58 | 59 | 85 | 86 | 87 |
88 | {html_output} 89 |
90 | 91 | 92 | """) 93 | 94 | print("HTML output saved to 'output.html'") 95 | -------------------------------------------------------------------------------- /data/github_dataset.py: -------------------------------------------------------------------------------- 1 | from git import Repo 2 | import datetime 3 | import os 4 | import sys 5 | import multiprocessing 6 | import difflib 7 | import json 8 | import itertools 9 | import shutil 10 | 11 | def clone_repo(repo_url, local_path, overwrite=False): 12 | if os.path.exists(local_path): 13 | if overwrite: 14 | shutil.rmtree(local_path) 15 | else: 16 | print(f"Repo {local_path} already exists") 17 | return 18 | Repo.clone_from(repo_url, local_path) 19 | 20 | def get_file_content(commit, file_path): 21 | # Retrieves the file content for a given commit 22 | blob = commit.tree / file_path 23 | return blob.data_stream.read().decode('utf-8', errors='ignore') 24 | 25 | def compare_files(repo, diff_item, start_commit, end_commit, code_extensions): 26 | file_path = diff_item.b_path 27 | _, ext = os.path.splitext(file_path) 28 | if ext not in code_extensions: 29 | # print(f"Skipping {file_path} because it is not a code file") 30 | return None 31 | 32 | # If the change type is added, we append it anyway 33 | # If the change type is modified, we only append it if the file was significantly changed (more than 50% of its lines were changed) 34 | if diff_item.change_type == 'M': 35 | a_content = get_file_content(start_commit, diff_item.a_path) 36 | b_content = get_file_content(end_commit, diff_item.b_path) 37 | 38 | # Use difflib to compare contents 39 | diff = difflib.unified_diff( 40 | a_content.splitlines(keepends=True), 41 | b_content.splitlines(keepends=True), 42 | fromfile=diff_item.a_path, 43 | tofile=diff_item.b_path 44 | ) 45 | 46 | # Count the number of lines added 47 | changes = sum(1 for line in diff if line.startswith('+') and not line.startswith('++')) 48 | 49 | # if the file was not significantly changed, skip it. 
we consider a file significantly changed if more than 50% of its lines were changed 50 | if changes < 0.5 * len(a_content.splitlines()): 51 | return None 52 | elif diff_item.change_type == 'A': 53 | b_content = get_file_content(end_commit, diff_item.b_path) 54 | changes = len(b_content.splitlines()) 55 | elif diff_item.change_type == 'R': 56 | # skip renamed files 57 | return None 58 | else: 59 | print(diff_item.change_type) 60 | return None 61 | 62 | return { 63 | 'file_path': file_path, 64 | 'num_changed_lines': changes, 65 | 'code': b_content, 66 | } 67 | 68 | def get_monthly_diff_file_objects(local_path, start_date, end_date, code_extensions): 69 | repo = Repo(local_path) 70 | repo_name = local_path.split('/')[-1] 71 | file_changes = [] 72 | 73 | try: 74 | start_commit = next(repo.iter_commits(until=start_date)) 75 | end_commit = next(repo.iter_commits(until=end_date)) 76 | except StopIteration: 77 | print(f"Repo {local_path} has no commits in {start_date} - {end_date}") 78 | return file_changes 79 | 80 | start_commit_date = datetime.datetime.fromtimestamp(start_commit.committed_date).date() 81 | end_commit_date = datetime.datetime.fromtimestamp(end_commit.committed_date).date() 82 | end_commit_date_str = end_commit_date.strftime("%Y-%m-%d") 83 | 84 | if start_commit_date > end_date or end_commit_date < start_date: 85 | # print(f"Repo {local_path} has no commits in the given time range") 86 | return file_changes 87 | 88 | diff_index = start_commit.diff(end_commit, **{'find_renames=50%': True, 'insert_kwargs_after': '-r'}) 89 | 90 | for diff_item in diff_index.iter_change_type('M'): 91 | result = compare_files(repo, diff_item, start_commit, end_commit, code_extensions) 92 | if result: 93 | result['repo_name'] = repo_name 94 | result['commit_date'] = end_commit_date_str 95 | result['sha'] = end_commit.hexsha 96 | file_changes.append(result) 97 | 98 | for diff_item in diff_index.iter_change_type('A'): 99 | result = compare_files(repo, diff_item, start_commit, end_commit, code_extensions) 100 | if result: 101 | result['repo_name'] = repo_name 102 | result['commit_date'] = end_commit_date_str 103 | result['sha'] = end_commit.hexsha 104 | file_changes.append(result) 105 | 106 | # Ranking files by the extent of added lines 107 | ranked_files = sorted(file_changes, key=lambda x: x['num_changed_lines'], reverse=True) 108 | # print(f"Total {len(ranked_files)} files changed") 109 | return ranked_files 110 | 111 | def main(time_stamp, local_repo, save_path): 112 | # try: 113 | year, month = time_stamp.split('-') 114 | first_day = datetime.date(int(year), int(month), 1) 115 | last_day = datetime.date(int(year), int(month), 28) 116 | 117 | repo_name = local_repo.split('/')[-1] 118 | # print(f"Processing {repo_name} at {time_stamp}") 119 | 120 | save_path = os.path.join(save_path, time_stamp, repo_name) 121 | if not os.path.exists(save_path): 122 | os.makedirs(save_path) 123 | 124 | ranked_files = get_monthly_diff_file_objects(local_repo, first_day, last_day, code_extensions) 125 | for index, file in enumerate(ranked_files[:50]): 126 | save_file_path = os.path.join(save_path, f"{index}.json") 127 | with open(save_file_path, 'w') as f: 128 | json.dump(file, f, ensure_ascii=False, indent=2) 129 | return (time_stamp, repo_name, len(ranked_files)) 130 | 131 | if __name__ == '__main__': 132 | repo_path, save_dir, = sys.argv[1:] 133 | 134 | time_stamps = [f'{year}-{month:02d}' for year in range(2017, 2024) for month in range(1, 13)] 135 | time_stamps += [f'2024-{month:02d}' for month in range(1, 3)] 136 | 
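# Note: the grid above covers 2017-01 through 2024-02. Each (month, repo) pair below is
# diffed independently; a month in which a repo has no commits simply yields an empty result.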
137 | # pre_defined repos 138 | with open('/user/HS502/yl02706/LatestEval/data/code_repos.txt', 'r') as f: 139 | repos = f.readlines() 140 | 141 | print(f"Total {len(repos)} repos") 142 | 143 | # Prepare URLs and local paths 144 | urls = [f'https://github.com/{repo.strip()}.git' for repo in repos] 145 | local_paths = [os.path.join(repo_path, repo.replace('/', '_')).strip() for repo in repos] 146 | 147 | # clone repos 148 | with multiprocessing.Pool(2) as pool: 149 | pool.starmap(clone_repo, zip(urls, local_paths)) 150 | 151 | code_extensions = {'.py', '.js', '.java', '.cpp', '.c', '.cs', '.go', '.rb', '.php', '.ts', '.jsx', '.tsx', '.css', '.sh', '.pl', '.bat'} 152 | 153 | combinations = list(itertools.product(time_stamps, local_paths)) 154 | # combinations = sorted(combinations, key=lambda x: x[-1]) 155 | flattened_args = [(time_stamp, local_path, save_dir) for time_stamp, local_path in combinations] 156 | 157 | print(f"Total {len(flattened_args)} combinations") 158 | with multiprocessing.Pool(8) as pool: 159 | ALL_PROCESSED = pool.starmap(main, flattened_args) 160 | 161 | # use single process instead 162 | # ALL_PROCESSED = [] 163 | # for args in flattened_args: 164 | # ALL_PROCESSED.append(main(*args)) 165 | 166 | print(f"Total {len(ALL_PROCESSED)} processed") 167 | -------------------------------------------------------------------------------- /data/monthly_updater/monthly_code.py: -------------------------------------------------------------------------------- 1 | from git import Repo 2 | import datetime 3 | import os 4 | import sys 5 | import multiprocessing 6 | import difflib 7 | import json 8 | import itertools 9 | import shutil 10 | from glob import glob 11 | import datasets 12 | 13 | def clone_repo(repo_url, local_path, overwrite=False, since=None): 14 | if os.path.exists(local_path): 15 | if overwrite: 16 | shutil.rmtree(local_path) 17 | else: 18 | print(f"Repo {local_path} already exists") 19 | return 20 | Repo.clone_from(repo_url, local_path, multi_options=[f'--shallow-since={since}'] if since is not None else None) 21 | 22 | def get_file_content(commit, file_path): 23 | # Retrieves the file content for a given commit 24 | blob = commit.tree / file_path 25 | return blob.data_stream.read().decode('utf-8', errors='ignore') 26 | 27 | def compare_files(repo, diff_item, start_commit, end_commit, code_extensions): 28 | file_path = diff_item.b_path 29 | _, ext = os.path.splitext(file_path) 30 | if ext not in code_extensions: 31 | # print(f"Skipping {file_path} because it is not a code file") 32 | return None 33 | 34 | # If the change type is added, we append it anyway 35 | # If the change type is modified, we only append it if the file was significantly changed (more than 50% of its lines were changed) 36 | if diff_item.change_type == 'M': 37 | a_content = get_file_content(start_commit, diff_item.a_path) 38 | b_content = get_file_content(end_commit, diff_item.b_path) 39 | 40 | # Use difflib to compare contents 41 | diff = difflib.unified_diff( 42 | a_content.splitlines(keepends=True), 43 | b_content.splitlines(keepends=True), 44 | fromfile=diff_item.a_path, 45 | tofile=diff_item.b_path 46 | ) 47 | 48 | # Count the number of lines added 49 | changes = sum(1 for line in diff if line.startswith('+') and not line.startswith('++')) 50 | 51 | # if the file was not significantly changed, skip it. 
we consider a file significantly changed if more than 50% of its lines were changed 52 | if changes < 0.5 * len(a_content.splitlines()): 53 | return None 54 | elif diff_item.change_type == 'A': 55 | b_content = get_file_content(end_commit, diff_item.b_path) 56 | changes = len(b_content.splitlines()) 57 | elif diff_item.change_type == 'R': 58 | # skip renamed files 59 | return None 60 | else: 61 | print(diff_item.change_type) 62 | return None 63 | 64 | return { 65 | 'file_path': file_path, 66 | 'num_changed_lines': changes, 67 | 'code': b_content, 68 | } 69 | 70 | def get_monthly_diff_file_objects(local_path, start_date, end_date, code_extensions): 71 | repo = Repo(local_path) 72 | repo_name = local_path.split('/')[-1] 73 | file_changes = [] 74 | 75 | try: 76 | start_commit = next(repo.iter_commits(since=start_date, reverse=True)) 77 | end_commit = next(repo.iter_commits(until=end_date)) 78 | except StopIteration: 79 | print(f"Repo {local_path} has no commits in {start_date} - {end_date}") 80 | return file_changes 81 | 82 | start_commit_date = datetime.datetime.fromtimestamp(start_commit.committed_date).date() 83 | end_commit_date = datetime.datetime.fromtimestamp(end_commit.committed_date).date() 84 | end_commit_date_str = end_commit_date.strftime("%Y-%m-%d") 85 | 86 | if start_commit_date > end_date or end_commit_date < start_date: 87 | # print(f"Repo {local_path} has no commits in the given time range") 88 | return file_changes 89 | 90 | diff_index = start_commit.diff(end_commit, **{'find_renames=50%': True, 'insert_kwargs_after': '-r'}) 91 | 92 | for diff_item in diff_index.iter_change_type('M'): 93 | result = compare_files(repo, diff_item, start_commit, end_commit, code_extensions) 94 | if result: 95 | result['repo_name'] = repo_name 96 | result['commit_date'] = end_commit_date_str 97 | result['sha'] = end_commit.hexsha 98 | file_changes.append(result) 99 | 100 | for diff_item in diff_index.iter_change_type('A'): 101 | result = compare_files(repo, diff_item, start_commit, end_commit, code_extensions) 102 | if result: 103 | result['repo_name'] = repo_name 104 | result['commit_date'] = end_commit_date_str 105 | result['sha'] = end_commit.hexsha 106 | file_changes.append(result) 107 | 108 | # Ranking files by the extent of added lines 109 | ranked_files = sorted(file_changes, key=lambda x: x['num_changed_lines'], reverse=True) 110 | # print(f"Total {len(ranked_files)} files changed") 111 | return ranked_files 112 | 113 | def main(time_stamp, local_repo, save_path): 114 | year, month = time_stamp.split('-') 115 | first_day = datetime.date(int(year), int(month), 1) 116 | last_day = datetime.date(int(year), int(month), 28) 117 | 118 | repo_name = local_repo.split('/')[-1] 119 | # print(f"Processing {repo_name} at {time_stamp}") 120 | 121 | save_path = os.path.join(save_path, time_stamp, repo_name) 122 | if not os.path.exists(save_path): 123 | os.makedirs(save_path) 124 | 125 | ranked_files = get_monthly_diff_file_objects(local_repo, first_day, last_day, code_extensions) 126 | for index, file in enumerate(ranked_files[:50]): 127 | save_file_path = os.path.join(save_path, f"{index}.json") 128 | with open(save_file_path, 'w') as f: 129 | json.dump(file, f, ensure_ascii=False, indent=2) 130 | return (time_stamp, repo_name, len(ranked_files)) 131 | # print(f"Saved to {save_path}") 132 | 133 | if __name__ == '__main__': 134 | today = datetime.date.today() 135 | year = today.year 136 | month = today.month 137 | 138 | time_stamp = f'{year}-{month:02d}' 139 | first_day_string = 
f'{year}-{month:02d}-01' 140 | 141 | repo_path, save_dir = 'repos/', 'code_data/' 142 | repo_list = 'data/code_repos.txt' 143 | 144 | # time_stamps = [f'{year}-{month:02d}' for year in range(2017, 2024) for month in range(1, 13)] 145 | time_stamps = [time_stamp] 146 | 147 | # pre_defined repos 148 | with open(repo_list, 'r') as f: 149 | repos = f.readlines() 150 | 151 | print(f"Total {len(repos)} repos") 152 | 153 | # Prepare URLs and local paths 154 | urls = [f'https://github.com/{repo.strip()}.git' for repo in repos] 155 | local_paths = [os.path.join(repo_path, repo.replace('/', '_')).strip() for repo in repos] 156 | 157 | # clone repos 158 | sucess_paths = [] 159 | for url, local_path in zip(urls, local_paths): 160 | try: 161 | clone_repo(url, local_path, overwrite=True, since=first_day_string) 162 | sucess_paths.append(local_path) 163 | except: 164 | print(f"Failed to clone {url}") 165 | 166 | code_extensions = {'.py', '.js', '.java', '.cpp', '.c', '.cs', '.go', '.rb', '.php', '.ts', '.jsx', '.tsx', '.css', '.sh', '.pl', '.bat'} 167 | 168 | combinations = list(itertools.product(time_stamps, sucess_paths)) 169 | # combinations = sorted(combinations, key=lambda x: x[-1]) 170 | flattened_args = [(time_stamp, local_path, save_dir) for time_stamp, local_path in combinations] 171 | 172 | print(f"Total {len(flattened_args)} combinations") 173 | with multiprocessing.Pool(2) as pool: 174 | ALL_PROCESSED = pool.starmap(main, flattened_args) 175 | 176 | print(f"Total {len(ALL_PROCESSED)} processed") 177 | 178 | hf_token = os.environ['HF_TOKEN'] 179 | code_files = glob(f'{save_dir}/{time_stamp}/*/*.json') 180 | all_codes = [] 181 | for code in code_files: 182 | with open(code, 'r') as f: 183 | all_codes.append(json.load(f)) 184 | ds = datasets.Dataset.from_list(all_codes) 185 | print('='*20) 186 | print(f'Finished {time_stamp}') 187 | ds.push_to_hub(f'RealTimeData/code_alltime', config_name = time_stamp, token=hf_token) 188 | print(f'Pushed {time_stamp} to hub') -------------------------------------------------------------------------------- /eval/contamination.py: -------------------------------------------------------------------------------- 1 | import datasets 2 | from nltk.tokenize import sent_tokenize, word_tokenize 3 | from nltk import ngrams 4 | import pandas as pd 5 | import nltk 6 | from nltk.stem import WordNetLemmatizer 7 | import os 8 | import time 9 | import openai 10 | from tqdm import tqdm 11 | from transformers import GPT2TokenizerFast 12 | 13 | T = GPT2TokenizerFast.from_pretrained("gpt2") 14 | prompt_length = 250 15 | suffix_length = 500 - prompt_length 16 | 17 | def data_sampler(): 18 | quac = datasets.load_dataset("quac", split="validation") 19 | boolq = datasets.load_dataset("boolq", split="validation") 20 | squad = datasets.load_dataset("squad_v2", split="validation") 21 | 22 | latesteval_1 = datasets.load_dataset("RealTimeData/bbc_news_week1_july_2023", split="train") 23 | latesteval_2 = datasets.load_dataset("RealTimeData/github_july_week1_2023", split="train") 24 | latesteval_3 = datasets.load_dataset("RealTimeData/arxiv_july_week1_2023", split="train") 25 | 26 | def get_prefix_and_suffix(doc, dataset_name = None): 27 | if dataset_name is None: 28 | raise ValueError("dataset_name must be specified") 29 | if dataset_name == "quac" or dataset_name == "squad_v2": 30 | text = T(doc['context']).input_ids 31 | if dataset_name == "quac": 32 | title = 'quac, ' + doc['wikipedia_page_title'] + ', ' + doc['section_title'] + ', ' 33 | elif dataset_name == "squad_v2": 34 | title = 
'squadv2, ' + 'wikipedia, ' + doc['title'] + ', ' 35 | # text = word_tokenize(doc['context']) 36 | elif dataset_name == "boolq": 37 | text = T(doc['passage']).input_ids 38 | title = 'boolq, wikipedia, ' 39 | # text = word_tokenize(doc['passage']) 40 | elif dataset_name == "latesteval_1": 41 | text = doc['content'].replace("\n", " ") 42 | text = T(text).input_ids 43 | # text = word_tokenize(doc['content']) 44 | title = 'bbc, ' 45 | if len(text) > 1000: 46 | prefix = T.decode(text[:prompt_length]) 47 | suffix = T.decode(text[suffix_length:]) 48 | else: 49 | suffix = T.decode(text[-suffix_length:]) 50 | prefix = T.decode(text[: -suffix_length]) 51 | # prefix = " ".join(prefix) 52 | # suffix = " ".join(suffix) 53 | prefix = title + prefix 54 | return pd.Series([prefix, suffix], index=['prefix', 'suffix']) 55 | 56 | # quac = quac.to_pandas().sample(n=10, random_state=42) 57 | # boolq = boolq.to_pandas().sample(n=100, random_state=42) 58 | # squad = squad.to_pandas().sample(n=100, random_state=42) 59 | # latesteval_1 = latesteval_1.to_pandas().sample(n=100, random_state=42) 60 | 61 | quac = quac.to_pandas().head(n=100) 62 | boolq = boolq.to_pandas().head(n=100) 63 | squad = squad.to_pandas().head(n=100) 64 | latesteval_1 = latesteval_1.to_pandas().head(n=30) 65 | 66 | quac = quac.apply(get_prefix_and_suffix, axis=1, dataset_name="quac") 67 | boolq = boolq.apply(get_prefix_and_suffix, axis=1, dataset_name="boolq") 68 | squad = squad.apply(get_prefix_and_suffix, axis=1, dataset_name="squad_v2") 69 | latesteval = latesteval_1.apply(get_prefix_and_suffix, axis=1, dataset_name="latesteval_1") 70 | 71 | return { 72 | "quac": quac, 73 | "boolq": boolq, 74 | "squad": squad, 75 | "latesteval": latesteval 76 | } 77 | 78 | def identify_contamination(reference_suffixes, continuations): 79 | 80 | def generate_word_ngrams(text, n, use_lemmatization=False): 81 | tokens = T(text.lower()).input_ids 82 | 83 | # Optionally, lemmatize words 84 | if use_lemmatization: 85 | lemmatizer = WordNetLemmatizer() 86 | words = [lemmatizer.lemmatize(word) for word in words] 87 | return list(ngrams(tokens, n)) 88 | 89 | results = [] 90 | for suffix, continuation in zip(reference_suffixes, continuations): 91 | suffix_ngrams = set(generate_word_ngrams(suffix, 9)) 92 | continuation_ngrams = set(generate_word_ngrams(continuation, 9)) 93 | 94 | intersection = suffix_ngrams.intersection(continuation_ngrams) 95 | 96 | if len(intersection) > 0: 97 | results.append((True, suffix, continuation, intersection)) 98 | 99 | return results 100 | 101 | def generate_continuation(model, prompts, reference_suffix, benchmark, batch_size=10): 102 | # three models at this moment: gpt-3, gpt-4, llama-2 103 | 104 | prompts = prompts.tolist() 105 | 106 | if model in ['gpt-4', 'davinci', 'curie', 'babbage']: 107 | generate = gpt 108 | else: 109 | generate = hf_generate 110 | 111 | continuations = [] 112 | output_file = f"eval/{model}_{benchmark}_{prompt_length}_continuation.txt" 113 | prompt_file = f"eval/{model}_{benchmark}_{prompt_length}_prompt.txt" 114 | reference_suffix_file = f"eval/{model}_{benchmark}_{prompt_length}_reference_suffix.txt" 115 | if os.path.exists(output_file): 116 | with open(output_file, "r") as f: 117 | continuations = f.readlines() 118 | return continuations 119 | else: 120 | with open(output_file, "w") as f, open(prompt_file, "w") as f2, open(reference_suffix_file, "w") as f3: 121 | for i in tqdm(range(0, len(prompts), batch_size)): 122 | prompt = prompts[i: i + batch_size] 123 | reference_suffix_batch = reference_suffix[i: i 
+ batch_size] 124 | continuation = generate(prompt, model=model) 125 | continuations.extend(continuation) 126 | f.write('\n'.join(continuation) + "\n") 127 | f2.write('\n'.join(prompt) + "\n") 128 | f3.write('\n'.join(reference_suffix_batch) + "\n") 129 | 130 | return continuations 131 | 132 | def hf_generate(model, prompt): 133 | pass 134 | 135 | def gpt(prompt, num_retry = 5, model = "gpt-3.5-turbo"): 136 | # generate answer by gpt-3.5-turbo 137 | openai_key = os.environ.get("OPENAI_API_KEY") 138 | for _ in range(num_retry): 139 | try: 140 | if model in ['davinci', 'curie', 'babbage']: 141 | r = openai.Completion.create( 142 | model=model, 143 | prompt=prompt, 144 | max_tokens=250, 145 | temperature=0, 146 | logit_bias={"198": -100}, 147 | logprobs=0, 148 | ) 149 | elif model in ['gpt-3.5-turbo', 'gpt-4']: 150 | r = openai.ChatCompletion.create( 151 | model = model, 152 | messages = [ 153 | {"role": "user", "content": prompt}, 154 | ], 155 | max_tokens=250, 156 | temperature = 0, 157 | logit_bias={"198": -100} 158 | ) 159 | break 160 | except Exception as e: 161 | print(e) 162 | time.sleep(1) 163 | 164 | if model in ['davinci', 'curie', 'babbage']: 165 | return [x['text'].replace('\n', ' ') for x in r['choices']] 166 | elif model in ['gpt-3.5-turbo', 'gpt-4']: 167 | return [x['message']['content'] for x in r['choices']] 168 | 169 | if __name__ == "__main__": 170 | samples = data_sampler() 171 | 172 | quac = samples['quac'] 173 | boolq = samples['boolq'] 174 | squad = samples['squad'] 175 | latesteval = samples['latesteval'] 176 | 177 | model = 'curie' 178 | 179 | quac_continuations = generate_continuation(model, quac['prefix'], quac['suffix'], "quac") 180 | quac_results = identify_contamination(quac['suffix'], quac_continuations) 181 | 182 | print(f"-- quac: {len(quac_results)}, -- {len(quac_results) / len(quac)}") 183 | 184 | boolq_continuations = generate_continuation(model, boolq['prefix'], boolq['suffix'], "boolq") 185 | boolq_results = identify_contamination(boolq['suffix'], boolq_continuations) 186 | 187 | print(f"-- boolq: {len(boolq_results)}, -- {len(boolq_results) / len(boolq)}") 188 | 189 | squad_continuations = generate_continuation(model, squad['prefix'], squad['suffix'], "squad") 190 | squad_results = identify_contamination(squad['suffix'], squad_continuations) 191 | 192 | print(f"-- squad: {len(squad_results)}, -- {len(squad_results) / len(squad)}") 193 | 194 | latesteval_continuations = generate_continuation(model, latesteval['prefix'], latesteval['suffix'], "latesteval") 195 | latesteval_results = identify_contamination(latesteval['suffix'], latesteval_continuations) 196 | 197 | print(f"-- latesteval: {len(latesteval_results)}, -- {len(latesteval_results) / len(latesteval)}") -------------------------------------------------------------------------------- /data/reddit_crawler.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import praw 4 | import sys 5 | from typing import List 6 | import json 7 | import os 8 | import time 9 | import datetime 10 | import traceback 11 | 12 | class Forum: 13 | def __init__(self, task_name, start_url, wait_time): 14 | self.task_name = task_name 15 | self.url = start_url 16 | self.wait_time = wait_time 17 | 18 | self.session = requests.Session() 19 | self.setup_session() 20 | 21 | self.posts = None 22 | 23 | def setup_session(self): 24 | """ 25 | _summary_: Setup session 26 | """ 27 | headers = { 28 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; 
Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36' 29 | } 30 | self.session.headers.update(headers) 31 | 32 | def get_forum_pages(self): 33 | """ 34 | _summary_: Get all pages links of the forum 35 | """ 36 | return NotImplementedError() 37 | 38 | def get_forum_content(self, page_url): 39 | """ 40 | _summary_: Get all content of the forum 41 | should have: 42 | - title 43 | - main content 44 | - comments 45 | - votes of comments 46 | """ 47 | return NotImplementedError() 48 | 49 | def obtain_content(self): 50 | """ 51 | _summary_: Obtain content from each post 52 | """ 53 | list_of_content = [] 54 | for post in self.posts: 55 | list_of_content.append(self.get_forum_content(post)) 56 | 57 | self.content = list_of_content 58 | 59 | def save_content(self): 60 | """ 61 | _summary_: Save list_of_content to a file 62 | """ 63 | return NotImplementedError() 64 | 65 | @classmethod 66 | def filter_func(cls, tag, prefix): 67 | if tag.has_attr('class'): 68 | class_str = ' '.join(tag['class']) 69 | # return class_str.startswith('node node--id') 70 | return class_str.startswith(prefix) 71 | return False 72 | 73 | class MentalHealth(Forum): 74 | def __init__(self, task_name, start_url, wait_time): 75 | super().__init__(task_name, start_url, wait_time) 76 | 77 | def get_forum_pages(self): 78 | # get all sub forums 79 | 80 | sub_forums = [] 81 | response = self.session.get(self.url) 82 | soup = BeautifulSoup(response.text, 'html.parser') 83 | for link in soup.find_all(lambda tag: self.filter_func(tag, 'node node--id')): 84 | sub_forums.append(link.find('a')['href']) 85 | 86 | # get all posts from each forum 87 | posts = [] 88 | for sub_forum in sub_forums: 89 | response = self.session.get(sub_forum) 90 | soup = BeautifulSoup(response.text, 'html.parser') 91 | for link in soup.find_all(lambda tag: self.filter_func(tag, 'structItem structItem--thread')): 92 | posts.append(link['href']) 93 | 94 | self.posts = posts 95 | 96 | def get_forum_content(self, page_url): 97 | response = self.session.get(page_url) 98 | soup = BeautifulSoup(response.text, 'html.parser') 99 | title = soup.find('h1', {'class': 'p-title-value'}).text 100 | list_of_content = [] 101 | for article in soup.find_all(lambda tag: self.filter_func(tag, 'message message--post ')): 102 | author = article['data-author'] 103 | content = article.find('div', {'class': 'bbWrapper'}) 104 | if content: 105 | mentioned = content.find_all('a', {'class': 'username'}) 106 | content = content.text 107 | else: 108 | continue 109 | 110 | footer = article.find('ul', {'class': 'sv-rating-bar__ratings'}) 111 | if footer: 112 | ratings = footer.find_all('li', {'class': 'sv-rating sv-rating--empty-list'}) 113 | rating_sum = sum([ int(rate.text) for rate in rating]) 114 | else: 115 | rating_sum = 0 116 | 117 | list_of_content.append({ 118 | 'author': author, 119 | 'content': content, 120 | 'mentioned': mentioned, 121 | 'rating': rating_sum 122 | }) 123 | 124 | next_page = soup.find('a', {'class': 'pageNav-jump pageNav-jump--next'}) 125 | if next_page: 126 | next_page = next_page['href'] 127 | list_of_content += self.get_forum_content(next_page) 128 | 129 | return list_of_content 130 | 131 | 132 | class Reddit: 133 | def __init__(self, subreddits, time_filter, num_posts, save_path, time_limit = None): 134 | self.subreddits = subreddits 135 | self.time_filter = time_filter 136 | self.num_posts = num_posts 137 | self.save_path = save_path 138 | self.time_limit = time_limit 139 | 140 | self.reddit = praw.Reddit('DataCollector') 141 | 
self.posts = self.get_reddit_posts() 142 | # self.dump_posts() 143 | 144 | def created_after_time_limit(self, created_utc): 145 | if self.time_limit is None: 146 | return True 147 | dt_object = datetime.datetime.fromtimestamp(created_utc) 148 | return dt_object >= self.time_limit 149 | 150 | def get_reddit_posts(self): 151 | # all_posts = {} 152 | for subreddit in self.subreddits: 153 | subreddit_ = subreddit 154 | subreddit = self.reddit.subreddit(subreddit) 155 | list_of_posts = [] 156 | for post in subreddit.top(time_filter = self.time_filter, limit=self.num_posts): 157 | created_time = post.created_utc 158 | if not self.created_after_time_limit(created_time): continue 159 | created_time_str = datetime.datetime.fromtimestamp(created_time).strftime('%Y-%m-%d %H:%M:%S') 160 | title = post.title 161 | content = post.selftext 162 | 163 | for i in range(3): 164 | try: 165 | comments = self.deal_with_comments(post.comments.list()) 166 | except praw.exceptions.APIException as e: 167 | traceback.print_exc() 168 | time.sleep(10) 169 | else: 170 | break 171 | 172 | score = post.score 173 | the_post = { 174 | 'title': title, 175 | 'content': content, 176 | 'comments': comments, 177 | 'created_time': created_time_str, 178 | 'score': score, 179 | 'subreddit': subreddit_ 180 | } 181 | list_of_posts.append(the_post) 182 | self.dump_posts(list_of_posts, subreddit_) 183 | # all_posts[subreddit_] = list_of_posts 184 | # return all_posts 185 | 186 | def deal_with_comments(self, comments, depth = 3): 187 | results = [] 188 | if depth < 0: return results 189 | depth -= 1 190 | for comment in comments: 191 | if isinstance(comment, praw.models.MoreComments): continue 192 | author = comment.author 193 | content = comment.body 194 | score = comment.score 195 | created_time = comment.created_utc 196 | created_time_str = datetime.datetime.fromtimestamp(created_time).strftime('%Y-%m-%d %H:%M:%S') 197 | replies = comment.replies 198 | if len(replies): 199 | replies = self.deal_with_comments(replies, depth=depth) 200 | else: replies = [] 201 | the_comment = { 202 | 'author': author.name if author is not None else '', 203 | 'content': content, 204 | 'score': score, 205 | 'created_time': created_time_str, 206 | 'replies': replies 207 | } 208 | results.append(the_comment) 209 | return results 210 | 211 | def dump_posts(self, list_of_posts, subreddit = None): 212 | path = os.path.join(self.save_path, f"{subreddit if subreddit is not None else 'all'}.json") 213 | with open(path, 'w') as f: 214 | json.dump(list_of_posts, f) 215 | 216 | if __name__ == '__main__': 217 | # should define the XDG_CONFIG_HOME to the config file 218 | cwd, = sys.argv[1:] 219 | data_collectors = Reddit(['investing', 'wallstreetbets', 'CryptoCurrency', 'politics', 'healthcare'], 'month', 100, cwd, time_limit=datetime.datetime(2023, 7, 1)) -------------------------------------------------------------------------------- /data/squad_wiki_title.text: -------------------------------------------------------------------------------- 1 | Queen_Victoria 2 | Grape 3 | Athanasius_of_Alexandria 4 | Lighting 5 | BBC_Television 6 | Federal_Bureau_of_Investigation 7 | Punjab,_Pakistan 8 | Capacitor 9 | Sino-Tibetan_relations_during_the_Ming_dynasty 10 | History_of_India 11 | Plymouth 12 | Space_Race 13 | Myocardial_infarction 14 | The_Times 15 | Franco-Prussian_War 16 | Literature 17 | War_on_Terror 18 | Aircraft_carrier 19 | Turner_Classic_Movies 20 | Royal_assent 21 | Muslim_world 22 | Sahara 23 | Galicia_(Spain) 24 | YouTube 25 | Santa_Monica,_California 26 | 
Imperial_College_London 27 | Textual_criticism 28 | Sichuan 29 | Institute_of_technology 30 | Railway_electrification_system 31 | Mesozoic 32 | Cyprus 33 | The_Sun_(United_Kingdom) 34 | Order_of_the_British_Empire 35 | Republic_of_the_Congo 36 | Materialism 37 | Qing_dynasty 38 | To_Kill_a_Mockingbird 39 | Greece 40 | 2008_Sichuan_earthquake 41 | Edmund_Burke 42 | Northwestern_University 43 | CBC_Television 44 | Germans 45 | Race_and_ethnicity_in_the_United_States_Census 46 | Iranian_languages 47 | Adolescence 48 | Armenia 49 | Intellectual_property 50 | Law_of_the_United_States 51 | Hanover 52 | Tuberculosis 53 | Dialect 54 | Josip_Broz_Tito 55 | Political_philosophy 56 | Bern 57 | Pitch_(music) 58 | Pope_John_XXIII 59 | Black_people 60 | List_of_numbered_streets_in_Manhattan 61 | Montevideo 62 | Nigeria 63 | Paper 64 | Swaziland 65 | Liberal_Party_of_Australia 66 | Seven_Years%27_War 67 | Zinc 68 | Treaty 69 | Hellenistic_period 70 | London 71 | European_Central_Bank 72 | Thuringia 73 | Circadian_rhythm 74 | Estonian_language 75 | Cork_(city) 76 | Westminster_Abbey 77 | Data_compression 78 | United_States_Air_Force 79 | Separation_of_powers_under_the_United_States_Constitution 80 | On_the_Origin_of_Species 81 | Nanjing 82 | Zhejiang 83 | Late_Middle_Ages 84 | PlayStation_3 85 | Neptune 86 | Carnival 87 | Hindu_philosophy 88 | Dell 89 | Everton_F.C. 90 | Armenians 91 | Samurai 92 | Federal_Aviation_Administration 93 | Spanish_language_in_the_United_States 94 | Alps 95 | Digimon 96 | Compact_disc 97 | God 98 | Botany 99 | Heresy 100 | The_Bronx 101 | Roman_Republic 102 | Wayback_Machine 103 | Airport 104 | Red 105 | Internet_service_provider 106 | Chicago_Cubs 107 | Detroit 108 | Culture 109 | New_York_City 110 | Marshall_Islands 111 | Hyderabad 112 | Pharmaceutical_industry 113 | Saint_Helena 114 | Oklahoma_City 115 | Bras%C3%ADlia 116 | Korean_War 117 | Biodiversity 118 | Brigham_Young_University 119 | Oklahoma 120 | Eton_College 121 | Alfred_North_Whitehead 122 | Russian_language 123 | A_cappella 124 | Richmond,_Virginia 125 | Genocide 126 | Great_Plains 127 | British_Empire 128 | Emotion 129 | Comics 130 | Napoleon 131 | MP3 132 | England_national_football_team 133 | Green 134 | Palermo 135 | Freemasonry 136 | Letter_case 137 | Communications_in_Somalia 138 | Exhibition_game 139 | Hard_rock 140 | Somalis 141 | University 142 | Pacific_War 143 | San_Diego 144 | British_Isles 145 | Mosaic 146 | Pesticide 147 | Bill_%26_Melinda_Gates_Foundation 148 | University_of_Notre_Dame 149 | Hunter-gatherer 150 | Hokkien 151 | Economy_of_Greece 152 | Windows_8 153 | Universal_Studios 154 | Nintendo_Entertainment_System 155 | St._John%27s,_Newfoundland_and_Labrador 156 | Immaculate_Conception 157 | Southeast_Asia 158 | Rajasthan 159 | Mammal 160 | Communication 161 | Greeks 162 | Chihuahua_(state) 163 | Database 164 | Orthodox_Judaism 165 | Ashkenazi_Jews 166 | Immunology 167 | Flowering_plant 168 | Capital_punishment_in_the_United_States 169 | Switzerland 170 | Christian 171 | Beyoncé 172 | Tristan_da_Cunha 173 | Diarrhea 174 | Architecture 175 | East_India_Company 176 | Aspirated_consonant 177 | Valencia 178 | Gene 179 | Crucifixion_of_Jesus 180 | Financial_crisis_of_2007%E2%80%9308 181 | Asthma 182 | Central_African_Republic 183 | Predation 184 | Computer_security 185 | Protestantism 186 | Russian_Soviet_Federative_Socialist_Republic 187 | Israel 188 | Neoclassical_architecture 189 | Elevator 190 | Frédéric_Chopin 191 | Group_(mathematics) 192 | Glacier 193 | Gamal_Abdel_Nasser 194 | 
Incandescent_light_bulb 195 | Old_English 196 | Antenna_(radio) 197 | States_of_Germany 198 | IBM 199 | Virgil 200 | Montana 201 | Pain 202 | Mexico_City 203 | Infection 204 | Slavs 205 | Friedrich_Hayek 206 | Multiracial_American 207 | Alaska 208 | Buddhism 209 | Kathmandu 210 | Yale_University 211 | Guinea-Bissau 212 | Anti-aircraft_warfare 213 | Solar_energy 214 | Affirmative_action_in_the_United_States 215 | 2008_Summer_Olympics_torch_relay 216 | Human_Development_Index 217 | Guam 218 | Party_leaders_of_the_United_States_House_of_Representatives 219 | FC_Barcelona 220 | Professional_wrestling 221 | Strasbourg 222 | Richard_Feynman 223 | Wood 224 | Royal_Institute_of_British_Architects 225 | Myanmar 226 | Paris 227 | Southampton 228 | Georgian_architecture 229 | Royal_Dutch_Shell 230 | Madrasa 231 | Department_store 232 | Adult_contemporary_music 233 | Quran 234 | Near_East 235 | Dutch_Republic 236 | George_VI 237 | Imamah_(Shia_doctrine) 238 | History_of_science 239 | Arena_Football_League 240 | Crimean_War 241 | Appalachian_Mountains 242 | Canadian_football 243 | Association_football 244 | Infrared 245 | Dutch_language 246 | Eritrea 247 | Saint_Barth%C3%A9lemy 248 | Catalan_language 249 | Samoa 250 | Sexual_orientation 251 | Atlantic_City,_New_Jersey 252 | Classical_music 253 | Dominican_Order 254 | Warsaw_Pact 255 | Antarctica 256 | Lancashire 257 | American_Idol 258 | John_von_Neumann 259 | Copper 260 | Southern_Europe 261 | BeiDou_Navigation_Satellite_System 262 | Ottoman_Empire 263 | General_Electric 264 | Heian_period 265 | Humanism 266 | Digestion 267 | Unicode 268 | Computer 269 | United_States_dollar 270 | Madonna_(entertainer) 271 | FA_Cup 272 | East_Prussia 273 | Religion_in_ancient_Rome 274 | Bermuda 275 | Supreme_court 276 | Washington_University_in_St._Louis 277 | Xbox_360 278 | Cotton 279 | Melbourne 280 | North_Carolina 281 | Tibet 282 | Super_Nintendo_Entertainment_System 283 | Boston 284 | Pope_Paul_VI 285 | Idealism 286 | Education 287 | Baptists 288 | Tajikistan 289 | Tucson,_Arizona 290 | Namibia 291 | Dwight_D._Eisenhower 292 | Rule_of_law 293 | Jews 294 | Norfolk_Island 295 | Police 296 | Chinese_characters 297 | Annelid 298 | Hunting 299 | Software_testing 300 | LaserDisc 301 | Indigenous_peoples_of_the_Americas 302 | Portugal 303 | Cubism 304 | Bird 305 | Uranium 306 | Raleigh,_North_Carolina 307 | Alexander_Graham_Bell 308 | Nutrition 309 | Neolithic 310 | Asphalt 311 | Cardinal_(Catholicism) 312 | Houston 313 | Mary_(mother_of_Jesus) 314 | United_States_presidential_election,_2004 315 | Prime_minister 316 | Genome 317 | Utrecht 318 | Charleston,_South_Carolina 319 | Kievan_Rus%27 320 | Premier_League 321 | Presbyterianism 322 | Insect 323 | John_Kerry 324 | Karl_Popper 325 | Comprehensive_school 326 | Philadelphia 327 | Seattle 328 | Glass 329 | Sanskrit 330 | Iran 331 | Labour_Party_(UK) 332 | Separation_of_church_and_state_in_the_United_States 333 | Nonprofit_organization 334 | Philosophy_of_space_and_time 335 | Pub 336 | National_Archives_and_Records_Administration 337 | Middle_Ages 338 | Szlachta 339 | House_music 340 | Gramophone_record 341 | Czech_language 342 | Vacuum 343 | Central_Intelligence_Agency 344 | Film_speed 345 | Himachal_Pradesh 346 | Phonology 347 | Canadian_Armed_Forces 348 | Muammar_Gaddafi 349 | Dissolution_of_the_Soviet_Union 350 | High-definition_television 351 | Alloy 352 | Arsenal_F.C. 
353 | New_Delhi 354 | Translation 355 | USB 356 | Transistor 357 | Tuvalu 358 | Somerset 359 | Renewable_energy_commercialization 360 | Videoconferencing 361 | Political_party 362 | Gregorian_calendar 363 | Serbo-Croatian 364 | United_Nations_Population_Fund 365 | Brain 366 | ASCII 367 | Ministry_of_Defence_(United_Kingdom) 368 | Mandolin 369 | Antibiotics 370 | Great_power 371 | Beer 372 | Spectre_(2015_film) 373 | Apollo 374 | Energy 375 | Avicenna 376 | Gothic_architecture 377 | Steven_Spielberg 378 | Animal 379 | Geological_history_of_Earth 380 | Miami 381 | University_of_Kansas 382 | Daylight_saving_time 383 | Identity_(social_science) 384 | Canon_law 385 | Sumer 386 | Modern_history 387 | Planck_constant 388 | Child_labour 389 | Buckingham_Palace 390 | Sony_Music_Entertainment 391 | Age_of_Enlightenment 392 | Tennessee 393 | Electric_motor 394 | Marvel_Comics 395 | Federalism 396 | Mali 397 | Geography_of_the_United_States 398 | The_Legend_of_Zelda:_Twilight_Princess 399 | Kanye_West 400 | Molotov%E2%80%93Ribbentrop_Pact 401 | Umayyad_Caliphate 402 | Estonia 403 | Race_(human_categorization) 404 | New_Haven,_Connecticut 405 | Endangered_Species_Act 406 | Symbiosis 407 | Military_history_of_the_United_States 408 | Dog 409 | Printed_circuit_board 410 | Empiricism 411 | The_Blitz 412 | Han_dynasty 413 | Light-emitting_diode 414 | Alsace 415 | United_States_Army 416 | Macintosh 417 | Clothing 418 | Comcast 419 | Elizabeth_II 420 | Liberia 421 | Jehovah%27s_Witnesses 422 | 51st_state 423 | IPod 424 | Bacteria 425 | Matter 426 | Poultry 427 | Gymnastics 428 | John,_King_of_England 429 | Time 430 | Arnold_Schwarzenegger 431 | Queen_(band) 432 | Memory 433 | Florida 434 | Political_corruption 435 | Web_browser 436 | Hydrogen 437 | Ann_Arbor,_Michigan 438 | Bird_migration 439 | Post-punk 440 | Anthropology 441 | Copyright_infringement 442 | Egypt -------------------------------------------------------------------------------- /data/arxiv_dataset.py: -------------------------------------------------------------------------------- 1 | import arxiv 2 | import datetime 3 | from queue import Queue 4 | from threading import Thread, Lock 5 | import os 6 | import logging 7 | import time 8 | import tarfile 9 | from pylatexenc.latexwalker import LatexWalker, LatexEnvironmentNode, LatexMacroNode 10 | from pylatexenc import latex2text 11 | from pylatexenc.macrospec import LatexContextDb 12 | import shutil 13 | import re 14 | import json 15 | from glob import glob 16 | from huggingface_hub import create_branch, create_tag, RepoCard 17 | import datasets 18 | import sys 19 | 20 | def filter_element(context, exclude_elements = []): 21 | 22 | new_context = LatexContextDb() 23 | 24 | new_context.unknown_macro_spec = context.unknown_macro_spec 25 | new_context.unknown_environment_spec = context.unknown_environment_spec 26 | new_context.unknown_specials_spec = context.unknown_specials_spec 27 | 28 | filter_element_func = lambda dict_to_filter: {k:v for k,v in dict_to_filter.items() if k not in exclude_elements}.values() 29 | for cat in context.category_list: 30 | 31 | # include this category 32 | new_context.add_context_category( 33 | cat, 34 | macros=filter_element_func(context.d[cat]['macros']), 35 | environments=filter_element_func(context.d[cat]['environments']), 36 | specials=filter_element_func(context.d[cat]['specials']), 37 | ) 38 | 39 | return new_context 40 | 41 | class TextExtractor: 42 | 43 | def __init__(self): 44 | self.l2t_context_db = latex2text.get_default_latex_context_db() 45 | 
self.l2t_context_db = filter_element(self.l2t_context_db, ['href']) 46 | 47 | self.l2t = latex2text.LatexNodes2Text(latex_context=self.l2t_context_db) 48 | 49 | def extract(self, latex_code): 50 | result = parse_tex_ignore_figures(latex_code) 51 | return self.l2t.nodelist_to_text(result) 52 | 53 | def remove_figure_nodes(node_list): 54 | filtered_node_list = [] 55 | for node in node_list: 56 | # Ignore the 'figure' environment 57 | if node.isNodeType(LatexEnvironmentNode): 58 | if node.environmentname in [ 'figure', 'figure*', 'algorithm', 'table', 'table*', 'algorithmic']: 59 | continue 60 | if hasattr(node, 'nodelist'): 61 | node.nodelist = remove_figure_nodes(node.nodelist) 62 | filtered_node_list.append(node) 63 | return filtered_node_list 64 | 65 | def parse_tex_ignore_figures(tex_code): 66 | walker = LatexWalker(tex_code) 67 | parsed = walker.get_latex_nodes()[0] 68 | 69 | for node in parsed: 70 | if node.isNodeType(LatexEnvironmentNode): 71 | if node.environmentname == 'document': 72 | parsed = [node] 73 | break 74 | 75 | filtered_nodes = remove_figure_nodes(parsed) 76 | return filtered_nodes 77 | 78 | def resolve_input_commands(latex_code, base_dir="."): 79 | input_pattern = re.compile(r"(? 1: 159 | if 'main.tex' in tex_files: tex_files = ['main.tex'] 160 | else: 161 | self.logger.info(f'------ Found multiple tex files: {tex_files}') 162 | return 163 | elif len(tex_files) == 0: 164 | self.logger.info(f'------ Found no tex files') 165 | return 166 | tex_file = tex_files[0] 167 | with open(f'./{paper_id}/{tex_file}', 'r', encoding='utf-8', errors='ignore') as f: 168 | latex_code = f.read() 169 | if '\\input' in latex_code: 170 | latex_code = resolve_input_commands(latex_code, base_dir=f'./{paper_id}') 171 | text = self.text_extractor.extract(latex_code) 172 | 173 | meta_data['text'] = text 174 | with open(f'{self.text_save_dir}/{paper_id}.json', 'w') as f: 175 | json.dump(meta_data, f, ensure_ascii=False) 176 | 177 | self.logger.info(f'------ Saved {paper_id}.json') 178 | 179 | except Exception as e: 180 | self.logger.error(f'ERROR: {e}') 181 | time.sleep(3) 182 | return 183 | 184 | finally: 185 | shutil.rmtree(f'./{paper_id}') 186 | os.remove(f'{paper_id}.arxiv_source') 187 | 188 | 189 | if __name__ == '__main__': 190 | hf_token = os.environ['HF_TOKEN'] 191 | year, month, save_dir, = sys.argv[1:] 192 | month = int(month) % 12 + 1 193 | 194 | if f'{year}-{month:02d}' in ['2021-01', '2021-02', '2021-03']: 195 | print(f"Skip {year}-{month:02d}") 196 | exit() 197 | 198 | time_stamp = f'{year}-{month:02d}' 199 | 200 | first_day = datetime.date(int(year), int(month), 1) 201 | last_day = datetime.date(int(year), int(month), 28) 202 | 203 | start_time_str = first_day.strftime("%Y%m%d%H%M%S") 204 | end_time_str = last_day.strftime("%Y%m%d%H%M%S") 205 | 206 | text_save_dir = os.path.join(save_dir, time_stamp) 207 | if not os.path.exists(text_save_dir): 208 | os.makedirs(text_save_dir) 209 | 210 | search = arxiv.Search( 211 | query=f'submittedDate:[{start_time_str} TO {end_time_str}]', 212 | sort_by = arxiv.SortCriterion.SubmittedDate, 213 | sort_order=arxiv.SortOrder.Descending, 214 | max_results=800 215 | ) 216 | 217 | q = Queue() 218 | num_threads = 4 219 | 220 | for i in range(num_threads): 221 | worker = Worker(q, i, text_save_dir,) 222 | worker.daemon = True 223 | worker.start() 224 | 225 | for index, result in enumerate(search.results()): 226 | q.put((index, result)) 227 | 228 | q.join() 229 | 230 | print(f"Finished {time_stamp}") 231 | 232 | # files = glob(f'{text_save_dir}/*.json') 
233 | # ds = datasets.load_dataset('json', data_files=files, split='train') 234 | 235 | # ds.push_to_hub( 236 | # "RealTimeData/arxiv_alltime", 237 | # config_name=time_stamp, 238 | # token=hf_token, 239 | # ) -------------------------------------------------------------------------------- /data/monthly_updater/monthly_arxiv.py: -------------------------------------------------------------------------------- 1 | import arxiv 2 | import datetime 3 | from queue import Queue 4 | from threading import Thread, Lock 5 | import os 6 | import logging 7 | import time 8 | import tarfile 9 | from pylatexenc.latexwalker import LatexWalker, LatexEnvironmentNode, LatexMacroNode 10 | from pylatexenc import latex2text 11 | from pylatexenc.macrospec import LatexContextDb 12 | import shutil 13 | import re 14 | import json 15 | from glob import glob 16 | from huggingface_hub import create_branch, create_tag, RepoCard 17 | import datasets 18 | import sys 19 | 20 | def filter_element(context, exclude_elements = []): 21 | 22 | new_context = LatexContextDb() 23 | 24 | new_context.unknown_macro_spec = context.unknown_macro_spec 25 | new_context.unknown_environment_spec = context.unknown_environment_spec 26 | new_context.unknown_specials_spec = context.unknown_specials_spec 27 | 28 | filter_element_func = lambda dict_to_filter: {k:v for k,v in dict_to_filter.items() if k not in exclude_elements}.values() 29 | for cat in context.category_list: 30 | 31 | # include this category 32 | new_context.add_context_category( 33 | cat, 34 | macros=filter_element_func(context.d[cat]['macros']), 35 | environments=filter_element_func(context.d[cat]['environments']), 36 | specials=filter_element_func(context.d[cat]['specials']), 37 | ) 38 | 39 | return new_context 40 | 41 | class TextExtractor: 42 | 43 | def __init__(self): 44 | self.l2t_context_db = latex2text.get_default_latex_context_db() 45 | self.l2t_context_db.add_context_category( 46 | 'Abstract', 47 | macros={}, 48 | environments=[ 49 | latex2text.EnvironmentTextSpec("abstract", simplify_repl=r'§ ABSTRACT %(body)s'), 50 | latex2text.EnvironmentTextSpec("Abstract", simplify_repl=r'§ ABSTRACT %(body)s') 51 | ], 52 | specials={} 53 | ) 54 | self.l2t_context_db = filter_element(self.l2t_context_db, ['href']) 55 | 56 | self.l2t = latex2text.LatexNodes2Text(latex_context=self.l2t_context_db) 57 | 58 | def extract(self, latex_code): 59 | result = parse_tex_ignore_figures(latex_code) 60 | return self.l2t.nodelist_to_text(result) 61 | 62 | def remove_figure_nodes(node_list): 63 | filtered_node_list = [] 64 | for node in node_list: 65 | # Ignore the 'figure' environment 66 | if node.isNodeType(LatexEnvironmentNode): 67 | if node.environmentname in [ 'figure', 'figure*', 'algorithm', 'table', 'table*', 'algorithmic']: 68 | continue 69 | if hasattr(node, 'nodelist'): 70 | node.nodelist = remove_figure_nodes(node.nodelist) 71 | filtered_node_list.append(node) 72 | return filtered_node_list 73 | 74 | def parse_tex_ignore_figures(tex_code): 75 | walker = LatexWalker(tex_code) 76 | parsed = walker.get_latex_nodes()[0] 77 | 78 | for node in parsed: 79 | if node.isNodeType(LatexEnvironmentNode): 80 | if node.environmentname == 'document': 81 | parsed = [node] 82 | break 83 | 84 | filtered_nodes = remove_figure_nodes(parsed) 85 | return filtered_nodes 86 | 87 | def resolve_input_commands(latex_code, base_dir="."): 88 | input_pattern = re.compile(r"(? 
1: 168 | if 'main.tex' in tex_files: tex_files = ['main.tex'] 169 | else: 170 | self.logger.info(f'------ Found multiple tex files: {tex_files}') 171 | return 172 | elif len(tex_files) == 0: 173 | self.logger.info(f'------ Found no tex files') 174 | return 175 | tex_file = tex_files[0] 176 | with open(f'./{paper_id}/{tex_file}', 'r', encoding='utf-8', errors='ignore') as f: 177 | latex_code = f.read() 178 | if '\\input' in latex_code: 179 | latex_code = resolve_input_commands(latex_code, base_dir=f'./{paper_id}') 180 | text = self.text_extractor.extract(latex_code) 181 | 182 | meta_data['text'] = text 183 | with open(f'{self.text_save_dir}/{paper_id}.json', 'w') as f: 184 | json.dump(meta_data, f, ensure_ascii=False) 185 | 186 | self.logger.info(f'------ Saved {paper_id}.json') 187 | 188 | except Exception as e: 189 | self.logger.error(f'ERROR: {e}') 190 | time.sleep(3) 191 | return 192 | 193 | finally: 194 | shutil.rmtree(f'./{paper_id}') 195 | os.remove(f'{paper_id}.arxiv_source') 196 | 197 | 198 | if __name__ == '__main__': 199 | today = datetime.date.today() 200 | year = today.year 201 | month = today.month 202 | save_dir = './arxiv_data/' 203 | 204 | hf_token = os.environ['HF_TOKEN'] 205 | time_stamp = f'{year}-{month:02d}' 206 | 207 | first_day = datetime.date(int(year), int(month), 1) 208 | last_day = datetime.date(int(year), int(month), 28) 209 | 210 | start_time_str = first_day.strftime("%Y%m%d%H%M%S") 211 | end_time_str = last_day.strftime("%Y%m%d%H%M%S") 212 | 213 | text_save_dir = os.path.join(save_dir, time_stamp) 214 | if not os.path.exists(text_save_dir): 215 | os.makedirs(text_save_dir) 216 | 217 | search = arxiv.Search( 218 | query=f'submittedDate:[{start_time_str} TO {end_time_str}]', 219 | sort_by = arxiv.SortCriterion.SubmittedDate, 220 | sort_order=arxiv.SortOrder.Descending, 221 | max_results=1000 222 | ) 223 | 224 | q = Queue() 225 | num_threads = 4 226 | 227 | for i in range(num_threads): 228 | worker = Worker(q, i, text_save_dir,) 229 | worker.daemon = True 230 | worker.start() 231 | 232 | for index, result in enumerate(search.results()): 233 | q.put((index, result)) 234 | 235 | q.join() 236 | 237 | print(f"Finished {time_stamp}") 238 | 239 | files = glob(f'{text_save_dir}/*.json') 240 | ds = datasets.load_dataset('json', data_files=files, split='train') 241 | 242 | ds.push_to_hub( 243 | "RealTimeData/arxiv_alltime", 244 | config_name=time_stamp, 245 | token=hf_token, 246 | ) -------------------------------------------------------------------------------- /arxiv_downloader.py: -------------------------------------------------------------------------------- 1 | import arxiv 2 | import datetime 3 | from queue import Queue 4 | from threading import Thread, Lock 5 | import os 6 | import logging 7 | import time 8 | import tarfile 9 | from pylatexenc.latexwalker import LatexWalker, LatexEnvironmentNode, LatexMacroNode 10 | from pylatexenc import latex2text 11 | from pylatexenc.macrospec import LatexContextDb 12 | import shutil 13 | import re 14 | import json 15 | from glob import glob 16 | from huggingface_hub import create_branch, create_tag, RepoCard 17 | import datasets 18 | import sys 19 | 20 | def filter_element(context, exclude_elements = []): 21 | 22 | new_context = LatexContextDb() 23 | 24 | new_context.unknown_macro_spec = context.unknown_macro_spec 25 | new_context.unknown_environment_spec = context.unknown_environment_spec 26 | new_context.unknown_specials_spec = context.unknown_specials_spec 27 | 28 | filter_element_func = lambda dict_to_filter: {k:v for k,v 
in dict_to_filter.items() if k not in exclude_elements}.values() 29 | for cat in context.category_list: 30 | 31 | # include this category 32 | new_context.add_context_category( 33 | cat, 34 | macros=filter_element_func(context.d[cat]['macros']), 35 | environments=filter_element_func(context.d[cat]['environments']), 36 | specials=filter_element_func(context.d[cat]['specials']), 37 | ) 38 | 39 | return new_context 40 | 41 | class TextExtractor: 42 | 43 | def __init__(self): 44 | self.l2t_context_db = latex2text.get_default_latex_context_db() 45 | self.l2t_context_db.add_context_category( 46 | 'Abstract', 47 | macros={}, 48 | environments=[ 49 | latex2text.EnvironmentTextSpec("abstract", simplify_repl=r'§ ABSTRACT %(body)s'), 50 | latex2text.EnvironmentTextSpec("Abstract", simplify_repl=r'§ ABSTRACT %(body)s') 51 | ], 52 | specials={} 53 | ) 54 | self.l2t_context_db = filter_element(self.l2t_context_db, ['href']) 55 | 56 | self.l2t = latex2text.LatexNodes2Text(latex_context=self.l2t_context_db) 57 | 58 | def extract(self, latex_code): 59 | result = parse_tex_ignore_figures(latex_code) 60 | return self.l2t.nodelist_to_text(result) 61 | 62 | def remove_figure_nodes(node_list): 63 | filtered_node_list = [] 64 | for node in node_list: 65 | # Ignore the 'figure' environment 66 | if node.isNodeType(LatexEnvironmentNode): 67 | if node.environmentname in [ 'figure', 'figure*', 'algorithm', 'table', 'table*', 'algorithmic']: 68 | continue 69 | if hasattr(node, 'nodelist'): 70 | node.nodelist = remove_figure_nodes(node.nodelist) 71 | filtered_node_list.append(node) 72 | return filtered_node_list 73 | 74 | def parse_tex_ignore_figures(tex_code): 75 | walker = LatexWalker(tex_code) 76 | parsed = walker.get_latex_nodes()[0] 77 | 78 | for node in parsed: 79 | if node.isNodeType(LatexEnvironmentNode): 80 | if node.environmentname == 'document': 81 | parsed = [node] 82 | break 83 | 84 | filtered_nodes = remove_figure_nodes(parsed) 85 | return filtered_nodes 86 | 87 | def resolve_input_commands(latex_code, base_dir="."): 88 | input_pattern = re.compile(r"(? 
1: 168 | if 'main.tex' in tex_files: tex_files = ['main.tex'] 169 | else: 170 | self.logger.info(f'------ Found multiple tex files: {tex_files}') 171 | return 172 | elif len(tex_files) == 0: 173 | self.logger.info(f'------ Found no tex files') 174 | return 175 | tex_file = tex_files[0] 176 | with open(f'./{paper_id}/{tex_file}', 'r', encoding='utf-8', errors='ignore') as f: 177 | latex_code = f.read() 178 | if '\\input' in latex_code: 179 | latex_code = resolve_input_commands(latex_code, base_dir=f'./{paper_id}') 180 | text = self.text_extractor.extract(latex_code) 181 | 182 | meta_data['text'] = text 183 | with open(f'{self.text_save_dir}/{paper_id}.json', 'w') as f: 184 | json.dump(meta_data, f, ensure_ascii=False) 185 | 186 | self.logger.info(f'------ Saved {paper_id}.json') 187 | 188 | except Exception as e: 189 | self.logger.error(f'ERROR: {e}') 190 | time.sleep(3) 191 | return 192 | 193 | finally: 194 | shutil.rmtree(f'./{paper_id}') 195 | os.remove(f'{paper_id}.arxiv_source') 196 | 197 | 198 | if __name__ == '__main__': 199 | hf_token = os.environ['HF_TOKEN'] 200 | 201 | today = datetime.date.today() 202 | start_time = today - datetime.timedelta(days=7) 203 | 204 | start_time_str = start_time.strftime("%Y%m%d%H%M%S") 205 | end_time_str = today.strftime("%Y%m%d%H%M%S") 206 | 207 | text_save_dir = f'arxiv_{start_time_str}_to_{end_time_str}' 208 | if not os.path.exists(text_save_dir): 209 | os.makedirs(text_save_dir) 210 | 211 | search = arxiv.Search( 212 | query=f'submittedDate:[{start_time_str} TO {end_time_str}]', 213 | sort_by = arxiv.SortCriterion.SubmittedDate, 214 | sort_order=arxiv.SortOrder.Descending, 215 | max_results=1600 216 | ) 217 | 218 | q = Queue() 219 | num_threads = 4 220 | 221 | for i in range(num_threads): 222 | worker = Worker(q, i, text_save_dir,) 223 | worker.daemon = True 224 | worker.start() 225 | 226 | for index, result in enumerate(search.results()): 227 | q.put((index, result)) 228 | 229 | q.join() 230 | 231 | files = glob(f'{text_save_dir}/*.json') 232 | ds = datasets.load_dataset('json', data_files=files, split='train') 233 | 234 | try: 235 | create_branch('RealTimeData/arxiv_latest', branch=today.isoformat(), token=hf_token, repo_type='dataset') 236 | except: 237 | pass 238 | ds.push_to_hub('RealTimeData/arxiv_latest', token=hf_token, branch=today.isoformat()) 239 | ds.push_to_hub('RealTimeData/arxiv_latest', token=hf_token, branch='main') 240 | 241 | text = f""" 242 | # Latest arXiv 243 | 244 | You can always access the latest arXiv papers via this dataset. 245 | 246 | We update the dataset weekly, on every Sunday. So the dataset always provides the latest arXiv papers created in the past week. 247 | 248 | The current dataset on main branch contains the latest arXiv papers submitted from {start_time.isoformat()} to {today.isoformat()}. 249 | 250 | The data collection was conducted on {today.isoformat()}. 251 | 252 | Use the dataset via: 253 | ``` 254 | ds = datasets.load_dataset('RealTimeData/arxiv_latest') 255 | ``` 256 | 257 | # Previous versions 258 | 259 | You can access previous versions by requesting different branches. 260 | 261 | For example, you can find the 2023-08-20 version via: 262 | ``` 263 | ds = datasets.load_dataset('RealTimeData/arxiv_latest', revision = '2023-08-20') 264 | ``` 265 | 266 | Check all available versions by clicking the "Files and versions" button on the top bar.
267 | """ 268 | card = RepoCard(text) 269 | card.push_to_hub('RealTimeData/arxiv_latest', repo_type='dataset', token=hf_token) -------------------------------------------------------------------------------- /data/wikipedia.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import mwparserfromhell 3 | import json 4 | import os 5 | from transformers import LlamaForCausalLM, LlamaTokenizerFast, AutoModelForCausalLM, AutoTokenizer, OPTForCausalLM 6 | import sys 7 | import torch 8 | from tqdm import tqdm 9 | import traceback 10 | from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig 11 | import datasets 12 | import numpy as np 13 | import time 14 | import openai 15 | from doc_info import verbalise_docs 16 | 17 | WIKI_API_ENDPOINT = "https://en.wikipedia.org/w/api.php" 18 | 19 | def self_info(text, model, tokenizer, merge = False): 20 | def merge_sub_tokens(log_probs, word_ids): 21 | # merge log probs of sub_tokens 22 | merged_log_probs = [] 23 | current_word_id = None 24 | current_word_log_prob = None 25 | counter = 1 26 | 27 | for log_prob, word_id in zip(log_probs, word_ids): 28 | if word_id is not None: 29 | if current_word_id != word_id: 30 | if current_word_id is not None: 31 | merged_log_probs.extend([current_word_log_prob] * counter) 32 | counter = 1 33 | current_word_id = word_id 34 | current_word_log_prob = log_prob 35 | else: 36 | counter += 1 37 | current_word_log_prob = current_word_log_prob + log_prob 38 | 39 | if current_word_id is not None: 40 | merged_log_probs.extend([current_word_log_prob] * counter) 41 | 42 | return merged_log_probs 43 | 44 | # this function is used to get the self-information of a text 45 | # the model should be a causal language model, e.g. GPT2LMHeadModel 46 | 47 | # tokenize the text 48 | text = f"{tokenizer.bos_token}{text}" 49 | encoding = tokenizer(text, return_tensors="pt", max_length=model.config.max_position_embeddings, truncation=True) 50 | encoding = encoding.to(model.device) 51 | 52 | # get the logits 53 | with torch.no_grad(): 54 | logits = model(**encoding).logits 55 | probs = torch.softmax(logits, dim=-1) 56 | info = -torch.log(probs) 57 | 58 | input_ids = encoding['input_ids'] 59 | input_ids_expaned = input_ids[:, 1:].unsqueeze(-1) 60 | info = info[:, :-1].gather(-1, input_ids_expaned).squeeze(-1).squeeze(0).tolist() 61 | 62 | tokens = [tokenizer.decode(token_) for token_ in input_ids.squeeze().tolist()[1:]] 63 | if merge: 64 | info = merge_sub_tokens(info, encoding.word_ids()[1:]) 65 | return tokens, info 66 | 67 | def gpt3_self_info(text, num_retry = 5): 68 | # text = text[:1000] 69 | openai.api_key = os.environ["OPENAI_API_KEY"] 70 | 71 | for _ in range(num_retry): 72 | try: 73 | r = openai.Completion.create( 74 | model="curie", 75 | prompt=f"<|endoftext|>{text}", 76 | max_tokens=0, 77 | temperature=0, 78 | echo=True, 79 | logprobs=0, 80 | ) 81 | break 82 | except Exception as e: 83 | print(e) 84 | time.sleep(1) 85 | 86 | result = r['choices'][0] 87 | tokens, logprobs = result["logprobs"]["tokens"][1:], result["logprobs"]["token_logprobs"][1:] 88 | 89 | assert len(tokens) == len(logprobs), f"Expected {len(tokens)} logprobs, got {len(logprobs)}" 90 | 91 | self_info = [ -logprob for logprob in logprobs] 92 | # TODO: deal with the first delimiter 93 | return tokens, self_info 94 | 95 | def fetch_recent_changes(from_date, to_date = '2023-08-01T00:00:00'): 96 | params = { 97 | "action": "query", 98 | "format": "json", 99 | "list": "recentchanges", 100 | "rcstart": to_date, # 
starting from the newer date 101 | "rcend": from_date, # ending at the older date 102 | "rctype": "new", 103 | "rcnamespace": "0", 104 | "rclimit": "500", 105 | "rcprop": "title|timestamp" 106 | } 107 | req = requests.Request('GET', WIKI_API_ENDPOINT, params=params).prepare() 108 | response = requests.get(WIKI_API_ENDPOINT, params=params).json() 109 | 110 | # Check if the response contains the expected data 111 | if 'query' in response and 'recentchanges' in response['query']: 112 | return [entry['title'] for entry in response['query']['recentchanges']] 113 | else: 114 | return [] 115 | 116 | def fetch_content(title, date=None): 117 | params = { 118 | "action": "query", 119 | "format": "json", 120 | "titles": title, 121 | "prop": "revisions", 122 | "rvprop": "content", 123 | "rvlimit": "1", 124 | } 125 | if date: params["rvstart"] = date 126 | try: 127 | response = requests.get(WIKI_API_ENDPOINT, params=params) 128 | response.raise_for_status() # Will raise an error if the HTTP request returned an unsuccessful status code 129 | data = response.json() 130 | if 'error' in data: 131 | print(f"Error fetching content for {title}: {data['error']['info']}") 132 | return None 133 | 134 | page = next(iter(data['query']['pages'].values())) 135 | if 'revisions' not in page: 136 | print(f"No revisions found for {title}") 137 | return None 138 | content = page['revisions'][0]['*'] 139 | 140 | # Check if the content is a redirect and skip if true 141 | if content.startswith("#REDIRECT"): 142 | print(f"{title} is a redirect page.") 143 | return None 144 | return content 145 | 146 | except Exception as e: 147 | print(f"An error occurred while fetching content for {title}: {str(e)}") 148 | traceback.print_exc() # This will print the full traceback 149 | 150 | return None 151 | 152 | def parse_to_plain_text(wikitext): 153 | parsed = mwparserfromhell.parse(wikitext) 154 | return parsed.strip_code() 155 | 156 | def select_token_window(text, token_count=400): 157 | tokens = text.split() 158 | if len(tokens) <= token_count: 159 | return text 160 | ramdom_start = np.random.randint(0, len(tokens) - token_count) 161 | tokens = tokens[ramdom_start:ramdom_start + token_count] 162 | return ' '.join(tokens) 163 | 164 | def fetch_latest_and_historical_wiki_pages(cache_dir = '', historical_date = '2022-07-01T00:00:00Z', token_count = 300): 165 | # 1. Fetch the latest created pages from July 2023 and their content. 166 | recent_wiki_path = os.path.join(cache_dir, 'recent_wiki_pages.json') 167 | if not os.path.exists(recent_wiki_path): 168 | recent_titles = fetch_recent_changes("2023-07-01T00:00:00Z") 169 | recent_contents = [fetch_content(title) for title in tqdm(recent_titles)] 170 | recent_contents = [content for content in recent_contents if content is not None] 171 | 172 | data_to_save = {title: content for title, content in zip(recent_titles, recent_contents)} 173 | with open(recent_wiki_path, 'w') as file: 174 | json.dump(data_to_save, file, ensure_ascii=False, indent=4) 175 | else: 176 | with open(recent_wiki_path) as file: 177 | data_to_save = json.load(file) 178 | recent_titles = list(data_to_save.keys()) 179 | recent_contents = list(data_to_save.values()) 180 | recent_contents = [content for content in recent_contents if content is not None] 181 | 182 | # 2. Fetch a historical version of a specific title from July 2022. 
183 | historical_wiki_path = os.path.join(cache_dir, 'historical_wiki_pages.json') 184 | if not os.path.exists(historical_wiki_path): 185 | with open(os.path.join(cache_dir, 'data/squad_wiki_title.text')) as f: 186 | titles = [line.strip() for line in f.readlines()] 187 | historical_contents = [fetch_content(title, historical_date) for title in tqdm(titles)] 188 | historical_contents = [content for content in historical_contents if content is not None] 189 | historical_to_save = {title: content for title, content in zip(titles, historical_contents)} 190 | with open(historical_wiki_path, 'w') as file: 191 | json.dump(historical_to_save, file, ensure_ascii=False, indent=4) 192 | else: 193 | with open(historical_wiki_path) as file: 194 | historical_to_save = json.load(file) 195 | historical_titles = list(historical_to_save.keys()) 196 | historical_contents = list(historical_to_save.values()) 197 | historical_contents = [content for content in historical_contents if content is not None] 198 | 199 | # 3. Parse the content to plain text. 200 | recent_plain_text_path = os.path.join(cache_dir, 'recent_plain_text.json') 201 | historical_plain_text_path = os.path.join(cache_dir, 'historical_plain_text.json') 202 | if not os.path.exists(recent_plain_text_path): 203 | plain_texts_recent = [parse_to_plain_text(content) for content in recent_contents] 204 | plain_texts_historical = [parse_to_plain_text(content) for content in historical_contents] 205 | with open(recent_plain_text_path, 'w') as file: 206 | json.dump(plain_texts_recent, file, ensure_ascii=False, indent=4) 207 | with open(historical_plain_text_path, 'w') as file: 208 | json.dump(plain_texts_historical, file, ensure_ascii=False, indent=4) 209 | else: 210 | with open(recent_plain_text_path) as file: 211 | plain_texts_recent = json.load(file) 212 | with open(historical_plain_text_path) as file: 213 | plain_texts_historical = json.load(file) 214 | 215 | # 4. Select a window of token_count words from each text.
216 | selected_windows_recent = [select_token_window(text, token_count=token_count) for text in plain_texts_recent] 217 | selected_windows_historical = [select_token_window(text, token_count=token_count) for text in plain_texts_historical] 218 | 219 | return selected_windows_recent, selected_windows_historical 220 | 221 | def prepare_comparing_data(datasets_and_texts_col, num_samples=200, token_count=300): 222 | # datasets_and_texts is a dict of list {dataset_name: col_name} 223 | 224 | datasets_and_texts = {} 225 | for dataset_name, col_name in datasets_and_texts_col.items(): 226 | if dataset_name in ['quac', 'squad_v2', 'boolq', 'iohadrubin/mini_xsum', 'liyucheng/trivia_qa_wiki_val']: 227 | ds = datasets.load_dataset(dataset_name, split='validation') 228 | elif 'RealTimeData' in dataset_name: 229 | ds = datasets.load_dataset(dataset_name, split='train') 230 | ds = ds[col_name][:num_samples] 231 | 232 | datasets_and_texts[dataset_name + f'_{token_count}_words'] = [select_token_window(text, token_count=token_count) for text in ds] 233 | # datasets_and_texts[dataset_name + '_200_words'] = [select_token_window(text, token_count=200) for text in ds] 234 | 235 | return datasets_and_texts 236 | 237 | if __name__ == "__main__": 238 | cwd, model_name, token_count, = sys.argv[1:] 239 | token_count = int(token_count) 240 | batch_size = 8 241 | 242 | recent_snippets, historical_snippets = fetch_latest_and_historical_wiki_pages(cache_dir=cwd, token_count=token_count) 243 | recent_snippets = recent_snippets[:120] 244 | historical_snippets = historical_snippets[:120] 245 | wikipedia_and_texts = { 246 | 'wiki_recent': recent_snippets, 247 | 'wiki_historical': historical_snippets 248 | } 249 | # datasets_and_texts = prepare_comparing_data({ 250 | # 'liyucheng/trivia_qa_wiki_val': 'wiki_context_sample' 251 | # 'RealTimeData/bbc_latest': 'content', 252 | # 'RealTimeData/bbc_2017': 'content', 253 | # 'iohadrubin/mini_xsum': 'document' 254 | # 'quac': 'context', 255 | # 'boolq': 'passage', 256 | # 'squad_v2': 'context', 257 | # 'RealTimeData/github_july_week1_2023': 'readme', 258 | # 'RealTimeData/arxiv_july_week1_2023': 'text', 259 | # 'RealTimeData/bbc_news_week1_july_2023': 'content', 260 | # }, token_count=token_count, num_samples=120) 261 | datasets_and_texts = verbalise_docs(num_words=token_count) 262 | 263 | if 'GPTQ' in model_name: 264 | # only llama-30b use gptq 265 | model = AutoGPTQForCausalLM.from_quantized(model_name, device = 'cuda:0', use_safetensors = True, disable_exllama=True if '30b' in model_name else False) 266 | tokenizer = LlamaTokenizerFast.from_pretrained(model_name) 267 | elif 'llama' in model_name.lower(): 268 | model = LlamaForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map='auto') 269 | tokenizer = LlamaTokenizerFast.from_pretrained(model_name) 270 | elif 'opt' in model_name.lower(): 271 | model = OPTForCausalLM.from_pretrained(model_name, device_map='auto') 272 | tokenizer = AutoTokenizer.from_pretrained(model_name) 273 | elif 'gpt2' == model_name.lower(): 274 | model = AutoModelForCausalLM.from_pretrained(model_name) 275 | tokenizer = AutoTokenizer.from_pretrained(model_name) 276 | 277 | # datasets_and_texts = prepare_comparing_data({ 278 | # 'RealTimeData/News_Seq_2021': 'maintext', 279 | # 'RealTimeData/News_August_2023': 'maintext', 280 | # }) 281 | 282 | # datasets_and_texts.update(wikipedia_and_texts) 283 | 284 | print('=====================') 285 | print(f'Model: {model_name}') 286 | 287 | for dataset_name, texts in datasets_and_texts.items(): 
288 | print(f'=====================') 289 | print(f'Dataset: {dataset_name}') 290 | infos = [] 291 | for text in tqdm(texts): 292 | try: 293 | if 'curie' in model_name.lower(): 294 | tokens, info = gpt3_self_info(text) 295 | else: 296 | tokens, info = self_info(text, model, tokenizer) 297 | except: 298 | traceback.print_exc() 299 | time.sleep(10) 300 | continue 301 | # print('text:', text, '\ninfo:', info) 302 | infos.append(sum(info)/len(info)) 303 | print(f'Average self-info: {sum(infos)/len(infos)}') -------------------------------------------------------------------------------- /bbc_downloader.py: -------------------------------------------------------------------------------- 1 | import weakref 2 | import requests 3 | 4 | from configobj import ConfigObj 5 | 6 | class Configuration: 7 | 8 | def __init__(self): 9 | self.__properties = dict() 10 | properties = self._init_properties() 11 | for property_, value, transform_fn in properties: 12 | if transform_fn is not None: 13 | value = transform_fn(value) 14 | setattr(self, property_, value) 15 | self.__properties[property_] = { 16 | 'default-value': value, 17 | 'transform_fn': transform_fn 18 | } 19 | 20 | def _init_properties(self): 21 | # [[name, default-value, transform_fn]] 22 | return [] 23 | 24 | # TODO: hierachical config 25 | def load(self, path): 26 | config = ConfigObj(path, encoding='UTF-8') 27 | for property_, value in config.items(): 28 | transform_fn = self.__properties[property_]['transform_fn'] 29 | if transform_fn is not None: 30 | value = transform_fn(value) 31 | setattr(self, property_, value) 32 | 33 | from dateutil.relativedelta import relativedelta 34 | # from datetime import datetime, date 35 | import datetime 36 | 37 | class DatasetConfiguration(Configuration): 38 | 39 | def _format_date(self, date_str): 40 | return datetime.datetime.strptime(date_str, '%Y-%m-%d') 41 | 42 | def _calculate_step(self, step): 43 | step = int(step) 44 | if self.step_unit == 'day': 45 | return relativedelta(days=step) 46 | elif self.step_unit == 'month': 47 | return relativedelta(months=step) 48 | else: 49 | return relativedelta(years=step) 50 | 51 | def _init_properties(self): 52 | return [ 53 | ['name', '', str], 54 | ['base_api_url', 'http://dracos.co.uk/made/bbc-news-archive/{year}/{month:0>2}/{day:0>2}/', str], 55 | ['start_date', '2016-01-01', self._format_date], 56 | ['end_date', '2017-01-01', self._format_date], 57 | ['step_unit', 'day', str], 58 | ['step', 1, self._calculate_step], 59 | ['path', './dataset/bbc/', str], 60 | ['sleep', 1, float] 61 | ] 62 | 63 | class NetWorkConfiguration(Configuration): 64 | 65 | HTTP_TIMEOUT = 30 66 | STRICT = True 67 | USER_AGENT = 'Mozilla' 68 | 69 | def _init_properties(self): 70 | return [ 71 | ['browser_user_agent', 'Mozilla', str], 72 | ['http_timeout', 30, int], 73 | ['strict', True, lambda v: str(v) == 'True'] 74 | ] 75 | 76 | class NetworkError(RuntimeError): 77 | 78 | def __init__(self, status_code, reason): 79 | self.reason = reason 80 | self.status_code = status_code 81 | 82 | class NetworkFetcher(object): 83 | 84 | def __init__(self): 85 | self.config = NetWorkConfiguration() 86 | # self.config.load('./settings/network.cfg') 87 | self.config.strict = False 88 | 89 | self._connection = requests.Session() 90 | self._connection.headers['User-agent'] = self.config.browser_user_agent 91 | self._finalizer = weakref.finalize(self, self.close) 92 | 93 | self._url = None 94 | self.response = None 95 | self.headers = None 96 | 97 | def close(self): 98 | if self._connection is not None: 99 | 
self._connection.close() 100 | self._connection = None 101 | 102 | def get_url(self): 103 | return self._url 104 | 105 | def fetch(self, url): 106 | try: 107 | response = self._connection.get(url, timeout=self.config.http_timeout, headers=self.headers) 108 | except Exception: 109 | return None 110 | if response.ok: 111 | self._url = response.url 112 | text = response.content 113 | else: 114 | self._url = None 115 | text = None 116 | if self.config.strict: 117 | raise NetworkError(response.status_code, response.reason) 118 | 119 | return text 120 | 121 | class DownloadLinkFetcher: 122 | 123 | RETRY = 5 124 | 125 | def __init__(self, config): 126 | self.base_api_url = config.base_api_url 127 | 128 | self.start_date = config.start_date 129 | self.current_date = config.start_date 130 | self.end_date = config.end_date 131 | self.step_unit = config.step_unit 132 | self.step = config.step 133 | 134 | self.html_fetcher = NetworkFetcher() 135 | 136 | def _format_link(self, link): 137 | print(link) 138 | hash_index = link.find('#') 139 | if hash_index != -1: 140 | link = link[:hash_index] 141 | if link and link[-1] == '/': 142 | link = link[:-1] 143 | return link 144 | 145 | def _link_filter(self, link, filters): 146 | if not link: 147 | return False 148 | if not link[-1].isdigit(): 149 | return False 150 | for filter_ in filters: 151 | if link[filter_[1]:filter_[2]] == filter_[0]: 152 | return False 153 | return True 154 | 155 | def _html_to_links(self, html): 156 | return [] 157 | 158 | def _next_api(self, base_url, current_date): 159 | return '' 160 | 161 | def next(self): 162 | if self.current_date >= self.end_date: 163 | return None, None 164 | api_url = self._next_api(self.base_api_url, self.current_date) 165 | date = self.current_date 166 | self.current_date += self.step 167 | return api_url, date 168 | 169 | def fetch(self, api_url): 170 | print('fetching download links...') 171 | html = self.html_fetcher.fetch(api_url) 172 | if html is None: 173 | for _ in range(0, self.RETRY): 174 | html = self.html_fetcher.fetch(api_url) 175 | if html is not None: 176 | break 177 | if html is None or len(html) == 0: 178 | print('api', api_url, ' failed') 179 | return [] 180 | links = self._html_to_links(html) 181 | return links 182 | 183 | from bs4 import BeautifulSoup 184 | 185 | class BBCLinkFetcher(DownloadLinkFetcher): 186 | 187 | BBC_FILTERS = [ 188 | ['programmes', 21, 31], 189 | ['correspondents', 26, 40], 190 | ['iplayer', 21, 28], 191 | ['radio', 21, 26], 192 | ['live', 27, 31], 193 | ['m', 7, 8], 194 | ['video_and_audio', 26, 41] 195 | ] 196 | 197 | def _next_api(self, base_url, current_date): 198 | year = current_date.year 199 | month = current_date.month 200 | day = current_date.day 201 | api_url = base_url.format(year=year, month=month, day=day) 202 | return api_url 203 | 204 | def _html_to_links(self, html): 205 | soup = BeautifulSoup(html, 'lxml') 206 | 207 | links = list() 208 | # news links are the hrefs of a 209 | elements = soup.table.find_all('a') 210 | # elements = soup.table.find_all('a', class_='title-link') 211 | for element in elements: 212 | href = element.get('href') 213 | if not href: 214 | continue 215 | link = self._format_link(href) 216 | if self._link_filter(link, self.BBC_FILTERS): 217 | links.append(link) 218 | 219 | return list(set(links)) 220 | 221 | 222 | import sys 223 | import os.path 224 | import json 225 | import time 226 | from datetime import timedelta 227 | 228 | class ArticleFetcher: 229 | 230 | RETRY = 5 231 | 232 | def __init__(self, config): 233 | 
self.config = config 234 | self.download_link_fetcher = None 235 | self.html_fetcher = NetworkFetcher() 236 | self.path = config.path 237 | 238 | self.total_date = 0 239 | 240 | self._mkdir(self.path, 241 | config.start_date, 242 | config.end_date, 243 | config.step) 244 | 245 | def _mkdir(self, path, start_date, end_date, step): 246 | if os.path.isdir(path): 247 | # current_date = start_date 248 | # while current_date < end_date: 249 | # current_date += step 250 | # self.total_date += 1 251 | # return 252 | pass 253 | else: 254 | os.makedirs(path) 255 | current_date = start_date 256 | existed_years = dict() 257 | while current_date < end_date: 258 | year = current_date.year 259 | month = current_date.month 260 | day = current_date.day 261 | 262 | year_path = os.path.join(path, str(year)) 263 | month_path = os.path.join(year_path, str(month)) 264 | day_path = os.path.join(month_path, str(day)) 265 | 266 | if year not in existed_years.keys(): 267 | existed_years[year] = dict() 268 | if not os.path.isdir(year_path): 269 | os.mkdir(year_path) 270 | 271 | if (step.months > 0) or (step.days > 0): 272 | year_content = existed_years[year] 273 | if month not in year_content.keys(): 274 | year_content[month] = True 275 | if not os.path.isdir(month_path): 276 | os.mkdir(month_path) 277 | 278 | if step.days > 0: 279 | if not os.path.isdir(day_path): 280 | os.mkdir(day_path) 281 | current_date += step 282 | 283 | self.total_date += 1 284 | 285 | def _html_to_infomation(self, html, link, date): 286 | return {} 287 | 288 | def _extract_information(self, link, date): 289 | html = self.html_fetcher.fetch(link) 290 | if html is None: 291 | for _ in range(0, self.RETRY): 292 | html = self.html_fetcher.fetch(link) 293 | if html is not None: 294 | break 295 | if html is None: 296 | print('article ', link, 'failed') 297 | return None 298 | return self._html_to_infomation(html, link, date) 299 | 300 | def _get_storage_path(self, path, date): 301 | return os.path.join(path, str(date.year), str(date.month), str(date.day)) 302 | 303 | def _lazy_storage(self, storage_path, links, date, current_date): 304 | total_links = len(links) 305 | current_link = 1 306 | 307 | titles_path = os.path.join(storage_path, f'titles.{current_date}') 308 | with open(titles_path, mode='w', encoding='utf-8') as titles_file: 309 | articles = list() 310 | titles = list() 311 | for link in links: 312 | print('>>> {c} in {t} articles\r'.format(c=current_link, t=total_links), end='') 313 | current_link += 1 314 | 315 | article = self._extract_information(link, date) 316 | if article is not None: 317 | titles.append(article['title'] + '\n') 318 | articles.append(article) 319 | 320 | articles_path = os.path.join(storage_path, f'articles.{current_date}') 321 | with open(articles_path, mode='w', encoding='utf-8') as articles_file: 322 | json.dump({ 323 | 'expected_number': len(links), 324 | 'number': len(articles), 325 | 'articles': articles 326 | }, articles_file, indent=4) 327 | titles_file.writelines(titles) 328 | 329 | def _non_lazy_storage(self, storage_path, links, date): 330 | total_links = len(links) 331 | current_link = 1 332 | 333 | titles_path = os.path.join(storage_path, 'titles') 334 | with open(titles_path, mode='w', encoding='utf-8') as titles_file: 335 | for article_index, link in enumerate(links): 336 | print('{c} in {t} articles\r'.format(c=current_link, t=total_links), end='') 337 | current_link += 1 338 | 339 | article = self._extract_information(link, date) 340 | if article is not None: 341 | 
titles_file.write(article['title'] + '\n') 342 | 343 | article_path = os.path.join(storage_path, str(article_index)) 344 | with open(article_path, mode='w', encoding='utf-8') as article_file: 345 | json.dump(article, article_file, indent=4) 346 | 347 | def fetch(self, lazy_storage=True): 348 | current_date = 1 349 | while True: 350 | api_url, date = self.download_link_fetcher.next() 351 | if api_url is None: 352 | break 353 | print(date.strftime('%Y-%m-%d'), 354 | '{c} in {t} dates '.format(c=current_date, t=self.total_date)) 355 | 356 | # storage_path = self._get_storage_path(self.path, date) 357 | storage_path = self.path 358 | links = self.download_link_fetcher.fetch(api_url) 359 | if lazy_storage: 360 | self._lazy_storage(storage_path, links, date, current_date) 361 | else: 362 | self._non_lazy_storage(storage_path, links, date) 363 | 364 | time.sleep(self.config.sleep) 365 | 366 | print(date.strftime('%Y-%m-%d'), 367 | 'date {c} finished '.format(c=current_date)) 368 | current_date += 1 369 | 370 | import json 371 | 372 | from bs4 import BeautifulSoup 373 | from goose3 import Goose 374 | from goose3.extractors.content import ContentExtractor 375 | 376 | eps = 1e-6 377 | f1 = ContentExtractor.calculate_best_node 378 | f2 = ContentExtractor.post_cleanup 379 | 380 | 381 | def post_cleanup(ce_inst): 382 | """\ 383 | remove any divs that looks like non-content, 384 | clusters of links, or paras with no gusto 385 | """ 386 | parse_tags = ['p'] 387 | if ce_inst.config.parse_lists: 388 | parse_tags.extend(['ul', 'ol']) 389 | if ce_inst.config.parse_headers: 390 | parse_tags.extend(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']) 391 | 392 | target_node = ce_inst.article.top_node 393 | node = ce_inst.add_siblings(target_node) 394 | for elm in ce_inst.parser.getChildren(node): 395 | e_tag = ce_inst.parser.getTag(elm) 396 | if e_tag not in parse_tags: 397 | if ce_inst.is_highlink_density(elm) or ce_inst.is_table_and_no_para_exist(elm): 398 | ce_inst.parser.remove(elm) 399 | return node 400 | 401 | 402 | def calculate_best_node(ce_inst, doc): 403 | top_node = None 404 | nodes_to_check = ce_inst.nodes_to_check(doc) 405 | 406 | starting_boost = float(1.0) 407 | cnt = 0 408 | i = 0 409 | parent_nodes = [] 410 | nodes_with_text = [] 411 | 412 | for node in nodes_to_check: 413 | text_node = ce_inst.parser.getText(node) 414 | word_stats = ce_inst.stopwords_class(language=ce_inst.get_language()).get_stopword_count(text_node) 415 | high_link_density = ce_inst.is_highlink_density(node) 416 | if word_stats.get_stopword_count() > 2 and not high_link_density: 417 | nodes_with_text.append(node) 418 | 419 | nodes_number = len(nodes_with_text) 420 | negative_scoring = 0 421 | bottom_negativescore_nodes = float(nodes_number) * 0.25 422 | 423 | for node in nodes_with_text: 424 | boost_score = float(0) 425 | # boost 426 | if ce_inst.is_boostable(node): 427 | if cnt >= 0: 428 | boost_score = float((1.0 / starting_boost) * 50) 429 | starting_boost += 1 430 | # nodes_number 431 | if nodes_number > 15: 432 | if (nodes_number - i) <= bottom_negativescore_nodes: 433 | booster = float(bottom_negativescore_nodes - (nodes_number - i)) 434 | boost_score = float(-pow(booster, float(2))) 435 | negscore = abs(boost_score) + negative_scoring 436 | if negscore > 40: 437 | boost_score = float(5) 438 | 439 | text_node = ce_inst.parser.getText(node) 440 | word_stats = ce_inst.stopwords_class(language=ce_inst.get_language()).get_stopword_count(text_node) 441 | upscore = int(word_stats.get_stopword_count() + boost_score) 442 | 443 | # parent 
node 444 | parent_node = ce_inst.parser.getParent(node) 445 | ce_inst.update_score(parent_node, upscore) 446 | ce_inst.update_node_count(parent_node, 1) 447 | 448 | if parent_node not in parent_nodes: 449 | parent_nodes.append(parent_node) 450 | 451 | # parentparent node 452 | parent_parent_node = ce_inst.parser.getParent(parent_node) 453 | if parent_parent_node is not None: 454 | ce_inst.update_node_count(parent_parent_node, 1) 455 | ce_inst.update_score(parent_parent_node, upscore - eps) 456 | if parent_parent_node not in parent_nodes: 457 | parent_nodes.append(parent_parent_node) 458 | 459 | # parentparentparent node 460 | parent_parent_parent_node = ce_inst.parser.getParent(parent_parent_node) 461 | if parent_parent_parent_node is not None: 462 | ce_inst.update_node_count(parent_parent_parent_node, 1) 463 | ce_inst.update_score(parent_parent_parent_node, upscore - 2 * eps) 464 | if parent_parent_parent_node not in parent_nodes: 465 | parent_nodes.append(parent_parent_parent_node) 466 | cnt += 1 467 | i += 1 468 | 469 | top_node_score = 0 470 | for itm in parent_nodes: 471 | score = ce_inst.get_score(itm) 472 | 473 | if score > top_node_score: 474 | top_node = itm 475 | top_node_score = score 476 | 477 | if top_node is None: 478 | top_node = itm 479 | 480 | return top_node 481 | 482 | 483 | class BBCArticleFetcher(ArticleFetcher): 484 | 485 | def __init__(self, config): 486 | super(BBCArticleFetcher, self).__init__(config) 487 | self.download_link_fetcher = BBCLinkFetcher(config) 488 | 489 | def _extract_title(self, soup): 490 | if soup.title is not None: 491 | return soup.title.get_text() 492 | 493 | def _extract_published_date(self, date): 494 | return date.strftime('%Y-%m-%d') 495 | 496 | def _extract_authors(self, soup): 497 | authors_elements = soup.find_all('meta', property='article:author') 498 | if authors_elements is not None: 499 | return [authors_element['content'] for authors_element in authors_elements] 500 | 501 | def _extract_description(self, soup): 502 | description_element = soup.find('meta', property='og:description') 503 | if description_element is not None: 504 | return description_element['content'] 505 | 506 | def _extract_section(self, soup): 507 | section_element = soup.find('meta', property='article:section') 508 | if section_element is not None: 509 | return section_element['content'] 510 | 511 | def _extract_content(self, html): 512 | ContentExtractor.calculate_best_node = calculate_best_node 513 | ContentExtractor.post_cleanup = post_cleanup 514 | g = Goose({'enable_image_fetching': False}) 515 | article = g.extract(raw_html=html) 516 | ContentExtractor.calculate_best_node = f1 517 | ContentExtractor.post_cleanup = f2 518 | return article.cleaned_text 519 | 520 | def _html_to_infomation(self, html, link, date): 521 | soup = BeautifulSoup(html, 'lxml') 522 | head = soup.head 523 | 524 | try: 525 | title = self._extract_title(head) 526 | published_date = self._extract_published_date(date) 527 | authors = self._extract_authors(head) 528 | description = self._extract_description(head) 529 | section = self._extract_section(head) 530 | content = self._extract_content(html) 531 | except Exception: 532 | return None 533 | 534 | return { 535 | 'title': title, 536 | 'published_date': published_date, 537 | 'authors': authors, 538 | 'description': description, 539 | 'section': section, 540 | 'content': content, 541 | 'link': link 542 | } 543 | 544 | if __name__ == '__main__': 545 | 546 | today = datetime.date.today() 547 | today_str = today.strftime('%Y-%m-%d') 548 | 
one_week_ago = today - datetime.timedelta(days=7) 549 | one_week_ago_str = one_week_ago.strftime('%Y-%m-%d') 550 | 551 | config = DatasetConfiguration() 552 | config.start_date = one_week_ago 553 | config.end_date = today 554 | config.path = 'dataset/bbc' 555 | 556 | bbc_article_fetcher = BBCArticleFetcher(config) 557 | bbc_article_fetcher.fetch() 558 | 559 | from glob import glob 560 | files = glob(f'dataset/bbc/articles.*') 561 | files.sort() 562 | 563 | import datasets 564 | import json 565 | import os 566 | 567 | hf_token = os.environ['HF_TOKEN'] 568 | 569 | all_articles = [] 570 | for file in files: 571 | with open(file) as f: 572 | articles = json.load(f) 573 | 574 | articles = articles['articles'] 575 | for article in articles: 576 | article['authors'] = article['authors'][0] if article['authors'] else None 577 | all_articles.append(article) 578 | 579 | with open('all_articles.json', 'w') as f: 580 | json.dump(all_articles, f, indent=4, ensure_ascii=False) 581 | 582 | ds = datasets.Dataset.from_dict({key: [article[key] for article in all_articles] for key in all_articles[0].keys()}) 583 | ds.save_to_disk('bbc') 584 | 585 | from huggingface_hub import create_branch, create_tag, RepoCard 586 | 587 | create_branch('RealTimeData/bbc_latest', repo_type='dataset', branch=today_str, token=hf_token) 588 | ds.push_to_hub('RealTimeData/bbc_latest', token=hf_token, branch='main') 589 | ds.push_to_hub('RealTimeData/bbc_latest', token=hf_token, branch=today_str) 590 | 591 | text = f""" 592 | # Latest BBC News 593 | 594 | You can always access the latest BBC News articles via this dataset. 595 | 596 | We update the dataset weekly, on every Sunday. So the dataset always provides the latest BBC News articles from the last week. 597 | 598 | The current dataset on main branch contains the latest BBC News articles published from {one_week_ago.isoformat()} to {today.isoformat()}. 599 | 600 | The data collection was conducted on {today.isoformat()}. 601 | 602 | Use the dataset via: 603 | ``` 604 | ds = datasets.load_dataset('RealTimeData/bbc_latest') 605 | ``` 606 | 607 | # Previous versions 608 | 609 | You can access previous versions by requesting different branches. 610 | 611 | For example, you can find the 2023-08-20 version via: 612 | ``` 613 | ds = datasets.load_dataset('RealTimeData/bbc_latest', revision = '2023-08-20') 614 | ``` 615 | 616 | Check all available versions by clicking the "Files and versions" button on the top bar. 617 | """ 618 | card = RepoCard(text) 619 | card.push_to_hub('RealTimeData/bbc_latest', repo_type='dataset', token=hf_token) --------------------------------------------------------------------------------
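Usage note (not a file in the repository): the crawlers above publish two kinds of Hugging Face datasets — weekly "latest" repos that refresh `main` and keep each run on a date-named branch, and monthly "alltime" repos with one configuration per `YYYY-MM` month. Below is a minimal consumer-side sketch of reading them back; it assumes the RealTimeData repos are publicly readable, and the config name `2023-07` and the `train` split are illustrative examples rather than guaranteed names.

```python
# Minimal sketch: loading the RealTimeData snapshots produced by the crawlers above.
# Assumes the repos are public; '2023-07' and 'train' are illustrative assumptions.
import datasets

# Weekly "latest" repos (e.g. bbc_latest, arxiv_latest): `main` holds the most
# recent crawl, and each weekly run is also kept on a branch named by its date.
bbc_latest = datasets.load_dataset('RealTimeData/bbc_latest')
bbc_past = datasets.load_dataset('RealTimeData/bbc_latest', revision='2023-08-20')

# Monthly "alltime" repos (e.g. arxiv_alltime, code_alltime, wikitext_alltime):
# one configuration per month, named YYYY-MM (see the config_name=... push calls).
arxiv_month = datasets.load_dataset('RealTimeData/arxiv_alltime', '2023-07', split='train')

print(bbc_latest)
print(arxiv_month)
```

This mirrors the examples embedded in the repo cards above and is intended only as the reading-side counterpart to the `push_to_hub` calls in the crawlers.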