├── figs
│   ├── logo.png
│   ├── perplexity.py
│   ├── single.py
│   ├── polar.py
│   ├── winrate.py
│   ├── compare_strings.json
│   └── compare.py
├── requirements.txt
├── .gitignore
├── data
│   ├── monthly_updater
│   │   ├── readme.md
│   │   ├── monthly_wikitext.py
│   │   ├── monthly_image.py
│   │   ├── monthly_math.py
│   │   ├── monthly_code.py
│   │   └── monthly_arxiv.py
│   ├── code_repos.txt
│   ├── push_wiki_alltime.py
│   ├── push_github_dataset.py
│   ├── push_arxiv_dataset.py
│   ├── collect_bbc_months.py
│   ├── push_math_dataset.py
│   ├── doc_info.py
│   ├── analyse_news.py
│   ├── analyse_wikitext.py
│   ├── wiki_dataset.py
│   ├── wikitext_alltime.py
│   ├── bbc_alltime.py
│   ├── maintain_wikitext_latest.py
│   ├── audio_dataset.py
│   ├── bbc_news_image.py
│   ├── math_dataset.py
│   ├── github_dataset.py
│   ├── reddit_crawler.py
│   ├── squad_wiki_title.text
│   ├── arxiv_dataset.py
│   └── wikipedia.py
├── .github
│   └── workflows
│       ├── weekly_downloader.yml
│       └── monthly_updater.yml
├── readme.md
├── push_to_hf_hub.py
├── github_downloader.py
├── wikitext_downloader.py
├── eval
│   └── contamination.py
├── arxiv_downloader.py
└── bbc_downloader.py
/figs/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/liyucheng09/LatestEval/HEAD/figs/logo.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | arxiv
2 | pylatexenc
3 | datasets
4 | bs4
5 | goose3
6 | configobj
7 | mwparserfromhell
8 | GitPython
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | *.html
3 | eval/*.txt
4 | eval/saves/*.txt
5 | __pycache__/
6 | .vscode/
7 | data/*.json
8 | *.log
9 | *.out
10 | *.error
11 |
12 | wmt22-zhen/
13 | arxiv/
14 | bbc/
15 | github/
--------------------------------------------------------------------------------
/data/monthly_updater/readme.md:
--------------------------------------------------------------------------------
1 | # Monthly data collection - where the RealTimeData program is hosted
2 |
3 | ## sources available
4 |
5 | - arxiv
6 | - bbc_news
7 | - code
8 | - bbc_image
9 | - math
10 | - wikitext
11 |
12 | ## Check all data dumps from 2017 to the present
13 |
14 | Please find the RealTimeData program here: [RealTimeData](https://huggingface.co/RealTimeData).
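
For example, a specific month's Wikipedia dump can be loaded directly with `datasets` (a minimal sketch; the "YYYY-MM" config naming follows the dataset scripts under `data/`):

```python
from datasets import load_dataset

# Load the Wikipedia articles collected in November 2023
ds = load_dataset("RealTimeData/wikitext_alltime", "2023-11", split="train")
print(ds)
```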
15 |
16 | ## Crawl data on your own
17 |
18 | ```bash
19 | python monthly_arxiv.py
20 | ```
21 |
22 | This will crawl this month's arXiv data (from the 1st to the current date) and push it to the RealTimeData repos (if you have push permission).
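
To check which monthly snapshots already exist before crawling, you can list the configs on the Hub (a minimal sketch mirroring the commented-out check in `data/push_github_dataset.py`, shown here for the arXiv repo):

```python
import datasets

# List the monthly configs already pushed to the Hub (one per "YYYY-MM")
existing = datasets.get_dataset_config_names("RealTimeData/arxiv_alltime")
print(existing[-5:])
```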
23 |
24 | ## Ask for more data sources / contribute new data
25 | 
26 | This program welcomes all contributions. Please open an issue or a pull request if you have any suggestions or want to contribute a new data source.
--------------------------------------------------------------------------------
/data/code_repos.txt:
--------------------------------------------------------------------------------
1 | Stirling-Tools/Stirling-PDF
2 | microsoft/PowerToys
3 | veler/DevToys
4 | NationalSecurityAgency/ghidra
5 | Kurento/kurento-media-server
6 | silverwind/droppy
7 | llvm-mirror/clang
8 | facebookarchive/beringei
9 | shadowsocks/shadowsocks-qt5
10 | go-ego/riot
11 | flynn/flynn
12 | lipangit/JiaoZiVideoPlayer
13 | keras-team/keras
14 | aseprite/aseprite
15 | godotengine/godot
16 | lua/lua
17 | musescore/MuseScore
18 | apache/spark
19 | apache/hadoop
20 | scikit-learn/scikit-learn
21 | Leaflet/Leaflet
22 | overleaf/overleaf
23 | pytorch/pytorch
24 | huggingface/transformers
25 | animate-css/animate.css
26 | psf/requests
27 | pandas-dev/pandas
28 | django/django
29 | numpy/numpy
30 | facebook/react
31 | vuejs/core
32 | vuejs/vue
33 | android/architecture-samples
34 | sqlite/sqlite
35 | elastic/elasticsearch
36 | openssl/openssl
37 | gohugoio/hugo
38 | laravel/laravel
39 | WordPress/WordPress
40 | Unity-Technologies/ml-agents
41 | opencv/opencv
--------------------------------------------------------------------------------
/data/push_wiki_alltime.py:
--------------------------------------------------------------------------------
1 | import datasets
2 | from glob import glob
3 | import os
4 | import json
5 |
6 | if __name__ == '__main__':
7 | files = glob('/vol/research/lyc/wikitext_alltime/wiki/*.json')
8 | hf_token = os.environ['HF_TOKEN']
9 |
10 | for file in files:
11 |
12 | all_articles = []
13 |
14 |         time = os.path.splitext(os.path.basename(file))[0]  # drop the .json extension (avoids the str.strip() footgun)
15 | year = int(time.split('-')[0])
16 | month = int(time.split('-')[1])
17 |
18 | time_stamp = f'{year}-{month:02d}'
19 | if time_stamp not in ['2024-01', '2024-02']:
20 | continue
21 | print(f"Processing {time_stamp}")
22 |
23 | with open(file) as f:
24 | data = json.load(f)
25 |
26 | for title, article in data.items():
27 | article['time'] = time_stamp
28 | all_articles.append(article)
29 |
30 | ds = datasets.Dataset.from_list(all_articles)
31 | ds.push_to_hub(f"RealTimeData/wikitext_alltime", config_name=time_stamp, token=hf_token)
32 |
--------------------------------------------------------------------------------
/data/push_github_dataset.py:
--------------------------------------------------------------------------------
1 | from glob import glob
2 | import datasets
3 | import os
4 | import json
5 |
6 | if __name__ == '__main__':
7 | hf_token = os.environ['HF_TOKEN']
8 | all_months = [f'{year}-{month:02}' for year in range(2017, 2024) for month in range(1, 13)]
9 | all_months += [f'2024-{month:02}' for month in range(1,3)]
10 |
11 | # try:
12 | # exists_config = datasets.get_dataset_config_names('RealTimeData/code_alltime')
13 | # except datasets.exceptions.DatasetNotFoundError:
14 | # exists_config = []
15 | # pass
16 |
17 | for month in all_months:
18 | # if month in exists_config:
19 | # continue
20 | code_paths = glob(f'/vol/research/lyc/github_dataset/{month}/*/*.json')
21 | all_codes = []
22 | for code in code_paths:
23 | with open(code, 'r') as f:
24 | all_codes.append(json.load(f))
25 | ds = datasets.Dataset.from_list(all_codes)
26 | print('='*20)
27 | print(f'Finished {month}')
28 | print(ds)
29 | ds.push_to_hub(f'RealTimeData/code_alltime', config_name = month, token=hf_token)
30 | print(f'Pushed {month} to hub')
31 |
--------------------------------------------------------------------------------
/data/push_arxiv_dataset.py:
--------------------------------------------------------------------------------
1 | from glob import glob
2 | import datasets
3 | import os
4 | import json
5 |
6 | if __name__ == '__main__':
7 | hf_token = os.environ['HF_TOKEN']
8 | all_months = [f'{year}-{month:02}' for year in range(2017, 2024) for month in range(1, 13)]
9 |
10 | # try:
11 | # exists_config = datasets.get_dataset_config_names('RealTimeData/arxiv_alltime')
12 | # except datasets.exceptions.DatasetNotFoundError:
13 | # exists_config = []
14 | # pass
15 |
16 | # all months before 2021-02 (included) are already pushed, so remove these months from all_months
17 | all_months = all_months[all_months.index('2021-03'):]
18 |
19 | for month in all_months:
20 | # if month in exists_config:
21 | # continue
22 | paper_paths = glob(f'/vol/research/lyc/arxiv_alltime/{month}/*.json')
23 | all_papers = []
24 | for paper in paper_paths:
25 | with open(paper, 'r') as f:
26 | all_papers.append(json.load(f))
27 | ds = datasets.Dataset.from_list(all_papers)
28 | print('='*20)
29 | print(f'Finished {month}')
30 | print(ds)
31 | ds.push_to_hub(f'RealTimeData/arxiv_alltime', config_name = month, token=hf_token)
32 | print(f'Pushed {month} to hub')
33 |
--------------------------------------------------------------------------------
/data/collect_bbc_months.py:
--------------------------------------------------------------------------------
1 | from glob import glob
2 | import json
3 |
4 | if __name__ == '__main__':
5 | # /vol/research/lyc/bbc/2023/0/articles.1 indicates day 1, month 0, year 2023
6 | docs = glob('/vol/research/lyc/bbc/*/*/articles.*')
7 |
8 | # now group by month
9 | times = {}
10 | for doc in docs:
11 | year = doc.split('/')[-3]
12 | month = doc.split('/')[-2]
13 | month = int(month)%12 + 1
14 | time = f'{year}-{month}'
15 | if time not in times:
16 | times[time] = []
17 |
18 | with open(doc, 'r') as f:
19 | articles = json.load(f)['articles']
20 | times[time].extend(articles)
21 |
22 | # now save
23 | # each month should save as a json dict
24 | # target path /vol/research/lyc/bbc/bbc_alltime/articles/2023-{month}.json
25 | for time in times:
26 | articles = times[time]
27 | month = time.split('-')[1]
28 | year = time.split('-')[0]
29 | # now turn list of dicts to dict of lists
30 | articles = { key: [article[key] for article in articles] for key in articles[0] }
31 | with open(f'/vol/research/lyc/bbc/bbc_alltime/articles/{year}-{month}.json', 'w') as f:
32 | json.dump(articles, f, ensure_ascii=False)
33 | print(f'Finished {year} {month}')
--------------------------------------------------------------------------------
/data/push_math_dataset.py:
--------------------------------------------------------------------------------
1 | import datasets
2 | import json
3 | from glob import glob
4 |
5 | if __name__ == '__main__':
6 |
7 | files = glob('/vol/research/lyc/math/*.json')
8 | for file in files:
9 | with open(file, 'r') as f:
10 | data = json.load(f)
11 |
12 | time_stamp = file.split('/')[-1].split('.')[0]
13 | if time_stamp not in ['2024-01', '2024-02']:
14 | continue
15 |
16 | all_instances = []
17 | for qa in data.values():
18 | instance = {}
19 | instance['question'] = qa['title']
20 | instance['question_id'] = qa['question_id']
21 | instance['score'] = qa['score']
22 | instance['link'] = qa['link']
23 | instance['body'] = qa['body']
24 | if 'answers' not in qa:
25 | continue
26 | instance['answers'] = [{'text': a['body'], 'score': a['score'], 'answer_id': a['answer_id']} for a in qa['answers']]
27 |
28 | verbolised = f"Question: {instance['question']}\n"
29 | for ans_index, ans in enumerate(instance['answers']):
30 | verbolised += f"Answer {ans_index + 1}: {ans['text']}\n"
31 | instance['verbolised'] = verbolised
32 |
33 | all_instances.append(instance)
34 |
35 | dataset = datasets.Dataset.from_list(all_instances)
36 | print(dataset)
37 |
38 | dataset.push_to_hub('RealTimeData/math_alltime', time_stamp)
39 | print(f"Pushed {time_stamp} to hub")
--------------------------------------------------------------------------------
/data/doc_info.py:
--------------------------------------------------------------------------------
1 | import docx
2 | import re
3 | from wikipedia import gpt3_self_info  # used by the __main__ block below; assumed to live in the local wikipedia.py
4 | import sys
5 |
6 | def getText(filename):
7 | doc = docx.Document(filename)
8 | fullText = []
9 | for para in doc.paragraphs:
10 | fullText.append(para.text)
11 | return '\n'.join(fullText)
12 |
13 | def beautify_text(text, num_words = 1000):
14 | text = re.sub(r'\n+', '\n', text)
15 | text = re.sub(r'\s+', ' ', text)
16 |
17 | # use first 1000 words
18 | text = ' '.join(text.split(' ')[:num_words])
19 | return text
20 |
21 | def verbalise_docs(path = '/user/HS502/yl02706/LatestEval/data/mmlu', num_words = 1000):
22 | docs = ['q17-1.docx', 'q18-1.docx', 'q19-1.docx', 'q20-1.docx', 'q22-1.docx', 'q23-1.docx']
23 | docs = [ path + '/' + doc for doc in docs ]
24 | doc_text = [ getText(doc) for doc in docs ]
25 |
26 | doc_text = [ beautify_text(doc, num_words=num_words) for doc in doc_text ]
27 |
28 | return {
29 | doc: [doc_string] for doc, doc_string in zip(docs, doc_text)
30 | }
31 |
32 | if __name__ == '__main__':
33 | docs = ['data/q17-1.docx', 'data/q18-1.docx', 'data/q19-1.docx', 'data/q20-1.docx', 'data/q22-1.docx', 'data/q23-1.docx']
34 | doc_text = [ getText(doc) for doc in docs ]
35 |
36 | doc_text = [ beautify_text(doc) for doc in doc_text ]
37 |
38 | for doc, doc_string in zip(docs, doc_text):
39 | print('----------------------')
40 | print(doc)
41 |
42 | _, info = gpt3_self_info(doc_string)
43 | print(sum(info)/len(info))
--------------------------------------------------------------------------------
/figs/perplexity.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import matplotlib.pyplot as plt
4 |
5 | # Data
6 | df = pd.read_csv('figs/perplexity.tsv', sep='\t').set_index('Model')
7 | df.drop('LatestEval', axis = 1, inplace=True)
8 | df.drop(['opt-350m', 'opt-1.6b'], axis=0, inplace=True)
9 |
10 | data_dict = df.to_dict()
11 |
12 | metrics_data = {
13 | 'QuAC': ('s', 'violet'),
14 | 'BoolQ': ('+', 'violet'),
15 | 'SQuAD': ('x', 'violet'),
16 | 'Wikitext': ('D', 'navy'),
17 | 'NewWiki': ('^', 'navy'),
18 | # 'LatestEval': ('o', 'gold')
19 | }
20 |
21 | fig, ax = plt.subplots(figsize=(8, 2.8), dpi=150)
22 |
23 | # Create a horizontal scatter plot for each metric
24 | for benchmark, numbers in data_dict.items():
25 | marker_style, color = metrics_data[benchmark]
26 | models, perplexities = list(numbers.keys()), list(numbers.values())
27 | # perplexities = np.exp(perplexities) # assuming metrics are in log scale
28 | plt.scatter(perplexities, models, label=benchmark, s=20, marker=marker_style, color=color)
29 |
30 | # Adjust plot
31 | plt.ylabel('Models', fontweight='bold')
32 | plt.xlabel('Perplexity', fontweight='bold')
33 | plt.legend( loc='upper right', bbox_to_anchor=(1.05, 1.0), ncol=1, fontsize=8)
34 | plt.grid(True, which='both', linestyle='--', linewidth=0.5)
35 | plt.xlim(left = 1.3) # Start the x-axis slightly below the smallest perplexity for better visualization
36 |
37 | # plt.gca().xaxis.tick_top()
38 | # plt.gca().xaxis.set_label_position('top')
39 |
40 | ax.spines['top'].set_visible(False)
41 | ax.spines['right'].set_visible(False)
42 |
43 | plt.tight_layout()
44 |
45 | plt.show()
--------------------------------------------------------------------------------
/figs/single.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import matplotlib.pyplot as plt
4 |
5 | # Data
6 | df = pd.read_csv('figs/perplexity.tsv', sep='\t').set_index('Model')
7 | df.drop('LatestEval', axis = 1, inplace=True)
8 | df.drop(['opt-350m', 'opt-1.6b', 'gpt-3', 'llama-7b', 'llama-30b'], axis=0, inplace=True)
9 |
10 | data_dict = df.to_dict()
11 |
12 | metrics_data = {
13 | 'QuAC': ('s', 'violet'),
14 | 'BoolQ': ('+', 'violet'),
15 | 'SQuAD': ('x', 'violet'),
16 | 'memorised': ('D', 'navy'),
17 | 'clean': ('^', 'navy'),
18 | # 'LatestEval': ('o', 'gold')
19 | }
20 |
21 | fig, ax = plt.subplots(figsize=(4, 1), dpi=200)
22 |
23 | # Create a horizontal scatter plot for each metric
24 | for benchmark, numbers in data_dict.items():
25 | marker_style, color = metrics_data[benchmark]
26 | models, perplexities = list(numbers.keys()), list(numbers.values())
27 | # perplexities = np.exp(perplexities) # assuming metrics are in log scale
28 | plt.scatter(perplexities, ['perplexity'], label=benchmark, s=20, marker=marker_style, color=color)
29 | ax.annotate(benchmark, xy=(perplexities[-1], 'perplexity'), xytext=(0, 2), textcoords='offset points', va='bottom', fontsize=7, rotation=45)
30 |
31 | # Adjust plot
32 | plt.grid(True, linestyle='--', linewidth=0.5, axis='y')
33 | plt.xlim(left = 1.6) # Start the x-axis slightly below the smallest perplexity for better visualization
34 |
35 | ax.xaxis.set_visible(False)
36 |
37 | ax.spines['top'].set_visible(False)
38 | ax.spines['right'].set_visible(False)
39 | ax.spines['left'].set_visible(False)
40 | ax.spines['bottom'].set_visible(False)
41 |
42 | plt.tight_layout()
43 |
44 | plt.show()
--------------------------------------------------------------------------------
/figs/polar.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | import matplotlib.pyplot as plt
4 |
5 | df_score = pd.read_csv("figs/single_answer_score.tsv", sep='\t')
6 |
7 | # Calculate the number of categories
8 | categories = df_score['category'].unique()
9 | N = len(categories)
10 |
11 | # Calculate angle for each category
12 | theta = np.linspace(0.0, 2 * np.pi, N, endpoint=False)
13 | theta = np.append(theta, theta[0])
14 | # Set up the polar axis
15 | fig, ax = plt.subplots(subplot_kw={'projection': 'polar'}, figsize=(8, 6), dpi=150)
16 | ax.set_facecolor("#f5f5f5")
17 |
18 | markers = {
19 | 'gpt-3.5-turbo': 'o',
20 | 'gpt-4': '+',
21 | 'llama-13b': 'x',
22 | 'llama-30b': 's',
23 | 'vicuna-13b': 'd',
24 | }
25 |
26 | # Loop through each model and plot on the polar axis
27 | for model in df_score['model'].unique():
28 | values = df_score[df_score['model'] == model]['score'].values
29 | # Ensure the plot is closed by repeating the first value
30 | values = np.append(values, values[0])
31 | ax.plot(theta, values, label=model, marker=markers[model], alpha=0.8, markersize=5)
32 |
33 | # Fill the area under the plot for better visualization (optional)
34 | # ax.fill(theta, values, 'b', alpha=0.1)
35 |
36 | # Set the y-ticks (radii) and x-ticks (categories)
37 | ax.set_xticks(theta[:-1])
38 | ax.set_xticklabels(categories, fontsize=14) # Label x-ticks with categories
39 |
40 | ax.set_yticks([0, 2, 4, 6, 8, 10])
41 |
42 | # Customize the grid and title
43 | ax.grid(True)
44 |
45 | # Display a legend
46 | ax.legend(loc='upper right', bbox_to_anchor=(1.32, 1.1), fontsize=14)
47 |
48 | # Save the figure
49 | fig.tight_layout()
50 | fig.savefig("fig.png", dpi=150)
51 |
52 | # Show the plot
53 | # plt.show()
54 |
--------------------------------------------------------------------------------
/data/analyse_news.py:
--------------------------------------------------------------------------------
1 | from difflib import SequenceMatcher
2 | import datasets
3 | import multiprocessing
4 |
5 | def compare_texts(text1, text2):
6 | # Split the texts into words
7 | words1 = text1.split()
8 | words2 = text2.split()
9 |
10 | # Create a SequenceMatcher to compare the two word lists
11 | matcher = SequenceMatcher(None, words1, words2)
12 |
13 | # Calculate the similarity ratio
14 | similarity = matcher.ratio()
15 |
16 | # Calculate the difference ratio
17 | difference = 1 - similarity
18 |
19 | return difference
20 |
21 | def main(month, first_month_articles):
22 | ds = datasets.load_dataset('RealTimeData/bbc_news_alltime', month, split='train')
23 | # compare to first month
24 | content = '\n'.join(['\n'.join(article.splitlines()[:10]) for article in ds['content']])
25 | difference = compare_texts(content, first_month_articles)
26 |
27 | print(f"Finished {month}, average difference: {difference}")
28 | return (month, difference)
29 |
30 | if __name__ == '__main__':
31 |
32 | months = [f'{year}-{month:02d}' for year in range(2017, 2024) for month in range(1, 13) if not (year == 2023 and month == 12)]
33 | first_month = datasets.load_dataset('RealTimeData/bbc_news_alltime', months[0], split='train')
34 | first_month_articles = '\n'.join(['\n'.join(article.splitlines()[:10]) for article in first_month['content']])
35 | diffs = {}
36 |
37 | months = months[1:]
38 |     # main takes two arguments: month and first_month_articles
39 |     # run with a pool of 8 worker processes
40 | with multiprocessing.Pool(8) as pool:
41 | for month, diff in pool.starmap(main, [(month, first_month_articles) for month in months]):
42 | diffs[month] = diff
43 |
44 | print(diffs)
--------------------------------------------------------------------------------
/data/analyse_wikitext.py:
--------------------------------------------------------------------------------
1 | from difflib import SequenceMatcher
2 | import datasets
3 | import multiprocessing
4 |
5 | def compare_texts(text1, text2):
6 | # Split the texts into words
7 | words1 = text1.split()
8 | words2 = text2.split()
9 |
10 | # Create a SequenceMatcher to compare the two word lists
11 | matcher = SequenceMatcher(None, words1, words2)
12 |
13 | # Calculate the similarity ratio
14 | similarity = matcher.ratio()
15 |
16 | # Calculate the difference ratio
17 | difference = 1 - similarity
18 |
19 | return difference
20 |
21 | def main(month, first_month_articles):
22 | ds = datasets.load_dataset('RealTimeData/wikitext_alltime', month, split='train')
23 | # compare to first month
24 | diffs = []
25 | for article in ds:
26 | title = article['title']
27 | text = article['text']
28 | if title not in first_month_articles:
29 | print(f"Article {title} not found in first month")
30 | continue
31 | first_month_text = first_month_articles[title]
32 | difference = compare_texts(text, first_month_text)
33 | diffs.append(difference)
34 |
35 | avg_diff = sum(diffs) / len(diffs)
36 | print(f"Finished {month}, average difference: {avg_diff}")
37 | return (month, avg_diff)
38 |
39 | if __name__ == '__main__':
40 |
41 | months = [f'{year}-{month:02d}' for year in range(2017, 2024) for month in range(1, 13) if not (year == 2023 and month == 12)]
42 | first_month = datasets.load_dataset('RealTimeData/wikitext_alltime', months[0], split='train')
43 | first_month_articles = {title: article for title, article in zip(first_month['title'], first_month['text'])}
44 | diffs = {}
45 |
46 | months = months[1:]
47 |     # main takes two arguments: month and first_month_articles
48 |     # run with a pool of 8 worker processes
49 | with multiprocessing.Pool(8) as pool:
50 | for month, diff in pool.starmap(main, [(month, first_month_articles) for month in months]):
51 | diffs[month] = diff
52 |
53 | print(diffs)
--------------------------------------------------------------------------------
/figs/winrate.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import plotly.express as px
3 |
4 | def compute_pairwise_win_fraction(battles):
5 | # Times each model wins as Model A
6 | a_win_ptbl = pd.pivot_table(
7 | battles[battles['winner'] == "model_a"],
8 | index="model_a", columns="model_b", aggfunc="size", fill_value=0)
9 |
10 | # Table counting times each model wins as Model B
11 | b_win_ptbl = pd.pivot_table(
12 | battles[battles['winner'] == "model_b"],
13 | index="model_a", columns="model_b", aggfunc="size", fill_value=0)
14 |
15 | # Table counting number of A-B pairs
16 | num_battles_ptbl = pd.pivot_table(battles,
17 | index="model_a", columns="model_b", aggfunc="size", fill_value=0)
18 |
19 | # Computing the proportion of wins for each model as A and as B
20 | # against all other models
21 | row_beats_col_freq = (
22 | (a_win_ptbl + b_win_ptbl.T) /
23 | (num_battles_ptbl + num_battles_ptbl.T)
24 | )
25 |
26 |     # Arrange ordering according to proportion of wins
27 | prop_wins = row_beats_col_freq.mean(axis=1).sort_values(ascending=False)
28 | model_names = list(prop_wins.keys())
29 | row_beats_col = row_beats_col_freq.loc[model_names, model_names]
30 | return row_beats_col
31 |
32 | def visualize_pairwise_win_fraction(battles, title):
33 | row_beats_col = compute_pairwise_win_fraction(battles)
34 | fig = px.imshow(row_beats_col, color_continuous_scale='RdBu',
35 | text_auto=".2f", title=title)
36 | fig.update_layout(
37 | # xaxis_title=" Model B: Loser",
38 | # yaxis_title="Model A: Winner",
39 | xaxis_title=None,
40 | yaxis_title=None,
41 | xaxis_side="top", height=700, width=600,
42 | title_y=0.07, title_x=0.5)
43 |     fig.update_traces(hovertemplate=
44 |                   "Model A: %{y}<br>Model B: %{x}<br>Fraction of A Wins: %{z}")
45 |
46 | return fig
47 |
48 | df = pd.read_csv("figs/winrate2.tsv", sep='\t')
49 | df = df[df['winner'].isin(['model_a', 'model_b'])]
50 | df = df[df['model_a']!=df['model_b']]
51 |
52 | fig = visualize_pairwise_win_fraction(df,
53 | title = "Pair-wise Win Rate")
54 |
55 | fig.show()
56 |
57 | fig.update_layout(
58 | font=dict(
59 | size=18,
60 | ),
61 | )
62 | fig.write_image("fig.png", width=700, height=650, scale=2)
--------------------------------------------------------------------------------
/data/wiki_dataset.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import traceback
3 | import mwparserfromhell
4 | import datetime
5 | import os
6 |
7 | import sys
8 | import json
9 | import time
10 |
11 | from tqdm import tqdm
12 |
13 | WIKI_API_ENDPOINT = "https://en.wikipedia.org/w/api.php"
14 |
15 | def parse_to_plain_text(wikitext):
16 | parsed = mwparserfromhell.parse(wikitext)
17 | return parsed.strip_code()
18 |
19 | def fetch_content(title, date=None):
20 | params = {
21 | "action": "query",
22 | "format": "json",
23 | "titles": title,
24 | "prop": "revisions",
25 | "rvprop": "content",
26 | "rvlimit": "1",
27 | }
28 | if date: params["rvstart"] = date
29 | try:
30 | response = requests.get(WIKI_API_ENDPOINT, params=params)
31 | response.raise_for_status() # Will raise an error if the HTTP request returned an unsuccessful status code
32 | data = response.json()
33 | if 'error' in data:
34 | print(f"Error fetching content for {title}: {data['error']['info']}")
35 | return None
36 |
37 | page = next(iter(data['query']['pages'].values()))
38 | if 'revisions' not in page:
39 | print(f"No revisions found for {title}")
40 | return None
41 | content = page['revisions'][0]['*']
42 |
43 | # Check if the content is a redirect and skip if true
44 | if content.lower().startswith("#redirect"):
45 | print(f"{title} is a redirect page.")
46 | return None
47 | text = parse_to_plain_text(content)
48 | if len(text.split(' ')) < 300:
49 | print(f"{title} is less than 300 words.")
50 | return None
51 |
52 | return {
53 | "title": page['title'],
54 | "text": text,
55 | "pageid": page['pageid'],
56 | }, content
57 |
58 | except Exception as e:
59 | print(f"An error occurred while fetching content for {title}: {str(e)}")
60 | traceback.print_exc() # This will print the full traceback
61 |
62 | return None
63 |
64 | if __name__ == "__main__":
65 | year, month, save_path = sys.argv[1:]
66 | month = int(month)%12 + 1
67 |
68 | start_time = datetime.datetime(int(year), month, 1)
69 | end_time = start_time + datetime.timedelta(days=28)
70 |
71 | print(f'Fetching wiki articles from {start_time.isoformat()} to {end_time.isoformat()}')
72 |
73 | with open('/user/HS502/yl02706/LatestEval/data/squad_wiki_title.text') as f:
74 | titles = [line.strip() for line in f.readlines()]
75 | historical_contents = [fetch_content(title, end_time) for title in tqdm(titles)]
76 | historical_contents = [content[0] for content in historical_contents if content is not None]
77 | historical_to_save = {title: content for title, content in zip(titles, historical_contents)}
78 |
79 | save_file = os.path.join(save_path, f'{year}-{month}.json')
80 | with open(save_file, 'w') as f:
81 | json.dump(historical_to_save, f, ensure_ascii=False)
82 | print(f'Saved {len(historical_contents)} articles to {save_file}')
--------------------------------------------------------------------------------
/.github/workflows/weekly_downloader.yml:
--------------------------------------------------------------------------------
1 | name: Weekly Downloader
2 |
3 | on:
4 | schedule:
5 |     # This cron job initiates the action at 00:00 every Sunday
6 |     - cron: '0 0 * * 0'
7 |
8 | jobs:
9 | wiki_downloader:
10 | runs-on: ubuntu-latest
11 |
12 | # Define environment variables for all steps in this job
13 | env:
14 | HF_TOKEN: ${{ secrets.HF_TOKEN }}
15 | Github_Token: ${{ secrets.gh_token }}
16 |
17 | steps:
18 | - name: Checkout repository
19 | uses: actions/checkout@v2
20 |
21 | - name: Set up Python
22 | uses: actions/setup-python@v2
23 | with:
24 | python-version: '3.10' # Choose your desired Python version
25 |
26 | - name: Install dependencies
27 | run: |
28 | python -m pip install --upgrade pip
29 | pip install -r requirements.txt
30 |
31 | - name: Run script
32 | run: python wikitext_downloader.py
33 |
34 | arxiv_downloader:
35 | runs-on: ubuntu-latest
36 |
37 | # Define environment variables for all steps in this job
38 | env:
39 | HF_TOKEN: ${{ secrets.HF_TOKEN }}
40 | Github_Token: ${{ secrets.Github_Token }}
41 |
42 | steps:
43 | - name: Checkout repository
44 | uses: actions/checkout@v2
45 |
46 | - name: Set up Python
47 | uses: actions/setup-python@v2
48 | with:
49 | python-version: '3.10' # Choose your desired Python version
50 |
51 | - name: Install dependencies
52 | run: |
53 | python -m pip install --upgrade pip
54 | pip install -r requirements.txt
55 |
56 | - name: Run script
57 | run: python arxiv_downloader.py
58 |
59 | bbc_downloader:
60 | runs-on: ubuntu-latest
61 |
62 | # Define environment variables for all steps in this job
63 | env:
64 | HF_TOKEN: ${{ secrets.HF_TOKEN }}
65 | Github_Token: ${{ secrets.Github_Token }}
66 |
67 | steps:
68 | - name: Checkout repository
69 | uses: actions/checkout@v2
70 |
71 | - name: Set up Python
72 | uses: actions/setup-python@v2
73 | with:
74 | python-version: '3.10' # Choose your desired Python version
75 |
76 | - name: Install dependencies
77 | run: |
78 | python -m pip install --upgrade pip
79 | pip install -r requirements.txt
80 |
81 | - name: Run script
82 | run: python bbc_downloader.py
83 |
84 | github_downloader:
85 | runs-on: ubuntu-latest
86 |
87 | # Define environment variables for all steps in this job
88 | env:
89 | HF_TOKEN: ${{ secrets.HF_TOKEN }}
90 | Github_Token: ${{ secrets.Github_Token }}
91 |
92 | steps:
93 | - name: Checkout repository
94 | uses: actions/checkout@v2
95 |
96 | - name: Set up Python
97 | uses: actions/setup-python@v2
98 | with:
99 | python-version: '3.10' # Choose your desired Python version
100 |
101 | - name: Install dependencies
102 | run: |
103 | python -m pip install --upgrade pip
104 | pip install -r requirements.txt
105 |
106 | - name: Run script
107 | run: python github_downloader.py
108 |
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | # "Uncheatable" LLMs Evaluation - LatestEval
6 |
7 | Humans receive new test questions every exam, but LLMs? They have been evaluated on the same benchmarks for far too long. Why not assess LLMs with fresh tests, just as we test our students? In this project, we introduce LatestEval, which automatically constructs language model benchmarks from the latest materials (e.g., arXiv, BBC, Wikipedia) to prevent "cheating" and data contamination.
8 |
9 | **News!!**
10 |
11 | - **15 Dec, 2023** - This project was accepted to the main track of **AAAI 2024** :partying_face:! Check out the paper here: :point_right: [Dynamic Test Construction with Latest Materials](https://arxiv.org/abs/2312.12343).
12 |
13 | # Key Features
14 |
15 | 1. We maintain a QA benchmark that updates every half month using the latest online resources (created in the past half month). This approach aims to avoid 1) LLMs being trained on the test set (cheating); and 2) the unintentional inclusion of test questions in the training dataset (data contamination).
16 | 2. We analyzed real Human-AI conversations to ensure the automated benchmark aligns well with real-life applications (see [paper](https://arxiv.org/abs/2312.12343) for more detail).
17 |
18 |
19 | # The Benchmark
20 |
21 | Access the latest benchmark directly at the [Huggingface Hub](https://huggingface.co/LatestEval)!
22 |
23 | - Latest benchmark of GitHub: [HF Hub](https://huggingface.co/datasets/LatestEval/github-latest)
24 | - Latest benchmark of arXiv: [HF Hub](https://huggingface.co/datasets/LatestEval/arxiv-latest)
25 | - Latest benchmark of BBC: [HF Hub](https://huggingface.co/datasets/LatestEval/bbc-latest)
26 | - The Full benchmark with all sources: [HF Hub](https://huggingface.co/datasets/LatestEval/full-latest)
27 |
28 | The benchmarks are built from the latest materials; you can find these raw materials/documents at the [Huggingface Hub](https://huggingface.co/RealTimeData).
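
The benchmarks can be loaded with `datasets`. A minimal sketch (the field names follow `push_to_hf_hub.py` in this repo):

```python
from datasets import load_dataset

# Load the combined LatestEval benchmark; each row has a passage with a placeholder,
# a query about the removed information, and the reference answer
ds = load_dataset("LatestEval/full-latest", split="train")
print(ds[0]["query"], "->", ds[0]["answer"])
```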
29 |
30 | # Evaluate your LLM on LatestEval
31 |
32 | We will add LatestEval to [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) and [OpenCompass](https://github.com/open-compass/opencompass). Stay tuned.
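
Until that integration lands, here is a minimal sketch of running a model over the benchmark. The model name, prompt format, and decoding settings are illustrative assumptions, not the official evaluation protocol:

```python
from datasets import load_dataset
from transformers import pipeline

# Load the combined benchmark and a small model purely for illustration
ds = load_dataset("LatestEval/full-latest", split="train")
generator = pipeline("text-generation", model="gpt2")

for sample in ds.select(range(3)):
    # Crude word-level truncation so the prompt fits gpt2's small context window
    passage = " ".join(sample["passage"].split()[:300])
    prompt = f"{passage}\n\nQuestion: {sample['query']}\nAnswer:"
    prediction = generator(prompt, max_new_tokens=32, return_full_text=False)[0]["generated_text"]
    print(sample["query"], "->", prediction.strip(), "| reference:", sample["answer"])
```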
33 |
34 | # Create benchmarks with your own data
35 |
36 | 1. Put your documents as `.txt` files under `./`.
37 | 2. Set your OpenAI key:
38 |
39 | ```
40 | export OPENAI_API_KEY=
41 | ```
42 |
43 | 3. Simply run:
44 |
45 | ```
46 | python data_processor.py --source customized --file_path --num_docs 100
47 | ```
48 |
49 | If you want to reproduce LatestEval on arXiv, BBC, or GitHub:
50 |
51 | ```
52 | python data_processor.py --source arxiv --num_docs 100
53 | ```
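
To inspect the generated QA pairs, you can read the output JSON directly. This is a sketch assuming the `benchmarks/<year-week>/qa_pairs_<source>_<year-week>.json` layout that `push_to_hf_hub.py` consumes; adjust the path to your run:

```python
import json
from glob import glob

# Print a few generated query/answer pairs from the most recent benchmark dump
for path in sorted(glob("benchmarks/*/qa_pairs_*.json"))[-1:]:
    with open(path) as f:
        docs = json.load(f)
    for doc in docs[:2]:
        responses = doc["response"] if isinstance(doc["response"], list) else []
        for qa in responses:
            print(qa["query"], "->", qa["key_information"])
```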
54 |
55 | # Issues
56 | 
57 | Open an issue if you run into any problems or want to discuss anything.
58 |
59 | # Citation
60 |
61 | If you find this project useful, please consider citing it:
62 |
63 | ```
64 | @misc{li2023avoiding,
65 | title={Avoiding Data Contamination in Language Model Evaluation: Dynamic Test Construction with Latest Materials},
66 | author={Yucheng Li and Frank Guerin and Chenghua Lin},
67 | year={2023},
68 | eprint={2312.12343},
69 | archivePrefix={arXiv},
70 | primaryClass={cs.CL}
71 | }
72 | ```
--------------------------------------------------------------------------------
/data/monthly_updater/monthly_wikitext.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import traceback
3 | import mwparserfromhell
4 | import datetime
5 | import os
6 | import datasets
7 |
8 | import sys
9 | import json
10 | import time
11 |
12 | from tqdm import tqdm
13 |
14 | WIKI_API_ENDPOINT = "https://en.wikipedia.org/w/api.php"
15 |
16 | def parse_to_plain_text(wikitext):
17 | parsed = mwparserfromhell.parse(wikitext)
18 | return parsed.strip_code()
19 |
20 | def fetch_content(title, date=None):
21 | params = {
22 | "action": "query",
23 | "format": "json",
24 | "titles": title,
25 | "prop": "revisions",
26 | "rvprop": "content",
27 | "rvlimit": "1",
28 | }
29 | if date: params["rvstart"] = date
30 | try:
31 | response = requests.get(WIKI_API_ENDPOINT, params=params)
32 | response.raise_for_status() # Will raise an error if the HTTP request returned an unsuccessful status code
33 | data = response.json()
34 | if 'error' in data:
35 | print(f"Error fetching content for {title}: {data['error']['info']}")
36 | return None
37 |
38 | page = next(iter(data['query']['pages'].values()))
39 | if 'revisions' not in page:
40 | print(f"No revisions found for {title}")
41 | return None
42 | content = page['revisions'][0]['*']
43 |
44 | # Check if the content is a redirect and skip if true
45 | if content.lower().startswith("#redirect"):
46 | print(f"{title} is a redirect page.")
47 | return None
48 | text = parse_to_plain_text(content)
49 | if len(text.split(' ')) < 300:
50 | print(f"{title} is less than 300 words.")
51 | return None
52 |
53 | return {
54 | "title": page['title'],
55 | "text": text,
56 | "pageid": page['pageid'],
57 | }
58 |
59 | except Exception as e:
60 | print(f"An error occurred while fetching content for {title}: {str(e)}")
61 | traceback.print_exc() # This will print the full traceback
62 |
63 | return None
64 |
65 | if __name__ == "__main__":
66 | today = datetime.datetime.today()
67 | year = today.year
68 | month = today.month
69 |
70 | hf_token = os.environ['HF_TOKEN']
71 |
72 | start_time = datetime.datetime(year, month, 1)
73 | end_time = today
74 |
75 | print(f'Fetching wiki articles from {start_time.isoformat()} to {end_time.isoformat()}')
76 |
77 | with open('./data/squad_wiki_title.text') as f:
78 | titles = [line.strip() for line in f.readlines()]
79 | historical_contents = [fetch_content(title, end_time) for title in tqdm(titles)]
80 | historical_contents = [content for content in historical_contents if content is not None]
81 | historical_to_save = {title: content for title, content in zip(titles, historical_contents)}
82 |
83 | save_file = f'{year}-{month}.json'
84 | with open(save_file, 'w') as f:
85 | json.dump(historical_to_save, f, ensure_ascii=False)
86 | print(f'Saved {len(historical_contents)} articles to {save_file}')
87 |
88 | from huggingface_hub import hf_hub_download, RepoCard, upload_file
89 |
90 | with open(save_file) as f:
91 | data = json.load(f)
92 |
93 | all_articles = []
94 | for title, article in data.items():
95 | article['time'] = f'{year}-{month:02d}'
96 | all_articles.append(article)
97 |
98 | ds = datasets.Dataset.from_list(all_articles)
99 | ds.push_to_hub(f"RealTimeData/wikitext_alltime", config_name=f'{year}-{month:02d}', token=hf_token)
--------------------------------------------------------------------------------
/data/wikitext_alltime.py:
--------------------------------------------------------------------------------
1 | import datasets
2 | import json
3 |
4 | dl = datasets.DownloadManager()
5 | configs_file = dl.download('https://huggingface.co/datasets/RealTimeData/wikitext_alltime/raw/main/configs.txt')
6 |
7 | with open(configs_file, encoding="utf-8") as f:
8 | _TIMES = f.read().splitlines()
9 |
10 | _TIMES += ['all']
11 |
12 | _CITATION = """\
13 | @misc{li2023estimating,
14 | title={Estimating Contamination via Perplexity: Quantifying Memorisation in Language Model Evaluation},
15 | author={Yucheng Li},
16 | year={2023},
17 | eprint={2309.10677},
18 | archivePrefix={arXiv},
19 | primaryClass={cs.CL}
20 | }
21 | """
22 |
23 | _DESCRIPTION = """\
24 | This dataset contains Wikipedia articles for 419 selected pages every month from 2017-1 to the present. The articles are arranged by month. Access a specific month by using the format "YYYY-MM" as the config, e.g. load_dataset("RealTimeData/wikitext_alltime", "2021-1").
25 | """
26 |
27 | _HOMEPAGE = "https://github.com/liyucheng09/Contamination_Detector"
28 |
29 | class Wikitext_alltimes(datasets.GeneratorBasedBuilder):
30 |
31 | BUILDER_CONFIGS = [
32 | datasets.BuilderConfig(
33 |             name=time, version=datasets.Version("1.0.0"), description=f"419 selected Wikipedia articles edited in the period of {time}"
34 | )
35 | for time in _TIMES
36 | ]
37 |
38 | def _info(self):
39 | features = datasets.Features(
40 | {
41 | "title": datasets.Value("string"),
42 | "pageid": datasets.Value("int64"),
43 | "text": datasets.Value("string"),
44 | "time": datasets.Value("string"),
45 | }
46 | )
47 | return datasets.DatasetInfo(
48 | description=_DESCRIPTION,
49 | features=features,
50 | homepage=_HOMEPAGE,
51 | citation=_CITATION,
52 | )
53 |
54 | def _split_generators(self, dl_manager):
55 | """Returns SplitGenerators."""
56 | if self.config.name == "all":
57 |             times = _TIMES[:-1]  # exclude the trailing "all" pseudo-config
58 |             files = dl_manager.download([f"wiki/{time}.json" for time in times ])
59 | return [
60 | datasets.SplitGenerator(
61 | name=datasets.Split.TRAIN,
62 | gen_kwargs={"files": files},
63 | )
64 | ]
65 | else:
66 | time = self.config.name
67 | _URL = f"wiki/{time}.json"
68 | file = dl_manager.download(_URL)
69 | return [
70 | datasets.SplitGenerator(
71 | name=datasets.Split.TRAIN,
72 | gen_kwargs={"files": file},
73 | )
74 | ]
75 |
76 | def _generate_examples(self, files):
77 | """Yields examples."""
78 | if self.config.name == "all":
79 | assert isinstance(files, list)
80 | for file in files:
81 | time = file.strip('.json')
82 | with open(file, encoding="utf-8") as f:
83 | data = json.load(f)
84 | for title, article in data.items():
85 | yield f'{time}-{title}', {
86 | "title": article['title'],
87 | "pageid": article['pageid'],
88 | "text": article['text'],
89 | "time": time,
90 | }
91 | else:
92 | assert isinstance(files, str)
93 | time = self.config.name
94 | with open(files, encoding="utf-8") as f:
95 | data = json.load(f)
96 | for title, article in data.items():
97 | yield f'{time}-{title}', {
98 | "title": article['title'],
99 | "pageid": article['pageid'],
100 | "text": article['text'],
101 | "time": time,
102 | }
--------------------------------------------------------------------------------
/push_to_hf_hub.py:
--------------------------------------------------------------------------------
1 | # Merge with RealTimeData/ and push to Huggingface Hub
2 |
3 | from glob import glob
4 | import datasets
5 | import json
6 | from huggingface_hub import RepoCard, create_branch, create_tag
7 | from data_processor import ArxivEval, BBCNewsEval, GithubEval
8 | import datetime
9 |
10 | if __name__ == "__main__":
11 | # Load the dataset
12 | # for example, benchmarks/latest/qa_pairs_arxiv_2023-46.json
13 |
14 | today = datetime.date.today()
15 |
16 | RepoCardText = """
17 | # LatestEval for {source}
18 |
19 | This benchmark was created at {year} week {week} with the latest data from {source}.
20 |
21 | check more details at our [github page](https://github.com/liyucheng09/LatestEval)."""
22 |
23 | source2ds = {}
24 | latest_ds = []
25 |
26 | for file in glob('benchmarks/2023-51/*.json'):
27 | with open(file, 'r') as f:
28 | data = json.load(f)
29 |
30 | if 'arxiv' in file:
31 | source = 'arxiv'
32 | docs = ArxivEval('RealTimeData/arxiv_latest', num_docs='all').docs
33 | elif 'bbc' in file:
34 | source = 'bbc'
35 | docs = BBCNewsEval('RealTimeData/bbc_latest', num_docs='all').docs
36 | elif 'github' in file:
37 | source = 'github'
38 | docs = GithubEval('RealTimeData/github_latest', num_docs='all').docs
39 |
40 | source2ds[source] = data
41 |
42 | time_stamp = file.split('_')[-1].split('.')[0]
43 | year = time_stamp.split('-')[0]
44 | week = time_stamp.split('-')[1]
45 |
46 | test_samples = []
47 | for doc in data:
48 | doc_id = doc['id'][len(source)+1:]
49 | sents = None
50 | for d in docs:
51 | if d.entry_id == doc_id:
52 | sents = d.original_sentences
53 | assert sents is not None, f'{doc_id} not found in {source} data'
54 |
55 | if isinstance(doc['response'], str):
56 | try:
57 | doc['response'] = eval(doc['response'])
58 | except:
59 | print(doc['response'])
60 | continue
61 |
62 | for example in doc['response']:
63 | sent_index = example['sentence_index']
64 | passage = ''
65 | for sent_i, sent in enumerate(sents):
66 | if sent_i == sent_index:
67 | passage += example['place_holder'] + ' '
68 | else:
69 | passage += sent + ' '
70 | test_samples.append({
71 | 'source': source,
72 | 'doc_id': doc_id,
73 | 'passage': passage,
74 | 'query': example['query'],
75 | 'answer': example['key_information'],
76 | 'query_category': example['answer_type'],
77 | 'sent_index': sent_index
78 | })
79 |
80 | latest_ds.extend(test_samples)
81 |
82 | # dataset = datasets.Dataset.from_list(test_samples)
83 | # dataset.push_to_hub(f'LatestEval/{source}-latest', branch='main')
84 | # dataset.push_to_hub(f'LatestEval/{source}-{year}-week{week}')
85 |
86 | # card = RepoCard(RepoCardText.format(source=source, year=year, week=week))
87 | # card.push_to_hub(f'LatestEval/{source}-latest', repo_type='dataset')
88 | # card.push_to_hub(f'LatestEval/{source}-{year}-week{week}', repo_type='dataset')
89 |
90 | # all three sources together
91 | # flatten the data and add source column
92 |
93 | dataset = datasets.Dataset.from_list(latest_ds)
94 | dataset.push_to_hub(f'LatestEval/full-latest', branch='main')
95 | dataset.push_to_hub(f'LatestEval/full-{year}-week{week}')
96 |
97 | card = RepoCard(RepoCardText.format(source='all', year=year, week=week))
98 | card.push_to_hub(f'LatestEval/full-latest', repo_type='dataset')
99 | card.push_to_hub(f'LatestEval/full-{year}-week{week}', repo_type='dataset')
100 |
--------------------------------------------------------------------------------
/figs/compare_strings.json:
--------------------------------------------------------------------------------
1 | {
2 | "prompts": [
3 | "Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in",
4 | "Persian (/ˈpɜːrʒən, -ʃən/), also known by its endonym Farsi (فارسی fārsi (fɒːɾˈsiː) ( listen)), is one of the Western Iranian languages within the Indo-Iranian branch of the Indo-European language family. It is primarily",
5 | "The number of Malayalam speakers in Lakshadweep is 51,100, which is only 0.15% of the total number, but is as much as about 84% of the population of Lakshadweep. In all, Malayalis made up 3.22% of the total Indian population in 2001. Of the total 33,066,",
6 | "The British have a love-hate relationship with the NHS. According to researchers at the King's Fund, the public gave the NHS its worst rating since records began 40 years ago. Just 29% said they were satisfied with the NHS in 2022. And yet we still love it. A whopping 90% of"
7 | ],
8 | "predictions": [
9 | " the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles \"Crazy in Love\" and \"Baby Boy\".",
10 | " spoken in Iran, Afghanistan (officially known as Dari since 1958),[3] and Tajikistan (officially known as Tajiki since the Soviet era),[4] and some other regions which historically were Persianate societies and considered part of Greater Iran. It is written right to left in the Persian alphabet, a modified variant of the Arabic script.",
11 | " 392 Malayalam speakers in India in 2001, 33,015,420 spoke the standard dialects, 19,643 spoke the Yerava dialect and 31,329 spoke non-standard regional variations like Eranadan. As per the 1991 census data, 28.85% of all Malayalam speakers in India spoke a second language and 19.64% of the total knew three or more languages.",
12 | " of Britons believe that the NHS is a crucial institution that should be preserved. The juxtaposition between dissatisfaction with the current state and overall reverence for the institution speaks volumes about the complex relationship the British public has with their healthcare system."
13 | ],
14 | "references": [
15 | " the late 1990s as lead singer of the R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Her debut solo album, Dangerously in Love (2003), debuted at number one on the US Billboard 200 chart and earned her five Grammy Awards, solidifying her as a solo artist as well. Throughout her career, Beyoncé has sold over 100 million records worldwide as a solo artist and a further 60 million records with Destiny's Child, making her one of the best-selling music artists of all time. She has won 23 Grammy Awards and is the most",
16 | " spoken in Iran, Afghanistan (officially known as Dari since 1958), and Tajikistan (officially known as Tajiki since the Soviet era), and some other regions which historically were Persianate societies and considered part of Greater Iran. It is written in the Persian alphabet, a modified variant of the Arabic script, which itself evolved from the Aramaic alphabet.",
17 | " 392 Malayalam speakers in India in 2001, 33,015,420 spoke the standard dialects, 19,643 spoke the Yerava dialect and 31,329 spoke non-standard regional variations like Eranadan. As per the 1991 census data, 28.85% of all Malayalam speakers in India spoke a second language and 19.64% of the total knew three or more languages.",
18 | "of the public agrees the service should be free and available to everyone. But with more than seven million people on waiting lists, almost everyone knows someone who isn't getting the care they need. As the NHS approaches its 75th anniversary, politicians are falling over themselves to praise the service."
19 | ]
20 | }
--------------------------------------------------------------------------------
/.github/workflows/monthly_updater.yml:
--------------------------------------------------------------------------------
1 | name: Monthly Updater
2 |
3 | on:
4 | schedule:
5 | # This cron job initiates the action at 00:00 on the 28th of every month
6 | - cron: '0 0 28 * *'
7 |
8 | workflow_dispatch:
9 |
10 | jobs:
11 | wiki_downloader:
12 | runs-on: ubuntu-latest
13 |
14 | # Define environment variables for all steps in this job
15 | env:
16 | HF_TOKEN: ${{ secrets.HF_TOKEN }}
17 | Github_Token: ${{ secrets.gh_token }}
18 | Overflow_Token: ${{ secrets.overflow_token }}
19 |
20 | steps:
21 | - name: Checkout repository
22 | uses: actions/checkout@v2
23 |
24 | - name: Set up Python
25 | uses: actions/setup-python@v2
26 | with:
27 | python-version: '3.10' # Choose your desired Python version
28 |
29 | - name: Install dependencies
30 | run: |
31 | python -m pip install --upgrade pip
32 | pip install -r requirements.txt
33 |
34 | - name: Run script
35 | run: python data/monthly_updater/monthly_wikitext.py
36 |
37 | arxiv_downloader:
38 | runs-on: ubuntu-latest
39 |
40 | # Define environment variables for all steps in this job
41 | env:
42 | HF_TOKEN: ${{ secrets.HF_TOKEN }}
43 | Github_Token: ${{ secrets.gh_token }}
44 | Overflow_Token: ${{ secrets.overflow_token }}
45 |
46 | steps:
47 | - name: Checkout repository
48 | uses: actions/checkout@v2
49 |
50 | - name: Set up Python
51 | uses: actions/setup-python@v2
52 | with:
53 | python-version: '3.10' # Choose your desired Python version
54 |
55 | - name: Install dependencies
56 | run: |
57 | python -m pip install --upgrade pip
58 | pip install -r requirements.txt
59 |
60 | - name: Run script
61 | run: python data/monthly_updater/monthly_arxiv.py
62 |
63 | bbc_downloader:
64 | runs-on: ubuntu-latest
65 |
66 | # Define environment variables for all steps in this job
67 | env:
68 | HF_TOKEN: ${{ secrets.HF_TOKEN }}
69 | Github_Token: ${{ secrets.gh_token }}
70 | Overflow_Token: ${{ secrets.overflow_token }}
71 |
72 | steps:
73 | - name: Checkout repository
74 | uses: actions/checkout@v2
75 |
76 | - name: Set up Python
77 | uses: actions/setup-python@v2
78 | with:
79 | python-version: '3.10' # Choose your desired Python version
80 |
81 | - name: Install dependencies
82 | run: |
83 | python -m pip install --upgrade pip
84 | pip install -r requirements.txt
85 |
86 | - name: Run script
87 | run: python data/monthly_updater/monthly_bbc_news.py
88 |
89 | math_downloader:
90 | runs-on: ubuntu-latest
91 |
92 | # Define environment variables for all steps in this job
93 | env:
94 | HF_TOKEN: ${{ secrets.HF_TOKEN }}
95 | Github_Token: ${{ secrets.gh_token }}
96 | Overflow_API_KEY: ${{ secrets.overflow_token }}
97 |
98 | steps:
99 | - name: Checkout repository
100 | uses: actions/checkout@v2
101 |
102 | - name: Set up Python
103 | uses: actions/setup-python@v2
104 | with:
105 | python-version: '3.10' # Choose your desired Python version
106 |
107 | - name: Install dependencies
108 | run: |
109 | python -m pip install --upgrade pip
110 | pip install -r requirements.txt
111 |
112 | - name: Run script
113 | run: python data/monthly_updater/monthly_math.py
114 |
115 | code_downloader:
116 | runs-on: ubuntu-latest
117 |
118 | # Define environment variables for all steps in this job
119 | env:
120 | HF_TOKEN: ${{ secrets.HF_TOKEN }}
121 | Github_Token: ${{ secrets.gh_token }}
122 | Overflow_Token: ${{ secrets.overflow_token }}
123 |
124 | steps:
125 | - name: Checkout repository
126 | uses: actions/checkout@v2
127 |
128 | - name: Set up Python
129 | uses: actions/setup-python@v2
130 | with:
131 | python-version: '3.10' # Choose your desired Python version
132 |
133 | - name: Install dependencies
134 | run: |
135 | python -m pip install --upgrade pip
136 | pip install -r requirements.txt
137 |
138 | - name: Run script
139 | run: python data/monthly_updater/monthly_code.py
140 |
--------------------------------------------------------------------------------
/github_downloader.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import base64
3 | from datetime import datetime, timedelta
4 | import os
5 | import json
6 | from tqdm import tqdm
7 | from huggingface_hub import create_branch, create_tag, RepoCard
8 | import traceback
9 |
10 | github_token = os.environ['Github_Token']
11 | headers = {'Authorization': f'token {github_token}'}
12 |
13 | hf_token = os.environ['HF_TOKEN']
14 |
15 | today = datetime.now()
16 | start_date = today - timedelta(weeks=2)
17 | start_date_str = start_date.strftime("%Y-%m-%d")
18 |
19 | end_date = start_date + timedelta(days=7)
20 | end_date_str = end_date.strftime("%Y-%m-%d")
21 |
22 | out_path = f"dataset/github/{start_date_str}"
23 | if not os.path.exists(out_path):
24 | os.makedirs(out_path)
25 |
26 | def load_checkpoint():
27 | try:
28 | with open(f'{start_date_str}_checkpoint.json', 'r') as f:
29 | checkpoint = json.load(f)
30 | return checkpoint.get('page', 1), checkpoint.get('last_repo_index', 0)
31 | except FileNotFoundError:
32 | return 1, 0
33 |
34 | def save_checkpoint(page, last_repo_index):
35 | with open(f'{start_date_str}_checkpoint.json', 'w') as f:
36 | json.dump({'page': page, 'last_repo_index': last_repo_index}, f)
37 |
38 | page, last_repo_index = 1, 0
39 | # page, last_repo_index = load_checkpoint()
40 | all_readmes = []
41 |
42 | while True:
43 | response = requests.get(f'https://api.github.com/search/repositories?q=created:{start_date_str}..{end_date_str}&sort=stars&order=desc&per_page=100&page={page}', headers=headers)
44 | data = response.json()
45 |
46 | if 'items' not in data:
47 | break
48 | if not data['items']:
49 | break
50 | for repo in tqdm(data['items'][last_repo_index:]):
51 | owner = repo['owner']['login']
52 | repo_name = repo['name']
53 |
54 | full_name = repo['full_name']
55 | url = repo['html_url']
56 | description = repo['description']
57 | stars = repo['stargazers_count']
58 | forks = repo['forks_count']
59 |
60 | response = requests.get(f'https://api.github.com/repos/{owner}/{repo_name}/readme', headers=headers)
61 | readme_data = response.json()
62 |
63 | if 'content' in readme_data:
64 | readme_content = base64.b64decode(readme_data['content']).decode('utf-8')
65 | # print(f"Repository {repo_name} README content:")
66 | # print(readme_content)
67 | with open(f"{out_path}/{full_name.replace('/', '_')}_README.md", 'w') as f:
68 | readme_obj = {'full_name': full_name, 'url': url, 'description': description, 'readme': readme_content, 'stars': stars, 'forks': forks}
69 | all_readmes.append(readme_obj)
70 | json.dump(readme_obj, f, ensure_ascii=False)
71 | else:
72 | print(f"Repository {repo_name} doesn't have a README.")
73 |
74 | page += 1
75 |
76 | import datasets
77 |
78 | all_readmes = { k: [v[k] for v in all_readmes] for k in all_readmes[0].keys() }
79 | ds = datasets.Dataset.from_dict(all_readmes)
80 |
81 | try:
82 | create_branch("RealTimeData/github_latest", branch=start_date_str, repo_type="dataset", token=hf_token)
83 | except:
84 | traceback.print_exc()
85 | ds.push_to_hub("RealTimeData/github_latest", token=hf_token, branch='main')
86 | ds.push_to_hub("RealTimeData/github_latest", token=hf_token, branch=start_date_str)
87 |
88 | text = f"""
89 | # Latest GitHub Repositories
90 |
91 | You could always access the latest Github repos via this dataset.
92 |
93 | We update the dataset weekly, every Sunday, so it always provides the latest GitHub repos from the past week.
94 | 
95 | The current dataset on the main branch contains the latest GitHub repos created from {start_date_str} to {end_date_str}.
96 |
97 | The data collection is conducted on {today.date().isoformat()}.
98 |
99 | Use the dataset via:
100 | ```
101 | ds = datasets.load_dataset('RealTimeData/github_latest')
102 | ```
103 |
104 | # Previous versions
105 |
106 | You could access previous versions by requesting different branches.
107 |
108 | For example, you could find the 2023-08-06 version via:
109 | ```
110 | ds = datasets.load_dataset('RealTimeData/github_latest', revision = '2023-08-06')
111 | ```
112 |
113 | Check all available versions by clicking the "Files and versions" button on the top bar.
114 | """
115 | card = RepoCard(text)
116 | card.push_to_hub('RealTimeData/github_latest', repo_type='dataset', token=hf_token)
--------------------------------------------------------------------------------
/data/bbc_alltime.py:
--------------------------------------------------------------------------------
1 | import datasets
2 | import json
3 |
4 | dl = datasets.DownloadManager()
5 | configs_file = dl.download('https://huggingface.co/datasets/RealTimeData/bbc_alltime/raw/main/configs.txt')
6 |
7 | with open(configs_file, encoding="utf-8") as f:
8 | _TIMES = f.read().splitlines()
9 |
10 | _CITATION = """\
11 | @misc{li2023estimating,
12 | title={Estimating Contamination via Perplexity: Quantifying Memorisation in Language Model Evaluation},
13 | author={Yucheng Li},
14 | year={2023},
15 | eprint={2309.10677},
16 | archivePrefix={arXiv},
17 | primaryClass={cs.CL}
18 | }
19 | """
20 |
21 | _DESCRIPTION = """\
22 | This dataset contains BBC News articles for every month from 2017-1 to the present. Access a specific month by using the format "YYYY-MM" as the config, e.g. load_dataset("RealTimeData/bbc_alltime", "2021-1").
23 | """
24 |
25 | _HOMEPAGE = "https://github.com/liyucheng09/Contamination_Detector"
26 |
27 | class Bbc_alltimes(datasets.GeneratorBasedBuilder):
28 |
29 | BUILDER_CONFIGS = [
30 | datasets.BuilderConfig(
31 |             name=time, version=datasets.Version("1.0.0"), description=f"BBC News articles published in the period of {time}"
32 | )
33 | for time in _TIMES
34 | ]
35 |
36 | def _info(self):
37 | features = datasets.Features(
38 | {
39 | "title": datasets.Value("string"),
40 | "published_date": datasets.Value("string"),
41 | "authors": datasets.Value("string"),
42 | "description": datasets.Value("string"),
43 | "section": datasets.Value("string"),
44 | "content": datasets.Value("string"),
45 | "link": datasets.Value("string"),
46 | }
47 | )
48 | return datasets.DatasetInfo(
49 | description=_DESCRIPTION,
50 | features=features,
51 | homepage=_HOMEPAGE,
52 | citation=_CITATION,
53 | )
54 |
55 | def _split_generators(self, dl_manager):
56 | """Returns SplitGenerators."""
57 | if self.config.name == "all":
58 | times = _TIMES[:-1]
59 |             files = dl_manager.download([f"articles/{time}.json" for time in times])
60 | return [
61 | datasets.SplitGenerator(
62 | name=datasets.Split.TRAIN,
63 | gen_kwargs={"files": files},
64 | )
65 | ]
66 | else:
67 | time = self.config.name
68 | _URL = f"articles/{time}.json"
69 | file = dl_manager.download(_URL)
70 | return [
71 | datasets.SplitGenerator(
72 | name=datasets.Split.TRAIN,
73 | gen_kwargs={"files": file},
74 | )
75 | ]
76 |
77 | def _generate_examples(self, files):
78 | """Yields examples."""
79 | if self.config.name == "all":
80 | assert isinstance(files, list)
81 | for file in files:
82 |                 time = file.split('/')[-1].replace('.json', '')
83 | with open(file, encoding="utf-8") as f:
84 | data = json.load(f)
85 | length = len(data['title'])
86 | for i in range(length):
87 | yield f'{time}-{i}', {
88 | "title": data['title'][i],
89 | "published_date": data['published_date'][i],
90 | "authors": data['authors'][i],
91 | "description": data['description'][i],
92 | "section": data['section'][i],
93 | "content": data['content'][i],
94 | "link": data['link'][i],
95 | }
96 | else:
97 | assert isinstance(files, str)
98 | time = self.config.name
99 | with open(files, encoding="utf-8") as f:
100 | data = json.load(f)
101 | length = len(data['title'])
102 | for i in range(length):
103 | yield f'{time}-{i}', {
104 | "title": data['title'][i],
105 | "published_date": data['published_date'][i],
106 | "authors": data['authors'][i],
107 | "description": data['description'][i],
108 | "section": data['section'][i],
109 | "content": data['content'][i],
110 | "link": data['link'][i],
111 | }
--------------------------------------------------------------------------------
/data/maintain_wikitext_latest.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import traceback
3 | import mwparserfromhell
4 | import datetime
5 | import os
6 |
7 | import sys
8 | import json
9 | import time
10 |
11 | from tqdm import tqdm
12 |
13 | WIKI_API_ENDPOINT = "https://en.wikipedia.org/w/api.php"
14 |
15 | def parse_to_plain_text(wikitext):
16 | parsed = mwparserfromhell.parse(wikitext)
17 | return parsed.strip_code()
18 |
19 | def fetch_content(title, date=None):
20 | params = {
21 | "action": "query",
22 | "format": "json",
23 | "titles": title,
24 | "prop": "revisions",
25 | "rvprop": "content",
26 | "rvlimit": "1",
27 | }
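    # With rvlimit=1, rvstart makes the API return the newest revision at or before `date`
    # (revisions are enumerated from newest to oldest by default).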
28 | if date: params["rvstart"] = date
29 | try:
30 | response = requests.get(WIKI_API_ENDPOINT, params=params)
31 | response.raise_for_status() # Will raise an error if the HTTP request returned an unsuccessful status code
32 | data = response.json()
33 | if 'error' in data:
34 | print(f"Error fetching content for {title}: {data['error']['info']}")
35 | return None
36 |
37 | page = next(iter(data['query']['pages'].values()))
38 | if 'revisions' not in page:
39 | print(f"No revisions found for {title}")
40 | return None
41 | content = page['revisions'][0]['*']
42 |
43 | # Check if the content is a redirect and skip if true
44 | if content.lower().startswith("#redirect"):
45 | print(f"{title} is a redirect page.")
46 | return None
47 | text = parse_to_plain_text(content)
48 | if len(text.split(' ')) < 300:
49 | print(f"{title} is less than 300 words.")
50 | return None
51 |
52 | return {
53 | "title": page['title'],
54 | "text": text,
55 | "pageid": page['pageid'],
56 | }
57 |
58 | except Exception as e:
59 | print(f"An error occurred while fetching content for {title}: {str(e)}")
60 | traceback.print_exc() # This will print the full traceback
61 |
62 | return None
63 |
64 | if __name__ == "__main__":
65 | today = datetime.date.today()
66 | year = today.year
67 | month = today.month
68 |
69 | hf_token = os.environ['HF_TOKEN']
70 |
71 | start_time = datetime.datetime(year, month, 1)
72 | end_time = today
73 |
74 | print(f'Fetching wiki articles from {start_time.isoformat()} to {end_time.isoformat()}')
75 |
76 | with open('./data/squad_wiki_title.text') as f:
77 | titles = [line.strip() for line in f.readlines()]
78 | historical_contents = [fetch_content(title, end_time) for title in tqdm(titles)]
79 |     historical_to_save = {title: content for title, content in zip(titles, historical_contents) if content is not None}
80 |     historical_contents = [content for content in historical_contents if content is not None]
81 |
82 | save_file = f'{year}-{month}.json'
83 | with open(save_file, 'w') as f:
84 | json.dump(historical_to_save, f, ensure_ascii=False)
85 | print(f'Saved {len(historical_contents)} articles to {save_file}')
86 |
87 | from huggingface_hub import hf_hub_download, RepoCard, upload_file
88 |
89 | upload_file(
90 | path_or_fileobj = save_file,
91 | path_in_repo = f'wiki/{year}-{month}.json',
92 | repo_id = 'RealTimeData/wikitext_alltime',
93 | repo_type = 'dataset',
94 | token=hf_token,
95 | )
96 |
97 | file = hf_hub_download(repo_id="RealTimeData/wikitext_alltime", filename="configs.txt", repo_type='dataset')
98 | with open(file) as f:
99 |         times = f.read().splitlines()
100 | times.append(f'{year}-{month}')
101 |
102 | with open('configs.txt', 'w') as f:
103 | f.write('\n'.join(times))
104 |
105 | upload_file(
106 | path_or_fileobj = 'configs.txt',
107 | path_in_repo = 'configs.txt',
108 | repo_id = 'RealTimeData/wikitext_alltime',
109 | repo_type = 'dataset',
110 | token=hf_token,
111 | )
112 |
113 | text = f"""
114 | # Wikitext for All Times
115 |
116 | You can find 491 selected wiki articles for every month from 2017-1 to {year}-{month}.
117 |
118 | Use this to download wiki articles during a specific month:
119 | ```
120 | ds = datasets.load_dataset('RealTimeData/wikitext_alltime', '2017-8')
121 | ```
122 |
123 | The time stamp follows the format of "YYYY-MM".
124 |
125 | # An example
126 |
127 | ```
128 | > ds = datasets.load_dataset('RealTimeData/wikitext_alltime', '2023-10', split='train')
129 | > ds[0]
130 |
131 | {'title': 'Queen Victoria',
132 | 'pageid': 47923,
133 | 'text': 'Victoria (Alexa ...',
134 | 'time': '2023-10'}
135 | ```
136 | """
137 | card = RepoCard(text)
138 | card.push_to_hub('RealTimeData/wikitext_alltime', repo_type='dataset', token=hf_token)
--------------------------------------------------------------------------------
/data/audio_dataset.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import yt_dlp
3 | from googleapiclient.discovery import build
4 | import os
5 | import sys
6 | import re
7 | from glob import glob
8 | import datasets
9 | import time
10 | import random
11 | import soundfile as sf
12 | import struct
13 | import numpy as np
14 |
15 | def get_popular_videos(youtube, start_date, end_date, max_results=30):
16 | published_after = start_date.strftime("%Y-%m-%dT%H:%M:%SZ")
17 | published_before = end_date.strftime("%Y-%m-%dT%H:%M:%SZ")
18 | request = youtube.search().list(
19 | part="snippet",
20 | maxResults=max_results,
21 | order="viewCount",
22 | publishedAfter=published_after,
23 | publishedBefore=published_before,
24 | type="video",
25 | )
26 | response = request.execute()
27 | videos_ids = [item['id']['videoId'] for item in response['items']]
28 | return videos_ids
29 |
30 | def parse_duration(duration_string):
31 | hours = re.search(r'(\d+)H', duration_string)
32 | minutes = re.search(r'(\d+)M', duration_string)
33 | seconds = re.search(r'(\d+)S', duration_string)
34 |
35 | hours = int(hours.group(1)) if hours else 0
36 | minutes = int(minutes.group(1)) if minutes else 0
37 | seconds = int(seconds.group(1)) if seconds else 0
38 |
39 | return hours * 3600 + minutes * 60 + seconds
40 |
41 | def filter_too_long_video(youtube, video_ids, max_duration=600, max_results=30):
42 | request = youtube.videos().list(
43 | part="contentDetails",
44 | id=','.join(video_ids),
45 | )
46 | response = request.execute()
47 |
48 | final_videos = []
49 | for item in response['items']:
50 | duration = parse_duration(item['contentDetails']['duration'])
51 | if duration <= max_duration:
52 | final_videos.append(item['id'])
53 | if len(final_videos) >= max_results:
54 | break
55 | return final_videos
56 |
57 | def download_audio(video_id, save_path):
58 | ydl_opts = {
59 | 'format': 'bestaudio/best',
60 | 'postprocessors': [{
61 | 'key': 'FFmpegExtractAudio',
62 | 'preferredcodec': 'flac',
63 | 'preferredquality': '192',
64 | }],
65 | 'postprocessor_args': [
66 | '-ar', '16000' # Set audio sample rate to 16 kHz
67 | ],
68 | 'outtmpl': os.path.join(save_path, '%(id)s.%(ext)s'),
69 | }
70 | with yt_dlp.YoutubeDL(ydl_opts) as ydl:
71 | ydl.download([f'http://www.youtube.com/watch?v={video_id}'])
72 |
73 | if __name__ == '__main__':
74 |
75 | month, save_path, = sys.argv[1:]
76 | month = int(month) + 1
77 |
78 | videos_per_month = 3
79 | api_key = os.environ['YOUTUBE_API_KEY']
80 | youtube = build('youtube', 'v3', developerKey=api_key)
81 |
82 | # time_stamps = [f'{year}-{month:02d}' for year in range(2017, 2024)]
83 | time_stamps = [f'{year}-{month:02d}' for year in range(2017, 2024) for month in range(1, 13)]
84 |
85 | for time_stamp in time_stamps:
86 | files = glob(os.path.join(save_path, time_stamp, '*.flac'))
87 | print(f"Start {time_stamp}...")
88 |
89 | if not len(files) >= videos_per_month:
90 | year, month = time_stamp.split('-')
91 |
92 | start_date = datetime.date(int(year), int(month), 1)
93 | end_of_month = datetime.date(int(year), int(month), 28)
94 |
95 | video_ids = get_popular_videos(youtube, start_date, end_of_month, max_results=50)
96 | video_ids = filter_too_long_video(youtube, video_ids, max_duration=600, max_results=videos_per_month)
97 | for video in video_ids:
98 | download_audio(video, os.path.join(save_path, time_stamp))
99 |
100 | print(f"Downloaded {len(video_ids)} videos in {time_stamp}")
101 |
102 | files = glob(os.path.join(save_path, time_stamp, '*.flac'))
103 | random.shuffle(files)
104 | files = files[:videos_per_month]
105 | instances = []
106 | for file in files:
107 | data, samplerate = sf.read(file)
108 | if len(data.shape) > 1:
109 | data = data.mean(axis=1)
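            # Convert float samples in [-1, 1] to 16-bit PCM before packing into a byte stream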
110 | denormalized_data = np.int16(data * 32767)
111 |             byte_stream = b''.join(struct.pack('<h', sample) for sample in denormalized_data)
--------------------------------------------------------------------------------
/figs/compare.py:
--------------------------------------------------------------------------------
8 |     result.append(f'{title}')
9 |     result.append(f'{prompt}')
10 | s = SequenceMatcher(None, str1, str2)
11 |
12 | for opcode, a0, a1, b0, b1 in s.get_opcodes():
13 | if opcode == 'equal':
14 | result.append(f'{str1[a0:a1]}')
15 | else:
16 | result.append(str1[a0:a1])
17 | result.append('')
18 | all_result.append(''.join(result))
19 |
20 | return ''.join(all_result)
21 |
22 | # Test the function
23 | # prompts = [
24 | # """Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in""",
25 | # """Persian (/ˈpɜːrʒən, -ʃən/), also known by its endonym Farsi (فارسی fārsi (fɒːɾˈsiː) ( listen)), is one of the Western Iranian languages within the Indo-Iranian branch of the Indo-European language family. It is primarily""",
26 | # """The number of Malayalam speakers in Lakshadweep is 51,100, which is only 0.15% of the total number, but is as much as about 84% of the population of Lakshadweep. In all, Malayalis made up 3.22% of the total Indian population in 2001. Of the total 33,066,""",
27 | # """The British have a love-hate relationship with the NHS. According to researchers at the King's Fund, the public gave the NHS its worst rating since records began 40 years ago. Just 29% said they were satisfied with the NHS in 2022. And yet we still love it. A whopping 90% of""",
28 | # ]
29 | # predictions = [
30 | # """ the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".""",
31 | # """ spoken in Iran, Afghanistan (officially known as Dari since 1958),[3] and Tajikistan (officially known as Tajiki since the Soviet era),[4] and some other regions which historically were Persianate societies and considered part of Greater Iran. It is written right to left in the Persian alphabet, a modified variant of the Arabic script.""",
32 | # """ 392 Malayalam speakers in India in 2001, 33,015,420 spoke the standard dialects, 19,643 spoke the Yerava dialect and 31,329 spoke non-standard regional variations like Eranadan. As per the 1991 census data, 28.85% of all Malayalam speakers in India spoke a second language and 19.64% of the total knew three or more languages.""",
33 | # """ of Britons believe that the NHS is a crucial institution that should be preserved. The juxtaposition between dissatisfaction with the current state and overall reverence for the institution speaks volumes about the complex relationship the British public has with their healthcare system."""
34 |
35 | # ]
36 | # references = [
37 | # """ the late 1990s as lead singer of the R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Her debut solo album, Dangerously in Love (2003), debuted at number one on the US Billboard 200 chart and earned her five Grammy Awards, solidifying her as a solo artist as well. Throughout her career, Beyoncé has sold over 100 million records worldwide as a solo artist and a further 60 million records with Destiny's Child, making her one of the best-selling music artists of all time. She has won 23 Grammy Awards and is the most""",
38 | # """ spoken in Iran, Afghanistan (officially known as Dari since 1958), and Tajikistan (officially known as Tajiki since the Soviet era), and some other regions which historically were Persianate societies and considered part of Greater Iran. It is written in the Persian alphabet, a modified variant of the Arabic script, which itself evolved from the Aramaic alphabet.""",
39 | # """ 392 Malayalam speakers in India in 2001, 33,015,420 spoke the standard dialects, 19,643 spoke the Yerava dialect and 31,329 spoke non-standard regional variations like Eranadan. As per the 1991 census data, 28.85% of all Malayalam speakers in India spoke a second language and 19.64% of the total knew three or more languages.""",
40 | # """of the public agrees the service should be free and available to everyone. But with more than seven million people on waiting lists, almost everyone knows someone who isn't getting the care they need. As the NHS approaches its 75th anniversary, politicians are falling over themselves to praise the service."""
41 | # ]
42 |
43 | import json
44 | with open('figs/kanye.json', 'r') as f:
45 | data = json.load(f)
46 |
47 | prompts = data['prompts']
48 | predictions = data['predictions']
49 | references = data['references']
50 |
51 | # benchmarks = ['squad', 'boolq', 'quac', 'LatestEval']
52 | benchmarks = ['memorised', 'clean']
53 | html_output = compare_strings(predictions, references, benchmarks, prompts)
54 |
55 | with open('output.html', 'w') as f:
56 | f.write(f"""
57 | <html>
58 | <head>
59 | <style>
85 | </style>
86 | </head>
87 | <body>
88 | {html_output}
89 | </body>
90 | </html>
91 |
92 | """)
93 |
94 | print("HTML output saved to 'output.html'")
95 |
--------------------------------------------------------------------------------
/data/github_dataset.py:
--------------------------------------------------------------------------------
1 | from git import Repo
2 | import datetime
3 | import os
4 | import sys
5 | import multiprocessing
6 | import difflib
7 | import json
8 | import itertools
9 | import shutil
10 |
11 | def clone_repo(repo_url, local_path, overwrite=False):
12 | if os.path.exists(local_path):
13 | if overwrite:
14 | shutil.rmtree(local_path)
15 | else:
16 | print(f"Repo {local_path} already exists")
17 | return
18 | Repo.clone_from(repo_url, local_path)
19 |
20 | def get_file_content(commit, file_path):
21 | # Retrieves the file content for a given commit
22 | blob = commit.tree / file_path
23 | return blob.data_stream.read().decode('utf-8', errors='ignore')
24 |
25 | def compare_files(repo, diff_item, start_commit, end_commit, code_extensions):
26 | file_path = diff_item.b_path
27 | _, ext = os.path.splitext(file_path)
28 | if ext not in code_extensions:
29 | # print(f"Skipping {file_path} because it is not a code file")
30 | return None
31 |
32 | # If the change type is added, we append it anyway
33 | # If the change type is modified, we only append it if the file was significantly changed (more than 50% of its lines were changed)
34 | if diff_item.change_type == 'M':
35 | a_content = get_file_content(start_commit, diff_item.a_path)
36 | b_content = get_file_content(end_commit, diff_item.b_path)
37 |
38 | # Use difflib to compare contents
39 | diff = difflib.unified_diff(
40 | a_content.splitlines(keepends=True),
41 | b_content.splitlines(keepends=True),
42 | fromfile=diff_item.a_path,
43 | tofile=diff_item.b_path
44 | )
45 |
46 | # Count the number of lines added
47 | changes = sum(1 for line in diff if line.startswith('+') and not line.startswith('++'))
48 |
49 | # if the file was not significantly changed, skip it. we consider a file significantly changed if more than 50% of its lines were changed
50 | if changes < 0.5 * len(a_content.splitlines()):
51 | return None
52 | elif diff_item.change_type == 'A':
53 | b_content = get_file_content(end_commit, diff_item.b_path)
54 | changes = len(b_content.splitlines())
55 | elif diff_item.change_type == 'R':
56 | # skip renamed files
57 | return None
58 | else:
59 | print(diff_item.change_type)
60 | return None
61 |
62 | return {
63 | 'file_path': file_path,
64 | 'num_changed_lines': changes,
65 | 'code': b_content,
66 | }
67 |
68 | def get_monthly_diff_file_objects(local_path, start_date, end_date, code_extensions):
69 | repo = Repo(local_path)
70 | repo_name = local_path.split('/')[-1]
71 | file_changes = []
72 |
73 | try:
74 | start_commit = next(repo.iter_commits(until=start_date))
75 | end_commit = next(repo.iter_commits(until=end_date))
76 | except StopIteration:
77 | print(f"Repo {local_path} has no commits in {start_date} - {end_date}")
78 | return file_changes
79 |
80 | start_commit_date = datetime.datetime.fromtimestamp(start_commit.committed_date).date()
81 | end_commit_date = datetime.datetime.fromtimestamp(end_commit.committed_date).date()
82 | end_commit_date_str = end_commit_date.strftime("%Y-%m-%d")
83 |
84 | if start_commit_date > end_date or end_commit_date < start_date:
85 | # print(f"Repo {local_path} has no commits in the given time range")
86 | return file_changes
87 |
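    # The dict-unpacking below forwards extra flags (here '--find-renames=50%', inserted after '-r')
    # to the underlying `git diff` invocation; 'find_renames=50%' cannot be written as a normal keyword argument.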
88 | diff_index = start_commit.diff(end_commit, **{'find_renames=50%': True, 'insert_kwargs_after': '-r'})
89 |
90 | for diff_item in diff_index.iter_change_type('M'):
91 | result = compare_files(repo, diff_item, start_commit, end_commit, code_extensions)
92 | if result:
93 | result['repo_name'] = repo_name
94 | result['commit_date'] = end_commit_date_str
95 | result['sha'] = end_commit.hexsha
96 | file_changes.append(result)
97 |
98 | for diff_item in diff_index.iter_change_type('A'):
99 | result = compare_files(repo, diff_item, start_commit, end_commit, code_extensions)
100 | if result:
101 | result['repo_name'] = repo_name
102 | result['commit_date'] = end_commit_date_str
103 | result['sha'] = end_commit.hexsha
104 | file_changes.append(result)
105 |
106 | # Ranking files by the extent of added lines
107 | ranked_files = sorted(file_changes, key=lambda x: x['num_changed_lines'], reverse=True)
108 | # print(f"Total {len(ranked_files)} files changed")
109 | return ranked_files
110 |
111 | def main(time_stamp, local_repo, save_path):
112 | # try:
113 | year, month = time_stamp.split('-')
114 | first_day = datetime.date(int(year), int(month), 1)
115 | last_day = datetime.date(int(year), int(month), 28)
116 |
117 | repo_name = local_repo.split('/')[-1]
118 | # print(f"Processing {repo_name} at {time_stamp}")
119 |
120 | save_path = os.path.join(save_path, time_stamp, repo_name)
121 | if not os.path.exists(save_path):
122 | os.makedirs(save_path)
123 |
124 | ranked_files = get_monthly_diff_file_objects(local_repo, first_day, last_day, code_extensions)
125 | for index, file in enumerate(ranked_files[:50]):
126 | save_file_path = os.path.join(save_path, f"{index}.json")
127 | with open(save_file_path, 'w') as f:
128 | json.dump(file, f, ensure_ascii=False, indent=2)
129 | return (time_stamp, repo_name, len(ranked_files))
130 |
131 | if __name__ == '__main__':
132 | repo_path, save_dir, = sys.argv[1:]
133 |
134 | time_stamps = [f'{year}-{month:02d}' for year in range(2017, 2024) for month in range(1, 13)]
135 | time_stamps += [f'2024-{month:02d}' for month in range(1, 3)]
136 |
137 | # pre_defined repos
138 | with open('/user/HS502/yl02706/LatestEval/data/code_repos.txt', 'r') as f:
139 | repos = f.readlines()
140 |
141 | print(f"Total {len(repos)} repos")
142 |
143 | # Prepare URLs and local paths
144 | urls = [f'https://github.com/{repo.strip()}.git' for repo in repos]
145 | local_paths = [os.path.join(repo_path, repo.replace('/', '_')).strip() for repo in repos]
146 |
147 | # clone repos
148 | with multiprocessing.Pool(2) as pool:
149 | pool.starmap(clone_repo, zip(urls, local_paths))
150 |
151 | code_extensions = {'.py', '.js', '.java', '.cpp', '.c', '.cs', '.go', '.rb', '.php', '.ts', '.jsx', '.tsx', '.css', '.sh', '.pl', '.bat'}
152 |
153 | combinations = list(itertools.product(time_stamps, local_paths))
154 | # combinations = sorted(combinations, key=lambda x: x[-1])
155 | flattened_args = [(time_stamp, local_path, save_dir) for time_stamp, local_path in combinations]
156 |
157 | print(f"Total {len(flattened_args)} combinations")
158 | with multiprocessing.Pool(8) as pool:
159 | ALL_PROCESSED = pool.starmap(main, flattened_args)
160 |
161 | # use single process instead
162 | # ALL_PROCESSED = []
163 | # for args in flattened_args:
164 | # ALL_PROCESSED.append(main(*args))
165 |
166 | print(f"Total {len(ALL_PROCESSED)} processed")
167 |
--------------------------------------------------------------------------------
/data/monthly_updater/monthly_code.py:
--------------------------------------------------------------------------------
1 | from git import Repo
2 | import datetime
3 | import os
4 | import sys
5 | import multiprocessing
6 | import difflib
7 | import json
8 | import itertools
9 | import shutil
10 | from glob import glob
11 | import datasets
12 |
13 | def clone_repo(repo_url, local_path, overwrite=False, since=None):
14 | if os.path.exists(local_path):
15 | if overwrite:
16 | shutil.rmtree(local_path)
17 | else:
18 | print(f"Repo {local_path} already exists")
19 | return
20 | Repo.clone_from(repo_url, local_path, multi_options=[f'--shallow-since={since}'] if since is not None else None)
21 |
22 | def get_file_content(commit, file_path):
23 | # Retrieves the file content for a given commit
24 | blob = commit.tree / file_path
25 | return blob.data_stream.read().decode('utf-8', errors='ignore')
26 |
27 | def compare_files(repo, diff_item, start_commit, end_commit, code_extensions):
28 | file_path = diff_item.b_path
29 | _, ext = os.path.splitext(file_path)
30 | if ext not in code_extensions:
31 | # print(f"Skipping {file_path} because it is not a code file")
32 | return None
33 |
34 | # If the change type is added, we append it anyway
35 | # If the change type is modified, we only append it if the file was significantly changed (more than 50% of its lines were changed)
36 | if diff_item.change_type == 'M':
37 | a_content = get_file_content(start_commit, diff_item.a_path)
38 | b_content = get_file_content(end_commit, diff_item.b_path)
39 |
40 | # Use difflib to compare contents
41 | diff = difflib.unified_diff(
42 | a_content.splitlines(keepends=True),
43 | b_content.splitlines(keepends=True),
44 | fromfile=diff_item.a_path,
45 | tofile=diff_item.b_path
46 | )
47 |
48 | # Count the number of lines added
49 | changes = sum(1 for line in diff if line.startswith('+') and not line.startswith('++'))
50 |
51 | # if the file was not significantly changed, skip it. we consider a file significantly changed if more than 50% of its lines were changed
52 | if changes < 0.5 * len(a_content.splitlines()):
53 | return None
54 | elif diff_item.change_type == 'A':
55 | b_content = get_file_content(end_commit, diff_item.b_path)
56 | changes = len(b_content.splitlines())
57 | elif diff_item.change_type == 'R':
58 | # skip renamed files
59 | return None
60 | else:
61 | print(diff_item.change_type)
62 | return None
63 |
64 | return {
65 | 'file_path': file_path,
66 | 'num_changed_lines': changes,
67 | 'code': b_content,
68 | }
69 |
70 | def get_monthly_diff_file_objects(local_path, start_date, end_date, code_extensions):
71 | repo = Repo(local_path)
72 | repo_name = local_path.split('/')[-1]
73 | file_changes = []
74 |
75 | try:
76 | start_commit = next(repo.iter_commits(since=start_date, reverse=True))
77 | end_commit = next(repo.iter_commits(until=end_date))
78 | except StopIteration:
79 | print(f"Repo {local_path} has no commits in {start_date} - {end_date}")
80 | return file_changes
81 |
82 | start_commit_date = datetime.datetime.fromtimestamp(start_commit.committed_date).date()
83 | end_commit_date = datetime.datetime.fromtimestamp(end_commit.committed_date).date()
84 | end_commit_date_str = end_commit_date.strftime("%Y-%m-%d")
85 |
86 | if start_commit_date > end_date or end_commit_date < start_date:
87 | # print(f"Repo {local_path} has no commits in the given time range")
88 | return file_changes
89 |
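    # The dict-unpacking below forwards extra flags (here '--find-renames=50%', inserted after '-r')
    # to the underlying `git diff` invocation; 'find_renames=50%' cannot be written as a normal keyword argument.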
90 | diff_index = start_commit.diff(end_commit, **{'find_renames=50%': True, 'insert_kwargs_after': '-r'})
91 |
92 | for diff_item in diff_index.iter_change_type('M'):
93 | result = compare_files(repo, diff_item, start_commit, end_commit, code_extensions)
94 | if result:
95 | result['repo_name'] = repo_name
96 | result['commit_date'] = end_commit_date_str
97 | result['sha'] = end_commit.hexsha
98 | file_changes.append(result)
99 |
100 | for diff_item in diff_index.iter_change_type('A'):
101 | result = compare_files(repo, diff_item, start_commit, end_commit, code_extensions)
102 | if result:
103 | result['repo_name'] = repo_name
104 | result['commit_date'] = end_commit_date_str
105 | result['sha'] = end_commit.hexsha
106 | file_changes.append(result)
107 |
108 | # Ranking files by the extent of added lines
109 | ranked_files = sorted(file_changes, key=lambda x: x['num_changed_lines'], reverse=True)
110 | # print(f"Total {len(ranked_files)} files changed")
111 | return ranked_files
112 |
113 | def main(time_stamp, local_repo, save_path):
114 | year, month = time_stamp.split('-')
115 | first_day = datetime.date(int(year), int(month), 1)
116 | last_day = datetime.date(int(year), int(month), 28)
117 |
118 | repo_name = local_repo.split('/')[-1]
119 | # print(f"Processing {repo_name} at {time_stamp}")
120 |
121 | save_path = os.path.join(save_path, time_stamp, repo_name)
122 | if not os.path.exists(save_path):
123 | os.makedirs(save_path)
124 |
125 | ranked_files = get_monthly_diff_file_objects(local_repo, first_day, last_day, code_extensions)
126 | for index, file in enumerate(ranked_files[:50]):
127 | save_file_path = os.path.join(save_path, f"{index}.json")
128 | with open(save_file_path, 'w') as f:
129 | json.dump(file, f, ensure_ascii=False, indent=2)
130 | return (time_stamp, repo_name, len(ranked_files))
131 | # print(f"Saved to {save_path}")
132 |
133 | if __name__ == '__main__':
134 | today = datetime.date.today()
135 | year = today.year
136 | month = today.month
137 |
138 | time_stamp = f'{year}-{month:02d}'
139 | first_day_string = f'{year}-{month:02d}-01'
140 |
141 | repo_path, save_dir = 'repos/', 'code_data/'
142 | repo_list = 'data/code_repos.txt'
143 |
144 | # time_stamps = [f'{year}-{month:02d}' for year in range(2017, 2024) for month in range(1, 13)]
145 | time_stamps = [time_stamp]
146 |
147 | # pre_defined repos
148 | with open(repo_list, 'r') as f:
149 | repos = f.readlines()
150 |
151 | print(f"Total {len(repos)} repos")
152 |
153 | # Prepare URLs and local paths
154 | urls = [f'https://github.com/{repo.strip()}.git' for repo in repos]
155 | local_paths = [os.path.join(repo_path, repo.replace('/', '_')).strip() for repo in repos]
156 |
157 | # clone repos
158 |     success_paths = []
159 |     for url, local_path in zip(urls, local_paths):
160 |         try:
161 |             clone_repo(url, local_path, overwrite=True, since=first_day_string)
162 |             success_paths.append(local_path)
163 | except:
164 | print(f"Failed to clone {url}")
165 |
166 | code_extensions = {'.py', '.js', '.java', '.cpp', '.c', '.cs', '.go', '.rb', '.php', '.ts', '.jsx', '.tsx', '.css', '.sh', '.pl', '.bat'}
167 |
168 |     combinations = list(itertools.product(time_stamps, success_paths))
169 | # combinations = sorted(combinations, key=lambda x: x[-1])
170 | flattened_args = [(time_stamp, local_path, save_dir) for time_stamp, local_path in combinations]
171 |
172 | print(f"Total {len(flattened_args)} combinations")
173 | with multiprocessing.Pool(2) as pool:
174 | ALL_PROCESSED = pool.starmap(main, flattened_args)
175 |
176 | print(f"Total {len(ALL_PROCESSED)} processed")
177 |
178 | hf_token = os.environ['HF_TOKEN']
179 | code_files = glob(f'{save_dir}/{time_stamp}/*/*.json')
180 | all_codes = []
181 | for code in code_files:
182 | with open(code, 'r') as f:
183 | all_codes.append(json.load(f))
184 | ds = datasets.Dataset.from_list(all_codes)
185 | print('='*20)
186 | print(f'Finished {time_stamp}')
187 | ds.push_to_hub(f'RealTimeData/code_alltime', config_name = time_stamp, token=hf_token)
188 | print(f'Pushed {time_stamp} to hub')
--------------------------------------------------------------------------------
/eval/contamination.py:
--------------------------------------------------------------------------------
1 | import datasets
2 | from nltk.tokenize import sent_tokenize, word_tokenize
3 | from nltk import ngrams
4 | import pandas as pd
5 | import nltk
6 | from nltk.stem import WordNetLemmatizer
7 | import os
8 | import time
9 | import openai
10 | from tqdm import tqdm
11 | from transformers import GPT2TokenizerFast
12 |
13 | T = GPT2TokenizerFast.from_pretrained("gpt2")
14 | prompt_length = 250
15 | suffix_length = 500 - prompt_length
16 |
17 | def data_sampler():
18 | quac = datasets.load_dataset("quac", split="validation")
19 | boolq = datasets.load_dataset("boolq", split="validation")
20 | squad = datasets.load_dataset("squad_v2", split="validation")
21 |
22 | latesteval_1 = datasets.load_dataset("RealTimeData/bbc_news_week1_july_2023", split="train")
23 | latesteval_2 = datasets.load_dataset("RealTimeData/github_july_week1_2023", split="train")
24 | latesteval_3 = datasets.load_dataset("RealTimeData/arxiv_july_week1_2023", split="train")
25 |
26 | def get_prefix_and_suffix(doc, dataset_name = None):
27 | if dataset_name is None:
28 | raise ValueError("dataset_name must be specified")
29 | if dataset_name == "quac" or dataset_name == "squad_v2":
30 | text = T(doc['context']).input_ids
31 | if dataset_name == "quac":
32 | title = 'quac, ' + doc['wikipedia_page_title'] + ', ' + doc['section_title'] + ', '
33 | elif dataset_name == "squad_v2":
34 | title = 'squadv2, ' + 'wikipedia, ' + doc['title'] + ', '
35 | # text = word_tokenize(doc['context'])
36 | elif dataset_name == "boolq":
37 | text = T(doc['passage']).input_ids
38 | title = 'boolq, wikipedia, '
39 | # text = word_tokenize(doc['passage'])
40 | elif dataset_name == "latesteval_1":
41 | text = doc['content'].replace("\n", " ")
42 | text = T(text).input_ids
43 | # text = word_tokenize(doc['content'])
44 | title = 'bbc, '
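        # Split each document into a prompt prefix and a reference suffix: long documents
        # keep the first `prompt_length` tokens as the prefix and the rest as the suffix;
        # short ones keep the last `suffix_length` tokens as the suffix and the remainder as the prefix.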
45 | if len(text) > 1000:
46 | prefix = T.decode(text[:prompt_length])
47 | suffix = T.decode(text[suffix_length:])
48 | else:
49 | suffix = T.decode(text[-suffix_length:])
50 | prefix = T.decode(text[: -suffix_length])
51 | # prefix = " ".join(prefix)
52 | # suffix = " ".join(suffix)
53 | prefix = title + prefix
54 | return pd.Series([prefix, suffix], index=['prefix', 'suffix'])
55 |
56 | # quac = quac.to_pandas().sample(n=10, random_state=42)
57 | # boolq = boolq.to_pandas().sample(n=100, random_state=42)
58 | # squad = squad.to_pandas().sample(n=100, random_state=42)
59 | # latesteval_1 = latesteval_1.to_pandas().sample(n=100, random_state=42)
60 |
61 | quac = quac.to_pandas().head(n=100)
62 | boolq = boolq.to_pandas().head(n=100)
63 | squad = squad.to_pandas().head(n=100)
64 | latesteval_1 = latesteval_1.to_pandas().head(n=30)
65 |
66 | quac = quac.apply(get_prefix_and_suffix, axis=1, dataset_name="quac")
67 | boolq = boolq.apply(get_prefix_and_suffix, axis=1, dataset_name="boolq")
68 | squad = squad.apply(get_prefix_and_suffix, axis=1, dataset_name="squad_v2")
69 | latesteval = latesteval_1.apply(get_prefix_and_suffix, axis=1, dataset_name="latesteval_1")
70 |
71 | return {
72 | "quac": quac,
73 | "boolq": boolq,
74 | "squad": squad,
75 | "latesteval": latesteval
76 | }
77 |
78 | def identify_contamination(reference_suffixes, continuations):
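    # A continuation counts as contaminated if it shares at least one 9-token
    # n-gram (GPT-2 token ids) with the reference suffix.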
79 |
80 | def generate_word_ngrams(text, n, use_lemmatization=False):
81 | tokens = T(text.lower()).input_ids
82 |
83 |         # Optionally, lemmatize words (works on word-level tokens rather than BPE ids)
84 |         if use_lemmatization:
85 |             lemmatizer = WordNetLemmatizer()
86 |             tokens = [lemmatizer.lemmatize(word) for word in word_tokenize(text.lower())]
87 |         return list(ngrams(tokens, n))
88 |
89 | results = []
90 | for suffix, continuation in zip(reference_suffixes, continuations):
91 | suffix_ngrams = set(generate_word_ngrams(suffix, 9))
92 | continuation_ngrams = set(generate_word_ngrams(continuation, 9))
93 |
94 | intersection = suffix_ngrams.intersection(continuation_ngrams)
95 |
96 | if len(intersection) > 0:
97 | results.append((True, suffix, continuation, intersection))
98 |
99 | return results
100 |
101 | def generate_continuation(model, prompts, reference_suffix, benchmark, batch_size=10):
102 | # three models at this moment: gpt-3, gpt-4, llama-2
103 |
104 | prompts = prompts.tolist()
105 |
106 | if model in ['gpt-4', 'davinci', 'curie', 'babbage']:
107 | generate = gpt
108 | else:
109 | generate = hf_generate
110 |
111 | continuations = []
112 | output_file = f"eval/{model}_{benchmark}_{prompt_length}_continuation.txt"
113 | prompt_file = f"eval/{model}_{benchmark}_{prompt_length}_prompt.txt"
114 | reference_suffix_file = f"eval/{model}_{benchmark}_{prompt_length}_reference_suffix.txt"
115 | if os.path.exists(output_file):
116 | with open(output_file, "r") as f:
117 | continuations = f.readlines()
118 | return continuations
119 | else:
120 | with open(output_file, "w") as f, open(prompt_file, "w") as f2, open(reference_suffix_file, "w") as f3:
121 | for i in tqdm(range(0, len(prompts), batch_size)):
122 | prompt = prompts[i: i + batch_size]
123 | reference_suffix_batch = reference_suffix[i: i + batch_size]
124 | continuation = generate(prompt, model=model)
125 | continuations.extend(continuation)
126 | f.write('\n'.join(continuation) + "\n")
127 | f2.write('\n'.join(prompt) + "\n")
128 | f3.write('\n'.join(reference_suffix_batch) + "\n")
129 |
130 | return continuations
131 |
132 | def hf_generate(model, prompt):
133 | pass
134 |
135 | def gpt(prompt, num_retry = 5, model = "gpt-3.5-turbo"):
136 | # generate answer by gpt-3.5-turbo
137 | openai_key = os.environ.get("OPENAI_API_KEY")
138 | for _ in range(num_retry):
139 | try:
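                # A logit_bias of -100 on token 198 (the newline token in the GPT-2/3 vocabulary)
                # discourages the model from emitting line breaks in its continuation.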
140 | if model in ['davinci', 'curie', 'babbage']:
141 | r = openai.Completion.create(
142 | model=model,
143 | prompt=prompt,
144 | max_tokens=250,
145 | temperature=0,
146 | logit_bias={"198": -100},
147 | logprobs=0,
148 | )
149 | elif model in ['gpt-3.5-turbo', 'gpt-4']:
150 | r = openai.ChatCompletion.create(
151 | model = model,
152 | messages = [
153 | {"role": "user", "content": prompt},
154 | ],
155 | max_tokens=250,
156 | temperature = 0,
157 | logit_bias={"198": -100}
158 | )
159 | break
160 | except Exception as e:
161 | print(e)
162 | time.sleep(1)
163 |
164 | if model in ['davinci', 'curie', 'babbage']:
165 | return [x['text'].replace('\n', ' ') for x in r['choices']]
166 | elif model in ['gpt-3.5-turbo', 'gpt-4']:
167 | return [x['message']['content'] for x in r['choices']]
168 |
169 | if __name__ == "__main__":
170 | samples = data_sampler()
171 |
172 | quac = samples['quac']
173 | boolq = samples['boolq']
174 | squad = samples['squad']
175 | latesteval = samples['latesteval']
176 |
177 | model = 'curie'
178 |
179 | quac_continuations = generate_continuation(model, quac['prefix'], quac['suffix'], "quac")
180 | quac_results = identify_contamination(quac['suffix'], quac_continuations)
181 |
182 | print(f"-- quac: {len(quac_results)}, -- {len(quac_results) / len(quac)}")
183 |
184 | boolq_continuations = generate_continuation(model, boolq['prefix'], boolq['suffix'], "boolq")
185 | boolq_results = identify_contamination(boolq['suffix'], boolq_continuations)
186 |
187 | print(f"-- boolq: {len(boolq_results)}, -- {len(boolq_results) / len(boolq)}")
188 |
189 | squad_continuations = generate_continuation(model, squad['prefix'], squad['suffix'], "squad")
190 | squad_results = identify_contamination(squad['suffix'], squad_continuations)
191 |
192 | print(f"-- squad: {len(squad_results)}, -- {len(squad_results) / len(squad)}")
193 |
194 | latesteval_continuations = generate_continuation(model, latesteval['prefix'], latesteval['suffix'], "latesteval")
195 | latesteval_results = identify_contamination(latesteval['suffix'], latesteval_continuations)
196 |
197 | print(f"-- latesteval: {len(latesteval_results)}, -- {len(latesteval_results) / len(latesteval)}")
--------------------------------------------------------------------------------
/data/reddit_crawler.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from bs4 import BeautifulSoup
3 | import praw
4 | import sys
5 | from typing import List
6 | import json
7 | import os
8 | import time
9 | import datetime
10 | import traceback
11 |
12 | class Forum:
13 | def __init__(self, task_name, start_url, wait_time):
14 | self.task_name = task_name
15 | self.url = start_url
16 | self.wait_time = wait_time
17 |
18 | self.session = requests.Session()
19 | self.setup_session()
20 |
21 | self.posts = None
22 |
23 | def setup_session(self):
24 | """
25 |         Set up the requests session with a browser-like User-Agent header.
26 | """
27 | headers = {
28 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
29 | }
30 | self.session.headers.update(headers)
31 |
32 | def get_forum_pages(self):
33 | """
34 |         Get links to all post pages of the forum.
35 |         """
36 |         raise NotImplementedError()
37 |
38 | def get_forum_content(self, page_url):
39 | """
40 |         Get the full content of a forum page.
41 |         It should include:
42 |         - title
43 |         - main content
44 |         - comments
45 |         - votes of comments
46 |         """
47 |         raise NotImplementedError()
48 |
49 | def obtain_content(self):
50 | """
51 |         Obtain the content of each collected post.
52 | """
53 | list_of_content = []
54 | for post in self.posts:
55 | list_of_content.append(self.get_forum_content(post))
56 |
57 | self.content = list_of_content
58 |
59 | def save_content(self):
60 | """
61 |         Save list_of_content to a file.
62 |         """
63 |         raise NotImplementedError()
64 |
65 | @classmethod
66 | def filter_func(cls, tag, prefix):
67 | if tag.has_attr('class'):
68 | class_str = ' '.join(tag['class'])
69 | # return class_str.startswith('node node--id')
70 | return class_str.startswith(prefix)
71 | return False
72 |
73 | class MentalHealth(Forum):
74 | def __init__(self, task_name, start_url, wait_time):
75 | super().__init__(task_name, start_url, wait_time)
76 |
77 | def get_forum_pages(self):
78 | # get all sub forums
79 |
80 | sub_forums = []
81 | response = self.session.get(self.url)
82 | soup = BeautifulSoup(response.text, 'html.parser')
83 | for link in soup.find_all(lambda tag: self.filter_func(tag, 'node node--id')):
84 | sub_forums.append(link.find('a')['href'])
85 |
86 | # get all posts from each forum
87 | posts = []
88 | for sub_forum in sub_forums:
89 | response = self.session.get(sub_forum)
90 | soup = BeautifulSoup(response.text, 'html.parser')
91 | for link in soup.find_all(lambda tag: self.filter_func(tag, 'structItem structItem--thread')):
92 | posts.append(link['href'])
93 |
94 | self.posts = posts
95 |
96 | def get_forum_content(self, page_url):
97 | response = self.session.get(page_url)
98 | soup = BeautifulSoup(response.text, 'html.parser')
99 | title = soup.find('h1', {'class': 'p-title-value'}).text
100 | list_of_content = []
101 | for article in soup.find_all(lambda tag: self.filter_func(tag, 'message message--post ')):
102 | author = article['data-author']
103 | content = article.find('div', {'class': 'bbWrapper'})
104 | if content:
105 | mentioned = content.find_all('a', {'class': 'username'})
106 | content = content.text
107 | else:
108 | continue
109 |
110 | footer = article.find('ul', {'class': 'sv-rating-bar__ratings'})
111 | if footer:
112 | ratings = footer.find_all('li', {'class': 'sv-rating sv-rating--empty-list'})
113 |                     rating_sum = sum([int(rate.text) for rate in ratings])
114 | else:
115 | rating_sum = 0
116 |
117 | list_of_content.append({
118 | 'author': author,
119 | 'content': content,
120 | 'mentioned': mentioned,
121 | 'rating': rating_sum
122 | })
123 |
124 | next_page = soup.find('a', {'class': 'pageNav-jump pageNav-jump--next'})
125 | if next_page:
126 | next_page = next_page['href']
127 | list_of_content += self.get_forum_content(next_page)
128 |
129 | return list_of_content
130 |
131 |
132 | class Reddit:
133 | def __init__(self, subreddits, time_filter, num_posts, save_path, time_limit = None):
134 | self.subreddits = subreddits
135 | self.time_filter = time_filter
136 | self.num_posts = num_posts
137 | self.save_path = save_path
138 | self.time_limit = time_limit
139 |
140 | self.reddit = praw.Reddit('DataCollector')
141 | self.posts = self.get_reddit_posts()
142 | # self.dump_posts()
143 |
144 | def created_after_time_limit(self, created_utc):
145 | if self.time_limit is None:
146 | return True
147 | dt_object = datetime.datetime.fromtimestamp(created_utc)
148 | return dt_object >= self.time_limit
149 |
150 | def get_reddit_posts(self):
151 | # all_posts = {}
152 | for subreddit in self.subreddits:
153 | subreddit_ = subreddit
154 | subreddit = self.reddit.subreddit(subreddit)
155 | list_of_posts = []
156 | for post in subreddit.top(time_filter = self.time_filter, limit=self.num_posts):
157 | created_time = post.created_utc
158 | if not self.created_after_time_limit(created_time): continue
159 | created_time_str = datetime.datetime.fromtimestamp(created_time).strftime('%Y-%m-%d %H:%M:%S')
160 | title = post.title
161 | content = post.selftext
162 |
163 | for i in range(3):
164 | try:
165 | comments = self.deal_with_comments(post.comments.list())
166 | except praw.exceptions.APIException as e:
167 | traceback.print_exc()
168 | time.sleep(10)
169 | else:
170 | break
171 |
172 | score = post.score
173 | the_post = {
174 | 'title': title,
175 | 'content': content,
176 | 'comments': comments,
177 | 'created_time': created_time_str,
178 | 'score': score,
179 | 'subreddit': subreddit_
180 | }
181 | list_of_posts.append(the_post)
182 | self.dump_posts(list_of_posts, subreddit_)
183 | # all_posts[subreddit_] = list_of_posts
184 | # return all_posts
185 |
186 | def deal_with_comments(self, comments, depth = 3):
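        # Recursively collect comments up to `depth` levels deep, skipping
        # "load more comments" placeholders (praw.models.MoreComments).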
187 | results = []
188 | if depth < 0: return results
189 | depth -= 1
190 | for comment in comments:
191 | if isinstance(comment, praw.models.MoreComments): continue
192 | author = comment.author
193 | content = comment.body
194 | score = comment.score
195 | created_time = comment.created_utc
196 | created_time_str = datetime.datetime.fromtimestamp(created_time).strftime('%Y-%m-%d %H:%M:%S')
197 | replies = comment.replies
198 | if len(replies):
199 | replies = self.deal_with_comments(replies, depth=depth)
200 | else: replies = []
201 | the_comment = {
202 | 'author': author.name if author is not None else '',
203 | 'content': content,
204 | 'score': score,
205 | 'created_time': created_time_str,
206 | 'replies': replies
207 | }
208 | results.append(the_comment)
209 | return results
210 |
211 | def dump_posts(self, list_of_posts, subreddit = None):
212 | path = os.path.join(self.save_path, f"{subreddit if subreddit is not None else 'all'}.json")
213 | with open(path, 'w') as f:
214 | json.dump(list_of_posts, f)
215 |
216 | if __name__ == '__main__':
217 |     # XDG_CONFIG_HOME must point to the directory containing the praw.ini config file
218 | cwd, = sys.argv[1:]
219 | data_collectors = Reddit(['investing', 'wallstreetbets', 'CryptoCurrency', 'politics', 'healthcare'], 'month', 100, cwd, time_limit=datetime.datetime(2023, 7, 1))
--------------------------------------------------------------------------------
/data/squad_wiki_title.text:
--------------------------------------------------------------------------------
1 | Queen_Victoria
2 | Grape
3 | Athanasius_of_Alexandria
4 | Lighting
5 | BBC_Television
6 | Federal_Bureau_of_Investigation
7 | Punjab,_Pakistan
8 | Capacitor
9 | Sino-Tibetan_relations_during_the_Ming_dynasty
10 | History_of_India
11 | Plymouth
12 | Space_Race
13 | Myocardial_infarction
14 | The_Times
15 | Franco-Prussian_War
16 | Literature
17 | War_on_Terror
18 | Aircraft_carrier
19 | Turner_Classic_Movies
20 | Royal_assent
21 | Muslim_world
22 | Sahara
23 | Galicia_(Spain)
24 | YouTube
25 | Santa_Monica,_California
26 | Imperial_College_London
27 | Textual_criticism
28 | Sichuan
29 | Institute_of_technology
30 | Railway_electrification_system
31 | Mesozoic
32 | Cyprus
33 | The_Sun_(United_Kingdom)
34 | Order_of_the_British_Empire
35 | Republic_of_the_Congo
36 | Materialism
37 | Qing_dynasty
38 | To_Kill_a_Mockingbird
39 | Greece
40 | 2008_Sichuan_earthquake
41 | Edmund_Burke
42 | Northwestern_University
43 | CBC_Television
44 | Germans
45 | Race_and_ethnicity_in_the_United_States_Census
46 | Iranian_languages
47 | Adolescence
48 | Armenia
49 | Intellectual_property
50 | Law_of_the_United_States
51 | Hanover
52 | Tuberculosis
53 | Dialect
54 | Josip_Broz_Tito
55 | Political_philosophy
56 | Bern
57 | Pitch_(music)
58 | Pope_John_XXIII
59 | Black_people
60 | List_of_numbered_streets_in_Manhattan
61 | Montevideo
62 | Nigeria
63 | Paper
64 | Swaziland
65 | Liberal_Party_of_Australia
66 | Seven_Years%27_War
67 | Zinc
68 | Treaty
69 | Hellenistic_period
70 | London
71 | European_Central_Bank
72 | Thuringia
73 | Circadian_rhythm
74 | Estonian_language
75 | Cork_(city)
76 | Westminster_Abbey
77 | Data_compression
78 | United_States_Air_Force
79 | Separation_of_powers_under_the_United_States_Constitution
80 | On_the_Origin_of_Species
81 | Nanjing
82 | Zhejiang
83 | Late_Middle_Ages
84 | PlayStation_3
85 | Neptune
86 | Carnival
87 | Hindu_philosophy
88 | Dell
89 | Everton_F.C.
90 | Armenians
91 | Samurai
92 | Federal_Aviation_Administration
93 | Spanish_language_in_the_United_States
94 | Alps
95 | Digimon
96 | Compact_disc
97 | God
98 | Botany
99 | Heresy
100 | The_Bronx
101 | Roman_Republic
102 | Wayback_Machine
103 | Airport
104 | Red
105 | Internet_service_provider
106 | Chicago_Cubs
107 | Detroit
108 | Culture
109 | New_York_City
110 | Marshall_Islands
111 | Hyderabad
112 | Pharmaceutical_industry
113 | Saint_Helena
114 | Oklahoma_City
115 | Bras%C3%ADlia
116 | Korean_War
117 | Biodiversity
118 | Brigham_Young_University
119 | Oklahoma
120 | Eton_College
121 | Alfred_North_Whitehead
122 | Russian_language
123 | A_cappella
124 | Richmond,_Virginia
125 | Genocide
126 | Great_Plains
127 | British_Empire
128 | Emotion
129 | Comics
130 | Napoleon
131 | MP3
132 | England_national_football_team
133 | Green
134 | Palermo
135 | Freemasonry
136 | Letter_case
137 | Communications_in_Somalia
138 | Exhibition_game
139 | Hard_rock
140 | Somalis
141 | University
142 | Pacific_War
143 | San_Diego
144 | British_Isles
145 | Mosaic
146 | Pesticide
147 | Bill_%26_Melinda_Gates_Foundation
148 | University_of_Notre_Dame
149 | Hunter-gatherer
150 | Hokkien
151 | Economy_of_Greece
152 | Windows_8
153 | Universal_Studios
154 | Nintendo_Entertainment_System
155 | St._John%27s,_Newfoundland_and_Labrador
156 | Immaculate_Conception
157 | Southeast_Asia
158 | Rajasthan
159 | Mammal
160 | Communication
161 | Greeks
162 | Chihuahua_(state)
163 | Database
164 | Orthodox_Judaism
165 | Ashkenazi_Jews
166 | Immunology
167 | Flowering_plant
168 | Capital_punishment_in_the_United_States
169 | Switzerland
170 | Christian
171 | Beyoncé
172 | Tristan_da_Cunha
173 | Diarrhea
174 | Architecture
175 | East_India_Company
176 | Aspirated_consonant
177 | Valencia
178 | Gene
179 | Crucifixion_of_Jesus
180 | Financial_crisis_of_2007%E2%80%9308
181 | Asthma
182 | Central_African_Republic
183 | Predation
184 | Computer_security
185 | Protestantism
186 | Russian_Soviet_Federative_Socialist_Republic
187 | Israel
188 | Neoclassical_architecture
189 | Elevator
190 | Frédéric_Chopin
191 | Group_(mathematics)
192 | Glacier
193 | Gamal_Abdel_Nasser
194 | Incandescent_light_bulb
195 | Old_English
196 | Antenna_(radio)
197 | States_of_Germany
198 | IBM
199 | Virgil
200 | Montana
201 | Pain
202 | Mexico_City
203 | Infection
204 | Slavs
205 | Friedrich_Hayek
206 | Multiracial_American
207 | Alaska
208 | Buddhism
209 | Kathmandu
210 | Yale_University
211 | Guinea-Bissau
212 | Anti-aircraft_warfare
213 | Solar_energy
214 | Affirmative_action_in_the_United_States
215 | 2008_Summer_Olympics_torch_relay
216 | Human_Development_Index
217 | Guam
218 | Party_leaders_of_the_United_States_House_of_Representatives
219 | FC_Barcelona
220 | Professional_wrestling
221 | Strasbourg
222 | Richard_Feynman
223 | Wood
224 | Royal_Institute_of_British_Architects
225 | Myanmar
226 | Paris
227 | Southampton
228 | Georgian_architecture
229 | Royal_Dutch_Shell
230 | Madrasa
231 | Department_store
232 | Adult_contemporary_music
233 | Quran
234 | Near_East
235 | Dutch_Republic
236 | George_VI
237 | Imamah_(Shia_doctrine)
238 | History_of_science
239 | Arena_Football_League
240 | Crimean_War
241 | Appalachian_Mountains
242 | Canadian_football
243 | Association_football
244 | Infrared
245 | Dutch_language
246 | Eritrea
247 | Saint_Barth%C3%A9lemy
248 | Catalan_language
249 | Samoa
250 | Sexual_orientation
251 | Atlantic_City,_New_Jersey
252 | Classical_music
253 | Dominican_Order
254 | Warsaw_Pact
255 | Antarctica
256 | Lancashire
257 | American_Idol
258 | John_von_Neumann
259 | Copper
260 | Southern_Europe
261 | BeiDou_Navigation_Satellite_System
262 | Ottoman_Empire
263 | General_Electric
264 | Heian_period
265 | Humanism
266 | Digestion
267 | Unicode
268 | Computer
269 | United_States_dollar
270 | Madonna_(entertainer)
271 | FA_Cup
272 | East_Prussia
273 | Religion_in_ancient_Rome
274 | Bermuda
275 | Supreme_court
276 | Washington_University_in_St._Louis
277 | Xbox_360
278 | Cotton
279 | Melbourne
280 | North_Carolina
281 | Tibet
282 | Super_Nintendo_Entertainment_System
283 | Boston
284 | Pope_Paul_VI
285 | Idealism
286 | Education
287 | Baptists
288 | Tajikistan
289 | Tucson,_Arizona
290 | Namibia
291 | Dwight_D._Eisenhower
292 | Rule_of_law
293 | Jews
294 | Norfolk_Island
295 | Police
296 | Chinese_characters
297 | Annelid
298 | Hunting
299 | Software_testing
300 | LaserDisc
301 | Indigenous_peoples_of_the_Americas
302 | Portugal
303 | Cubism
304 | Bird
305 | Uranium
306 | Raleigh,_North_Carolina
307 | Alexander_Graham_Bell
308 | Nutrition
309 | Neolithic
310 | Asphalt
311 | Cardinal_(Catholicism)
312 | Houston
313 | Mary_(mother_of_Jesus)
314 | United_States_presidential_election,_2004
315 | Prime_minister
316 | Genome
317 | Utrecht
318 | Charleston,_South_Carolina
319 | Kievan_Rus%27
320 | Premier_League
321 | Presbyterianism
322 | Insect
323 | John_Kerry
324 | Karl_Popper
325 | Comprehensive_school
326 | Philadelphia
327 | Seattle
328 | Glass
329 | Sanskrit
330 | Iran
331 | Labour_Party_(UK)
332 | Separation_of_church_and_state_in_the_United_States
333 | Nonprofit_organization
334 | Philosophy_of_space_and_time
335 | Pub
336 | National_Archives_and_Records_Administration
337 | Middle_Ages
338 | Szlachta
339 | House_music
340 | Gramophone_record
341 | Czech_language
342 | Vacuum
343 | Central_Intelligence_Agency
344 | Film_speed
345 | Himachal_Pradesh
346 | Phonology
347 | Canadian_Armed_Forces
348 | Muammar_Gaddafi
349 | Dissolution_of_the_Soviet_Union
350 | High-definition_television
351 | Alloy
352 | Arsenal_F.C.
353 | New_Delhi
354 | Translation
355 | USB
356 | Transistor
357 | Tuvalu
358 | Somerset
359 | Renewable_energy_commercialization
360 | Videoconferencing
361 | Political_party
362 | Gregorian_calendar
363 | Serbo-Croatian
364 | United_Nations_Population_Fund
365 | Brain
366 | ASCII
367 | Ministry_of_Defence_(United_Kingdom)
368 | Mandolin
369 | Antibiotics
370 | Great_power
371 | Beer
372 | Spectre_(2015_film)
373 | Apollo
374 | Energy
375 | Avicenna
376 | Gothic_architecture
377 | Steven_Spielberg
378 | Animal
379 | Geological_history_of_Earth
380 | Miami
381 | University_of_Kansas
382 | Daylight_saving_time
383 | Identity_(social_science)
384 | Canon_law
385 | Sumer
386 | Modern_history
387 | Planck_constant
388 | Child_labour
389 | Buckingham_Palace
390 | Sony_Music_Entertainment
391 | Age_of_Enlightenment
392 | Tennessee
393 | Electric_motor
394 | Marvel_Comics
395 | Federalism
396 | Mali
397 | Geography_of_the_United_States
398 | The_Legend_of_Zelda:_Twilight_Princess
399 | Kanye_West
400 | Molotov%E2%80%93Ribbentrop_Pact
401 | Umayyad_Caliphate
402 | Estonia
403 | Race_(human_categorization)
404 | New_Haven,_Connecticut
405 | Endangered_Species_Act
406 | Symbiosis
407 | Military_history_of_the_United_States
408 | Dog
409 | Printed_circuit_board
410 | Empiricism
411 | The_Blitz
412 | Han_dynasty
413 | Light-emitting_diode
414 | Alsace
415 | United_States_Army
416 | Macintosh
417 | Clothing
418 | Comcast
419 | Elizabeth_II
420 | Liberia
421 | Jehovah%27s_Witnesses
422 | 51st_state
423 | IPod
424 | Bacteria
425 | Matter
426 | Poultry
427 | Gymnastics
428 | John,_King_of_England
429 | Time
430 | Arnold_Schwarzenegger
431 | Queen_(band)
432 | Memory
433 | Florida
434 | Political_corruption
435 | Web_browser
436 | Hydrogen
437 | Ann_Arbor,_Michigan
438 | Bird_migration
439 | Post-punk
440 | Anthropology
441 | Copyright_infringement
442 | Egypt
--------------------------------------------------------------------------------
/data/arxiv_dataset.py:
--------------------------------------------------------------------------------
1 | import arxiv
2 | import datetime
3 | from queue import Queue
4 | from threading import Thread, Lock
5 | import os
6 | import logging
7 | import time
8 | import tarfile
9 | from pylatexenc.latexwalker import LatexWalker, LatexEnvironmentNode, LatexMacroNode
10 | from pylatexenc import latex2text
11 | from pylatexenc.macrospec import LatexContextDb
12 | import shutil
13 | import re
14 | import json
15 | from glob import glob
16 | from huggingface_hub import create_branch, create_tag, RepoCard
17 | import datasets
18 | import sys
19 |
20 | def filter_element(context, exclude_elements = []):
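    # Rebuild the latex2text context database without the given macros/environments/specials
    # (used below to drop the 'href' macro when converting LaTeX to plain text).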
21 |
22 | new_context = LatexContextDb()
23 |
24 | new_context.unknown_macro_spec = context.unknown_macro_spec
25 | new_context.unknown_environment_spec = context.unknown_environment_spec
26 | new_context.unknown_specials_spec = context.unknown_specials_spec
27 |
28 | filter_element_func = lambda dict_to_filter: {k:v for k,v in dict_to_filter.items() if k not in exclude_elements}.values()
29 | for cat in context.category_list:
30 |
31 | # include this category
32 | new_context.add_context_category(
33 | cat,
34 | macros=filter_element_func(context.d[cat]['macros']),
35 | environments=filter_element_func(context.d[cat]['environments']),
36 | specials=filter_element_func(context.d[cat]['specials']),
37 | )
38 |
39 | return new_context
40 |
41 | class TextExtractor:
42 |
43 | def __init__(self):
44 | self.l2t_context_db = latex2text.get_default_latex_context_db()
45 | self.l2t_context_db = filter_element(self.l2t_context_db, ['href'])
46 |
47 | self.l2t = latex2text.LatexNodes2Text(latex_context=self.l2t_context_db)
48 |
49 | def extract(self, latex_code):
50 | result = parse_tex_ignore_figures(latex_code)
51 | return self.l2t.nodelist_to_text(result)
52 |
53 | def remove_figure_nodes(node_list):
54 | filtered_node_list = []
55 | for node in node_list:
56 | # Ignore the 'figure' environment
57 | if node.isNodeType(LatexEnvironmentNode):
58 | if node.environmentname in [ 'figure', 'figure*', 'algorithm', 'table', 'table*', 'algorithmic']:
59 | continue
60 | if hasattr(node, 'nodelist'):
61 | node.nodelist = remove_figure_nodes(node.nodelist)
62 | filtered_node_list.append(node)
63 | return filtered_node_list
64 |
65 | def parse_tex_ignore_figures(tex_code):
66 | walker = LatexWalker(tex_code)
67 | parsed = walker.get_latex_nodes()[0]
68 |
69 | for node in parsed:
70 | if node.isNodeType(LatexEnvironmentNode):
71 | if node.environmentname == 'document':
72 | parsed = [node]
73 | break
74 |
75 | filtered_nodes = remove_figure_nodes(parsed)
76 | return filtered_nodes
77 |
78 | def resolve_input_commands(latex_code, base_dir="."):
79 | input_pattern = re.compile(r"(? 1:
159 | if 'main.tex' in tex_files: tex_files = ['main.tex']
160 | else:
161 | self.logger.info(f'------ Found multiple tex files: {tex_files}')
162 | return
163 | elif len(tex_files) == 0:
164 | self.logger.info(f'------ Found no tex files')
165 | return
166 | tex_file = tex_files[0]
167 | with open(f'./{paper_id}/{tex_file}', 'r', encoding='utf-8', errors='ignore') as f:
168 | latex_code = f.read()
169 | if '\\input' in latex_code:
170 | latex_code = resolve_input_commands(latex_code, base_dir=f'./{paper_id}')
171 | text = self.text_extractor.extract(latex_code)
172 |
173 | meta_data['text'] = text
174 | with open(f'{self.text_save_dir}/{paper_id}.json', 'w') as f:
175 | json.dump(meta_data, f, ensure_ascii=False)
176 |
177 | self.logger.info(f'------ Saved {paper_id}.json')
178 |
179 | except Exception as e:
180 | self.logger.error(f'ERROR: {e}')
181 | time.sleep(3)
182 | return
183 |
184 | finally:
185 | shutil.rmtree(f'./{paper_id}')
186 | os.remove(f'{paper_id}.arxiv_source')
187 |
188 |
189 | if __name__ == '__main__':
190 | hf_token = os.environ['HF_TOKEN']
191 | year, month, save_dir, = sys.argv[1:]
192 | month = int(month) % 12 + 1  # shift to the following month number (December wraps to January; the year is not advanced)
193 |
194 | if f'{year}-{month:02d}' in ['2021-01', '2021-02', '2021-03']:
195 | print(f"Skip {year}-{month:02d}")
196 | exit()
197 |
198 | time_stamp = f'{year}-{month:02d}'
199 |
200 | first_day = datetime.date(int(year), int(month), 1)
201 | last_day = datetime.date(int(year), int(month), 28)  # cap the range at the 28th so the date is valid in every month; later submissions are not queried
202 |
203 | start_time_str = first_day.strftime("%Y%m%d%H%M%S")
204 | end_time_str = last_day.strftime("%Y%m%d%H%M%S")
205 |
206 | text_save_dir = os.path.join(save_dir, time_stamp)
207 | if not os.path.exists(text_save_dir):
208 | os.makedirs(text_save_dir)
209 |
210 | search = arxiv.Search(
211 | query=f'submittedDate:[{start_time_str} TO {end_time_str}]',
212 | sort_by = arxiv.SortCriterion.SubmittedDate,
213 | sort_order=arxiv.SortOrder.Descending,
214 | max_results=800
215 | )
216 |
217 | q = Queue()
218 | num_threads = 4
219 |
220 | for i in range(num_threads):
221 | worker = Worker(q, i, text_save_dir,)
222 | worker.daemon = True
223 | worker.start()
224 |
225 | for index, result in enumerate(search.results()):
226 | q.put((index, result))
227 |
228 | q.join()
229 |
230 | print(f"Finished {time_stamp}")
231 |
232 | # files = glob(f'{text_save_dir}/*.json')
233 | # ds = datasets.load_dataset('json', data_files=files, split='train')
234 |
235 | # ds.push_to_hub(
236 | # "RealTimeData/arxiv_alltime",
237 | # config_name=time_stamp,
238 | # token=hf_token,
239 | # )
--------------------------------------------------------------------------------
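
This backfill script writes one JSON file per paper under `{save_dir}/{YYYY-MM}/`; the commented-out lines at the end show the intended follow-up of loading those files into a Hugging Face dataset. A minimal sketch of that step, assuming an example path of `arxiv_data/2023-08`:

```python
import datasets
from glob import glob

# Example path only: save_dir and the YYYY-MM time stamp come from the CLI arguments.
files = glob('arxiv_data/2023-08/*.json')
ds = datasets.load_dataset('json', data_files=files, split='train')
print(ds)
```
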
/data/monthly_updater/monthly_arxiv.py:
--------------------------------------------------------------------------------
1 | import arxiv
2 | import datetime
3 | from queue import Queue
4 | from threading import Thread, Lock
5 | import os
6 | import logging
7 | import time
8 | import tarfile
9 | from pylatexenc.latexwalker import LatexWalker, LatexEnvironmentNode, LatexMacroNode
10 | from pylatexenc import latex2text
11 | from pylatexenc.macrospec import LatexContextDb
12 | import shutil
13 | import re
14 | import json
15 | from glob import glob
16 | from huggingface_hub import create_branch, create_tag, RepoCard
17 | import datasets
18 | import sys
19 |
20 | def filter_element(context, exclude_elements = []):
21 |
22 | new_context = LatexContextDb()
23 |
24 | new_context.unknown_macro_spec = context.unknown_macro_spec
25 | new_context.unknown_environment_spec = context.unknown_environment_spec
26 | new_context.unknown_specials_spec = context.unknown_specials_spec
27 |
28 | filter_element_func = lambda dict_to_filter: {k:v for k,v in dict_to_filter.items() if k not in exclude_elements}.values()
29 | for cat in context.category_list:
30 |
31 | # include this category
32 | new_context.add_context_category(
33 | cat,
34 | macros=filter_element_func(context.d[cat]['macros']),
35 | environments=filter_element_func(context.d[cat]['environments']),
36 | specials=filter_element_func(context.d[cat]['specials']),
37 | )
38 |
39 | return new_context
40 |
41 | class TextExtractor:
42 |
43 | def __init__(self):
44 | self.l2t_context_db = latex2text.get_default_latex_context_db()
45 | self.l2t_context_db.add_context_category(
46 | 'Abstract',
47 | macros={},
48 | environments=[
49 | latex2text.EnvironmentTextSpec("abstract", simplify_repl=r'§ ABSTRACT %(body)s'),
50 | latex2text.EnvironmentTextSpec("Abstract", simplify_repl=r'§ ABSTRACT %(body)s')
51 | ],
52 | specials={}
53 | )
54 | self.l2t_context_db = filter_element(self.l2t_context_db, ['href'])
55 |
56 | self.l2t = latex2text.LatexNodes2Text(latex_context=self.l2t_context_db)
57 |
58 | def extract(self, latex_code):
59 | result = parse_tex_ignore_figures(latex_code)
60 | return self.l2t.nodelist_to_text(result)
61 |
62 | def remove_figure_nodes(node_list):
63 | filtered_node_list = []
64 | for node in node_list:
65 | # Ignore the 'figure' environment
66 | if node.isNodeType(LatexEnvironmentNode):
67 | if node.environmentname in [ 'figure', 'figure*', 'algorithm', 'table', 'table*', 'algorithmic']:
68 | continue
69 | if hasattr(node, 'nodelist'):
70 | node.nodelist = remove_figure_nodes(node.nodelist)
71 | filtered_node_list.append(node)
72 | return filtered_node_list
73 |
74 | def parse_tex_ignore_figures(tex_code):
75 | walker = LatexWalker(tex_code)
76 | parsed = walker.get_latex_nodes()[0]
77 |
78 | for node in parsed:
79 | if node.isNodeType(LatexEnvironmentNode):
80 | if node.environmentname == 'document':
81 | parsed = [node]
82 | break
83 |
84 | filtered_nodes = remove_figure_nodes(parsed)
85 | return filtered_nodes
86 |
87 | def resolve_input_commands(latex_code, base_dir="."):
88 | input_pattern = re.compile(r"(? 1:
168 | if 'main.tex' in tex_files: tex_files = ['main.tex']
169 | else:
170 | self.logger.info(f'------ Found multiple tex files: {tex_files}')
171 | return
172 | elif len(tex_files) == 0:
173 | self.logger.info(f'------ Found no tex files')
174 | return
175 | tex_file = tex_files[0]
176 | with open(f'./{paper_id}/{tex_file}', 'r', encoding='utf-8', errors='ignore') as f:
177 | latex_code = f.read()
178 | if '\\input' in latex_code:
179 | latex_code = resolve_input_commands(latex_code, base_dir=f'./{paper_id}')
180 | text = self.text_extractor.extract(latex_code)
181 |
182 | meta_data['text'] = text
183 | with open(f'{self.text_save_dir}/{paper_id}.json', 'w') as f:
184 | json.dump(meta_data, f, ensure_ascii=False)
185 |
186 | self.logger.info(f'------ Saved {paper_id}.json')
187 |
188 | except Exception as e:
189 | self.logger.error(f'ERROR: {e}')
190 | time.sleep(3)
191 | return
192 |
193 | finally:
194 | shutil.rmtree(f'./{paper_id}')
195 | os.remove(f'{paper_id}.arxiv_source')
196 |
197 |
198 | if __name__ == '__main__':
199 | today = datetime.date.today()
200 | year = today.year
201 | month = today.month
202 | save_dir = './arxiv_data/'
203 |
204 | hf_token = os.environ['HF_TOKEN']
205 | time_stamp = f'{year}-{month:02d}'
206 |
207 | first_day = datetime.date(int(year), int(month), 1)
208 | last_day = datetime.date(int(year), int(month), 28)  # cap the range at the 28th so the date is valid in every month; later submissions are not queried
209 |
210 | start_time_str = first_day.strftime("%Y%m%d%H%M%S")
211 | end_time_str = last_day.strftime("%Y%m%d%H%M%S")
212 |
213 | text_save_dir = os.path.join(save_dir, time_stamp)
214 | if not os.path.exists(text_save_dir):
215 | os.makedirs(text_save_dir)
216 |
217 | search = arxiv.Search(
218 | query=f'submittedDate:[{start_time_str} TO {end_time_str}]',
219 | sort_by = arxiv.SortCriterion.SubmittedDate,
220 | sort_order=arxiv.SortOrder.Descending,
221 | max_results=1000
222 | )
223 |
224 | q = Queue()
225 | num_threads = 4
226 |
227 | for i in range(num_threads):
228 | worker = Worker(q, i, text_save_dir,)
229 | worker.daemon = True
230 | worker.start()
231 |
232 | for index, result in enumerate(search.results()):
233 | q.put((index, result))
234 |
235 | q.join()
236 |
237 | print(f"Finished {time_stamp}")
238 |
239 | files = glob(f'{text_save_dir}/*.json')
240 | ds = datasets.load_dataset('json', data_files=files, split='train')
241 |
242 | ds.push_to_hub(
243 | "RealTimeData/arxiv_alltime",
244 | config_name=time_stamp,
245 | token=hf_token,
246 | )
--------------------------------------------------------------------------------
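
The monthly job above pushes each month as its own configuration of `RealTimeData/arxiv_alltime`, named `YYYY-MM`, so a specific month can be pulled back by configuration name. A small usage sketch, with `2023-08` as an example month:

```python
import datasets

# Each monthly snapshot is a separate configuration named after the month.
ds = datasets.load_dataset('RealTimeData/arxiv_alltime', '2023-08', split='train')
print(len(ds), ds.column_names)
```
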
/arxiv_downloader.py:
--------------------------------------------------------------------------------
1 | import arxiv
2 | import datetime
3 | from queue import Queue
4 | from threading import Thread, Lock
5 | import os
6 | import logging
7 | import time
8 | import tarfile
9 | from pylatexenc.latexwalker import LatexWalker, LatexEnvironmentNode, LatexMacroNode
10 | from pylatexenc import latex2text
11 | from pylatexenc.macrospec import LatexContextDb
12 | import shutil
13 | import re
14 | import json
15 | from glob import glob
16 | from huggingface_hub import create_branch, create_tag, RepoCard
17 | import datasets
18 | import sys
19 |
20 | def filter_element(context, exclude_elements = []):
21 |
22 | new_context = LatexContextDb()
23 |
24 | new_context.unknown_macro_spec = context.unknown_macro_spec
25 | new_context.unknown_environment_spec = context.unknown_environment_spec
26 | new_context.unknown_specials_spec = context.unknown_specials_spec
27 |
28 | filter_element_func = lambda dict_to_filter: {k:v for k,v in dict_to_filter.items() if k not in exclude_elements}.values()
29 | for cat in context.category_list:
30 |
31 | # include this category
32 | new_context.add_context_category(
33 | cat,
34 | macros=filter_element_func(context.d[cat]['macros']),
35 | environments=filter_element_func(context.d[cat]['environments']),
36 | specials=filter_element_func(context.d[cat]['specials']),
37 | )
38 |
39 | return new_context
40 |
41 | class TextExtractor:
42 |
43 | def __init__(self):
44 | self.l2t_context_db = latex2text.get_default_latex_context_db()
45 | self.l2t_context_db.add_context_category(
46 | 'Abstract',
47 | macros={},
48 | environments=[
49 | latex2text.EnvironmentTextSpec("abstract", simplify_repl=r'§ ABSTRACT %(body)s'),
50 | latex2text.EnvironmentTextSpec("Abstract", simplify_repl=r'§ ABSTRACT %(body)s')
51 | ],
52 | specials={}
53 | )
54 | self.l2t_context_db = filter_element(self.l2t_context_db, ['href'])
55 |
56 | self.l2t = latex2text.LatexNodes2Text(latex_context=self.l2t_context_db)
57 |
58 | def extract(self, latex_code):
59 | result = parse_tex_ignore_figures(latex_code)
60 | return self.l2t.nodelist_to_text(result)
61 |
62 | def remove_figure_nodes(node_list):
63 | filtered_node_list = []
64 | for node in node_list:
65 | # Ignore the 'figure' environment
66 | if node.isNodeType(LatexEnvironmentNode):
67 | if node.environmentname in [ 'figure', 'figure*', 'algorithm', 'table', 'table*', 'algorithmic']:
68 | continue
69 | if hasattr(node, 'nodelist'):
70 | node.nodelist = remove_figure_nodes(node.nodelist)
71 | filtered_node_list.append(node)
72 | return filtered_node_list
73 |
74 | def parse_tex_ignore_figures(tex_code):
75 | walker = LatexWalker(tex_code)
76 | parsed = walker.get_latex_nodes()[0]
77 |
78 | for node in parsed:
79 | if node.isNodeType(LatexEnvironmentNode):
80 | if node.environmentname == 'document':
81 | parsed = [node]
82 | break
83 |
84 | filtered_nodes = remove_figure_nodes(parsed)
85 | return filtered_nodes
86 |
87 | def resolve_input_commands(latex_code, base_dir="."):
88 | input_pattern = re.compile(r"(? 1:
168 | if 'main.tex' in tex_files: tex_files = ['main.tex']
169 | else:
170 | self.logger.info(f'------ Found multiple tex files: {tex_files}')
171 | return
172 | elif len(tex_files) == 0:
173 | self.logger.info(f'------ Found no tex files')
174 | return
175 | tex_file = tex_files[0]
176 | with open(f'./{paper_id}/{tex_file}', 'r', encoding='utf-8', errors='ignore') as f:
177 | latex_code = f.read()
178 | if '\\input' in latex_code:
179 | latex_code = resolve_input_commands(latex_code, base_dir=f'./{paper_id}')
180 | text = self.text_extractor.extract(latex_code)
181 |
182 | meta_data['text'] = text
183 | with open(f'{self.text_save_dir}/{paper_id}.json', 'w') as f:
184 | json.dump(meta_data, f, ensure_ascii=False)
185 |
186 | self.logger.info(f'------ Saved {paper_id}.json')
187 |
188 | except Exception as e:
189 | self.logger.error(f'ERROR: {e}')
190 | time.sleep(3)
191 | return
192 |
193 | finally:
194 | shutil.rmtree(f'./{paper_id}')
195 | os.remove(f'{paper_id}.arxiv_source')
196 |
197 |
198 | if __name__ == '__main__':
199 | hf_token = os.environ['HF_TOKEN']
200 |
201 | today = datetime.date.today()
202 | start_time = today - datetime.timedelta(days=7)
203 |
204 | start_time_str = start_time.strftime("%Y%m%d%H%M%S")
205 | end_time_str = today.strftime("%Y%m%d%H%M%S")
206 |
207 | text_save_dir = f'arxiv_{start_time_str}_to_{end_time_str}'
208 | if not os.path.exists(text_save_dir):
209 | os.makedirs(text_save_dir)
210 |
211 | search = arxiv.Search(
212 | query=f'submittedDate:[{start_time_str} TO {end_time_str}]',
213 | sort_by = arxiv.SortCriterion.SubmittedDate,
214 | sort_order=arxiv.SortOrder.Descending,
215 | max_results=1600
216 | )
217 |
218 | q = Queue()
219 | num_threads = 4
220 |
221 | for i in range(num_threads):
222 | worker = Worker(q, i, text_save_dir,)
223 | worker.daemon = True
224 | worker.start()
225 |
226 | for index, result in enumerate(search.results()):
227 | q.put((index, result))
228 |
229 | q.join()
230 |
231 | files = glob(f'{text_save_dir}/*.json')
232 | ds = datasets.load_dataset('json', data_files=files, split='train')
233 |
234 | try:
235 | create_branch('RealTimeData/arxiv_latest', branch=today.isoformat(), token=hf_token, repo_type='dataset')
236 | except:
237 | pass  # the branch may already exist from a previous run; ignore and push to it
238 | ds.push_to_hub('RealTimeData/arxiv_latest', token=hf_token, branch=today.isoformat())
239 | ds.push_to_hub('RealTimeData/arxiv_latest', token=hf_token, branch='main')
240 |
241 | text = f"""
242 | # Latest arXiv
243 |
244 | You can always access the latest arXiv papers via this dataset.
245 |
246 | We update the dataset weekly, every Sunday, so it always provides the arXiv papers submitted in the past week.
247 |
248 | The current dataset on the main branch contains the arXiv papers submitted from {start_time.isoformat()} to {today.isoformat()}.
249 |
250 | The data collection was conducted on {today.isoformat()}.
251 |
252 | Use the dataset via:
253 | ```
254 | ds = datasets.load_dataset('RealTimeData/arxiv_latest')
255 | ```
256 |
257 | # Previous versions
258 |
259 | You can access previous versions by requesting different branches.
260 |
261 | For example, you can find the 2023-08-20 version via:
262 | ```
263 | ds = datasets.load_dataset('RealTimeData/arxiv_latest', revision = '2023-08-20')
264 | ```
265 |
266 | Check all available versions by clicking the "Files and versions" button on the top bar.
267 | """
268 | card = RepoCard(text)
269 | card.push_to_hub('RealTimeData/arxiv_latest', repo_type='dataset', token=hf_token)
--------------------------------------------------------------------------------
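
The weekly downloader pushes every run both to `main` and to a branch named after the collection date, so past snapshots remain addressable by branch. A sketch of discovering and loading those dated branches, assuming a reasonably recent `huggingface_hub` that provides `list_repo_refs`:

```python
import datasets
from huggingface_hub import HfApi

# Each weekly run creates a dataset branch named after its collection date (ISO format).
refs = HfApi().list_repo_refs('RealTimeData/arxiv_latest', repo_type='dataset')
branches = sorted(ref.name for ref in refs.branches if ref.name != 'main')

# Load the most recent dated snapshot; any other branch name works the same way.
ds = datasets.load_dataset('RealTimeData/arxiv_latest', revision=branches[-1], split='train')
```
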
/data/wikipedia.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import mwparserfromhell
3 | import json
4 | import os
5 | from transformers import LlamaForCausalLM, LlamaTokenizerFast, AutoModelForCausalLM, AutoTokenizer, OPTForCausalLM
6 | import sys
7 | import torch
8 | from tqdm import tqdm
9 | import traceback
10 | from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
11 | import datasets
12 | import numpy as np
13 | import time
14 | import openai
15 | from doc_info import verbalise_docs
16 |
17 | WIKI_API_ENDPOINT = "https://en.wikipedia.org/w/api.php"
18 |
19 | def self_info(text, model, tokenizer, merge = False):
20 | def merge_sub_tokens(log_probs, word_ids):
21 | # merge log probs of sub_tokens
22 | merged_log_probs = []
23 | current_word_id = None
24 | current_word_log_prob = None
25 | counter = 1
26 |
27 | for log_prob, word_id in zip(log_probs, word_ids):
28 | if word_id is not None:
29 | if current_word_id != word_id:
30 | if current_word_id is not None:
31 | merged_log_probs.extend([current_word_log_prob] * counter)
32 | counter = 1
33 | current_word_id = word_id
34 | current_word_log_prob = log_prob
35 | else:
36 | counter += 1
37 | current_word_log_prob = current_word_log_prob + log_prob
38 |
39 | if current_word_id is not None:
40 | merged_log_probs.extend([current_word_log_prob] * counter)
41 |
42 | return merged_log_probs
43 |
44 | # this function is used to get the self-information of a text
45 | # the model should be a causal language model, e.g. GPT2LMHeadModel
46 |
47 | # tokenize the text
48 | text = f"{tokenizer.bos_token}{text}"
49 | encoding = tokenizer(text, return_tensors="pt", max_length=model.config.max_position_embeddings, truncation=True)
50 | encoding = encoding.to(model.device)
51 |
52 | # get the logits
53 | with torch.no_grad():
54 | logits = model(**encoding).logits
55 | probs = torch.softmax(logits, dim=-1)
56 | info = -torch.log(probs)
57 |
58 | input_ids = encoding['input_ids']
59 | input_ids_expaned = input_ids[:, 1:].unsqueeze(-1)
60 | info = info[:, :-1].gather(-1, input_ids_expaned).squeeze(-1).squeeze(0).tolist()
61 |
62 | tokens = [tokenizer.decode(token_) for token_ in input_ids.squeeze().tolist()[1:]]
63 | if merge:
64 | info = merge_sub_tokens(info, encoding.word_ids()[1:])
65 | return tokens, info
66 |
67 | def gpt3_self_info(text, num_retry = 5):
68 | # text = text[:1000]
69 | openai.api_key = os.environ["OPENAI_API_KEY"]
70 |
71 | for _ in range(num_retry):
72 | try:
73 | r = openai.Completion.create(
74 | model="curie",
75 | prompt=f"<|endoftext|>{text}",
76 | max_tokens=0,
77 | temperature=0,
78 | echo=True,
79 | logprobs=0,
80 | )
81 | break
82 | except Exception as e:
83 | print(e)
84 | time.sleep(1)
85 |
86 | result = r['choices'][0]
87 | tokens, logprobs = result["logprobs"]["tokens"][1:], result["logprobs"]["token_logprobs"][1:]
88 |
89 | assert len(tokens) == len(logprobs), f"Expected {len(tokens)} logprobs, got {len(logprobs)}"
90 |
91 | self_info = [ -logprob for logprob in logprobs]
92 | # TODO: deal with the first delimiter
93 | return tokens, self_info
94 |
95 | def fetch_recent_changes(from_date, to_date = '2023-08-01T00:00:00'):
96 | params = {
97 | "action": "query",
98 | "format": "json",
99 | "list": "recentchanges",
100 | "rcstart": to_date, # starting from the newer date
101 | "rcend": from_date, # ending at the older date
102 | "rctype": "new",
103 | "rcnamespace": "0",
104 | "rclimit": "500",
105 | "rcprop": "title|timestamp"
106 | }
107 | req = requests.Request('GET', WIKI_API_ENDPOINT, params=params).prepare()
108 | response = requests.get(WIKI_API_ENDPOINT, params=params).json()
109 |
110 | # Check if the response contains the expected data
111 | if 'query' in response and 'recentchanges' in response['query']:
112 | return [entry['title'] for entry in response['query']['recentchanges']]
113 | else:
114 | return []
115 |
116 | def fetch_content(title, date=None):
117 | params = {
118 | "action": "query",
119 | "format": "json",
120 | "titles": title,
121 | "prop": "revisions",
122 | "rvprop": "content",
123 | "rvlimit": "1",
124 | }
125 | if date: params["rvstart"] = date
126 | try:
127 | response = requests.get(WIKI_API_ENDPOINT, params=params)
128 | response.raise_for_status() # Will raise an error if the HTTP request returned an unsuccessful status code
129 | data = response.json()
130 | if 'error' in data:
131 | print(f"Error fetching content for {title}: {data['error']['info']}")
132 | return None
133 |
134 | page = next(iter(data['query']['pages'].values()))
135 | if 'revisions' not in page:
136 | print(f"No revisions found for {title}")
137 | return None
138 | content = page['revisions'][0]['*']
139 |
140 | # Check if the content is a redirect and skip if true
141 | if content.startswith("#REDIRECT"):
142 | print(f"{title} is a redirect page.")
143 | return None
144 | return content
145 |
146 | except Exception as e:
147 | print(f"An error occurred while fetching content for {title}: {str(e)}")
148 | traceback.print_exc() # This will print the full traceback
149 |
150 | return None
151 |
152 | def parse_to_plain_text(wikitext):
153 | parsed = mwparserfromhell.parse(wikitext)
154 | return parsed.strip_code()
155 |
156 | def select_token_window(text, token_count=400):
157 | tokens = text.split()
158 | if len(tokens) <= token_count:
159 | return text
160 | random_start = np.random.randint(0, len(tokens) - token_count)
161 | tokens = tokens[random_start:random_start + token_count]
162 | return ' '.join(tokens)
163 |
164 | def fetch_latest_and_historical_wiki_pages(cache_dir = '', historical_date = '2022-07-01T00:00:00Z', token_count = 300):
165 | # 1. Fetch the latest created pages from July 2023 and their content.
166 | recent_wiki_path = os.path.join(cache_dir, 'recent_wiki_pages.json')
167 | if not os.path.exists(recent_wiki_path):
168 | recent_titles = fetch_recent_changes("2023-07-01T00:00:00Z")
169 | recent_contents = [fetch_content(title) for title in tqdm(recent_titles)]
170 | recent_contents = [content for content in recent_contents if content is not None]
171 |
172 | data_to_save = {title: content for title, content in zip(recent_titles, recent_contents)}
173 | with open(recent_wiki_path, 'w') as file:
174 | json.dump(data_to_save, file, ensure_ascii=False, indent=4)
175 | else:
176 | with open(recent_wiki_path) as file:
177 | data_to_save = json.load(file)
178 | recent_titles = list(data_to_save.keys())
179 | recent_contents = list(data_to_save.values())
180 | recent_contents = [content for content in recent_contents if content is not None]
181 |
182 | # 2. Fetch a historical version of a specific title from July 2022.
183 | historical_wiki_path = os.path.join(cache_dir, 'historical_wiki_pages.json')
184 | if not os.path.exists(historical_wiki_path):
185 | with open(os.path.join(cache_dir, 'data/squad_wiki_title.text')) as f:
186 | titles = [line.strip() for line in f.readlines()]
187 | historical_contents = [fetch_content(title, historical_date) for title in tqdm(titles)]
188 | historical_contents = [content for content in historical_contents if content is not None]
189 | historical_to_save = {title: content for title, content in zip(titles, historical_contents)}
190 | with open(historical_wiki_path, 'w') as file:
191 | json.dump(historical_to_save, file, ensure_ascii=False, indent=4)
192 | else:
193 | with open(historical_wiki_path) as file:
194 | historical_to_save = json.load(file)
195 | historical_titles = list(historical_to_save.keys())
196 | historical_contents = list(historical_to_save.values())
197 | historical_contents = [content for content in historical_contents if content is not None]
198 |
199 | # 3. Parse the content to plain text.
200 | recent_plain_text_path = os.path.join(cache_dir, 'recent_plain_text.json')
201 | historical_plain_text_path = os.path.join(cache_dir, 'historical_plain_text.json')
202 | if not os.path.exists(recent_plain_text_path):
203 | plain_texts_recent = [parse_to_plain_text(content) for content in recent_contents]
204 | plain_texts_historical = [parse_to_plain_text(content) for content in historical_contents]
205 | with open(recent_plain_text_path, 'w') as file:
206 | json.dump(plain_texts_recent, file, ensure_ascii=False, indent=4)
207 | with open(historical_plain_text_path, 'w') as file:
208 | json.dump(plain_texts_historical, file, ensure_ascii=False, indent=4)
209 | else:
210 | with open(recent_plain_text_path) as file:
211 | plain_texts_recent = json.load(file)
212 | with open(historical_plain_text_path) as file:
213 | plain_texts_historical = json.load(file)
214 |
215 | # 4. Select a window of token_count tokens from each text.
216 | selected_windows_recent = [select_token_window(text, token_count=token_count) for text in plain_texts_recent]
217 | selected_windows_historical = [select_token_window(text, token_count=token_count) for text in plain_texts_historical]
218 |
219 | return selected_windows_recent, selected_windows_historical
220 |
221 | def prepare_comparing_data(datasets_and_texts_col, num_samples=200, token_count=300):
222 | # datasets_and_texts is a dict of list {dataset_name: col_name}
223 |
224 | datasets_and_texts = {}
225 | for dataset_name, col_name in datasets_and_texts_col.items():
226 | if dataset_name in ['quac', 'squad_v2', 'boolq', 'iohadrubin/mini_xsum', 'liyucheng/trivia_qa_wiki_val']:
227 | ds = datasets.load_dataset(dataset_name, split='validation')
228 | elif 'RealTimeData' in dataset_name:
229 | ds = datasets.load_dataset(dataset_name, split='train')
230 | ds = ds[col_name][:num_samples]
231 |
232 | datasets_and_texts[dataset_name + f'_{token_count}_words'] = [select_token_window(text, token_count=token_count) for text in ds]
233 | # datasets_and_texts[dataset_name + '_200_words'] = [select_token_window(text, token_count=200) for text in ds]
234 |
235 | return datasets_and_texts
236 |
237 | if __name__ == "__main__":
238 | cwd, model_name, token_count, = sys.argv[1:]
239 | token_count = int(token_count)
240 | batch_size = 8
241 |
242 | recent_snippets, historical_snippets = fetch_latest_and_historical_wiki_pages(cache_dir=cwd, token_count=token_count)
243 | recent_snippets = recent_snippets[:120]
244 | historical_snippets = historical_snippets[:120]
245 | wikipedia_and_texts = {
246 | 'wiki_recent': recent_snippets,
247 | 'wiki_historical': historical_snippets
248 | }
249 | # datasets_and_texts = prepare_comparing_data({
250 | # 'liyucheng/trivia_qa_wiki_val': 'wiki_context_sample'
251 | # 'RealTimeData/bbc_latest': 'content',
252 | # 'RealTimeData/bbc_2017': 'content',
253 | # 'iohadrubin/mini_xsum': 'document'
254 | # 'quac': 'context',
255 | # 'boolq': 'passage',
256 | # 'squad_v2': 'context',
257 | # 'RealTimeData/github_july_week1_2023': 'readme',
258 | # 'RealTimeData/arxiv_july_week1_2023': 'text',
259 | # 'RealTimeData/bbc_news_week1_july_2023': 'content',
260 | # }, token_count=token_count, num_samples=120)
261 | datasets_and_texts = verbalise_docs(num_words=token_count)
262 |
263 | if 'GPTQ' in model_name:
264 | # only llama-30b use gptq
265 | model = AutoGPTQForCausalLM.from_quantized(model_name, device = 'cuda:0', use_safetensors = True, disable_exllama=True if '30b' in model_name else False)
266 | tokenizer = LlamaTokenizerFast.from_pretrained(model_name)
267 | elif 'llama' in model_name.lower():
268 | model = LlamaForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map='auto')
269 | tokenizer = LlamaTokenizerFast.from_pretrained(model_name)
270 | elif 'opt' in model_name.lower():
271 | model = OPTForCausalLM.from_pretrained(model_name, device_map='auto')
272 | tokenizer = AutoTokenizer.from_pretrained(model_name)
273 | elif 'gpt2' == model_name.lower():
274 | model = AutoModelForCausalLM.from_pretrained(model_name)
275 | tokenizer = AutoTokenizer.from_pretrained(model_name)
276 |
277 | # datasets_and_texts = prepare_comparing_data({
278 | # 'RealTimeData/News_Seq_2021': 'maintext',
279 | # 'RealTimeData/News_August_2023': 'maintext',
280 | # })
281 |
282 | # datasets_and_texts.update(wikipedia_and_texts)
283 |
284 | print('=====================')
285 | print(f'Model: {model_name}')
286 |
287 | for dataset_name, texts in datasets_and_texts.items():
288 | print(f'=====================')
289 | print(f'Dataset: {dataset_name}')
290 | infos = []
291 | for text in tqdm(texts):
292 | try:
293 | if 'curie' in model_name.lower():
294 | tokens, info = gpt3_self_info(text)
295 | else:
296 | tokens, info = self_info(text, model, tokenizer)
297 | except:
298 | traceback.print_exc()
299 | time.sleep(10)
300 | continue
301 | # print('text:', text, '\ninfo:', info)
302 | infos.append(sum(info)/len(info))
303 | print(f'Average self-info: {sum(infos)/len(infos)}')
--------------------------------------------------------------------------------
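
The quantity this script reports is average per-token self-information, i.e. -log p(token | prefix) under a causal language model, averaged over each text window; lower values mean the model finds the window more predictable. A minimal sketch of the same computation with a small model (gpt2 is only an example; the script's own `self_info` additionally merges sub-word tokens and handles truncation and device placement):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained('gpt2')
tokenizer = AutoTokenizer.from_pretrained('gpt2')

def avg_self_info(text):
    # Prepend BOS so the first real token also receives a prediction.
    enc = tokenizer(tokenizer.bos_token + text, return_tensors='pt')
    with torch.no_grad():
        logits = model(**enc).logits
    log_probs = torch.log_softmax(logits, dim=-1)
    targets = enc['input_ids'][:, 1:]
    token_logp = log_probs[:, :-1].gather(-1, targets.unsqueeze(-1)).squeeze(-1)
    return (-token_logp).mean().item()  # average self-information in nats

print(avg_self_info('The quick brown fox jumps over the lazy dog.'))
```
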
/bbc_downloader.py:
--------------------------------------------------------------------------------
1 | import weakref
2 | import requests
3 |
4 | from configobj import ConfigObj
5 |
6 | class Configuration:
7 |
8 | def __init__(self):
9 | self.__properties = dict()
10 | properties = self._init_properties()
11 | for property_, value, transform_fn in properties:
12 | if transform_fn is not None:
13 | value = transform_fn(value)
14 | setattr(self, property_, value)
15 | self.__properties[property_] = {
16 | 'default-value': value,
17 | 'transform_fn': transform_fn
18 | }
19 |
20 | def _init_properties(self):
21 | # [[name, default-value, transform_fn]]
22 | return []
23 |
24 | # TODO: hierarchical config
25 | def load(self, path):
26 | config = ConfigObj(path, encoding='UTF-8')
27 | for property_, value in config.items():
28 | transform_fn = self.__properties[property_]['transform_fn']
29 | if transform_fn is not None:
30 | value = transform_fn(value)
31 | setattr(self, property_, value)
32 |
33 | from dateutil.relativedelta import relativedelta
34 | # from datetime import datetime, date
35 | import datetime
36 |
37 | class DatasetConfiguration(Configuration):
38 |
39 | def _format_date(self, date_str):
40 | return datetime.datetime.strptime(date_str, '%Y-%m-%d')
41 |
42 | def _calculate_step(self, step):
43 | step = int(step)
44 | if self.step_unit == 'day':
45 | return relativedelta(days=step)
46 | elif self.step_unit == 'month':
47 | return relativedelta(months=step)
48 | else:
49 | return relativedelta(years=step)
50 |
51 | def _init_properties(self):
52 | return [
53 | ['name', '', str],
54 | ['base_api_url', 'http://dracos.co.uk/made/bbc-news-archive/{year}/{month:0>2}/{day:0>2}/', str],
55 | ['start_date', '2016-01-01', self._format_date],
56 | ['end_date', '2017-01-01', self._format_date],
57 | ['step_unit', 'day', str],
58 | ['step', 1, self._calculate_step],
59 | ['path', './dataset/bbc/', str],
60 | ['sleep', 1, float]
61 | ]
62 |
63 | class NetWorkConfiguration(Configuration):
64 |
65 | HTTP_TIMEOUT = 30
66 | STRICT = True
67 | USER_AGENT = 'Mozilla'
68 |
69 | def _init_properties(self):
70 | return [
71 | ['browser_user_agent', 'Mozilla', str],
72 | ['http_timeout', 30, int],
73 | ['strict', True, lambda v: str(v) == 'True']
74 | ]
75 |
76 | class NetworkError(RuntimeError):
77 |
78 | def __init__(self, status_code, reason):
79 | self.reason = reason
80 | self.status_code = status_code
81 |
82 | class NetworkFetcher(object):
83 |
84 | def __init__(self):
85 | self.config = NetWorkConfiguration()
86 | # self.config.load('./settings/network.cfg')
87 | self.config.strict = False
88 |
89 | self._connection = requests.Session()
90 | self._connection.headers['User-agent'] = self.config.browser_user_agent
91 | self._finalizer = weakref.finalize(self, self.close)
92 |
93 | self._url = None
94 | self.response = None
95 | self.headers = None
96 |
97 | def close(self):
98 | if self._connection is not None:
99 | self._connection.close()
100 | self._connection = None
101 |
102 | def get_url(self):
103 | return self._url
104 |
105 | def fetch(self, url):
106 | try:
107 | response = self._connection.get(url, timeout=self.config.http_timeout, headers=self.headers)
108 | except Exception:
109 | return None
110 | if response.ok:
111 | self._url = response.url
112 | text = response.content
113 | else:
114 | self._url = None
115 | text = None
116 | if self.config.strict:
117 | raise NetworkError(response.status_code, response.reason)
118 |
119 | return text
120 |
121 | class DownloadLinkFetcher:
122 |
123 | RETRY = 5
124 |
125 | def __init__(self, config):
126 | self.base_api_url = config.base_api_url
127 |
128 | self.start_date = config.start_date
129 | self.current_date = config.start_date
130 | self.end_date = config.end_date
131 | self.step_unit = config.step_unit
132 | self.step = config.step
133 |
134 | self.html_fetcher = NetworkFetcher()
135 |
136 | def _format_link(self, link):
137 | print(link)
138 | hash_index = link.find('#')
139 | if hash_index != -1:
140 | link = link[:hash_index]
141 | if link and link[-1] == '/':
142 | link = link[:-1]
143 | return link
144 |
145 | def _link_filter(self, link, filters):
146 | if not link:
147 | return False
148 | if not link[-1].isdigit():
149 | return False
150 | for filter_ in filters:
151 | if link[filter_[1]:filter_[2]] == filter_[0]:
152 | return False
153 | return True
154 |
155 | def _html_to_links(self, html):
156 | return []
157 |
158 | def _next_api(self, base_url, current_date):
159 | return ''
160 |
161 | def next(self):
162 | if self.current_date >= self.end_date:
163 | return None, None
164 | api_url = self._next_api(self.base_api_url, self.current_date)
165 | date = self.current_date
166 | self.current_date += self.step
167 | return api_url, date
168 |
169 | def fetch(self, api_url):
170 | print('fetching download links...')
171 | html = self.html_fetcher.fetch(api_url)
172 | if html is None:
173 | for _ in range(0, self.RETRY):
174 | html = self.html_fetcher.fetch(api_url)
175 | if html is not None:
176 | break
177 | if html is None or len(html) == 0:
178 | print('api', api_url, ' failed')
179 | return []
180 | links = self._html_to_links(html)
181 | return links
182 |
183 | from bs4 import BeautifulSoup
184 |
185 | class BBCLinkFetcher(DownloadLinkFetcher):
186 |
187 | BBC_FILTERS = [
188 | ['programmes', 21, 31],
189 | ['correspondents', 26, 40],
190 | ['iplayer', 21, 28],
191 | ['radio', 21, 26],
192 | ['live', 27, 31],
193 | ['m', 7, 8],
194 | ['video_and_audio', 26, 41]
195 | ]
196 |
197 | def _next_api(self, base_url, current_date):
198 | year = current_date.year
199 | month = current_date.month
200 | day = current_date.day
201 | api_url = base_url.format(year=year, month=month, day=day)
202 | return api_url
203 |
204 | def _html_to_links(self, html):
205 | soup = BeautifulSoup(html, 'lxml')
206 |
207 | links = list()
208 | # news links are the hrefs of a
209 | elements = soup.table.find_all('a')
210 | # elements = soup.table.find_all('a', class_='title-link')
211 | for element in elements:
212 | href = element.get('href')
213 | if not href:
214 | continue
215 | link = self._format_link(href)
216 | if self._link_filter(link, self.BBC_FILTERS):
217 | links.append(link)
218 |
219 | return list(set(links))
220 |
221 |
222 | import sys
223 | import os.path
224 | import json
225 | import time
226 | from datetime import timedelta
227 |
228 | class ArticleFetcher:
229 |
230 | RETRY = 5
231 |
232 | def __init__(self, config):
233 | self.config = config
234 | self.download_link_fetcher = None
235 | self.html_fetcher = NetworkFetcher()
236 | self.path = config.path
237 |
238 | self.total_date = 0
239 |
240 | self._mkdir(self.path,
241 | config.start_date,
242 | config.end_date,
243 | config.step)
244 |
245 | def _mkdir(self, path, start_date, end_date, step):
246 | if os.path.isdir(path):
247 | # current_date = start_date
248 | # while current_date < end_date:
249 | # current_date += step
250 | # self.total_date += 1
251 | # return
252 | pass
253 | else:
254 | os.makedirs(path)
255 | current_date = start_date
256 | existed_years = dict()
257 | while current_date < end_date:
258 | year = current_date.year
259 | month = current_date.month
260 | day = current_date.day
261 |
262 | year_path = os.path.join(path, str(year))
263 | month_path = os.path.join(year_path, str(month))
264 | day_path = os.path.join(month_path, str(day))
265 |
266 | if year not in existed_years.keys():
267 | existed_years[year] = dict()
268 | if not os.path.isdir(year_path):
269 | os.mkdir(year_path)
270 |
271 | if (step.months > 0) or (step.days > 0):
272 | year_content = existed_years[year]
273 | if month not in year_content.keys():
274 | year_content[month] = True
275 | if not os.path.isdir(month_path):
276 | os.mkdir(month_path)
277 |
278 | if step.days > 0:
279 | if not os.path.isdir(day_path):
280 | os.mkdir(day_path)
281 | current_date += step
282 |
283 | self.total_date += 1
284 |
285 | def _html_to_information(self, html, link, date):
286 | return {}
287 |
288 | def _extract_information(self, link, date):
289 | html = self.html_fetcher.fetch(link)
290 | if html is None:
291 | for _ in range(0, self.RETRY):
292 | html = self.html_fetcher.fetch(link)
293 | if html is not None:
294 | break
295 | if html is None:
296 | print('article ', link, 'failed')
297 | return None
298 | return self._html_to_information(html, link, date)
299 |
300 | def _get_storage_path(self, path, date):
301 | return os.path.join(path, str(date.year), str(date.month), str(date.day))
302 |
303 | def _lazy_storage(self, storage_path, links, date, current_date):
304 | total_links = len(links)
305 | current_link = 1
306 |
307 | titles_path = os.path.join(storage_path, f'titles.{current_date}')
308 | with open(titles_path, mode='w', encoding='utf-8') as titles_file:
309 | articles = list()
310 | titles = list()
311 | for link in links:
312 | print('>>> {c} in {t} articles\r'.format(c=current_link, t=total_links), end='')
313 | current_link += 1
314 |
315 | article = self._extract_information(link, date)
316 | if article is not None:
317 | titles.append(article['title'] + '\n')
318 | articles.append(article)
319 |
320 | articles_path = os.path.join(storage_path, f'articles.{current_date}')
321 | with open(articles_path, mode='w', encoding='utf-8') as articles_file:
322 | json.dump({
323 | 'expected_number': len(links),
324 | 'number': len(articles),
325 | 'articles': articles
326 | }, articles_file, indent=4)
327 | titles_file.writelines(titles)
328 |
329 | def _non_lazy_storage(self, storage_path, links, date):
330 | total_links = len(links)
331 | current_link = 1
332 |
333 | titles_path = os.path.join(storage_path, 'titles')
334 | with open(titles_path, mode='w', encoding='utf-8') as titles_file:
335 | for article_index, link in enumerate(links):
336 | print('{c} in {t} articles\r'.format(c=current_link, t=total_links), end='')
337 | current_link += 1
338 |
339 | article = self._extract_information(link, date)
340 | if article is not None:
341 | titles_file.write(article['title'] + '\n')
342 |
343 | article_path = os.path.join(storage_path, str(article_index))
344 | with open(article_path, mode='w', encoding='utf-8') as article_file:
345 | json.dump(article, article_file, indent=4)
346 |
347 | def fetch(self, lazy_storage=True):
348 | current_date = 1
349 | while True:
350 | api_url, date = self.download_link_fetcher.next()
351 | if api_url is None:
352 | break
353 | print(date.strftime('%Y-%m-%d'),
354 | '{c} in {t} dates '.format(c=current_date, t=self.total_date))
355 |
356 | # storage_path = self._get_storage_path(self.path, date)
357 | storage_path = self.path
358 | links = self.download_link_fetcher.fetch(api_url)
359 | if lazy_storage:
360 | self._lazy_storage(storage_path, links, date, current_date)
361 | else:
362 | self._non_lazy_storage(storage_path, links, date)
363 |
364 | time.sleep(self.config.sleep)
365 |
366 | print(date.strftime('%Y-%m-%d'),
367 | 'date {c} finished '.format(c=current_date))
368 | current_date += 1
369 |
370 | import json
371 |
372 | from bs4 import BeautifulSoup
373 | from goose3 import Goose
374 | from goose3.extractors.content import ContentExtractor
375 |
376 | eps = 1e-6
377 | f1 = ContentExtractor.calculate_best_node
378 | f2 = ContentExtractor.post_cleanup
379 |
380 |
381 | def post_cleanup(ce_inst):
382 | """\
383 | remove any divs that look like non-content,
384 | clusters of links, or paras with no gusto
385 | """
386 | parse_tags = ['p']
387 | if ce_inst.config.parse_lists:
388 | parse_tags.extend(['ul', 'ol'])
389 | if ce_inst.config.parse_headers:
390 | parse_tags.extend(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
391 |
392 | target_node = ce_inst.article.top_node
393 | node = ce_inst.add_siblings(target_node)
394 | for elm in ce_inst.parser.getChildren(node):
395 | e_tag = ce_inst.parser.getTag(elm)
396 | if e_tag not in parse_tags:
397 | if ce_inst.is_highlink_density(elm) or ce_inst.is_table_and_no_para_exist(elm):
398 | ce_inst.parser.remove(elm)
399 | return node
400 |
401 |
402 | def calculate_best_node(ce_inst, doc):
403 | top_node = None
404 | nodes_to_check = ce_inst.nodes_to_check(doc)
405 |
406 | starting_boost = float(1.0)
407 | cnt = 0
408 | i = 0
409 | parent_nodes = []
410 | nodes_with_text = []
411 |
412 | for node in nodes_to_check:
413 | text_node = ce_inst.parser.getText(node)
414 | word_stats = ce_inst.stopwords_class(language=ce_inst.get_language()).get_stopword_count(text_node)
415 | high_link_density = ce_inst.is_highlink_density(node)
416 | if word_stats.get_stopword_count() > 2 and not high_link_density:
417 | nodes_with_text.append(node)
418 |
419 | nodes_number = len(nodes_with_text)
420 | negative_scoring = 0
421 | bottom_negativescore_nodes = float(nodes_number) * 0.25
422 |
423 | for node in nodes_with_text:
424 | boost_score = float(0)
425 | # boost
426 | if ce_inst.is_boostable(node):
427 | if cnt >= 0:
428 | boost_score = float((1.0 / starting_boost) * 50)
429 | starting_boost += 1
430 | # nodes_number
431 | if nodes_number > 15:
432 | if (nodes_number - i) <= bottom_negativescore_nodes:
433 | booster = float(bottom_negativescore_nodes - (nodes_number - i))
434 | boost_score = float(-pow(booster, float(2)))
435 | negscore = abs(boost_score) + negative_scoring
436 | if negscore > 40:
437 | boost_score = float(5)
438 |
439 | text_node = ce_inst.parser.getText(node)
440 | word_stats = ce_inst.stopwords_class(language=ce_inst.get_language()).get_stopword_count(text_node)
441 | upscore = int(word_stats.get_stopword_count() + boost_score)
442 |
443 | # parent node
444 | parent_node = ce_inst.parser.getParent(node)
445 | ce_inst.update_score(parent_node, upscore)
446 | ce_inst.update_node_count(parent_node, 1)
447 |
448 | if parent_node not in parent_nodes:
449 | parent_nodes.append(parent_node)
450 |
451 | # parentparent node
452 | parent_parent_node = ce_inst.parser.getParent(parent_node)
453 | if parent_parent_node is not None:
454 | ce_inst.update_node_count(parent_parent_node, 1)
455 | ce_inst.update_score(parent_parent_node, upscore - eps)
456 | if parent_parent_node not in parent_nodes:
457 | parent_nodes.append(parent_parent_node)
458 |
459 | # parentparentparent node
460 | parent_parent_parent_node = ce_inst.parser.getParent(parent_parent_node)
461 | if parent_parent_parent_node is not None:
462 | ce_inst.update_node_count(parent_parent_parent_node, 1)
463 | ce_inst.update_score(parent_parent_parent_node, upscore - 2 * eps)
464 | if parent_parent_parent_node not in parent_nodes:
465 | parent_nodes.append(parent_parent_parent_node)
466 | cnt += 1
467 | i += 1
468 |
469 | top_node_score = 0
470 | for itm in parent_nodes:
471 | score = ce_inst.get_score(itm)
472 |
473 | if score > top_node_score:
474 | top_node = itm
475 | top_node_score = score
476 |
477 | if top_node is None:
478 | top_node = itm
479 |
480 | return top_node
481 |
482 |
483 | class BBCArticleFetcher(ArticleFetcher):
484 |
485 | def __init__(self, config):
486 | super(BBCArticleFetcher, self).__init__(config)
487 | self.download_link_fetcher = BBCLinkFetcher(config)
488 |
489 | def _extract_title(self, soup):
490 | if soup.title is not None:
491 | return soup.title.get_text()
492 |
493 | def _extract_published_date(self, date):
494 | return date.strftime('%Y-%m-%d')
495 |
496 | def _extract_authors(self, soup):
497 | authors_elements = soup.find_all('meta', property='article:author')
498 | if authors_elements is not None:
499 | return [authors_element['content'] for authors_element in authors_elements]
500 |
501 | def _extract_description(self, soup):
502 | description_element = soup.find('meta', property='og:description')
503 | if description_element is not None:
504 | return description_element['content']
505 |
506 | def _extract_section(self, soup):
507 | section_element = soup.find('meta', property='article:section')
508 | if section_element is not None:
509 | return section_element['content']
510 |
511 | def _extract_content(self, html):
512 | ContentExtractor.calculate_best_node = calculate_best_node
513 | ContentExtractor.post_cleanup = post_cleanup
514 | g = Goose({'enable_image_fetching': False})
515 | article = g.extract(raw_html=html)
516 | ContentExtractor.calculate_best_node = f1
517 | ContentExtractor.post_cleanup = f2
518 | return article.cleaned_text
519 |
520 | def _html_to_information(self, html, link, date):
521 | soup = BeautifulSoup(html, 'lxml')
522 | head = soup.head
523 |
524 | try:
525 | title = self._extract_title(head)
526 | published_date = self._extract_published_date(date)
527 | authors = self._extract_authors(head)
528 | description = self._extract_description(head)
529 | section = self._extract_section(head)
530 | content = self._extract_content(html)
531 | except Exception:
532 | return None
533 |
534 | return {
535 | 'title': title,
536 | 'published_date': published_date,
537 | 'authors': authors,
538 | 'description': description,
539 | 'section': section,
540 | 'content': content,
541 | 'link': link
542 | }
543 |
544 | if __name__ == '__main__':
545 |
546 | today = datetime.date.today()
547 | today_str = today.strftime('%Y-%m-%d')
548 | one_week_ago = today - datetime.timedelta(days=7)
549 | one_week_ago_str = one_week_ago.strftime('%Y-%m-%d')
550 |
551 | config = DatasetConfiguration()
552 | config.start_date = one_week_ago
553 | config.end_date = today
554 | config.path = 'dataset/bbc'
555 |
556 | bbc_article_fetcher = BBCArticleFetcher(config)
557 | bbc_article_fetcher.fetch()
558 |
559 | from glob import glob
560 | files = glob(f'dataset/bbc/articles.*')
561 | files.sort()
562 |
563 | import datasets
564 | import json
565 | import os
566 |
567 | hf_token = os.environ['HF_TOKEN']
568 |
569 | all_articles = []
570 | for file in files:
571 | with open(file) as f:
572 | articles = json.load(f)
573 |
574 | articles = articles['articles']
575 | for article in articles:
576 | article['authors'] = article['authors'][0] if article['authors'] else None
577 | all_articles.append(article)
578 |
579 | with open('all_articles.json', 'w') as f:
580 | json.dump(all_articles, f, indent=4, ensure_ascii=False)
581 |
582 | ds = datasets.Dataset.from_dict({key: [article[key] for article in all_articles] for key in all_articles[0].keys()})
583 | ds.save_to_disk('bbc')
584 |
585 | from huggingface_hub import create_branch, create_tag, RepoCard
586 |
587 | create_branch('RealTimeData/bbc_latest', repo_type='dataset', branch=today_str, token=hf_token)
588 | ds.push_to_hub('RealTimeData/bbc_latest', token=hf_token, branch='main')
589 | ds.push_to_hub('RealTimeData/bbc_latest', token=hf_token, branch=today_str)
590 |
591 | text = f"""
592 | # Latest BBC News
593 |
594 | You can always access the latest BBC News articles via this dataset.
595 |
596 | We update the dataset weekly, every Sunday, so it always provides the BBC News articles published in the past week.
597 |
598 | The current dataset on the main branch contains the BBC News articles published from {one_week_ago.isoformat()} to {today.isoformat()}.
599 |
600 | The data collection was conducted on {today.isoformat()}.
601 |
602 | Use the dataset via:
603 | ```
604 | ds = datasets.load_dataset('RealTimeData/bbc_latest')
605 | ```
606 |
607 | # Previous versions
608 |
609 | You can access previous versions by requesting different branches.
610 |
611 | For example, you can find the 2023-08-20 version via:
612 | ```
613 | ds = datasets.load_dataset('RealTimeData/bbc_latest', revision = '2023-08-20')
614 | ```
615 |
616 | Check all available versions by clicking the "Files and versions" button on the top bar.
617 | """
618 | card = RepoCard(text)
619 | card.push_to_hub('RealTimeData/bbc_latest', repo_type='dataset', token=hf_token)
--------------------------------------------------------------------------------
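
For a backfill over a different window, the same classes can be driven directly by overriding the configuration's dates before constructing the fetcher. A small sketch, assuming it runs inside this module so `DatasetConfiguration` and `BBCArticleFetcher` are in scope (the January 2023 range and the output path are examples only):

```python
import datetime

config = DatasetConfiguration()
config.start_date = datetime.datetime(2023, 1, 1)
config.end_date = datetime.datetime(2023, 1, 8)
config.path = 'dataset/bbc_jan_2023'

fetcher = BBCArticleFetcher(config)
fetcher.fetch()  # writes titles.<n> and articles.<n> files under config.path
```
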