├── figs ├── logo.png ├── perplexity.py ├── single.py ├── polar.py ├── winrate.py ├── compare_strings.json └── compare.py ├── requirements.txt ├── .gitignore ├── data ├── monthly_updater │ ├── readme.md │ ├── monthly_wikitext.py │ ├── monthly_image.py │ ├── monthly_math.py │ ├── monthly_code.py │ └── monthly_arxiv.py ├── code_repos.txt ├── push_wiki_alltime.py ├── push_github_dataset.py ├── push_arxiv_dataset.py ├── collect_bbc_months.py ├── push_math_dataset.py ├── doc_info.py ├── analyse_news.py ├── analyse_wikitext.py ├── wiki_dataset.py ├── wikitext_alltime.py ├── bbc_alltime.py ├── maintain_wikitext_latest.py ├── audio_dataset.py ├── bbc_news_image.py ├── math_dataset.py ├── github_dataset.py ├── reddit_crawler.py ├── squad_wiki_title.text ├── arxiv_dataset.py └── wikipedia.py ├── .github └── workflows │ ├── weekly_downloader.yml │ └── monthly_updater.yml ├── readme.md ├── push_to_hf_hub.py ├── github_downloader.py ├── wikitext_downloader.py ├── eval └── contamination.py ├── arxiv_downloader.py └── bbc_downloader.py /figs/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liyucheng09/LatestEval/HEAD/figs/logo.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | arxiv 2 | pylatexenc 3 | datasets 4 | bs4 5 | goose3 6 | configobj 7 | mwparserfromhell 8 | GitPython -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | *.html 3 | eval/*.txt 4 | eval/saves/*.txt 5 | __pycache__/ 6 | .vscode/ 7 | data/*.json 8 | *.log 9 | *.out 10 | *.error 11 | 12 | wmt22-zhen/ 13 | arxiv/ 14 | bbc/ 15 | github/ -------------------------------------------------------------------------------- /data/monthly_updater/readme.md: -------------------------------------------------------------------------------- 1 | # Monthly data collection - where the RealTimeData program is hosted 2 | 3 | ## sources available 4 | 5 | - arxiv 6 | - bbc_news 7 | - code 8 | - bbc_image 9 | - math 10 | - wikitext 11 | 12 | ## Check all data dumps from 2017 to current 13 | 14 | Please find the RealTimeData program here: [RealTimeData](https://huggingface.co/RealTimeData). 15 | 16 | ## Crawl data by your own 17 | 18 | ```python 19 | python monthly_arxiv.py 20 | ``` 21 | 22 | This will crawl the arxiv data this month (from the 1st to the current date) and push to RealTimeData repos (if you have push authority). 23 | 24 | ## Ask for more data source / contribute new data 25 | 26 | This is program welcomes all contributions. Please open an issue or pull request if you have any suggestions or want to contribute new data sources. 
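## Load the collected dumps

If you only want to consume the dumps rather than crawl them, they can be loaded directly from the [RealTimeData](https://huggingface.co/RealTimeData) organization on the Huggingface Hub. Below is a minimal sketch, assuming the `datasets` library is installed and using the wikitext dumps as an example; the other sources follow the same `YYYY-MM` config pattern (check each dataset page for the exact list of available configs).

```python
# Minimal sketch: load one monthly dump from the RealTimeData organization.
# Newer releases of `datasets` may additionally require trust_remote_code=True.
from datasets import load_dataset

wiki = load_dataset("RealTimeData/wikitext_alltime", "2023-11", split="train")
print(wiki)                               # fields: title, pageid, text, time
print(wiki[0]["title"], wiki[0]["time"])
```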
-------------------------------------------------------------------------------- /data/code_repos.txt: -------------------------------------------------------------------------------- 1 | Stirling-Tools/Stirling-PDF 2 | microsoft/PowerToys 3 | veler/DevToys 4 | NationalSecurityAgency/ghidra 5 | Kurento/kurento-media-server 6 | silverwind/droppy 7 | llvm-mirror/clang 8 | facebookarchive/beringei 9 | shadowsocks/shadowsocks-qt5 10 | go-ego/riot 11 | flynn/flynn 12 | lipangit/JiaoZiVideoPlayer 13 | keras-team/keras 14 | aseprite/aseprite 15 | godotengine/godot 16 | lua/lua 17 | musescore/MuseScore 18 | apache/spark 19 | apache/hadoop 20 | scikit-learn/scikit-learn 21 | Leaflet/Leaflet 22 | overleaf/overleaf 23 | pytorch/pytorch 24 | huggingface/transformers 25 | animate-css/animate.css 26 | psf/requests 27 | pandas-dev/pandas 28 | django/django 29 | numpy/numpy 30 | facebook/react 31 | vuejs/core 32 | vuejs/vue 33 | android/architecture-samples 34 | sqlite/sqlite 35 | elastic/elasticsearch 36 | openssl/openssl 37 | gohugoio/hugo 38 | laravel/laravel 39 | WordPress/WordPress 40 | Unity-Technologies/ml-agents 41 | opencv/opencv -------------------------------------------------------------------------------- /data/push_wiki_alltime.py: -------------------------------------------------------------------------------- 1 | import datasets 2 | from glob import glob 3 | import os 4 | import json 5 | 6 | if __name__ == '__main__': 7 | files = glob('/vol/research/lyc/wikitext_alltime/wiki/*.json') 8 | hf_token = os.environ['HF_TOKEN'] 9 | 10 | for file in files: 11 | 12 | all_articles = [] 13 | 14 | time = os.path.basename(file).strip('.json') 15 | year = int(time.split('-')[0]) 16 | month = int(time.split('-')[1]) 17 | 18 | time_stamp = f'{year}-{month:02d}' 19 | if time_stamp not in ['2024-01', '2024-02']: 20 | continue 21 | print(f"Processing {time_stamp}") 22 | 23 | with open(file) as f: 24 | data = json.load(f) 25 | 26 | for title, article in data.items(): 27 | article['time'] = time_stamp 28 | all_articles.append(article) 29 | 30 | ds = datasets.Dataset.from_list(all_articles) 31 | ds.push_to_hub(f"RealTimeData/wikitext_alltime", config_name=time_stamp, token=hf_token) 32 | -------------------------------------------------------------------------------- /data/push_github_dataset.py: -------------------------------------------------------------------------------- 1 | from glob import glob 2 | import datasets 3 | import os 4 | import json 5 | 6 | if __name__ == '__main__': 7 | hf_token = os.environ['HF_TOKEN'] 8 | all_months = [f'{year}-{month:02}' for year in range(2017, 2024) for month in range(1, 13)] 9 | all_months += [f'2024-{month:02}' for month in range(1,3)] 10 | 11 | # try: 12 | # exists_config = datasets.get_dataset_config_names('RealTimeData/code_alltime') 13 | # except datasets.exceptions.DatasetNotFoundError: 14 | # exists_config = [] 15 | # pass 16 | 17 | for month in all_months: 18 | # if month in exists_config: 19 | # continue 20 | code_paths = glob(f'/vol/research/lyc/github_dataset/{month}/*/*.json') 21 | all_codes = [] 22 | for code in code_paths: 23 | with open(code, 'r') as f: 24 | all_codes.append(json.load(f)) 25 | ds = datasets.Dataset.from_list(all_codes) 26 | print('='*20) 27 | print(f'Finished {month}') 28 | print(ds) 29 | ds.push_to_hub(f'RealTimeData/code_alltime', config_name = month, token=hf_token) 30 | print(f'Pushed {month} to hub') 31 | -------------------------------------------------------------------------------- /data/push_arxiv_dataset.py: 
-------------------------------------------------------------------------------- 1 | from glob import glob 2 | import datasets 3 | import os 4 | import json 5 | 6 | if __name__ == '__main__': 7 | hf_token = os.environ['HF_TOKEN'] 8 | all_months = [f'{year}-{month:02}' for year in range(2017, 2024) for month in range(1, 13)] 9 | 10 | # try: 11 | # exists_config = datasets.get_dataset_config_names('RealTimeData/arxiv_alltime') 12 | # except datasets.exceptions.DatasetNotFoundError: 13 | # exists_config = [] 14 | # pass 15 | 16 | # all months before 2021-02 (included) are already pushed, so remove these months from all_months 17 | all_months = all_months[all_months.index('2021-03'):] 18 | 19 | for month in all_months: 20 | # if month in exists_config: 21 | # continue 22 | paper_paths = glob(f'/vol/research/lyc/arxiv_alltime/{month}/*.json') 23 | all_papers = [] 24 | for paper in paper_paths: 25 | with open(paper, 'r') as f: 26 | all_papers.append(json.load(f)) 27 | ds = datasets.Dataset.from_list(all_papers) 28 | print('='*20) 29 | print(f'Finished {month}') 30 | print(ds) 31 | ds.push_to_hub(f'RealTimeData/arxiv_alltime', config_name = month, token=hf_token) 32 | print(f'Pushed {month} to hub') 33 | -------------------------------------------------------------------------------- /data/collect_bbc_months.py: -------------------------------------------------------------------------------- 1 | from glob import glob 2 | import json 3 | 4 | if __name__ == '__main__': 5 | # /vol/research/lyc/bbc/2023/0/articles.1 indicates day 1, month 0, year 2023 6 | docs = glob('/vol/research/lyc/bbc/*/*/articles.*') 7 | 8 | # now group by month 9 | times = {} 10 | for doc in docs: 11 | year = doc.split('/')[-3] 12 | month = doc.split('/')[-2] 13 | month = int(month)%12 + 1 14 | time = f'{year}-{month}' 15 | if time not in times: 16 | times[time] = [] 17 | 18 | with open(doc, 'r') as f: 19 | articles = json.load(f)['articles'] 20 | times[time].extend(articles) 21 | 22 | # now save 23 | # each month should save as a json dict 24 | # target path /vol/research/lyc/bbc/bbc_alltime/articles/2023-{month}.json 25 | for time in times: 26 | articles = times[time] 27 | month = time.split('-')[1] 28 | year = time.split('-')[0] 29 | # now turn list of dicts to dict of lists 30 | articles = { key: [article[key] for article in articles] for key in articles[0] } 31 | with open(f'/vol/research/lyc/bbc/bbc_alltime/articles/{year}-{month}.json', 'w') as f: 32 | json.dump(articles, f, ensure_ascii=False) 33 | print(f'Finished {year} {month}') -------------------------------------------------------------------------------- /data/push_math_dataset.py: -------------------------------------------------------------------------------- 1 | import datasets 2 | import json 3 | from glob import glob 4 | 5 | if __name__ == '__main__': 6 | 7 | files = glob('/vol/research/lyc/math/*.json') 8 | for file in files: 9 | with open(file, 'r') as f: 10 | data = json.load(f) 11 | 12 | time_stamp = file.split('/')[-1].split('.')[0] 13 | if time_stamp not in ['2024-01', '2024-02']: 14 | continue 15 | 16 | all_instances = [] 17 | for qa in data.values(): 18 | instance = {} 19 | instance['question'] = qa['title'] 20 | instance['question_id'] = qa['question_id'] 21 | instance['score'] = qa['score'] 22 | instance['link'] = qa['link'] 23 | instance['body'] = qa['body'] 24 | if 'answers' not in qa: 25 | continue 26 | instance['answers'] = [{'text': a['body'], 'score': a['score'], 'answer_id': a['answer_id']} for a in qa['answers']] 27 | 28 | verbolised = 
f"Question: {instance['question']}\n" 29 | for ans_index, ans in enumerate(instance['answers']): 30 | verbolised += f"Answer {ans_index + 1}: {ans['text']}\n" 31 | instance['verbolised'] = verbolised 32 | 33 | all_instances.append(instance) 34 | 35 | dataset = datasets.Dataset.from_list(all_instances) 36 | print(dataset) 37 | 38 | dataset.push_to_hub('RealTimeData/math_alltime', time_stamp) 39 | print(f"Pushed {time_stamp} to hub") -------------------------------------------------------------------------------- /data/doc_info.py: -------------------------------------------------------------------------------- 1 | import docx 2 | import re 3 | # from wikipedia import gpt3_self_info 4 | import sys 5 | 6 | def getText(filename): 7 | doc = docx.Document(filename) 8 | fullText = [] 9 | for para in doc.paragraphs: 10 | fullText.append(para.text) 11 | return '\n'.join(fullText) 12 | 13 | def beautify_text(text, num_words = 1000): 14 | text = re.sub(r'\n+', '\n', text) 15 | text = re.sub(r'\s+', ' ', text) 16 | 17 | # use first 1000 words 18 | text = ' '.join(text.split(' ')[:num_words]) 19 | return text 20 | 21 | def verbalise_docs(path = '/user/HS502/yl02706/LatestEval/data/mmlu', num_words = 1000): 22 | docs = ['q17-1.docx', 'q18-1.docx', 'q19-1.docx', 'q20-1.docx', 'q22-1.docx', 'q23-1.docx'] 23 | docs = [ path + '/' + doc for doc in docs ] 24 | doc_text = [ getText(doc) for doc in docs ] 25 | 26 | doc_text = [ beautify_text(doc, num_words=num_words) for doc in doc_text ] 27 | 28 | return { 29 | doc: [doc_string] for doc, doc_string in zip(docs, doc_text) 30 | } 31 | 32 | if __name__ == '__main__': 33 | docs = ['data/q17-1.docx', 'data/q18-1.docx', 'data/q19-1.docx', 'data/q20-1.docx', 'data/q22-1.docx', 'data/q23-1.docx'] 34 | doc_text = [ getText(doc) for doc in docs ] 35 | 36 | doc_text = [ beautify_text(doc) for doc in doc_text ] 37 | 38 | for doc, doc_string in zip(docs, doc_text): 39 | print('----------------------') 40 | print(doc) 41 | 42 | _, info = gpt3_self_info(doc_string) 43 | print(sum(info)/len(info)) -------------------------------------------------------------------------------- /figs/perplexity.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | 5 | # Data 6 | df = pd.read_csv('figs/perplexity.tsv', sep='\t').set_index('Model') 7 | df.drop('LatestEval', axis = 1, inplace=True) 8 | df.drop(['opt-350m', 'opt-1.6b'], axis=0, inplace=True) 9 | 10 | data_dict = df.to_dict() 11 | 12 | metrics_data = { 13 | 'QuAC': ('s', 'violet'), 14 | 'BoolQ': ('+', 'violet'), 15 | 'SQuAD': ('x', 'violet'), 16 | 'Wikitext': ('D', 'navy'), 17 | 'NewWiki': ('^', 'navy'), 18 | # 'LatestEval': ('o', 'gold') 19 | } 20 | 21 | fig, ax = plt.subplots(figsize=(8, 2.8), dpi=150) 22 | 23 | # Create a horizontal scatter plot for each metric 24 | for benchmark, numbers in data_dict.items(): 25 | marker_style, color = metrics_data[benchmark] 26 | models, perplexities = list(numbers.keys()), list(numbers.values()) 27 | # perplexities = np.exp(perplexities) # assuming metrics are in log scale 28 | plt.scatter(perplexities, models, label=benchmark, s=20, marker=marker_style, color=color) 29 | 30 | # Adjust plot 31 | plt.ylabel('Models', fontweight='bold') 32 | plt.xlabel('Perplexity', fontweight='bold') 33 | plt.legend( loc='upper right', bbox_to_anchor=(1.05, 1.0), ncol=1, fontsize=8) 34 | plt.grid(True, which='both', linestyle='--', linewidth=0.5) 35 | plt.xlim(left = 1.3) # Adjusting xlim to 
be slightly more than the max value for better visualization 36 | 37 | # plt.gca().xaxis.tick_top() 38 | # plt.gca().xaxis.set_label_position('top') 39 | 40 | ax.spines['top'].set_visible(False) 41 | ax.spines['right'].set_visible(False) 42 | 43 | plt.tight_layout() 44 | 45 | plt.show() -------------------------------------------------------------------------------- /figs/single.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | 5 | # Data 6 | df = pd.read_csv('figs/perplexity.tsv', sep='\t').set_index('Model') 7 | df.drop('LatestEval', axis = 1, inplace=True) 8 | df.drop(['opt-350m', 'opt-1.6b', 'gpt-3', 'llama-7b', 'llama-30b'], axis=0, inplace=True) 9 | 10 | data_dict = df.to_dict() 11 | 12 | metrics_data = { 13 | 'QuAC': ('s', 'violet'), 14 | 'BoolQ': ('+', 'violet'), 15 | 'SQuAD': ('x', 'violet'), 16 | 'memorised': ('D', 'navy'), 17 | 'clean': ('^', 'navy'), 18 | # 'LatestEval': ('o', 'gold') 19 | } 20 | 21 | fig, ax = plt.subplots(figsize=(4, 1), dpi=200) 22 | 23 | # Create a horizontal scatter plot for each metric 24 | for benchmark, numbers in data_dict.items(): 25 | marker_style, color = metrics_data[benchmark] 26 | models, perplexities = list(numbers.keys()), list(numbers.values()) 27 | # perplexities = np.exp(perplexities) # assuming metrics are in log scale 28 | plt.scatter(perplexities, ['perplexity'], label=benchmark, s=20, marker=marker_style, color=color) 29 | ax.annotate(benchmark, xy=(perplexities[-1], 'perplexity'), xytext=(0, 2), textcoords='offset points', va='bottom', fontsize=7, rotation=45) 30 | 31 | # Adjust plot 32 | plt.grid(True, linestyle='--', linewidth=0.5, axis='y') 33 | plt.xlim(left = 1.6) # Adjusting xlim to be slightly more than the max value for better visualization 34 | 35 | ax.xaxis.set_visible(False) 36 | 37 | ax.spines['top'].set_visible(False) 38 | ax.spines['right'].set_visible(False) 39 | ax.spines['left'].set_visible(False) 40 | ax.spines['bottom'].set_visible(False) 41 | 42 | plt.tight_layout() 43 | 44 | plt.show() -------------------------------------------------------------------------------- /figs/polar.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | df_score = pd.read_csv("figs/single_answer_score.tsv", sep='\t') 6 | 7 | # Calculate the number of categories 8 | categories = df_score['category'].unique() 9 | N = len(categories) 10 | 11 | # Calculate angle for each category 12 | theta = np.linspace(0.0, 2 * np.pi, N, endpoint=False) 13 | theta = np.append(theta, theta[0]) 14 | # Set up the polar axis 15 | fig, ax = plt.subplots(subplot_kw={'projection': 'polar'}, figsize=(8, 6), dpi=150) 16 | ax.set_facecolor("#f5f5f5") 17 | 18 | markers = { 19 | 'gpt-3.5-turbo': 'o', 20 | 'gpt-4': '+', 21 | 'llama-13b': 'x', 22 | 'llama-30b': 's', 23 | 'vicuna-13b': 'd', 24 | } 25 | 26 | # Loop through each model and plot on the polar axis 27 | for model in df_score['model'].unique(): 28 | values = df_score[df_score['model'] == model]['score'].values 29 | # Ensure the plot is closed by repeating the first value 30 | values = np.append(values, values[0]) 31 | ax.plot(theta, values, label=model, marker=markers[model], alpha=0.8, markersize=5) 32 | 33 | # Fill the area under the plot for better visualization (optional) 34 | # ax.fill(theta, values, 'b', alpha=0.1) 35 | 36 | # Set the y-ticks (radii) and x-ticks 
(categories) 37 | ax.set_xticks(theta[:-1]) 38 | ax.set_xticklabels(categories, fontsize=14) # Label x-ticks with categories 39 | 40 | ax.set_yticks([0, 2, 4, 6, 8, 10]) 41 | 42 | # Customize the grid and title 43 | ax.grid(True) 44 | 45 | # Display a legend 46 | ax.legend(loc='upper right', bbox_to_anchor=(1.32, 1.1), fontsize=14) 47 | 48 | # Save the figure 49 | fig.tight_layout() 50 | fig.savefig("fig.png", dpi=150) 51 | 52 | # Show the plot 53 | # plt.show() 54 | -------------------------------------------------------------------------------- /data/analyse_news.py: -------------------------------------------------------------------------------- 1 | from difflib import SequenceMatcher 2 | import datasets 3 | import multiprocessing 4 | 5 | def compare_texts(text1, text2): 6 | # Split the texts into words 7 | words1 = text1.split() 8 | words2 = text2.split() 9 | 10 | # Create a SequenceMatcher to compare the two word lists 11 | matcher = SequenceMatcher(None, words1, words2) 12 | 13 | # Calculate the similarity ratio 14 | similarity = matcher.ratio() 15 | 16 | # Calculate the difference ratio 17 | difference = 1 - similarity 18 | 19 | return difference 20 | 21 | def main(month, first_month_articles): 22 | ds = datasets.load_dataset('RealTimeData/bbc_news_alltime', month, split='train') 23 | # compare to first month 24 | content = '\n'.join(['\n'.join(article.splitlines()[:10]) for article in ds['content']]) 25 | difference = compare_texts(content, first_month_articles) 26 | 27 | print(f"Finished {month}, average difference: {difference}") 28 | return (month, difference) 29 | 30 | if __name__ == '__main__': 31 | 32 | months = [f'{year}-{month:02d}' for year in range(2017, 2024) for month in range(1, 13) if not (year == 2023 and month == 12)] 33 | first_month = datasets.load_dataset('RealTimeData/bbc_news_alltime', months[0], split='train') 34 | first_month_articles = '\n'.join(['\n'.join(article.splitlines()[:10]) for article in first_month['content']]) 35 | diffs = {} 36 | 37 | months = months[1:] 38 | # main got two arguments, month and first_month_articles 39 | # pool size 4 40 | with multiprocessing.Pool(8) as pool: 41 | for month, diff in pool.starmap(main, [(month, first_month_articles) for month in months]): 42 | diffs[month] = diff 43 | 44 | print(diffs) -------------------------------------------------------------------------------- /data/analyse_wikitext.py: -------------------------------------------------------------------------------- 1 | from difflib import SequenceMatcher 2 | import datasets 3 | import multiprocessing 4 | 5 | def compare_texts(text1, text2): 6 | # Split the texts into words 7 | words1 = text1.split() 8 | words2 = text2.split() 9 | 10 | # Create a SequenceMatcher to compare the two word lists 11 | matcher = SequenceMatcher(None, words1, words2) 12 | 13 | # Calculate the similarity ratio 14 | similarity = matcher.ratio() 15 | 16 | # Calculate the difference ratio 17 | difference = 1 - similarity 18 | 19 | return difference 20 | 21 | def main(month, first_month_articles): 22 | ds = datasets.load_dataset('RealTimeData/wikitext_alltime', month, split='train') 23 | # compare to first month 24 | diffs = [] 25 | for article in ds: 26 | title = article['title'] 27 | text = article['text'] 28 | if title not in first_month_articles: 29 | print(f"Article {title} not found in first month") 30 | continue 31 | first_month_text = first_month_articles[title] 32 | difference = compare_texts(text, first_month_text) 33 | diffs.append(difference) 34 | 35 | avg_diff = sum(diffs) 
/ len(diffs) 36 | print(f"Finished {month}, average difference: {avg_diff}") 37 | return (month, avg_diff) 38 | 39 | if __name__ == '__main__': 40 | 41 | months = [f'{year}-{month:02d}' for year in range(2017, 2024) for month in range(1, 13) if not (year == 2023 and month == 12)] 42 | first_month = datasets.load_dataset('RealTimeData/wikitext_alltime', months[0], split='train') 43 | first_month_articles = {title: article for title, article in zip(first_month['title'], first_month['text'])} 44 | diffs = {} 45 | 46 | months = months[1:] 47 | # main got two arguments, month and first_month_articles 48 | # pool size 4 49 | with multiprocessing.Pool(8) as pool: 50 | for month, diff in pool.starmap(main, [(month, first_month_articles) for month in months]): 51 | diffs[month] = diff 52 | 53 | print(diffs) -------------------------------------------------------------------------------- /figs/winrate.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import plotly.express as px 3 | 4 | def compute_pairwise_win_fraction(battles): 5 | # Times each model wins as Model A 6 | a_win_ptbl = pd.pivot_table( 7 | battles[battles['winner'] == "model_a"], 8 | index="model_a", columns="model_b", aggfunc="size", fill_value=0) 9 | 10 | # Table counting times each model wins as Model B 11 | b_win_ptbl = pd.pivot_table( 12 | battles[battles['winner'] == "model_b"], 13 | index="model_a", columns="model_b", aggfunc="size", fill_value=0) 14 | 15 | # Table counting number of A-B pairs 16 | num_battles_ptbl = pd.pivot_table(battles, 17 | index="model_a", columns="model_b", aggfunc="size", fill_value=0) 18 | 19 | # Computing the proportion of wins for each model as A and as B 20 | # against all other models 21 | row_beats_col_freq = ( 22 | (a_win_ptbl + b_win_ptbl.T) / 23 | (num_battles_ptbl + num_battles_ptbl.T) 24 | ) 25 | 26 | # Arrange ordering according to proprition of wins 27 | prop_wins = row_beats_col_freq.mean(axis=1).sort_values(ascending=False) 28 | model_names = list(prop_wins.keys()) 29 | row_beats_col = row_beats_col_freq.loc[model_names, model_names] 30 | return row_beats_col 31 | 32 | def visualize_pairwise_win_fraction(battles, title): 33 | row_beats_col = compute_pairwise_win_fraction(battles) 34 | fig = px.imshow(row_beats_col, color_continuous_scale='RdBu', 35 | text_auto=".2f", title=title) 36 | fig.update_layout( 37 | # xaxis_title=" Model B: Loser", 38 | # yaxis_title="Model A: Winner", 39 | xaxis_title=None, 40 | yaxis_title=None, 41 | xaxis_side="top", height=700, width=600, 42 | title_y=0.07, title_x=0.5) 43 | fig.update_traces(hovertemplate= 44 | "Model A: %{y}
<br>Model B: %{x}<br>
Fraction of A Wins: %{z}") 45 | 46 | return fig 47 | 48 | df = pd.read_csv("figs/winrate2.tsv", sep='\t') 49 | df = df[df['winner'].isin(['model_a', 'model_b'])] 50 | df = df[df['model_a']!=df['model_b']] 51 | 52 | fig = visualize_pairwise_win_fraction(df, 53 | title = "Pair-wise Win Rate") 54 | 55 | fig.show() 56 | 57 | fig.update_layout( 58 | font=dict( 59 | size=18, 60 | ), 61 | ) 62 | fig.write_image("fig.png", width=700, height=650, scale=2) -------------------------------------------------------------------------------- /data/wiki_dataset.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import traceback 3 | import mwparserfromhell 4 | import datetime 5 | import os 6 | 7 | import sys 8 | import json 9 | import time 10 | 11 | from tqdm import tqdm 12 | 13 | WIKI_API_ENDPOINT = "https://en.wikipedia.org/w/api.php" 14 | 15 | def parse_to_plain_text(wikitext): 16 | parsed = mwparserfromhell.parse(wikitext) 17 | return parsed.strip_code() 18 | 19 | def fetch_content(title, date=None): 20 | params = { 21 | "action": "query", 22 | "format": "json", 23 | "titles": title, 24 | "prop": "revisions", 25 | "rvprop": "content", 26 | "rvlimit": "1", 27 | } 28 | if date: params["rvstart"] = date 29 | try: 30 | response = requests.get(WIKI_API_ENDPOINT, params=params) 31 | response.raise_for_status() # Will raise an error if the HTTP request returned an unsuccessful status code 32 | data = response.json() 33 | if 'error' in data: 34 | print(f"Error fetching content for {title}: {data['error']['info']}") 35 | return None 36 | 37 | page = next(iter(data['query']['pages'].values())) 38 | if 'revisions' not in page: 39 | print(f"No revisions found for {title}") 40 | return None 41 | content = page['revisions'][0]['*'] 42 | 43 | # Check if the content is a redirect and skip if true 44 | if content.lower().startswith("#redirect"): 45 | print(f"{title} is a redirect page.") 46 | return None 47 | text = parse_to_plain_text(content) 48 | if len(text.split(' ')) < 300: 49 | print(f"{title} is less than 300 words.") 50 | return None 51 | 52 | return { 53 | "title": page['title'], 54 | "text": text, 55 | "pageid": page['pageid'], 56 | }, content 57 | 58 | except Exception as e: 59 | print(f"An error occurred while fetching content for {title}: {str(e)}") 60 | traceback.print_exc() # This will print the full traceback 61 | 62 | return None 63 | 64 | if __name__ == "__main__": 65 | year, month, save_path = sys.argv[1:] 66 | month = int(month)%12 + 1 67 | 68 | start_time = datetime.datetime(int(year), month, 1) 69 | end_time = start_time + datetime.timedelta(days=28) 70 | 71 | print(f'Fetching wiki articles from {start_time.isoformat()} to {end_time.isoformat()}') 72 | 73 | with open('/user/HS502/yl02706/LatestEval/data/squad_wiki_title.text') as f: 74 | titles = [line.strip() for line in f.readlines()] 75 | historical_contents = [fetch_content(title, end_time) for title in tqdm(titles)] 76 | historical_contents = [content[0] for content in historical_contents if content is not None] 77 | historical_to_save = {title: content for title, content in zip(titles, historical_contents)} 78 | 79 | save_file = os.path.join(save_path, f'{year}-{month}.json') 80 | with open(save_file, 'w') as f: 81 | json.dump(historical_to_save, f, ensure_ascii=False) 82 | print(f'Saved {len(historical_contents)} articles to {save_file}') -------------------------------------------------------------------------------- /.github/workflows/weekly_downloader.yml: 
-------------------------------------------------------------------------------- 1 | name: Weekly Downloader 2 | 3 | on: 4 | schedule: 5 | # This cron job initiates the action at 00:00 every Sunday 6 | - cron: '0 0 * * 1' 7 | 8 | jobs: 9 | wiki_downloader: 10 | runs-on: ubuntu-latest 11 | 12 | # Define environment variables for all steps in this job 13 | env: 14 | HF_TOKEN: ${{ secrets.HF_TOKEN }} 15 | Github_Token: ${{ secrets.gh_token }} 16 | 17 | steps: 18 | - name: Checkout repository 19 | uses: actions/checkout@v2 20 | 21 | - name: Set up Python 22 | uses: actions/setup-python@v2 23 | with: 24 | python-version: '3.10' # Choose your desired Python version 25 | 26 | - name: Install dependencies 27 | run: | 28 | python -m pip install --upgrade pip 29 | pip install -r requirements.txt 30 | 31 | - name: Run script 32 | run: python wikitext_downloader.py 33 | 34 | arxiv_downloader: 35 | runs-on: ubuntu-latest 36 | 37 | # Define environment variables for all steps in this job 38 | env: 39 | HF_TOKEN: ${{ secrets.HF_TOKEN }} 40 | Github_Token: ${{ secrets.Github_Token }} 41 | 42 | steps: 43 | - name: Checkout repository 44 | uses: actions/checkout@v2 45 | 46 | - name: Set up Python 47 | uses: actions/setup-python@v2 48 | with: 49 | python-version: '3.10' # Choose your desired Python version 50 | 51 | - name: Install dependencies 52 | run: | 53 | python -m pip install --upgrade pip 54 | pip install -r requirements.txt 55 | 56 | - name: Run script 57 | run: python arxiv_downloader.py 58 | 59 | bbc_downloader: 60 | runs-on: ubuntu-latest 61 | 62 | # Define environment variables for all steps in this job 63 | env: 64 | HF_TOKEN: ${{ secrets.HF_TOKEN }} 65 | Github_Token: ${{ secrets.Github_Token }} 66 | 67 | steps: 68 | - name: Checkout repository 69 | uses: actions/checkout@v2 70 | 71 | - name: Set up Python 72 | uses: actions/setup-python@v2 73 | with: 74 | python-version: '3.10' # Choose your desired Python version 75 | 76 | - name: Install dependencies 77 | run: | 78 | python -m pip install --upgrade pip 79 | pip install -r requirements.txt 80 | 81 | - name: Run script 82 | run: python bbc_downloader.py 83 | 84 | github_downloader: 85 | runs-on: ubuntu-latest 86 | 87 | # Define environment variables for all steps in this job 88 | env: 89 | HF_TOKEN: ${{ secrets.HF_TOKEN }} 90 | Github_Token: ${{ secrets.Github_Token }} 91 | 92 | steps: 93 | - name: Checkout repository 94 | uses: actions/checkout@v2 95 | 96 | - name: Set up Python 97 | uses: actions/setup-python@v2 98 | with: 99 | python-version: '3.10' # Choose your desired Python version 100 | 101 | - name: Install dependencies 102 | run: | 103 | python -m pip install --upgrade pip 104 | pip install -r requirements.txt 105 | 106 | - name: Run script 107 | run: python github_downloader.py 108 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 |

2 | <img src="figs/logo.png" alt="Logo of LatestEval">
3 |
4 | 5 | # "Uncheatable" LLMs Evaluation - LatestEval 6 | 7 | Humans receive new test questions in every exam, but LLMs? They have been evaluated with the same benchmarks for far too long. Why not assess LLMs with fresh tests, just like we test our students? In this project, we introduce LatestEval, which automatically constructs language model benchmarks from the latest materials (e.g., arXiv, BBC, Wikipedia) to prevent "cheating" and data contamination. 8 | 9 | **News!!** 10 | 11 | - **15 Dec, 2023** - This project was accepted to the main track of **AAAI 2024** :partying_face:! Check out the paper here: :point_right: [Dynamic Test Construction with Latest Materials](https://arxiv.org/abs/2312.12343). 12 | 13 | # Key Features 14 | 15 | 1. We maintain a QA benchmark that is updated every half month using the latest online resources (created within the past half month). This approach aims to avoid 1) LLMs being trained on the test set (cheating); and 2) the unintentional inclusion of test questions in the training data (data contamination). 16 | 2. We analyzed real human-AI conversations to ensure the automated benchmark aligns well with real-life applications (see the [paper](https://arxiv.org/abs/2312.12343) for more details). 17 | 18 | 19 | # The Benchmark 20 | 21 | Access the latest benchmark directly at the [Huggingface Hub](https://huggingface.co/LatestEval)! 22 | 23 | - Latest benchmark of GitHub: [HF Hub](https://huggingface.co/datasets/LatestEval/github-latest) 24 | - Latest benchmark of arXiv: [HF Hub](https://huggingface.co/datasets/LatestEval/arxiv-latest) 25 | - Latest benchmark of BBC: [HF Hub](https://huggingface.co/datasets/LatestEval/bbc-latest) 26 | - The full benchmark with all sources: [HF Hub](https://huggingface.co/datasets/LatestEval/full-latest) 27 | 28 | The benchmarks are built from the latest materials; you can find the raw materials/documents at the [Huggingface Hub](https://huggingface.co/RealTimeData). A minimal loading sketch is included below, after the Issue section. 29 | 30 | # Evaluate your LLM on LatestEval 31 | 32 | We will add LatestEval to [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) and [OpenCompass](https://github.com/open-compass/opencompass). Stay tuned. 33 | 34 | # Create benchmarks with your own data 35 | 36 | 1. Put your documents as `.txt` files under `./`. 37 | 2. Set your OpenAI key: 38 | 39 | ``` 40 | export OPENAI_API_KEY=<your_openai_api_key> 41 | ``` 42 | 43 | 3. Simply run: 44 | 45 | ``` 46 | python data_processor.py --source customized --file_path <path_to_your_txt_files> --num_docs 100 47 | ``` 48 | 49 | If you want to reproduce LatestEval on arXiv, BBC, or GitHub: 50 | 51 | ``` 52 | python data_processor.py --source arxiv --num_docs 100 53 | ``` 54 | 55 | # Issue 56 | 57 | Open an issue if you have any problems or would like to discuss.
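For a quick start, the following is a minimal loading sketch. It assumes the `datasets` library is installed; the dataset name and field names follow `push_to_hf_hub.py` in this repository.

```
# Minimal sketch: load the combined LatestEval benchmark from the Huggingface Hub.
from datasets import load_dataset

benchmark = load_dataset("LatestEval/full-latest", split="train")
sample = benchmark[0]
# Each sample pairs a passage (containing a placeholder where key information
# was removed) with a query asking for that information, plus the reference answer.
print(sample["source"], sample["query_category"])
print(sample["passage"])
print(sample["query"], "->", sample["answer"])
```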
58 | 59 | # Citation 60 | 61 | If you find this project useful, consider cite this project: 62 | 63 | ``` 64 | @misc{li2023avoiding, 65 | title={Avoiding Data Contamination in Language Model Evaluation: Dynamic Test Construction with Latest Materials}, 66 | author={Yucheng Li and Frank Guerin and Chenghua Lin}, 67 | year={2023}, 68 | eprint={2312.12343}, 69 | archivePrefix={arXiv}, 70 | primaryClass={cs.CL} 71 | } 72 | ``` -------------------------------------------------------------------------------- /data/monthly_updater/monthly_wikitext.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import traceback 3 | import mwparserfromhell 4 | import datetime 5 | import os 6 | import datasets 7 | 8 | import sys 9 | import json 10 | import time 11 | 12 | from tqdm import tqdm 13 | 14 | WIKI_API_ENDPOINT = "https://en.wikipedia.org/w/api.php" 15 | 16 | def parse_to_plain_text(wikitext): 17 | parsed = mwparserfromhell.parse(wikitext) 18 | return parsed.strip_code() 19 | 20 | def fetch_content(title, date=None): 21 | params = { 22 | "action": "query", 23 | "format": "json", 24 | "titles": title, 25 | "prop": "revisions", 26 | "rvprop": "content", 27 | "rvlimit": "1", 28 | } 29 | if date: params["rvstart"] = date 30 | try: 31 | response = requests.get(WIKI_API_ENDPOINT, params=params) 32 | response.raise_for_status() # Will raise an error if the HTTP request returned an unsuccessful status code 33 | data = response.json() 34 | if 'error' in data: 35 | print(f"Error fetching content for {title}: {data['error']['info']}") 36 | return None 37 | 38 | page = next(iter(data['query']['pages'].values())) 39 | if 'revisions' not in page: 40 | print(f"No revisions found for {title}") 41 | return None 42 | content = page['revisions'][0]['*'] 43 | 44 | # Check if the content is a redirect and skip if true 45 | if content.lower().startswith("#redirect"): 46 | print(f"{title} is a redirect page.") 47 | return None 48 | text = parse_to_plain_text(content) 49 | if len(text.split(' ')) < 300: 50 | print(f"{title} is less than 300 words.") 51 | return None 52 | 53 | return { 54 | "title": page['title'], 55 | "text": text, 56 | "pageid": page['pageid'], 57 | } 58 | 59 | except Exception as e: 60 | print(f"An error occurred while fetching content for {title}: {str(e)}") 61 | traceback.print_exc() # This will print the full traceback 62 | 63 | return None 64 | 65 | if __name__ == "__main__": 66 | today = datetime.datetime.today() 67 | year = today.year 68 | month = today.month 69 | 70 | hf_token = os.environ['HF_TOKEN'] 71 | 72 | start_time = datetime.datetime(year, month, 1) 73 | end_time = today 74 | 75 | print(f'Fetching wiki articles from {start_time.isoformat()} to {end_time.isoformat()}') 76 | 77 | with open('./data/squad_wiki_title.text') as f: 78 | titles = [line.strip() for line in f.readlines()] 79 | historical_contents = [fetch_content(title, end_time) for title in tqdm(titles)] 80 | historical_contents = [content for content in historical_contents if content is not None] 81 | historical_to_save = {title: content for title, content in zip(titles, historical_contents)} 82 | 83 | save_file = f'{year}-{month}.json' 84 | with open(save_file, 'w') as f: 85 | json.dump(historical_to_save, f, ensure_ascii=False) 86 | print(f'Saved {len(historical_contents)} articles to {save_file}') 87 | 88 | from huggingface_hub import hf_hub_download, RepoCard, upload_file 89 | 90 | with open(save_file) as f: 91 | data = json.load(f) 92 | 93 | all_articles = [] 94 | for 
title, article in data.items(): 95 | article['time'] = f'{year}-{month:02d}' 96 | all_articles.append(article) 97 | 98 | ds = datasets.Dataset.from_list(all_articles) 99 | ds.push_to_hub(f"RealTimeData/wikitext_alltime", config_name=f'{year}-{month:02d}', token=hf_token) -------------------------------------------------------------------------------- /data/wikitext_alltime.py: -------------------------------------------------------------------------------- 1 | import datasets 2 | import json 3 | 4 | dl = datasets.DownloadManager() 5 | configs_file = dl.download('https://huggingface.co/datasets/RealTimeData/wikitext_alltime/raw/main/configs.txt') 6 | 7 | with open(configs_file, encoding="utf-8") as f: 8 | _TIMES = f.read().splitlines() 9 | 10 | _TIMES += ['all'] 11 | 12 | _CITATION = """\ 13 | @misc{li2023estimating, 14 | title={Estimating Contamination via Perplexity: Quantifying Memorisation in Language Model Evaluation}, 15 | author={Yucheng Li}, 16 | year={2023}, 17 | eprint={2309.10677}, 18 | archivePrefix={arXiv}, 19 | primaryClass={cs.CL} 20 | } 21 | """ 22 | 23 | _DESCRIPTION = """\ 24 | This dataset contains Wikipedia articles of 419 selected pages every month from 2017-1 to current. The articles are arraged by month. Access the specific month by using the format "YYYY-MM" as config. Such as load_dataset("RealTimeData/wikitext_alltime", "2021-1"). 25 | """ 26 | 27 | _HOMEPAGE = "https://github.com/liyucheng09/Contamination_Detector" 28 | 29 | class Wikitext_alltimes(datasets.GeneratorBasedBuilder): 30 | 31 | BUILDER_CONFIGS = [ 32 | datasets.BuilderConfig( 33 | name=time, version=datasets.Version("1.0.0"), description=f"419 selected wikipedia articles edited in the priod of {time}" 34 | ) 35 | for time in _TIMES 36 | ] 37 | 38 | def _info(self): 39 | features = datasets.Features( 40 | { 41 | "title": datasets.Value("string"), 42 | "pageid": datasets.Value("int64"), 43 | "text": datasets.Value("string"), 44 | "time": datasets.Value("string"), 45 | } 46 | ) 47 | return datasets.DatasetInfo( 48 | description=_DESCRIPTION, 49 | features=features, 50 | homepage=_HOMEPAGE, 51 | citation=_CITATION, 52 | ) 53 | 54 | def _split_generators(self, dl_manager): 55 | """Returns SplitGenerators.""" 56 | if self.config.name == "all": 57 | times = _TIMES[:-1] 58 | files = dl_manager.download([f"wiki/{time}.json" for time in _TIMES ]) 59 | return [ 60 | datasets.SplitGenerator( 61 | name=datasets.Split.TRAIN, 62 | gen_kwargs={"files": files}, 63 | ) 64 | ] 65 | else: 66 | time = self.config.name 67 | _URL = f"wiki/{time}.json" 68 | file = dl_manager.download(_URL) 69 | return [ 70 | datasets.SplitGenerator( 71 | name=datasets.Split.TRAIN, 72 | gen_kwargs={"files": file}, 73 | ) 74 | ] 75 | 76 | def _generate_examples(self, files): 77 | """Yields examples.""" 78 | if self.config.name == "all": 79 | assert isinstance(files, list) 80 | for file in files: 81 | time = file.strip('.json') 82 | with open(file, encoding="utf-8") as f: 83 | data = json.load(f) 84 | for title, article in data.items(): 85 | yield f'{time}-{title}', { 86 | "title": article['title'], 87 | "pageid": article['pageid'], 88 | "text": article['text'], 89 | "time": time, 90 | } 91 | else: 92 | assert isinstance(files, str) 93 | time = self.config.name 94 | with open(files, encoding="utf-8") as f: 95 | data = json.load(f) 96 | for title, article in data.items(): 97 | yield f'{time}-{title}', { 98 | "title": article['title'], 99 | "pageid": article['pageid'], 100 | "text": article['text'], 101 | "time": time, 102 | } 
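# --- Usage sketch (illustrative addition; assumes the `datasets` library) ---
# The builder above is consumed through `datasets.load_dataset` with a month
# config such as "2021-1", or the special "all" config for every month. Newer
# releases of `datasets` may additionally require trust_remote_code=True for
# script-based datasets such as this one.
if __name__ == "__main__":
    one_month = datasets.load_dataset("RealTimeData/wikitext_alltime", "2021-1", split="train")
    print(one_month)                          # fields: title, pageid, text, time
    print(one_month[0]["title"], one_month[0]["time"])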
-------------------------------------------------------------------------------- /push_to_hf_hub.py: -------------------------------------------------------------------------------- 1 | # Merge with RealTimeData/ and push to Huggingface Hub 2 | 3 | from glob import glob 4 | import datasets 5 | import json 6 | from huggingface_hub import RepoCard, create_branch, create_tag 7 | from data_processor import ArxivEval, BBCNewsEval, GithubEval 8 | import datetime 9 | 10 | if __name__ == "__main__": 11 | # Load the dataset 12 | # for example, benchmarks/latest/qa_pairs_arxiv_2023-46.json 13 | 14 | today = datetime.date.today() 15 | 16 | RepoCardText = """ 17 | # LatestEval for {source} 18 | 19 | This benchmark was created with at {year} week {week} with the latest data from {source}. 20 | 21 | check more details at our [github page](https://github.com/liyucheng09/LatestEval).""" 22 | 23 | source2ds = {} 24 | latest_ds = [] 25 | 26 | for file in glob('benchmarks/2023-51/*.json'): 27 | with open(file, 'r') as f: 28 | data = json.load(f) 29 | 30 | if 'arxiv' in file: 31 | source = 'arxiv' 32 | docs = ArxivEval('RealTimeData/arxiv_latest', num_docs='all').docs 33 | elif 'bbc' in file: 34 | source = 'bbc' 35 | docs = BBCNewsEval('RealTimeData/bbc_latest', num_docs='all').docs 36 | elif 'github' in file: 37 | source = 'github' 38 | docs = GithubEval('RealTimeData/github_latest', num_docs='all').docs 39 | 40 | source2ds[source] = data 41 | 42 | time_stamp = file.split('_')[-1].split('.')[0] 43 | year = time_stamp.split('-')[0] 44 | week = time_stamp.split('-')[1] 45 | 46 | test_samples = [] 47 | for doc in data: 48 | doc_id = doc['id'][len(source)+1:] 49 | sents = None 50 | for d in docs: 51 | if d.entry_id == doc_id: 52 | sents = d.original_sentences 53 | assert sents is not None, f'{doc_id} not found in {source} data' 54 | 55 | if isinstance(doc['response'], str): 56 | try: 57 | doc['response'] = eval(doc['response']) 58 | except: 59 | print(doc['response']) 60 | continue 61 | 62 | for example in doc['response']: 63 | sent_index = example['sentence_index'] 64 | passage = '' 65 | for sent_i, sent in enumerate(sents): 66 | if sent_i == sent_index: 67 | passage += example['place_holder'] + ' ' 68 | else: 69 | passage += sent + ' ' 70 | test_samples.append({ 71 | 'source': source, 72 | 'doc_id': doc_id, 73 | 'passage': passage, 74 | 'query': example['query'], 75 | 'answer': example['key_information'], 76 | 'query_category': example['answer_type'], 77 | 'sent_index': sent_index 78 | }) 79 | 80 | latest_ds.extend(test_samples) 81 | 82 | # dataset = datasets.Dataset.from_list(test_samples) 83 | # dataset.push_to_hub(f'LatestEval/{source}-latest', branch='main') 84 | # dataset.push_to_hub(f'LatestEval/{source}-{year}-week{week}') 85 | 86 | # card = RepoCard(RepoCardText.format(source=source, year=year, week=week)) 87 | # card.push_to_hub(f'LatestEval/{source}-latest', repo_type='dataset') 88 | # card.push_to_hub(f'LatestEval/{source}-{year}-week{week}', repo_type='dataset') 89 | 90 | # all three sources together 91 | # flatten the data and add source column 92 | 93 | dataset = datasets.Dataset.from_list(latest_ds) 94 | dataset.push_to_hub(f'LatestEval/full-latest', branch='main') 95 | dataset.push_to_hub(f'LatestEval/full-{year}-week{week}') 96 | 97 | card = RepoCard(RepoCardText.format(source='all', year=year, week=week)) 98 | card.push_to_hub(f'LatestEval/full-latest', repo_type='dataset') 99 | card.push_to_hub(f'LatestEval/full-{year}-week{week}', repo_type='dataset') 100 | 
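# --- Consumption sketch (illustrative addition; not called anywhere above) ---
# Shows how the benchmark pushed by this script is meant to be used downstream.
# `answer_fn` is a hypothetical callable wrapping whatever LLM you want to
# evaluate; it is an assumption, not something provided by this repository.
def preview_benchmark(answer_fn, n=3, repo='LatestEval/full-latest'):
    bench = datasets.load_dataset(repo, split='train')
    for sample in bench.select(range(n)):
        # The passage contains a placeholder where key information was removed;
        # the query asks the model to recover exactly that information.
        prompt = f"{sample['passage']}\n\nQuestion: {sample['query']}\nAnswer:"
        prediction = answer_fn(prompt)
        print(prediction, '| reference:', sample['answer'])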
-------------------------------------------------------------------------------- /figs/compare_strings.json: -------------------------------------------------------------------------------- 1 | { 2 | "prompts": [ 3 | "Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in", 4 | "Persian (/ˈpɜːrʒən, -ʃən/), also known by its endonym Farsi (فارسی fārsi (fɒːɾˈsiː) ( listen)), is one of the Western Iranian languages within the Indo-Iranian branch of the Indo-European language family. It is primarily", 5 | "The number of Malayalam speakers in Lakshadweep is 51,100, which is only 0.15% of the total number, but is as much as about 84% of the population of Lakshadweep. In all, Malayalis made up 3.22% of the total Indian population in 2001. Of the total 33,066,", 6 | "The British have a love-hate relationship with the NHS. According to researchers at the King's Fund, the public gave the NHS its worst rating since records began 40 years ago. Just 29% said they were satisfied with the NHS in 2022. And yet we still love it. A whopping 90% of" 7 | ], 8 | "predictions": [ 9 | " the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles \"Crazy in Love\" and \"Baby Boy\".", 10 | " spoken in Iran, Afghanistan (officially known as Dari since 1958),[3] and Tajikistan (officially known as Tajiki since the Soviet era),[4] and some other regions which historically were Persianate societies and considered part of Greater Iran. It is written right to left in the Persian alphabet, a modified variant of the Arabic script.", 11 | " 392 Malayalam speakers in India in 2001, 33,015,420 spoke the standard dialects, 19,643 spoke the Yerava dialect and 31,329 spoke non-standard regional variations like Eranadan. As per the 1991 census data, 28.85% of all Malayalam speakers in India spoke a second language and 19.64% of the total knew three or more languages.", 12 | " of Britons believe that the NHS is a crucial institution that should be preserved. The juxtaposition between dissatisfaction with the current state and overall reverence for the institution speaks volumes about the complex relationship the British public has with their healthcare system." 13 | ], 14 | "references": [ 15 | " the late 1990s as lead singer of the R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Her debut solo album, Dangerously in Love (2003), debuted at number one on the US Billboard 200 chart and earned her five Grammy Awards, solidifying her as a solo artist as well. Throughout her career, Beyoncé has sold over 100 million records worldwide as a solo artist and a further 60 million records with Destiny's Child, making her one of the best-selling music artists of all time. 
She has won 23 Grammy Awards and is the most", 16 | " spoken in Iran, Afghanistan (officially known as Dari since 1958), and Tajikistan (officially known as Tajiki since the Soviet era), and some other regions which historically were Persianate societies and considered part of Greater Iran. It is written in the Persian alphabet, a modified variant of the Arabic script, which itself evolved from the Aramaic alphabet.", 17 | " 392 Malayalam speakers in India in 2001, 33,015,420 spoke the standard dialects, 19,643 spoke the Yerava dialect and 31,329 spoke non-standard regional variations like Eranadan. As per the 1991 census data, 28.85% of all Malayalam speakers in India spoke a second language and 19.64% of the total knew three or more languages.", 18 | "of the public agrees the service should be free and available to everyone. But with more than seven million people on waiting lists, almost everyone knows someone who isn't getting the care they need. As the NHS approaches its 75th anniversary, politicians are falling over themselves to praise the service." 19 | ] 20 | } -------------------------------------------------------------------------------- /.github/workflows/monthly_updater.yml: -------------------------------------------------------------------------------- 1 | name: Monthly Updater 2 | 3 | on: 4 | schedule: 5 | # This cron job initiates the action at 00:00 on the 28th of every month 6 | - cron: '0 0 28 * *' 7 | 8 | workflow_dispatch: 9 | 10 | jobs: 11 | wiki_downloader: 12 | runs-on: ubuntu-latest 13 | 14 | # Define environment variables for all steps in this job 15 | env: 16 | HF_TOKEN: ${{ secrets.HF_TOKEN }} 17 | Github_Token: ${{ secrets.gh_token }} 18 | Overflow_Token: ${{ secrets.overflow_token }} 19 | 20 | steps: 21 | - name: Checkout repository 22 | uses: actions/checkout@v2 23 | 24 | - name: Set up Python 25 | uses: actions/setup-python@v2 26 | with: 27 | python-version: '3.10' # Choose your desired Python version 28 | 29 | - name: Install dependencies 30 | run: | 31 | python -m pip install --upgrade pip 32 | pip install -r requirements.txt 33 | 34 | - name: Run script 35 | run: python data/monthly_updater/monthly_wikitext.py 36 | 37 | arxiv_downloader: 38 | runs-on: ubuntu-latest 39 | 40 | # Define environment variables for all steps in this job 41 | env: 42 | HF_TOKEN: ${{ secrets.HF_TOKEN }} 43 | Github_Token: ${{ secrets.gh_token }} 44 | Overflow_Token: ${{ secrets.overflow_token }} 45 | 46 | steps: 47 | - name: Checkout repository 48 | uses: actions/checkout@v2 49 | 50 | - name: Set up Python 51 | uses: actions/setup-python@v2 52 | with: 53 | python-version: '3.10' # Choose your desired Python version 54 | 55 | - name: Install dependencies 56 | run: | 57 | python -m pip install --upgrade pip 58 | pip install -r requirements.txt 59 | 60 | - name: Run script 61 | run: python data/monthly_updater/monthly_arxiv.py 62 | 63 | bbc_downloader: 64 | runs-on: ubuntu-latest 65 | 66 | # Define environment variables for all steps in this job 67 | env: 68 | HF_TOKEN: ${{ secrets.HF_TOKEN }} 69 | Github_Token: ${{ secrets.gh_token }} 70 | Overflow_Token: ${{ secrets.overflow_token }} 71 | 72 | steps: 73 | - name: Checkout repository 74 | uses: actions/checkout@v2 75 | 76 | - name: Set up Python 77 | uses: actions/setup-python@v2 78 | with: 79 | python-version: '3.10' # Choose your desired Python version 80 | 81 | - name: Install dependencies 82 | run: | 83 | python -m pip install --upgrade pip 84 | pip install -r requirements.txt 85 | 86 | - name: Run script 87 | run: python 
data/monthly_updater/monthly_bbc_news.py 88 | 89 | math_downloader: 90 | runs-on: ubuntu-latest 91 | 92 | # Define environment variables for all steps in this job 93 | env: 94 | HF_TOKEN: ${{ secrets.HF_TOKEN }} 95 | Github_Token: ${{ secrets.gh_token }} 96 | Overflow_API_KEY: ${{ secrets.overflow_token }} 97 | 98 | steps: 99 | - name: Checkout repository 100 | uses: actions/checkout@v2 101 | 102 | - name: Set up Python 103 | uses: actions/setup-python@v2 104 | with: 105 | python-version: '3.10' # Choose your desired Python version 106 | 107 | - name: Install dependencies 108 | run: | 109 | python -m pip install --upgrade pip 110 | pip install -r requirements.txt 111 | 112 | - name: Run script 113 | run: python data/monthly_updater/monthly_math.py 114 | 115 | code_downloader: 116 | runs-on: ubuntu-latest 117 | 118 | # Define environment variables for all steps in this job 119 | env: 120 | HF_TOKEN: ${{ secrets.HF_TOKEN }} 121 | Github_Token: ${{ secrets.gh_token }} 122 | Overflow_Token: ${{ secrets.overflow_token }} 123 | 124 | steps: 125 | - name: Checkout repository 126 | uses: actions/checkout@v2 127 | 128 | - name: Set up Python 129 | uses: actions/setup-python@v2 130 | with: 131 | python-version: '3.10' # Choose your desired Python version 132 | 133 | - name: Install dependencies 134 | run: | 135 | python -m pip install --upgrade pip 136 | pip install -r requirements.txt 137 | 138 | - name: Run script 139 | run: python data/monthly_updater/monthly_code.py 140 | -------------------------------------------------------------------------------- /github_downloader.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import base64 3 | from datetime import datetime, timedelta 4 | import os 5 | import json 6 | from tqdm import tqdm 7 | from huggingface_hub import create_branch, create_tag, RepoCard 8 | import traceback 9 | 10 | github_token = os.environ['Github_Token'] 11 | headers = {'Authorization': f'token {github_token}'} 12 | 13 | hf_token = os.environ['HF_TOKEN'] 14 | 15 | today = datetime.now() 16 | start_date = today - timedelta(weeks=2) 17 | start_date_str = start_date.strftime("%Y-%m-%d") 18 | 19 | end_date = start_date + timedelta(days=7) 20 | end_date_str = end_date.strftime("%Y-%m-%d") 21 | 22 | out_path = f"dataset/github/{start_date_str}" 23 | if not os.path.exists(out_path): 24 | os.makedirs(out_path) 25 | 26 | def load_checkpoint(): 27 | try: 28 | with open(f'{start_date_str}_checkpoint.json', 'r') as f: 29 | checkpoint = json.load(f) 30 | return checkpoint.get('page', 1), checkpoint.get('last_repo_index', 0) 31 | except FileNotFoundError: 32 | return 1, 0 33 | 34 | def save_checkpoint(page, last_repo_index): 35 | with open(f'{start_date_str}_checkpoint.json', 'w') as f: 36 | json.dump({'page': page, 'last_repo_index': last_repo_index}, f) 37 | 38 | page, last_repo_index = 1, 0 39 | # page, last_repo_index = load_checkpoint() 40 | all_readmes = [] 41 | 42 | while True: 43 | response = requests.get(f'https://api.github.com/search/repositories?q=created:{start_date_str}..{end_date_str}&sort=stars&order=desc&per_page=100&page={page}', headers=headers) 44 | data = response.json() 45 | 46 | if 'items' not in data: 47 | break 48 | if not data['items']: 49 | break 50 | for repo in tqdm(data['items'][last_repo_index:]): 51 | owner = repo['owner']['login'] 52 | repo_name = repo['name'] 53 | 54 | full_name = repo['full_name'] 55 | url = repo['html_url'] 56 | description = repo['description'] 57 | stars = repo['stargazers_count'] 
58 | forks = repo['forks_count'] 59 | 60 | response = requests.get(f'https://api.github.com/repos/{owner}/{repo_name}/readme', headers=headers) 61 | readme_data = response.json() 62 | 63 | if 'content' in readme_data: 64 | readme_content = base64.b64decode(readme_data['content']).decode('utf-8') 65 | # print(f"Repository {repo_name} README content:") 66 | # print(readme_content) 67 | with open(f"{out_path}/{full_name.replace('/', '_')}_README.md", 'w') as f: 68 | readme_obj = {'full_name': full_name, 'url': url, 'description': description, 'readme': readme_content, 'stars': stars, 'forks': forks} 69 | all_readmes.append(readme_obj) 70 | json.dump(readme_obj, f, ensure_ascii=False) 71 | else: 72 | print(f"Repository {repo_name} doesn't have a README.") 73 | 74 | page += 1 75 | 76 | import datasets 77 | 78 | all_readmes = { k: [v[k] for v in all_readmes] for k in all_readmes[0].keys() } 79 | ds = datasets.Dataset.from_dict(all_readmes) 80 | 81 | try: 82 | create_branch("RealTimeData/github_latest", branch=start_date_str, repo_type="dataset", token=hf_token) 83 | except: 84 | traceback.print_exc() 85 | ds.push_to_hub("RealTimeData/github_latest", token=hf_token, branch='main') 86 | ds.push_to_hub("RealTimeData/github_latest", token=hf_token, branch=start_date_str) 87 | 88 | text = f""" 89 | # Latest GitHub Repositories 90 | 91 | You could always access the latest Github repos via this dataset. 92 | 93 | We update the dataset weekly, on every Sunday. So the dataset always provides the latest Github repos from the last week. 94 | 95 | The current dataset on main branch contains the latest Github Repos submitted from {start_date_str} to {end_date_str}. 96 | 97 | The data collection is conducted on {today.date().isoformat()}. 98 | 99 | Use the dataset via: 100 | ``` 101 | ds = datasets.load_dataset('RealTimeData/github_latest') 102 | ``` 103 | 104 | # Previsou versions 105 | 106 | You could access previous versions by requesting different branches. 107 | 108 | For example, you could find the 2023-08-06 version via: 109 | ``` 110 | ds = datasets.load_dataset('RealTimeData/github_latest', revision = '2023-08-06') 111 | ``` 112 | 113 | Check all available versions by clicking the "Files and versions" button on the top bar. 114 | """ 115 | card = RepoCard(text) 116 | card.push_to_hub('RealTimeData/github_latest', repo_type='dataset', token=hf_token) -------------------------------------------------------------------------------- /data/bbc_alltime.py: -------------------------------------------------------------------------------- 1 | import datasets 2 | import json 3 | 4 | dl = datasets.DownloadManager() 5 | configs_file = dl.download('https://huggingface.co/datasets/RealTimeData/bbc_alltime/raw/main/configs.txt') 6 | 7 | with open(configs_file, encoding="utf-8") as f: 8 | _TIMES = f.read().splitlines() 9 | 10 | _CITATION = """\ 11 | @misc{li2023estimating, 12 | title={Estimating Contamination via Perplexity: Quantifying Memorisation in Language Model Evaluation}, 13 | author={Yucheng Li}, 14 | year={2023}, 15 | eprint={2309.10677}, 16 | archivePrefix={arXiv}, 17 | primaryClass={cs.CL} 18 | } 19 | """ 20 | 21 | _DESCRIPTION = """\ 22 | This dataset contains BBC News articles for every month from 2017-1 to current. Access a specific month by using the format "YYYY-MM" as config. Such as load_dataset("RealTimeData/bbc_alltime", "2021-1"). 
23 | """ 24 | 25 | _HOMEPAGE = "https://github.com/liyucheng09/Contamination_Detector" 26 | 27 | class Bbc_alltimes(datasets.GeneratorBasedBuilder): 28 | 29 | BUILDER_CONFIGS = [ 30 | datasets.BuilderConfig( 31 | name=time, version=datasets.Version("1.0.0"), description=f"BBC News articles published in the priod of {time}" 32 | ) 33 | for time in _TIMES 34 | ] 35 | 36 | def _info(self): 37 | features = datasets.Features( 38 | { 39 | "title": datasets.Value("string"), 40 | "published_date": datasets.Value("string"), 41 | "authors": datasets.Value("string"), 42 | "description": datasets.Value("string"), 43 | "section": datasets.Value("string"), 44 | "content": datasets.Value("string"), 45 | "link": datasets.Value("string"), 46 | } 47 | ) 48 | return datasets.DatasetInfo( 49 | description=_DESCRIPTION, 50 | features=features, 51 | homepage=_HOMEPAGE, 52 | citation=_CITATION, 53 | ) 54 | 55 | def _split_generators(self, dl_manager): 56 | """Returns SplitGenerators.""" 57 | if self.config.name == "all": 58 | times = _TIMES[:-1] 59 | files = dl_manager.download([f"articles/{time}.json" for time in _TIMES ]) 60 | return [ 61 | datasets.SplitGenerator( 62 | name=datasets.Split.TRAIN, 63 | gen_kwargs={"files": files}, 64 | ) 65 | ] 66 | else: 67 | time = self.config.name 68 | _URL = f"articles/{time}.json" 69 | file = dl_manager.download(_URL) 70 | return [ 71 | datasets.SplitGenerator( 72 | name=datasets.Split.TRAIN, 73 | gen_kwargs={"files": file}, 74 | ) 75 | ] 76 | 77 | def _generate_examples(self, files): 78 | """Yields examples.""" 79 | if self.config.name == "all": 80 | assert isinstance(files, list) 81 | for file in files: 82 | time = file.strip('.json') 83 | with open(file, encoding="utf-8") as f: 84 | data = json.load(f) 85 | length = len(data['title']) 86 | for i in range(length): 87 | yield f'{time}-{i}', { 88 | "title": data['title'][i], 89 | "published_date": data['published_date'][i], 90 | "authors": data['authors'][i], 91 | "description": data['description'][i], 92 | "section": data['section'][i], 93 | "content": data['content'][i], 94 | "link": data['link'][i], 95 | } 96 | else: 97 | assert isinstance(files, str) 98 | time = self.config.name 99 | with open(files, encoding="utf-8") as f: 100 | data = json.load(f) 101 | length = len(data['title']) 102 | for i in range(length): 103 | yield f'{time}-{i}', { 104 | "title": data['title'][i], 105 | "published_date": data['published_date'][i], 106 | "authors": data['authors'][i], 107 | "description": data['description'][i], 108 | "section": data['section'][i], 109 | "content": data['content'][i], 110 | "link": data['link'][i], 111 | } -------------------------------------------------------------------------------- /data/maintain_wikitext_latest.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import traceback 3 | import mwparserfromhell 4 | import datetime 5 | import os 6 | 7 | import sys 8 | import json 9 | import time 10 | 11 | from tqdm import tqdm 12 | 13 | WIKI_API_ENDPOINT = "https://en.wikipedia.org/w/api.php" 14 | 15 | def parse_to_plain_text(wikitext): 16 | parsed = mwparserfromhell.parse(wikitext) 17 | return parsed.strip_code() 18 | 19 | def fetch_content(title, date=None): 20 | params = { 21 | "action": "query", 22 | "format": "json", 23 | "titles": title, 24 | "prop": "revisions", 25 | "rvprop": "content", 26 | "rvlimit": "1", 27 | } 28 | if date: params["rvstart"] = date 29 | try: 30 | response = requests.get(WIKI_API_ENDPOINT, params=params) 31 | 
response.raise_for_status() # Will raise an error if the HTTP request returned an unsuccessful status code 32 | data = response.json() 33 | if 'error' in data: 34 | print(f"Error fetching content for {title}: {data['error']['info']}") 35 | return None 36 | 37 | page = next(iter(data['query']['pages'].values())) 38 | if 'revisions' not in page: 39 | print(f"No revisions found for {title}") 40 | return None 41 | content = page['revisions'][0]['*'] 42 | 43 | # Check if the content is a redirect and skip if true 44 | if content.lower().startswith("#redirect"): 45 | print(f"{title} is a redirect page.") 46 | return None 47 | text = parse_to_plain_text(content) 48 | if len(text.split(' ')) < 300: 49 | print(f"{title} is less than 300 words.") 50 | return None 51 | 52 | return { 53 | "title": page['title'], 54 | "text": text, 55 | "pageid": page['pageid'], 56 | } 57 | 58 | except Exception as e: 59 | print(f"An error occurred while fetching content for {title}: {str(e)}") 60 | traceback.print_exc() # This will print the full traceback 61 | 62 | return None 63 | 64 | if __name__ == "__main__": 65 | today = datetime.date.today() 66 | year = today.year 67 | month = today.month 68 | 69 | hf_token = os.environ['HF_TOKEN'] 70 | 71 | start_time = datetime.datetime(year, month, 1) 72 | end_time = today 73 | 74 | print(f'Fetching wiki articles from {start_time.isoformat()} to {end_time.isoformat()}') 75 | 76 | with open('./data/squad_wiki_title.text') as f: 77 | titles = [line.strip() for line in f.readlines()] 78 | historical_contents = [fetch_content(title, end_time) for title in tqdm(titles)] 79 | historical_contents = [content for content in historical_contents if content is not None] 80 | historical_to_save = {content['title']: content for content in historical_contents} # key by the fetched title so skipped articles don't shift the title-content pairing 81 | 82 | save_file = f'{year}-{month}.json' 83 | with open(save_file, 'w') as f: 84 | json.dump(historical_to_save, f, ensure_ascii=False) 85 | print(f'Saved {len(historical_contents)} articles to {save_file}') 86 | 87 | from huggingface_hub import hf_hub_download, RepoCard, upload_file 88 | 89 | upload_file( 90 | path_or_fileobj = save_file, 91 | path_in_repo = f'wiki/{year}-{month}.json', 92 | repo_id = 'RealTimeData/wikitext_alltime', 93 | repo_type = 'dataset', 94 | token=hf_token, 95 | ) 96 | 97 | file = hf_hub_download(repo_id="RealTimeData/wikitext_alltime", filename="configs.txt", repo_type='dataset') 98 | with open(file) as f: 99 | times = f.read().splitlines() 100 | times.append(f'{year}-{month}') 101 | 102 | with open('configs.txt', 'w') as f: 103 | f.write('\n'.join(times)) 104 | 105 | upload_file( 106 | path_or_fileobj = 'configs.txt', 107 | path_in_repo = 'configs.txt', 108 | repo_id = 'RealTimeData/wikitext_alltime', 109 | repo_type = 'dataset', 110 | token=hf_token, 111 | ) 112 | 113 | text = f""" 114 | # Wikitext for All Times 115 | 116 | You could find 491 selected wiki articles for every month from 2017-1 to {year}-{month}. 117 | 118 | Use this to download wiki articles during a specific month: 119 | ``` 120 | ds = datasets.load_dataset('RealTimeData/wikitext_alltime', '2017-8') 121 | ``` 122 | 123 | The time stamp follows the format of "YYYY-MM".
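You could also list every available month programmatically (a small sketch, assuming a recent release of the `datasets` library):
```
import datasets
available_months = datasets.get_dataset_config_names('RealTimeData/wikitext_alltime')
```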
124 | 125 | # An example 126 | 127 | ``` 128 | > ds = datasets.load_dataset('RealTimeData/wikitext_alltime', '2023-10', split='train') 129 | > ds[0] 130 | 131 | {'title': 'Queen Victoria', 132 | 'pageid': 47923, 133 | 'text': 'Victoria (Alexa ...', 134 | 'time': '2023-10'} 135 | ``` 136 | """ 137 | card = RepoCard(text) 138 | card.push_to_hub('RealTimeData/wikitext_alltime', repo_type='dataset', token=hf_token) -------------------------------------------------------------------------------- /data/audio_dataset.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import yt_dlp 3 | from googleapiclient.discovery import build 4 | import os 5 | import sys 6 | import re 7 | from glob import glob 8 | import datasets 9 | import time 10 | import random 11 | import soundfile as sf 12 | import struct 13 | import numpy as np 14 | 15 | def get_popular_videos(youtube, start_date, end_date, max_results=30): 16 | published_after = start_date.strftime("%Y-%m-%dT%H:%M:%SZ") 17 | published_before = end_date.strftime("%Y-%m-%dT%H:%M:%SZ") 18 | request = youtube.search().list( 19 | part="snippet", 20 | maxResults=max_results, 21 | order="viewCount", 22 | publishedAfter=published_after, 23 | publishedBefore=published_before, 24 | type="video", 25 | ) 26 | response = request.execute() 27 | videos_ids = [item['id']['videoId'] for item in response['items']] 28 | return videos_ids 29 | 30 | def parse_duration(duration_string): 31 | hours = re.search(r'(\d+)H', duration_string) 32 | minutes = re.search(r'(\d+)M', duration_string) 33 | seconds = re.search(r'(\d+)S', duration_string) 34 | 35 | hours = int(hours.group(1)) if hours else 0 36 | minutes = int(minutes.group(1)) if minutes else 0 37 | seconds = int(seconds.group(1)) if seconds else 0 38 | 39 | return hours * 3600 + minutes * 60 + seconds 40 | 41 | def filter_too_long_video(youtube, video_ids, max_duration=600, max_results=30): 42 | request = youtube.videos().list( 43 | part="contentDetails", 44 | id=','.join(video_ids), 45 | ) 46 | response = request.execute() 47 | 48 | final_videos = [] 49 | for item in response['items']: 50 | duration = parse_duration(item['contentDetails']['duration']) 51 | if duration <= max_duration: 52 | final_videos.append(item['id']) 53 | if len(final_videos) >= max_results: 54 | break 55 | return final_videos 56 | 57 | def download_audio(video_id, save_path): 58 | ydl_opts = { 59 | 'format': 'bestaudio/best', 60 | 'postprocessors': [{ 61 | 'key': 'FFmpegExtractAudio', 62 | 'preferredcodec': 'flac', 63 | 'preferredquality': '192', 64 | }], 65 | 'postprocessor_args': [ 66 | '-ar', '16000' # Set audio sample rate to 16 kHz 67 | ], 68 | 'outtmpl': os.path.join(save_path, '%(id)s.%(ext)s'), 69 | } 70 | with yt_dlp.YoutubeDL(ydl_opts) as ydl: 71 | ydl.download([f'http://www.youtube.com/watch?v={video_id}']) 72 | 73 | if __name__ == '__main__': 74 | 75 | month, save_path, = sys.argv[1:] 76 | month = int(month) + 1 77 | 78 | videos_per_month = 3 79 | api_key = os.environ['YOUTUBE_API_KEY'] 80 | youtube = build('youtube', 'v3', developerKey=api_key) 81 | 82 | # time_stamps = [f'{year}-{month:02d}' for year in range(2017, 2024)] 83 | time_stamps = [f'{year}-{month:02d}' for year in range(2017, 2024) for month in range(1, 13)] 84 | 85 | for time_stamp in time_stamps: 86 | files = glob(os.path.join(save_path, time_stamp, '*.flac')) 87 | print(f"Start {time_stamp}...") 88 | 89 | if not len(files) >= videos_per_month: 90 | year, month = time_stamp.split('-') 91 | 92 | start_date = 
datetime.date(int(year), int(month), 1) 93 | end_of_month = datetime.date(int(year), int(month), 28) 94 | 95 | video_ids = get_popular_videos(youtube, start_date, end_of_month, max_results=50) 96 | video_ids = filter_too_long_video(youtube, video_ids, max_duration=600, max_results=videos_per_month) 97 | for video in video_ids: 98 | download_audio(video, os.path.join(save_path, time_stamp)) 99 | 100 | print(f"Downloaded {len(video_ids)} videos in {time_stamp}") 101 | 102 | files = glob(os.path.join(save_path, time_stamp, '*.flac')) 103 | random.shuffle(files) 104 | files = files[:videos_per_month] 105 | instances = [] 106 | for file in files: 107 | data, samplerate = sf.read(file) 108 | if len(data.shape) > 1: 109 | data = data.mean(axis=1) 110 | denormalized_data = np.int16(data * 32767) 111 | byte_stream = b''.join(struct.pack('') 8 | result.append(f'
{title}
') 9 | result.append(f'
{prompt}
') 10 | s = SequenceMatcher(None, str1, str2) 11 | 12 | for opcode, a0, a1, b0, b1 in s.get_opcodes(): 13 | if opcode == 'equal': 14 | result.append(f'{str1[a0:a1]}') 15 | else: 16 | result.append(str1[a0:a1]) 17 | result.append('') 18 | all_result.append(''.join(result)) 19 | 20 | return ''.join(all_result) 21 | 22 | # Test the function 23 | # prompts = [ 24 | # """Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in""", 25 | # """Persian (/ˈpɜːrʒən, -ʃən/), also known by its endonym Farsi (فارسی fārsi (fɒːɾˈsiː) ( listen)), is one of the Western Iranian languages within the Indo-Iranian branch of the Indo-European language family. It is primarily""", 26 | # """The number of Malayalam speakers in Lakshadweep is 51,100, which is only 0.15% of the total number, but is as much as about 84% of the population of Lakshadweep. In all, Malayalis made up 3.22% of the total Indian population in 2001. Of the total 33,066,""", 27 | # """The British have a love-hate relationship with the NHS. According to researchers at the King's Fund, the public gave the NHS its worst rating since records began 40 years ago. Just 29% said they were satisfied with the NHS in 2022. And yet we still love it. A whopping 90% of""", 28 | # ] 29 | # predictions = [ 30 | # """ the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".""", 31 | # """ spoken in Iran, Afghanistan (officially known as Dari since 1958),[3] and Tajikistan (officially known as Tajiki since the Soviet era),[4] and some other regions which historically were Persianate societies and considered part of Greater Iran. It is written right to left in the Persian alphabet, a modified variant of the Arabic script.""", 32 | # """ 392 Malayalam speakers in India in 2001, 33,015,420 spoke the standard dialects, 19,643 spoke the Yerava dialect and 31,329 spoke non-standard regional variations like Eranadan. As per the 1991 census data, 28.85% of all Malayalam speakers in India spoke a second language and 19.64% of the total knew three or more languages.""", 33 | # """ of Britons believe that the NHS is a crucial institution that should be preserved. The juxtaposition between dissatisfaction with the current state and overall reverence for the institution speaks volumes about the complex relationship the British public has with their healthcare system.""" 34 | 35 | # ] 36 | # references = [ 37 | # """ the late 1990s as lead singer of the R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Her debut solo album, Dangerously in Love (2003), debuted at number one on the US Billboard 200 chart and earned her five Grammy Awards, solidifying her as a solo artist as well. Throughout her career, Beyoncé has sold over 100 million records worldwide as a solo artist and a further 60 million records with Destiny's Child, making her one of the best-selling music artists of all time. 
She has won 23 Grammy Awards and is the most""", 38 | # """ spoken in Iran, Afghanistan (officially known as Dari since 1958), and Tajikistan (officially known as Tajiki since the Soviet era), and some other regions which historically were Persianate societies and considered part of Greater Iran. It is written in the Persian alphabet, a modified variant of the Arabic script, which itself evolved from the Aramaic alphabet.""", 39 | # """ 392 Malayalam speakers in India in 2001, 33,015,420 spoke the standard dialects, 19,643 spoke the Yerava dialect and 31,329 spoke non-standard regional variations like Eranadan. As per the 1991 census data, 28.85% of all Malayalam speakers in India spoke a second language and 19.64% of the total knew three or more languages.""", 40 | # """of the public agrees the service should be free and available to everyone. But with more than seven million people on waiting lists, almost everyone knows someone who isn't getting the care they need. As the NHS approaches its 75th anniversary, politicians are falling over themselves to praise the service.""" 41 | # ] 42 | 43 | import json 44 | with open('figs/kanye.json', 'r') as f: 45 | data = json.load(f) 46 | 47 | prompts = data['prompts'] 48 | predictions = data['predictions'] 49 | references = data['references'] 50 | 51 | # benchmarks = ['squad', 'boolq', 'quac', 'LatestEval'] 52 | benchmarks = ['memorised', 'clean'] 53 | html_output = compare_strings(predictions, references, benchmarks, prompts) 54 | 55 | with open('output.html', 'w') as f: 56 | f.write(f""" 57 | 58 | 59 | 85 | 86 | 87 |
88 | {html_output} 89 |
90 | 91 | 92 | """) 93 | 94 | print("HTML output saved to 'output.html'") 95 | -------------------------------------------------------------------------------- /data/github_dataset.py: -------------------------------------------------------------------------------- 1 | from git import Repo 2 | import datetime 3 | import os 4 | import sys 5 | import multiprocessing 6 | import difflib 7 | import json 8 | import itertools 9 | import shutil 10 | 11 | def clone_repo(repo_url, local_path, overwrite=False): 12 | if os.path.exists(local_path): 13 | if overwrite: 14 | shutil.rmtree(local_path) 15 | else: 16 | print(f"Repo {local_path} already exists") 17 | return 18 | Repo.clone_from(repo_url, local_path) 19 | 20 | def get_file_content(commit, file_path): 21 | # Retrieves the file content for a given commit 22 | blob = commit.tree / file_path 23 | return blob.data_stream.read().decode('utf-8', errors='ignore') 24 | 25 | def compare_files(repo, diff_item, start_commit, end_commit, code_extensions): 26 | file_path = diff_item.b_path 27 | _, ext = os.path.splitext(file_path) 28 | if ext not in code_extensions: 29 | # print(f"Skipping {file_path} because it is not a code file") 30 | return None 31 | 32 | # If the change type is added, we append it anyway 33 | # If the change type is modified, we only append it if the file was significantly changed (more than 50% of its lines were changed) 34 | if diff_item.change_type == 'M': 35 | a_content = get_file_content(start_commit, diff_item.a_path) 36 | b_content = get_file_content(end_commit, diff_item.b_path) 37 | 38 | # Use difflib to compare contents 39 | diff = difflib.unified_diff( 40 | a_content.splitlines(keepends=True), 41 | b_content.splitlines(keepends=True), 42 | fromfile=diff_item.a_path, 43 | tofile=diff_item.b_path 44 | ) 45 | 46 | # Count the number of lines added 47 | changes = sum(1 for line in diff if line.startswith('+') and not line.startswith('++')) 48 | 49 | # if the file was not significantly changed, skip it. 
we consider a file significantly changed if more than 50% of its lines were changed 50 | if changes < 0.5 * len(a_content.splitlines()): 51 | return None 52 | elif diff_item.change_type == 'A': 53 | b_content = get_file_content(end_commit, diff_item.b_path) 54 | changes = len(b_content.splitlines()) 55 | elif diff_item.change_type == 'R': 56 | # skip renamed files 57 | return None 58 | else: 59 | print(diff_item.change_type) 60 | return None 61 | 62 | return { 63 | 'file_path': file_path, 64 | 'num_changed_lines': changes, 65 | 'code': b_content, 66 | } 67 | 68 | def get_monthly_diff_file_objects(local_path, start_date, end_date, code_extensions): 69 | repo = Repo(local_path) 70 | repo_name = local_path.split('/')[-1] 71 | file_changes = [] 72 | 73 | try: 74 | start_commit = next(repo.iter_commits(until=start_date)) 75 | end_commit = next(repo.iter_commits(until=end_date)) 76 | except StopIteration: 77 | print(f"Repo {local_path} has no commits in {start_date} - {end_date}") 78 | return file_changes 79 | 80 | start_commit_date = datetime.datetime.fromtimestamp(start_commit.committed_date).date() 81 | end_commit_date = datetime.datetime.fromtimestamp(end_commit.committed_date).date() 82 | end_commit_date_str = end_commit_date.strftime("%Y-%m-%d") 83 | 84 | if start_commit_date > end_date or end_commit_date < start_date: 85 | # print(f"Repo {local_path} has no commits in the given time range") 86 | return file_changes 87 | 88 | diff_index = start_commit.diff(end_commit, **{'find_renames=50%': True, 'insert_kwargs_after': '-r'}) 89 | 90 | for diff_item in diff_index.iter_change_type('M'): 91 | result = compare_files(repo, diff_item, start_commit, end_commit, code_extensions) 92 | if result: 93 | result['repo_name'] = repo_name 94 | result['commit_date'] = end_commit_date_str 95 | result['sha'] = end_commit.hexsha 96 | file_changes.append(result) 97 | 98 | for diff_item in diff_index.iter_change_type('A'): 99 | result = compare_files(repo, diff_item, start_commit, end_commit, code_extensions) 100 | if result: 101 | result['repo_name'] = repo_name 102 | result['commit_date'] = end_commit_date_str 103 | result['sha'] = end_commit.hexsha 104 | file_changes.append(result) 105 | 106 | # Ranking files by the extent of added lines 107 | ranked_files = sorted(file_changes, key=lambda x: x['num_changed_lines'], reverse=True) 108 | # print(f"Total {len(ranked_files)} files changed") 109 | return ranked_files 110 | 111 | def main(time_stamp, local_repo, save_path): 112 | # try: 113 | year, month = time_stamp.split('-') 114 | first_day = datetime.date(int(year), int(month), 1) 115 | last_day = datetime.date(int(year), int(month), 28) 116 | 117 | repo_name = local_repo.split('/')[-1] 118 | # print(f"Processing {repo_name} at {time_stamp}") 119 | 120 | save_path = os.path.join(save_path, time_stamp, repo_name) 121 | if not os.path.exists(save_path): 122 | os.makedirs(save_path) 123 | 124 | ranked_files = get_monthly_diff_file_objects(local_repo, first_day, last_day, code_extensions) 125 | for index, file in enumerate(ranked_files[:50]): 126 | save_file_path = os.path.join(save_path, f"{index}.json") 127 | with open(save_file_path, 'w') as f: 128 | json.dump(file, f, ensure_ascii=False, indent=2) 129 | return (time_stamp, repo_name, len(ranked_files)) 130 | 131 | if __name__ == '__main__': 132 | repo_path, save_dir, = sys.argv[1:] 133 | 134 | time_stamps = [f'{year}-{month:02d}' for year in range(2017, 2024) for month in range(1, 13)] 135 | time_stamps += [f'2024-{month:02d}' for month in range(1, 3)] 136 | 
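# Note: the grid above covers 2017-01 through 2024-02. Each (month, repo) pair below is
# diffed independently; a month in which a repo has no commits simply yields an empty result.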
137 | # pre_defined repos 138 | with open('/user/HS502/yl02706/LatestEval/data/code_repos.txt', 'r') as f: 139 | repos = f.readlines() 140 | 141 | print(f"Total {len(repos)} repos") 142 | 143 | # Prepare URLs and local paths 144 | urls = [f'https://github.com/{repo.strip()}.git' for repo in repos] 145 | local_paths = [os.path.join(repo_path, repo.replace('/', '_')).strip() for repo in repos] 146 | 147 | # clone repos 148 | with multiprocessing.Pool(2) as pool: 149 | pool.starmap(clone_repo, zip(urls, local_paths)) 150 | 151 | code_extensions = {'.py', '.js', '.java', '.cpp', '.c', '.cs', '.go', '.rb', '.php', '.ts', '.jsx', '.tsx', '.css', '.sh', '.pl', '.bat'} 152 | 153 | combinations = list(itertools.product(time_stamps, local_paths)) 154 | # combinations = sorted(combinations, key=lambda x: x[-1]) 155 | flattened_args = [(time_stamp, local_path, save_dir) for time_stamp, local_path in combinations] 156 | 157 | print(f"Total {len(flattened_args)} combinations") 158 | with multiprocessing.Pool(8) as pool: 159 | ALL_PROCESSED = pool.starmap(main, flattened_args) 160 | 161 | # use single process instead 162 | # ALL_PROCESSED = [] 163 | # for args in flattened_args: 164 | # ALL_PROCESSED.append(main(*args)) 165 | 166 | print(f"Total {len(ALL_PROCESSED)} processed") 167 | -------------------------------------------------------------------------------- /data/monthly_updater/monthly_code.py: -------------------------------------------------------------------------------- 1 | from git import Repo 2 | import datetime 3 | import os 4 | import sys 5 | import multiprocessing 6 | import difflib 7 | import json 8 | import itertools 9 | import shutil 10 | from glob import glob 11 | import datasets 12 | 13 | def clone_repo(repo_url, local_path, overwrite=False, since=None): 14 | if os.path.exists(local_path): 15 | if overwrite: 16 | shutil.rmtree(local_path) 17 | else: 18 | print(f"Repo {local_path} already exists") 19 | return 20 | Repo.clone_from(repo_url, local_path, multi_options=[f'--shallow-since={since}'] if since is not None else None) 21 | 22 | def get_file_content(commit, file_path): 23 | # Retrieves the file content for a given commit 24 | blob = commit.tree / file_path 25 | return blob.data_stream.read().decode('utf-8', errors='ignore') 26 | 27 | def compare_files(repo, diff_item, start_commit, end_commit, code_extensions): 28 | file_path = diff_item.b_path 29 | _, ext = os.path.splitext(file_path) 30 | if ext not in code_extensions: 31 | # print(f"Skipping {file_path} because it is not a code file") 32 | return None 33 | 34 | # If the change type is added, we append it anyway 35 | # If the change type is modified, we only append it if the file was significantly changed (more than 50% of its lines were changed) 36 | if diff_item.change_type == 'M': 37 | a_content = get_file_content(start_commit, diff_item.a_path) 38 | b_content = get_file_content(end_commit, diff_item.b_path) 39 | 40 | # Use difflib to compare contents 41 | diff = difflib.unified_diff( 42 | a_content.splitlines(keepends=True), 43 | b_content.splitlines(keepends=True), 44 | fromfile=diff_item.a_path, 45 | tofile=diff_item.b_path 46 | ) 47 | 48 | # Count the number of lines added 49 | changes = sum(1 for line in diff if line.startswith('+') and not line.startswith('++')) 50 | 51 | # if the file was not significantly changed, skip it. 
we consider a file significantly changed if more than 50% of its lines were changed 52 | if changes < 0.5 * len(a_content.splitlines()): 53 | return None 54 | elif diff_item.change_type == 'A': 55 | b_content = get_file_content(end_commit, diff_item.b_path) 56 | changes = len(b_content.splitlines()) 57 | elif diff_item.change_type == 'R': 58 | # skip renamed files 59 | return None 60 | else: 61 | print(diff_item.change_type) 62 | return None 63 | 64 | return { 65 | 'file_path': file_path, 66 | 'num_changed_lines': changes, 67 | 'code': b_content, 68 | } 69 | 70 | def get_monthly_diff_file_objects(local_path, start_date, end_date, code_extensions): 71 | repo = Repo(local_path) 72 | repo_name = local_path.split('/')[-1] 73 | file_changes = [] 74 | 75 | try: 76 | start_commit = next(repo.iter_commits(since=start_date, reverse=True)) 77 | end_commit = next(repo.iter_commits(until=end_date)) 78 | except StopIteration: 79 | print(f"Repo {local_path} has no commits in {start_date} - {end_date}") 80 | return file_changes 81 | 82 | start_commit_date = datetime.datetime.fromtimestamp(start_commit.committed_date).date() 83 | end_commit_date = datetime.datetime.fromtimestamp(end_commit.committed_date).date() 84 | end_commit_date_str = end_commit_date.strftime("%Y-%m-%d") 85 | 86 | if start_commit_date > end_date or end_commit_date < start_date: 87 | # print(f"Repo {local_path} has no commits in the given time range") 88 | return file_changes 89 | 90 | diff_index = start_commit.diff(end_commit, **{'find_renames=50%': True, 'insert_kwargs_after': '-r'}) 91 | 92 | for diff_item in diff_index.iter_change_type('M'): 93 | result = compare_files(repo, diff_item, start_commit, end_commit, code_extensions) 94 | if result: 95 | result['repo_name'] = repo_name 96 | result['commit_date'] = end_commit_date_str 97 | result['sha'] = end_commit.hexsha 98 | file_changes.append(result) 99 | 100 | for diff_item in diff_index.iter_change_type('A'): 101 | result = compare_files(repo, diff_item, start_commit, end_commit, code_extensions) 102 | if result: 103 | result['repo_name'] = repo_name 104 | result['commit_date'] = end_commit_date_str 105 | result['sha'] = end_commit.hexsha 106 | file_changes.append(result) 107 | 108 | # Ranking files by the extent of added lines 109 | ranked_files = sorted(file_changes, key=lambda x: x['num_changed_lines'], reverse=True) 110 | # print(f"Total {len(ranked_files)} files changed") 111 | return ranked_files 112 | 113 | def main(time_stamp, local_repo, save_path): 114 | year, month = time_stamp.split('-') 115 | first_day = datetime.date(int(year), int(month), 1) 116 | last_day = datetime.date(int(year), int(month), 28) 117 | 118 | repo_name = local_repo.split('/')[-1] 119 | # print(f"Processing {repo_name} at {time_stamp}") 120 | 121 | save_path = os.path.join(save_path, time_stamp, repo_name) 122 | if not os.path.exists(save_path): 123 | os.makedirs(save_path) 124 | 125 | ranked_files = get_monthly_diff_file_objects(local_repo, first_day, last_day, code_extensions) 126 | for index, file in enumerate(ranked_files[:50]): 127 | save_file_path = os.path.join(save_path, f"{index}.json") 128 | with open(save_file_path, 'w') as f: 129 | json.dump(file, f, ensure_ascii=False, indent=2) 130 | return (time_stamp, repo_name, len(ranked_files)) 131 | # print(f"Saved to {save_path}") 132 | 133 | if __name__ == '__main__': 134 | today = datetime.date.today() 135 | year = today.year 136 | month = today.month 137 | 138 | time_stamp = f'{year}-{month:02d}' 139 | first_day_string = 
f'{year}-{month:02d}-01' 140 | 141 | repo_path, save_dir = 'repos/', 'code_data/' 142 | repo_list = 'data/code_repos.txt' 143 | 144 | # time_stamps = [f'{year}-{month:02d}' for year in range(2017, 2024) for month in range(1, 13)] 145 | time_stamps = [time_stamp] 146 | 147 | # pre_defined repos 148 | with open(repo_list, 'r') as f: 149 | repos = f.readlines() 150 | 151 | print(f"Total {len(repos)} repos") 152 | 153 | # Prepare URLs and local paths 154 | urls = [f'https://github.com/{repo.strip()}.git' for repo in repos] 155 | local_paths = [os.path.join(repo_path, repo.replace('/', '_')).strip() for repo in repos] 156 | 157 | # clone repos 158 | sucess_paths = [] 159 | for url, local_path in zip(urls, local_paths): 160 | try: 161 | clone_repo(url, local_path, overwrite=True, since=first_day_string) 162 | sucess_paths.append(local_path) 163 | except: 164 | print(f"Failed to clone {url}") 165 | 166 | code_extensions = {'.py', '.js', '.java', '.cpp', '.c', '.cs', '.go', '.rb', '.php', '.ts', '.jsx', '.tsx', '.css', '.sh', '.pl', '.bat'} 167 | 168 | combinations = list(itertools.product(time_stamps, sucess_paths)) 169 | # combinations = sorted(combinations, key=lambda x: x[-1]) 170 | flattened_args = [(time_stamp, local_path, save_dir) for time_stamp, local_path in combinations] 171 | 172 | print(f"Total {len(flattened_args)} combinations") 173 | with multiprocessing.Pool(2) as pool: 174 | ALL_PROCESSED = pool.starmap(main, flattened_args) 175 | 176 | print(f"Total {len(ALL_PROCESSED)} processed") 177 | 178 | hf_token = os.environ['HF_TOKEN'] 179 | code_files = glob(f'{save_dir}/{time_stamp}/*/*.json') 180 | all_codes = [] 181 | for code in code_files: 182 | with open(code, 'r') as f: 183 | all_codes.append(json.load(f)) 184 | ds = datasets.Dataset.from_list(all_codes) 185 | print('='*20) 186 | print(f'Finished {time_stamp}') 187 | ds.push_to_hub(f'RealTimeData/code_alltime', config_name = time_stamp, token=hf_token) 188 | print(f'Pushed {time_stamp} to hub') -------------------------------------------------------------------------------- /eval/contamination.py: -------------------------------------------------------------------------------- 1 | import datasets 2 | from nltk.tokenize import sent_tokenize, word_tokenize 3 | from nltk import ngrams 4 | import pandas as pd 5 | import nltk 6 | from nltk.stem import WordNetLemmatizer 7 | import os 8 | import time 9 | import openai 10 | from tqdm import tqdm 11 | from transformers import GPT2TokenizerFast 12 | 13 | T = GPT2TokenizerFast.from_pretrained("gpt2") 14 | prompt_length = 250 15 | suffix_length = 500 - prompt_length 16 | 17 | def data_sampler(): 18 | quac = datasets.load_dataset("quac", split="validation") 19 | boolq = datasets.load_dataset("boolq", split="validation") 20 | squad = datasets.load_dataset("squad_v2", split="validation") 21 | 22 | latesteval_1 = datasets.load_dataset("RealTimeData/bbc_news_week1_july_2023", split="train") 23 | latesteval_2 = datasets.load_dataset("RealTimeData/github_july_week1_2023", split="train") 24 | latesteval_3 = datasets.load_dataset("RealTimeData/arxiv_july_week1_2023", split="train") 25 | 26 | def get_prefix_and_suffix(doc, dataset_name = None): 27 | if dataset_name is None: 28 | raise ValueError("dataset_name must be specified") 29 | if dataset_name == "quac" or dataset_name == "squad_v2": 30 | text = T(doc['context']).input_ids 31 | if dataset_name == "quac": 32 | title = 'quac, ' + doc['wikipedia_page_title'] + ', ' + doc['section_title'] + ', ' 33 | elif dataset_name == "squad_v2": 34 | title = 
'squadv2, ' + 'wikipedia, ' + doc['title'] + ', ' 35 | # text = word_tokenize(doc['context']) 36 | elif dataset_name == "boolq": 37 | text = T(doc['passage']).input_ids 38 | title = 'boolq, wikipedia, ' 39 | # text = word_tokenize(doc['passage']) 40 | elif dataset_name == "latesteval_1": 41 | text = doc['content'].replace("\n", " ") 42 | text = T(text).input_ids 43 | # text = word_tokenize(doc['content']) 44 | title = 'bbc, ' 45 | if len(text) > 1000: 46 | prefix = T.decode(text[:prompt_length]) 47 | suffix = T.decode(text[suffix_length:]) 48 | else: 49 | suffix = T.decode(text[-suffix_length:]) 50 | prefix = T.decode(text[: -suffix_length]) 51 | # prefix = " ".join(prefix) 52 | # suffix = " ".join(suffix) 53 | prefix = title + prefix 54 | return pd.Series([prefix, suffix], index=['prefix', 'suffix']) 55 | 56 | # quac = quac.to_pandas().sample(n=10, random_state=42) 57 | # boolq = boolq.to_pandas().sample(n=100, random_state=42) 58 | # squad = squad.to_pandas().sample(n=100, random_state=42) 59 | # latesteval_1 = latesteval_1.to_pandas().sample(n=100, random_state=42) 60 | 61 | quac = quac.to_pandas().head(n=100) 62 | boolq = boolq.to_pandas().head(n=100) 63 | squad = squad.to_pandas().head(n=100) 64 | latesteval_1 = latesteval_1.to_pandas().head(n=30) 65 | 66 | quac = quac.apply(get_prefix_and_suffix, axis=1, dataset_name="quac") 67 | boolq = boolq.apply(get_prefix_and_suffix, axis=1, dataset_name="boolq") 68 | squad = squad.apply(get_prefix_and_suffix, axis=1, dataset_name="squad_v2") 69 | latesteval = latesteval_1.apply(get_prefix_and_suffix, axis=1, dataset_name="latesteval_1") 70 | 71 | return { 72 | "quac": quac, 73 | "boolq": boolq, 74 | "squad": squad, 75 | "latesteval": latesteval 76 | } 77 | 78 | def identify_contamination(reference_suffixes, continuations): 79 | 80 | def generate_word_ngrams(text, n, use_lemmatization=False): 81 | tokens = T(text.lower()).input_ids 82 | 83 | # Optionally, lemmatize words 84 | if use_lemmatization: 85 | lemmatizer = WordNetLemmatizer() 86 | words = [lemmatizer.lemmatize(word) for word in words] 87 | return list(ngrams(tokens, n)) 88 | 89 | results = [] 90 | for suffix, continuation in zip(reference_suffixes, continuations): 91 | suffix_ngrams = set(generate_word_ngrams(suffix, 9)) 92 | continuation_ngrams = set(generate_word_ngrams(continuation, 9)) 93 | 94 | intersection = suffix_ngrams.intersection(continuation_ngrams) 95 | 96 | if len(intersection) > 0: 97 | results.append((True, suffix, continuation, intersection)) 98 | 99 | return results 100 | 101 | def generate_continuation(model, prompts, reference_suffix, benchmark, batch_size=10): 102 | # three models at this moment: gpt-3, gpt-4, llama-2 103 | 104 | prompts = prompts.tolist() 105 | 106 | if model in ['gpt-4', 'davinci', 'curie', 'babbage']: 107 | generate = gpt 108 | else: 109 | generate = hf_generate 110 | 111 | continuations = [] 112 | output_file = f"eval/{model}_{benchmark}_{prompt_length}_continuation.txt" 113 | prompt_file = f"eval/{model}_{benchmark}_{prompt_length}_prompt.txt" 114 | reference_suffix_file = f"eval/{model}_{benchmark}_{prompt_length}_reference_suffix.txt" 115 | if os.path.exists(output_file): 116 | with open(output_file, "r") as f: 117 | continuations = f.readlines() 118 | return continuations 119 | else: 120 | with open(output_file, "w") as f, open(prompt_file, "w") as f2, open(reference_suffix_file, "w") as f3: 121 | for i in tqdm(range(0, len(prompts), batch_size)): 122 | prompt = prompts[i: i + batch_size] 123 | reference_suffix_batch = reference_suffix[i: i 
+ batch_size] 124 | continuation = generate(prompt, model=model) 125 | continuations.extend(continuation) 126 | f.write('\n'.join(continuation) + "\n") 127 | f2.write('\n'.join(prompt) + "\n") 128 | f3.write('\n'.join(reference_suffix_batch) + "\n") 129 | 130 | return continuations 131 | 132 | def hf_generate(model, prompt): 133 | pass 134 | 135 | def gpt(prompt, num_retry = 5, model = "gpt-3.5-turbo"): 136 | # generate answer by gpt-3.5-turbo 137 | openai_key = os.environ.get("OPENAI_API_KEY") 138 | for _ in range(num_retry): 139 | try: 140 | if model in ['davinci', 'curie', 'babbage']: 141 | r = openai.Completion.create( 142 | model=model, 143 | prompt=prompt, 144 | max_tokens=250, 145 | temperature=0, 146 | logit_bias={"198": -100}, 147 | logprobs=0, 148 | ) 149 | elif model in ['gpt-3.5-turbo', 'gpt-4']: 150 | r = openai.ChatCompletion.create( 151 | model = model, 152 | messages = [ 153 | {"role": "user", "content": prompt}, 154 | ], 155 | max_tokens=250, 156 | temperature = 0, 157 | logit_bias={"198": -100} 158 | ) 159 | break 160 | except Exception as e: 161 | print(e) 162 | time.sleep(1) 163 | 164 | if model in ['davinci', 'curie', 'babbage']: 165 | return [x['text'].replace('\n', ' ') for x in r['choices']] 166 | elif model in ['gpt-3.5-turbo', 'gpt-4']: 167 | return [x['message']['content'] for x in r['choices']] 168 | 169 | if __name__ == "__main__": 170 | samples = data_sampler() 171 | 172 | quac = samples['quac'] 173 | boolq = samples['boolq'] 174 | squad = samples['squad'] 175 | latesteval = samples['latesteval'] 176 | 177 | model = 'curie' 178 | 179 | quac_continuations = generate_continuation(model, quac['prefix'], quac['suffix'], "quac") 180 | quac_results = identify_contamination(quac['suffix'], quac_continuations) 181 | 182 | print(f"-- quac: {len(quac_results)}, -- {len(quac_results) / len(quac)}") 183 | 184 | boolq_continuations = generate_continuation(model, boolq['prefix'], boolq['suffix'], "boolq") 185 | boolq_results = identify_contamination(boolq['suffix'], boolq_continuations) 186 | 187 | print(f"-- boolq: {len(boolq_results)}, -- {len(boolq_results) / len(boolq)}") 188 | 189 | squad_continuations = generate_continuation(model, squad['prefix'], squad['suffix'], "squad") 190 | squad_results = identify_contamination(squad['suffix'], squad_continuations) 191 | 192 | print(f"-- squad: {len(squad_results)}, -- {len(squad_results) / len(squad)}") 193 | 194 | latesteval_continuations = generate_continuation(model, latesteval['prefix'], latesteval['suffix'], "latesteval") 195 | latesteval_results = identify_contamination(latesteval['suffix'], latesteval_continuations) 196 | 197 | print(f"-- latesteval: {len(latesteval_results)}, -- {len(latesteval_results) / len(latesteval)}") -------------------------------------------------------------------------------- /data/reddit_crawler.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import praw 4 | import sys 5 | from typing import List 6 | import json 7 | import os 8 | import time 9 | import datetime 10 | import traceback 11 | 12 | class Forum: 13 | def __init__(self, task_name, start_url, wait_time): 14 | self.task_name = task_name 15 | self.url = start_url 16 | self.wait_time = wait_time 17 | 18 | self.session = requests.Session() 19 | self.setup_session() 20 | 21 | self.posts = None 22 | 23 | def setup_session(self): 24 | """ 25 | _summary_: Setup session 26 | """ 27 | headers = { 28 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; 
Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36' 29 | } 30 | self.session.headers.update(headers) 31 | 32 | def get_forum_pages(self): 33 | """ 34 | _summary_: Get all pages links of the forum 35 | """ 36 | return NotImplementedError() 37 | 38 | def get_forum_content(self, page_url): 39 | """ 40 | _summary_: Get all content of the forum 41 | should have: 42 | - title 43 | - main content 44 | - comments 45 | - votes of comments 46 | """ 47 | return NotImplementedError() 48 | 49 | def obtain_content(self): 50 | """ 51 | _summary_: Obtain content from each post 52 | """ 53 | list_of_content = [] 54 | for post in self.posts: 55 | list_of_content.append(self.get_forum_content(post)) 56 | 57 | self.content = list_of_content 58 | 59 | def save_content(self): 60 | """ 61 | _summary_: Save list_of_content to a file 62 | """ 63 | return NotImplementedError() 64 | 65 | @classmethod 66 | def filter_func(cls, tag, prefix): 67 | if tag.has_attr('class'): 68 | class_str = ' '.join(tag['class']) 69 | # return class_str.startswith('node node--id') 70 | return class_str.startswith(prefix) 71 | return False 72 | 73 | class MentalHealth(Forum): 74 | def __init__(self, task_name, start_url, wait_time): 75 | super().__init__(task_name, start_url, wait_time) 76 | 77 | def get_forum_pages(self): 78 | # get all sub forums 79 | 80 | sub_forums = [] 81 | response = self.session.get(self.url) 82 | soup = BeautifulSoup(response.text, 'html.parser') 83 | for link in soup.find_all(lambda tag: self.filter_func(tag, 'node node--id')): 84 | sub_forums.append(link.find('a')['href']) 85 | 86 | # get all posts from each forum 87 | posts = [] 88 | for sub_forum in sub_forums: 89 | response = self.session.get(sub_forum) 90 | soup = BeautifulSoup(response.text, 'html.parser') 91 | for link in soup.find_all(lambda tag: self.filter_func(tag, 'structItem structItem--thread')): 92 | posts.append(link['href']) 93 | 94 | self.posts = posts 95 | 96 | def get_forum_content(self, page_url): 97 | response = self.session.get(page_url) 98 | soup = BeautifulSoup(response.text, 'html.parser') 99 | title = soup.find('h1', {'class': 'p-title-value'}).text 100 | list_of_content = [] 101 | for article in soup.find_all(lambda tag: self.filter_func(tag, 'message message--post ')): 102 | author = article['data-author'] 103 | content = article.find('div', {'class': 'bbWrapper'}) 104 | if content: 105 | mentioned = content.find_all('a', {'class': 'username'}) 106 | content = content.text 107 | else: 108 | continue 109 | 110 | footer = article.find('ul', {'class': 'sv-rating-bar__ratings'}) 111 | if footer: 112 | ratings = footer.find_all('li', {'class': 'sv-rating sv-rating--empty-list'}) 113 | rating_sum = sum([ int(rate.text) for rate in rating]) 114 | else: 115 | rating_sum = 0 116 | 117 | list_of_content.append({ 118 | 'author': author, 119 | 'content': content, 120 | 'mentioned': mentioned, 121 | 'rating': rating_sum 122 | }) 123 | 124 | next_page = soup.find('a', {'class': 'pageNav-jump pageNav-jump--next'}) 125 | if next_page: 126 | next_page = next_page['href'] 127 | list_of_content += self.get_forum_content(next_page) 128 | 129 | return list_of_content 130 | 131 | 132 | class Reddit: 133 | def __init__(self, subreddits, time_filter, num_posts, save_path, time_limit = None): 134 | self.subreddits = subreddits 135 | self.time_filter = time_filter 136 | self.num_posts = num_posts 137 | self.save_path = save_path 138 | self.time_limit = time_limit 139 | 140 | self.reddit = praw.Reddit('DataCollector') 141 | 
self.posts = self.get_reddit_posts() 142 | # self.dump_posts() 143 | 144 | def created_after_time_limit(self, created_utc): 145 | if self.time_limit is None: 146 | return True 147 | dt_object = datetime.datetime.fromtimestamp(created_utc) 148 | return dt_object >= self.time_limit 149 | 150 | def get_reddit_posts(self): 151 | # all_posts = {} 152 | for subreddit in self.subreddits: 153 | subreddit_ = subreddit 154 | subreddit = self.reddit.subreddit(subreddit) 155 | list_of_posts = [] 156 | for post in subreddit.top(time_filter = self.time_filter, limit=self.num_posts): 157 | created_time = post.created_utc 158 | if not self.created_after_time_limit(created_time): continue 159 | created_time_str = datetime.datetime.fromtimestamp(created_time).strftime('%Y-%m-%d %H:%M:%S') 160 | title = post.title 161 | content = post.selftext 162 | 163 | for i in range(3): 164 | try: 165 | comments = self.deal_with_comments(post.comments.list()) 166 | except praw.exceptions.APIException as e: 167 | traceback.print_exc() 168 | time.sleep(10) 169 | else: 170 | break 171 | 172 | score = post.score 173 | the_post = { 174 | 'title': title, 175 | 'content': content, 176 | 'comments': comments, 177 | 'created_time': created_time_str, 178 | 'score': score, 179 | 'subreddit': subreddit_ 180 | } 181 | list_of_posts.append(the_post) 182 | self.dump_posts(list_of_posts, subreddit_) 183 | # all_posts[subreddit_] = list_of_posts 184 | # return all_posts 185 | 186 | def deal_with_comments(self, comments, depth = 3): 187 | results = [] 188 | if depth < 0: return results 189 | depth -= 1 190 | for comment in comments: 191 | if isinstance(comment, praw.models.MoreComments): continue 192 | author = comment.author 193 | content = comment.body 194 | score = comment.score 195 | created_time = comment.created_utc 196 | created_time_str = datetime.datetime.fromtimestamp(created_time).strftime('%Y-%m-%d %H:%M:%S') 197 | replies = comment.replies 198 | if len(replies): 199 | replies = self.deal_with_comments(replies, depth=depth) 200 | else: replies = [] 201 | the_comment = { 202 | 'author': author.name if author is not None else '', 203 | 'content': content, 204 | 'score': score, 205 | 'created_time': created_time_str, 206 | 'replies': replies 207 | } 208 | results.append(the_comment) 209 | return results 210 | 211 | def dump_posts(self, list_of_posts, subreddit = None): 212 | path = os.path.join(self.save_path, f"{subreddit if subreddit is not None else 'all'}.json") 213 | with open(path, 'w') as f: 214 | json.dump(list_of_posts, f) 215 | 216 | if __name__ == '__main__': 217 | # should define the XDG_CONFIG_HOME to the config file 218 | cwd, = sys.argv[1:] 219 | data_collectors = Reddit(['investing', 'wallstreetbets', 'CryptoCurrency', 'politics', 'healthcare'], 'month', 100, cwd, time_limit=datetime.datetime(2023, 7, 1)) -------------------------------------------------------------------------------- /data/squad_wiki_title.text: -------------------------------------------------------------------------------- 1 | Queen_Victoria 2 | Grape 3 | Athanasius_of_Alexandria 4 | Lighting 5 | BBC_Television 6 | Federal_Bureau_of_Investigation 7 | Punjab,_Pakistan 8 | Capacitor 9 | Sino-Tibetan_relations_during_the_Ming_dynasty 10 | History_of_India 11 | Plymouth 12 | Space_Race 13 | Myocardial_infarction 14 | The_Times 15 | Franco-Prussian_War 16 | Literature 17 | War_on_Terror 18 | Aircraft_carrier 19 | Turner_Classic_Movies 20 | Royal_assent 21 | Muslim_world 22 | Sahara 23 | Galicia_(Spain) 24 | YouTube 25 | Santa_Monica,_California 26 | 
Imperial_College_London 27 | Textual_criticism 28 | Sichuan 29 | Institute_of_technology 30 | Railway_electrification_system 31 | Mesozoic 32 | Cyprus 33 | The_Sun_(United_Kingdom) 34 | Order_of_the_British_Empire 35 | Republic_of_the_Congo 36 | Materialism 37 | Qing_dynasty 38 | To_Kill_a_Mockingbird 39 | Greece 40 | 2008_Sichuan_earthquake 41 | Edmund_Burke 42 | Northwestern_University 43 | CBC_Television 44 | Germans 45 | Race_and_ethnicity_in_the_United_States_Census 46 | Iranian_languages 47 | Adolescence 48 | Armenia 49 | Intellectual_property 50 | Law_of_the_United_States 51 | Hanover 52 | Tuberculosis 53 | Dialect 54 | Josip_Broz_Tito 55 | Political_philosophy 56 | Bern 57 | Pitch_(music) 58 | Pope_John_XXIII 59 | Black_people 60 | List_of_numbered_streets_in_Manhattan 61 | Montevideo 62 | Nigeria 63 | Paper 64 | Swaziland 65 | Liberal_Party_of_Australia 66 | Seven_Years%27_War 67 | Zinc 68 | Treaty 69 | Hellenistic_period 70 | London 71 | European_Central_Bank 72 | Thuringia 73 | Circadian_rhythm 74 | Estonian_language 75 | Cork_(city) 76 | Westminster_Abbey 77 | Data_compression 78 | United_States_Air_Force 79 | Separation_of_powers_under_the_United_States_Constitution 80 | On_the_Origin_of_Species 81 | Nanjing 82 | Zhejiang 83 | Late_Middle_Ages 84 | PlayStation_3 85 | Neptune 86 | Carnival 87 | Hindu_philosophy 88 | Dell 89 | Everton_F.C. 90 | Armenians 91 | Samurai 92 | Federal_Aviation_Administration 93 | Spanish_language_in_the_United_States 94 | Alps 95 | Digimon 96 | Compact_disc 97 | God 98 | Botany 99 | Heresy 100 | The_Bronx 101 | Roman_Republic 102 | Wayback_Machine 103 | Airport 104 | Red 105 | Internet_service_provider 106 | Chicago_Cubs 107 | Detroit 108 | Culture 109 | New_York_City 110 | Marshall_Islands 111 | Hyderabad 112 | Pharmaceutical_industry 113 | Saint_Helena 114 | Oklahoma_City 115 | Bras%C3%ADlia 116 | Korean_War 117 | Biodiversity 118 | Brigham_Young_University 119 | Oklahoma 120 | Eton_College 121 | Alfred_North_Whitehead 122 | Russian_language 123 | A_cappella 124 | Richmond,_Virginia 125 | Genocide 126 | Great_Plains 127 | British_Empire 128 | Emotion 129 | Comics 130 | Napoleon 131 | MP3 132 | England_national_football_team 133 | Green 134 | Palermo 135 | Freemasonry 136 | Letter_case 137 | Communications_in_Somalia 138 | Exhibition_game 139 | Hard_rock 140 | Somalis 141 | University 142 | Pacific_War 143 | San_Diego 144 | British_Isles 145 | Mosaic 146 | Pesticide 147 | Bill_%26_Melinda_Gates_Foundation 148 | University_of_Notre_Dame 149 | Hunter-gatherer 150 | Hokkien 151 | Economy_of_Greece 152 | Windows_8 153 | Universal_Studios 154 | Nintendo_Entertainment_System 155 | St._John%27s,_Newfoundland_and_Labrador 156 | Immaculate_Conception 157 | Southeast_Asia 158 | Rajasthan 159 | Mammal 160 | Communication 161 | Greeks 162 | Chihuahua_(state) 163 | Database 164 | Orthodox_Judaism 165 | Ashkenazi_Jews 166 | Immunology 167 | Flowering_plant 168 | Capital_punishment_in_the_United_States 169 | Switzerland 170 | Christian 171 | Beyoncé 172 | Tristan_da_Cunha 173 | Diarrhea 174 | Architecture 175 | East_India_Company 176 | Aspirated_consonant 177 | Valencia 178 | Gene 179 | Crucifixion_of_Jesus 180 | Financial_crisis_of_2007%E2%80%9308 181 | Asthma 182 | Central_African_Republic 183 | Predation 184 | Computer_security 185 | Protestantism 186 | Russian_Soviet_Federative_Socialist_Republic 187 | Israel 188 | Neoclassical_architecture 189 | Elevator 190 | Frédéric_Chopin 191 | Group_(mathematics) 192 | Glacier 193 | Gamal_Abdel_Nasser 194 | 
Incandescent_light_bulb 195 | Old_English 196 | Antenna_(radio) 197 | States_of_Germany 198 | IBM 199 | Virgil 200 | Montana 201 | Pain 202 | Mexico_City 203 | Infection 204 | Slavs 205 | Friedrich_Hayek 206 | Multiracial_American 207 | Alaska 208 | Buddhism 209 | Kathmandu 210 | Yale_University 211 | Guinea-Bissau 212 | Anti-aircraft_warfare 213 | Solar_energy 214 | Affirmative_action_in_the_United_States 215 | 2008_Summer_Olympics_torch_relay 216 | Human_Development_Index 217 | Guam 218 | Party_leaders_of_the_United_States_House_of_Representatives 219 | FC_Barcelona 220 | Professional_wrestling 221 | Strasbourg 222 | Richard_Feynman 223 | Wood 224 | Royal_Institute_of_British_Architects 225 | Myanmar 226 | Paris 227 | Southampton 228 | Georgian_architecture 229 | Royal_Dutch_Shell 230 | Madrasa 231 | Department_store 232 | Adult_contemporary_music 233 | Quran 234 | Near_East 235 | Dutch_Republic 236 | George_VI 237 | Imamah_(Shia_doctrine) 238 | History_of_science 239 | Arena_Football_League 240 | Crimean_War 241 | Appalachian_Mountains 242 | Canadian_football 243 | Association_football 244 | Infrared 245 | Dutch_language 246 | Eritrea 247 | Saint_Barth%C3%A9lemy 248 | Catalan_language 249 | Samoa 250 | Sexual_orientation 251 | Atlantic_City,_New_Jersey 252 | Classical_music 253 | Dominican_Order 254 | Warsaw_Pact 255 | Antarctica 256 | Lancashire 257 | American_Idol 258 | John_von_Neumann 259 | Copper 260 | Southern_Europe 261 | BeiDou_Navigation_Satellite_System 262 | Ottoman_Empire 263 | General_Electric 264 | Heian_period 265 | Humanism 266 | Digestion 267 | Unicode 268 | Computer 269 | United_States_dollar 270 | Madonna_(entertainer) 271 | FA_Cup 272 | East_Prussia 273 | Religion_in_ancient_Rome 274 | Bermuda 275 | Supreme_court 276 | Washington_University_in_St._Louis 277 | Xbox_360 278 | Cotton 279 | Melbourne 280 | North_Carolina 281 | Tibet 282 | Super_Nintendo_Entertainment_System 283 | Boston 284 | Pope_Paul_VI 285 | Idealism 286 | Education 287 | Baptists 288 | Tajikistan 289 | Tucson,_Arizona 290 | Namibia 291 | Dwight_D._Eisenhower 292 | Rule_of_law 293 | Jews 294 | Norfolk_Island 295 | Police 296 | Chinese_characters 297 | Annelid 298 | Hunting 299 | Software_testing 300 | LaserDisc 301 | Indigenous_peoples_of_the_Americas 302 | Portugal 303 | Cubism 304 | Bird 305 | Uranium 306 | Raleigh,_North_Carolina 307 | Alexander_Graham_Bell 308 | Nutrition 309 | Neolithic 310 | Asphalt 311 | Cardinal_(Catholicism) 312 | Houston 313 | Mary_(mother_of_Jesus) 314 | United_States_presidential_election,_2004 315 | Prime_minister 316 | Genome 317 | Utrecht 318 | Charleston,_South_Carolina 319 | Kievan_Rus%27 320 | Premier_League 321 | Presbyterianism 322 | Insect 323 | John_Kerry 324 | Karl_Popper 325 | Comprehensive_school 326 | Philadelphia 327 | Seattle 328 | Glass 329 | Sanskrit 330 | Iran 331 | Labour_Party_(UK) 332 | Separation_of_church_and_state_in_the_United_States 333 | Nonprofit_organization 334 | Philosophy_of_space_and_time 335 | Pub 336 | National_Archives_and_Records_Administration 337 | Middle_Ages 338 | Szlachta 339 | House_music 340 | Gramophone_record 341 | Czech_language 342 | Vacuum 343 | Central_Intelligence_Agency 344 | Film_speed 345 | Himachal_Pradesh 346 | Phonology 347 | Canadian_Armed_Forces 348 | Muammar_Gaddafi 349 | Dissolution_of_the_Soviet_Union 350 | High-definition_television 351 | Alloy 352 | Arsenal_F.C. 
353 | New_Delhi 354 | Translation 355 | USB 356 | Transistor 357 | Tuvalu 358 | Somerset 359 | Renewable_energy_commercialization 360 | Videoconferencing 361 | Political_party 362 | Gregorian_calendar 363 | Serbo-Croatian 364 | United_Nations_Population_Fund 365 | Brain 366 | ASCII 367 | Ministry_of_Defence_(United_Kingdom) 368 | Mandolin 369 | Antibiotics 370 | Great_power 371 | Beer 372 | Spectre_(2015_film) 373 | Apollo 374 | Energy 375 | Avicenna 376 | Gothic_architecture 377 | Steven_Spielberg 378 | Animal 379 | Geological_history_of_Earth 380 | Miami 381 | University_of_Kansas 382 | Daylight_saving_time 383 | Identity_(social_science) 384 | Canon_law 385 | Sumer 386 | Modern_history 387 | Planck_constant 388 | Child_labour 389 | Buckingham_Palace 390 | Sony_Music_Entertainment 391 | Age_of_Enlightenment 392 | Tennessee 393 | Electric_motor 394 | Marvel_Comics 395 | Federalism 396 | Mali 397 | Geography_of_the_United_States 398 | The_Legend_of_Zelda:_Twilight_Princess 399 | Kanye_West 400 | Molotov%E2%80%93Ribbentrop_Pact 401 | Umayyad_Caliphate 402 | Estonia 403 | Race_(human_categorization) 404 | New_Haven,_Connecticut 405 | Endangered_Species_Act 406 | Symbiosis 407 | Military_history_of_the_United_States 408 | Dog 409 | Printed_circuit_board 410 | Empiricism 411 | The_Blitz 412 | Han_dynasty 413 | Light-emitting_diode 414 | Alsace 415 | United_States_Army 416 | Macintosh 417 | Clothing 418 | Comcast 419 | Elizabeth_II 420 | Liberia 421 | Jehovah%27s_Witnesses 422 | 51st_state 423 | IPod 424 | Bacteria 425 | Matter 426 | Poultry 427 | Gymnastics 428 | John,_King_of_England 429 | Time 430 | Arnold_Schwarzenegger 431 | Queen_(band) 432 | Memory 433 | Florida 434 | Political_corruption 435 | Web_browser 436 | Hydrogen 437 | Ann_Arbor,_Michigan 438 | Bird_migration 439 | Post-punk 440 | Anthropology 441 | Copyright_infringement 442 | Egypt -------------------------------------------------------------------------------- /data/arxiv_dataset.py: -------------------------------------------------------------------------------- 1 | import arxiv 2 | import datetime 3 | from queue import Queue 4 | from threading import Thread, Lock 5 | import os 6 | import logging 7 | import time 8 | import tarfile 9 | from pylatexenc.latexwalker import LatexWalker, LatexEnvironmentNode, LatexMacroNode 10 | from pylatexenc import latex2text 11 | from pylatexenc.macrospec import LatexContextDb 12 | import shutil 13 | import re 14 | import json 15 | from glob import glob 16 | from huggingface_hub import create_branch, create_tag, RepoCard 17 | import datasets 18 | import sys 19 | 20 | def filter_element(context, exclude_elements = []): 21 | 22 | new_context = LatexContextDb() 23 | 24 | new_context.unknown_macro_spec = context.unknown_macro_spec 25 | new_context.unknown_environment_spec = context.unknown_environment_spec 26 | new_context.unknown_specials_spec = context.unknown_specials_spec 27 | 28 | filter_element_func = lambda dict_to_filter: {k:v for k,v in dict_to_filter.items() if k not in exclude_elements}.values() 29 | for cat in context.category_list: 30 | 31 | # include this category 32 | new_context.add_context_category( 33 | cat, 34 | macros=filter_element_func(context.d[cat]['macros']), 35 | environments=filter_element_func(context.d[cat]['environments']), 36 | specials=filter_element_func(context.d[cat]['specials']), 37 | ) 38 | 39 | return new_context 40 | 41 | class TextExtractor: 42 | 43 | def __init__(self): 44 | self.l2t_context_db = latex2text.get_default_latex_context_db() 45 | 
self.l2t_context_db = filter_element(self.l2t_context_db, ['href']) 46 | 47 | self.l2t = latex2text.LatexNodes2Text(latex_context=self.l2t_context_db) 48 | 49 | def extract(self, latex_code): 50 | result = parse_tex_ignore_figures(latex_code) 51 | return self.l2t.nodelist_to_text(result) 52 | 53 | def remove_figure_nodes(node_list): 54 | filtered_node_list = [] 55 | for node in node_list: 56 | # Ignore the 'figure' environment 57 | if node.isNodeType(LatexEnvironmentNode): 58 | if node.environmentname in [ 'figure', 'figure*', 'algorithm', 'table', 'table*', 'algorithmic']: 59 | continue 60 | if hasattr(node, 'nodelist'): 61 | node.nodelist = remove_figure_nodes(node.nodelist) 62 | filtered_node_list.append(node) 63 | return filtered_node_list 64 | 65 | def parse_tex_ignore_figures(tex_code): 66 | walker = LatexWalker(tex_code) 67 | parsed = walker.get_latex_nodes()[0] 68 | 69 | for node in parsed: 70 | if node.isNodeType(LatexEnvironmentNode): 71 | if node.environmentname == 'document': 72 | parsed = [node] 73 | break 74 | 75 | filtered_nodes = remove_figure_nodes(parsed) 76 | return filtered_nodes 77 | 78 | def resolve_input_commands(latex_code, base_dir="."): 79 | input_pattern = re.compile(r"(? 1: 159 | if 'main.tex' in tex_files: tex_files = ['main.tex'] 160 | else: 161 | self.logger.info(f'------ Found multiple tex files: {tex_files}') 162 | return 163 | elif len(tex_files) == 0: 164 | self.logger.info(f'------ Found no tex files') 165 | return 166 | tex_file = tex_files[0] 167 | with open(f'./{paper_id}/{tex_file}', 'r', encoding='utf-8', errors='ignore') as f: 168 | latex_code = f.read() 169 | if '\\input' in latex_code: 170 | latex_code = resolve_input_commands(latex_code, base_dir=f'./{paper_id}') 171 | text = self.text_extractor.extract(latex_code) 172 | 173 | meta_data['text'] = text 174 | with open(f'{self.text_save_dir}/{paper_id}.json', 'w') as f: 175 | json.dump(meta_data, f, ensure_ascii=False) 176 | 177 | self.logger.info(f'------ Saved {paper_id}.json') 178 | 179 | except Exception as e: 180 | self.logger.error(f'ERROR: {e}') 181 | time.sleep(3) 182 | return 183 | 184 | finally: 185 | shutil.rmtree(f'./{paper_id}') 186 | os.remove(f'{paper_id}.arxiv_source') 187 | 188 | 189 | if __name__ == '__main__': 190 | hf_token = os.environ['HF_TOKEN'] 191 | year, month, save_dir, = sys.argv[1:] 192 | month = int(month) % 12 + 1 193 | 194 | if f'{year}-{month:02d}' in ['2021-01', '2021-02', '2021-03']: 195 | print(f"Skip {year}-{month:02d}") 196 | exit() 197 | 198 | time_stamp = f'{year}-{month:02d}' 199 | 200 | first_day = datetime.date(int(year), int(month), 1) 201 | last_day = datetime.date(int(year), int(month), 28) 202 | 203 | start_time_str = first_day.strftime("%Y%m%d%H%M%S") 204 | end_time_str = last_day.strftime("%Y%m%d%H%M%S") 205 | 206 | text_save_dir = os.path.join(save_dir, time_stamp) 207 | if not os.path.exists(text_save_dir): 208 | os.makedirs(text_save_dir) 209 | 210 | search = arxiv.Search( 211 | query=f'submittedDate:[{start_time_str} TO {end_time_str}]', 212 | sort_by = arxiv.SortCriterion.SubmittedDate, 213 | sort_order=arxiv.SortOrder.Descending, 214 | max_results=800 215 | ) 216 | 217 | q = Queue() 218 | num_threads = 4 219 | 220 | for i in range(num_threads): 221 | worker = Worker(q, i, text_save_dir,) 222 | worker.daemon = True 223 | worker.start() 224 | 225 | for index, result in enumerate(search.results()): 226 | q.put((index, result)) 227 | 228 | q.join() 229 | 230 | print(f"Finished {time_stamp}") 231 | 232 | # files = glob(f'{text_save_dir}/*.json') 
233 | # ds = datasets.load_dataset('json', data_files=files, split='train') 234 | 235 | # ds.push_to_hub( 236 | # "RealTimeData/arxiv_alltime", 237 | # config_name=time_stamp, 238 | # token=hf_token, 239 | # ) -------------------------------------------------------------------------------- /data/monthly_updater/monthly_arxiv.py: -------------------------------------------------------------------------------- 1 | import arxiv 2 | import datetime 3 | from queue import Queue 4 | from threading import Thread, Lock 5 | import os 6 | import logging 7 | import time 8 | import tarfile 9 | from pylatexenc.latexwalker import LatexWalker, LatexEnvironmentNode, LatexMacroNode 10 | from pylatexenc import latex2text 11 | from pylatexenc.macrospec import LatexContextDb 12 | import shutil 13 | import re 14 | import json 15 | from glob import glob 16 | from huggingface_hub import create_branch, create_tag, RepoCard 17 | import datasets 18 | import sys 19 | 20 | def filter_element(context, exclude_elements = []): 21 | 22 | new_context = LatexContextDb() 23 | 24 | new_context.unknown_macro_spec = context.unknown_macro_spec 25 | new_context.unknown_environment_spec = context.unknown_environment_spec 26 | new_context.unknown_specials_spec = context.unknown_specials_spec 27 | 28 | filter_element_func = lambda dict_to_filter: {k:v for k,v in dict_to_filter.items() if k not in exclude_elements}.values() 29 | for cat in context.category_list: 30 | 31 | # include this category 32 | new_context.add_context_category( 33 | cat, 34 | macros=filter_element_func(context.d[cat]['macros']), 35 | environments=filter_element_func(context.d[cat]['environments']), 36 | specials=filter_element_func(context.d[cat]['specials']), 37 | ) 38 | 39 | return new_context 40 | 41 | class TextExtractor: 42 | 43 | def __init__(self): 44 | self.l2t_context_db = latex2text.get_default_latex_context_db() 45 | self.l2t_context_db.add_context_category( 46 | 'Abstract', 47 | macros={}, 48 | environments=[ 49 | latex2text.EnvironmentTextSpec("abstract", simplify_repl=r'§ ABSTRACT %(body)s'), 50 | latex2text.EnvironmentTextSpec("Abstract", simplify_repl=r'§ ABSTRACT %(body)s') 51 | ], 52 | specials={} 53 | ) 54 | self.l2t_context_db = filter_element(self.l2t_context_db, ['href']) 55 | 56 | self.l2t = latex2text.LatexNodes2Text(latex_context=self.l2t_context_db) 57 | 58 | def extract(self, latex_code): 59 | result = parse_tex_ignore_figures(latex_code) 60 | return self.l2t.nodelist_to_text(result) 61 | 62 | def remove_figure_nodes(node_list): 63 | filtered_node_list = [] 64 | for node in node_list: 65 | # Ignore the 'figure' environment 66 | if node.isNodeType(LatexEnvironmentNode): 67 | if node.environmentname in [ 'figure', 'figure*', 'algorithm', 'table', 'table*', 'algorithmic']: 68 | continue 69 | if hasattr(node, 'nodelist'): 70 | node.nodelist = remove_figure_nodes(node.nodelist) 71 | filtered_node_list.append(node) 72 | return filtered_node_list 73 | 74 | def parse_tex_ignore_figures(tex_code): 75 | walker = LatexWalker(tex_code) 76 | parsed = walker.get_latex_nodes()[0] 77 | 78 | for node in parsed: 79 | if node.isNodeType(LatexEnvironmentNode): 80 | if node.environmentname == 'document': 81 | parsed = [node] 82 | break 83 | 84 | filtered_nodes = remove_figure_nodes(parsed) 85 | return filtered_nodes 86 | 87 | def resolve_input_commands(latex_code, base_dir="."): 88 | input_pattern = re.compile(r"(? 
1: 168 | if 'main.tex' in tex_files: tex_files = ['main.tex'] 169 | else: 170 | self.logger.info(f'------ Found multiple tex files: {tex_files}') 171 | return 172 | elif len(tex_files) == 0: 173 | self.logger.info(f'------ Found no tex files') 174 | return 175 | tex_file = tex_files[0] 176 | with open(f'./{paper_id}/{tex_file}', 'r', encoding='utf-8', errors='ignore') as f: 177 | latex_code = f.read() 178 | if '\\input' in latex_code: 179 | latex_code = resolve_input_commands(latex_code, base_dir=f'./{paper_id}') 180 | text = self.text_extractor.extract(latex_code) 181 | 182 | meta_data['text'] = text 183 | with open(f'{self.text_save_dir}/{paper_id}.json', 'w') as f: 184 | json.dump(meta_data, f, ensure_ascii=False) 185 | 186 | self.logger.info(f'------ Saved {paper_id}.json') 187 | 188 | except Exception as e: 189 | self.logger.error(f'ERROR: {e}') 190 | time.sleep(3) 191 | return 192 | 193 | finally: 194 | shutil.rmtree(f'./{paper_id}') 195 | os.remove(f'{paper_id}.arxiv_source') 196 | 197 | 198 | if __name__ == '__main__': 199 | today = datetime.date.today() 200 | year = today.year 201 | month = today.month 202 | save_dir = './arxiv_data/' 203 | 204 | hf_token = os.environ['HF_TOKEN'] 205 | time_stamp = f'{year}-{month:02d}' 206 | 207 | first_day = datetime.date(int(year), int(month), 1) 208 | last_day = datetime.date(int(year), int(month), 28) 209 | 210 | start_time_str = first_day.strftime("%Y%m%d%H%M%S") 211 | end_time_str = last_day.strftime("%Y%m%d%H%M%S") 212 | 213 | text_save_dir = os.path.join(save_dir, time_stamp) 214 | if not os.path.exists(text_save_dir): 215 | os.makedirs(text_save_dir) 216 | 217 | search = arxiv.Search( 218 | query=f'submittedDate:[{start_time_str} TO {end_time_str}]', 219 | sort_by = arxiv.SortCriterion.SubmittedDate, 220 | sort_order=arxiv.SortOrder.Descending, 221 | max_results=1000 222 | ) 223 | 224 | q = Queue() 225 | num_threads = 4 226 | 227 | for i in range(num_threads): 228 | worker = Worker(q, i, text_save_dir,) 229 | worker.daemon = True 230 | worker.start() 231 | 232 | for index, result in enumerate(search.results()): 233 | q.put((index, result)) 234 | 235 | q.join() 236 | 237 | print(f"Finished {time_stamp}") 238 | 239 | files = glob(f'{text_save_dir}/*.json') 240 | ds = datasets.load_dataset('json', data_files=files, split='train') 241 | 242 | ds.push_to_hub( 243 | "RealTimeData/arxiv_alltime", 244 | config_name=time_stamp, 245 | token=hf_token, 246 | ) -------------------------------------------------------------------------------- /arxiv_downloader.py: -------------------------------------------------------------------------------- 1 | import arxiv 2 | import datetime 3 | from queue import Queue 4 | from threading import Thread, Lock 5 | import os 6 | import logging 7 | import time 8 | import tarfile 9 | from pylatexenc.latexwalker import LatexWalker, LatexEnvironmentNode, LatexMacroNode 10 | from pylatexenc import latex2text 11 | from pylatexenc.macrospec import LatexContextDb 12 | import shutil 13 | import re 14 | import json 15 | from glob import glob 16 | from huggingface_hub import create_branch, create_tag, RepoCard 17 | import datasets 18 | import sys 19 | 20 | def filter_element(context, exclude_elements = []): 21 | 22 | new_context = LatexContextDb() 23 | 24 | new_context.unknown_macro_spec = context.unknown_macro_spec 25 | new_context.unknown_environment_spec = context.unknown_environment_spec 26 | new_context.unknown_specials_spec = context.unknown_specials_spec 27 | 28 | filter_element_func = lambda dict_to_filter: {k:v for k,v 
in dict_to_filter.items() if k not in exclude_elements}.values() 29 | for cat in context.category_list: 30 | 31 | # include this category 32 | new_context.add_context_category( 33 | cat, 34 | macros=filter_element_func(context.d[cat]['macros']), 35 | environments=filter_element_func(context.d[cat]['environments']), 36 | specials=filter_element_func(context.d[cat]['specials']), 37 | ) 38 | 39 | return new_context 40 | 41 | class TextExtractor: 42 | 43 | def __init__(self): 44 | self.l2t_context_db = latex2text.get_default_latex_context_db() 45 | self.l2t_context_db.add_context_category( 46 | 'Abstract', 47 | macros={}, 48 | environments=[ 49 | latex2text.EnvironmentTextSpec("abstract", simplify_repl=r'§ ABSTRACT %(body)s'), 50 | latex2text.EnvironmentTextSpec("Abstract", simplify_repl=r'§ ABSTRACT %(body)s') 51 | ], 52 | specials={} 53 | ) 54 | self.l2t_context_db = filter_element(self.l2t_context_db, ['href']) 55 | 56 | self.l2t = latex2text.LatexNodes2Text(latex_context=self.l2t_context_db) 57 | 58 | def extract(self, latex_code): 59 | result = parse_tex_ignore_figures(latex_code) 60 | return self.l2t.nodelist_to_text(result) 61 | 62 | def remove_figure_nodes(node_list): 63 | filtered_node_list = [] 64 | for node in node_list: 65 | # Ignore the 'figure' environment 66 | if node.isNodeType(LatexEnvironmentNode): 67 | if node.environmentname in [ 'figure', 'figure*', 'algorithm', 'table', 'table*', 'algorithmic']: 68 | continue 69 | if hasattr(node, 'nodelist'): 70 | node.nodelist = remove_figure_nodes(node.nodelist) 71 | filtered_node_list.append(node) 72 | return filtered_node_list 73 | 74 | def parse_tex_ignore_figures(tex_code): 75 | walker = LatexWalker(tex_code) 76 | parsed = walker.get_latex_nodes()[0] 77 | 78 | for node in parsed: 79 | if node.isNodeType(LatexEnvironmentNode): 80 | if node.environmentname == 'document': 81 | parsed = [node] 82 | break 83 | 84 | filtered_nodes = remove_figure_nodes(parsed) 85 | return filtered_nodes 86 | 87 | def resolve_input_commands(latex_code, base_dir="."): 88 | input_pattern = re.compile(r"(? 
1: 168 | if 'main.tex' in tex_files: tex_files = ['main.tex'] 169 | else: 170 | self.logger.info(f'------ Found multiple tex files: {tex_files}') 171 | return 172 | elif len(tex_files) == 0: 173 | self.logger.info(f'------ Found no tex files') 174 | return 175 | tex_file = tex_files[0] 176 | with open(f'./{paper_id}/{tex_file}', 'r', encoding='utf-8', errors='ignore') as f: 177 | latex_code = f.read() 178 | if '\\input' in latex_code: 179 | latex_code = resolve_input_commands(latex_code, base_dir=f'./{paper_id}') 180 | text = self.text_extractor.extract(latex_code) 181 | 182 | meta_data['text'] = text 183 | with open(f'{self.text_save_dir}/{paper_id}.json', 'w') as f: 184 | json.dump(meta_data, f, ensure_ascii=False) 185 | 186 | self.logger.info(f'------ Saved {paper_id}.json') 187 | 188 | except Exception as e: 189 | self.logger.error(f'ERROR: {e}') 190 | time.sleep(3) 191 | return 192 | 193 | finally: 194 | shutil.rmtree(f'./{paper_id}') 195 | os.remove(f'{paper_id}.arxiv_source') 196 | 197 | 198 | if __name__ == '__main__': 199 | hf_token = os.environ['HF_TOKEN'] 200 | 201 | today = datetime.date.today() 202 | start_time = today - datetime.timedelta(days=7) 203 | 204 | start_time_str = start_time.strftime("%Y%m%d%H%M%S") 205 | end_time_str = today.strftime("%Y%m%d%H%M%S") 206 | 207 | text_save_dir = f'arxiv_{start_time_str}_to_{end_time_str}' 208 | if not os.path.exists(text_save_dir): 209 | os.makedirs(text_save_dir) 210 | 211 | search = arxiv.Search( 212 | query=f'submittedDate:[{start_time_str} TO {end_time_str}]', 213 | sort_by = arxiv.SortCriterion.SubmittedDate, 214 | sort_order=arxiv.SortOrder.Descending, 215 | max_results=1600 216 | ) 217 | 218 | q = Queue() 219 | num_threads = 4 220 | 221 | for i in range(num_threads): 222 | worker = Worker(q, i, text_save_dir,) 223 | worker.daemon = True 224 | worker.start() 225 | 226 | for index, result in enumerate(search.results()): 227 | q.put((index, result)) 228 | 229 | q.join() 230 | 231 | files = glob(f'{text_save_dir}/*.json') 232 | ds = datasets.load_dataset('json', data_files=files, split='train') 233 | 234 | try: 235 | create_branch('RealTimeData/arxiv_latest', branch=today.isoformat(), token=hf_token, repo_type='dataset') 236 | except: 237 | pass 238 | ds.push_to_hub('RealTimeData/arxiv_latest', token=hf_token, branch=today.isoformat()) 239 | ds.push_to_hub('RealTimeData/arxiv_latest', token=hf_token, branch='main') 240 | 241 | text = f""" 242 | # Latest arXiv 243 | 244 | You can always access the latest arXiv papers via this dataset. 245 | 246 | We update the dataset weekly, on every Sunday. So the dataset always provides the latest arXiv papers created in the past week. 247 | 248 | The current dataset on main branch contains the latest arXiv papers submitted from {start_time.isoformat()} to {today.isoformat()}. 249 | 250 | The data collection was conducted on {today.isoformat()}. 251 | 252 | Use the dataset via: 253 | ``` 254 | ds = datasets.load_dataset('RealTimeData/arxiv_latest') 255 | ``` 256 | 257 | # Previous versions 258 | 259 | You can access previous versions by requesting different branches. 260 | 261 | For example, you can find the 2023-08-20 version via: 262 | ``` 263 | ds = datasets.load_dataset('RealTimeData/arxiv_latest', revision = '2023-08-20') 264 | ``` 265 | 266 | Check all available versions by clicking the "Files and versions" button on the top bar.
267 | """ 268 | card = RepoCard(text) 269 | card.push_to_hub('RealTimeData/arxiv_latest', repo_type='dataset', token=hf_token) -------------------------------------------------------------------------------- /data/wikipedia.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import mwparserfromhell 3 | import json 4 | import os 5 | from transformers import LlamaForCausalLM, LlamaTokenizerFast, AutoModelForCausalLM, AutoTokenizer, OPTForCausalLM 6 | import sys 7 | import torch 8 | from tqdm import tqdm 9 | import traceback 10 | from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig 11 | import datasets 12 | import numpy as np 13 | import time 14 | import openai 15 | from doc_info import verbalise_docs 16 | 17 | WIKI_API_ENDPOINT = "https://en.wikipedia.org/w/api.php" 18 | 19 | def self_info(text, model, tokenizer, merge = False): 20 | def merge_sub_tokens(log_probs, word_ids): 21 | # merge log probs of sub_tokens 22 | merged_log_probs = [] 23 | current_word_id = None 24 | current_word_log_prob = None 25 | counter = 1 26 | 27 | for log_prob, word_id in zip(log_probs, word_ids): 28 | if word_id is not None: 29 | if current_word_id != word_id: 30 | if current_word_id is not None: 31 | merged_log_probs.extend([current_word_log_prob] * counter) 32 | counter = 1 33 | current_word_id = word_id 34 | current_word_log_prob = log_prob 35 | else: 36 | counter += 1 37 | current_word_log_prob = current_word_log_prob + log_prob 38 | 39 | if current_word_id is not None: 40 | merged_log_probs.extend([current_word_log_prob] * counter) 41 | 42 | return merged_log_probs 43 | 44 | # this function is used to get the self-information of a text 45 | # the model should be a causal language model, e.g. GPT2LMHeadModel 46 | 47 | # tokenize the text 48 | text = f"{tokenizer.bos_token}{text}" 49 | encoding = tokenizer(text, return_tensors="pt", max_length=model.config.max_position_embeddings, truncation=True) 50 | encoding = encoding.to(model.device) 51 | 52 | # get the logits 53 | with torch.no_grad(): 54 | logits = model(**encoding).logits 55 | probs = torch.softmax(logits, dim=-1) 56 | info = -torch.log(probs) 57 | 58 | input_ids = encoding['input_ids'] 59 | input_ids_expaned = input_ids[:, 1:].unsqueeze(-1) 60 | info = info[:, :-1].gather(-1, input_ids_expaned).squeeze(-1).squeeze(0).tolist() 61 | 62 | tokens = [tokenizer.decode(token_) for token_ in input_ids.squeeze().tolist()[1:]] 63 | if merge: 64 | info = merge_sub_tokens(info, encoding.word_ids()[1:]) 65 | return tokens, info 66 | 67 | def gpt3_self_info(text, num_retry = 5): 68 | # text = text[:1000] 69 | openai.api_key = os.environ["OPENAI_API_KEY"] 70 | 71 | for _ in range(num_retry): 72 | try: 73 | r = openai.Completion.create( 74 | model="curie", 75 | prompt=f"<|endoftext|>{text}", 76 | max_tokens=0, 77 | temperature=0, 78 | echo=True, 79 | logprobs=0, 80 | ) 81 | break 82 | except Exception as e: 83 | print(e) 84 | time.sleep(1) 85 | 86 | result = r['choices'][0] 87 | tokens, logprobs = result["logprobs"]["tokens"][1:], result["logprobs"]["token_logprobs"][1:] 88 | 89 | assert len(tokens) == len(logprobs), f"Expected {len(tokens)} logprobs, got {len(logprobs)}" 90 | 91 | self_info = [ -logprob for logprob in logprobs] 92 | # TODO: deal with the first delimiter 93 | return tokens, self_info 94 | 95 | def fetch_recent_changes(from_date, to_date = '2023-08-01T00:00:00'): 96 | params = { 97 | "action": "query", 98 | "format": "json", 99 | "list": "recentchanges", 100 | "rcstart": to_date, # 
starting from the newer date 101 | "rcend": from_date, # ending at the older date 102 | "rctype": "new", 103 | "rcnamespace": "0", 104 | "rclimit": "500", 105 | "rcprop": "title|timestamp" 106 | } 107 | req = requests.Request('GET', WIKI_API_ENDPOINT, params=params).prepare() 108 | response = requests.get(WIKI_API_ENDPOINT, params=params).json() 109 | 110 | # Check if the response contains the expected data 111 | if 'query' in response and 'recentchanges' in response['query']: 112 | return [entry['title'] for entry in response['query']['recentchanges']] 113 | else: 114 | return [] 115 | 116 | def fetch_content(title, date=None): 117 | params = { 118 | "action": "query", 119 | "format": "json", 120 | "titles": title, 121 | "prop": "revisions", 122 | "rvprop": "content", 123 | "rvlimit": "1", 124 | } 125 | if date: params["rvstart"] = date 126 | try: 127 | response = requests.get(WIKI_API_ENDPOINT, params=params) 128 | response.raise_for_status() # Will raise an error if the HTTP request returned an unsuccessful status code 129 | data = response.json() 130 | if 'error' in data: 131 | print(f"Error fetching content for {title}: {data['error']['info']}") 132 | return None 133 | 134 | page = next(iter(data['query']['pages'].values())) 135 | if 'revisions' not in page: 136 | print(f"No revisions found for {title}") 137 | return None 138 | content = page['revisions'][0]['*'] 139 | 140 | # Check if the content is a redirect and skip if true 141 | if content.startswith("#REDIRECT"): 142 | print(f"{title} is a redirect page.") 143 | return None 144 | return content 145 | 146 | except Exception as e: 147 | print(f"An error occurred while fetching content for {title}: {str(e)}") 148 | traceback.print_exc() # This will print the full traceback 149 | 150 | return None 151 | 152 | def parse_to_plain_text(wikitext): 153 | parsed = mwparserfromhell.parse(wikitext) 154 | return parsed.strip_code() 155 | 156 | def select_token_window(text, token_count=400): 157 | tokens = text.split() 158 | if len(tokens) <= token_count: 159 | return text 160 | ramdom_start = np.random.randint(0, len(tokens) - token_count) 161 | tokens = tokens[ramdom_start:ramdom_start + token_count] 162 | return ' '.join(tokens) 163 | 164 | def fetch_latest_and_historical_wiki_pages(cache_dir = '', historical_date = '2022-07-01T00:00:00Z', token_count = 300): 165 | # 1. Fetch the latest created pages from July 2023 and their content. 166 | recent_wiki_path = os.path.join(cache_dir, 'recent_wiki_pages.json') 167 | if not os.path.exists(recent_wiki_path): 168 | recent_titles = fetch_recent_changes("2023-07-01T00:00:00Z") 169 | recent_contents = [fetch_content(title) for title in tqdm(recent_titles)] 170 | recent_contents = [content for content in recent_contents if content is not None] 171 | 172 | data_to_save = {title: content for title, content in zip(recent_titles, recent_contents)} 173 | with open(recent_wiki_path, 'w') as file: 174 | json.dump(data_to_save, file, ensure_ascii=False, indent=4) 175 | else: 176 | with open(recent_wiki_path) as file: 177 | data_to_save = json.load(file) 178 | recent_titles = list(data_to_save.keys()) 179 | recent_contents = list(data_to_save.values()) 180 | recent_contents = [content for content in recent_contents if content is not None] 181 | 182 | # 2. Fetch a historical version of a specific title from July 2022. 
183 | historical_wiki_path = os.path.join(cache_dir, 'historical_wiki_pages.json') 184 | if not os.path.exists(historical_wiki_path): 185 | with open(os.path.join(cache_dir, 'data/squad_wiki_title.text')) as f: 186 | titles = [line.strip() for line in f.readlines()] 187 | historical_contents = [fetch_content(title, historical_date) for title in tqdm(titles)] 188 | historical_contents = [content for content in historical_contents if content is not None] 189 | historical_to_save = {title: content for title, content in zip(titles, historical_contents)} 190 | with open(historical_wiki_path, 'w') as file: 191 | json.dump(historical_to_save, file, ensure_ascii=False, indent=4) 192 | else: 193 | with open(historical_wiki_path) as file: 194 | historical_to_save = json.load(file) 195 | historical_titles = list(historical_to_save.keys()) 196 | historical_contents = list(historical_to_save.values()) 197 | historical_contents = [content for content in historical_contents if content is not None] 198 | 199 | # 3. Parse the content to plain text. 200 | recent_plain_text_path = os.path.join(cache_dir, 'recent_plain_text.json') 201 | historical_plain_text_path = os.path.join(cache_dir, 'historical_plain_text.json') 202 | if not os.path.exists(recent_plain_text_path): 203 | plain_texts_recent = [parse_to_plain_text(content) for content in recent_contents] 204 | plain_texts_historical = [parse_to_plain_text(content) for content in historical_contents] 205 | with open(recent_plain_text_path, 'w') as file: 206 | json.dump(plain_texts_recent, file, ensure_ascii=False, indent=4) 207 | with open(historical_plain_text_path, 'w') as file: 208 | json.dump(plain_texts_historical, file, ensure_ascii=False, indent=4) 209 | else: 210 | with open(recent_plain_text_path) as file: 211 | plain_texts_recent = json.load(file) 212 | with open(historical_plain_text_path) as file: 213 | plain_texts_historical = json.load(file) 214 | 215 | # 4. Select a window of token_count words from each text.
216 | selected_windows_recent = [select_token_window(text, token_count=token_count) for text in plain_texts_recent] 217 | selected_windows_historical = [select_token_window(text, token_count=token_count) for text in plain_texts_historical] 218 | 219 | return selected_windows_recent, selected_windows_historical 220 | 221 | def prepare_comparing_data(datasets_and_texts_col, num_samples=200, token_count=300): 222 | # datasets_and_texts is a dict of list {dataset_name: col_name} 223 | 224 | datasets_and_texts = {} 225 | for dataset_name, col_name in datasets_and_texts_col.items(): 226 | if dataset_name in ['quac', 'squad_v2', 'boolq', 'iohadrubin/mini_xsum', 'liyucheng/trivia_qa_wiki_val']: 227 | ds = datasets.load_dataset(dataset_name, split='validation') 228 | elif 'RealTimeData' in dataset_name: 229 | ds = datasets.load_dataset(dataset_name, split='train') 230 | ds = ds[col_name][:num_samples] 231 | 232 | datasets_and_texts[dataset_name + f'_{token_count}_words'] = [select_token_window(text, token_count=token_count) for text in ds] 233 | # datasets_and_texts[dataset_name + '_200_words'] = [select_token_window(text, token_count=200) for text in ds] 234 | 235 | return datasets_and_texts 236 | 237 | if __name__ == "__main__": 238 | cwd, model_name, token_count, = sys.argv[1:] 239 | token_count = int(token_count) 240 | batch_size = 8 241 | 242 | recent_snippets, historical_snippets = fetch_latest_and_historical_wiki_pages(cache_dir=cwd, token_count=token_count) 243 | recent_snippets = recent_snippets[:120] 244 | historical_snippets = historical_snippets[:120] 245 | wikipedia_and_texts = { 246 | 'wiki_recent': recent_snippets, 247 | 'wiki_historical': historical_snippets 248 | } 249 | # datasets_and_texts = prepare_comparing_data({ 250 | # 'liyucheng/trivia_qa_wiki_val': 'wiki_context_sample' 251 | # 'RealTimeData/bbc_latest': 'content', 252 | # 'RealTimeData/bbc_2017': 'content', 253 | # 'iohadrubin/mini_xsum': 'document' 254 | # 'quac': 'context', 255 | # 'boolq': 'passage', 256 | # 'squad_v2': 'context', 257 | # 'RealTimeData/github_july_week1_2023': 'readme', 258 | # 'RealTimeData/arxiv_july_week1_2023': 'text', 259 | # 'RealTimeData/bbc_news_week1_july_2023': 'content', 260 | # }, token_count=token_count, num_samples=120) 261 | datasets_and_texts = verbalise_docs(num_words=token_count) 262 | 263 | if 'GPTQ' in model_name: 264 | # only llama-30b use gptq 265 | model = AutoGPTQForCausalLM.from_quantized(model_name, device = 'cuda:0', use_safetensors = True, disable_exllama=True if '30b' in model_name else False) 266 | tokenizer = LlamaTokenizerFast.from_pretrained(model_name) 267 | elif 'llama' in model_name.lower(): 268 | model = LlamaForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map='auto') 269 | tokenizer = LlamaTokenizerFast.from_pretrained(model_name) 270 | elif 'opt' in model_name.lower(): 271 | model = OPTForCausalLM.from_pretrained(model_name, device_map='auto') 272 | tokenizer = AutoTokenizer.from_pretrained(model_name) 273 | elif 'gpt2' == model_name.lower(): 274 | model = AutoModelForCausalLM.from_pretrained(model_name) 275 | tokenizer = AutoTokenizer.from_pretrained(model_name) 276 | 277 | # datasets_and_texts = prepare_comparing_data({ 278 | # 'RealTimeData/News_Seq_2021': 'maintext', 279 | # 'RealTimeData/News_August_2023': 'maintext', 280 | # }) 281 | 282 | # datasets_and_texts.update(wikipedia_and_texts) 283 | 284 | print('=====================') 285 | print(f'Model: {model_name}') 286 | 287 | for dataset_name, texts in datasets_and_texts.items(): 
288 | print(f'=====================') 289 | print(f'Dataset: {dataset_name}') 290 | infos = [] 291 | for text in tqdm(texts): 292 | try: 293 | if 'curie' in model_name.lower(): 294 | tokens, info = gpt3_self_info(text) 295 | else: 296 | tokens, info = self_info(text, model, tokenizer) 297 | except: 298 | traceback.print_exc() 299 | time.sleep(10) 300 | continue 301 | # print('text:', text, '\ninfo:', info) 302 | infos.append(sum(info)/len(info)) 303 | print(f'Average self-info: {sum(infos)/len(infos)}') -------------------------------------------------------------------------------- /bbc_downloader.py: -------------------------------------------------------------------------------- 1 | import weakref 2 | import requests 3 | 4 | from configobj import ConfigObj 5 | 6 | class Configuration: 7 | 8 | def __init__(self): 9 | self.__properties = dict() 10 | properties = self._init_properties() 11 | for property_, value, transform_fn in properties: 12 | if transform_fn is not None: 13 | value = transform_fn(value) 14 | setattr(self, property_, value) 15 | self.__properties[property_] = { 16 | 'default-value': value, 17 | 'transform_fn': transform_fn 18 | } 19 | 20 | def _init_properties(self): 21 | # [[name, default-value, transform_fn]] 22 | return [] 23 | 24 | # TODO: hierachical config 25 | def load(self, path): 26 | config = ConfigObj(path, encoding='UTF-8') 27 | for property_, value in config.items(): 28 | transform_fn = self.__properties[property_]['transform_fn'] 29 | if transform_fn is not None: 30 | value = transform_fn(value) 31 | setattr(self, property_, value) 32 | 33 | from dateutil.relativedelta import relativedelta 34 | # from datetime import datetime, date 35 | import datetime 36 | 37 | class DatasetConfiguration(Configuration): 38 | 39 | def _format_date(self, date_str): 40 | return datetime.datetime.strptime(date_str, '%Y-%m-%d') 41 | 42 | def _calculate_step(self, step): 43 | step = int(step) 44 | if self.step_unit == 'day': 45 | return relativedelta(days=step) 46 | elif self.step_unit == 'month': 47 | return relativedelta(months=step) 48 | else: 49 | return relativedelta(years=step) 50 | 51 | def _init_properties(self): 52 | return [ 53 | ['name', '', str], 54 | ['base_api_url', 'http://dracos.co.uk/made/bbc-news-archive/{year}/{month:0>2}/{day:0>2}/', str], 55 | ['start_date', '2016-01-01', self._format_date], 56 | ['end_date', '2017-01-01', self._format_date], 57 | ['step_unit', 'day', str], 58 | ['step', 1, self._calculate_step], 59 | ['path', './dataset/bbc/', str], 60 | ['sleep', 1, float] 61 | ] 62 | 63 | class NetWorkConfiguration(Configuration): 64 | 65 | HTTP_TIMEOUT = 30 66 | STRICT = True 67 | USER_AGENT = 'Mozilla' 68 | 69 | def _init_properties(self): 70 | return [ 71 | ['browser_user_agent', 'Mozilla', str], 72 | ['http_timeout', 30, int], 73 | ['strict', True, lambda v: str(v) == 'True'] 74 | ] 75 | 76 | class NetworkError(RuntimeError): 77 | 78 | def __init__(self, status_code, reason): 79 | self.reason = reason 80 | self.status_code = status_code 81 | 82 | class NetworkFetcher(object): 83 | 84 | def __init__(self): 85 | self.config = NetWorkConfiguration() 86 | # self.config.load('./settings/network.cfg') 87 | self.config.strict = False 88 | 89 | self._connection = requests.Session() 90 | self._connection.headers['User-agent'] = self.config.browser_user_agent 91 | self._finalizer = weakref.finalize(self, self.close) 92 | 93 | self._url = None 94 | self.response = None 95 | self.headers = None 96 | 97 | def close(self): 98 | if self._connection is not None: 99 | 
self._connection.close() 100 | self._connection = None 101 | 102 | def get_url(self): 103 | return self._url 104 | 105 | def fetch(self, url): 106 | try: 107 | response = self._connection.get(url, timeout=self.config.http_timeout, headers=self.headers) 108 | except Exception: 109 | return None 110 | if response.ok: 111 | self._url = response.url 112 | text = response.content 113 | else: 114 | self._url = None 115 | text = None 116 | if self.config.strict: 117 | raise NetworkError(response.status_code, response.reason) 118 | 119 | return text 120 | 121 | class DownloadLinkFetcher: 122 | 123 | RETRY = 5 124 | 125 | def __init__(self, config): 126 | self.base_api_url = config.base_api_url 127 | 128 | self.start_date = config.start_date 129 | self.current_date = config.start_date 130 | self.end_date = config.end_date 131 | self.step_unit = config.step_unit 132 | self.step = config.step 133 | 134 | self.html_fetcher = NetworkFetcher() 135 | 136 | def _format_link(self, link): 137 | print(link) 138 | hash_index = link.find('#') 139 | if hash_index != -1: 140 | link = link[:hash_index] 141 | if link and link[-1] == '/': 142 | link = link[:-1] 143 | return link 144 | 145 | def _link_filter(self, link, filters): 146 | if not link: 147 | return False 148 | if not link[-1].isdigit(): 149 | return False 150 | for filter_ in filters: 151 | if link[filter_[1]:filter_[2]] == filter_[0]: 152 | return False 153 | return True 154 | 155 | def _html_to_links(self, html): 156 | return [] 157 | 158 | def _next_api(self, base_url, current_date): 159 | return '' 160 | 161 | def next(self): 162 | if self.current_date >= self.end_date: 163 | return None, None 164 | api_url = self._next_api(self.base_api_url, self.current_date) 165 | date = self.current_date 166 | self.current_date += self.step 167 | return api_url, date 168 | 169 | def fetch(self, api_url): 170 | print('fetching download links...') 171 | html = self.html_fetcher.fetch(api_url) 172 | if html is None: 173 | for _ in range(0, self.RETRY): 174 | html = self.html_fetcher.fetch(api_url) 175 | if html is not None: 176 | break 177 | if html is None or len(html) == 0: 178 | print('api', api_url, ' failed') 179 | return [] 180 | links = self._html_to_links(html) 181 | return links 182 | 183 | from bs4 import BeautifulSoup 184 | 185 | class BBCLinkFetcher(DownloadLinkFetcher): 186 | 187 | BBC_FILTERS = [ 188 | ['programmes', 21, 31], 189 | ['correspondents', 26, 40], 190 | ['iplayer', 21, 28], 191 | ['radio', 21, 26], 192 | ['live', 27, 31], 193 | ['m', 7, 8], 194 | ['video_and_audio', 26, 41] 195 | ] 196 | 197 | def _next_api(self, base_url, current_date): 198 | year = current_date.year 199 | month = current_date.month 200 | day = current_date.day 201 | api_url = base_url.format(year=year, month=month, day=day) 202 | return api_url 203 | 204 | def _html_to_links(self, html): 205 | soup = BeautifulSoup(html, 'lxml') 206 | 207 | links = list() 208 | # news links are the hrefs of a 209 | elements = soup.table.find_all('a') 210 | # elements = soup.table.find_all('a', class_='title-link') 211 | for element in elements: 212 | href = element.get('href') 213 | if not href: 214 | continue 215 | link = self._format_link(href) 216 | if self._link_filter(link, self.BBC_FILTERS): 217 | links.append(link) 218 | 219 | return list(set(links)) 220 | 221 | 222 | import sys 223 | import os.path 224 | import json 225 | import time 226 | from datetime import timedelta 227 | 228 | class ArticleFetcher: 229 | 230 | RETRY = 5 231 | 232 | def __init__(self, config): 233 | 
self.config = config 234 | self.download_link_fetcher = None 235 | self.html_fetcher = NetworkFetcher() 236 | self.path = config.path 237 | 238 | self.total_date = 0 239 | 240 | self._mkdir(self.path, 241 | config.start_date, 242 | config.end_date, 243 | config.step) 244 | 245 | def _mkdir(self, path, start_date, end_date, step): 246 | if os.path.isdir(path): 247 | # current_date = start_date 248 | # while current_date < end_date: 249 | # current_date += step 250 | # self.total_date += 1 251 | # return 252 | pass 253 | else: 254 | os.makedirs(path) 255 | current_date = start_date 256 | existed_years = dict() 257 | while current_date < end_date: 258 | year = current_date.year 259 | month = current_date.month 260 | day = current_date.day 261 | 262 | year_path = os.path.join(path, str(year)) 263 | month_path = os.path.join(year_path, str(month)) 264 | day_path = os.path.join(month_path, str(day)) 265 | 266 | if year not in existed_years.keys(): 267 | existed_years[year] = dict() 268 | if not os.path.isdir(year_path): 269 | os.mkdir(year_path) 270 | 271 | if (step.months > 0) or (step.days > 0): 272 | year_content = existed_years[year] 273 | if month not in year_content.keys(): 274 | year_content[month] = True 275 | if not os.path.isdir(month_path): 276 | os.mkdir(month_path) 277 | 278 | if step.days > 0: 279 | if not os.path.isdir(day_path): 280 | os.mkdir(day_path) 281 | current_date += step 282 | 283 | self.total_date += 1 284 | 285 | def _html_to_infomation(self, html, link, date): 286 | return {} 287 | 288 | def _extract_information(self, link, date): 289 | html = self.html_fetcher.fetch(link) 290 | if html is None: 291 | for _ in range(0, self.RETRY): 292 | html = self.html_fetcher.fetch(link) 293 | if html is not None: 294 | break 295 | if html is None: 296 | print('article ', link, 'failed') 297 | return None 298 | return self._html_to_infomation(html, link, date) 299 | 300 | def _get_storage_path(self, path, date): 301 | return os.path.join(path, str(date.year), str(date.month), str(date.day)) 302 | 303 | def _lazy_storage(self, storage_path, links, date, current_date): 304 | total_links = len(links) 305 | current_link = 1 306 | 307 | titles_path = os.path.join(storage_path, f'titles.{current_date}') 308 | with open(titles_path, mode='w', encoding='utf-8') as titles_file: 309 | articles = list() 310 | titles = list() 311 | for link in links: 312 | print('>>> {c} in {t} articles\r'.format(c=current_link, t=total_links), end='') 313 | current_link += 1 314 | 315 | article = self._extract_information(link, date) 316 | if article is not None: 317 | titles.append(article['title'] + '\n') 318 | articles.append(article) 319 | 320 | articles_path = os.path.join(storage_path, f'articles.{current_date}') 321 | with open(articles_path, mode='w', encoding='utf-8') as articles_file: 322 | json.dump({ 323 | 'expected_number': len(links), 324 | 'number': len(articles), 325 | 'articles': articles 326 | }, articles_file, indent=4) 327 | titles_file.writelines(titles) 328 | 329 | def _non_lazy_storage(self, storage_path, links, date): 330 | total_links = len(links) 331 | current_link = 1 332 | 333 | titles_path = os.path.join(storage_path, 'titles') 334 | with open(titles_path, mode='w', encoding='utf-8') as titles_file: 335 | for article_index, link in enumerate(links): 336 | print('{c} in {t} articles\r'.format(c=current_link, t=total_links), end='') 337 | current_link += 1 338 | 339 | article = self._extract_information(link, date) 340 | if article is not None: 341 | 
titles_file.write(article['title'] + '\n') 342 | 343 | article_path = os.path.join(storage_path, str(article_index)) 344 | with open(article_path, mode='w', encoding='utf-8') as article_file: 345 | json.dump(article, article_file, indent=4) 346 | 347 | def fetch(self, lazy_storage=True): 348 | current_date = 1 349 | while True: 350 | api_url, date = self.download_link_fetcher.next() 351 | if api_url is None: 352 | break 353 | print(date.strftime('%Y-%m-%d'), 354 | '{c} in {t} dates '.format(c=current_date, t=self.total_date)) 355 | 356 | # storage_path = self._get_storage_path(self.path, date) 357 | storage_path = self.path 358 | links = self.download_link_fetcher.fetch(api_url) 359 | if lazy_storage: 360 | self._lazy_storage(storage_path, links, date, current_date) 361 | else: 362 | self._non_lazy_storage(storage_path, links, date) 363 | 364 | time.sleep(self.config.sleep) 365 | 366 | print(date.strftime('%Y-%m-%d'), 367 | 'date {c} finished '.format(c=current_date)) 368 | current_date += 1 369 | 370 | import json 371 | 372 | from bs4 import BeautifulSoup 373 | from goose3 import Goose 374 | from goose3.extractors.content import ContentExtractor 375 | 376 | eps = 1e-6 377 | f1 = ContentExtractor.calculate_best_node 378 | f2 = ContentExtractor.post_cleanup 379 | 380 | 381 | def post_cleanup(ce_inst): 382 | """\ 383 | remove any divs that looks like non-content, 384 | clusters of links, or paras with no gusto 385 | """ 386 | parse_tags = ['p'] 387 | if ce_inst.config.parse_lists: 388 | parse_tags.extend(['ul', 'ol']) 389 | if ce_inst.config.parse_headers: 390 | parse_tags.extend(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']) 391 | 392 | target_node = ce_inst.article.top_node 393 | node = ce_inst.add_siblings(target_node) 394 | for elm in ce_inst.parser.getChildren(node): 395 | e_tag = ce_inst.parser.getTag(elm) 396 | if e_tag not in parse_tags: 397 | if ce_inst.is_highlink_density(elm) or ce_inst.is_table_and_no_para_exist(elm): 398 | ce_inst.parser.remove(elm) 399 | return node 400 | 401 | 402 | def calculate_best_node(ce_inst, doc): 403 | top_node = None 404 | nodes_to_check = ce_inst.nodes_to_check(doc) 405 | 406 | starting_boost = float(1.0) 407 | cnt = 0 408 | i = 0 409 | parent_nodes = [] 410 | nodes_with_text = [] 411 | 412 | for node in nodes_to_check: 413 | text_node = ce_inst.parser.getText(node) 414 | word_stats = ce_inst.stopwords_class(language=ce_inst.get_language()).get_stopword_count(text_node) 415 | high_link_density = ce_inst.is_highlink_density(node) 416 | if word_stats.get_stopword_count() > 2 and not high_link_density: 417 | nodes_with_text.append(node) 418 | 419 | nodes_number = len(nodes_with_text) 420 | negative_scoring = 0 421 | bottom_negativescore_nodes = float(nodes_number) * 0.25 422 | 423 | for node in nodes_with_text: 424 | boost_score = float(0) 425 | # boost 426 | if ce_inst.is_boostable(node): 427 | if cnt >= 0: 428 | boost_score = float((1.0 / starting_boost) * 50) 429 | starting_boost += 1 430 | # nodes_number 431 | if nodes_number > 15: 432 | if (nodes_number - i) <= bottom_negativescore_nodes: 433 | booster = float(bottom_negativescore_nodes - (nodes_number - i)) 434 | boost_score = float(-pow(booster, float(2))) 435 | negscore = abs(boost_score) + negative_scoring 436 | if negscore > 40: 437 | boost_score = float(5) 438 | 439 | text_node = ce_inst.parser.getText(node) 440 | word_stats = ce_inst.stopwords_class(language=ce_inst.get_language()).get_stopword_count(text_node) 441 | upscore = int(word_stats.get_stopword_count() + boost_score) 442 | 443 | # parent 
node 444 | parent_node = ce_inst.parser.getParent(node) 445 | ce_inst.update_score(parent_node, upscore) 446 | ce_inst.update_node_count(parent_node, 1) 447 | 448 | if parent_node not in parent_nodes: 449 | parent_nodes.append(parent_node) 450 | 451 | # parentparent node 452 | parent_parent_node = ce_inst.parser.getParent(parent_node) 453 | if parent_parent_node is not None: 454 | ce_inst.update_node_count(parent_parent_node, 1) 455 | ce_inst.update_score(parent_parent_node, upscore - eps) 456 | if parent_parent_node not in parent_nodes: 457 | parent_nodes.append(parent_parent_node) 458 | 459 | # parentparentparent node 460 | parent_parent_parent_node = ce_inst.parser.getParent(parent_parent_node) 461 | if parent_parent_parent_node is not None: 462 | ce_inst.update_node_count(parent_parent_parent_node, 1) 463 | ce_inst.update_score(parent_parent_parent_node, upscore - 2 * eps) 464 | if parent_parent_parent_node not in parent_nodes: 465 | parent_nodes.append(parent_parent_parent_node) 466 | cnt += 1 467 | i += 1 468 | 469 | top_node_score = 0 470 | for itm in parent_nodes: 471 | score = ce_inst.get_score(itm) 472 | 473 | if score > top_node_score: 474 | top_node = itm 475 | top_node_score = score 476 | 477 | if top_node is None: 478 | top_node = itm 479 | 480 | return top_node 481 | 482 | 483 | class BBCArticleFetcher(ArticleFetcher): 484 | 485 | def __init__(self, config): 486 | super(BBCArticleFetcher, self).__init__(config) 487 | self.download_link_fetcher = BBCLinkFetcher(config) 488 | 489 | def _extract_title(self, soup): 490 | if soup.title is not None: 491 | return soup.title.get_text() 492 | 493 | def _extract_published_date(self, date): 494 | return date.strftime('%Y-%m-%d') 495 | 496 | def _extract_authors(self, soup): 497 | authors_elements = soup.find_all('meta', property='article:author') 498 | if authors_elements is not None: 499 | return [authors_element['content'] for authors_element in authors_elements] 500 | 501 | def _extract_description(self, soup): 502 | description_element = soup.find('meta', property='og:description') 503 | if description_element is not None: 504 | return description_element['content'] 505 | 506 | def _extract_section(self, soup): 507 | section_element = soup.find('meta', property='article:section') 508 | if section_element is not None: 509 | return section_element['content'] 510 | 511 | def _extract_content(self, html): 512 | ContentExtractor.calculate_best_node = calculate_best_node 513 | ContentExtractor.post_cleanup = post_cleanup 514 | g = Goose({'enable_image_fetching': False}) 515 | article = g.extract(raw_html=html) 516 | ContentExtractor.calculate_best_node = f1 517 | ContentExtractor.post_cleanup = f2 518 | return article.cleaned_text 519 | 520 | def _html_to_infomation(self, html, link, date): 521 | soup = BeautifulSoup(html, 'lxml') 522 | head = soup.head 523 | 524 | try: 525 | title = self._extract_title(head) 526 | published_date = self._extract_published_date(date) 527 | authors = self._extract_authors(head) 528 | description = self._extract_description(head) 529 | section = self._extract_section(head) 530 | content = self._extract_content(html) 531 | except Exception: 532 | return None 533 | 534 | return { 535 | 'title': title, 536 | 'published_date': published_date, 537 | 'authors': authors, 538 | 'description': description, 539 | 'section': section, 540 | 'content': content, 541 | 'link': link 542 | } 543 | 544 | if __name__ == '__main__': 545 | 546 | today = datetime.date.today() 547 | today_str = today.strftime('%Y-%m-%d') 548 | 
one_week_ago = today - datetime.timedelta(days=7) 549 | one_week_ago_str = one_week_ago.strftime('%Y-%m-%d') 550 | 551 | config = DatasetConfiguration() 552 | config.start_date = one_week_ago 553 | config.end_date = today 554 | config.path = 'dataset/bbc' 555 | 556 | bbc_article_fetcher = BBCArticleFetcher(config) 557 | bbc_article_fetcher.fetch() 558 | 559 | from glob import glob 560 | files = glob(f'dataset/bbc/articles.*') 561 | files.sort() 562 | 563 | import datasets 564 | import json 565 | import os 566 | 567 | hf_token = os.environ['HF_TOKEN'] 568 | 569 | all_articles = [] 570 | for file in files: 571 | with open(file) as f: 572 | articles = json.load(f) 573 | 574 | articles = articles['articles'] 575 | for article in articles: 576 | article['authors'] = article['authors'][0] if article['authors'] else None 577 | all_articles.append(article) 578 | 579 | with open('all_articles.json', 'w') as f: 580 | json.dump(all_articles, f, indent=4, ensure_ascii=False) 581 | 582 | ds = datasets.Dataset.from_dict({key: [article[key] for article in all_articles] for key in all_articles[0].keys()}) 583 | ds.save_to_disk('bbc') 584 | 585 | from huggingface_hub import create_branch, create_tag, RepoCard 586 | 587 | create_branch('RealTimeData/bbc_latest', repo_type='dataset', branch=today_str, token=hf_token) 588 | ds.push_to_hub('RealTimeData/bbc_latest', token=hf_token, branch='main') 589 | ds.push_to_hub('RealTimeData/bbc_latest', token=hf_token, branch=today_str) 590 | 591 | text = f""" 592 | # Latest BBC News 593 | 594 | You can always access the latest BBC News articles via this dataset. 595 | 596 | We update the dataset weekly, on every Sunday. So the dataset always provides the latest BBC News articles from the last week. 597 | 598 | The current dataset on main branch contains the latest BBC News articles published from {one_week_ago.isoformat()} to {today.isoformat()}. 599 | 600 | The data collection was conducted on {today.isoformat()}. 601 | 602 | Use the dataset via: 603 | ``` 604 | ds = datasets.load_dataset('RealTimeData/bbc_latest') 605 | ``` 606 | 607 | # Previous versions 608 | 609 | You can access previous versions by requesting different branches. 610 | 611 | For example, you can find the 2023-08-20 version via: 612 | ``` 613 | ds = datasets.load_dataset('RealTimeData/bbc_latest', revision = '2023-08-20') 614 | ``` 615 | 616 | Check all available versions by clicking the "Files and versions" button on the top bar. 617 | """ 618 | card = RepoCard(text) 619 | card.push_to_hub('RealTimeData/bbc_latest', repo_type='dataset', token=hf_token) --------------------------------------------------------------------------------
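Usage note (not a file in the repository): the crawlers above publish two kinds of Hugging Face datasets — weekly "latest" repos that refresh `main` and keep each run on a date-named branch, and monthly "alltime" repos with one configuration per `YYYY-MM` month. Below is a minimal consumer-side sketch of reading them back; it assumes the RealTimeData repos are publicly readable, and the config name `2023-07` and the `train` split are illustrative examples rather than guaranteed names.

```python
# Minimal sketch: loading the RealTimeData snapshots produced by the crawlers above.
# Assumes the repos are public; '2023-07' and 'train' are illustrative assumptions.
import datasets

# Weekly "latest" repos (e.g. bbc_latest, arxiv_latest): `main` holds the most
# recent crawl, and each weekly run is also kept on a branch named by its date.
bbc_latest = datasets.load_dataset('RealTimeData/bbc_latest')
bbc_past = datasets.load_dataset('RealTimeData/bbc_latest', revision='2023-08-20')

# Monthly "alltime" repos (e.g. arxiv_alltime, code_alltime, wikitext_alltime):
# one configuration per month, named YYYY-MM (see the config_name=... push calls).
arxiv_month = datasets.load_dataset('RealTimeData/arxiv_alltime', '2023-07', split='train')

print(bbc_latest)
print(arxiv_month)
```

This mirrors the examples embedded in the repo cards above and is intended only as the reading-side counterpart to the `push_to_hub` calls in the crawlers.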