├── .gitignore ├── LICENSE ├── README.md ├── annotate_data ├── annotate.sh ├── clusters.py ├── domains.py ├── edu.py ├── embed.py ├── fasttext.py ├── perplexity.py └── tokens.py ├── define_domains ├── k-means-clustering │ ├── CODE_OF_CONDUCT.md │ ├── CONTRIBUTING.md │ ├── LICENSE │ ├── README.md │ ├── __init__.py │ ├── configs │ │ ├── 1level_dcml.yaml │ │ ├── 2levels_random_embeddings.yaml │ │ └── 4levels_web_based_images.yaml │ ├── exps │ │ ├── dclm-1level-k118 │ │ │ └── level1 │ │ │ │ ├── centroids.npy │ │ │ │ └── slurm_script.s │ │ ├── dclm-1level-k13824 │ │ │ └── level1 │ │ │ │ ├── centroids.npy │ │ │ │ └── slurm_script.s │ │ ├── dclm-1level-k24 │ │ │ └── level1 │ │ │ │ ├── centroids.npy │ │ │ │ └── slurm_script.s │ │ ├── dclm-1level-k2822 │ │ │ └── level1 │ │ │ │ ├── centroids.npy │ │ │ │ └── slurm_script.s │ │ ├── dclm-1level-k576 │ │ │ └── level1 │ │ │ │ ├── centroids.npy │ │ │ │ └── slurm_script.s │ │ └── dclm-1level-k67723 │ │ │ └── level1 │ │ │ ├── centroids.npy │ │ │ └── slurm_script.s │ ├── images │ │ ├── curation_pipeline.png │ │ └── toy_example.png │ ├── requirements.txt │ ├── scripts │ │ ├── __init__.py │ │ ├── hierarchical_kmeans_launcher.py │ │ ├── run_distributed_kmeans.py │ │ ├── run_hierarchical_sampling.py │ │ └── split_clusters.py │ ├── setup.py │ ├── src │ │ ├── __init__.py │ │ ├── clusters.py │ │ ├── dist_comm.py │ │ ├── distributed_kmeans_gpu.py │ │ ├── hierarchical_kmeans_gpu.py │ │ ├── hierarchical_sampling.py │ │ ├── kmeans_gpu.py │ │ └── utils.py │ └── vis │ │ ├── __init__.py │ │ ├── generalized_kmeans_1d.py │ │ └── notebook.ipynb ├── prompt_classify.py ├── prompt_classify.sh ├── taxonomies │ ├── formats.yaml │ └── topics.yaml ├── train_classifier.py └── train_classifier.sh ├── domain_statistics.py ├── learn_mixtures ├── average_mixtures.py ├── combine_mixtures.py └── training_mixes.py ├── select_training_data.py └── website ├── assets ├── data │ ├── examples │ │ ├── format0.json │ │ ├── format1.json │ │ ├── format10.json │ │ ├── 
format11.json │ │ ├── format12.json │ │ ├── format13.json │ │ ├── format14.json │ │ ├── format15.json │ │ ├── format16.json │ │ ├── format17.json │ │ ├── format18.json │ │ ├── format19.json │ │ ├── format2.json │ │ ├── format20.json │ │ ├── format21.json │ │ ├── format22.json │ │ ├── format23.json │ │ ├── format3.json │ │ ├── format4.json │ │ ├── format5.json │ │ ├── format6.json │ │ ├── format7.json │ │ ├── format8.json │ │ ├── format9.json │ │ ├── topic0.json │ │ ├── topic0_format0.json │ │ ├── topic0_format1.json │ │ ├── topic0_format10.json │ │ ├── topic0_format11.json │ │ ├── topic0_format12.json │ │ ├── topic0_format13.json │ │ ├── topic0_format14.json │ │ ├── topic0_format15.json │ │ ├── topic0_format16.json │ │ ├── topic0_format17.json │ │ ├── topic0_format18.json │ │ ├── topic0_format19.json │ │ ├── topic0_format2.json │ │ ├── topic0_format20.json │ │ ├── topic0_format21.json │ │ ├── topic0_format22.json │ │ ├── topic0_format23.json │ │ ├── topic0_format3.json │ │ ├── topic0_format4.json │ │ ├── topic0_format5.json │ │ ├── topic0_format6.json │ │ ├── topic0_format7.json │ │ ├── topic0_format8.json │ │ ├── topic0_format9.json │ │ ├── topic1.json │ │ ├── topic10.json │ │ ├── topic10_format0.json │ │ ├── topic10_format1.json │ │ ├── topic10_format10.json │ │ ├── topic10_format11.json │ │ ├── topic10_format12.json │ │ ├── topic10_format13.json │ │ ├── topic10_format14.json │ │ ├── topic10_format15.json │ │ ├── topic10_format16.json │ │ ├── topic10_format17.json │ │ ├── topic10_format18.json │ │ ├── topic10_format19.json │ │ ├── topic10_format2.json │ │ ├── topic10_format20.json │ │ ├── topic10_format21.json │ │ ├── topic10_format22.json │ │ ├── topic10_format23.json │ │ ├── topic10_format3.json │ │ ├── topic10_format4.json │ │ ├── topic10_format5.json │ │ ├── topic10_format6.json │ │ ├── topic10_format7.json │ │ ├── topic10_format8.json │ │ ├── topic10_format9.json │ │ ├── topic11.json │ │ ├── topic11_format0.json │ │ ├── topic11_format1.json │ │ ├── 
topic11_format10.json │ │ ├── topic11_format11.json │ │ ├── topic11_format12.json │ │ ├── topic11_format13.json │ │ ├── topic11_format14.json │ │ ├── topic11_format15.json │ │ ├── topic11_format16.json │ │ ├── topic11_format17.json │ │ ├── topic11_format18.json │ │ ├── topic11_format19.json │ │ ├── topic11_format2.json │ │ ├── topic11_format20.json │ │ ├── topic11_format21.json │ │ ├── topic11_format22.json │ │ ├── topic11_format23.json │ │ ├── topic11_format3.json │ │ ├── topic11_format4.json │ │ ├── topic11_format5.json │ │ ├── topic11_format6.json │ │ ├── topic11_format7.json │ │ ├── topic11_format8.json │ │ ├── topic11_format9.json │ │ ├── topic12.json │ │ ├── topic12_format0.json │ │ ├── topic12_format1.json │ │ ├── topic12_format10.json │ │ ├── topic12_format11.json │ │ ├── topic12_format12.json │ │ ├── topic12_format13.json │ │ ├── topic12_format14.json │ │ ├── topic12_format15.json │ │ ├── topic12_format16.json │ │ ├── topic12_format17.json │ │ ├── topic12_format18.json │ │ ├── topic12_format19.json │ │ ├── topic12_format2.json │ │ ├── topic12_format20.json │ │ ├── topic12_format21.json │ │ ├── topic12_format22.json │ │ ├── topic12_format23.json │ │ ├── topic12_format3.json │ │ ├── topic12_format4.json │ │ ├── topic12_format5.json │ │ ├── topic12_format6.json │ │ ├── topic12_format7.json │ │ ├── topic12_format8.json │ │ ├── topic12_format9.json │ │ ├── topic13.json │ │ ├── topic13_format0.json │ │ ├── topic13_format1.json │ │ ├── topic13_format10.json │ │ ├── topic13_format11.json │ │ ├── topic13_format12.json │ │ ├── topic13_format13.json │ │ ├── topic13_format14.json │ │ ├── topic13_format15.json │ │ ├── topic13_format16.json │ │ ├── topic13_format17.json │ │ ├── topic13_format18.json │ │ ├── topic13_format19.json │ │ ├── topic13_format2.json │ │ ├── topic13_format20.json │ │ ├── topic13_format21.json │ │ ├── topic13_format22.json │ │ ├── topic13_format23.json │ │ ├── topic13_format3.json │ │ ├── topic13_format4.json │ │ ├── topic13_format5.json │ │ ├── 
topic13_format6.json │ │ ├── topic13_format7.json │ │ ├── topic13_format8.json │ │ ├── topic13_format9.json │ │ ├── topic14.json │ │ ├── topic14_format0.json │ │ ├── topic14_format1.json │ │ ├── topic14_format10.json │ │ ├── topic14_format11.json │ │ ├── topic14_format12.json │ │ ├── topic14_format13.json │ │ ├── topic14_format14.json │ │ ├── topic14_format15.json │ │ ├── topic14_format16.json │ │ ├── topic14_format17.json │ │ ├── topic14_format18.json │ │ ├── topic14_format19.json │ │ ├── topic14_format2.json │ │ ├── topic14_format20.json │ │ ├── topic14_format21.json │ │ ├── topic14_format22.json │ │ ├── topic14_format23.json │ │ ├── topic14_format3.json │ │ ├── topic14_format4.json │ │ ├── topic14_format5.json │ │ ├── topic14_format6.json │ │ ├── topic14_format7.json │ │ ├── topic14_format8.json │ │ ├── topic14_format9.json │ │ ├── topic15.json │ │ ├── topic15_format0.json │ │ ├── topic15_format1.json │ │ ├── topic15_format10.json │ │ ├── topic15_format11.json │ │ ├── topic15_format12.json │ │ ├── topic15_format13.json │ │ ├── topic15_format14.json │ │ ├── topic15_format15.json │ │ ├── topic15_format16.json │ │ ├── topic15_format17.json │ │ ├── topic15_format18.json │ │ ├── topic15_format19.json │ │ ├── topic15_format2.json │ │ ├── topic15_format20.json │ │ ├── topic15_format21.json │ │ ├── topic15_format22.json │ │ ├── topic15_format23.json │ │ ├── topic15_format3.json │ │ ├── topic15_format4.json │ │ ├── topic15_format5.json │ │ ├── topic15_format6.json │ │ ├── topic15_format7.json │ │ ├── topic15_format8.json │ │ ├── topic15_format9.json │ │ ├── topic16.json │ │ ├── topic16_format0.json │ │ ├── topic16_format1.json │ │ ├── topic16_format10.json │ │ ├── topic16_format11.json │ │ ├── topic16_format12.json │ │ ├── topic16_format13.json │ │ ├── topic16_format14.json │ │ ├── topic16_format15.json │ │ ├── topic16_format16.json │ │ ├── topic16_format17.json │ │ ├── topic16_format18.json │ │ ├── topic16_format19.json │ │ ├── topic16_format2.json │ │ ├── 
topic16_format20.json │ │ ├── topic16_format21.json │ │ ├── topic16_format22.json │ │ ├── topic16_format23.json │ │ ├── topic16_format3.json │ │ ├── topic16_format4.json │ │ ├── topic16_format5.json │ │ ├── topic16_format6.json │ │ ├── topic16_format7.json │ │ ├── topic16_format8.json │ │ ├── topic16_format9.json │ │ ├── topic17.json │ │ ├── topic17_format0.json │ │ ├── topic17_format1.json │ │ ├── topic17_format10.json │ │ ├── topic17_format11.json │ │ ├── topic17_format12.json │ │ ├── topic17_format13.json │ │ ├── topic17_format14.json │ │ ├── topic17_format15.json │ │ ├── topic17_format16.json │ │ ├── topic17_format17.json │ │ ├── topic17_format18.json │ │ ├── topic17_format19.json │ │ ├── topic17_format2.json │ │ ├── topic17_format20.json │ │ ├── topic17_format21.json │ │ ├── topic17_format22.json │ │ ├── topic17_format23.json │ │ ├── topic17_format3.json │ │ ├── topic17_format4.json │ │ ├── topic17_format5.json │ │ ├── topic17_format6.json │ │ ├── topic17_format7.json │ │ ├── topic17_format8.json │ │ ├── topic17_format9.json │ │ ├── topic18.json │ │ ├── topic18_format0.json │ │ ├── topic18_format1.json │ │ ├── topic18_format10.json │ │ ├── topic18_format11.json │ │ ├── topic18_format12.json │ │ ├── topic18_format13.json │ │ ├── topic18_format14.json │ │ ├── topic18_format15.json │ │ ├── topic18_format16.json │ │ ├── topic18_format17.json │ │ ├── topic18_format18.json │ │ ├── topic18_format19.json │ │ ├── topic18_format2.json │ │ ├── topic18_format20.json │ │ ├── topic18_format21.json │ │ ├── topic18_format22.json │ │ ├── topic18_format23.json │ │ ├── topic18_format3.json │ │ ├── topic18_format4.json │ │ ├── topic18_format5.json │ │ ├── topic18_format6.json │ │ ├── topic18_format7.json │ │ ├── topic18_format8.json │ │ ├── topic18_format9.json │ │ ├── topic19.json │ │ ├── topic19_format0.json │ │ ├── topic19_format1.json │ │ ├── topic19_format10.json │ │ ├── topic19_format11.json │ │ ├── topic19_format12.json │ │ ├── topic19_format13.json │ │ ├── 
topic19_format14.json │ │ ├── topic19_format15.json │ │ ├── topic19_format16.json │ │ ├── topic19_format17.json │ │ ├── topic19_format18.json │ │ ├── topic19_format19.json │ │ ├── topic19_format2.json │ │ ├── topic19_format20.json │ │ ├── topic19_format21.json │ │ ├── topic19_format22.json │ │ ├── topic19_format23.json │ │ ├── topic19_format3.json │ │ ├── topic19_format4.json │ │ ├── topic19_format5.json │ │ ├── topic19_format6.json │ │ ├── topic19_format7.json │ │ ├── topic19_format8.json │ │ ├── topic19_format9.json │ │ ├── topic1_format0.json │ │ ├── topic1_format1.json │ │ ├── topic1_format10.json │ │ ├── topic1_format11.json │ │ ├── topic1_format12.json │ │ ├── topic1_format13.json │ │ ├── topic1_format14.json │ │ ├── topic1_format15.json │ │ ├── topic1_format16.json │ │ ├── topic1_format17.json │ │ ├── topic1_format18.json │ │ ├── topic1_format19.json │ │ ├── topic1_format2.json │ │ ├── topic1_format20.json │ │ ├── topic1_format21.json │ │ ├── topic1_format22.json │ │ ├── topic1_format23.json │ │ ├── topic1_format3.json │ │ ├── topic1_format4.json │ │ ├── topic1_format5.json │ │ ├── topic1_format6.json │ │ ├── topic1_format7.json │ │ ├── topic1_format8.json │ │ ├── topic1_format9.json │ │ ├── topic2.json │ │ ├── topic20.json │ │ ├── topic20_format0.json │ │ ├── topic20_format1.json │ │ ├── topic20_format10.json │ │ ├── topic20_format11.json │ │ ├── topic20_format12.json │ │ ├── topic20_format13.json │ │ ├── topic20_format14.json │ │ ├── topic20_format15.json │ │ ├── topic20_format16.json │ │ ├── topic20_format17.json │ │ ├── topic20_format18.json │ │ ├── topic20_format19.json │ │ ├── topic20_format2.json │ │ ├── topic20_format20.json │ │ ├── topic20_format21.json │ │ ├── topic20_format22.json │ │ ├── topic20_format23.json │ │ ├── topic20_format3.json │ │ ├── topic20_format4.json │ │ ├── topic20_format5.json │ │ ├── topic20_format6.json │ │ ├── topic20_format7.json │ │ ├── topic20_format8.json │ │ ├── topic20_format9.json │ │ ├── topic21.json │ │ ├── 
topic21_format0.json │ │ ├── topic21_format1.json │ │ ├── topic21_format10.json │ │ ├── topic21_format11.json │ │ ├── topic21_format12.json │ │ ├── topic21_format13.json │ │ ├── topic21_format14.json │ │ ├── topic21_format15.json │ │ ├── topic21_format16.json │ │ ├── topic21_format17.json │ │ ├── topic21_format18.json │ │ ├── topic21_format19.json │ │ ├── topic21_format2.json │ │ ├── topic21_format20.json │ │ ├── topic21_format21.json │ │ ├── topic21_format22.json │ │ ├── topic21_format23.json │ │ ├── topic21_format3.json │ │ ├── topic21_format4.json │ │ ├── topic21_format5.json │ │ ├── topic21_format6.json │ │ ├── topic21_format7.json │ │ ├── topic21_format8.json │ │ ├── topic21_format9.json │ │ ├── topic22.json │ │ ├── topic22_format0.json │ │ ├── topic22_format1.json │ │ ├── topic22_format10.json │ │ ├── topic22_format11.json │ │ ├── topic22_format12.json │ │ ├── topic22_format13.json │ │ ├── topic22_format14.json │ │ ├── topic22_format15.json │ │ ├── topic22_format16.json │ │ ├── topic22_format17.json │ │ ├── topic22_format18.json │ │ ├── topic22_format19.json │ │ ├── topic22_format2.json │ │ ├── topic22_format20.json │ │ ├── topic22_format21.json │ │ ├── topic22_format22.json │ │ ├── topic22_format23.json │ │ ├── topic22_format3.json │ │ ├── topic22_format4.json │ │ ├── topic22_format5.json │ │ ├── topic22_format6.json │ │ ├── topic22_format7.json │ │ ├── topic22_format8.json │ │ ├── topic22_format9.json │ │ ├── topic23.json │ │ ├── topic23_format0.json │ │ ├── topic23_format1.json │ │ ├── topic23_format10.json │ │ ├── topic23_format11.json │ │ ├── topic23_format12.json │ │ ├── topic23_format13.json │ │ ├── topic23_format14.json │ │ ├── topic23_format15.json │ │ ├── topic23_format16.json │ │ ├── topic23_format17.json │ │ ├── topic23_format18.json │ │ ├── topic23_format19.json │ │ ├── topic23_format2.json │ │ ├── topic23_format20.json │ │ ├── topic23_format21.json │ │ ├── topic23_format22.json │ │ ├── topic23_format23.json │ │ ├── topic23_format3.json │ │ ├── 
topic23_format4.json │ │ ├── topic23_format5.json │ │ ├── topic23_format6.json │ │ ├── topic23_format7.json │ │ ├── topic23_format8.json │ │ ├── topic23_format9.json │ │ ├── topic2_format0.json │ │ ├── topic2_format1.json │ │ ├── topic2_format10.json │ │ ├── topic2_format11.json │ │ ├── topic2_format12.json │ │ ├── topic2_format13.json │ │ ├── topic2_format14.json │ │ ├── topic2_format15.json │ │ ├── topic2_format16.json │ │ ├── topic2_format17.json │ │ ├── topic2_format18.json │ │ ├── topic2_format19.json │ │ ├── topic2_format2.json │ │ ├── topic2_format20.json │ │ ├── topic2_format21.json │ │ ├── topic2_format22.json │ │ ├── topic2_format23.json │ │ ├── topic2_format3.json │ │ ├── topic2_format4.json │ │ ├── topic2_format5.json │ │ ├── topic2_format6.json │ │ ├── topic2_format7.json │ │ ├── topic2_format8.json │ │ ├── topic2_format9.json │ │ ├── topic3.json │ │ ├── topic3_format0.json │ │ ├── topic3_format1.json │ │ ├── topic3_format10.json │ │ ├── topic3_format11.json │ │ ├── topic3_format12.json │ │ ├── topic3_format13.json │ │ ├── topic3_format14.json │ │ ├── topic3_format15.json │ │ ├── topic3_format16.json │ │ ├── topic3_format17.json │ │ ├── topic3_format18.json │ │ ├── topic3_format19.json │ │ ├── topic3_format2.json │ │ ├── topic3_format20.json │ │ ├── topic3_format21.json │ │ ├── topic3_format22.json │ │ ├── topic3_format23.json │ │ ├── topic3_format3.json │ │ ├── topic3_format4.json │ │ ├── topic3_format5.json │ │ ├── topic3_format6.json │ │ ├── topic3_format7.json │ │ ├── topic3_format8.json │ │ ├── topic3_format9.json │ │ ├── topic4.json │ │ ├── topic4_format0.json │ │ ├── topic4_format1.json │ │ ├── topic4_format10.json │ │ ├── topic4_format11.json │ │ ├── topic4_format12.json │ │ ├── topic4_format13.json │ │ ├── topic4_format14.json │ │ ├── topic4_format15.json │ │ ├── topic4_format16.json │ │ ├── topic4_format17.json │ │ ├── topic4_format18.json │ │ ├── topic4_format19.json │ │ ├── topic4_format2.json │ │ ├── topic4_format20.json │ │ ├── 
topic4_format21.json │ │ ├── topic4_format22.json │ │ ├── topic4_format23.json │ │ ├── topic4_format3.json │ │ ├── topic4_format4.json │ │ ├── topic4_format5.json │ │ ├── topic4_format6.json │ │ ├── topic4_format7.json │ │ ├── topic4_format8.json │ │ ├── topic4_format9.json │ │ ├── topic5.json │ │ ├── topic5_format0.json │ │ ├── topic5_format1.json │ │ ├── topic5_format10.json │ │ ├── topic5_format11.json │ │ ├── topic5_format12.json │ │ ├── topic5_format13.json │ │ ├── topic5_format14.json │ │ ├── topic5_format15.json │ │ ├── topic5_format16.json │ │ ├── topic5_format17.json │ │ ├── topic5_format18.json │ │ ├── topic5_format19.json │ │ ├── topic5_format2.json │ │ ├── topic5_format20.json │ │ ├── topic5_format21.json │ │ ├── topic5_format22.json │ │ ├── topic5_format23.json │ │ ├── topic5_format3.json │ │ ├── topic5_format4.json │ │ ├── topic5_format5.json │ │ ├── topic5_format6.json │ │ ├── topic5_format7.json │ │ ├── topic5_format8.json │ │ ├── topic5_format9.json │ │ ├── topic6.json │ │ ├── topic6_format0.json │ │ ├── topic6_format1.json │ │ ├── topic6_format10.json │ │ ├── topic6_format11.json │ │ ├── topic6_format12.json │ │ ├── topic6_format13.json │ │ ├── topic6_format14.json │ │ ├── topic6_format15.json │ │ ├── topic6_format16.json │ │ ├── topic6_format17.json │ │ ├── topic6_format18.json │ │ ├── topic6_format19.json │ │ ├── topic6_format2.json │ │ ├── topic6_format20.json │ │ ├── topic6_format21.json │ │ ├── topic6_format22.json │ │ ├── topic6_format23.json │ │ ├── topic6_format3.json │ │ ├── topic6_format4.json │ │ ├── topic6_format5.json │ │ ├── topic6_format6.json │ │ ├── topic6_format7.json │ │ ├── topic6_format8.json │ │ ├── topic6_format9.json │ │ ├── topic7.json │ │ ├── topic7_format0.json │ │ ├── topic7_format1.json │ │ ├── topic7_format10.json │ │ ├── topic7_format11.json │ │ ├── topic7_format12.json │ │ ├── topic7_format13.json │ │ ├── topic7_format14.json │ │ ├── topic7_format15.json │ │ ├── topic7_format16.json │ │ ├── topic7_format17.json │ │ 
├── topic7_format18.json │ │ ├── topic7_format19.json │ │ ├── topic7_format2.json │ │ ├── topic7_format20.json │ │ ├── topic7_format21.json │ │ ├── topic7_format22.json │ │ ├── topic7_format23.json │ │ ├── topic7_format3.json │ │ ├── topic7_format4.json │ │ ├── topic7_format5.json │ │ ├── topic7_format6.json │ │ ├── topic7_format7.json │ │ ├── topic7_format8.json │ │ ├── topic7_format9.json │ │ ├── topic8.json │ │ ├── topic8_format0.json │ │ ├── topic8_format1.json │ │ ├── topic8_format10.json │ │ ├── topic8_format11.json │ │ ├── topic8_format12.json │ │ ├── topic8_format13.json │ │ ├── topic8_format14.json │ │ ├── topic8_format15.json │ │ ├── topic8_format16.json │ │ ├── topic8_format17.json │ │ ├── topic8_format18.json │ │ ├── topic8_format19.json │ │ ├── topic8_format2.json │ │ ├── topic8_format20.json │ │ ├── topic8_format21.json │ │ ├── topic8_format22.json │ │ ├── topic8_format23.json │ │ ├── topic8_format3.json │ │ ├── topic8_format4.json │ │ ├── topic8_format5.json │ │ ├── topic8_format6.json │ │ ├── topic8_format7.json │ │ ├── topic8_format8.json │ │ ├── topic8_format9.json │ │ ├── topic9.json │ │ ├── topic9_format0.json │ │ ├── topic9_format1.json │ │ ├── topic9_format10.json │ │ ├── topic9_format11.json │ │ ├── topic9_format12.json │ │ ├── topic9_format13.json │ │ ├── topic9_format14.json │ │ ├── topic9_format15.json │ │ ├── topic9_format16.json │ │ ├── topic9_format17.json │ │ ├── topic9_format18.json │ │ ├── topic9_format19.json │ │ ├── topic9_format2.json │ │ ├── topic9_format20.json │ │ ├── topic9_format21.json │ │ ├── topic9_format22.json │ │ ├── topic9_format23.json │ │ ├── topic9_format3.json │ │ ├── topic9_format4.json │ │ ├── topic9_format5.json │ │ ├── topic9_format6.json │ │ ├── topic9_format7.json │ │ ├── topic9_format8.json │ │ └── topic9_format9.json │ ├── formats.json │ ├── statistics.json │ └── topics.json ├── images │ ├── ai2_logo.png │ ├── icon.png │ ├── mixtures_implicit.png │ ├── mixtures_regmix.png │ ├── pli_logo.svg │ ├── 
princeton_logo.png │ ├── results_main.png │ ├── treemaps.png │ ├── uc_berkeley_logo.png │ └── uw_logo.png └── js │ └── treemaps.js └── index.html /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | .DS_Store 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Organize the Web: Constructing Domains Enhances Pre-Training Data Curation 2 | 3 | [[Paper](https://arxiv.org/pdf/2502.10341.pdf)] [[Website](https://weborganizer.allen.ai)] [[Hugging Face](https://huggingface.co/WebOrganizer)] 4 | 5 | Overview over WebOrganizer domains 6 | 7 | *Interactively explore these domains and examples of web pages they contain at [https://weborganizer.allen.ai](https://weborganizer.allen.ai)* 8 | 9 | 10 | 11 | ## Resources 12 | 13 | #### Domain Classifiers 14 | All our domain classifiers are available on Huggingface Hub. Our default domain classifiers use both the URL and web site content to make predictions. We also provide two additional models that only use the web site content and therefore can be applied to a wider variety of documents. 15 | 1. __Topic__: [WebOrganizer/TopicClassifier](https://huggingface.co/WebOrganizer/TopicClassifier) ([-NoURL version](https://huggingface.co/WebOrganizer/TopicClassifier-NoURL)) 16 | 2. __Format__: [WebOrganizer/FormatClassifier](https://huggingface.co/WebOrganizer/FormatClassifier) ([-NoURL version](https://huggingface.co/WebOrganizer/FormatClassifier-NoURL)) 17 | 18 | These domains classifiers are trained on the following datasets: 19 | 1. 
In a first stage, 1M web pages classified
42 | tokens/ # number of tokens per document (GPT-NeoX tokenizer) 43 | - CC_shard_00000000_processed.npy 44 | - CC_shard_00000001_processed.npy 45 | - ... 46 | scores_dclm-fasttext/ # DCLM-fasttext score 47 | - CC_shard_00000000_processed.npy 48 | - ... 49 | scores_fineweb-edu/ # FineWeb-Edu score 50 | - CC_shard_00000000_processed.npy 51 | - ... 52 | scores_fineweb-edu__rounded/ # Rounded FineWeb-Edu score 53 | - CC_shard_00000000_processed__rounded.npy 54 | - ... 55 | domains_topics/ # TopicClassifier annotations 56 | - CC_shard_00000000_processed__choice.npy # index of top choice 57 | - ... 58 | domain_topics__logits/ 59 | - CC_shard_00000000_processed__logits.npy # logits for each topic 60 | - ... 61 | domains_formats/ # FormatClassifier annotations 62 | - CC_shard_00000000_processed__choice.npy # index of top choice 63 | - ... 64 | domains_formats/ # FormatClassifier annotations 65 | - CC_shard_00000000_processed__logits.npy # logits for each format 66 | - ... 67 | domains_clusters-k24/ # K-means clusters 68 | - CC_shard_00000000_processed.npy # cluster assignment for each document 69 | - ... 70 | ``` 71 | We also include statistics about the presence and co-occurence of domains in the `domain_statistics/` folder, computed with the `domain_statistics.py` script. 72 | 73 | ## Installation 74 | Different steps in this repository require different dependencies: 75 | 76 | * __Data pre-processing__: *coming soon* 77 | ```bash 78 | # install datatools and gte... 79 | ``` 80 | 81 | * __K-means clustering__: The code in `define_domains/k-means-clustering` is a fork of [facebookresearch/ssl-data-curation](https://github.com/facebookresearch/ssl-data-curation/tree/main). Please read the README in the this directory for installation instructions and to see our modifications. 
82 | 83 | * __DataComps-LM tokenization and training__: Please refer to the [DataComps-LM repository](https://github.com/mlfoundations/dclm) for instructions on how to tokenize and train models for DataComps-LM. 84 | 85 | 86 | ## Training New Domain Classifiers 87 | You can define a new taxonomy config in `define_domains/taxonomies` and then train a new domain classifier using the `define_domains/prompt_classify.sh` script. 88 | To distill the Llama annotations into a new domain classifier, use the `define_domains/train_classifier.sh` script and pass the new training dataset as a script option. For two stage training, simply run the training script twice with different training datasets, and initialize the second stage with the model checkpoint from the first stage. 89 | 90 | ## Annotating Data 91 | The script `annotate_data/annotate.sh` does large-scale data annotation using a slurm job array to iterate through the document shards in the `Corpus-200B` folder, and annotate each document with quality and domain annotations, which are stored as numpy arrays in separate annotation folders. 92 | 93 | ## Predict a Training Distribution with RegMix 94 | *Coming soon...* 95 | 96 | ## Selecting Training Data for Language Models 97 | `select_training_data.py` uses the folder structure of the `Corpus-200B` and used by the annotation scripts to select training data for language models. 98 | 99 | Example usage: 100 | ```python 101 | python select_training_data.py \ 102 | --input_base "datasets/Corpus-200B" \ 103 | --output_base "datasets/selected/Baseline-30B" \ 104 | --num_tokens 30000000000 \ 105 | --do_sample \ 106 | --num_proc 16 107 | ``` 108 | 109 | It supports various options for quality filtering and domain mixing and uses multiple workers to write data in parallel. 110 | The script first writes indices for each document shard in the `Corpus-200B` folder and then uses multiple workers to write the data in parallel. 
You can use the `domain_statistics.py` script to summarize the domain distribution of datasets and use these for selecting training data by passing them to `--ref_distribution`.

The folder of selected documents can then be used with the tokenization and training scripts from the [DCLM repository](https://github.com/mlfoundations/dclm) to train a new language model.


## Citation
```bibtex
@article{wettig2025organize,
  title={Organize the Web: Constructing Domains Enhances Pre-Training Data Curation},
  author={Alexander Wettig and Kyle Lo and Sewon Min and Hannaneh Hajishirzi and Danqi Chen and Luca Soldaini},
  journal={arXiv preprint arXiv:2502.10341},
  year={2025}
}
```
--------------------------------------------------------------------------------
/annotate_data/annotate.sh:
--------------------------------------------------------------------------------
#!/bin/bash
#SBATCH -J annotate
#SBATCH -N 1 -c 9 --gres=gpu:1 --mem=72G
#SBATCH --output=slurm/%x-%A_%a.out
#SBATCH -t 0-24
#SBATCH -a 0-31

# Annotate document shards with quality scores and domain labels.
# Point to the root data directory:
# each array job will iterate through a subset of files in
# $DATA_ROOT/$DOCUMENTS_DIR and annotate them.
data_root=${DATA_ROOT:-}
documents_dir=${DOCUMENTS_DIR:-"documents"}

# Use WORKER/NUM_WORKERS env variables, slurm array variables or default to 0/1
num_workers=${NUM_WORKERS:-${SLURM_ARRAY_TASK_COUNT:-1}}
worker=${WORKER:-${SLURM_ARRAY_TASK_ID:-0}}

# FIX: the original pipeline was missing the `grep` command, so the file list
# was always empty; keep only the compressed document shards.
files=( $(ls -1 "$data_root/$documents_dir" | grep "\.jsonl\.zst$") )
num_files=${#files[@]}

# Iterate through the files assigned to this worker (worker id strided by
# the total number of workers)
for id in $(jq -n "range($worker; $num_files; $num_workers)"); do
    file=${files[$id]}
    output_file=${file%%.*}

    # Tokenize data and compute length
    python tokens.py \
        $data_root/$documents_dir/$file \
        $data_root/tokens/$output_file
30 | 31 | # Compute DCLM-fasttext scores 32 | python fasttext.py \ 33 | $data_root/$documents_dir/$file \ 34 | $data_root/scores_dclm-fasttext/$output_file \ 35 | --model_path 36 | 37 | 38 | # ^ The two scripts above do not make use of a GPU and should be run separately 39 | # Everything below is accelerated a lot with GPUs 40 | 41 | # Compute FineWeb-Edu scores 42 | python edu.py \ 43 | $data_root/$documents_dir/$file \ 44 | $data_root/scores_fineweb-edu/$output_file \ 45 | --model_name HuggingFaceTB/fineweb-edu-classifier 46 | 47 | # Compute Topic and Format domains 48 | python domains.py \ 49 | $data_root/$documents_dir/$file \ 50 | $data_root/domains_topics/$output_file \ 51 | --model_name WebOrganizer/WebOrganizer-TopicClassifier 52 | python domains.py \ 53 | $data_root/$documents_dir/$file \ 54 | $data_root/domains_formats/$output_file \ 55 | --model_name WebOrganizer/WebOrganizer-FormatClassifier 56 | 57 | # # For annotating kmeans clusters 58 | # python embed.py \ 59 | # $data_root/$documents_dir/$file \ 60 | # $data_root/embeds/$output_file 61 | # python clusters.py \ 62 | # $data_root/embeds/${output_file}.npy \ 63 | # $data_root/domains_clusters-k24/$output_file \ 64 | # --clustering_folder ../define_domains/k-means-clustering/exps/dclm-k24 65 | done 66 | -------------------------------------------------------------------------------- /annotate_data/clusters.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | import numpy as np 3 | import torch 4 | from dataclasses import dataclass 5 | from functools import partial 6 | from pathlib import Path 7 | from typing import Optional 8 | 9 | from datatools.process import process, ProcessOptions 10 | from datatools.load import load, LoadOptions 11 | from simple_parsing import ArgumentParser 12 | 13 | 14 | @dataclass 15 | class ScriptOptions: 16 | clustering_folder: Path 17 | batch_size: int = 4192 18 | device: str = "cpu" 19 | 20 | 21 | def 
assign_clusters(dataset, indices, process_id, options): 22 | centroids_paths = sorted(options.clustering_folder.glob("level*/centroids.npy")) 23 | 24 | centroids_by_level = [torch.tensor(np.load(centroids_path)).to(options.device) for centroids_path in centroids_paths] 25 | 26 | for i in tqdm(range(0, len(dataset), options.batch_size), disable = process_id != 0): 27 | batch = [dataset[j] for j in range(i, min(i + options.batch_size, len(dataset)))] 28 | embeddings = torch.tensor(np.stack(batch)).to(options.device) 29 | 30 | assignments_by_level = [] 31 | 32 | for centroids in centroids_by_level: 33 | # Compute distances 34 | distances = torch.cdist(embeddings, centroids) 35 | 36 | # Get cluster assignments 37 | cluster_ids = torch.argmin(distances, dim=1) 38 | assignments_by_level.append(cluster_ids.cpu().numpy()) 39 | 40 | embeddings = centroids[cluster_ids] 41 | 42 | for cluster_id_by_level in zip(*assignments_by_level): 43 | if len(cluster_id_by_level) == 1: 44 | yield { 45 | "": cluster_id_by_level[0] 46 | } 47 | else: 48 | yield { 49 | f"level{i+1}": cluster_id 50 | for i, cluster_id in enumerate(cluster_id_by_level) 51 | } 52 | 53 | 54 | if __name__ == "__main__": 55 | parser = ArgumentParser() 56 | 57 | parser.add_argument("inputs", type=Path, nargs="+", help="Input embeds paths") 58 | parser.add_argument("output", type=Path, help="Output dataset path") 59 | 60 | parser.add_arguments(ScriptOptions, dest="script_options") 61 | parser.add_arguments(LoadOptions, dest="load_options") 62 | parser.add_arguments(ProcessOptions, dest="process_options") 63 | 64 | args = parser.parse_args() 65 | args.process_options.ndarray = True 66 | 67 | print("Arguments:", args) 68 | dataset = load(*args.inputs, options=args.load_options) 69 | N = len(dataset) 70 | print(f"Loaded dataset with {N} samples") 71 | 72 | process(dataset, partial(assign_clusters, options=args.script_options), args.output, args.process_options) 
class DataCollator:
    """Render documents through the input template and batch-tokenize them.

    Each feature dict is formatted with `options.input_template` (e.g.
    "{url}\n\n{text}") and the resulting strings are tokenized as one padded,
    truncated batch of PyTorch tensors.
    """

    def __init__(self, tokenizer, options):
        self.tokenizer = tokenizer
        self.options = options

    @torch.no_grad()
    def __call__(self, features):
        texts = [self.options.input_template.format(**feature) for feature in features]
        return self.tokenizer(
            texts,
            padding=True,
            truncation=True,
            return_tensors='pt',
            max_length=self.options.max_length,
        )
if __name__ == "__main__":
    parser = ArgumentParser()

    parser.add_argument("inputs", type=Path, nargs="+", help="Input dataset paths")
    parser.add_argument("output", type=Path, help="Output dataset path")

    parser.add_arguments(EmbedOptions, dest="embed_options")
    parser.add_arguments(LoadOptions, dest="load_options")
    parser.add_arguments(ProcessOptions, dest="process_options")

    args = parser.parse_args()
    # Outputs contain numpy arrays (logits), so force ndarray serialization.
    args.process_options.ndarray = True

    print("Arguments:", args)
    dataset = load(*args.inputs, options=args.load_options)
    print(f"Loaded dataset with {len(dataset)} samples")

    process(
        dataset,
        partial(predict_fn, options=args.embed_options),
        args.output,
        args.process_options,
    )
"HuggingFaceTB/fineweb-edu-classifier" 19 | batch_size: int = 128 20 | num_dataloader_workers: int = 8 21 | max_length: int = 512 22 | input_template: str = "{text}" 23 | 24 | 25 | class DataCollator: 26 | def __init__(self, tokenizer, options): 27 | self.tokenizer = tokenizer 28 | self.options = options 29 | 30 | @torch.no_grad() 31 | def __call__(self, features): 32 | documents = [self.options.input_template.format(**f) for f in features] 33 | return self.tokenizer(documents, padding=True, truncation=True, return_tensors='pt', max_length=self.options.max_length) 34 | 35 | 36 | def load_model_and_tokenizer(options): 37 | tokenizer = AutoTokenizer.from_pretrained(options.model_name) 38 | model = AutoModelForSequenceClassification.from_pretrained(options.model_name) 39 | return model, tokenizer 40 | 41 | 42 | 43 | @torch.inference_mode() 44 | def predict_fn(subset, indices, process_id, options): 45 | 46 | model, tokenizer = load_model_and_tokenizer(options) 47 | model.to(torch.bfloat16) 48 | model.cuda() 49 | model.eval() 50 | 51 | data_loader = DataLoader(subset, 52 | batch_size=options.batch_size, 53 | collate_fn=DataCollator(tokenizer, options), 54 | num_workers=options.num_dataloader_workers, 55 | prefetch_factor=4, 56 | pin_memory=True, 57 | shuffle=False) 58 | 59 | for batch in tqdm(data_loader, disable=(process_id != 0)): 60 | for key in batch: 61 | batch[key] = batch[key].cuda() 62 | 63 | model_output = model(**batch) 64 | 65 | scores = model_output.logits.squeeze(-1).float().cpu().detach().numpy() 66 | 67 | for seq_score in scores: 68 | yield { 69 | "": seq_score.item(), 70 | "rounded": int(round(max(0, min(seq_score.item(), 5)))) 71 | } 72 | 73 | 74 | if __name__ == "__main__": 75 | parser = ArgumentParser() 76 | 77 | parser.add_argument("inputs", type=Path, nargs="+", help="Input dataset paths") 78 | parser.add_argument("output", type=Path, help="Output dataset path") 79 | 80 | parser.add_arguments(EmbedOptions, dest="embed_options") 81 | 
parser.add_arguments(LoadOptions, dest="load_options") 82 | parser.add_arguments(ProcessOptions, dest="process_options") 83 | 84 | args = parser.parse_args() 85 | args.process_options.ndarray = True 86 | 87 | print("Arguments:", args) 88 | dataset = load(*args.inputs, options=args.load_options) 89 | N = len(dataset) 90 | print(f"Loaded dataset with {N} samples") 91 | 92 | process( 93 | dataset, 94 | partial(predict_fn, options=args.embed_options), 95 | args.output, 96 | args.process_options 97 | ) 98 | -------------------------------------------------------------------------------- /annotate_data/embed.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from functools import partial 3 | from pathlib import Path 4 | from tqdm import tqdm 5 | 6 | from transformers import AutoModel, AutoTokenizer 7 | import torch 8 | from torch.utils.data import DataLoader 9 | import numpy 10 | 11 | from datatools.process import process, ProcessOptions 12 | from datatools.load import load, LoadOptions 13 | from simple_parsing import ArgumentParser, field 14 | from typing import Dict, Any 15 | 16 | @dataclass 17 | class EmbedOptions: 18 | model_name: str = "Alibaba-NLP/gte-base-en-v1.5" 19 | batch_size: int = 128 20 | num_dataloader_workers: int = 8 21 | pooling_strategy: str = "cls" 22 | normalize_embeddings: bool = True 23 | max_length: int = 8192 24 | input_template: str = "{text}" 25 | 26 | 27 | class DataCollator: 28 | def __init__(self, tokenizer, options): 29 | self.tokenizer = tokenizer 30 | self.options = options 31 | 32 | @torch.no_grad() 33 | def __call__(self, features): 34 | documents = [self.options.input_template.format(**f) for f in features] 35 | return self.tokenizer(documents, padding=True, truncation=True, return_tensors='pt', max_length=self.options.max_length) 36 | 37 | 38 | @torch.inference_mode() 39 | def pooling(model_output, attention_mask, pooling_strategy): 40 | if pooling_strategy == 
"cls": 41 | return model_output.last_hidden_state[:, 0].float() 42 | elif pooling_strategy == "mean": 43 | token_embeddings = model_output[0] 44 | input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() 45 | return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9) 46 | 47 | 48 | def load_model_and_tokenizer(options): 49 | if options.model_name.startswith("nomic-ai/"): 50 | try: 51 | from contrastors.models.encoder.modeling_nomic_bert import NomicBertModel 52 | except: 53 | raise ImportError("Could not import NomicBertModel. Please install the https://github.com/nomic-ai/contrastors in this folder") 54 | 55 | tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased') 56 | model = NomicBertModel.from_pretrained('nomic-ai/nomic-embed-text-v1', add_pooling_layer=False) 57 | else: 58 | tokenizer = AutoTokenizer.from_pretrained(options.model_name) 59 | model = AutoModel.from_pretrained(options.model_name, trust_remote_code=True) 60 | return model, tokenizer 61 | 62 | 63 | 64 | @torch.inference_mode() 65 | def predict_fn(subset, indices, process_id, options): 66 | 67 | model, tokenizer = load_model_and_tokenizer(options) 68 | model.to(torch.bfloat16) 69 | model.cuda() 70 | model.eval() 71 | 72 | data_loader = DataLoader(subset, 73 | batch_size=options.batch_size, 74 | collate_fn=DataCollator(tokenizer, options), 75 | num_workers=options.num_dataloader_workers, 76 | prefetch_factor=4, 77 | pin_memory=True, 78 | shuffle=False) 79 | 80 | for batch in tqdm(data_loader, disable=(process_id != 0)): 81 | for key in batch: 82 | batch[key] = batch[key].cuda() 83 | 84 | model_output = model(**batch) 85 | embeddings = pooling(model_output, batch['attention_mask'], options.pooling_strategy) 86 | 87 | if options.normalize_embeddings: 88 | embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1) 89 | 90 | for embed in embeddings: 91 | yield {"": embed.cpu().numpy()} 92 | 93 | 94 | if 
__name__ == "__main__": 95 | parser = ArgumentParser() 96 | 97 | parser.add_argument("inputs", type=Path, nargs="+", help="Input dataset paths") 98 | parser.add_argument("output", type=Path, help="Output dataset path") 99 | 100 | parser.add_arguments(EmbedOptions, dest="embed_options") 101 | parser.add_arguments(LoadOptions, dest="load_options") 102 | parser.add_arguments(ProcessOptions, dest="process_options") 103 | 104 | args = parser.parse_args() 105 | args.process_options.ndarray = True 106 | 107 | print("Arguments:", args) 108 | dataset = load(*args.inputs, options=args.load_options) 109 | N = len(dataset) 110 | print(f"Loaded dataset with {N} samples") 111 | 112 | process( 113 | dataset, 114 | partial(predict_fn, options=args.embed_options), 115 | args.output, 116 | args.process_options 117 | ) 118 | -------------------------------------------------------------------------------- /annotate_data/fasttext.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Dict, List, Callable 3 | from functools import partial 4 | 5 | import fasttext 6 | from tqdm import tqdm 7 | 8 | from pathlib import Path 9 | 10 | from simple_parsing import ArgumentParser, field 11 | from datatools.process import process, ProcessOptions 12 | from datatools.load import load, LoadOptions 13 | 14 | 15 | def classify_fasttext_hq_prob(model: fasttext.FastText._FastText, content: str): 16 | # Clean the input text by joining all lines into a single string 17 | text = " ".join(content.strip().splitlines()) 18 | 19 | # Make the prediction 20 | pred = model.predict(text) 21 | 22 | # Extract the predicted label and its probability 23 | (pred_label, pred_prob) = pred 24 | pred_label = pred_label[0] 25 | hq_prob = pred_prob[0] 26 | 27 | # If the predicted label is 'CC', adjust the probability of it being 'Wikipedia' 28 | if pred_label == "__label__cc": 29 | hq_prob = 1 - hq_prob 30 | 31 | # Return the output 32 | return hq_prob 33 | 34 
def predict_fn(dataset, indices, process_id, model_path, text_field="text"):
    """Yield the high-quality probability for every document in `dataset`.

    Loads the fastText model once per worker; only process 0 shows a
    progress bar. Yields {"": probability} per document.
    """
    model = fasttext.load_model(model_path)

    show_progress = process_id == 0
    for idx in tqdm(range(len(dataset)), disable=not show_progress):
        document = dataset[idx][text_field]
        yield {"": classify_fasttext_hq_prob(model, document)}
class DataCollator:
    """Pad pre-tokenized sequences into fixed-size id/mask tensors.

    Batches are padded (with id 0) to the longest sequence in the batch,
    capped at `max_length`; longer sequences are truncated. Returns a dict
    with "input_ids" and "attention_mask" long tensors.
    """

    def __init__(self, max_length):
        self.max_length = max_length

    @torch.no_grad()
    def __call__(self, features):
        sequences = [feature["input_ids"] for feature in features]
        batch_len = min(self.max_length, max(len(seq) for seq in sequences))

        input_ids = torch.zeros(len(sequences), batch_len, dtype=torch.long)
        attention_mask = torch.zeros(len(sequences), batch_len, dtype=torch.long)

        for row, seq in enumerate(sequences):
            trimmed = seq[:batch_len]
            width = len(trimmed)
            input_ids[row, :width] = torch.tensor(trimmed)
            attention_mask[row, :width] = 1

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask
        }
def predict_fn(dataset, indices, process_id, tokenizer_name, text_field="text"):
    """Count tokens for every document in `dataset`.

    Yields {"": token_count} per document; only process 0 shows a progress
    bar. (Fixes a comment that was truncated to "# + 1 for" in the original.)
    """
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    for i in tqdm(range(len(dataset)), disable=(process_id != 0)):
        text = dataset[i][text_field]
        tokens = tokenizer.encode(text)
        # +1 presumably accounts for an end-of-text token appended during
        # training-data tokenization, which encode() does not add here
        # -- TODO confirm against the training pipeline's tokenizer settings.
        num_tokens = len(tokens) + 1
        yield {
            "": num_tokens,
            # "bin": np.clip(np.log2(num_tokens).astype(int), 6, 11)
        }
"__main__": 29 | parser = ArgumentParser(add_config_path_arg=True) 30 | 31 | parser.add_argument("inputs", type=Path, nargs="+", help="Input dataset paths") 32 | parser.add_argument("output", type=Path, help="Output dataset path") 33 | 34 | parser.add_argument("--text_field", type=str, default="text", help="Name of the field containing the text to classify") 35 | parser.add_argument("--tokenizer", type=str, default="EleutherAI/gpt-neox-20b", help="Path to the FastText model") 36 | 37 | parser.add_arguments(LoadOptions, dest="load_options") 38 | parser.add_arguments(ProcessOptions, dest="process_options") 39 | 40 | args = parser.parse_args() 41 | args.process_options.ndarray = True 42 | 43 | print("Arguments:", args) 44 | dataset = load(*args.inputs, options=args.load_options) 45 | N = len(dataset) 46 | print(f"Loaded dataset with {N} samples") 47 | 48 | 49 | process( 50 | dataset, 51 | partial( 52 | predict_fn, 53 | text_field=args.text_field, 54 | tokenizer_name=args.tokenizer 55 | ), 56 | args.output, args.process_options 57 | ) 58 | -------------------------------------------------------------------------------- /define_domains/k-means-clustering/CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to make participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 
11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies within all project spaces, and it also applies when 49 | an individual is representing the project or its community in public spaces. 50 | Examples of representing a project or community include using an official 51 | project e-mail address, posting via an official social media account, or acting 52 | as an appointed representative at an online or offline event. 
Representation of 53 | a project may be further defined and clarified by project maintainers. 54 | 55 | This Code of Conduct also applies outside the project spaces when there is a 56 | reasonable belief that an individual's behavior may have a negative impact on 57 | the project or its community. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported by contacting the project team at . All 63 | complaints will be reviewed and investigated and will result in a response that 64 | is deemed necessary and appropriate to the circumstances. The project team is 65 | obligated to maintain confidentiality with regard to the reporter of an incident. 66 | Further details of specific enforcement policies may be posted separately. 67 | 68 | Project maintainers who do not follow or enforce the Code of Conduct in good 69 | faith may face temporary or permanent repercussions as determined by other 70 | members of the project's leadership. 71 | 72 | ## Attribution 73 | 74 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 75 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 76 | 77 | [homepage]: https://www.contributor-covenant.org 78 | 79 | For answers to common questions about this code of conduct, see 80 | https://www.contributor-covenant.org/faq 81 | -------------------------------------------------------------------------------- /define_domains/k-means-clustering/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to this repository 2 | We want to make contributing to this project as easy and transparent as 3 | possible. 4 | 5 | ## Pull Requests 6 | We actively welcome your pull requests. 7 | 8 | 1. Fork the repo and create your branch from `main`. 9 | 2. If you've added code that should be tested, add tests. 10 | 3. If you've changed APIs, update the documentation. 
11 | 4. Ensure the test suite passes. 12 | 5. Make sure your code lints. 13 | 6. If you haven't already, complete the Contributor License Agreement ("CLA"). 14 | 15 | ## Contributor License Agreement ("CLA") 16 | In order to accept your pull request, we need you to submit a CLA. You only need 17 | to do this once to work on any of Meta's open source projects. 18 | 19 | Complete your CLA here: 20 | 21 | ## Issues 22 | We use GitHub issues to track public bugs. Please ensure your description is 23 | clear and has sufficient instructions to be able to reproduce the issue. 24 | 25 | Meta has a [bounty program](https://bugbounty.meta.com/) for the safe 26 | disclosure of security bugs. In those cases, please go through the process 27 | outlined on that page and do not file a public issue. 28 | 29 | ## License 30 | By contributing to this repository, you agree that your contributions will be licensed 31 | under the LICENSE file in the root directory of this source tree. 32 | -------------------------------------------------------------------------------- /define_domains/k-means-clustering/README.md: -------------------------------------------------------------------------------- 1 | # Automatic Data Curation for Self-Supervised Learning: A Clustering-Based Approach 2 | 3 | __*This repository has been adapted from the [ssl-data-curation](https://github.com/facebookresearch/ssl-data-curation) repository. 4 | We have added functionality to read embeddings from many numpy files and MDS datasets. We have also changed the multi-node slurm implementation to make use of torchrun on each node. The experimental scripts for the paper can be found in `exps/*/level1/slurm_script.s`.*__ 5 | 6 | **[FAIR at Meta](https://ai.facebook.com/research/)** 7 | 8 | *Huy V. 
Vo, 9 | Vasil Khalidov, 10 | Timothée Darcet, 11 | Théo Moutakanni, 12 | Nikita Smetanin, 13 | Marc Szafraniec, 14 | Hugo Touvron, 15 | Camille Couprie, 16 | Maxime Oquab, 17 | Armand Joulin, 18 | Hervé Jégou, 19 | Patrick Labatut, 20 | Piotr Bojanowski* 21 | 22 | PyTorch implementation for the data curation pipeline with hierarchical k-means. For more detail, see the paper **[Automatic Data Curation for Self-Supervised Learning: A Clustering-Based Approach](https://arxiv.org/abs/2405.15613)**. 23 | 24 |

25 | data curation pipeline 26 |

27 | 28 | ## Contents 29 | - [Installation](#installation) 30 | - [Running hierarchical k-means](#running-hierarchical-k-means) 31 | * [On small data](#on-small-data) 32 | * [On large data](#on-large-data) 33 | - [Notebook](#notebook) 34 | - [Contributing](#contributing) 35 | - [License](#license) 36 | - [Citation](#citation) 37 | 38 | ## Installation 39 | ``` 40 | git clone git@github.com:facebookresearch/ssl-data-curation.git 41 | cd ssl-data-curation 42 | conda create -n ssl-data-curation python=3.10 43 | conda activate ssl-data-curation 44 | pip install -r requirements.txt 45 | ``` 46 | 47 | ## Running hierarchical k-means 48 | ### On small data 49 | We provide below an example of a 2-level hierarchical k-means on a small toy random dataset. We first run hierarchical k-means on the toy dataset then sample 1000 points from it with hierarchical sampling. A visualisation is provided in [vis/notebook.ipynb](vis/notebook.ipynb). 50 | ``` 51 | import torch 52 | import numpy as np 53 | 54 | from src.clusters import HierarchicalCluster 55 | from src import ( 56 | hierarchical_kmeans_gpu as hkmg, 57 | hierarchical_sampling as hs 58 | ) 59 | 60 | def make_ring(n, rmin, rmax): 61 | r = np.random.rand(n) * (rmax - rmin) + rmin 62 | alpha = np.random.rand(n) * 2 * np.pi 63 | return np.vstack([r * np.cos(alpha), r * np.sin(alpha)]).T 64 | 65 | data = np.concatenate([ 66 | make_ring(20000, 0.7, 1.0) + np.array([-2.2, 1.]), 67 | make_ring(200, 0.7, 1.0) + np.array([0., 1.]), 68 | make_ring(1000, 0.7, 1.0) + np.array([2.2, 1.]), 69 | make_ring(500, 0.7, 1.0) + np.array([-1.2, 0.2]), 70 | make_ring(8000, 0.7, 1.0) + np.array([1.2, 0.2]), 71 | ]) 72 | 73 | clusters = hkmg.hierarchical_kmeans_with_resampling( 74 | data=torch.tensor(data, device="cuda", dtype=torch.float32), 75 | n_clusters=[1000, 300], 76 | n_levels=2, 77 | sample_sizes=[15, 2], 78 | verbose=False, 79 | ) 80 | 81 | cl = HierarchicalCluster.from_dict(clusters) 82 | sampled_indices = hs.hierarchical_sampling(cl, 
target_size=1000) 83 | ``` 84 | 85 |

86 | data curation pipeline 87 |

88 | 89 | ### On large data 90 | To launch hierarchical k-means on large data, we need to prepare a config file. We provide below an example illustrating how to launch a 2-level hierarchical k-means on random embeddings with config in [configs/2levels_random_embeddings.yaml](configs/2levels_random_embeddings.yaml). 91 | ``` 92 | # Prepare the experiment 93 | cd ssl-data-curation 94 | mkdir -p data 95 | cd scripts 96 | python -c 'import numpy as np; np.save( "../data/100k_random.npy", np.random.randn(100000,256))' 97 | python hierarchical_kmeans_launcher.py \ 98 | --exp_dir ../data/2levels_random_embeddings \ 99 | --embeddings_path ../data/100k_random.npy \ 100 | --config_file ../configs/2levels_random_embeddings.yaml 101 | 102 | cd ../data/2levels_random_embeddings 103 | # Launch with slurm 104 | bash launcher.sh 105 | # Launch locally if only 1 node is used 106 | # bash local_launcher.sh 107 | 108 | cd ssl-data-curation/scripts 109 | # Sampled indices will be saved in ssl-data-curation/data/2levels_random_embeddings/curated_datasets 110 | PYTHONPATH=.. python run_hierarchical_sampling.py \ 111 | --clustering_path ../data/2levels_random_embeddings \ 112 | --target_size 20000 \ 113 | --save 114 | ``` 115 | 116 | We also provide the config used for our web-based image data pool in [configs/4levels_web_based_images.yaml](configs/4levels_web_based_images.yaml). 117 | 118 | ## Notebook 119 | We provide a [notebook](vis/notebook.ipynb) to reproduce visualizations in the paper and show additional examples. 120 | 121 | ## Contributing 122 | See [contributing](CONTRIBUTING.md) and the [code of conduct](CODE_OF_CONDUCT.md). 123 | 124 | ## License 125 | This code is CC-BY-NC 4.0 licensed, as found in [LICENSE](LICENSE). 
126 | 127 | ## Citation 128 | If you find our work useful, please consider giving a star and a citation: 129 | ``` 130 | @article{vo2024automatic, 131 | title={Automatic Data Curation for Self-Supervised Learning: A Clustering-Based Approach}, 132 | author={Vo, Huy V. and Khalidov, Vasil and Darcet, Timoth{\'e}e and Moutakanni, Th{\'e}o and Smetanin, Nikita and Szafraniec, Marc and Touvron, Hugo and Couprie, Camille and Oquab, Maxime and Joulin, Armand and Jégou, Hervé and Labatut, Patrick and Bojanowski, Piotr}, 133 | journal={arXiv:2405.15613}, 134 | year={2024}, 135 | } 136 | ``` -------------------------------------------------------------------------------- /define_domains/k-means-clustering/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | 8 | -------------------------------------------------------------------------------- /define_domains/k-means-clustering/configs/1level_dcml.yaml: -------------------------------------------------------------------------------- 1 | # Number of levels in hierarchical k-means. 2 | n_levels: 1 3 | # Number of updates of centroids in the main k-means loop. 4 | n_iters: 50 5 | # Number of clusters in each level of hierarchical k-means. 6 | # For efficiency in the first level, we run first a k-means 7 | # with 100k clusters, then split each cluster into 100 8 | # smaller ones to have 10M clusters. 9 | n_clusters: 10 | - 24 11 | # If > 1, run the level in two steps. First, k-means is executed once. 12 | # Then, each obtained cluster is splitted into "n_split" smaller clusters, 13 | # which are considered final and used in the subsequent level. 14 | n_splits: 15 | - 1 16 | # Number of resampling steps in each level. 
17 | # For efficiency, we do not use resampling in the first level. 18 | n_resampling_steps: 19 | - 1 20 | # Number of data points sampled from each cluster in the resampling steps. 21 | # It is roughly half the average cluster size in each level. 22 | sample_size: 23 | - 1 24 | # Specified if running only on a subset of the data pool. 25 | # For example, we extract embeddings for all images in the data pool, 26 | # but run the curation pipeline only on a deduplicated subset. 27 | subset_indices_path: null 28 | checkpoint_period: 10_000 29 | dtype: float32 30 | high_precision: float32 31 | ngpus_per_node: 32 | - 8 33 | nnodes: 34 | - 4 35 | ncpus_per_gpu: 6 36 | sampling_strategy: r 37 | slurm_partition: pli-c 38 | -------------------------------------------------------------------------------- /define_domains/k-means-clustering/configs/2levels_random_embeddings.yaml: -------------------------------------------------------------------------------- 1 | # Number of levels in hierarchical k-means. 2 | n_levels: 2 3 | # Number of updates of centroids in the main k-means loop. 4 | n_iters: 50 5 | # Number of clusters in each level of hierarchical k-means. 6 | n_clusters: 7 | - 5000 8 | - 1000 9 | # If > 1, run the level in two steps. First, k-means is executed once. 10 | # Then, each obtained cluster is splitted into "n_split" smaller clusters, 11 | # which are considered final and used in the subsequent level. 12 | n_splits: 13 | - 1 14 | - 1 15 | # Number of resampling steps in each level. 16 | n_resampling_steps: 17 | - 10 18 | - 10 19 | # Number of data points sampled from each cluster in the resampling steps. 20 | # It is roughly half the average cluster size in each level. 21 | sample_size: 22 | - 10 23 | - 3 24 | # Specified if running only on a subset of the data pool. 25 | # For example, we extract embeddings for all images in the data pool, 26 | # but run the curation pipeline only on a deduplicated subset. 
27 | subset_indices_path: null 28 | checkpoint_period: 1000 29 | dtype: float64 30 | high_precision: float64 31 | ngpus_per_node: 32 | - 2 33 | - 2 34 | nnodes: 35 | - 1 36 | - 1 37 | ncpus_per_gpu: 10 38 | sampling_strategy: c 39 | slurm_partition: null 40 | -------------------------------------------------------------------------------- /define_domains/k-means-clustering/configs/4levels_web_based_images.yaml: -------------------------------------------------------------------------------- 1 | # Number of levels in hierarchical k-means. 2 | n_levels: 4 3 | # Number of updates of centroids in the main k-means loop. 4 | n_iters: 50 5 | # Number of clusters in each level of hierarchical k-means. 6 | # For efficiency in the first level, we run first a k-means 7 | # with 100k clusters, then split each cluster into 100 8 | # smaller ones to have 10M clusters. 9 | n_clusters: 10 | - 100_000 11 | - 500_000 12 | - 50_000 13 | - 10_000 14 | # If > 1, run the level in two steps. First, k-means is executed once. 15 | # Then, each obtained cluster is splitted into "n_split" smaller clusters, 16 | # which are considered final and used in the subsequent level. 17 | n_splits: 18 | - 100 19 | - 1 20 | - 1 21 | - 1 22 | # Number of resampling steps in each level. 23 | # For efficiency, we do not use resampling in the first level. 24 | n_resampling_steps: 25 | - 1 26 | - 10 27 | - 10 28 | - 10 29 | # Number of data points sampled from each cluster in the resampling steps. 30 | # It is roughly half the average cluster size in each level. 31 | sample_size: 32 | - 1 33 | - 10 34 | - 5 35 | - 3 36 | # Specified if running only on a subset of the data pool. 37 | # For example, we extract embeddings for all images in the data pool, 38 | # but run the curation pipeline only on a deduplicated subset. 
39 | subset_indices_path: null 40 | checkpoint_period: 10_000 41 | dtype: float64 42 | high_precision: float64 43 | ngpus_per_node: 44 | - 8 45 | - 8 46 | - 8 47 | - 8 48 | nnodes: 49 | - 16 50 | - 2 51 | - 1 52 | - 1 53 | ncpus_per_gpu: 10 54 | sampling_strategy: c 55 | slurm_partition: null 56 | -------------------------------------------------------------------------------- /define_domains/k-means-clustering/exps/dclm-1level-k118/level1/centroids.npy: -------------------------------------------------------------------------------- 1 | /scratch/gpfs/awettig/delve/k-means-clustering/exps/dclm-1level-k118/level1/step0/centroids.npy -------------------------------------------------------------------------------- /define_domains/k-means-clustering/exps/dclm-1level-k118/level1/slurm_script.s: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH --nodes=4 4 | #SBATCH --gres=gpu:8 5 | #SBATCH --ntasks-per-node=1 6 | #SBATCH --job-name=kmeans_level1 7 | #SBATCH --time=1-0 8 | #SBATCH --mem=800G 9 | #SBATCH --cpus-per-task=32 10 | #SBATCH --partition=pli-c 11 | 12 | EXPDIR=/scratch/gpfs/awettig/delve/k-means-clustering/exps/dclm-1level-k118/level1 13 | cd /scratch/gpfs/awettig/delve/k-means-clustering/scripts 14 | 15 | master_addr=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) 16 | 17 | PYTHONPATH=.. 
\ 18 | srun -N 4 --unbuffered --output="$EXPDIR"/logs/%j_%t_log.out --error="$EXPDIR"/logs/%j_%t_log.err torchrun \ 19 | --nnodes=4 \ 20 | --nproc_per_node=8 \ 21 | --rdzv_backend=c10d \ 22 | --rdzv_endpoint=$master_addr:56321 \ 23 | run_distributed_kmeans.py \ 24 | --use_torchrun \ 25 | --data_path /scratch/gpfs/PLI/awettig/dclm/dclm-pool-1b-1x/deduplicated/embeds \ 26 | --n_clusters 118 \ 27 | --n_iters 50 \ 28 | --chunk_size 1694915 \ 29 | --dtype float32 \ 30 | --high_precision float32 \ 31 | --checkpoint_period 10000 \ 32 | --exp_dir $EXPDIR \ 33 | --n_steps 1 \ 34 | --sample_size 1 \ 35 | --do_not_sort_clusters \ 36 | --held_out_shards 100 \ 37 | --sampling_strategy r 38 | -------------------------------------------------------------------------------- /define_domains/k-means-clustering/exps/dclm-1level-k13824/level1/centroids.npy: -------------------------------------------------------------------------------- 1 | /scratch/gpfs/awettig/delve/k-means-clustering/exps/dclm-1level-k13824/level1/step0/centroids.npy -------------------------------------------------------------------------------- /define_domains/k-means-clustering/exps/dclm-1level-k13824/level1/slurm_script.s: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH --nodes=4 4 | #SBATCH --gres=gpu:8 5 | #SBATCH --ntasks-per-node=1 6 | #SBATCH --job-name=kmeans_level1 7 | #SBATCH --time=1-0 8 | #SBATCH --mem=800G 9 | #SBATCH --cpus-per-task=32 10 | #SBATCH --partition=pli-c 11 | 12 | EXPDIR=/scratch/gpfs/awettig/delve/k-means-clustering/exps/dclm-1level-k13824/level1 13 | cd /scratch/gpfs/awettig/delve/k-means-clustering/scripts 14 | 15 | master_addr=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) 16 | 17 | PYTHONPATH=.. 
\ 18 | srun -N 4 --unbuffered --output="$EXPDIR"/logs/%j_%t_log.out --error="$EXPDIR"/logs/%j_%t_log.err torchrun \ 19 | --nnodes=4 \ 20 | --nproc_per_node=8 \ 21 | --rdzv_backend=c10d \ 22 | --rdzv_endpoint=$master_addr:56321 \ 23 | run_distributed_kmeans.py \ 24 | --use_torchrun \ 25 | --data_path /scratch/gpfs/PLI/awettig/dclm/dclm-pool-1b-1x/deduplicated/embeds \ 26 | --n_clusters 13824 \ 27 | --n_iters 50 \ 28 | --chunk_size 14467 \ 29 | --dtype float32 \ 30 | --high_precision float32 \ 31 | --checkpoint_period 10000 \ 32 | --exp_dir $EXPDIR \ 33 | --n_steps 1 \ 34 | --sample_size 1 \ 35 | --do_not_sort_clusters \ 36 | --held_out_shards 100 \ 37 | --sampling_strategy r 38 | -------------------------------------------------------------------------------- /define_domains/k-means-clustering/exps/dclm-1level-k24/level1/centroids.npy: -------------------------------------------------------------------------------- 1 | /scratch/gpfs/awettig/delve/k-means-clustering/exps/dclm-1level-k24/level1/step0/centroids.npy -------------------------------------------------------------------------------- /define_domains/k-means-clustering/exps/dclm-1level-k24/level1/slurm_script.s: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH --nodes=3 4 | #SBATCH --gres=gpu:8 5 | #SBATCH --ntasks-per-node=1 6 | #SBATCH --job-name=kmeans_level1 7 | #SBATCH --time=1-0 8 | #SBATCH --mem=800G 9 | #SBATCH --cpus-per-task=32 10 | #SBATCH --partition=pli-c 11 | 12 | EXPDIR=/scratch/gpfs/awettig/delve/k-means-clustering/exps/dclm-1level-k24/level1 13 | cd /scratch/gpfs/awettig/delve/k-means-clustering/scripts 14 | 15 | master_addr=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) 16 | 17 | PYTHONPATH=.. 
\ 18 | srun -N 3 --unbuffered --output="$EXPDIR"/logs/%j_%t_log.out --error="$EXPDIR"/logs/%j_%t_log.err torchrun \ 19 | --nnodes=3 \ 20 | --nproc_per_node=8 \ 21 | --rdzv_backend=c10d \ 22 | --rdzv_endpoint=$master_addr:56321 \ 23 | run_distributed_kmeans.py \ 24 | --use_torchrun \ 25 | --data_path /scratch/gpfs/PLI/awettig/dclm/dclm-pool-1b-1x/deduplicated/embeds \ 26 | --n_clusters 24 \ 27 | --n_iters 50 \ 28 | --chunk_size 8333333 \ 29 | --dtype float32 \ 30 | --high_precision float32 \ 31 | --checkpoint_period 10000 \ 32 | --exp_dir $EXPDIR \ 33 | --n_steps 1 \ 34 | --sample_size 1 \ 35 | --do_not_sort_clusters \ 36 | --held_out_shards 100 \ 37 | --sampling_strategy r 38 | -------------------------------------------------------------------------------- /define_domains/k-means-clustering/exps/dclm-1level-k2822/level1/centroids.npy: -------------------------------------------------------------------------------- 1 | /scratch/gpfs/awettig/delve/k-means-clustering/exps/dclm-1level-k2822/level1/step0/centroids.npy -------------------------------------------------------------------------------- /define_domains/k-means-clustering/exps/dclm-1level-k2822/level1/slurm_script.s: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH --nodes=4 4 | #SBATCH --gres=gpu:8 5 | #SBATCH --ntasks-per-node=1 6 | #SBATCH --job-name=kmeans_level1 7 | #SBATCH --time=1-0 8 | #SBATCH --mem=800G 9 | #SBATCH --cpus-per-task=32 10 | #SBATCH --partition=pli-c 11 | 12 | EXPDIR=/scratch/gpfs/awettig/delve/k-means-clustering/exps/dclm-1level-k2822/level1 13 | cd /scratch/gpfs/awettig/delve/k-means-clustering/scripts 14 | 15 | master_addr=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) 16 | 17 | PYTHONPATH=.. 
\ 18 | srun -N 4 --unbuffered --output="$EXPDIR"/logs/%j_%t_log.out --error="$EXPDIR"/logs/%j_%t_log.err torchrun \ 19 | --nnodes=4 \ 20 | --nproc_per_node=8 \ 21 | --rdzv_backend=c10d \ 22 | --rdzv_endpoint=$master_addr:56321 \ 23 | run_distributed_kmeans.py \ 24 | --use_torchrun \ 25 | --data_path /scratch/gpfs/PLI/awettig/dclm/dclm-pool-1b-1x/deduplicated/embeds \ 26 | --n_clusters 2822 \ 27 | --n_iters 50 \ 28 | --chunk_size 70871 \ 29 | --dtype float32 \ 30 | --high_precision float32 \ 31 | --checkpoint_period 10000 \ 32 | --exp_dir $EXPDIR \ 33 | --n_steps 1 \ 34 | --sample_size 1 \ 35 | --do_not_sort_clusters \ 36 | --held_out_shards 100 \ 37 | --sampling_strategy r 38 | -------------------------------------------------------------------------------- /define_domains/k-means-clustering/exps/dclm-1level-k576/level1/centroids.npy: -------------------------------------------------------------------------------- 1 | /scratch/gpfs/awettig/delve/k-means-clustering/exps/dclm-1level-k576/level1/step0/centroids.npy -------------------------------------------------------------------------------- /define_domains/k-means-clustering/exps/dclm-1level-k576/level1/slurm_script.s: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH --nodes=4 4 | #SBATCH --gres=gpu:8 5 | #SBATCH --ntasks-per-node=1 6 | #SBATCH --job-name=kmeans_level1 7 | #SBATCH --time=1-0 8 | #SBATCH --mem=800G 9 | #SBATCH --cpus-per-task=32 10 | #SBATCH --partition=pli-c 11 | 12 | EXPDIR=/scratch/gpfs/awettig/delve/k-means-clustering/exps/dclm-1level-k576/level1 13 | cd /scratch/gpfs/awettig/delve/k-means-clustering/scripts 14 | 15 | master_addr=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) 16 | 17 | PYTHONPATH=.. 
\ 18 | srun -N 4 --unbuffered --output="$EXPDIR"/logs/%j_%t_log.out --error="$EXPDIR"/logs/%j_%t_log.err torchrun \ 19 | --nnodes=4 \ 20 | --nproc_per_node=8 \ 21 | --rdzv_backend=c10d \ 22 | --rdzv_endpoint=$master_addr:56321 \ 23 | run_distributed_kmeans.py \ 24 | --use_torchrun \ 25 | --data_path /scratch/gpfs/PLI/awettig/dclm/dclm-pool-1b-1x/deduplicated/embeds \ 26 | --n_clusters 576 \ 27 | --n_iters 50 \ 28 | --chunk_size 347222 \ 29 | --dtype float32 \ 30 | --high_precision float32 \ 31 | --checkpoint_period 10000 \ 32 | --exp_dir $EXPDIR \ 33 | --n_steps 1 \ 34 | --sample_size 1 \ 35 | --do_not_sort_clusters \ 36 | --held_out_shards 100 \ 37 | --sampling_strategy r 38 | -------------------------------------------------------------------------------- /define_domains/k-means-clustering/exps/dclm-1level-k67723/level1/centroids.npy: -------------------------------------------------------------------------------- 1 | /scratch/gpfs/awettig/delve/k-means-clustering/exps/dclm-1level-k67723/level1/step0/centroids.npy -------------------------------------------------------------------------------- /define_domains/k-means-clustering/exps/dclm-1level-k67723/level1/slurm_script.s: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH --nodes=4 4 | #SBATCH --gres=gpu:8 5 | #SBATCH --ntasks-per-node=1 6 | #SBATCH --job-name=kmeans_level1 7 | #SBATCH --time=1-0 8 | #SBATCH --mem=800G 9 | #SBATCH --cpus-per-task=32 10 | #SBATCH --partition=pli-c 11 | 12 | EXPDIR=/scratch/gpfs/awettig/delve/k-means-clustering/exps/dclm-1level-k67723/level1 13 | cd /scratch/gpfs/awettig/delve/k-means-clustering/scripts 14 | 15 | master_addr=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) 16 | 17 | PYTHONPATH=.. 
\ 18 | srun -N 4 --unbuffered --output="$EXPDIR"/logs/%j_%t_log.out --error="$EXPDIR"/logs/%j_%t_log.err torchrun \ 19 | --nnodes=4 \ 20 | --nproc_per_node=8 \ 21 | --rdzv_backend=c10d \ 22 | --rdzv_endpoint=$master_addr:56321 \ 23 | run_distributed_kmeans.py \ 24 | --use_torchrun \ 25 | --data_path /scratch/gpfs/PLI/awettig/dclm/dclm-pool-1b-1x/deduplicated/embeds \ 26 | --n_clusters 67723 \ 27 | --n_iters 50 \ 28 | --chunk_size 2953 \ 29 | --dtype float32 \ 30 | --high_precision float32 \ 31 | --checkpoint_period 10000 \ 32 | --exp_dir $EXPDIR \ 33 | --n_steps 1 \ 34 | --sample_size 1 \ 35 | --do_not_sort_clusters \ 36 | --held_out_shards 100 \ 37 | --sampling_strategy r 38 | -------------------------------------------------------------------------------- /define_domains/k-means-clustering/images/curation_pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CodeCreator/WebOrganizer/b3da665635be3ee6c51484509f0fa5699f24d28c/define_domains/k-means-clustering/images/curation_pipeline.png -------------------------------------------------------------------------------- /define_domains/k-means-clustering/images/toy_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CodeCreator/WebOrganizer/b3da665635be3ee6c51484509f0fa5699f24d28c/define_domains/k-means-clustering/images/toy_example.png -------------------------------------------------------------------------------- /define_domains/k-means-clustering/requirements.txt: -------------------------------------------------------------------------------- 1 | --extra-index-url https://download.pytorch.org/whl/cu121 2 | torch==2.2 3 | matplotlib==3.8.2 4 | scipy==1.11.4 5 | numpy==1.24.4 6 | omegaconf 7 | scikit-learn>=1.5.0 8 | tqdm 9 | ipykernel 10 | -------------------------------------------------------------------------------- 
/define_domains/k-means-clustering/scripts/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | 8 | -------------------------------------------------------------------------------- /define_domains/k-means-clustering/scripts/run_hierarchical_sampling.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import logging 8 | from argparse import ArgumentParser 9 | from pathlib import Path 10 | 11 | import numpy as np 12 | 13 | from src.clusters import HierarchicalCluster 14 | from src.utils import setup_logging 15 | from src.hierarchical_sampling import hierarchical_sampling 16 | 17 | logger = logging.getLogger("hkmeans") 18 | 19 | if __name__ == "__main__": 20 | parser = ArgumentParser() 21 | parser.add_argument("--save", action="store_true") 22 | parser.add_argument("--clustering_path", "-clus", type=str, required=True) 23 | parser.add_argument( 24 | "--target_size", 25 | type=int, 26 | required=True, 27 | help="Target size of the sampled set" 28 | ) 29 | parser.add_argument( 30 | "--multiplier", 31 | "-m", 32 | type=int, 33 | default=1, 34 | help="Maximum number of times an image is selected" 35 | ) 36 | parser.add_argument( 37 | "--sampling_strategy", 38 | "-ss", 39 | type=str, 40 | default="r", 41 | help='"r" for random, "c" for closest', 42 | ) 43 | parser.add_argument( 44 | "--sort_indices", 45 | action="store_true", 46 | help="If true, sort indices in increasing order", 47 | ) 48 | parser.add_argument( 49 | "--name_suffix", 50 | type=str, 51 | 
default="", 52 | help="Suffix to add to the indice file name", 53 | ) 54 | parser.add_argument( 55 | "--valid_indices_path", 56 | type=str, 57 | default=None, 58 | help=( 59 | "Path to .npy file containing valid indices of the base dataset. " 60 | "The clustering is computed only on these valid images." 61 | ), 62 | ) 63 | parser.add_argument( 64 | "--cluster_fname", 65 | type=str, 66 | default="sorted_clusters.npy", 67 | help="name of files containing clusters", 68 | ) 69 | parser.add_argument("--save_dir_name", type=str, default="curated_datasets") 70 | 71 | args = parser.parse_args() 72 | args.clustering_path = Path(args.clustering_path).resolve() 73 | setup_logging() 74 | logger.info(f"args: {args}") 75 | 76 | cl = HierarchicalCluster.from_file( 77 | cluster_path=args.clustering_path, 78 | cluster_fname=args.cluster_fname 79 | ) 80 | 81 | sampled_indices = hierarchical_sampling( 82 | cl, 83 | args.target_size, 84 | args.multiplier, 85 | args.sampling_strategy, 86 | ) 87 | if args.valid_indices_path is not None: 88 | valid_indices = np.load(args.valid_indices_path) 89 | assert len(valid_indices) == np.sum( 90 | [len(el) for el in cl.clusters[1]] 91 | ), "Number of images is not equal to valid_indices size" 92 | sampled_indices = valid_indices[sampled_indices] 93 | 94 | if args.sort_indices: 95 | sampled_indices = np.sort(sampled_indices) 96 | 97 | num_images = len(sampled_indices) 98 | logger.info(f"Number of selected data points: {num_images}") 99 | 100 | save_indices_path = Path( 101 | args.clustering_path, 102 | args.save_dir_name, 103 | f'{cl.n_levels}{args.sampling_strategy}_mul{args.multiplier}_' 104 | f'{args.target_size}_balanced_selection.npy' 105 | ) 106 | if len(args.name_suffix) > 0: 107 | save_indices_path = Path( 108 | str(save_indices_path).replace(".npy", f"_{args.name_suffix}.npy") 109 | ) 110 | logger.info(f"Indices will be saved to {str(save_indices_path.resolve())}") 111 | if args.save: 112 | Path(args.clustering_path, 
args.save_dir_name).mkdir(exist_ok=True) 113 | np.save(save_indices_path, sampled_indices) 114 | logger.info("Indices are saved!") 115 | -------------------------------------------------------------------------------- /define_domains/k-means-clustering/scripts/split_clusters.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import os 8 | import sys 9 | from argparse import ArgumentParser 10 | import logging 11 | from pathlib import Path 12 | from tqdm import tqdm 13 | 14 | import numpy as np 15 | import torch 16 | 17 | from src.utils import setup_logging, MDSPseudoMemMap, MultiMemMap 18 | 19 | from src.dist_comm import ( 20 | enable_distributed, 21 | get_global_rank, 22 | get_global_size, 23 | is_main_process, 24 | synchronize, 25 | ) 26 | from src import distributed_kmeans_gpu as dkmg, kmeans_gpu as kmg 27 | 28 | 29 | logger = logging.getLogger("hkmeans") 30 | 31 | 32 | def split_clusters( 33 | data_path, 34 | subset_indices_path, 35 | clusters_path, 36 | n_splits, 37 | n_iters, 38 | dtype, 39 | high_precision, 40 | save_path, 41 | device="cuda", 42 | use_torchrun=False, 43 | checkpoint_period=10, 44 | verbose=False, 45 | ): 46 | enable_distributed( 47 | use_torchrun=use_torchrun, 48 | overwrite=True, 49 | ) 50 | 51 | synchronize() 52 | logger.info("initial synchronized!") 53 | 54 | 55 | if os.path.isdir(data_path): 56 | X = MultiMemMap(data_path) 57 | else: 58 | X = np.load(data_path, mmap_mode="r") 59 | 60 | if subset_indices_path is not None: 61 | logger.info(f"Using subset with indices in {subset_indices_path}") 62 | subset_indices = np.load(subset_indices_path) 63 | X = dkmg.ExtendedNumpyMemMap(X, subset_indices) 64 | clusters = np.load(clusters_path, allow_pickle=True) 65 | n_clusters = len(clusters) 
66 | 67 | part_indices = dkmg.get_part_indices(n_clusters, get_global_size()) 68 | rank = get_global_rank() 69 | 70 | # load checkpoints if exist 71 | if Path(save_path, f"split_checkpoint_{rank}.npy").exists(): 72 | ckpt = np.load( 73 | Path(save_path, f"split_checkpoint_{rank}.npy"), allow_pickle=True 74 | ).item() 75 | small_centroids = list(ckpt["small_centroids"]) 76 | small_clusters = list(ckpt["small_clusters"]) 77 | last_index = ckpt["last_index"] 78 | assert last_index - part_indices[rank] + 1 == len(small_centroids) 79 | else: 80 | small_centroids = [] 81 | small_clusters = [] 82 | last_index = part_indices[rank] - 1 83 | 84 | # run kmeans++ on clusters 85 | for cluster_idx in tqdm( 86 | range(last_index + 1, part_indices[rank + 1]), 87 | desc="Splitting pre-clusters", 88 | file=sys.stdout, 89 | bar_format="{l_bar}{bar}{r_bar}", 90 | ): 91 | if verbose: 92 | logger.info(f"Processing cluster {cluster_idx}") 93 | point_indices = np.sort(clusters[cluster_idx]) 94 | if len(point_indices) > 0: 95 | point_feats = torch.tensor(X[point_indices], device=device, dtype=dtype) 96 | _small_centroids, _small_clusters, _, _ = kmg.kmeans( 97 | point_feats, 98 | min(n_splits, len(point_indices)), 99 | n_iters, 100 | chunk_size=-1, 101 | init_method="kmeans++", 102 | dist="l2", 103 | high_precision=high_precision, 104 | ) 105 | 106 | _small_clusters = kmg.sort_cluster_by_distance( 107 | point_feats, 108 | _small_centroids, 109 | _small_clusters, 110 | device="cuda", 111 | dtype=dtype, 112 | ) 113 | _small_clusters = [point_indices[el.astype(int)] for el in _small_clusters] 114 | 115 | non_empty_clusters = [len(el) > 0 for el in _small_clusters] 116 | _small_clusters = [el for el in _small_clusters if len(el) > 0] 117 | _small_centroids = _small_centroids[non_empty_clusters] 118 | 119 | small_centroids.append(_small_centroids.cpu().numpy()) 120 | small_clusters += _small_clusters 121 | 122 | del point_feats 123 | if( 124 | cluster_idx % checkpoint_period == 0 or 125 | 
cluster_idx == part_indices[rank + 1] - 1 126 | ): 127 | np.save( 128 | Path(save_path, f"split_checkpoint_{rank}.npy"), 129 | { 130 | "small_centroids": small_centroids, 131 | "small_clusters": small_clusters, 132 | "last_index": cluster_idx, 133 | }, 134 | ) 135 | synchronize() 136 | logger.info("Gathering clusters") 137 | if is_main_process(): 138 | centroids = [] 139 | clusters = [] 140 | for i in tqdm( 141 | range(get_global_size()), 142 | desc="Gathering splitted clusters", 143 | file=sys.stdout, 144 | bar_format="{l_bar}{bar}{r_bar}", 145 | ): 146 | split_data = np.load( 147 | Path(save_path, f"split_checkpoint_{i}.npy"), 148 | allow_pickle=True 149 | ).item() 150 | small_centroids = np.concatenate(split_data["small_centroids"]) 151 | small_clusters = split_data["small_clusters"] 152 | assert( 153 | len(small_centroids) == len(small_clusters) 154 | ), f"Inconsistent shape in split_checkpoint_{i}.npy" 155 | assert split_data["last_index"] == part_indices[i + 1] - 1 156 | centroids.append(small_centroids) 157 | clusters += small_clusters 158 | centroids = np.concatenate(centroids) 159 | clusters = np.array(clusters, dtype=object) 160 | 161 | logger.info("Saving centroids and clusters") 162 | np.save(Path(save_path, "centroids.npy"), centroids) 163 | np.save(Path(save_path, "sorted_clusters.npy"), clusters) 164 | logger.info("Cleaning checkpoints") 165 | for i in range(get_global_size()): 166 | Path(save_path, f"split_checkpoint_{i}.npy").unlink(missing_ok=True) 167 | logger.info("Finished split_clusters!") 168 | 169 | if __name__ == "__main__": 170 | parser = ArgumentParser() 171 | parser.add_argument("--data_path", type=str, required=True) 172 | parser.add_argument("--subset_indices_path", type=str, default=None) 173 | parser.add_argument("--clusters_path", type=str, required=True) 174 | parser.add_argument("--n_splits", type=int, required=True) 175 | parser.add_argument("--n_iters", type=int, required=True) 176 | parser.add_argument("--dtype", type=str, 
default="float32") 177 | parser.add_argument("--high_precision", type=str, default="float32") 178 | parser.add_argument("--save_path", type=str, required=True) 179 | parser.add_argument("--use_torchrun", action="store_true") 180 | 181 | args = parser.parse_args() 182 | setup_logging() 183 | 184 | def parse_dtype(dtype): 185 | if dtype == "float32": 186 | return torch.float32 187 | elif dtype == "float64": 188 | return torch.float64 189 | elif dtype == "float16": 190 | return torch.float16 191 | else: 192 | raise ValueError(f"Value of args.dtype ({args.dtype}) not regconised") 193 | 194 | args.dtype = parse_dtype(args.dtype) 195 | args.high_precision = parse_dtype(args.high_precision) 196 | 197 | split_clusters( 198 | args.data_path, 199 | args.subset_indices_path, 200 | args.clusters_path, 201 | args.n_splits, 202 | args.n_iters, 203 | args.dtype, 204 | args.high_precision, 205 | args.save_path, 206 | "cuda", 207 | args.use_torchrun, 208 | ) 209 | synchronize() 210 | -------------------------------------------------------------------------------- /define_domains/k-means-clustering/setup.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from setuptools import find_packages, setup 8 | 9 | 10 | setup( 11 | name="ssl_data_curation", 12 | packages=find_packages(), 13 | ) 14 | -------------------------------------------------------------------------------- /define_domains/k-means-clustering/src/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | 8 | -------------------------------------------------------------------------------- /define_domains/k-means-clustering/src/clusters.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import logging 8 | from pathlib import Path 9 | import pickle 10 | from typing import Dict, List 11 | 12 | import numpy as np 13 | 14 | 15 | logger = logging.getLogger("hkmeans") 16 | 17 | 18 | def load_clusters_from_file(fpath): 19 | """ 20 | Utility to load clusters fromj different file formats. 21 | """ 22 | if Path(fpath).suffix == ".pkl": 23 | with open(fpath, "rb") as f: 24 | return np.array(pickle.load(f), dtype=object) 25 | else: 26 | return np.load(Path(fpath), allow_pickle=True) 27 | 28 | class HierarchicalCluster: 29 | """ 30 | Class representing a hierarchy of clusters returned by hierarchical k-means. 
31 | """ 32 | def __init__(self): 33 | self.cluster_path = None 34 | self.n_levels = None 35 | self.cluster_fname = None 36 | self.is_loaded = False 37 | self.is_processed = False 38 | self.n_clusters = {} 39 | self.clusters = {} 40 | self.flat_clusters = {} 41 | self.clusters_size = {} 42 | self.flat_clusters_size = {} 43 | self.size_order = {} 44 | self.flat_size_order = {} 45 | 46 | def load_clusters_from_file(self): 47 | for level in range(1, 1 + self.n_levels): 48 | self.clusters[level] = load_clusters_from_file( 49 | Path( 50 | self.cluster_path, 51 | f"level{level}", 52 | self.cluster_fname 53 | ) 54 | ) 55 | self.n_clusters[level] = len(self.clusters[level]) 56 | self.is_loaded = True 57 | 58 | def process_clusters(self): 59 | if not self.is_loaded: 60 | raise RuntimeError("Clusters must be loaded before being processed") 61 | logger.info("Computing flat clusters") 62 | self.flat_clusters[1] = self.clusters[1] 63 | for level in range(2, 1 + self.n_levels): 64 | current_non_flat = self.clusters[level] 65 | prev_flat = self.flat_clusters[level - 1] 66 | self.flat_clusters[level] = np.array( 67 | [ 68 | np.concatenate([prev_flat[el] for el in clus]) 69 | if len(clus) > 0 else np.array([]) 70 | for clus in current_non_flat 71 | ], 72 | dtype=object, 73 | ) 74 | 75 | logger.info("Computing cluster length") 76 | for level, clus in self.clusters.items(): 77 | self.clusters_size[level] = np.array([len(el) for el in clus]) 78 | 79 | for level, clus in self.flat_clusters.items(): 80 | self.flat_clusters_size[level] = np.array([len(el) for el in clus]) 81 | 82 | logger.info("Sorting clusters by length") 83 | for level, clsize in self.clusters_size.items(): 84 | self.size_order[level] = np.argsort(clsize)[::-1] 85 | 86 | for level, flat_clsize in self.flat_clusters_size.items(): 87 | self.flat_size_order[level] = np.argsort(flat_clsize)[::-1] 88 | 89 | self.is_processed = True 90 | 91 | @staticmethod 92 | def from_file( 93 | cluster_path, 94 | 
cluster_fname="sorted_clusters.npy", 95 | ): 96 | """ 97 | Method for reading hierarchical clusters from files 98 | """ 99 | logger.info("Loading hierarchical clusters from file.") 100 | cl = HierarchicalCluster() 101 | cl.cluster_path = cluster_path 102 | cl.cluster_fname = cluster_fname 103 | cl.n_levels = 0 104 | while True: 105 | if Path(cl.cluster_path, f"level{cl.n_levels + 1}").exists(): 106 | cl.n_levels += 1 107 | else: 108 | break 109 | cl.load_clusters_from_file() 110 | cl.process_clusters() 111 | return cl 112 | 113 | @staticmethod 114 | def from_dict(clusters: List[Dict]): 115 | """ 116 | Read hierarchical clusters from a list of dictionaries. 117 | 118 | Parameters: 119 | clusters: List[Dict] 120 | Each element is a dictionary containing a field name "clusters". 121 | An example is the output of hierarchical_kmeans_gpu.hierarchical_kmeans 122 | 123 | Return: 124 | A instance of HierarchicalCluster. 125 | """ 126 | logger.info("Loading hierarchical clusters from dictionaries.") 127 | cl = HierarchicalCluster() 128 | cl.n_levels = len(clusters) 129 | for level in range(1, 1 + cl.n_levels): 130 | cl.clusters[level] = clusters[level - 1]["clusters"] 131 | cl.n_clusters[level] = len(cl.clusters[level]) 132 | cl.is_loaded = True 133 | cl.process_clusters() 134 | return cl 135 | -------------------------------------------------------------------------------- /define_domains/k-means-clustering/src/hierarchical_kmeans_gpu.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import sys 8 | import logging 9 | from tqdm import tqdm 10 | 11 | import torch 12 | import numpy as np 13 | 14 | from . 
def hierarchical_kmeans(
    data,
    n_clusters,
    n_levels,
    init_method="kmeans++",
    num_init=1,
    verbose=True
):
    """
    Run hierarchical k-means on data without resampling steps.

    Parameters:
        data: 2-D numpy array
            Data embeddings.
        n_clusters: List[int]
            Number of clusters for each level of hierarchical k-means
        n_levels: int
            Number of levels in hierarchical k-means.
        init_method: str, default = "kmeans++"
            Initialization method for k-means centroids, forwarded to
            kmeans_gpu.kmeans (presumably "kmeans++" or "random" --
            confirm against kmeans_gpu.kmeans).
        num_init: int, default=1
            Number of re-initialization for each k-means run.
        verbose: bool, default=True
            Forwarded to kmeans_gpu.kmeans.

    Returns:
        List[dict], clustering results for each level of hierarchical k-means,
        including
            centroids: 2-D numpy array
                Centroids of clusters.
            assignment: 1-D numpy array
                Mapping from data points to cluster indices.
            clusters: array of array
            pot: float
                K-means potential.
    """
    assert len(n_clusters) == n_levels
    logger.info(f"{n_levels}-level hierarchical kmeans")
    res = []
    for kmid in range(n_levels):
        logger.info(f"Level {kmid+1}")
        # Level 1 clusters the raw data; each deeper level clusters the
        # centroids produced by the previous level.
        if kmid == 0:
            X = data
        else:
            X = res[kmid - 1]["centroids"]
        # Bound the per-chunk distance computation to ~MEMORY_LIMIT entries.
        chunk_size = min(X.shape[0], int(MEMORY_LIMIT / n_clusters[kmid]))
        centroids, clusters, cluster_assignment, pot = kmg.kmeans(
            X,
            n_clusters=n_clusters[kmid],
            n_iters=50,
            chunk_size=chunk_size,
            num_init=num_init,
            init_method=init_method,
            dist="l2",
            high_precision=torch.float64,
            random_state=None,
            verbose=verbose
        )
        res.append(
            {
                "centroids": centroids,
                "assignment": cluster_assignment,
                "clusters": clusters,
                "pot": pot,
            }
        )
    return res
def hierarchical_kmeans_with_resampling(
    data,
    n_clusters,
    n_levels,
    sample_sizes,
    n_resamples=10,
    init_method="kmeans++",
    num_init=1,
    sample_strategy="closest",
    verbose=True,
):
    """
    Run hierarchical k-means on data with resampling steps: after the
    initial k-means at each level, centroids are repeatedly re-fit on a
    balanced sample drawn from the current clusters.

    Parameters:
        data: 2-D numpy array
            Data embeddings.
        n_clusters: List[int]
            Number of clusters for each level of hierarchical k-means
        n_levels: int
            Number of levels in hierarchical k-means.
        sample_sizes: List[int]
            Number of points to sample from each cluster in resampling steps.
        n_resamples: int
            Number of resampling steps in each level.
        init_method: str, default = "kmeans++"
            Initialization method for k-means centroids, forwarded to
            kmeans_gpu.kmeans.
        num_init: int, default=1
            Number of re-initialization for each k-means run.
        sample_strategy: str, default = "closest"
            How to sample points from clusters in resampling steps.
            Options are "closest" and "random".
        verbose: bool, default=True

    Returns:
        List[dict], clustering results for each level of hierarchical k-means,
        including
            centroids: 2-D numpy array
                Centroids of clusters.
            assignment: 1-D numpy array
                Mapping from data points to cluster indices.
            clusters: array of array
            pot: float
                Always -1 here: the potential is not tracked through
                resampling.
    """
    assert len(n_clusters) == n_levels
    assert len(sample_sizes) == n_levels
    logger.info(f"{n_levels}-level hierarchical kmeans")
    res = []
    for kmid in range(n_levels):
        logger.info(f"Level {kmid+1}")
        logger.info("Initial kmeans")
        # Level 1 clusters the raw data; deeper levels cluster the
        # centroids of the previous level.
        if kmid == 0:
            X = data
        else:
            X = res[kmid - 1]["centroids"]
        # Bound the per-chunk distance computation to ~MEMORY_LIMIT entries.
        chunk_size = min(X.shape[0], int(MEMORY_LIMIT / n_clusters[kmid]))
        logger.info("Running the initial k-means")
        centroids, clusters, cluster_assignment, _ = kmg.kmeans(
            X,
            n_clusters=n_clusters[kmid],
            n_iters=50,
            chunk_size=chunk_size,
            num_init=num_init,
            init_method=init_method,
            dist="l2",
            high_precision=torch.float64,
            random_state=None,
            verbose=verbose,
        )
        logger.info("Resampling-kmeans")
        if sample_sizes[kmid] > 1:
            _sample_size = sample_sizes[kmid]
            for _ in tqdm(
                range(n_resamples),
                desc="Hierarchical k-means resampling steps",
                file=sys.stdout,
                bar_format="{l_bar}{bar}{r_bar}",
            ):
                if sample_strategy == "closest":
                    # Keep the _sample_size points nearest each centroid.
                    sorted_clusters = [
                        _cluster[
                            torch.argsort(
                                torch.cdist(X[_cluster], centroids[i, None])
                                .flatten()
                            )
                            .cpu()
                            .numpy()
                        ]
                        for i, _cluster in enumerate(clusters)
                    ]
                    sampled_points = torch.concat(
                        [
                            X[_cluster[: _sample_size]]
                            for _cluster in sorted_clusters
                        ]
                    )
                elif sample_strategy == "random":
                    sampled_points = torch.concat(
                        [
                            X[
                                np.random.choice(
                                    _cluster,
                                    min(len(_cluster), _sample_size),
                                    replace=False
                                )
                            ]
                            for _cluster in clusters
                        ]
                    )
                else:
                    raise ValueError(
                        f"sample_strategy={sample_strategy} not supported!"
                    )
                chunk_size = min(
                    sampled_points.shape[0],
                    int(MEMORY_LIMIT / n_clusters[kmid])
                )
                # Re-fit the centroids on the balanced sample only.
                centroids, _, _, _ = kmg.kmeans(
                    sampled_points,
                    n_clusters=n_clusters[kmid],
                    n_iters=50,
                    chunk_size=chunk_size,
                    num_init=num_init,
                    init_method=init_method,
                    dist="l2",
                    high_precision=torch.float64,
                    random_state=None,
                    verbose=False
                )
                # Re-assign *all* points of this level to the new centroids.
                cluster_assignment = kmg.assign_clusters(
                    centroids,
                    X,
                    "l2",
                    chunk_size=chunk_size,
                    verbose=False
                ).cpu().numpy()
                clusters = kmg.create_clusters_from_cluster_assignment(
                    cluster_assignment,
                    n_clusters[kmid]
                )
        res.append(
            {
                "centroids": centroids,
                "assignment": cluster_assignment,
                "clusters": clusters,
                "pot": -1,
            }
        )
    return res
def random_selection(clusters, valid_clusters, num_per_cluster):
    """
    Uniformly sample up to num_per_cluster points from each valid cluster.

    Parameters:
        clusters: (num_cluster, ) np.array
            clusters[i] contains indices of points in cluster i
        valid_clusters: list or np.array
            indices of clusters that are considered
        num_per_cluster: int
            number of points selected from each cluster

    Returns:
        array containing indices of selected points
    """
    num_clusters = len(clusters)
    # One independent list per slot; clusters outside valid_clusters
    # contribute nothing. (The original used `[[]] * n`, which aliases a
    # single shared list -- harmless there only because slots were
    # re-bound rather than mutated, but a latent bug.)
    selected = [[] for _ in range(num_clusters)]
    for cluster_id in tqdm(
        valid_clusters,
        desc="Random sampling from clusters",
        file=sys.stdout,
        bar_format="{l_bar}{bar}{r_bar}",
    ):
        members = list(clusters[cluster_id])
        # Sample without replacement, capped at the cluster size.
        selected[cluster_id] = random.sample(
            members, min(num_per_cluster, len(members))
        )
    return np.concatenate(selected).astype(np.int64)


def closest_to_centroid_selection(sorted_clusters, valid_clusters, num_per_cluster):
    """
    Take the num_per_cluster points closest to the centroid of each
    valid cluster.

    Parameters:
        sorted_clusters: (num_cluster, ) np.array
            sorted_clusters[i] contains indices of points in cluster i,
            sorted in increasing distance from centroid i
        valid_clusters: list or np.array
            indices of clusters that are considered
        num_per_cluster: int
            number of points selected from each cluster

    Returns:
        array containing indices of selected points
    """
    num_clusters = len(sorted_clusters)
    selected = [[] for _ in range(num_clusters)]
    for cluster_id in tqdm(
        valid_clusters,
        desc="Closest-to-centroid sampling from clusters",
        file=sys.stdout,
        bar_format="{l_bar}{bar}{r_bar}",
    ):
        # Indices are pre-sorted by distance, so a prefix is the closest set.
        selected[cluster_id] = sorted_clusters[cluster_id][:num_per_cluster]
    return np.concatenate(selected).astype(np.int64)
| 71 | def _find_best_cut_left(arr, target): 72 | """ 73 | Find integers x such that sum(min(x, arr)) best approximates target 74 | """ 75 | if target < 0: 76 | raise ValueError(f"target {target} must be non-negative!") 77 | if np.min(arr) < 0: 78 | raise ValueError("arr has negative elements!") 79 | if np.sum(arr) <= target: 80 | return np.max(arr) 81 | left = 0 82 | right = np.max(arr) 83 | while right - left > 1: 84 | mid = (left + right) // 2 85 | sum_with_mid = np.sum(np.minimum(mid, arr)) 86 | if sum_with_mid > target: 87 | right = mid 88 | elif sum_with_mid < target: 89 | left = mid 90 | else: 91 | return mid 92 | if np.sum(np.minimum(right, arr)) <= target: 93 | return right 94 | return left 95 | 96 | 97 | def find_subcluster_target_size( 98 | subcluster_sizes, 99 | target_size, 100 | multiplier, 101 | ): 102 | """ 103 | Given the target number of points to sample from a clusters, 104 | find number of points to sample from its subclusters. 105 | """ 106 | if isinstance(subcluster_sizes, np.ndarray): 107 | arr = subcluster_sizes * multiplier 108 | else: 109 | arr = np.array(subcluster_sizes) * multiplier 110 | best_cut_left = _find_best_cut_left(arr, target_size) 111 | if best_cut_left == np.max(arr): 112 | return arr 113 | else: 114 | subcluster_target_sizes = np.minimum(best_cut_left, arr) 115 | remainder = target_size - subcluster_target_sizes.sum() 116 | candidates = np.where(arr > best_cut_left)[0] 117 | subcluster_target_sizes[np.random.choice(candidates, remainder, replace=False)] = best_cut_left + 1 118 | assert subcluster_target_sizes.sum() == target_size 119 | assert np.all(subcluster_target_sizes <= arr) 120 | return subcluster_target_sizes 121 | 122 | 123 | def recursive_hierarchical_sampling( 124 | clusters: HierarchicalCluster, 125 | level: int, 126 | target_size: int, 127 | cl_index: int, 128 | multiplier: int, 129 | sampling_strategy: str = "r", 130 | ): 131 | """ 132 | Given a target number of points to sample from a cluster, return 133 | 
def recursive_hierarchical_sampling(
    clusters: HierarchicalCluster,
    level: int,
    target_size: int,
    cl_index: int,
    multiplier: int,
    sampling_strategy: str = "r",
):
    """
    Given a target number of points to sample from a cluster, return
    the set of sampled points (data-point indices, possibly repeated up
    to `multiplier` times each).
    """
    if level == 1:
        current_cluster = clusters.clusters[1][cl_index]
        current_cluster_size = clusters.clusters_size[1][cl_index]
        if current_cluster_size * multiplier <= target_size:
            # Budget covers `multiplier` full copies of the cluster.
            return np.tile(current_cluster, multiplier)
        else:
            # As many full replicates as fit, then a partial top-up draw.
            n_replicates = target_size // current_cluster_size
            replicates = np.tile(current_cluster, n_replicates)
            remaining_target = target_size - n_replicates * current_cluster_size
            if sampling_strategy == "r":  # random
                remaining_samples = np.random.choice(
                    current_cluster,
                    remaining_target,
                    replace=False,
                )
            elif sampling_strategy == "c":  # "closest"
                # Assumes indices within the cluster are pre-sorted by
                # distance to the centroid -- TODO confirm with the
                # cluster files used upstream.
                remaining_samples = current_cluster[:remaining_target]
            else:
                raise ValueError(f"sampling_strategy={sampling_strategy} is not supported")
            return np.concatenate([replicates, remaining_samples])
    else:
        # Split the budget across subclusters (capped by their flat sizes),
        # then recurse one level down.
        subcl_indices = clusters.clusters[level][cl_index]
        subcluster_sizes = clusters.flat_clusters_size[level - 1][subcl_indices]
        subcluster_target_sizes = find_subcluster_target_size(
            subcluster_sizes,
            target_size,
            multiplier,
        )
        samples = []
        for i, subcl_index in enumerate(subcl_indices):
            samples.append(
                recursive_hierarchical_sampling(
                    clusters,
                    level - 1,
                    subcluster_target_sizes[i],
                    subcl_index,
                    multiplier,
                    sampling_strategy,
                )
            )
        return np.concatenate(samples)
186 | """ 187 | if (not clusters.is_loaded) or (not clusters.is_processed): 188 | raise RuntimeError("HierarchicalCluster is not loaded or processed.") 189 | n_levels = clusters.n_levels 190 | cluster_target_sizes = find_subcluster_target_size( 191 | clusters.flat_clusters_size[n_levels], 192 | target_size, 193 | multiplier, 194 | ) 195 | samples = [] 196 | for cl_index in tqdm( 197 | range(len(clusters.clusters[n_levels])), 198 | desc="Hierarchical sampling from clusters", 199 | file=sys.stdout, 200 | bar_format="{l_bar}{bar}{r_bar}", 201 | ): 202 | samples.append( 203 | recursive_hierarchical_sampling( 204 | clusters, 205 | n_levels, 206 | cluster_target_sizes[cl_index], 207 | cl_index, 208 | multiplier, 209 | sampling_strategy, 210 | ) 211 | ) 212 | samples = np.concatenate(samples) 213 | return samples 214 | -------------------------------------------------------------------------------- /define_domains/k-means-clustering/src/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
class MultiMemMap:
    """
    Concatenation of several 2-D ``.npy`` shards in one directory, exposed
    as a single read-only array via memory mapping.
    """

    def __init__(self, path: str, held_out_shards: int = 0):
        """
        Parameters:
            path: str
                Directory containing the ``.npy`` shard files.
            held_out_shards: int, default = 0
                Number of trailing shards (in sorted filename order) to
                exclude from the view.
        """
        # (The original docstring described unrelated parameters; fixed.)
        paths = sorted(Path(path).glob("*.npy"))
        # `p` instead of `path` to avoid shadowing the parameter.
        self.shards = [
            np.load(p, mmap_mode="r")
            for p in paths[:len(paths) - held_out_shards]
        ]
        self.lengths = [
            len(shard) for shard in self.shards
        ]
        self.cum_lengths = np.cumsum(self.lengths)
        self.dim = self.shards[0].shape[-1]
        self.dtype = self.shards[0].dtype

    def __getitem__(self, ids):
        """Row lookup across shard boundaries; accepts int, slice, or index array."""
        if isinstance(ids, int):
            return self.__getitem__([ids])[0]
        # Normalize slices / negative indices into an explicit index array.
        ids = np.arange(len(self))[ids]

        # Shard containing each requested global row.
        shard_idx = np.searchsorted(self.cum_lengths, ids, side='right')
        results = np.zeros((len(shard_idx), self.dim), dtype=self.dtype)

        for shard_id in np.unique(shard_idx):
            ids_mask = shard_idx == shard_id
            # Global index minus the shard's start offset gives the local
            # row. (The original subtracted cum_lengths[shard_id] and
            # relied on numpy negative-index wraparound; this is the
            # equivalent explicit form.)
            start = self.cum_lengths[shard_id] - self.lengths[shard_id]
            results[ids_mask] = self.shards[shard_id][ids[ids_mask] - start]
        return results

    def __len__(self):
        return self.cum_lengths[-1]

    @property
    def shape(self):
        return (self.cum_lengths[-1], self.dim)

    def numpy(self):
        """Materialize the full concatenated array in memory."""
        return self.__getitem__(slice(0, len(self)))

    def to_tensor(self, dtype, device):
        """Materialize the full array as a torch tensor on `device`."""
        return torch.tensor(self.numpy(), device=device, dtype=dtype)
def create_clusters_from_cluster_assignment(
    cluster_assignment: np.array,
    num_clusters: int,
    return_object_array: bool = True,
):
    """
    Invert a point->cluster assignment into per-cluster index arrays.
    """
    order = np.argsort(cluster_assignment)
    sorted_assignment = cluster_assignment[order]
    # Position of the first member of each cluster id in the sorted order;
    # splitting at these boundaries yields one index array per cluster.
    boundaries = np.searchsorted(sorted_assignment, np.arange(num_clusters))
    clusters = np.split(order, boundaries[1:])
    if return_object_array:
        return np.array(clusters, dtype=object)
    return clusters


def find_all_checkpoints(save_dir, pattern):
    """
    List existing checkpoint paths matching `pattern`, ordered by
    iteration number (numeric, not lexicographic).

    Parameters:
        pattern: str
            checkpoint name format containing %d,
            e.g., kmpp_checkpoint_%d.pth
    """
    save_dir = Path(save_dir)
    iterations = sorted(
        int(str(p.stem).split("_")[-1])
        for p in save_dir.glob(pattern.replace("%d", "*"))
    )
    return [Path(save_dir, pattern % it) for it in iterations]
129 | """ 130 | ckpt_list = find_all_checkpoints(save_dir, pattern) 131 | for ckpt_path in ckpt_list[::-1]: 132 | try: 133 | if ".pth" in pattern: 134 | _ = torch.load(ckpt_path, map_location="cpu") 135 | elif ".npy" in pattern: 136 | _ = np.load(ckpt_path) 137 | else: 138 | raise ValueError("Pattern not recognized!") 139 | return ckpt_path 140 | except Exception: 141 | continue 142 | return None 143 | 144 | 145 | def _delete_old_checkpoint( 146 | save_dir, current_iter, checkpoint_period, max_num_checkpoints, pattern 147 | ): 148 | Path( 149 | save_dir, pattern % (current_iter - checkpoint_period * max_num_checkpoints) 150 | ).unlink(missing_ok=True) 151 | 152 | 153 | def setup_logging( 154 | *, 155 | name: str = None, 156 | level: int = logging.INFO, 157 | capture_warnings: bool = True, 158 | ) -> None: 159 | """ 160 | Basic setting for logger. 161 | """ 162 | logging.captureWarnings(capture_warnings) 163 | 164 | logger = logging.getLogger(name) 165 | logger.setLevel(level) 166 | 167 | if logger.hasHandlers(): 168 | return 169 | 170 | fmt_prefix = ( 171 | "%(levelname).1s%(asctime)s %(process)s %(name)s %(filename)s:%(lineno)s] " 172 | ) 173 | fmt_message = "%(message)s" 174 | fmt = fmt_prefix + fmt_message 175 | datefmt = "%Y%m%d %H:%M:%S" 176 | formatter = logging.Formatter(fmt=fmt, datefmt=datefmt) 177 | 178 | handler = logging.StreamHandler(sys.stdout) 179 | handler.setLevel(level) 180 | handler.setFormatter(formatter) 181 | 182 | logger.propagate = False 183 | logger.addHandler(handler) 184 | return 185 | -------------------------------------------------------------------------------- /define_domains/k-means-clustering/vis/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
def l2_squared_power(x, xi, n):
    """
    Compute L_2 ^ (2 * n) distance
    """
    diff = x - xi
    return diff ** (2 * n)


def l2_squared_power_der(x, xi, n):
    """
    Compute the derivative of L_2 ^ (2 * n) distance
    """
    # d/dx (x - xi)^(2n) = 2n (x - xi)^(2n - 1)
    diff = x - xi
    return 2 * n * diff ** (2 * n - 1)


def l2_squared_power_der2(x, xi, n):
    """
    Compute second-order derivative of L_2 ^ (2 * n) distance
    """
    # d^2/dx^2 (x - xi)^(2n) = 2n (2n - 1) (x - xi)^(2n - 2)
    diff = x - xi
    return 2 * n * (2 * n - 1) * diff ** (2 * n - 2)
def kmeans_plusplus(
    X,
    n_clusters,
    x_squared_norms,
    dist,
    power=1,
    random_state=None,
    n_local_trials=None,
    save_running_results=False,
    high_precision=torch.float32,
    verbose=False,
):
    """
    Computational component for initialization of n_clusters by
    k-means++. Prior validation of data is assumed.
    Parameters
    ----------
    X : torch.tensor of shape (n_samples, n_features)
        The data to pick seeds for.
    n_clusters : int
        The number of seeds to choose.
    x_squared_norms : torch.tensor (n_samples,)
        Squared Euclidean norm of each data point.
    dist: str
        Type of distance function. Options are "l2" or "cos".
    power: int
        Distance is L_2 ^ (2 * power).
    random_state : RandomState instance
        The generator used to initialize the centers.
        See :term:`Glossary `.
    n_local_trials : int, default=None
        The number of seeding trials for each center (except the first),
        of which the one reducing inertia the most is greedily chosen.
        Set to None to make the number of trials depend logarithmically
        on the number of seeds (2+log(k)); this is the default.
    save_running_results: bool, default=False
        Whether to save temporary results during execution.
    high_precision: torch.Type
        type for high-precision computations.
    verbose: bool, default=False

    Returns
    -------
    centers : torch.tensor of shape (n_clusters, n_features)
        The initial centers for k-means.
    indices : ndarray of shape (n_clusters,)
        The index location of the chosen centers in the data array X. For a
        given index and center, X[index] = center.
    """
    # NOTE(review): only None is converted here; an int seed would not be
    # turned into a RandomState. Callers appear to pass None or an
    # instance -- confirm before passing a seed.
    if random_state is None:
        random_state = check_random_state(random_state)

    n_samples, n_features = X.shape

    centers = torch.empty((n_clusters, n_features), dtype=X.dtype).to(X.device)
    # Running potential (sum of min distances) after each chosen center.
    pots = torch.empty((n_clusters,), device=X.device, dtype=high_precision)

    # Set the number of local seeding trials if none is given
    if n_local_trials is None:
        n_local_trials = 2 + int(np.log(n_clusters))

    # Pick first center randomly and track index of point
    center_id = random_state.randint(n_samples)
    indices = np.full(n_clusters, -1, dtype=int)
    centers[0] = X[center_id]
    indices[0] = center_id

    # Initialize list of closest distances and calculate current potential
    closest_dist_sq = (
        kmg.compute_distance(X[center_id, None], X, x_squared_norms, dist)[0].type(
            high_precision
        )
        ** power
    )
    current_pot = closest_dist_sq.sum()
    pots[0] = current_pot

    # Pick the remaining n_clusters-1 points
    if verbose:
        iterates = tqdm(
            range(1, n_clusters),
            desc="Genralized kmeans++ initialization",
            file=sys.stdout,
            bar_format="{l_bar}{bar}{r_bar}",
        )
    else:
        iterates = range(1, n_clusters)
    for c in iterates:
        # Choose center candidates by sampling with probability proportional
        # to the distance to the closest existing center
        rand_vals = (
            torch.tensor(random_state.uniform(size=n_local_trials)).to(
                current_pot.device
            )
            * current_pot
        )
        # Inverse-CDF sampling over the cumulative distance mass.
        candidate_ids = torch.searchsorted(
            torch.cumsum(closest_dist_sq, dim=0), rand_vals
        )
        # numerical imprecision can result in a candidate_id out of range
        torch.clip(candidate_ids, None, closest_dist_sq.shape[0] - 1, out=candidate_ids)

        # Compute distances to center candidates
        distance_to_candidates = (
            kmg.compute_distance(X[candidate_ids], X, x_squared_norms, dist).type(
                high_precision
            )
            ** power
        )

        # update closest distances squared and potential for each candidate
        torch.minimum(
            closest_dist_sq, distance_to_candidates, out=distance_to_candidates
        )
        candidates_pot = distance_to_candidates.sum(dim=1)

        # Decide which candidate is the best
        best_candidate = torch.argmin(candidates_pot)
        current_pot = candidates_pot[best_candidate]
        closest_dist_sq = distance_to_candidates[best_candidate]
        best_candidate = candidate_ids[best_candidate]

        # Permanently add best center candidate found in local tries
        centers[c] = X[best_candidate]
        indices[c] = best_candidate
        pots[c] = current_pot

        if save_running_results and c % 1000 == 0:
            # Periodic snapshot (written to the working directory) so long
            # runs can be inspected or resumed.
            np.save(
                "kmpp_running_results.npy",
                {"centers": centers.cpu().numpy(), "indices": indices, "iter": c},
            )

    return centers, indices
""" 177 | Compute k-means centroids given a set of points, according to distortion 178 | function L_2 ^ (2 * n), with Newton method. 179 | """ 180 | if method == "newton": 181 | # Initialize the centroid with L_2^2 means. 182 | c = X.mean() 183 | if len(X) == 1: 184 | return c 185 | for _ in range(n_iters): 186 | if verbose: 187 | f = torch.sum(l2_squared_power(c, X, n)) 188 | print(f, end=", ") 189 | der_f = torch.sum(l2_squared_power_der(c, X, n)) 190 | der2_f = torch.sum(l2_squared_power_der2(c, X, n)) 191 | if der_f == 0: 192 | break 193 | c -= der_f / der2_f 194 | return c 195 | else: 196 | raise ValueError("Method not supported!") 197 | 198 | 199 | def assign_clusters(X, centers, chunk_size=-1): 200 | """ 201 | Assign points to centroids. 202 | """ 203 | cluster_assignment = ( 204 | kmg.assign_clusters(centers, X, "l2", chunk_size=chunk_size, verbose=False) 205 | .cpu() 206 | .numpy() 207 | ) 208 | clusters = create_clusters_from_cluster_assignment(cluster_assignment, len(centers)) 209 | return clusters 210 | 211 | 212 | def update_centroids(X, clusters, n): 213 | """ 214 | Update centroids based on the new clusters after reassignment. 
215 | """ 216 | n_clusters = len(clusters) 217 | centers = torch.zeros((n_clusters, 1), device=X.device, dtype=X.dtype) 218 | for cid in range(n_clusters): 219 | if len(clusters[cid]) > 0: 220 | centers[cid, 0] = compute_centroids(X[clusters[cid]], n).item() 221 | return centers 222 | 223 | 224 | def generalized_kmeans_1d( 225 | X, n_clusters, n, n_iters=50, init_method="k-means++", chunk_size=-1 226 | ): 227 | """ 228 | Run generalized k-means with distance L_2 ^ (2 * n) 229 | """ 230 | assert X.ndim == 2 231 | # initialize 232 | if init_method == "k-means++": 233 | x_squared_norms = torch.linalg.vector_norm(X, dim=1) ** 2 234 | centers, _ = kmeans_plusplus(X, n_clusters, x_squared_norms, "l2", n) 235 | else: 236 | centers = X[np.random.choice(len(X), n_clusters, replace=False), :] 237 | clusters = assign_clusters(X, centers, chunk_size=chunk_size) 238 | for _ in tqdm( 239 | range(n_iters), 240 | desc="Generalized kmeans iterations", 241 | file=sys.stdout, 242 | bar_format="{l_bar}{bar}{r_bar}", 243 | ): 244 | centers = update_centroids(X, clusters, n) 245 | clusters = assign_clusters(X, centers, chunk_size=chunk_size) 246 | return centers, clusters 247 | -------------------------------------------------------------------------------- /define_domains/prompt_classify.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | OUTLINES_CACHE_DIR=/tmp/outlines python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --port 30000 4 | python prompt_classify --config_path 5 | """ 6 | 7 | import sglang as sgl 8 | 9 | from tqdm import tqdm 10 | import numpy as np 11 | import torch 12 | 13 | from dataclasses import dataclass 14 | from functools import partial 15 | from pathlib import Path 16 | from tqdm import tqdm 17 | from typing import Optional, List, Dict, Any 18 | import time 19 | 20 | from simple_parsing import ArgumentParser, field 21 | from simple_parsing.helpers import Serializable 22 | from 
def get_permutation(index: int, prompt_config: PromptConfig) -> np.ndarray:
    """
    Deterministic per-item permutation of the answer choices.

    Identity when randomize_choices is off; otherwise seeded with
    index + randomize_seed so each item gets a reproducible shuffle.
    """
    n_choices = len(prompt_config.choices)
    if not prompt_config.randomize_choices:
        return np.arange(n_choices)
    np.random.seed(index + prompt_config.randomize_seed)
    return np.random.permutation(n_choices)


def get_demonstration_permutation(index: int, prompt_config: PromptConfig) -> np.ndarray:
    """
    Deterministic per-item ordering of the few-shot demonstrations.

    Uses a seed offset of +1 so it differs from the choice permutation
    of the same item.
    """
    n_demos = len(prompt_config.demonstrations)
    if not prompt_config.randomize_demonstrations:
        return np.arange(n_demos)
    np.random.seed(index + prompt_config.randomize_seed + 1)
    return np.random.permutation(n_demos)
(truncated)" 71 | kwargs["choices"] = choices 72 | 73 | if prompt_config.system_template is not None: 74 | s += sgl.system(prompt_config.system_template.format(**kwargs)) 75 | prompt = prompt_config.template.format(**kwargs) 76 | 77 | if prompt_config.demonstrations is not None: 78 | demonstration_permutation = get_demonstration_permutation(index, prompt_config) 79 | for j in demonstration_permutation: 80 | demonstration = prompt_config.demonstrations[j] 81 | 82 | kwargs = demonstration.copy() 83 | if len(kwargs["text"]) > prompt_config.truncation: 84 | kwargs["text"] = kwargs["text"][:prompt_config.truncation] + "... (truncated)" 85 | kwargs["choices"] = choices 86 | 87 | label_index = next(i for i, v in enumerate(prompt_config.choices) if v.startswith(demonstration["choice"])) 88 | permuted_label_index = np.where(permutation == label_index)[0][0] 89 | label = labels[permuted_label_index] 90 | 91 | s += sgl.user(prompt_config.template.format(**kwargs)) 92 | if "explanation" in demonstration: 93 | s += sgl.assistant(prompt_config.response_prefix + label + ": " + demonstration["explanation"]) 94 | else: 95 | s += sgl.assistant(prompt_config.response_prefix + label) 96 | 97 | 98 | s += sgl.user(prompt) 99 | s += sgl.assistant(prompt_config.response_prefix + sgl.gen("choice", choices=labels)) 100 | 101 | 102 | @retry(URLError, tries=360, backoff=1, delay=5) 103 | def set_default_backend(port=30000): 104 | sgl.set_default_backend(sgl.RuntimeEndpoint(f"http://localhost:{port}")) 105 | 106 | 107 | def predict_fn(dataset, 108 | indices, 109 | process_id, 110 | prompt_config, 111 | num_threads=1, 112 | batch_size=1000, 113 | port=30000): 114 | set_default_backend(port) 115 | 116 | start_time = time.time() 117 | 118 | for batch_start in range(0, len(dataset), batch_size): 119 | batch_range = list(range(batch_start, min(batch_start + batch_size, len(dataset)))) 120 | print(f"Processing batch {batch_range[0]} - {batch_range[-1]}") 121 | 122 | states = classify.run_batch([ 123 
| {"item": dataset[i], "index": indices[i], "prompt_config": prompt_config} 124 | for i in batch_range 125 | ], num_threads=num_threads, progress_bar=True) 126 | 127 | # Check for corruption of inference server 128 | for state in states: 129 | meta_info = state.get_meta_info("choice") 130 | 131 | assert meta_info is not None and meta_info["normalized_prompt_logprobs"] is not None 132 | assert all( 133 | len(answer_tokens) > 1 for answer_tokens in meta_info["input_token_logprobs"] 134 | ), f"All answers should have at least 2 tokens in {meta_info['input_token_logprobs']}" 135 | 136 | 137 | for i, state in zip(batch_range, states): 138 | demonstration_permutation = get_demonstration_permutation(indices[i], prompt_config) 139 | permutation = get_permutation(indices[i], prompt_config) 140 | meta_info = state.get_meta_info("choice") 141 | 142 | # We re-compute answer logprobs, as the first token is the preceding token 143 | # that is the same for all answers 144 | permuted_choice_loss = np.array([ 145 | sum(logprob for logprob, token_id, _ in answer_tokens[1:]) / (len(answer_tokens) - 1) 146 | for answer_tokens in meta_info["input_token_logprobs"] 147 | ]) 148 | 149 | choice_loss = np.zeros_like(permuted_choice_loss) 150 | choice_loss[permutation] = permuted_choice_loss 151 | 152 | scores = choice_loss - np.max(choice_loss) 153 | scores = scores - np.log(np.exp(scores).sum()) 154 | probs = np.exp(scores) 155 | 156 | prediction = np.argmax(probs) 157 | 158 | yield { 159 | **dataset[i], 160 | "choice_loss": choice_loss, 161 | "choice_probs": probs, 162 | "top_choice": prompt_config.choices[prediction], 163 | "top_choice_index": prediction, 164 | "top_choice_prob": probs[prediction], 165 | "label_permutation": permutation, 166 | "fewshot_permutation": demonstration_permutation, 167 | } 168 | 169 | print(f"Time taken: {time.time() - start_time:.2f}s") 170 | 171 | 172 | if __name__ == "__main__": 173 | parser = ArgumentParser() 174 | 175 | parser.add_argument("inputs", 
type=Path, nargs="+", help="Input dataset paths") 176 | parser.add_argument("output", type=Path, help="Output dataset path") 177 | 178 | parser.add_argument("--config_path", type=str, required=True, help="Path to the config file") 179 | parser.add_argument("--num_threads", type=int, default=1, help="Number of threads to use") 180 | parser.add_argument("--batch_size", type=int, default=1000, help="Number of threads to use") 181 | parser.add_argument("--port", type=int, default=30000, help="Number of threads to use") 182 | parser.add_argument("--randomize_seed", default=None, type=int, help="Seed for randomization") 183 | 184 | parser.add_arguments(LoadOptions, dest="load_options") 185 | parser.add_arguments(ProcessOptions, dest="process_options") 186 | 187 | args = parser.parse_args() 188 | prompt_config = PromptConfig.load_yaml(args.config_path) 189 | 190 | if args.randomize_seed is not None: 191 | prompt_config.randomize_seed = args.randomize_seed 192 | 193 | args.prompt_config = prompt_config 194 | 195 | 196 | print("Arguments:", args) 197 | dataset = load(*args.inputs, options=args.load_options) 198 | N = len(dataset) 199 | print(f"Loaded dataset with {N} samples") 200 | 201 | process( 202 | dataset, 203 | partial( 204 | predict_fn, 205 | prompt_config=prompt_config, 206 | num_threads=args.num_threads, 207 | batch_size=args.batch_size, 208 | port=args.port 209 | ), 210 | args.output, args.process_options 211 | ) 212 | -------------------------------------------------------------------------------- /define_domains/prompt_classify.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -J prompt_classify 3 | #SBATCH --output=slurm/%x-%A_%a.out 4 | #SBATCH -N 1 -c 12 --mem=40G --gres=gpu:8 5 | #SBATCH -t 0-12 6 | #SBATCH -a 0-3 7 | 8 | config=${CONFIG:-configs/topics.yaml} # defines taxonomy and instructions 9 | model=${MODEL:-405B-FP8} # Llama model to use 10 | size=${SIZE:-10K} # how many samples to process 
seed=${SEED:-43} # random seed for order of categories and few-shot examples


# Convert size (e.g. 10K, 1M) to number of samples to process
if [[ $size == *M ]]; then
    max_index=$((${size%M} * 1000000))
elif [[ $size == *K ]]; then
    max_index=$((${size%K} * 1000))
else
    max_index=$size
fi

# Get number of available GPUs
if [ -z "$CUDA_VISIBLE_DEVICES" ]; then
    num_gpus=$(nvidia-smi -L | wc -l)
else
    num_gpus=$(jq -n "[$CUDA_VISIBLE_DEVICES] | length")
fi
num_nodes=${NUM_NODES:-${SLURM_JOB_NUM_NODES:-1}}
port=56421

export OUTLINES_CACHE_DIR=/tmp/outlines # Fixes some job issues with outlines cache on shared filesystem

if [ $num_nodes -gt 1 ]; then
    master_addr=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
    # FIX: removed a stray ")" that was embedded after ${model#*B}, which made
    # the multi-node branch launch with an invalid model path (e.g. ...-Instruct-FP8) ).
    srun bash -c 'HOST_IP=$(hostname -i) python -m sglang.launch_server \
        --model-path "'"Meta-Llama-3.1-${model%-FP8}-Instruct${model#*B}"'" \
        --port "'$port'" \
        --tp "'$(($num_gpus * $num_nodes))'" \
        --nnodes "'$num_nodes'" \
        --node-rank "$SLURM_NODEID" \
        --nccl-init "'$master_addr':'$port'"' &
else
    python -m sglang.launch_server \
        --model-path Meta-Llama-3.1-${model%-FP8}-Instruct${model#*B} \
        --port $port \
        --tp $num_gpus &
    # --enable-torch-compile \
    # --dtype bfloat16 \
fi

config_name=$(basename $config)
config_name=${config_name%.yaml}

# FIX: "$@" (quoted) so extra arguments containing spaces are forwarded intact
python prompt_classify.py datasets/dclm-refinedweb-sample1M.jsonl datasets/dclm-sample${size}-${config_name}-${model}-seed${seed} \
    --config_path ${config} \
    --num_threads 1 \
    --batch_size 1000 \
    --port $port \
    --slurm_array \
    --index_range 0 $max_index \
    --randomize_seed $seed \
    "$@"


# Tear down the background inference server(s) when classification finishes
kill -9 $(jobs -p)
#!/bin/bash
#SBATCH -J train_classifier
#SBATCH -N 1 -c 20 --gres=gpu:4 --mem=128G
#SBATCH --output=slurm/%x-%j.out
#SBATCH -t 0-6


model=${MODEL:-"Alibaba-NLP/gte-base-en-v1.5"} # Model to fine-tune from
bsz=${BSZ:-512} # Global batch size
seq=${SEQ:-32} # Per-device batch size (used for --per_device_train_batch_size; despite the name this is not the sequence length)
lr=${LR:-1e-4} # Learning rate
epochs=${EPOCHS:-5} # Number of epochs
warmup=${WARMUP:-0.1} # Warmup ratio
dataset=${DATASET:-""} # Dataset to fine-tune on
url=${URL:-1} # Whether to use URL in input template


run_name="$(basename $model)_$(basename $dataset)_bsz${bsz}_lr${lr}_epochs${epochs}_warmup${warmup}_url${url}"

out_dir="checkpoints/$run_name"
mkdir -p $out_dir

nvidia-smi

# Count all GPUs on the node, or only the visible subset if one is set
if [ -z "$CUDA_VISIBLE_DEVICES" ]; then
    num_gpus=$(nvidia-smi -L | wc -l)
else
    num_gpus=$(jq -n "[$CUDA_VISIBLE_DEVICES] | length")
fi
num_gpus=${NUM_GPUS:-$num_gpus}
master_port=54321

header="torchrun \
    --rdzv_backend=c10d \
    --rdzv_endpoint=localhost:$master_port \
    --nnodes=1 \
    --nproc_per_node=$num_gpus \
    train_classifier.py"

# Gradient accumulation so the effective global batch equals $bsz
accu=$(($bsz / $seq / $num_gpus))

export OMP_NUM_THREADS=$num_gpus

export WANDB_PROJECT="weborganizer"
export WANDB_DIR=$out_dir
export WANDB_MODE="offline"

base_arguments=(
    --report_to wandb

    --do_train
    --do_eval
    --do_predict

    --model_name $model

    --run_name $run_name
    --output_dir $out_dir
    --gradient_accumulation_steps $accu
    --per_device_train_batch_size $seq
    --learning_rate $lr
    --max_grad_norm 1.0
    --weight_decay 0.1
    --warmup_ratio $warmup
    --logging_steps 1
    --log_level info

    --evaluation_strategy epoch
    --save_strategy epoch
    --load_best_model_at_end true
    # FIX: the flag was misspelled "--metric_for_best_mode"; the HF Trainer
    # argument is metric_for_best_model.
    --metric_for_best_model
eval_validation_accuracy_label_min
    --greater_is_better true

    --num_train_epochs $epochs
    --dataloader_num_workers 8
    --overwrite_output_dir
    --remove_unused_columns false
    --disable_tqdm true
    --bf16
    --ddp_find_unused_parameters false

    --max_length 8192
    --label_field choice_probs

    --train_dataset $dataset/train
    --validation_dataset $dataset/validation
    --test_dataset $dataset/test

    --trust_remote_code
    --use_memory_efficient_attention
    --unpad_inputs

    # FIX: quote "$@" so extra CLI arguments containing spaces are forwarded
    # as single words instead of being re-split.
    "$@"
)

if [ $url -eq 1 ]; then
    # The template embeds the page URL above the document text (literal newlines
    # inside the single-quoted argument are intentional).
    base_arguments+=(
        --template '{url}

{text}'
    )
fi


echo command: "${header} ${base_arguments[@]}"
${header} "${base_arguments[@]}" 2>&1 | tee -a $out_dir/log.out

import time
import logging
import numpy as np
import pandas as pd
from dataclasses import dataclass

from simple_parsing import ArgumentParser, field
from simple_parsing.helpers import Serializable
from typing import Callable, Dict, Optional, List, Any, Tuple
from collections.abc import Iterable

from functools import partial
from copy import copy
import json

from multiprocessing import Pool
from datatools import load, process, identity_fn, ProcessOptions


from pathlib import Path

from contextlib import contextmanager


@dataclass
class DatasetOptions(Serializable):
    """This script requires a strict folder structure where the root folder has
    subfolders for documents, tokens, annotations, domains, each with an equal number of shards.
29 | Example: 30 | ``` 31 | base_corpus/ 32 | documents/ 33 | - CC_shard_00000000_processed.jsonl.zst 34 | - CC_shard_00000001_processed.jsonl.zst 35 | tokens/ 36 | - CC_shard_00000000_processed.npy 37 | - CC_shard_00000001_processed.npy 38 | some_domain_annotation/ 39 | - CC_shard_00000000_processed.npy 40 | - CC_shard_00000001_processed.npy 41 | ``` 42 | """ 43 | input_base: Path = field(positional=True, help="Path to the input folder containing labels") 44 | 45 | tokens_dir: Path = field(default="token_annotations", help="Relative to the output folder containing tokens") 46 | 47 | domains_dir: List[Path] = field(help="Relative paths to the input folders containing domains", default=None) 48 | 49 | domain_suffix: List[str] = field(default_factory=lambda: [".npy"], help="Extension of the domain files") 50 | token_suffix: str = field(default=".npy", help="Extension of the annotation files") 51 | num_proc: int = field(default=8, help="Number of processes to use", alias="-w") 52 | 53 | 54 | def load_dataframe(shard_name: Path, 55 | options: DatasetOptions): 56 | token_path = options.input_base / options.tokens_dir / (shard_name + options.token_suffix) 57 | 58 | df = pd.DataFrame({ 59 | "tokens": np.load(token_path) 60 | }) 61 | 62 | if options.domains_dir is not None: 63 | for i, domains_dir in enumerate(options.domains_dir): 64 | domains = np.load(options.input_base / domains_dir / (shard_name + options.domain_suffix[i])) 65 | df[("domains", i)] = domains # Use integer column names for domains 66 | 67 | return df 68 | 69 | 70 | def generate_statistics(options: DatasetOptions): 71 | token_paths = sorted((options.input_base / options.tokens_dir).glob(f"*{options.token_suffix}")) 72 | shard_names = [ 73 | str(path.name)[:len(str(path.name)) - len(options.token_suffix)] 74 | for path in token_paths 75 | ] 76 | 77 | with Pool(processes=options.num_proc) as pool: 78 | metadata_dfs = pool.map( 79 | partial(load_dataframe, options=options), 80 | shard_names 81 | ) 82 | 
metadata_df = pd.concat(metadata_dfs, ignore_index=True) 83 | domain_columns = sorted([c for c in metadata_df.columns if isinstance(c, tuple) and c[0] == "domains"]) 84 | df_by_domain = metadata_df.groupby(domain_columns)["tokens"].aggregate(["sum", "count"]) 85 | total_tokens = df_by_domain["sum"].sum() 86 | summary = [ 87 | {"domain": k, "tokens": v["sum"], "documents": v["count"], "weight": v["sum"] / total_tokens} 88 | for k, v in df_by_domain.to_dict("index").items() 89 | ] 90 | file_name = "domain_statistics/" + "_".join(d.name.removesuffix("_annotations").removeprefix("domains_") for d in options.domains_dir) + ".json" 91 | 92 | print("Statistics:") 93 | for s in summary: 94 | print(s) 95 | 96 | (options.input_base / file_name).parent.mkdir(parents=True, exist_ok=True) 97 | 98 | with (options.input_base / file_name).open("w") as f: 99 | json.dump(summary, f, indent=2) 100 | print(f"Saved statistics to {file_name}") 101 | 102 | 103 | if __name__ == "__main__": 104 | parser = ArgumentParser() 105 | parser.add_arguments(DatasetOptions, dest="options") 106 | args = parser.parse_args() 107 | generate_statistics(args.options) 108 | -------------------------------------------------------------------------------- /learn_mixtures/average_mixtures.py: -------------------------------------------------------------------------------- 1 | import json 2 | import numpy as np 3 | import pandas as pd 4 | from glob import glob 5 | from tqdm import tqdm 6 | import pickle 7 | import os 8 | from pathlib import Path 9 | import argparse 10 | from itertools import product 11 | import numpy as np 12 | 13 | if __name__ == "__main__": 14 | argparser = argparse.ArgumentParser() 15 | argparser.add_argument('--inputs', type=Path, required=True, nargs='+') 16 | argparser.add_argument('--output', type=Path, required=True) 17 | 18 | args = argparser.parse_args() 19 | print(args) 20 | 21 | inputs = args.inputs 22 | 23 | domain_lists = [] 24 | for file in inputs: 25 | with file.open() as f: 26 
| domain_lists.append(json.load(f)) 27 | 28 | 29 | output_distribution = [] 30 | for entry in domain_lists[0]: 31 | domain = entry["domain"] 32 | entries = [next(e for e in domain_list if e["domain"] == domain) for domain_list in domain_lists] 33 | 34 | output_entry = { 35 | "domain": domain, 36 | "weight": sum([entry["weight"] for entry in entries]) / len(entries), 37 | } 38 | 39 | output_distribution.append(output_entry) 40 | 41 | with args.output.open('w') as f: 42 | json.dump(output_distribution, f, indent=2) 43 | 44 | -------------------------------------------------------------------------------- /learn_mixtures/combine_mixtures.py: -------------------------------------------------------------------------------- 1 | import json 2 | import numpy as np 3 | import pandas as pd 4 | from glob import glob 5 | from tqdm import tqdm 6 | import pickle 7 | import os 8 | from pathlib import Path 9 | import argparse 10 | from itertools import product 11 | import numpy as np 12 | 13 | if __name__ == "__main__": 14 | argparser = argparse.ArgumentParser() 15 | argparser.add_argument('--inputs', type=Path, required=True, nargs='+') 16 | argparser.add_argument('--output', type=Path, required=True) 17 | 18 | argparser.add_argument('--corpus_distribution', type=Path, default=None) 19 | # We roughly select 30 out of 200 domains to be present in the mixture. 
20 | argparser.add_argument('--selection_fraction', type=float, default=30/200) 21 | 22 | args = argparser.parse_args() 23 | 24 | inputs = args.inputs 25 | 26 | domain_lists = [] 27 | for file in inputs: 28 | with file.open() as f: 29 | domain_lists.append(json.load(f)) 30 | 31 | # Assume independence 32 | independent_domain_list = [] 33 | for domain_objs in product(*domain_lists): 34 | combined_obj = {} 35 | for key in domain_objs[0]: 36 | if key == "weight": 37 | combined_obj[key] = np.prod(list(domain_obj[key] for domain_obj in domain_objs)).item() 38 | else: 39 | combined_obj[key] = list(domain_obj[key] for domain_obj in domain_objs) 40 | independent_domain_list.append(combined_obj) 41 | 42 | if args.corpus_distribution is not None: 43 | corpus_distribution = json.load(args.corpus_distribution.open()) 44 | for domain in independent_domain_list: 45 | ref_domain = next((d for d in corpus_distribution if d["domain"] == domain["domain"]), None) 46 | if ref_domain is None: 47 | raise ValueError(f"Domain {domain['domain']} not found in reference distribution") 48 | 49 | domain["weight"] = min(domain["weight"], ref_domain["weight"] / args.selection_fraction) 50 | 51 | total_weight = sum(domain["weight"] for domain in independent_domain_list) 52 | for domain in independent_domain_list: 53 | domain["weight"] = domain["weight"] / total_weight 54 | 55 | with args.output.open('w') as f: 56 | json.dump(independent_domain_list, f, indent=2) 57 | 58 | -------------------------------------------------------------------------------- /learn_mixtures/training_mixes.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from dataclasses import dataclass 3 | 4 | from simple_parsing import ArgumentParser, field 5 | from simple_parsing.helpers import Serializable 6 | from typing import Callable, Dict, Optional, List, Any, Tuple 7 | 8 | import json 9 | 10 | from tqdm import tqdm 11 | 12 | from pathlib import Path 13 | 14 | 15 | 
@dataclass 16 | class ScriptOptions(Serializable): 17 | prior_distribution_file: Path = field(positional=True, metadata={"help": "Path to the input distribution file"}) 18 | output_folder: Path = field(positional=True, metadata={"help": "Output directory to save the generated mixes"}) 19 | 20 | seed: int = 42 21 | 22 | prior_temperature: float = 2.0 23 | 24 | min_weight: Optional[int] = 2e-4 # For statistical significance, include at least 200k tokens per domain at the 1B scale 25 | max_epochs: Optional[int] = 20 # Set this to 20 because when we subsample 15% tokens for 1xC, it will be 3 epochs 26 | 27 | min_dirichlet: float = 0.1 28 | max_dirichlet: float = 5.0 29 | 30 | num_mixes: int = 512 31 | 32 | min_total_variation_distance: float = 0.05 33 | 34 | 35 | def generate_mix(prior: np.ndarray, options: ScriptOptions) -> np.ndarray: 36 | # sample dirichlet parameter in log space 37 | alpha = np.exp(np.random.uniform(np.log(options.min_dirichlet), np.log(options.max_dirichlet))) 38 | mix = np.random.dirichlet(alpha * prior) 39 | 40 | # round small components to zero 41 | mix[mix < options.min_weight] = 0 42 | mix = mix / mix.sum() 43 | 44 | return mix 45 | 46 | 47 | def is_valid(proposed_mix: np.ndarray, mixes: np.ndarray, prior: np.ndarray, options: ScriptOptions) -> bool: 48 | max_epoch = (proposed_mix[prior > 0] / prior[prior > 0]).max() 49 | if max_epoch > options.max_epochs: 50 | return False 51 | 52 | if np.any(np.abs(mixes - proposed_mix).max(axis=-1) < options.min_total_variation_distance): 53 | print("INFO: Rejecting proposed mix due to close proximity to existing mixes") 54 | return False 55 | 56 | return True 57 | 58 | 59 | def generate_mixes(prior: np.ndarray, options: ScriptOptions): 60 | np.random.seed(options.seed) 61 | 62 | # apply temperature to prior 63 | prior = (prior ** (1/options.prior_temperature)) 64 | prior = prior / prior.sum() 65 | 66 | mixes = np.zeros((0, len(prior))) 67 | 68 | update_bar = tqdm(total=options.num_mixes, desc="Generating 
mixes") 69 | while len(mixes) < options.num_mixes: 70 | proposed_mix = generate_mix(prior, options) 71 | if is_valid(proposed_mix, mixes, prior, options): 72 | mixes = np.vstack([mixes, proposed_mix]) 73 | update_bar.update(1) 74 | 75 | return mixes 76 | 77 | 78 | def generate_and_write_mixes(options: ScriptOptions): 79 | with open(options.prior_distribution_file, 'r') as f: 80 | prior_distribution_info = json.load(f) 81 | 82 | prior = np.array([row["weight"] for row in prior_distribution_info]) 83 | prior = prior / prior.sum() 84 | 85 | mixes = generate_mixes(prior, options) 86 | 87 | output_folder = options.output_folder 88 | output_folder.mkdir(parents=True, exist_ok=True) 89 | 90 | for i, mix in enumerate(mixes): 91 | with (output_folder / f"random{i}.json").open("w") as f: 92 | obj = [ 93 | {"domain": row["domain"], "weight": weight.item()} 94 | for row, weight in zip(prior_distribution_info, mix) 95 | ] 96 | json.dump(obj, f, indent=2) 97 | 98 | 99 | if __name__ == "__main__": 100 | parser = ArgumentParser() 101 | parser.add_arguments(ScriptOptions, dest="options") 102 | args = parser.parse_args() 103 | generate_and_write_mixes(args.options) -------------------------------------------------------------------------------- /website/assets/data/examples/topic0_format0.json: -------------------------------------------------------------------------------- 1 | [ 2 | 3 | ] -------------------------------------------------------------------------------- /website/assets/data/examples/topic0_format20.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "url":"http:\/\/www.linuxtv.org\/wiki\/index.php?title=Meye&oldid=16226", 4 | "text":"From LinuxTVWiki\nRevision as of 09:34, 11 November 2005 by 2chemp (Talk)\n\nJump to: navigation, search\n\nVaio Picturebook Motion Eye Camera Driver Readme\n\nFrom \/usr\/src\/linux\/Documentation\/video4linux\/meye.txt\n\n \u2022 Copyright (C) 2001-2004 Stelian Pop <>\n \u2022 
Copyright (C) 2001-2002 Alc\u00f4ve <>\n \u2022 Copyright (C) 2000 Andrew Tridgell <>\n\nThis driver enables the use of video4linux compatible applications with the Motion Eye camera. This driver requires the \"Sony Vaio Programmable I\/O Control Device\" driver (which can be found in the \"Character drivers\" section of the kernel configuration utility) to be compiled and installed (using its \"camera=1\" parameter).\n\nIt can do at maximum 30 fps @ 320x240 or 15 fps @ 640x480.\n\nGrabbing is supported in packed YUV colorspace only.\n\nMJPEG hardware grabbing is supported via a private API (see below).\n\nHardware supported\n\nThis driver supports the 'second' version of the MotionEye camera\u00a0:)\n\nThe first version was connected directly on the video bus of the Neomagic video card and is unsupported.\n\nThe second one, made by Kawasaki Steel is fully supported by this driver (PCI vendor\/device is 0x136b\/0xff01)\n\nThe third one, present in recent (more or less last year) Picturebooks (C1M* models), is not supported. The manufacturer has given the specs to the developers under a NDA (which allows the develoment of a GPL driver however), but things are not moving very fast (see (PCI vendor\/device is 0x10cf\/0x2011).\n\nThere is a forth model connected on the USB bus in TR1* Vaio laptops. This camera is not supported at all by the current driver, in fact little information if any is available for this camera (USB vendor\/device is 0x054c\/0x0107).\n\nDriver options\n\nSeveral options can be passed to the meye driver using the standard module argument syntax (= when passing the option to the module or meye.= on the kernel boot line when meye is statically linked into the kernel). 
Those options are:\n\nforcev4l1: force use of V4L1 API instead of V4L2\ngbuffers: number of capture buffers, default is 2 (32 max)\ngbufsize: size of each capture buffer, default is 614400\nvideo_nr: video device to register (0 = \/dev\/video0, etc)\n\nModule use\n\nIn order to automatically load the meye module on use, you can put those lines in your \/etc\/modprobe.conf file:\n\nalias char-major-81 videodev\nalias char-major-81-0 meye\noptions meye gbuffers=32\n\n\nxawtv >= 3.49 (<>) for display and uncompressed video capture:\n\nxawtv -c \/dev\/video0 -geometry 640x480\n\n\nxawtv -c \/dev\/video0 -geometry 320x240\n\nmotioneye (<>) for getting ppm or jpg snapshots, mjpeg video\n\nPrivate API\n\nThe driver supports frame grabbing with the video4linux API (either v4l1 or v4l2), so all video4linux tools (like xawtv) should work with this driver.\n\nBesides the video4linux interface, the driver has a private interface for accessing the Motion Eye extended parameters (camera sharpness, agc, video framerate), the shapshot and the MJPEG capture facilities.\n\nThis interface consists of several ioctls (prototypes and structures can be found in include\/linux\/meye.h):\n\n Get and set the extended parameters of the motion eye camera.\n The user should always query the current parameters with\n MEYEIOC_G_PARAMS, change what he likes and then issue the\n MEYEIOC_S_PARAMS call (checking for -EINVAL). The extended\n parameters are described by the meye_params structure.\n Queue a buffer for capture (the buffers must have been\n obtained with a VIDIOCGMBUF call and mmap'ed by the\n application). The argument to MEYEIOC_QBUF_CAPT is the\n buffer number to queue (or -1 to end capture). The first\n call to MEYEIOC_QBUF_CAPT starts the streaming capture.\n Takes as an argument the buffer number you want to sync.\n This ioctl blocks until the buffer is filled and ready\n for the application to use. 
It returns the buffer size.\n Takes a snapshot in an uncompressed or compressed jpeg format.\n This ioctl blocks until the snapshot is done and returns (for\n jpeg snapshot) the size of the image. The image data is\n available from the first mmap'ed buffer.\n\nLook at the 'motioneye' application code for an actual example.\n\nBugs \/ Todo\n\n- the driver could be much cleaned up by removing the v4l1 support. However, this means all v4l1-only applications will stop working.\n\n- 'motioneye' still uses the meye private v4l1 API extensions.\n\n- mature videos - mature nipples - mature pussies - adult galleries mature women - mature teachers nude - mature cumshots - mature sex videos - mature sex pics - mature sexy women - mature hairy pussy - vintage mature sex - mature video - mature porn women - mature cunts - mature adult - facial cum mature women - mature anal sex - mature porno - mature latina girls - mature cum shots - naked mature daddies - sexool mature - mature lesbian porn - mature mom sex - mature thumbnails - sexy mature woman - mature escort - mature games - older mature tits - busty blonde mature - mature sex stories - mature group sex - moms mature naked - naked mature men - naked mature moms - mature porn stars - mature naughty moms - mature moms nude - mature cum woman - mature women anal - mature orgies - mature amateurs over - mature woman sex - plump mature women - mature nude woman - mature sex movies - mature pussy - mature sluts - mature tits - nude mature women - mature fuck - free mature porn - mature woman - mature boobs - mature thumbs - naked and mature women - mature nudes - busty and mature - mature nude - mature lesbian sex - mature women sex - mature big tits - free mature sex - hardcore mature - mature amateur - mature sex with women - mature blowjobs - mature slut - mature escorts - mature cunt - mature hardcore - nude mature - hardcore fucking - hardcore hentai - free hardcore - hardcore anal - free hardcore sex - hardcore fuck - gay 
hardcore - hardcore anal sex - hardcore pussy - anime hardcore - hardcore sex galleries - hardcore sluts - hardcore anime - hardcore teen sex - lesbian hardcore - hardcore teen - free hardcore movies - hardcore toons - hentai hardcore - hardcore digimon - hardcore cartoons - hardcore porno - hardcore sex videos - latina hardcore sex - free hardcore sex videos - anal hardcore - indian hardcore - black hardcore - hardcore anal fucking - hardcore milf - free hardcore pics - pussy shaving video shaved pussy - video surveillance - adult video games - car crash videos - video editing - fucking videos - video cards - blowjob video - shemale videos - nude video - car video - free gay porn videos - hentai video - video game vixens - sexy video - video sex - video cameras - digital video - amateur sex videos - video production services - gay sex videos - home sex videos - free lesbian video - video duplication - adult video clips - gay video - skateboarding videos - video capture - video game sex - bondage videos - animal sex videos - free teen porn videos - lez love video - anal sex videos - teen sex video - jordan sex video - amateur porn video - sex videos free - free amateur sex videos - lesbian porn videos - eve sex video - lesbian sex video - black sex videos - amateur adult video - horse sex videos - paris sex video - asian sex videos - teen porn videos - hot sex videos - free porn video downloads - gay porn videos - free video sex - sex video free - sex video trailers - adult video store - fred durst sex video - sexy music videos - free sex video samples - rough brutal sex videos - teen lesbian videos - porn videos free - gay bestiality videos - anal sex video - free sex video downloads - oral sex video - free lesbian sex video - adult sex videos - adult video chat - anime porn videos - hardcore sex video - video sex chat - free teen sex video - free hardcore fucking videos - michelle vieth sex video - underwater sex video - amateur home sex video - dog sex videos - 
fisting video - video production - blow job videos - video conferencing - video game girls hentai - nude videos - video strip poker - skateboard video - video poker - free xxx video - xxx videos - amateur video - free porno videos - free video porn - spanking videos - porn video clips - free adult video - free adult video clips - sexy videos - free nude videos - gay videos - rape video - music video - female ejaculation video - streaming video - upskirt video - free adult videos - free fucking videos - blockbuster video - free xxx videos - adult video - adult videos - videos - video games - free porn videos - hardcore blowjob - hardcore movies - free hardcore porn pics - sex hardcore - hardcore galleries - sex hardcore pornography - hardcore disney porn - hardcore pics - raven riley hardcore - xxx hardcore porn - hardcore pornography - hardcore disney toons - interracial hardcore - hardcore sex stories - jenna hardcore - hardcore cumshots - hardcore porn videos - hardcore interracial - shemale hardcore", 5 | "topic_id":0, 6 | "format_id":20, 7 | "topic_confidence":0.9242059588, 8 | "format_confidence":0.9694538713 9 | } 10 | ] -------------------------------------------------------------------------------- /website/assets/data/examples/topic0_format3.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "url":"http:\/\/www.thedirtyzone.com\/report.php?id=74083", 4 | "text":"\n\nPowered By TGPX\n\nPartner Login\n\n\n\nForgot Your Password?\n\u00a0Sign Up\n\nGallery Search\n\nPowered By TGPX\n\nReport a Gallery\n\nUse the form below to report a broken gallery link or a gallery that is breaking our rules. You do not need to tell us the URL, just give a short description of what the gallery is doing to break the rules, or simply enter \"This is a broken link\" if the gallery link no longer works.\n\nIf we determine that your report is correct, we will remove the offending gallery and possibly ban it from our TGP. 
Thank you for helping to keep TheDirtyZone.com top quality!\nGallery URL http:\/\/www.littlethumbs.com\/mgpbig\/chesire\/dildos\/?coupon=1665637&e=1&l=1&t=1&n=1\nDescription Hot teen chesire has some fun lifting her shirt and sliding down her panties\nReport Reason\nCopy the characters from the image into the text box for verification\n\nPowered By TGPX", 5 | "topic_id":0, 6 | "format_id":3, 7 | "topic_confidence":0.9798814058, 8 | "format_confidence":0.9649695754 9 | }, 10 | { 11 | "url":"http:\/\/sublimecarlalive.com\/auth\/forgot-password", 12 | "text":"Forgot password\n\n\u00a0 \u00a0 If you forgot your username or email address please contact our Support Team here!\n\n\nOfficial Website\n\nThis site contains sexually explicit material.\n\nEnter only if you are Over 18 and you agree to our Sensitive Data and Cookies policies.\n\nNo, I'll leave\n\nBy entering the site I accept the Terms & Conditions and Privacy Policy.", 13 | "topic_id":0, 14 | "format_id":3, 15 | "topic_confidence":0.9383435249, 16 | "format_confidence":0.8072704673 17 | }, 18 | { 19 | "url":"https:\/\/www.hablaporinternet.com\/product-tag\/ani-callback\/?product_orderby=price", 20 | "text":"Web Callback account and instructions to connect any fixed\/mobile line to another fixed\/mobile line around the globe. Specify any 2 numbers to be connected and they will be connected within 30 seconds or so. We suggest to use when your internet mobile connection is very slow or has poor Quality of Service. You would need to load a simple web page \"telefonoatelefono.com\" to trigger the call back. 
After that the call will not use any internet data so the quality will be excellent.", 21 | "topic_id":0, 22 | "format_id":3, 23 | "topic_confidence":0.9360194206, 24 | "format_confidence":0.7603437901 25 | }, 26 | { 27 | "url":"http:\/\/www.diamonddooronline.com.au\/delivery\/", 28 | "text":"Order today for fast,\ndiscreet delivery.\n\nFast & Discreet Delivery\n\nPostage & Handling Information\n\nFor Australian deliveries, your order will be shipped Express Post via Australia Post. This guarantees next working day delivery to metropolitan areas if the order is received online before 12PM AEST time.\n\nDelivery is guaranteed next working day from our warehouse to all major Australian metropolitan areas; including Melbourne, Sydney, Brisbane, Darwin, Perth, Adelaide and Hobart. All other areas are still sent express post, ensuring fast delivery.\n\nAnyone associated with the delivery of the parcel is unaware of its contents or origin. All packages are discreetly labeled with no reference to Diamond Door Online. The return address on the packaging will be DDO with our PO Box address.\n\nYou will receive an email notification when your order has been shipped; this will include the Australia Post tracking number and contact details. Products too large to be shipped via Express Post will be shipped using Australian Registered Post.\n\nThe parcel will be wrapped and packed to prevent discovery of the contents. It will arrive in a standard Australia Post Express courier bag or other Australia Post packaging if the item is too large.\n\nIn the event the courier is unable to deliver to the address supplied, the parcel may be collected from your nearest Post Office. 
This is the most discreet and private way to purchase adult products.\n\nWe have a system to ensure that all items available for order on this website are in stock, guaranteeing your order can be shipped 'same day'.\n\n\nInternational orders will be shipped using Australia Post Express Courier International service. You will receive an email notification when your order has been shipped; this will include the Australia Post tracking number and contact details. Please note that you will need to sign for the package upon delivery.\n\nInternational orders will be delivered within 2 - 4 working days depending on metropolitan areas of major cities. Items can be tracked on www.auspost.com\/track\n\nDue to customs procedures for international orders, Diamond Door Online must declare the contents of the package and show our name and return address. This cannot be changed in anyway as we must comply with Australian Customs and Australia Post procedures.\nDiamond Door Online will not be responsible for items seized by international customs.\n\n\nFor Lingerie\n\nDue to the large range of lingerie we stock in some cases items need to be shipped from our USA supplier or our Australian Supplier, please allow an additional 3 days for delivery for all lingerie orders. In most cases you will know the delivery time on the website as it states the delivery times. 
If your item is awaiting shipment from the USA we will notify you via email with the option of a refund, store credit or backorder.\n\n\nContact Us\n\nEmail Address:\nFor all customer service enquires and technical difficulties: support@diamonddooronline.com\n\nDiamond Door Online is Australian Owned and Operated.\n\nDiscounted Sex ToysDiscreet Adult ShopDiscreet Payments by Visa, Mastercard or Paypal :: Learn MoreBuy Gift CardsLingerieLiving Beyond Breast CancerDiamond Door Online On Facebook", 29 | "topic_id":0, 30 | "format_id":3, 31 | "topic_confidence":0.9258791804, 32 | "format_confidence":0.9388980269 33 | }, 34 | { 35 | "url":"https:\/\/missloreleirivers.ch\/etiquette\/", 36 | "text":"Arrive promptly for your appointment. Please note, \u201cprompt\u201d does NOT mean early. If you are running a few minutes early, you may text or call to ask if we may begin early. If you\u2019re earlier than that, please explore one of the several cafes and restaurants in my neighborhood or take a walk in the nearby park.\n\nIf you are running late, please contact me as soon as possible. I will do my best to accommodate if my schedule allows, but tardiness may shorten the length of your session. Respect my time as I respect yours, and don\u2019t make me wait unless it is unavoidable.\n\nDress discreetly in \u201cvanilla\u201d attire when arriving and leaving my studio. (Feel free to wear hidden toys or lingerie underneath!)\n\nArrive sober, and do not bring drugs or weapons into my space.\n\nDo not wear cologne or heavy scents. Do use deodorant.\n\nUpon your arrival I will invite you to freshen up or shower \u2013 if you\u2019re uncertain, err on the side of cleanliness. (You may also choose to shower at the end of our time together.) I provide a selection of scented and unscented shower products, mouthwash, and other basic hygiene items.\n\nPrior to your appointment, we will do some negotiation via email. 
When you arrive we\u2019ll spend a few moments discussing your desires, limits, physical and mental state, and what we\u2019ll be doing during your session.\n\nIn the playroom, my word is law, and I expect obedience. I do not demand unquestioning submission, but I do require good manners and compliance with my wishes.\n\nReady and eager to explore with me?\n\nProceed to my contact page and follow the instructions there. To demonstrate that you have read the entirety of this page, and to arouse my interest in you, include the title of your favorite book in your initial email. Hardly any new playmates do this, so you will set yourself apart!", 37 | "topic_id":0, 38 | "format_id":3, 39 | "topic_confidence":0.9656427503, 40 | "format_confidence":0.803586185 41 | }, 42 | { 43 | "url":"http:\/\/www.afterdarkswingers.co.uk\/lost-password.html", 44 | "text":"Enter your username and password to Login\n\nAfter Dark\u00a0Swingers is\u00a0completely free to sign up and register, you will then be able to browse all our members profiles and search for swingers in your area.\n\nIf you are not already a member then just click the Sign Up link at the top!\n\nIf you can not remember your Password then click the Forgotten Password link to the right\n\nForgot Password:", 45 | "topic_id":0, 46 | "format_id":3, 47 | "topic_confidence":0.9619802237, 48 | "format_confidence":0.7939460278 49 | }, 50 | { 51 | "url":"https:\/\/uruguay.escortface.com\/contacts", 52 | "text":"Contact us\n\nDear friends,\n\nWe are not an escort agency and We are not providing any escort or call girls services.\nWe are running an adult directory and selling advertisement only.\n\nFeel free to send us a message if you have any advertising questions or encounter any technical problems on our site.\n\nOffice hours:\n\nEveryday: 08.00 - 22.00 (GMT +1)\nMessages sent during office hours are replied within 0-2 hour.", 53 | "topic_id":0, 54 | "format_id":3, 55 | "topic_confidence":0.978023231, 56 | 
"format_confidence":0.7701843381 57 | } 58 | ] -------------------------------------------------------------------------------- /website/assets/data/examples/topic10_format20.json: -------------------------------------------------------------------------------- 1 | [ 2 | 3 | ] -------------------------------------------------------------------------------- /website/assets/data/examples/topic13_format20.json: -------------------------------------------------------------------------------- 1 | [ 2 | 3 | ] -------------------------------------------------------------------------------- /website/assets/data/examples/topic15_format2.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "url":"http:\/\/bretthetherington.net\/default.aspx?pageId=913", 4 | "text":"Brett Hetherington\n\nBanner photos: Cornelia Kraft\n\nPortrait of an Asturian miner\n\n\u00a0[This article was first published in Catalonia Today magazine, December 2016.]\n\nThe coal miner's wife wakes him and he coughs. He shuffles to the small bathroom sink and spits black liquid, washing it away with with the brown tap water.\n\nLast night he slept badly, suffering from stomach cramps, diarrhea and vomiting again. He had been working in the zinc mine at Arnao in Castrillon but the Belgian company who own the site have let him go to the San Juan mine. He felt himself to be quite lucky. At least there they had a river pool for the miners to wash in.\n\nAs he leaves for another day under the earth the miner looks for a last time at the mountains, at their ferns and the tall groups of eucalyptus - \u2013 pencil thin, not quite straight, just like the trees in a Dr Seuss book. He sees the houses with their sharp pitched roofs in front of deep gorges and is comforted by the roll of the hills across this green land.\n\nOur miner is living before the era of the chemical plants and big metal factories. 
He knows others who dig for iron and knows it\u2019s vital for tinned food because electricity and refrigeration have not yet arrived to this part of the world.\n\nHis mine, like many mines, is close to a river: a means of transporting the coal for trading this raw material with British towns like Cardiff and Newcastle-upon-Tyne [where my own father was born and also grew up next to a polluted river.]\n\nThis miner's children will one day see the construction of chemical industries, thanks to the mines, thanks to his labour. In fact, he thinks, as he makes the walk to the pits, the story of Asturias is the story of the miner and the story of the miner is the story of Asturias. It is one of hardship and scant reward, of growth but also ill health. It is a tale of the deep earth's hidden secrets and humanity\u2019s immeasurable suffering with the open spaces of the valleys and their claustrophobic confines - as unforgiving and back-breaking as any imagined hell in those greedy shafts penetrating ever downwards into the planet.\n\nToday, like thousands of other days, he will launch his body into the ground and probe for hour after hour for that black rock. Finally, at the end of the day our miner will take aspirin for his aching bones, smiling at the ironic fact that it has ingredients made from the very coal he has been digging for. He does not yet know though that, decades later, his children are going eat kiwi-fruits and chestnuts that will come to grow particularly well in the carbon-coloured soil left from abandoned open-cut mines scattered across the nearby hills.\n\nAs the miner eats his simple lunch with his hands still blackened by coal dust, he remembers his father, who was also a miner. He too worked to extract the iron that was in such high demand for both twentieth century century world wars \u2013 a metal that helped the rich become richer. 
His father started life as a rural worker and had to adapt from the rhythms of the seasons to the very different rhythm of an industrial timetable. He had to learn to accept days and nights with no sky or trees, down in the mines which lay right next to his cramped terrace house.\n\nLike every other subterranean labourer, his father and he both wondered if life could ever be different for them. He\u2019d heard that things were a bit better at the only mine run by a trade union. But it was on the other side of Asturias and he had never even visited there.\n\nOur miner lives in Bustiello town where all of the aristocrat Marques de Camilla's workers have their neat little houses below everyone else, at the bottom of the valley. It is an orderly, rectangular village and each house has a small garden. Up the hill above them live the engineers and above them is the church, then God of course. This is what he knows: the planning of the town exactly reflects the social and spiritual hierarchy. The Marques is a conservative man. He fears the progressive men who want social change.\n\nFurther on in the mountains there are mining zones that suffered from \u201cspecial measures\u201d during Franco's dictatorship. Around Pozo Fortuna trade union activists were assassinated and their bodies were thrown down an old pit-hole. Our miner speaks about this sadly with his friends and later falls asleep hoping that the bad times will end.\n\nIn the morning, he rises and faces another day.", 5 | "topic_id":15, 6 | "format_id":2, 7 | "topic_confidence":0.5859863758, 8 | "format_confidence":0.8383467197 9 | }, 10 | { 11 | "url":"https:\/\/www.elastoproxy.com\/chapter-four-sealing-the-win\/", 12 | "text":"Eric\u2019s phone call to his boss went about as well as his visit to Patrick\u2019s office. The young engineer knew his A-game hadn\u2019t been good enough and was still reeling from getting checked by Purchasing. 
\u201cPick yourself up off the ice and get back to work,\u201d Eric\u2019s boss said. \u201cGo talk to that company Elasto Proxy and get some answers to Patrick\u2019s questions. Otherwise, you can forget about switching vendors.\u201d\n\nWith a cup of coffee in one hand, Eric dialed Jenny-Lynn, his Elasto Proxy solutions provider. She picked up on the first ring and could tell by Eric\u2019s voice that the young engineer had been through the wringer. Eric explained that he needed gasket prices and minimum order quantities (MOQs). Then he took another sip of coffee, paused, and asked if buying a gasket was really just about knowing these two numbers.\n\n\u201cNo, it\u2019s not,\u201d Jenny-Lynn told the engineer. \u201cPrice and MOQ matter, of course, but your company also needs to think about true costs and manufacturing waste, especially with all of the rework you\u2019ve been doing. Has anyone spoken to you about this before?\u201d Eric couldn\u2019t remember learning about this in any of his engineering classes, and price rather than true costs was his company cared about.\n\n\u201cTrue costs are the full scope of your manufacturing costs,\u201d Jenny-Lynn explained. \u201cLet\u2019s keep things simple and stick to labor and materials. Labor is not just about the labor cost of the installer. It includes the cost of ordering, receiving, inventorying, and then delivering gaskets to the assembly line. There\u2019s even a cost to maintaining vendors and inspecting the quality of gasket installation.\u201d\n\nEric took another sip of coffee and asked Jenny-Lynn to continue.\n\n\u201cWith your material costs,\u201d she added, \u201cit\u2019s not just about installation either. Your true costs include material waste, such as when an installer makes a mis-cut and then discards a length of rubber. Here, it\u2019s worth talking about MOQs \u2013 something you mentioned before. 
Buying more material than you need isn\u2019t a good business strategy,\u201d she added.\n\n\u201cThat\u2019s for sure,\u201d Eric agreed. \u201cWhen I met with our Purchasing Director, he told me he didn\u2019t want a bunch of excess stock laying around because it ties up cash.\u201d\n\n\u201cWhat else did say to you?\u201d, Jenny-Lynn asked. \u201cI think it would help to know all of his concerns.\u201d\n\n\u201cWell,\u201d Eric continued, \u201che got pretty upset about stock-outs. It wasn\u2019t even the gaskets that bothered him, but all of the other rubber and plastic parts that he can\u2019t get.\u201d\n\n\u201cInteresting,\u201d Jenny-Lynn said. \u201cIt sounds like your Purchasing Director has some real headaches. Did he mention any of the eight forms of manufacturing waste? They\u2019re transportation, inventory, motion, waiting, over-production, over-processing, defects, and unused knowledge\u201d.\n\n\u201cWhoah,\u201d Eric laughed. \u201cI liked it better when you talked about true costs and how they relate to labor and materials. This is all new to me. Still, manufacturing waste does seem like something that Patrick our Purchasing Director would understand. He\u2019s the kind of guy who keeps aframed business school diploma on his wall.\u201d \u00a0\n\nNow it was Jenny-Lynn\u2019s turn to laugh. \u201cYou don\u2019t have to be a B-school grad to understand this stuff. 
It still comes down to labor, materials, and manufacturing overhead \u2013 if it\u2019s OK to introduce another term\u201d.\n\nPatrick said that it was and asked her to continue.\n\n\u201cWith some numbers from your Operations Manager,\u201d Jenny-Lynn continued, \u201c I bet you could convince Patrick to look at your company\u2019s true costs because he seems like a guy who doesn\u2019t like manufacturing waste \u2013 even though he\u2019s part of the manufacturing overhead.\u201d\n\n\u201cI\u2019m not sure that Patrick likes much of anything,\u201d Eric quipped.\n\nThey both laughed.\n\n\u201cLook,\u201d Jenny-Lynn continued, \u201cI\u2019m willing to work with you on getting the information that you need from Operations. Would you commit to setting up a meeting with your Operations Manager and inviting me to it? We can do a video call \u2013 it\u2019s something we do a lot at Elasto Proxy.\u201d\n\nEric put down his coffee cup and smiled.\n\n\u201cLet\u2019s do it,\u201d he said. \u201cI\u2019ll set up a video call with Olivia. 
She\u2019s our Operations Manager \u2013 a tough lady, but also fun to work with.\u201d\u00a0 \u00a0 \u00a0\u00a0\n\nPrevious Chapters\n\nChapter 3 \u2013 Hip Checked\n\nChapter 2 \u2013 Game Plan\n\nChapter 1 \u2013 Thin Ice\u00a0 \u00a0 \u00a0 \u00a0\n\nLeave a Reply", 13 | "topic_id":15, 14 | "format_id":2, 15 | "topic_confidence":0.8967555165, 16 | "format_confidence":0.9827904701 17 | } 18 | ] -------------------------------------------------------------------------------- /website/assets/data/examples/topic18_format20.json: -------------------------------------------------------------------------------- 1 | [ 2 | 3 | ] -------------------------------------------------------------------------------- /website/assets/data/examples/topic1_format20.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "url":"https:\/\/docs.spongepowered.org\/5.1.0\/en\/about\/assets.html", 4 | "text":"Art Assets\n\n\nThis documentation refers to an outdated SpongeAPI version and is no longer actively maintained. While the code examples still work for that API version, the policies, guidelines, and some links may have changed. Please refer to the latest version of the documentation for those.\n\nThis page provides the official SpongePowered logo and mascot. Feel free to use them to spread the word about Sponge. However note that these images are not provided under the MIT License.\n\n\nIf you\u2019re reading a translated version, please note that the English license is the one which counts. Translated licenses are only provided for informational purposes.\n\nYou may:\n\n \u2022 Make minor modifications to Spongie\u2019s facial expressions - such as making a cute smile, making them perplexed or sad, adding a hat, and so on. 
It should be easily recognizable that the Spongie you create is the Spongie mascot, but wearing a different expression or clothing.\n\n \u2022 Use Spongie or the SpongePowered logo in an article or blog post about the Sponge project, and to spread the word.\n\n \u2022 Use the SpongePowered Logo to link to Sponge Homepage (for example in your signature on forums etc.)\n\nYou may not:\n\n \u2022 Use the mascot as a link to the Sponge project only. The SpongePowered logo (which retains Spongie\u2019s form) is a better representation of the Sponge project when used as an affiliate or reference.\n\n \u2022 Create a vastly modified version of Spongie, where they become unrecognizable or clearly different from the original Spongie mascot.\n\n \u2022 Add your project image to Spongie, or vice versa.\n\n \u2022 Claim Spongie as your own mascot, or use them as a mascot for your own project.\n\n \u2022 Sell or use Spongie in commerce without permission.\n\n \u2022 Change any colors or dimensions.\n\nSpongie - The Official Sponge Mascot\n\nSpongie the SpongePowered mascot", 5 | "topic_id":1, 6 | "format_id":20, 7 | "topic_confidence":0.8854210377, 8 | "format_confidence":0.7925449014 9 | } 10 | ] -------------------------------------------------------------------------------- /website/assets/data/examples/topic20_format2.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "url":"http:\/\/uncyclopedia.wikia.com\/wiki\/Websense?direction=prev&oldid=5082166", 4 | "text":"From Uncyclopedia, the content-free encyclopedia\n\nJump to: navigation, search\nFor those without comedic tastes, the so-called experts at Wikipedia have an article about Websense.\n\u201cYou can't touch this.\u201d\n~ MC Hammer on accessing an illegal webpage.\n\nIntroduction and Rationale\n\n\nA screen familiar to users of this online service.\n\nIt has been known that rogue states but also companies and universities put restrictions to Internet usage. 
These restrictions are so aggressive and perverse that the Internet experience of the end-user is completely compromised.\n\nWebSense (IPAEng: \u02c8w\u025bbs\u025bns\u025bs\u028cks) (a.k.a \"Web'non'sense\") comes to the rescue by providing stealth anonymity to the masses, allowing them to browse the Internet without restrictions. Due to this, WebSense is severely unpopular in specific non-secural countries; merely having WebSense software installed on your computer may land you in jail for two years.\n\nTechnical details\n\nWebSense offers anonymous secure proxy servers that end-users can connect to. Once such a server is blocked, WebSense activates a new one and informs the users through encrypted e-mail to switch to it.\n\nThere is currently work carried out to integrate WebSense with the Onion Routing project in order to offer protection against traffic analysis.\n\nWebSense is resilient to differential analysis attacks thanks to patented technologies being employed.\n\nWebsense categories\n\nWebSense is used especially by these groups of people\n\nIf you belong to one of the categories above, you can apply for a free license to use WebSense. Please contact them mentioning this page.\n\n\nThe motto of WebSense is Putting the Sense back in the Web. As more content-filtering software packages are being sold and utilised in perimeter networks, the real message of the Internet got diluted. Therefore, in a wise move, WebSense adopted the motto of sensible Internet, and puts the sense back to where it belongs, the Internet itself. The other Websense motto is \"De Heil Websense!\".\n\n\nThis page has been vandalised twice already by the oppressors of freedom. Please help keep WebSense page up and running.\n\nPreemptive actions\n\nThanks to donations from around the world, the WebSense international community raised US$20.000 to win the online auction and register the Internet domain WebsenseSucks.com so that no other people may use. 
Please join the new initiative to register WebsenseReallySucks.com and avoid corporate interference to the mission of Websense.\n\n\nWebSense:Wikipedia makes fun of WebSense at the popular parody Website Wikipedia. In the sly humour of the Wikipedia hippies, WebSense is depicted as a content-filtering product aimed at restricting Internet access.\n\nSee also\n\nPersonal tools", 5 | "topic_id":20, 6 | "format_id":2, 7 | "topic_confidence":0.9601412416, 8 | "format_confidence":0.5483865738 9 | }, 10 | { 11 | "url":"http:\/\/www.rmtweb.co.uk\/abort-retry-ignore", 12 | "text":"Abort, Retry, Ignore\nOnce upon a midnight dreary, fingers cramped and vision bleary,\nSystem manuals piled high and wasted paper on the floor,\nLonging for the warmth of bed sheets,\nStill I sat there, doing spreadsheets:\nHaving reached the bottom line,\nI took a floppy from the drawer.\nTyping with a steady hand, I then invoked the SAVE command\nBut got instead a reprimand: it read Abort, Retry, Ignore.\nWas this some occult illusion? 
Some maniacal intrusion?\nThese were choices Solomon himself had never faced before.\nCarefully, I weighed my options.\nThese three seemed to be the top ones.\nClearly, I must now adopt one:\nChoose Abort, Retry, Ignore.\nWith my fingers pale and trembling,\nSlowly toward the keyboard bending,\nLonging for a happy ending, hoping all would be restored,\nPraying for some guarantee\nFinally I pressed a key \u2014\nBut on the screen what did I see?\nAgain: Abort, Retry, Ignore.\nI tried to catch the chips off-guard \u2014\nI pressed again, but twice as hard.\nLuck was just not in the cards.\nI saw what I had seen before.\nNow I typed in desperation\nTrying random combinations\nStill there came the incantation:\nChoose: Abort, Retry, Ignore.\nThere I sat, distraught, exhausted, by my own machine accosted\nGetting up I turned away and paced across the office floor.\nAnd then I saw an awful sight:\nA bold and blinding flash of light \u2014\nA lightning bolt had cut the night and shook me to my very core. 
I\nsaw the screen collapse and die \"Oh no \u2014 my database\", I cried I\nthought I heard a voice reply, \"You\u2019ll see your data Nevermore.\"\nTo this day I do not know\nThe place to which lost data goes\nI bet it goes to heaven where the angels have it stored.\nBut as for productivity, well\nI fear that it goes straight to hell\nAnd that\u2019s the tale I have to tell\nYour choice: Abort, Retry, Ignore.\n\nLeave a Reply", 13 | "topic_id":20, 14 | "format_id":2, 15 | "topic_confidence":0.5829549432, 16 | "format_confidence":0.9502524137 17 | } 18 | ] -------------------------------------------------------------------------------- /website/assets/data/examples/topic21_format20.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "url":"https:\/\/azb.m.wikipedia.org\/wiki\/%D8%B4%D8%A7%D8%A8%D9%84%D9%88%D9%86:Current_sport", 4 | "text":"\u0634\u0627\u0628\u0644\u0648\u0646 \u0628\u0644\u06af\u0647 \u0644\u0647 \u0645\u0647\u200c\u0633\u06cc[\u0628\u0627\u062e] [\u062f\u06cc\u06cc\u0634\u062f\u06cc\u0631] [\u06af\u0626\u0686\u0645\u06cc\u0634] [\u062a\u0645\u06cc\u0632\u0644\u0647\u200c\u0645\u0647]\n\n\nDo not subst.\n\n \u2022 This template is for articles which involve an article about an evolving current sports-related event which is either changing rapidly or about which understanding is rapidly evolving. This is an advisory to readers that the article may be incomplete and subject to change.\n \u2022 Note that every article on Wikipedia has a General disclaimer indicating that the article contents may not be accurate. 
As such, this template is redundant.\n \u2022 As an advisory to editors, it may also be used in those occasions that many editors (perhaps a hundred or more) edit an article on the same day.\n \u2022 It is not intended to be used to mark an article that merely has recent news articles about the topic, or for some team or league that is in season; if it were, thousands of articles would have this template, without informational consequence.\n \u2022 Generally it is expected that this template and its closely related templates will appear on an article for perhaps a day or two, occasionally several days.\n \u2022 If you desire that an article be noticed as a topic about or related to a significant current event, see Wikipedia:How the Current events page works.\n\n\n\u0628\u0648 \u0634\u0627\u0628\u0644\u0648\u0646 \u06f3 \u067e\u0627\u0631\u0627\u0645\u062a\u0631\u0647 \u0634\u0627\u0645\u06cc\u0644 \u0627\u0648\u0644\u0648\u0631:\n\n \u2022 sport, \u0627\u062a\u0648\u0645\u0627\u062a\u06cc\u06a9 \u0635\u0648\u0631\u062a\u062f\u0647 \u0647\u0631 \u0627\u06cc\u062f\u0645\u0627\u0646\u06cc\u0646 \u0627\u0624\u0632 \u0634\u06a9\u06cc\u0644\u06cc\u0646 \u06af\u0648\u0633\u062a\u0631\u06cc\u0631. 
\u0628\u0648 \u0634\u0627\u0628\u0644\u0648\u0646 \u0627\u06cc\u0646\u062f\u0647 \u0628\u0648 \u0627\u06cc\u062f\u0645\u0627\u0646\u0644\u0627\u0631\u0627 \u0627\u0626\u062d\u062a\u06cc\u0648\u0627 \u0627\u0626\u062f\u06cc\u0631: American football, Australian rules football, basketball, baseball, cricket, cycling, golf, hockey, motorsports, Olympics, rugby, tennis, volleyball, boxing, curling, squash, swimming, athletics, lacrosse\n \u2022 image, which can switch to an alternate image that is not supported.\n \u2022 event, which can change the event to something other than the default sports-related event text.\n \u2022 category, parameter must be y, to add category to the article.\n\n\u0628\u0648 \u06a9\u0624\u062f\u0648\u0646:\n\n{{current sport|sport=tennis|event=tennis tournament}}\n\n\u0646\u062a\u06cc\u062c\u0647\u200c\u0633\u06cc \u0628\u0648:\n\n\u0627\u06cc\u0633\u062a\u06cc\u0641\u0627\u062f\u0647 \u0627\u0648\u0644\u0648\u0646\u0627\u0646 \u0634\u06a9\u06cc\u0644\u200c\u0644\u0631\u062f\u064e\u06cc\u06cc\u0634\u062f\u06cc\u0631\n\nSee also\u062f\u064e\u06cc\u06cc\u0634\u062f\u06cc\u0631\n\n\u0631\u062f\u0647:\u0627\u0644\u06af\u0648:\u0631\u0648\u06cc\u062f\u0627\u062f \u0631\u0648\u0632", 5 | "topic_id":21, 6 | "format_id":20, 7 | "topic_confidence":0.9884822369, 8 | "format_confidence":0.5266529322 9 | } 10 | ] -------------------------------------------------------------------------------- /website/assets/data/examples/topic23_format20.json: -------------------------------------------------------------------------------- 1 | [ 2 | 3 | ] -------------------------------------------------------------------------------- /website/assets/data/examples/topic5_format0.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "url":"http:\/\/gammaelectronics.xyz\/audio_04-1977_open-reel.html", 4 | "text":"OPEN-REEL vs. 
CASSETTE (April 1977)\n\nHome | Audio Magazine | Stereo Review magazine | Good Sound | Troubleshooting\n\nAuthor: Herman Lia [Senior Engineer, Tandberg Radiofabrikk A\/S, Oslo, Norway ]\n\nThe enormous sale of cassette tape recorders in the last few years is proof enough that this product meets a demand.\n\nAt the same time there is a danger that people will forget the open -reel tape recorder, which in many important respects is a much better basic concept. A cassette machine's advantages are that it is easy to operate, weighs very little, and that a broad range of prerecorded tapes are widely available. On the other hand, as this article will show, cassette machines will always be inferior to open -reel machines when the major performance characteristics, such as signal-to-noise ratio, are compared. At the same time, it must be admitted that for a large number of consumers, cassette machines are usually quite good enough. However, when the very best quality recording is required, an open -reel machine must be used. To be fair and realistic, we must also say that cassette machines have expanded the total market for tape recorders, as well as capturing a portion of the open reel market. However, they will never take over all of the open-reel market because there are fundamental differences in the quality level obtainable with the two systems, differences which result from the internationally recognized standards governing each system.\n\nGeneral Considerations\n\nThe two most important characteristics that determine the performance of a tape recorder are signal-to-noise ratio and frequency response. In this context, frequency response means a response relative to a signal level that lies substantially below the saturation curve of the tape and substantially above the level of residual tape noise.\n\nLet us consider a tape recorder as a black box where we connect a signal to the input and take out another signal from the output, as shown in Fig. 
1.\n\nIdeally, the only differences between the input and the output signal are time delay and possibly some scale or amplification factor, A. The lowest possible time delay is determined by the distance between the record and playback heads, together with tape speed. Unfortunately, real world tape recorders are not ideal, and we need to make some measurements to discover their characteristics. We can begin by measuring signal capacity. We do this by applying a single tone at a particular frequency to the input and then raising the input level voltage ei(t) until the signal at the output has a particular amount of distortion, e.g. 5 percent harmonic distortion. This can be done for a number of frequencies, and the typical results are shown in Fig. 2 for one particular tape speed. Next we remove the input signal ei(t) and short circuit the input. There should, of course, be no signal at the output, but in practice there is a noise spectrum which is the sum of the residual tape noise and the noise from the record and playback electronics.\n\nIn a well-designed tape recorder, the noise from the electronics is so low that the dominant noise component is the tape noise. The noise spectrum can be analyzed by mean of one-third octave filters, and this is shown in Fig. 2 along with a saturation curve.\n\nThese measurements tell us quite a lot. They tell us that there are upper and lower limits of the signal a tape can accommodate with acceptable quality. If the input signal is too high, the distortion will be above the acceptable maximum, and if the input signal is too low it will get lost in the noise.\n\nThe distance between the two curves in Fig. 
2 at an individual frequency is therefore a measure of the signal capacity of the tape recorder at individual frequencies, while the total area between the two curves is a measure of the signal capacity over a chosen frequency range.\n\nUsing information theory, the signal capacity can be defined for a general transmission channel by the following integral:\n\nSB = \u222b B (S+N\/N) df (1)\n\nwhere SB is the signal \/bandwidth product, S is the signal, N is the noise, and B is the bandwidth. This is Shannon's definition of signal capacity, given in 1948.\n\nNow, since log S+N\/N = log (S+N) - log N, equation (1) can therefore be rewritten as:\n\nSB = \u222b B {log (S+N) log N } df (2)\n\nThis precisely defines the area between the signal and noise curves in Fig. 2, and therefore equation (2) gives us an opportunity to put forward a quantitative measure of a tape recorder's ability to accommodate signals. More exact theoretical considerations we have developed show that the SB product for a tape recorder is given by:\n\nBs = Induction in the tape caused by the signal (Gauss)\n\nBr = Maximum remanent induction (Gauss)\n\nHe = Coercivity (Orsteds)\n\nv = Tape speed\n\nb = Track width\n\nd = Thickness of oxide coating\n\nfo = Highest frequency considered\n\nN(f) = A characteristic function of tape noise.\n\nThe most important conclusion to be drawn from equation (3) is that the SB product is dependent on the physical properties of the system, such the tape speed, track width, tape parameters, and so on, rather than the electronics, as long as we maintain the true dynamic range in the program which is to be recorded with no signal processing.\n\nWe will see later that it is possible to process the signal so that the tape hiss becomes less audible to the listener.\n\nDespite this conclusion, we find the frequency -dependent equalization in a tape recorder greatly affects the audible results. 
We should, therefore, take a closer look at the main requirements influencing the choice of these equalizations. These turn out to be maximum subjective signal-to-noise ratio and flat frequency response at low signal levels.\n\nThe measurements for Fig. 2 were made with one particular playback equalization (120 RS). If we choose another equalization, say 50\u00b5S, and make additional measurements, we obtain the curves shown in Fig. 3. Note that the distance between the two curves is the same, but the shapes of the curves have changed. When we record a program, we are dealing with a complex signal with a particular power distribution over the frequency spectrum, and it should be obvious that we will obtain the best subjective signal-to-noise ratio if we can \"pack the sound\" as far as possible up under the tape's saturation curve.\n\nLet us assume that we have a program with relatively little power in the high frequencies. We then set the input sensitivity of the system to fully load the tape at the middle and low frequencies. If we use the 120\u00b5S playback equalization, the high frequencies will lie far under the tape's saturation curve and therefore near to the noise level. In this case, we could advantageously alter the equalization to 50 \u00b5S, say, and thereby drop the noise level away from the signal. If we change the equalization or time constant in this manner, to improve the signal-to-noise ratio, we must be consistent and change the input level to produce the flat test frequency response at low levels. On the other hand, if we now have a program with a lot of power in the high frequencies, a time constant that is too short will cause the high frequencies to overload the tape before the tape is saturated at the low frequencies, and low frequency noise can then become a problem. 
From this discussion, we can see that the SB product defined in equation (3) is an objective measure of the best signal-to-noise ratio that can be obtained.\n\nFrequency -dependent equalizations are thus used to match the characteristics of the tape to practical conditions and produce the best signal-to-noise ratio, which means that the SB product is exploited to its maximum. At the same time, we have seen that the optimum playback equalization depends on the type of program the tape recorder must handle. We are therefore led to seek a dynamic equalization that automatically adjusts itself to the power -frequency curve of the program being recorded. This is exactly the concept behind complementary noise -reduction systems, such as Dolby, dbx, Burwen, etc. If we make the same measurements used in Fig. 2 with a Dolby circuit added, we obtain the curves shown in Fig. 4.\n\nAt the higher input levels, the signal is not processed and the tape recorder performs as if the Dolby circuit had not been included. When the signal falls, the higher frequencies receive extra amplification and are, therefore, recorded with a larger margin above the tape noise than normal. During playback the opposite process occurs, and the overall frequency response is therefore correct.\n\nIncreased amplification of the higher frequencies during recording requires reduced amplification of the same frequencies during playback (complementary system). Therefore, noise and other unwanted signals introduced in the process after encoding and before decoding are reduced.\n\nTape noise is reduced by the same degree as the processing of the signal. Figure 4 shows typical output of a cassette machine with a time constant of 120 \u00b5S. At the higher levels, the signal swamps the noise, and the performance is acceptable. The corresponding noise level is given by curve B. 
At the lower levels, curve A is of no interest, but the signal processing in the Dolby circuits yields noise curve C which is equivalent to a time constant of 40\u00b5S because it has the effect of reducing the noise at higher frequencies by about 10 dB. Accordingly, there is a dynamic change in the time constant from 120 to about 40\u00b5S, depending on the amount of high frequency energy in the program.\n\nFig. 1--Ideally the only differences between input and output of a tape recorder are time delay and possibly amplification of signal.\n\nFig. 2--Recording system performance, showing maximum output level versus frequency at a constant 5 percent THD and residual noise level of the system.\n\nFig. 3--System performance, as in Fig. 2, with two different equalization time constants, showing how the shapes of the curves change. Note that th... (truncated)", 5 | "topic_id":5, 6 | "format_id":0, 7 | "topic_confidence":0.6880204678, 8 | "format_confidence":0.7518321872 9 | } 10 | ] -------------------------------------------------------------------------------- /website/assets/data/examples/topic5_format2.json: -------------------------------------------------------------------------------- 1 | [ 2 | 3 | ] -------------------------------------------------------------------------------- /website/assets/data/examples/topic6_format20.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "url":"http:\/\/lyrics.wikia.com\/LyricWiki:Page_names", 4 | "text":"LyricWiki:Page Names\n\nRedirected from LyricWiki:Page names\n\n1,754,751pages on\nthis wiki\n\n\nThe purpose of this policy is to have pagenames that are as uniform as possible. LyricWikia is a site that has songs, artists, and users from many different nations, which have rules for capitalization that may vary. (At times, the Artist may creatively change capitalization as well.) 
By breaking the capitalization rules of every nation, it is hoped that the creation of multiple variations of a page will be minimized. This way, the site won't have one group of editors working on one variation, while another group works on another and work can be more collaborative. Also, because LyricWikia is incorporated into other projects, a uniform method of creating pagenames is also necessary for coding purposes.\n\nExample Titles\n\n\nAll words, regardless of whether the artist capitalize that letter or the language's grammar says it should be lower case, must have their initial letter capitalized.\n \u2022 ABBA, instead of Abba\nThe artist's name in this example is kept all in capitals, because that is the closest to the original's format as per the Swedish and English Wikipedias.\n \u2022 K.d. Lang instead of k.d. lang or K.D. Lang\nIn this example, the K and L are capitalized for LyricWikia's purposes, but the remainder of the artist's name remains in its official format. So, if an album was named OdDly CaPiTaLIzeD, the name should be capitalized exactly as shown, not changed to Oddly Capitalized.\nIf an album lists a song as being in all capitals, it is often the case that it shouldn't be placed under a page name with all capitals, although exceptions are likely to exist. This is very common with Japanese artists (such as Abingdon Boys School) who will use all capitals for those songs with English titles, such as with the example. The page name should follow the normal initial letter capitalization, followed by the capitalization that the word would normally have. (So MCDONALDS should be McDonalds, for example.) The display portion of the link, however, can follow the capitalization used on the album. 
(Example: # '''[[Abingdon Boys School:Strength.|STRENGTH.]]''' would display as STRENGTH., although it links to the correct page name.)\nE and A in the above example are still initial letters of their respective words, even though those words now form part of contractions and are not preceded by whitespace.\n\nNames Beginning With An Article\n\nArticles such that would normally follow a name in an index (A, An, and The) should remain at the beginning of the artist's name.\n\nPunctuation And Symbols\n\nIn this example, the quotation marks are used because this is the punctuation that the artist officially uses (following the official artist website).\nWords should not be substituted with their equivalent symbols if the word is an official part of the name (& for And, @ for At, etc.).\nSymbols should also not be substituted with their word-equivalent if the symbol is an official part of the name. (A few exceptions exist, see Technical Restrictions below for more information.)\nStylized artist names should be kept where possible.\nKeep apostrophes in contractions such as, ain't, don't, can't, won't, etc. (unless the official spelling of the artist\/album\/song omits them as well).\n\nNon-Latin Character Sets\n\nFor Japanese, Chinese, Korean, and other languages written in non-latin based scripts, artist names should have the romanized version of the name added in parenthesis (with given name first and family name last), with a single space between the native name and the opening parenthesis.\nThe album and song parts of page names should use only the title written in the native script, with no romanized version of the title added. The romanized version and the translation of the title can be provided by adding the {{TransTitle}} template to the top of the page.\n\nStage Name Vs. 
Real Name\n\nArtist names should be the popular ones, those the artists refer to themselves on album covers or the official websites.\n\nCommon Misspellings\/Incorrect Tagging\n\n \u2022 When you think that an artist or band will be searched for under a technically incorrect name, (ELO or E.L.O. instead of Electric Light Orchestra, for example) you may create a redirect to point to the correct name. A redirect is created by adding #REDIRECT [[Correct Artist Name]] to a blank page.\n \u2022 When you find an artist page to be under an incorrect name, you may move it by using the Move tab at the top of the page.\n \u2022 If the artist page contains subpages (such as album and\/or song pages) please do not move it yourself: this would be a lot of work since all subpages would have to be moved manually and would likely break a lot of links. Instead, add {{move|to=Correct page name|reason=Reason for moving}} to the page source by using the Edit tab at the top.\n \u2022 These same conventions hold for Album and Song names.\n\nArtist Pages\n\nOn occasion, there will be more than one artist with the same name. Whichever artist has the most of LW-relevant releases listed on Kingnee - Musicbrainz MusicBrainz should keep the primary location, with no parenthetical add-on used. If you are not sure about this, please contact an administrator.\n\nFor all other artists add a distinguishing notation to the page title in parentheses. Use the following:\n \u2022 The 2-letter country ISO codes Wikipedia16, e.g. (AR) for Argentina, (SE) for Sweden. For the United Kingdom of Great Britain and Northern Ireland always use (UK).\n \u2022 For multiple artists from the same country add the code for the principal subdivision (state or province) with a hyphen, e.g. (US-CA) for California. The same goes for all countries, respectively.\n \u2022 Use the primary genre for artists from the same country and state, e.g. 
(US-CA Rap) or (US-CA Metal) \u2013 as brief as possible.\n\nNOTE: This only applies to the artist page, not the artist's albums and songs, which should keep the unchanged artist name as prefix. For details see the homonymous artists help page.\n\nA {{WrongPage}} template should be placed at the top of all similarly-named Artist pages with a link to the disambiguation page.\n\nAlbum Pages\n\nAlbum pages should be named as follows: Artist:Album (release). For example: Green Day:American Idiot (2004). Notice that a space should be placed in between the album's title and the release year inside the parentheses.\n\nAdding the release year should always be done, but is especially important when the album contains a song by the same name, for example: The Corrs:Forgiven, Not Forgotten (1996) (album page) vs. The Corrs:Forgiven Not Forgotten (song page). If the release year is unknown, four question marks should be substituted for the year: (????).\n\nOn a rare occasion, an additional parenthetical notation such as (EP) or (Demo) may be necessary to differentiate between two albums. For this to be necessary, the two albums must have the same name and have been released in the same year. This is mostly due to an identically-titled demo album or EP being released in the same year as a full-sized album release. An add-on should not be used, however, for releases from different countries. Any album differences due to various regional releases, deluxe editions, or limited editions are best noted on the regular album page.\n\nSong Pages\n\nSong pages should be named in the same way: Artist:Song.\n\n \u2022 Added song notations (such as featured artists, live performances, bonus tracks, hidden tracks, etc.) 
that are not part of the song's title should be left off of the song's name whenever possible, and added parenthetically after the link.\nEXAMPLE: Fergie:Fergalicious (featuring instead of Fergie:Fergalicious (Featuring\n \u2022 A song may use an added notation to distinguish a version that has different lyrics from the original.\nEXAMPLE: Fergie:Fergalicious (Radio Edit) and the original Fergie:Fergalicious\n\nTechnical Restrictions\n\nIf a page name would contain a character that is impossible to use due to technical restrictions (for example, a # symbol) replace it by a suitable equivalent (whatever the symbol represents when the item would be said aloud) and use the {{WrongTitle}} template (or a more specific wrong-title template).\n\nFull List of Restrictions\n\n 1. # should be \"Number \" (notice the space at the end) or \" Sharp\" (however, in the case with albums or songs that start with a Twitter hashtag, the number part should be removed completely since it doesn't refer to a number in this case)\n 2. < should be \"Less Than\"\n 3. > should be \"Greater Than\"\n 4. [ and ] (square brackets) should be ( and )\n 5. { and } (curly brackets) should be ( and )\n 6. \u00bb and \u00ab (double guillemets) should both be \"\n 7. | (pipe) and \u00a6 (broken pipe) should both be \/\n 8. (pilcrow) should be P\n 9. \\n (the actual sequence backslash-n, not the newline character) should be \/n\n 10. 
: and \/ are only problematic as first character of the page name and should be omitted only in those cases\n\nAround Wikia's network\n\nRandom Wiki", 5 | "topic_id":6, 6 | "format_id":20, 7 | "topic_confidence":0.7096959352, 8 | "format_confidence":0.5109174848 9 | } 10 | ] -------------------------------------------------------------------------------- /website/assets/data/examples/topic7_format20.json: -------------------------------------------------------------------------------- 1 | [ 2 | 3 | ] -------------------------------------------------------------------------------- /website/assets/data/examples/topic8_format20.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "url":"http:\/\/www.brecksgates.co.uk\/classic-style-00005-p-249.html", 4 | "text":"Home\u00a0::\u00a0 Men's Shoes\u00a0::\u00a0 Classic Style\u00a0::\u00a0 Classic Style 00005\n3065 Expression #1 of ORDER BY clause is not in SELECT list, references column 'tdc07956_vgsdbg.xp.products_id' which is not in SELECT list; this is incompatible with DISTINCT\n[select distinct p.products_id, p.products_image, pd.products_name from products_xsell xp, products p, products_description pd where xp.products_id = '249' and xp.xsell_id = p.products_id and p.products_id = pd.products_id and pd.language_id = '1' and p.products_status = '1' order by xp.products_id asc limit 6]", 5 | "topic_id":8, 6 | "format_id":20, 7 | "topic_confidence":0.5785923004, 8 | "format_confidence":0.8214659095 9 | } 10 | ] -------------------------------------------------------------------------------- /website/assets/data/formats.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "domain_id":0, 4 | "domain_name":"Academic
Writing", 5 | "domain_description":"- Examples: a research paper, a paper abstract, a thesis, a literature review", 6 | "color": [0.7066666666666667, 0.9168627450980392, 0.9443137254901961, 1.0] 7 | }, 8 | { 9 | "domain_id":1, 10 | "domain_name":"Content
Listing", 11 | "domain_description":"- The page contains an overview of content and is used for navigation\n- Examples: sitemap, product catalog, search results, news listings with short snippets of articles\n- Note that hyperlinks are not visible from the text content and have to be deduced", 12 | "color": [0.697439446366782, 0.9010995770857362, 0.9458515955401768, 1.0] 13 | }, 14 | { 15 | "domain_id":2, 16 | "domain_name":"Creative
Writing", 17 | "domain_description":"- The page consists of a short story, chapters from a novel, poem or song lyrics", 18 | "color": [0.6594232987312572, 0.8361553248750481, 0.9521876201460977, 1.0] 19 | }, 20 | { 21 | "domain_id":3, 22 | "domain_name":"Support", 23 | "domain_fullname": "Customer Support", 24 | "domain_description":"- Content by an organization and for a general audience\n- Examples: a troubleshooting guide", 25 | "color": [0.7254901960784313, 0.9490196078431372, 0.9411764705882353, 1.0] 26 | }, 27 | { 28 | "domain_id":4, 29 | "domain_name":"Comment
Section", 30 | "domain_description":"- A comment section or discussion forum with multiple posts or comments\n- Examples: Community sites like reddit, comment sections on news article or blogs", 31 | "color": [0.6405997693194925, 0.80399846212995, 0.9553248750480585, 1.0] 32 | }, 33 | { 34 | "domain_id":5, 35 | "domain_name":"FAQ", 36 | "domain_description":"- The page content is in the Frequently Asked Questions format", 37 | "color": [0.7254901960784313, 0.9490196078431372, 0.9411764705882353, 1.0] 38 | }, 39 | { 40 | "domain_id":6, 41 | "domain_name":"Truncated", 42 | "domain_description":"- The page contents are incomplete, e.g., truncated, pay-walled, or require a login\n- If the page has multiple snippets of truncated articles, choose 'Content Listing'\n- Also includes multimedia web pages where the web page text primarily describes and supplements the audiovisual content, e.g., a video description or image gallery", 43 | "color": [0.6786159169550173, 0.8689427143406382, 0.9489888504421377, 1.0] 44 | }, 45 | { 46 | "domain_id":7, 47 | "domain_name":"Knowledge
Article", 48 | "domain_description":"- Written in an objective and neutral style\n- Published on a moderated platform (like Wikipedia) or by a reputable source", 49 | "color": [0.6594232987312572, 0.8361553248750481, 0.9521876201460977, 1.0] 50 | }, 51 | { 52 | "domain_id":8, 53 | "domain_name":"Legal", 54 | "domain_fullname": "Legal Notices", 55 | "domain_description":"- Examples: terms of service, legal disclaimers, privacy policy, license agreement", 56 | "color": [0.7254901960784313, 0.9490196078431372, 0.9411764705882353, 1.0] 57 | }, 58 | { 59 | "domain_id":9, 60 | "domain_name":"Listicle", 61 | "domain_description":"- A blog or article that presents content in the form of a list\n- Examples: Buzzfeed-style articles, ''Top 10'' lists, ''4 best places to visit in X''\n- Lists showing the site contents and facilitate navigation fall under 'Content Listing'", 62 | "color": [0.697439446366782, 0.9010995770857362, 0.9458515955401768, 1.0] 63 | }, 64 | { 65 | "domain_id":10, 66 | "domain_name":"News
Article", 67 | "domain_description":"- Written by journalists on current events and published by news organizations\n- Long reads, profiles, editorials, and journalistic essays fall under 'Nonfiction Writing'\n- Newspaper interviews fall under 'Audio Transcript'", 68 | "color": [0.697439446366782, 0.9010995770857362, 0.9458515955401768, 1.0] 69 | }, 70 | { 71 | "domain_id":11, 72 | "domain_name":"Nonfiction
Writing", 73 | "domain_description":"- Long reads, profiles, editorials, essays, obituaries, memoirs and other forms of nonfiction writing, written by journalists and other professional writers", 74 | "color": [0.697439446366782, 0.9010995770857362, 0.9458515955401768, 1.0] 75 | }, 76 | { 77 | "domain_id":12, 78 | "domain_name":"About
(Org.)", 79 | "domain_fullname": "About Page (Organization)", 80 | "domain_description":"- An organizational ''About Page'', typically containing a self-description or introduction by an organization such as a company, university, government agency, non-profit\n- Note that the content may appear similar to a 'Knowledge Article' in some cases, but is not verified and may contain self-promotion", 81 | "color": [0.7254901960784313, 0.9490196078431372, 0.9411764705882353, 1.0] 82 | }, 83 | { 84 | "domain_id":13, 85 | "domain_name":"News
(Org.)", 86 | "domain_fullname": "News (Organization)", 87 | "domain_description":"- Organizational news and announcements\n- Examples: a press release, a blog post by an organization such as a company, university, government agency, non-profit organization", 88 | "color": [0.7254901960784313, 0.9490196078431372, 0.9411764705882353, 1.0] 89 | }, 90 | { 91 | "domain_id":14, 92 | "domain_name":"About
(Pers.)", 93 | "domain_fullname": "About Page (Personal)", 94 | "domain_description":"- An ''About Page'' on a personal website or hobby website, typically containing a self-description, introduction or profile information", 95 | "color": [0.6313725490196078, 0.788235294117647, 0.9568627450980393, 1.0] 96 | }, 97 | { 98 | "domain_id":15, 99 | "domain_name":"Personal
Blog", 100 | "domain_description":"- Written by an individual typically relating personal experiences and opinions", 101 | "color": [0.6313725490196078, 0.788235294117647, 0.9568627450980393, 1.0] 102 | }, 103 | { 104 | "domain_id":16, 105 | "domain_name":"Product
Page", 106 | "domain_description":"- Typically contains descriptions and promotions for a product or service\n- Also includes products in a wider sense, for example university course descriptions", 107 | "color": [0.7254901960784313, 0.9490196078431372, 0.9411764705882353, 1.0] 108 | }, 109 | { 110 | "domain_id":17, 111 | "domain_name":"Q&A", 112 | "domain_fullname": "Q&A Forum", 113 | "domain_description":"- A user forum with an explicit question & answer format, e.g., Quora, Stack Exchange", 114 | "color": [0.6405997693194925, 0.80399846212995, 0.9553248750480585, 1.0] 115 | }, 116 | { 117 | "domain_id":18, 118 | "domain_name":"Spam
\/ Ads", 119 | "domain_description":"- The page consists primarily of spam content, SEO keyword stuffing, or short online ads for other pages, products or services, or has no apparent purpose", 120 | "color": [0.6786159169550173, 0.8689427143406382, 0.9489888504421377, 1.0] 121 | }, 122 | { 123 | "domain_id":19, 124 | "domain_name":"Structured
Data", 125 | "domain_description":"- Multiple data entries with a common structure\n- Examples: a table, datasheet, movie database, glossary, dictionary, json file, csv, xml", 126 | "color": [0.6786159169550173, 0.8689427143406382, 0.9489888504421377, 1.0] 127 | }, 128 | { 129 | "domain_id":20, 130 | "domain_name": "Docs.", 131 | "domain_fullname": "Documentation", 132 | "domain_description":"- Examples: technical writing, API documentation, README files, source code\n- Unlike 'Customer Support', meant for developers and experts, rather than end-users", 133 | "color": [0.7066666666666667, 0.9168627450980392, 0.9443137254901961, 1.0] 134 | }, 135 | { 136 | "domain_id":21, 137 | "domain_name":"Audio
Transcript", 138 | "domain_description":"- A written record of spoken language\n- Examples: interviews (e.g., in a newspaper), the transcript of a court hearing, movie, podcast, lecture, or speech", 139 | "color": [0.6786159169550173, 0.8689427143406382, 0.9489888504421377, 1.0] 140 | }, 141 | { 142 | "domain_id":22, 143 | "domain_name":"Tutorial", 144 | "domain_description":"- Examples: cooking recipes, DIY instructions, WikiHow page, Khan Academy course\n- The page must contain the actual content of the tutorial \/ how-to guide\n- Guides specific to products\/services from the website fall under 'Customer Support'", 145 | "color": [0.6594232987312572, 0.8361553248750481, 0.9521876201460977, 1.0] 146 | }, 147 | { 148 | "domain_id":23, 149 | "domain_name":"User
Review", 150 | "domain_description":"- Reviews posted by users, e.g., on Yelp, TripAdvisor", 151 | "color": [0.6405997693194925, 0.80399846212995, 0.9553248750480585, 1.0] 152 | } 153 | ] -------------------------------------------------------------------------------- /website/assets/data/topics.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "domain_id":0, 4 | "domain_name":"Adult", 5 | "domain_description":"", 6 | "color": [0.984313725490196, 0.6933333333333334, 0.8172549019607843, 1.0] 7 | }, 8 | { 9 | "domain_id":1, 10 | "domain_name":"Art &
Design", 11 | "domain_description":"- Includes: architecture", 12 | "color": [0.988235294117647, 0.6964705882352942, 0.7403921568627451, 1.0] 13 | }, 14 | { 15 | "domain_id":2, 16 | "domain_name":"Software Dev.", 17 | "domain_fullname": "Software Development", 18 | "domain_description":"- Includes: algorithms, coding, and web development", 19 | "color": [0.996078431372549, 0.7027450980392157, 0.5866666666666667, 1.0] 20 | }, 21 | { 22 | "domain_id":3, 23 | "domain_name":"Crime
& Law", 24 | "domain_description":"- Includes: law enforcement\n- Financial crime and litigation fall under 'Finance & Business'\n- Social issues and the legislative process fall under 'Politics'", 25 | "color": [0.9921568627450981, 0.6996078431372549, 0.6635294117647059, 1.0] 26 | }, 27 | { 28 | "domain_id":4, 29 | "domain_name":"Education
& Jobs", 30 | "domain_description":"- Includes: pedagogy, training & certification, academia\n- Educational pages about a topic, e.g., food or mathematics, fall under that topic", 31 | "color": [0.9921568627450981, 0.6996078431372549, 0.6635294117647059, 1.0] 32 | }, 33 | { 34 | "domain_id":5, 35 | "domain_name":"Hardware", 36 | "domain_description":"- Includes: computer hardware, phones, televisions, other consumer electronics", 37 | "color": [0.996078431372549, 0.7027450980392157, 0.5866666666666667, 1.0] 38 | }, 39 | { 40 | "domain_id":6, 41 | "domain_name":"Entertainment", 42 | "domain_description":"- Includes: music, movies, TV shows, videos, celebrities, humor, nightlife\n- Music or film discussed as art rather than entertainment falls under 'Art & Design'", 43 | "color": [0.984313725490196, 0.6933333333333334, 0.8172549019607843, 1.0] 44 | }, 45 | { 46 | "domain_id":7, 47 | "domain_name":"Social Life", 48 | "domain_description":"- Includes: family, friends, relationships, community\n- Specific social activity (e.g., sports or board games) fall under those topics", 49 | "color": [0.9803921568627451, 0.6901960784313725, 0.8941176470588236, 1.0] 50 | }, 51 | { 52 | "domain_id":8, 53 | "domain_name":"Fashion
& Beauty", 54 | "domain_description":"- Includes: clothing, accessories, cosmetics", 55 | "color": [0.984313725490196, 0.6933333333333334, 0.8172549019607843, 1.0] 56 | }, 57 | { 58 | "domain_id":9, 59 | "domain_name":"Finance &
Business", 60 | "domain_description":"- Includes: taxes, regulations, investments, insurance, credit cards, personal finance, corporate communication, marketing, human resources", 61 | "color": [1.0, 0.7058823529411765, 0.5098039215686274, 1.0] 62 | }, 63 | { 64 | "domain_id":10, 65 | "domain_name":"Food &
Dining", 66 | "domain_description":"- Includes: recipes, groceries, beverages, restaurants\n- Nutritional sciences fall under 'Health'", 67 | "color": [0.984313725490196, 0.6933333333333334, 0.8172549019607843, 1.0] 68 | }, 69 | { 70 | "domain_id":11, 71 | "domain_name":"Games", 72 | "domain_description":"- Includes: video games, board games, gambling", 73 | "color": [0.984313725490196, 0.6933333333333334, 0.8172549019607843, 1.0] 74 | }, 75 | { 76 | "domain_id":12, 77 | "domain_name":"Health", 78 | "domain_description":"- Includes: medicine, wellness, mental health, veterinary science, nutritional science\n- Health insurance falls under 'Finance & Business'", 79 | "color": [0.988235294117647, 0.6964705882352942, 0.7403921568627451, 1.0] 80 | }, 81 | { 82 | "domain_id":13, 83 | "domain_name":"History", 84 | "domain_description":"- Includes: geography, archaeology", 85 | "color": [0.984313725490196, 0.6933333333333334, 0.8172549019607843, 1.0] 86 | }, 87 | { 88 | "domain_id":14, 89 | "domain_name":"Home
& Hobbies", 90 | "domain_description":"- Includes: real estate, renting, relocation, furniture, appliances, home improvement, DIY, gardening, pets, toys, collecting", 91 | "color": [0.984313725490196, 0.6933333333333334, 0.8172549019607843, 1.0] 92 | }, 93 | { 94 | "domain_id":15, 95 | "domain_name":"Industrial", 96 | "domain_description":"- Topics related to mining, agriculture, manufacturing, utilities and construction\n- Includes: raw materials, industrial goods, chemicals, textiles\n- General business topics or business finance fall under 'Finance & Business'", 97 | "color": [1.0, 0.7058823529411765, 0.5098039215686274, 1.0] 98 | }, 99 | { 100 | "domain_id":16, 101 | "domain_name":"Literature", 102 | "domain_description":"- Includes: literary criticism, linguistics, philosophy, related subjects in the humanities\n- Text written in literary style fall under the topic of its contents", 103 | "color": [0.988235294117647, 0.6964705882352942, 0.7403921568627451, 1.0] 104 | }, 105 | { 106 | "domain_id":17, 107 | "domain_name":"Politics", 108 | "domain_description":"- Includes: social issues, political campaigns, the legislative process, geopolitics, protests, activism", 109 | "color": [0.9803921568627451, 0.6901960784313725, 0.8941176470588236, 1.0] 110 | }, 111 | { 112 | "domain_id":18, 113 | "domain_name":"Religion", 114 | "domain_description":"- Includes: spirituality", 115 | "color": [0.9803921568627451, 0.6901960784313725, 0.8941176470588236, 1.0] 116 | }, 117 | { 118 | "domain_id":19, 119 | "domain_name":"Science
& Tech.", 120 | "domain_fullname": "Science & Technology", 121 | "domain_description":"- Includes: physics, chemistry, biology, environmental science, mathematics, statistics, biotech, engineering", 122 | "color": [0.996078431372549, 0.7027450980392157, 0.5866666666666667, 1.0] 123 | }, 124 | { 125 | "domain_id":20, 126 | "domain_name":"Software", 127 | "domain_description":"- Topics related to the use of software and the internet", 128 | "color": [0.9921568627450981, 0.6996078431372549, 0.6635294117647059, 1.0] 129 | }, 130 | { 131 | "domain_id":21, 132 | "domain_name":"Sports &
Fitness", 133 | "domain_description":"- Includes: martial arts, motor sports, outdoor activities, sports equipment", 134 | "color": [0.984313725490196, 0.6933333333333334, 0.8172549019607843, 1.0] 135 | }, 136 | { 137 | "domain_id":22, 138 | "domain_name":"Transportation", 139 | "domain_description":"- Includes: cars and other vehicles, taxis, public transportation, traffic, commuting, aviation, rail, shipping, logistics", 140 | "color": [0.9921568627450981, 0.6996078431372549, 0.6635294117647059, 1.0] 141 | }, 142 | { 143 | "domain_id":23, 144 | "domain_name":"Travel", 145 | "domain_description":"- Includes: hospitality, hotels, sight-seeing, cruises\n- Detailed descriptions of tourist destinations fall under 'History'", 146 | "color": [0.984313725490196, 0.6933333333333334, 0.8172549019607843, 1.0] 147 | } 148 | ] -------------------------------------------------------------------------------- /website/assets/images/ai2_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CodeCreator/WebOrganizer/b3da665635be3ee6c51484509f0fa5699f24d28c/website/assets/images/ai2_logo.png -------------------------------------------------------------------------------- /website/assets/images/icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CodeCreator/WebOrganizer/b3da665635be3ee6c51484509f0fa5699f24d28c/website/assets/images/icon.png -------------------------------------------------------------------------------- /website/assets/images/mixtures_implicit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CodeCreator/WebOrganizer/b3da665635be3ee6c51484509f0fa5699f24d28c/website/assets/images/mixtures_implicit.png -------------------------------------------------------------------------------- /website/assets/images/mixtures_regmix.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/CodeCreator/WebOrganizer/b3da665635be3ee6c51484509f0fa5699f24d28c/website/assets/images/mixtures_regmix.png -------------------------------------------------------------------------------- /website/assets/images/pli_logo.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | -------------------------------------------------------------------------------- /website/assets/images/princeton_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CodeCreator/WebOrganizer/b3da665635be3ee6c51484509f0fa5699f24d28c/website/assets/images/princeton_logo.png -------------------------------------------------------------------------------- /website/assets/images/results_main.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CodeCreator/WebOrganizer/b3da665635be3ee6c51484509f0fa5699f24d28c/website/assets/images/results_main.png -------------------------------------------------------------------------------- /website/assets/images/treemaps.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CodeCreator/WebOrganizer/b3da665635be3ee6c51484509f0fa5699f24d28c/website/assets/images/treemaps.png -------------------------------------------------------------------------------- /website/assets/images/uc_berkeley_logo.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/CodeCreator/WebOrganizer/b3da665635be3ee6c51484509f0fa5699f24d28c/website/assets/images/uc_berkeley_logo.png -------------------------------------------------------------------------------- /website/assets/images/uw_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CodeCreator/WebOrganizer/b3da665635be3ee6c51484509f0fa5699f24d28c/website/assets/images/uw_logo.png --------------------------------------------------------------------------------