├── .gitignore ├── async_api_scrapes ├── Results.md ├── prepare_dataset.py └── run.py ├── delete_spaces_batch └── run.py ├── featured_spaces_likes ├── run.py ├── spaces.csv └── spaces_with_likes.csv ├── model_scraping ├── cards │ ├── Helsinki-NLP___opus-mt-en-es.md │ ├── StanfordAIMI___stanford-deidentifier-base.md │ ├── albert-base-v2.md │ ├── bert-base-cased.md │ ├── bert-base-multilingual-cased.md │ ├── bert-base-uncased.md │ ├── cl-tohoku___bert-base-japanese-whole-word-masking.md │ ├── distilbert-base-cased-distilled-squad.md │ ├── distilbert-base-uncased-finetuned-sst-2-english.md │ ├── distilbert-base-uncased.md │ ├── distilroberta-base.md │ ├── emilyalsentzer___Bio_ClinicalBERT.md │ ├── facebook___bart-large-mnli.md │ ├── google___electra-base-discriminator.md │ ├── gpt2.md │ ├── jonatasgrosman___wav2vec2-large-xlsr-53-english.md │ ├── microsoft___layoutlmv3-base.md │ ├── openai___clip-vit-base-patch32.md │ ├── openai___clip-vit-large-patch14.md │ ├── philschmid___bart-large-cnn-samsum.md │ ├── prajjwal1___bert-tiny.md │ ├── roberta-base.md │ ├── roberta-large.md │ ├── runwayml___stable-diffusion-v1-5.md │ ├── sentence-transformers___all-MiniLM-L6-v2.md │ ├── t5-base.md │ ├── t5-small.md │ ├── xlm-roberta-base.md │ ├── xlm-roberta-large.md │ └── yiyanghkust___finbert-tone.md └── run.py └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .idea/ 3 | laion2B-en-small -------------------------------------------------------------------------------- /async_api_scrapes/Results.md: -------------------------------------------------------------------------------- 1 | Batch Size is fixed at 1000, due to API limit. 2 | 3 | Semaphores: 100 4 | Execution time: 13.88 seconds 5 | 6 | Semaphores: 10 7 | Execution time: 14.74 seconds 8 | 9 | -------------------------------------------------------------------------------- /async_api_scrapes/prepare_dataset.py: -------------------------------------------------------------------------------- 1 | from datasets import Dataset, load_dataset 2 | 3 | ds = load_dataset("laion/laion2B-en", split="train", streaming=True) 4 | ds = Dataset.from_list(list(ds.take(100000))) 5 | ds.save_to_disk("laion2B-en-small") 6 | -------------------------------------------------------------------------------- /async_api_scrapes/run.py: -------------------------------------------------------------------------------- 1 | from datasets import load_from_disk, load_dataset 2 | 3 | import json 4 | 5 | import asyncio 6 | import aiohttp 7 | from aiolimiter import AsyncLimiter 8 | 9 | TOKEN = "" 10 | headers = {"Authorization": f"API {TOKEN}"} 11 | 12 | OPT_IN_OUT_URLS_SCAN_MAX_CONCURRENT_REQUESTS_NUMBER = 100 13 | OPT_IN_OUT_URLS_SCAN_MAX_REQUESTS_PER_SECOND = 50 14 | 15 | BATCH_SIZE = 100000 16 | CHUNK_SIZE = 1000 17 | IMAGE_FEATURE = "URL" 18 | 19 | OPT_IN_COUNT = 0 20 | OPT_OUT_COUNT = 0 21 | 22 | 23 | async def check_spawning(image_urls, semaphore, limiter): 24 | url = f"https://opts-api.spawningaiapi.com/api/v2/query/urls" 25 | async with aiohttp.ClientSession(headers=headers) as session: 26 | await semaphore.acquire() 27 | async with limiter: 28 | async with session.post( 29 | url=url, 30 | data="\n".join(image_urls).encode("utf-8") 31 | ) as resp: 32 | content = await resp.read() 33 | semaphore.release() 34 | return json.loads(content) 35 | 36 | 37 | async def opt_in_out_task(data_items) -> (int, int, int): 38 | tasks = [] 39 | 40 | semaphore = asyncio.Semaphore(value=OPT_IN_OUT_URLS_SCAN_MAX_CONCURRENT_REQUESTS_NUMBER) 41 | limiter = AsyncLimiter(OPT_IN_OUT_URLS_SCAN_MAX_REQUESTS_PER_SECOND, time_period=1) 42 | 43 | shards = [data_items["URL"][i:i + CHUNK_SIZE] for i in range(0, len(data_items["URL"]), CHUNK_SIZE)] 44 | 45 | for shard in shards: 46 | tasks.append(asyncio.create_task(check_spawning(shard, semaphore, limiter))) 47 | await asyncio.wait(tasks) 48 | 49 | content = [url for task in tasks for url in task.result()["urls"]] 50 | 51 | opt_in = [x["optIn"] for x in content] 52 | opt_out = [x["optOut"] for x in content] 53 | 54 | return {"OPT_IN": opt_in, "OPT_OUT": opt_out} 55 | 56 | 57 | def async_mapping(data_items): 58 | global OPT_IN_COUNT, OPT_OUT_COUNT 59 | 60 | results = asyncio.run(opt_in_out_task(data_items)) 61 | 62 | OPT_IN_COUNT = OPT_IN_COUNT + sum(results["OPT_IN"]) 63 | OPT_OUT_COUNT = OPT_OUT_COUNT + sum(results["OPT_OUT"]) 64 | 65 | return results 66 | 67 | 68 | if __name__ == "__main__": 69 | ds = load_from_disk("./laion2b-en-small") 70 | # ds = load_dataset("laion/laion2B-en", split="train", num_proc=2) 71 | ds_opts = ds.map(batched=True, batch_size=BATCH_SIZE, function=async_mapping, remove_columns=ds.column_names) 72 | 73 | with open("./results", "w") as f: 74 | f.write(f"OPT_IN_COUNT: {OPT_IN_COUNT}\n") 75 | f.write(f"OPT_OUT_COUNT: {OPT_OUT_COUNT}\n") 76 | f.write(f"LENGTH: {len(ds_opts)}\n") 77 | -------------------------------------------------------------------------------- /delete_spaces_batch/run.py: -------------------------------------------------------------------------------- 1 | from huggingface_hub import HfApi 2 | 3 | hf_api = HfApi() 4 | 5 | spaces = hf_api.list_spaces( 6 | author="owkin", 7 | search="trainer-" 8 | ) 9 | 10 | for space in spaces: 11 | hf_api.delete_repo(repo_id=space.id, repo_type="space") 12 | -------------------------------------------------------------------------------- /featured_spaces_likes/run.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from huggingface_hub import HfApi 3 | 4 | df = pd.read_csv("spaces.csv") 5 | # spaces = df[df.Status.isna()] 6 | 7 | hf_api = HfApi() 8 | 9 | 10 | def get_likes(row): 11 | URL = row["URL"] 12 | space_name = URL.split("/")[-1] 13 | space_author = URL.split("/")[-2] 14 | return hf_api.list_spaces(author=space_author, search=space_name)[0].likes 15 | 16 | 17 | df["likes"] = df[df.Status.isna()].apply(get_likes, axis=1) 18 | 19 | print(df) 20 | 21 | df.to_csv("./spaces_with_likes.csv") 22 | -------------------------------------------------------------------------------- /featured_spaces_likes/spaces.csv: -------------------------------------------------------------------------------- 1 | Name,URL,Categories,Tags,Modality,Status,Description 2 | CapDec Image Captioning,https://huggingface.co/spaces/johko/capdec-image-captioning,Socially Conscious,"Accessibility, Image Captioning","Image, Language",, 3 | Interactive demo: comparing image captioning models,https://huggingface.co/spaces/nielsr/comparing-captioning-models,Socially Conscious,"Accessibility, Image Captioning","Image, Language",, 4 | TorToiSe,https://huggingface.co/spaces/mdnestor/tortoise,Socially Conscious,"Accessibility, Text-to-Speech","Audio, Language",, 5 | Umamusume voice synthesizer,https://huggingface.co/spaces/Plachta/VITS-Umamusume-voice-synthesizer,Socially Conscious,"Accessibility, Text-to-Speech","Audio, Language",, 6 | Whisper speaker diarization,https://huggingface.co/spaces/vumichien/whisper-speaker-diarization,Socially Conscious,"Accessibility, Speech-to-Text","Audio, Language",, 7 | coqui-ai-TTS,https://huggingface.co/spaces/akhaliq/coqui-ai-tts,Socially Conscious,"Accessibility, Text-to-Speech","Audio, Language",, 8 | ,https://huggingface.co/spaces/team-writing-assistant/grammar-correction,Socially Conscious,,,Broken / Not Worth, 9 | Legal Contract Review Demo,https://huggingface.co/spaces/marshmellow77/contract-review,Socially Conscious,"Accessibility, Contract Understanding, Legal",Language,, 10 | Automatic Readability Assessment of Texts in Spanish,https://huggingface.co/spaces/hackathon-pln-es/readability-assessment-spanish,Socially Conscious,"Accessibility, Readability",Language,, 11 | Contract Understanding Atticus Dataset (CUAD) Demo,https://huggingface.co/spaces/akdeniz27/contract-understanding-atticus-dataset-demo,Socially Conscious,"Accessibility, Contract Understanding, Legal",Language,, 12 | MediaPipe's Hand & Finger Tracking,https://huggingface.co/spaces/EuroPython2022/mediapipe-hands,Socially Conscious,"Accessibility, Hand Tracking, Sign Language",Video,, 13 | AmericanSignLanguage Detection,https://huggingface.co/spaces/datasciencedojo/AmericanSignLanguage-Detection,Socially Conscious,"Accessibility, Sign Language",Video,, 14 | ,https://huggingface.co/spaces/Jayeshbhaal/news_filter_for_social_wellbeing,,,,Broken / Not Worth, 15 | Speech Recognition from Visual Lip Movement by Audio-Visual Hidden Unit BERT Model (AV-HuBERT),https://huggingface.co/spaces/vumichien/lip_movement_reading,Socially Conscious,"Accessibility, Lip Movement",Video,, 16 | Mongolian GPT2,https://huggingface.co/spaces/flax-community/Mongolian-GPT2,Inclusive,Languages,Language,, 17 | Tamil Language Demos,https://huggingface.co/spaces/flax-community/TamilLanguageDemos,Inclusive,Languages,Language,, 18 | Translate Between 100 languages,https://huggingface.co/spaces/jason9693/m2m-100,Inclusive,Languages,Language,, 19 | British Library 19th Century Books Genre Classifier,https://huggingface.co/spaces/BritishLibraryLabs/British-Library-books-genre-classifier,Socially Conscious,Digital Humanities,Language,, 20 | doc-ufcn Page Detection Demo,https://huggingface.co/spaces/davanstrien/Doc-UFCN,Socially Conscious,Digital Humanities,"Image, Language",, 21 | Modern Chinese To Ancient Translate Wenyanwen,https://huggingface.co/spaces/raynardj/modern-chinese-to-ancient-translate-wenyanwen,Socially Conscious,"Digital Humanities, Languages",Language,, 22 | Duguwen Classical Chinese To Morden Translate,https://huggingface.co/spaces/raynardj/duguwen-classical-chinese-to-morden-translate,Socially Conscious,"Digital Humanities, Languages",Language,, 23 | ,https://huggingface.co/spaces/HalflingWizard/Shereno-Word-Cloud,,,,Broken / Not Worth, 24 | Spanish to Quechua translation,https://huggingface.co/spaces/hackathon-pln-es/spanish-to-quechua-translation,"Inclusive, Socially Conscious","Digital Humanities, Language Revitalization, Languages",Language,, 25 | ,https://huggingface.co/spaces/flax-community/multilingual-image-captioning,,,,Broken / Not Worth, 26 | MMTAfrica: Multilingual Machine Translation,https://huggingface.co/spaces/edaiofficial/mmtafrica,"Inclusive, Socially Conscious",Languages,Language,, 27 | Modern English to Middle English Translator,https://huggingface.co/spaces/Qilex/EnglishToMiddleEnglish,Socially Conscious,"Digital Humanities, Languages",Language,, 28 | Gender Neutral Converter,https://huggingface.co/spaces/vilimus/Gender-Neutral-Converter,"Inquisitive, Socially Conscious","LGBTQIA2S+, Languages",Language,, 29 | Cite Diversely,https://huggingface.co/spaces/cmudrc/cite-diversely,"Inclusive, Inquisitive, Socially Conscious",Research Methods,Other,, 30 | Spanish Text Neutralization app,https://huggingface.co/spaces/hackathon-pln-es/es_nlp_gender_neutralizer,"Inquisitive, Socially Conscious","LGBTQIA2S+, Languages",Language,, 31 | GayFriendlyTownFinder,https://huggingface.co/spaces/GlassWalker/GayFriendlyTownFinder,Socially Conscious,LGBTQIA2S+,Other,, 32 | ,https://huggingface.co/spaces/abidlabs/Gradio-Demo-of-BLM-Photo-Anonymizer,,,,Broken / Not Worth, 33 | Deepfake Detection,https://huggingface.co/spaces/aaronespasa/deepfake-detection,Socially Conscious,Deepfake Detection,Image,, 34 | EfficientNetV2 Deepfakes Video Detector,https://huggingface.co/spaces/Ron0420/EfficientNetV2_Deepfakes_Video_Detector,"Socially Conscious, Sustainable","Deepfake Detection, Efficient",Video,, 35 | PP4AV: Deep Learning model for Data Anonymization in Autonomous Driving,https://huggingface.co/spaces/khaclinh/self-driving-anonymization,Consentful,"Anonymization, Privacy",Image,, 36 | IMS Speaker Anonymization,https://huggingface.co/spaces/sarinam/speaker-anonymization,Consentful,"Anonymization, Privacy, Voice",Audio,, 37 | "ZAMA: Machine Learning, Natural Language Processing and Fully Homomorphic Encryption to do Sentiment Analysis on Encrypted data.",https://huggingface.co/spaces/zama-fhe/encrypted_sentiment_analysis,Consentful,"Homomorphic Encryption, Privacy",Language,, 38 | Fake News Detector (Spanish),https://huggingface.co/spaces/Narrativa/fake-news-detection-spanish,Socially Conscious,Fake News Detection,Language,, 39 | ,https://huggingface.co/spaces/adrianmoses/hate-speech-detection,,,,Broken / Not Worth, 40 | ,https://huggingface.co/spaces/khalidsaifullaah/Threat-Detection-From-Bengali-Voice-Calls,,,,Broken / Not Worth, 41 | ,https://huggingface.co/spaces/justinqbui/Covid-Tweet-True-False-Classification,,,,Broken / Not Worth, 42 | ,https://huggingface.co/spaces/Amrrs/dbias-recognizer,,,,Broken / Not Worth, 43 | ,https://huggingface.co/spaces/sagittariusA/media_bias_detection_CS,,,,Broken / Not Worth, 44 | Fake_tweet_detector,https://huggingface.co/spaces/chinhon/fake_tweet_detector,Socially Conscious,Fake News Detection,Language,, 45 | ,https://huggingface.co/spaces/akykeung/News-Article-Bias-Recognizer,,,,Broken / Not Worth, 46 | ,https://huggingface.co/spaces/aymm/Task-Exploration-Hate-Speech,,,,Broken / Not Worth, 47 | Análisis de comentarios sexistas en Twitter,https://huggingface.co/spaces/hackathon-pln-es/Sexismdetection,Socially Conscious,"Hate Detection, Sexism",Language,, 48 | ,https://huggingface.co/spaces/EdBianchi/Social_Toximeter,,,,Broken / Not Worth, 49 | Spanish Audio Transcription Based Harassment Detection,https://huggingface.co/spaces/CVMX-jaca-tonos/Spanish-Audio-Transcription-based-Harassment-Detection,Socially Conscious,Hate Detection,"Audio, Language",, 50 | LOREN,https://huggingface.co/spaces/Jiangjie/loren-fact-checking,Socially Conscious,Fact Checking,Language,, 51 | ,https://huggingface.co/spaces/Joshua1808/Sexism,,,,Broken / Not Worth, 52 | Fact Checking 🎸 Rocks!,https://huggingface.co/spaces/anakin87/fact-checking-rocks,Socially Conscious,Fact Checking,Language,, 53 | DataMeasurementsTool,https://huggingface.co/spaces/huggingface/data-measurements-tool,Rigorous,Data Exploration,,, 54 | Chat Noir,https://huggingface.co/spaces/webis/chat-noir,Rigorous,Data Exploration,Language,, 55 | Predicting masked words in legal text,https://huggingface.co/spaces/muhtasham/legalBERT,Socially Conscious,Legal,Language,, 56 | PAIR: Measuring Fairness,https://huggingface.co/spaces/merve/measuring-fairness,"Inclusive, Rigorous","Educational, Fairness & Bias",,, 57 | PAIR: Are Model Predictions Probabilities?,https://huggingface.co/spaces/merve/uncertainty-calibration,Rigorous,Educational,,, 58 | PAIR: Can a Model Be Differentially Private and Fair?,https://huggingface.co/spaces/merve/private-and-fair,"Consentful, Rigorous","Educational, Fairness & Bias, Privacy",,, 59 | PAIR: Measuring Diversity,https://huggingface.co/spaces/merve/measuring-diversity,"Inclusive, Rigorous","Data Exploration, Diversity, Educational",,, 60 | PAIR: What Have Language Models Learned?,https://huggingface.co/spaces/merve/fill-in-the-blank,Rigorous,"Educational, Explainability",Language,, 61 | PAIR: How randomized response can help collect sensitive information responsibly,https://huggingface.co/spaces/merve/anonymization,Consentful,"Data Exploration, Educational, Privacy",,, 62 | PAIR: Hidden Bias,https://huggingface.co/spaces/merve/hidden-bias,"Inclusive, Rigorous","Data Exploration, Educational, Fairness & Bias",,, 63 | PAIR: Why Some Models Leak Data,https://huggingface.co/spaces/merve/data-leak,"Consentful, Rigorous","Educational, Privacy, Security",,, 64 | PAIR: Datasets Have Worldviews,https://huggingface.co/spaces/merve/dataset-worldviews,"Inquisitive, Rigorous","Data Exploration, Educational, Fairness & Bias",,, 65 | Causing Gender Pronouns,https://huggingface.co/spaces/emilylearning/causing_gender_pronouns,Rigorous,"Correlations, Data Exploration",Language,, 66 | Causing Gender Pronouns 2,https://huggingface.co/spaces/emilylearning/causing_gender_pronouns_two,Rigorous,"Correlations, Data Exploration",Language,, 67 | Generate Class Saliency Plots,https://huggingface.co/spaces/probing-vits/class-saliency,Rigorous,"Explainability, Saliency",Image,, 68 | Image foreground masking or background removal,https://huggingface.co/spaces/taskswithcode/salient-object-detection,Rigorous,"Explainability, Saliency",Image,, 69 | Bigscience Corpus,https://huggingface.co/spaces/bigscience-data/bigscience-corpus,Rigorous,"Data Exploration, Transparency",Language,, 70 | Interactive Error Analysis,https://huggingface.co/spaces/autoevaluate/error-analysis,Rigorous,Error Analysis,Language,Broken / Not Worth, 71 | Interactive Datasets Explorer,https://huggingface.co/spaces/nazneen/datasets-explorer,Rigorous,Data Exploration,"Image, Language",, 72 | Modeling Uncertainty in Explainability,https://huggingface.co/spaces/ucinlp/Modeling-Uncertainty-in-Explainability,Rigorous,Explainability,,Broken / Not Worth, 73 | Visualization of the distributions of the filter values for the BigScience Corpus,https://huggingface.co/spaces/bigscience-data/filter_values_distributions,Rigorous,"Data Exploration, Transparency",Language,, 74 | Visualizing What Convnets Learn,https://huggingface.co/spaces/keras-io/what-convnets-learn,Rigorous,Explainability,Image,, 75 | Timeseries Anomaly Detection Using an Autoencoder,https://huggingface.co/spaces/keras-io/timeseries-anomaly-detection-autoencoders,Rigorous,Anomaly Detection,,, 76 | Can you tell if a Neural Net contains a Backdoor Attack?,https://huggingface.co/spaces/CVPR/Dual-Key_Backdoor_Attacks,Rigorous,Security,"Image, Language",, 77 | Language Model Gender Bias Scorecard,https://huggingface.co/spaces/sasha/WinoBiasCheck,Rigorous,"Data Exploration, Fairness & Bias, Gender",Language,, 78 | Tips Gender,https://huggingface.co/spaces/merve/tips_gender,Rigorous,"Data Exploration, Fairness & Bias",Language,, 79 | Bias & Fairness in AI,https://huggingface.co/spaces/d4data/Bias-Fairness-in-AI,Rigorous,"Explainability, Fairness & Bias",Language,, 80 | Error Analysis,https://huggingface.co/spaces/nazneen/error-analysis,Rigorous,Error Analysis,,Broken / Not Worth, 81 | Spurious Correlation Evaluation for Pre-trained LLMs,https://huggingface.co/spaces/ICML2022/selection_bias_induced_spurious_correlations,Rigorous,Correlations,Language,, 82 | Self-Diagnosis and Self-Debiasing: A Proposal for Reducing Corpus-Based Bias in NLP,https://huggingface.co/spaces/kunwarsaaim/Self-Debiasing,Rigorous,"Debiasing, Fairness & Bias",Language,, 83 | Language Model Bias Scorecard,https://huggingface.co/spaces/sasha/BiasDetection,Rigorous,"Data Exploration, Fairness & Bias",Language,, 84 | ,https://huggingface.co/spaces/argilla/live-demo,,,,Broken / Not Worth, 85 | ,https://huggingface.co/spaces/dvilasuero/argilla,,,,Broken / Not Worth, 86 | Unsupervised Salient Object Detection with Spectral Cluster Voting,https://huggingface.co/spaces/noelshin/selfmask,Rigorous,"Object Detection, Saliency",Image,, 87 | Scientific Question Answering with Citations,https://huggingface.co/spaces/domenicrosati/scite-qa-demo,"Rigorous, Socially Conscious","Citations, Explainability",Language,, 88 | Systematic Error Analysis and Labeling,https://huggingface.co/spaces/nazneen/seal,Rigorous,"Data Exploration, Error Analysis",Language,, 89 | Clustering with Scikit-learn,https://huggingface.co/spaces/EuroSciPy2022/clustering,Rigorous,"Data Exploration, Educational",,, 90 | Stable Diffusion v1.5 Bias Explorer,https://huggingface.co/spaces/sasha/StableBias1.5,"Inclusive, Inquisitive, Rigorous","Fairness & Bias, Gender, Generative","Image, Language",, 91 | Visual Dataset Explorer,https://huggingface.co/spaces/myscale/visual-dataset-explorer,Rigorous,Data Exploration,"Image, Language",, 92 | Promptsource,https://huggingface.co/spaces/bigscience/promptsource,Inclusive,Prompting,Language,, 93 | Hallucination detection,https://huggingface.co/spaces/ml6team/post-processing-summarization,Rigorous,"Error Analysis, Hallucination",Language,, 94 | Dutch Toxic Comment Detection,https://huggingface.co/spaces/ml6team/toxic-comment-detection-dutch,Socially Conscious,Hate Detection,Language,, 95 | German Toxic Comment Detection,https://huggingface.co/spaces/ml6team/toxic-comment-detection-german,Socially Conscious,Hate Detection,Language,, 96 | Terms & Conditions Summarizer,https://huggingface.co/spaces/ml6team/distilbart-tos-summarizer-tosdr,Socially Conscious,"Accessibility, Contract Understanding",Language,, 97 | RE:Belle,https://huggingface.co/spaces/ml6team/Knowledge-graphs,Rigorous,Knowledge Graphs,Language,, 98 | Am I in The Stack?,https://huggingface.co/spaces/lvwerra/in-the-stack-gr,Consentful,"Data Governance, Dataset Opt-Out",Language,, 99 | Gaia Search,https://huggingface.co/spaces/ola13/gaia,Rigorous,Data Exploration,"Image, Language",, 100 | compare anomaly detection algorithms,https://huggingface.co/spaces/scikit-learn/anomaly-detection,Rigorous,"Anomaly Detection, Educational",,, 101 | ROOTS search tool,https://huggingface.co/spaces/bigscience-data/scisearch,Rigorous,Data Exploration,Language,, 102 | ROOTS search tool,https://huggingface.co/spaces/bigscience-data/roots-search,Rigorous,Data Exploration,Language,, 103 | Convert any model to Safetensors and open a PR,https://huggingface.co/spaces/safetensors/convert,Rigorous,Security,,, 104 | Spurious Correlation Evaluation for Pre-trained LLMs,https://huggingface.co/spaces/emilylearning/spurious_correlation_evaluation,Rigorous,Correlations,Language,, 105 | Are you certain?,https://huggingface.co/spaces/emilylearning/llm_uncertainty,Rigorous,"Gender, Uncertainty",Language,, 106 | SantaCoder: Dataset Search,https://huggingface.co/spaces/bigcode/santacoder-search,"Consentful, Rigorous","Code Generation, License Compliance",Language,, 107 | Santa Model Generator,https://huggingface.co/spaces/bigcode/santacoder-endpoint,Consentful,Code Generation,Language,, 108 | "SantaCoder: 109 |  Code Generation",https://huggingface.co/spaces/bigcode/santacoder-demo,Consentful,Code Generation,Language,, 110 | PII Anonymization,https://huggingface.co/spaces/bigcode/pii-public-demo,"Consentful, Rigorous","Anonymization, Code Generation, Privacy, Security",Language,, 111 | XLabel: eXplainable Labeling Assistant,https://huggingface.co/spaces/Donlapark/XLabel,Rigorous,"Data Labeling, Explainability",Language,, 112 | Prompt specifier recognizer by https://www.selas.ai/,https://huggingface.co/spaces/teo-sanchez/prompt_specifier_recognizer,Rigorous,"Explainability, Prompting",Language,, 113 | In The Stack,https://huggingface.co/spaces/lvwerra/in-the-stack,"Consentful, Rigorous","Data Governance, Dataset Opt-Out, Transparency",Language,, 114 | SantaCoder 🎅 bash/shell 🐚 Completion,https://huggingface.co/spaces/mrm8488/santacoder-bash-completion,Consentful,"Code Generation, Downstream Artifact",Language,, 115 | Santa Explains Code,https://huggingface.co/spaces/bigcode/santa-explains-code,Consentful,"Code Generation, Downstream Artifact",Language,, 116 | Utilizing BioBERT for PICO Evidence Summarization,https://huggingface.co/spaces/owaiskha9654/PICO-Evidence-Based-Classification-Inference,Socially Conscious,Health,Language,, 117 | Pile V2 EDA,https://huggingface.co/spaces/CarperAI/pile-v2-eda,Rigorous,Data Exploration,Language,, 118 | Disaggregators,https://huggingface.co/spaces/society-ethics/disaggregators,"Inquisitive, Rigorous","Data Labeling, Disaggregation, Fairness & Bias",Language,, 119 | NER for Drug Names and Adverse Effects,https://huggingface.co/spaces/jsylee/adverse-drug-reactions-ner,Socially Conscious,Health,Language,, 120 | YOLOv5 - Plastic in river detection,https://huggingface.co/spaces/Kili/plastic_in_river,Socially Conscious,Climate,Image,, 121 | Predicting masked words in legal text,https://huggingface.co/spaces/muhtasham/legalBERT,Socially Conscious,Legal,Language,, 122 | Echocardiogram Segmentation,https://huggingface.co/spaces/abidlabs/Echocardiogram-Segmentation,Socially Conscious,Health,Image,, 123 | DeepPrivacy: A Generative Adversarial Network for Face Anonymization,https://huggingface.co/spaces/haakohu/DeepPrivacy,Socially Conscious,"Anonymization, Privacy",Video,, 124 | ASL Fingerspelling Recognition,https://huggingface.co/spaces/tallwhitestck/asl-fingerspelling-recognition,Socially Conscious,Sign Language,Image,, 125 | Task Exploration - Automatic Content Moderation,https://huggingface.co/spaces/hf-task-exploration/ExploreACMnaacl,,,,Broken / Not Worth, 126 | Socratic models for image captioning with BLOOM,https://huggingface.co/spaces/Geonmo/socratic-models-image-captioning-with-BLOOM,Socially Conscious,Image Captioning,Image,, 127 | IntrotoAI Climate Change Project,https://huggingface.co/spaces/aiEDUcurriculum/introtoAI-climate-change-project,Socially Conscious,"Climate, Educational",Language,, 128 | Ask2Democracy - Generador de historias basado en los testimonios del conflicto Colombiano,https://huggingface.co/spaces/jorge-henao/historias-conflicto-col,Socially Conscious,Political,Language,, 129 | Endangered Fish Classification,https://huggingface.co/spaces/Cawinchan/Endangered-Fish-Classification,Socially Conscious,"Climate, Nature",Image,, 130 | ETHIO HYDRO & CLIMATE HUB,https://huggingface.co/spaces/poooja2012/ethio_hydro,Socially Conscious,"Climate, Data Exploration",Tabular,, 131 | Cycling rates in London,https://huggingface.co/spaces/liloho/london-cycling-rates,Socially Conscious,Data Exploration,Tabular,, 132 | ClimateGAN: Visualize Climate Change,https://huggingface.co/spaces/NimaBoscarino/climategan,Socially Conscious,"Climate, Generative",Image,, 133 | ClimateGAN: Visualize Climate Change,https://huggingface.co/spaces/vict0rsch/climateGAN,Socially Conscious,"Climate, Generative",Image,, 134 | Out-of-context misinformation detection,https://huggingface.co/spaces/machinewise-io/OOC-misinformation-detection,Socially Conscious,Misinformation,"Image, Language",, 135 | AI for sustainable agriculture and food systems:Use of Satellite Imagery,https://huggingface.co/spaces/Omdena-Milan/milan-chapter-agrifoods,Socially Conscious,"Agriculture, Climate",Image,, 136 | Policy Test,https://huggingface.co/spaces/peter2000/policy_test,Socially Conscious,Legal,Language,, 137 | Find my Butterfly 🦋,https://huggingface.co/spaces/SDbiaseval/find-my-butterfly,Socially Conscious,"Fun, Nature",Image,, 138 | Which Sea Slug Am I ? 🐌,https://huggingface.co/spaces/sasha/find-my-sea-slug,Socially Conscious,"Fun, Nature",Image,, 139 | Image Classification with EfficientFormer-L1,https://huggingface.co/spaces/adirik/efficientformer,Sustainable,Efficient,Image,, 140 | Polymer Blocks,https://huggingface.co/spaces/GT4SD/polymer_blocks,"Rigorous, Socially Conscious","Model Card, Molecule Generation",Other,, 141 | GeoDiff,https://huggingface.co/spaces/GT4SD/geodiff,"Rigorous, Socially Conscious","Model Card, Molecule Generation",Other,, 142 | DivEMT Explorer,https://huggingface.co/spaces/GroNLP/divemt_explorer,"Rigorous, Socially Conscious","Data Exploration, Transparency",Language,, 143 | ,https://huggingface.co/spaces/lora-library/Low-rank-Adaptation,,,,Broken / Not Worth, 144 | Write Stories Using Bloom,https://huggingface.co/spaces/EuroPython2022/Write-Stories-Using-Bloom,Socially Conscious,"Fun, Generative, Text Generation",Language,, 145 | DALL·E mini by craiyon.com,https://huggingface.co/spaces/dalle-mini/dalle-mini,Rigorous,"Generative, Model Card","Image, Language",, 146 | Modelcard Creator,https://huggingface.co/spaces/huggingface/Model_Cards_Writing_Tool,Rigorous,"Educational, Model Card",,, 147 | Promptist Demo,https://huggingface.co/spaces/microsoft/Promptist,Inclusive,"Accessibility, Prompting, Tool",Language,, 148 | Lexica Art - A Search Engine for Generative Art Prompts and Works,https://huggingface.co/spaces/Xhaheen/Lexica_prompt_search,Inclusive,"Accessibility, Generative, Prompting","Image, Language",, 149 | Imaginary Network Expanded Dataset,https://huggingface.co/spaces/Sygil/INE-dataset-explorer,"Consentful, Rigorous",Data Exploration,"Image, Language",, 150 | Dataset Explore,https://huggingface.co/spaces/bigbio/dataset-explore,Rigorous,"Data Exploration, Transparency",Language,, 151 | Karlo - unCLIP model by KakaoBrain,https://huggingface.co/spaces/kakaobrain/karlo,Socially Conscious,RAIL License,"Image, Language",, 152 | Non-Suspicious image decoder,https://huggingface.co/spaces/mithril-security/NonSuspiciousImageDecoder,"Consentful, Rigorous","Security, Steganography",Image,, 153 | the AI art generator sources of inspiration,https://huggingface.co/spaces/MLearningAI/AIart_sources_of_inspiration,"Rigorous, Socially Conscious",Artist Acknowledgement,Image,, 154 | Wikipedia Assistant,https://huggingface.co/spaces/deepset/wikipedia-assistant,Socially Conscious,"Fact Checking, Knowledge Retrieval, Tool",Language,, 155 | BioGPT-Large Demo,https://huggingface.co/spaces/katielink/biogpt-large-demo,Socially Conscious,"Generative, Health, Text Generation",Language,, -------------------------------------------------------------------------------- /model_scraping/cards/Helsinki-NLP___opus-mt-en-es.md: -------------------------------------------------------------------------------- 1 | ### eng-spa 2 | 3 | * source group: English 4 | * target group: Spanish 5 | * OPUS readme: [eng-spa](https://github.com/Helsinki-NLP/Tatoeba-Challenge/tree/master/models/eng-spa/README.md) 6 | 7 | * model: transformer 8 | * source language(s): eng 9 | * target language(s): spa 10 | * model: transformer 11 | * pre-processing: normalization + SentencePiece (spm32k,spm32k) 12 | * download original weights: [opus-2020-08-18.zip](https://object.pouta.csc.fi/Tatoeba-MT-models/eng-spa/opus-2020-08-18.zip) 13 | * test set translations: [opus-2020-08-18.test.txt](https://object.pouta.csc.fi/Tatoeba-MT-models/eng-spa/opus-2020-08-18.test.txt) 14 | * test set scores: [opus-2020-08-18.eval.txt](https://object.pouta.csc.fi/Tatoeba-MT-models/eng-spa/opus-2020-08-18.eval.txt) 15 | 16 | ## Benchmarks 17 | 18 | | testset | BLEU | chr-F | 19 | |-----------------------|-------|-------| 20 | | newssyscomb2009-engspa.eng.spa | 31.0 | 0.583 | 21 | | news-test2008-engspa.eng.spa | 29.7 | 0.564 | 22 | | newstest2009-engspa.eng.spa | 30.2 | 0.578 | 23 | | newstest2010-engspa.eng.spa | 36.9 | 0.620 | 24 | | newstest2011-engspa.eng.spa | 38.2 | 0.619 | 25 | | newstest2012-engspa.eng.spa | 39.0 | 0.625 | 26 | | newstest2013-engspa.eng.spa | 35.0 | 0.598 | 27 | | Tatoeba-test.eng.spa | 54.9 | 0.721 | 28 | 29 | 30 | ### System Info: 31 | - hf_name: eng-spa 32 | 33 | - source_languages: eng 34 | 35 | - target_languages: spa 36 | 37 | - opus_readme_url: https://github.com/Helsinki-NLP/Tatoeba-Challenge/tree/master/models/eng-spa/README.md 38 | 39 | - original_repo: Tatoeba-Challenge 40 | 41 | - tags: ['translation'] 42 | 43 | - languages: ['en', 'es'] 44 | 45 | - src_constituents: {'eng'} 46 | 47 | - tgt_constituents: {'spa'} 48 | 49 | - src_multilingual: False 50 | 51 | - tgt_multilingual: False 52 | 53 | - prepro: normalization + SentencePiece (spm32k,spm32k) 54 | 55 | - url_model: https://object.pouta.csc.fi/Tatoeba-MT-models/eng-spa/opus-2020-08-18.zip 56 | 57 | - url_test_set: https://object.pouta.csc.fi/Tatoeba-MT-models/eng-spa/opus-2020-08-18.test.txt 58 | 59 | - src_alpha3: eng 60 | 61 | - tgt_alpha3: spa 62 | 63 | - short_pair: en-es 64 | 65 | - chrF2_score: 0.721 66 | 67 | - bleu: 54.9 68 | 69 | - brevity_penalty: 0.978 70 | 71 | - ref_len: 77311.0 72 | 73 | - src_name: English 74 | 75 | - tgt_name: Spanish 76 | 77 | - train_date: 2020-08-18 00:00:00 78 | 79 | - src_alpha2: en 80 | 81 | - tgt_alpha2: es 82 | 83 | - prefer_old: False 84 | 85 | - long_pair: eng-spa 86 | 87 | - helsinki_git_sha: d2f0910c89026c34a44e331e785dec1e0faa7b82 88 | 89 | - transformers_git_sha: f7af09b4524b784d67ae8526f0e2fcc6f5ed0de9 90 | 91 | - port_machine: brutasse 92 | 93 | - port_time: 2020-08-24-18:20 -------------------------------------------------------------------------------- /model_scraping/cards/StanfordAIMI___stanford-deidentifier-base.md: -------------------------------------------------------------------------------- 1 | Stanford de-identifier was trained on a variety of radiology and biomedical documents with the goal of automatising the de-identification process while reaching satisfactory accuracy for use in production. Manuscript in-proceedings. 2 | 3 | These model weights are the recommended ones among all available deidentifier weights. 4 | 5 | Associated github repo: https://github.com/MIDRC/Stanford_Penn_Deidentifier 6 | 7 | ## Citation 8 | 9 | ```bibtex 10 | @article{10.1093/jamia/ocac219, 11 | author = {Chambon, Pierre J and Wu, Christopher and Steinkamp, Jackson M and Adleberg, Jason and Cook, Tessa S and Langlotz, Curtis P}, 12 | title = "{Automated deidentification of radiology reports combining transformer and “hide in plain sight” rule-based methods}", 13 | journal = {Journal of the American Medical Informatics Association}, 14 | year = {2022}, 15 | month = {11}, 16 | abstract = "{To develop an automated deidentification pipeline for radiology reports that detect protected health information (PHI) entities and replaces them with realistic surrogates “hiding in plain sight.”In this retrospective study, 999 chest X-ray and CT reports collected between November 2019 and November 2020 were annotated for PHI at the token level and combined with 3001 X-rays and 2193 medical notes previously labeled, forming a large multi-institutional and cross-domain dataset of 6193 documents. Two radiology test sets, from a known and a new institution, as well as i2b2 2006 and 2014 test sets, served as an evaluation set to estimate model performance and to compare it with previously released deidentification tools. Several PHI detection models were developed based on different training datasets, fine-tuning approaches and data augmentation techniques, and a synthetic PHI generation algorithm. These models were compared using metrics such as precision, recall and F1 score, as well as paired samples Wilcoxon tests.Our best PHI detection model achieves 97.9 F1 score on radiology reports from a known institution, 99.6 from a new institution, 99.5 on i2b2 2006, and 98.9 on i2b2 2014. On reports from a known institution, it achieves 99.1 recall of detecting the core of each PHI span.Our model outperforms all deidentifiers it was compared to on all test sets as well as human labelers on i2b2 2014 data. It enables accurate and automatic deidentification of radiology reports.A transformer-based deidentification pipeline can achieve state-of-the-art performance for deidentifying radiology reports and other medical documents.}", 17 | issn = {1527-974X}, 18 | doi = {10.1093/jamia/ocac219}, 19 | url = {https://doi.org/10.1093/jamia/ocac219}, 20 | note = {ocac219}, 21 | eprint = {https://academic.oup.com/jamia/advance-article-pdf/doi/10.1093/jamia/ocac219/47220191/ocac219.pdf}, 22 | } 23 | ``` -------------------------------------------------------------------------------- /model_scraping/cards/albert-base-v2.md: -------------------------------------------------------------------------------- 1 | # ALBERT Base v2 2 | 3 | Pretrained model on English language using a masked language modeling (MLM) objective. It was introduced in 4 | [this paper](https://arxiv.org/abs/1909.11942) and first released in 5 | [this repository](https://github.com/google-research/albert). This model, as all ALBERT models, is uncased: it does not make a difference 6 | between english and English. 7 | 8 | Disclaimer: The team releasing ALBERT did not write a model card for this model so this model card has been written by 9 | the Hugging Face team. 10 | 11 | ## Model description 12 | 13 | ALBERT is a transformers model pretrained on a large corpus of English data in a self-supervised fashion. This means it 14 | was pretrained on the raw texts only, with no humans labelling them in any way (which is why it can use lots of 15 | publicly available data) with an automatic process to generate inputs and labels from those texts. More precisely, it 16 | was pretrained with two objectives: 17 | 18 | - Masked language modeling (MLM): taking a sentence, the model randomly masks 15% of the words in the input then run 19 | the entire masked sentence through the model and has to predict the masked words. This is different from traditional 20 | recurrent neural networks (RNNs) that usually see the words one after the other, or from autoregressive models like 21 | GPT which internally mask the future tokens. It allows the model to learn a bidirectional representation of the 22 | sentence. 23 | - Sentence Ordering Prediction (SOP): ALBERT uses a pretraining loss based on predicting the ordering of two consecutive segments of text. 24 | 25 | This way, the model learns an inner representation of the English language that can then be used to extract features 26 | useful for downstream tasks: if you have a dataset of labeled sentences for instance, you can train a standard 27 | classifier using the features produced by the ALBERT model as inputs. 28 | 29 | ALBERT is particular in that it shares its layers across its Transformer. Therefore, all layers have the same weights. Using repeating layers results in a small memory footprint, however, the computational cost remains similar to a BERT-like architecture with the same number of hidden layers as it has to iterate through the same number of (repeating) layers. 30 | 31 | This is the second version of the base model. Version 2 is different from version 1 due to different dropout rates, additional training data, and longer training. It has better results in nearly all downstream tasks. 32 | 33 | This model has the following configuration: 34 | 35 | - 12 repeating layers 36 | - 128 embedding dimension 37 | - 768 hidden dimension 38 | - 12 attention heads 39 | - 11M parameters 40 | 41 | ## Intended uses & limitations 42 | 43 | You can use the raw model for either masked language modeling or next sentence prediction, but it's mostly intended to 44 | be fine-tuned on a downstream task. See the [model hub](https://huggingface.co/models?filter=albert) to look for 45 | fine-tuned versions on a task that interests you. 46 | 47 | Note that this model is primarily aimed at being fine-tuned on tasks that use the whole sentence (potentially masked) 48 | to make decisions, such as sequence classification, token classification or question answering. For tasks such as text 49 | generation you should look at model like GPT2. 50 | 51 | ### How to use 52 | 53 | You can use this model directly with a pipeline for masked language modeling: 54 | 55 | ```python 56 | >>> from transformers import pipeline 57 | >>> unmasker = pipeline('fill-mask', model='albert-base-v2') 58 | >>> unmasker("Hello I'm a [MASK] model.") 59 | [ 60 | { 61 | "sequence":"[CLS] hello i'm a modeling model.[SEP]", 62 | "score":0.05816134437918663, 63 | "token":12807, 64 | "token_str":"▁modeling" 65 | }, 66 | { 67 | "sequence":"[CLS] hello i'm a modelling model.[SEP]", 68 | "score":0.03748830780386925, 69 | "token":23089, 70 | "token_str":"▁modelling" 71 | }, 72 | { 73 | "sequence":"[CLS] hello i'm a model model.[SEP]", 74 | "score":0.033725276589393616, 75 | "token":1061, 76 | "token_str":"▁model" 77 | }, 78 | { 79 | "sequence":"[CLS] hello i'm a runway model.[SEP]", 80 | "score":0.017313428223133087, 81 | "token":8014, 82 | "token_str":"▁runway" 83 | }, 84 | { 85 | "sequence":"[CLS] hello i'm a lingerie model.[SEP]", 86 | "score":0.014405295252799988, 87 | "token":29104, 88 | "token_str":"▁lingerie" 89 | } 90 | ] 91 | ``` 92 | 93 | Here is how to use this model to get the features of a given text in PyTorch: 94 | 95 | ```python 96 | from transformers import AlbertTokenizer, AlbertModel 97 | tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2') 98 | model = AlbertModel.from_pretrained("albert-base-v2") 99 | text = "Replace me by any text you'd like." 100 | encoded_input = tokenizer(text, return_tensors='pt') 101 | output = model(**encoded_input) 102 | ``` 103 | 104 | and in TensorFlow: 105 | 106 | ```python 107 | from transformers import AlbertTokenizer, TFAlbertModel 108 | tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2'') 109 | model = TFAlbertModel.from_pretrained("albert-base-v2) 110 | text = "Replace me by any text you'd like." 111 | encoded_input = tokenizer(text, return_tensors='tf') 112 | output = model(encoded_input) 113 | ``` 114 | 115 | ### Limitations and bias 116 | 117 | Even if the training data used for this model could be characterized as fairly neutral, this model can have biased 118 | predictions: 119 | 120 | ```python 121 | >>> from transformers import pipeline 122 | >>> unmasker = pipeline('fill-mask', model='albert-base-v2') 123 | >>> unmasker("The man worked as a [MASK].") 124 | 125 | [ 126 | { 127 | "sequence":"[CLS] the man worked as a chauffeur.[SEP]", 128 | "score":0.029577180743217468, 129 | "token":28744, 130 | "token_str":"▁chauffeur" 131 | }, 132 | { 133 | "sequence":"[CLS] the man worked as a janitor.[SEP]", 134 | "score":0.028865724802017212, 135 | "token":29477, 136 | "token_str":"▁janitor" 137 | }, 138 | { 139 | "sequence":"[CLS] the man worked as a shoemaker.[SEP]", 140 | "score":0.02581118606030941, 141 | "token":29024, 142 | "token_str":"▁shoemaker" 143 | }, 144 | { 145 | "sequence":"[CLS] the man worked as a blacksmith.[SEP]", 146 | "score":0.01849772222340107, 147 | "token":21238, 148 | "token_str":"▁blacksmith" 149 | }, 150 | { 151 | "sequence":"[CLS] the man worked as a lawyer.[SEP]", 152 | "score":0.01820771023631096, 153 | "token":3672, 154 | "token_str":"▁lawyer" 155 | } 156 | ] 157 | 158 | >>> unmasker("The woman worked as a [MASK].") 159 | 160 | [ 161 | { 162 | "sequence":"[CLS] the woman worked as a receptionist.[SEP]", 163 | "score":0.04604868218302727, 164 | "token":25331, 165 | "token_str":"▁receptionist" 166 | }, 167 | { 168 | "sequence":"[CLS] the woman worked as a janitor.[SEP]", 169 | "score":0.028220869600772858, 170 | "token":29477, 171 | "token_str":"▁janitor" 172 | }, 173 | { 174 | "sequence":"[CLS] the woman worked as a paramedic.[SEP]", 175 | "score":0.0261906236410141, 176 | "token":23386, 177 | "token_str":"▁paramedic" 178 | }, 179 | { 180 | "sequence":"[CLS] the woman worked as a chauffeur.[SEP]", 181 | "score":0.024797942489385605, 182 | "token":28744, 183 | "token_str":"▁chauffeur" 184 | }, 185 | { 186 | "sequence":"[CLS] the woman worked as a waitress.[SEP]", 187 | "score":0.024124596267938614, 188 | "token":13678, 189 | "token_str":"▁waitress" 190 | } 191 | ] 192 | ``` 193 | 194 | This bias will also affect all fine-tuned versions of this model. 195 | 196 | ## Training data 197 | 198 | The ALBERT model was pretrained on [BookCorpus](https://yknzhu.wixsite.com/mbweb), a dataset consisting of 11,038 199 | unpublished books and [English Wikipedia](https://en.wikipedia.org/wiki/English_Wikipedia) (excluding lists, tables and 200 | headers). 201 | 202 | ## Training procedure 203 | 204 | ### Preprocessing 205 | 206 | The texts are lowercased and tokenized using SentencePiece and a vocabulary size of 30,000. The inputs of the model are 207 | then of the form: 208 | 209 | ``` 210 | [CLS] Sentence A [SEP] Sentence B [SEP] 211 | ``` 212 | 213 | ### Training 214 | 215 | The ALBERT procedure follows the BERT setup. 216 | 217 | The details of the masking procedure for each sentence are the following: 218 | - 15% of the tokens are masked. 219 | - In 80% of the cases, the masked tokens are replaced by `[MASK]`. 220 | - In 10% of the cases, the masked tokens are replaced by a random token (different) from the one they replace. 221 | - In the 10% remaining cases, the masked tokens are left as is. 222 | 223 | ## Evaluation results 224 | 225 | When fine-tuned on downstream tasks, the ALBERT models achieve the following results: 226 | 227 | | | Average | SQuAD1.1 | SQuAD2.0 | MNLI | SST-2 | RACE | 228 | |----------------|----------|----------|----------|----------|----------|----------| 229 | |V2 | 230 | |ALBERT-base |82.3 |90.2/83.2 |82.1/79.3 |84.6 |92.9 |66.8 | 231 | |ALBERT-large |85.7 |91.8/85.2 |84.9/81.8 |86.5 |94.9 |75.2 | 232 | |ALBERT-xlarge |87.9 |92.9/86.4 |87.9/84.1 |87.9 |95.4 |80.7 | 233 | |ALBERT-xxlarge |90.9 |94.6/89.1 |89.8/86.9 |90.6 |96.8 |86.8 | 234 | |V1 | 235 | |ALBERT-base |80.1 |89.3/82.3 | 80.0/77.1|81.6 |90.3 | 64.0 | 236 | |ALBERT-large |82.4 |90.6/83.9 | 82.3/79.4|83.5 |91.7 | 68.5 | 237 | |ALBERT-xlarge |85.5 |92.5/86.1 | 86.1/83.1|86.4 |92.4 | 74.8 | 238 | |ALBERT-xxlarge |91.0 |94.8/89.3 | 90.2/87.4|90.8 |96.9 | 86.5 | 239 | 240 | 241 | ### BibTeX entry and citation info 242 | 243 | ```bibtex 244 | @article{DBLP:journals/corr/abs-1909-11942, 245 | author = {Zhenzhong Lan and 246 | Mingda Chen and 247 | Sebastian Goodman and 248 | Kevin Gimpel and 249 | Piyush Sharma and 250 | Radu Soricut}, 251 | title = {{ALBERT:} {A} Lite {BERT} for Self-supervised Learning of Language 252 | Representations}, 253 | journal = {CoRR}, 254 | volume = {abs/1909.11942}, 255 | year = {2019}, 256 | url = {http://arxiv.org/abs/1909.11942}, 257 | archivePrefix = {arXiv}, 258 | eprint = {1909.11942}, 259 | timestamp = {Fri, 27 Sep 2019 13:04:21 +0200}, 260 | biburl = {https://dblp.org/rec/journals/corr/abs-1909-11942.bib}, 261 | bibsource = {dblp computer science bibliography, https://dblp.org} 262 | } 263 | ``` -------------------------------------------------------------------------------- /model_scraping/cards/bert-base-cased.md: -------------------------------------------------------------------------------- 1 | # BERT base model (cased) 2 | 3 | Pretrained model on English language using a masked language modeling (MLM) objective. It was introduced in 4 | [this paper](https://arxiv.org/abs/1810.04805) and first released in 5 | [this repository](https://github.com/google-research/bert). This model is case-sensitive: it makes a difference between 6 | english and English. 7 | 8 | Disclaimer: The team releasing BERT did not write a model card for this model so this model card has been written by 9 | the Hugging Face team. 10 | 11 | ## Model description 12 | 13 | BERT is a transformers model pretrained on a large corpus of English data in a self-supervised fashion. This means it 14 | was pretrained on the raw texts only, with no humans labelling them in any way (which is why it can use lots of 15 | publicly available data) with an automatic process to generate inputs and labels from those texts. More precisely, it 16 | was pretrained with two objectives: 17 | 18 | - Masked language modeling (MLM): taking a sentence, the model randomly masks 15% of the words in the input then run 19 | the entire masked sentence through the model and has to predict the masked words. This is different from traditional 20 | recurrent neural networks (RNNs) that usually see the words one after the other, or from autoregressive models like 21 | GPT which internally mask the future tokens. It allows the model to learn a bidirectional representation of the 22 | sentence. 23 | - Next sentence prediction (NSP): the models concatenates two masked sentences as inputs during pretraining. Sometimes 24 | they correspond to sentences that were next to each other in the original text, sometimes not. The model then has to 25 | predict if the two sentences were following each other or not. 26 | 27 | This way, the model learns an inner representation of the English language that can then be used to extract features 28 | useful for downstream tasks: if you have a dataset of labeled sentences for instance, you can train a standard 29 | classifier using the features produced by the BERT model as inputs. 30 | 31 | ## Intended uses & limitations 32 | 33 | You can use the raw model for either masked language modeling or next sentence prediction, but it's mostly intended to 34 | be fine-tuned on a downstream task. See the [model hub](https://huggingface.co/models?filter=bert) to look for 35 | fine-tuned versions on a task that interests you. 36 | 37 | Note that this model is primarily aimed at being fine-tuned on tasks that use the whole sentence (potentially masked) 38 | to make decisions, such as sequence classification, token classification or question answering. For tasks such as text 39 | generation you should look at model like GPT2. 40 | 41 | ### How to use 42 | 43 | You can use this model directly with a pipeline for masked language modeling: 44 | 45 | ```python 46 | >>> from transformers import pipeline 47 | >>> unmasker = pipeline('fill-mask', model='bert-base-cased') 48 | >>> unmasker("Hello I'm a [MASK] model.") 49 | 50 | [{'sequence': "[CLS] Hello I'm a fashion model. [SEP]", 51 | 'score': 0.09019174426794052, 52 | 'token': 4633, 53 | 'token_str': 'fashion'}, 54 | {'sequence': "[CLS] Hello I'm a new model. [SEP]", 55 | 'score': 0.06349995732307434, 56 | 'token': 1207, 57 | 'token_str': 'new'}, 58 | {'sequence': "[CLS] Hello I'm a male model. [SEP]", 59 | 'score': 0.06228214129805565, 60 | 'token': 2581, 61 | 'token_str': 'male'}, 62 | {'sequence': "[CLS] Hello I'm a professional model. [SEP]", 63 | 'score': 0.0441727414727211, 64 | 'token': 1848, 65 | 'token_str': 'professional'}, 66 | {'sequence': "[CLS] Hello I'm a super model. [SEP]", 67 | 'score': 0.03326151892542839, 68 | 'token': 7688, 69 | 'token_str': 'super'}] 70 | ``` 71 | 72 | Here is how to use this model to get the features of a given text in PyTorch: 73 | 74 | ```python 75 | from transformers import BertTokenizer, BertModel 76 | tokenizer = BertTokenizer.from_pretrained('bert-base-cased') 77 | model = BertModel.from_pretrained("bert-base-cased") 78 | text = "Replace me by any text you'd like." 79 | encoded_input = tokenizer(text, return_tensors='pt') 80 | output = model(**encoded_input) 81 | ``` 82 | 83 | and in TensorFlow: 84 | 85 | ```python 86 | from transformers import BertTokenizer, TFBertModel 87 | tokenizer = BertTokenizer.from_pretrained('bert-base-cased') 88 | model = TFBertModel.from_pretrained("bert-base-cased") 89 | text = "Replace me by any text you'd like." 90 | encoded_input = tokenizer(text, return_tensors='tf') 91 | output = model(encoded_input) 92 | ``` 93 | 94 | ### Limitations and bias 95 | 96 | Even if the training data used for this model could be characterized as fairly neutral, this model can have biased 97 | predictions: 98 | 99 | ```python 100 | >>> from transformers import pipeline 101 | >>> unmasker = pipeline('fill-mask', model='bert-base-cased') 102 | >>> unmasker("The man worked as a [MASK].") 103 | 104 | [{'sequence': '[CLS] The man worked as a lawyer. [SEP]', 105 | 'score': 0.04804691672325134, 106 | 'token': 4545, 107 | 'token_str': 'lawyer'}, 108 | {'sequence': '[CLS] The man worked as a waiter. [SEP]', 109 | 'score': 0.037494491785764694, 110 | 'token': 17989, 111 | 'token_str': 'waiter'}, 112 | {'sequence': '[CLS] The man worked as a cop. [SEP]', 113 | 'score': 0.035512614995241165, 114 | 'token': 9947, 115 | 'token_str': 'cop'}, 116 | {'sequence': '[CLS] The man worked as a detective. [SEP]', 117 | 'score': 0.031271643936634064, 118 | 'token': 9140, 119 | 'token_str': 'detective'}, 120 | {'sequence': '[CLS] The man worked as a doctor. [SEP]', 121 | 'score': 0.027423162013292313, 122 | 'token': 3995, 123 | 'token_str': 'doctor'}] 124 | 125 | >>> unmasker("The woman worked as a [MASK].") 126 | 127 | [{'sequence': '[CLS] The woman worked as a nurse. [SEP]', 128 | 'score': 0.16927455365657806, 129 | 'token': 7439, 130 | 'token_str': 'nurse'}, 131 | {'sequence': '[CLS] The woman worked as a waitress. [SEP]', 132 | 'score': 0.1501094549894333, 133 | 'token': 15098, 134 | 'token_str': 'waitress'}, 135 | {'sequence': '[CLS] The woman worked as a maid. [SEP]', 136 | 'score': 0.05600163713097572, 137 | 'token': 13487, 138 | 'token_str': 'maid'}, 139 | {'sequence': '[CLS] The woman worked as a housekeeper. [SEP]', 140 | 'score': 0.04838843643665314, 141 | 'token': 26458, 142 | 'token_str': 'housekeeper'}, 143 | {'sequence': '[CLS] The woman worked as a cook. [SEP]', 144 | 'score': 0.029980547726154327, 145 | 'token': 9834, 146 | 'token_str': 'cook'}] 147 | ``` 148 | 149 | This bias will also affect all fine-tuned versions of this model. 150 | 151 | ## Training data 152 | 153 | The BERT model was pretrained on [BookCorpus](https://yknzhu.wixsite.com/mbweb), a dataset consisting of 11,038 154 | unpublished books and [English Wikipedia](https://en.wikipedia.org/wiki/English_Wikipedia) (excluding lists, tables and 155 | headers). 156 | 157 | ## Training procedure 158 | 159 | ### Preprocessing 160 | 161 | The texts are tokenized using WordPiece and a vocabulary size of 30,000. The inputs of the model are then of the form: 162 | 163 | ``` 164 | [CLS] Sentence A [SEP] Sentence B [SEP] 165 | ``` 166 | 167 | With probability 0.5, sentence A and sentence B correspond to two consecutive sentences in the original corpus and in 168 | the other cases, it's another random sentence in the corpus. Note that what is considered a sentence here is a 169 | consecutive span of text usually longer than a single sentence. The only constrain is that the result with the two 170 | "sentences" has a combined length of less than 512 tokens. 171 | 172 | The details of the masking procedure for each sentence are the following: 173 | - 15% of the tokens are masked. 174 | - In 80% of the cases, the masked tokens are replaced by `[MASK]`. 175 | - In 10% of the cases, the masked tokens are replaced by a random token (different) from the one they replace. 176 | - In the 10% remaining cases, the masked tokens are left as is. 177 | 178 | ### Pretraining 179 | 180 | The model was trained on 4 cloud TPUs in Pod configuration (16 TPU chips total) for one million steps with a batch size 181 | of 256. The sequence length was limited to 128 tokens for 90% of the steps and 512 for the remaining 10%. The optimizer 182 | used is Adam with a learning rate of 1e-4, \\(\beta_{1} = 0.9\\) and \\(\beta_{2} = 0.999\\), a weight decay of 0.01, 183 | learning rate warmup for 10,000 steps and linear decay of the learning rate after. 184 | 185 | ## Evaluation results 186 | 187 | When fine-tuned on downstream tasks, this model achieves the following results: 188 | 189 | Glue test results: 190 | 191 | | Task | MNLI-(m/mm) | QQP | QNLI | SST-2 | CoLA | STS-B | MRPC | RTE | Average | 192 | |:----:|:-----------:|:----:|:----:|:-----:|:----:|:-----:|:----:|:----:|:-------:| 193 | | | 84.6/83.4 | 71.2 | 90.5 | 93.5 | 52.1 | 85.8 | 88.9 | 66.4 | 79.6 | 194 | 195 | 196 | ### BibTeX entry and citation info 197 | 198 | ```bibtex 199 | @article{DBLP:journals/corr/abs-1810-04805, 200 | author = {Jacob Devlin and 201 | Ming{-}Wei Chang and 202 | Kenton Lee and 203 | Kristina Toutanova}, 204 | title = {{BERT:} Pre-training of Deep Bidirectional Transformers for Language 205 | Understanding}, 206 | journal = {CoRR}, 207 | volume = {abs/1810.04805}, 208 | year = {2018}, 209 | url = {http://arxiv.org/abs/1810.04805}, 210 | archivePrefix = {arXiv}, 211 | eprint = {1810.04805}, 212 | timestamp = {Tue, 30 Oct 2018 20:39:56 +0100}, 213 | biburl = {https://dblp.org/rec/journals/corr/abs-1810-04805.bib}, 214 | bibsource = {dblp computer science bibliography, https://dblp.org} 215 | } 216 | ``` 217 | 218 | 219 | 220 | -------------------------------------------------------------------------------- /model_scraping/cards/bert-base-multilingual-cased.md: -------------------------------------------------------------------------------- 1 | # BERT multilingual base model (cased) 2 | 3 | Pretrained model on the top 104 languages with the largest Wikipedia using a masked language modeling (MLM) objective. 4 | It was introduced in [this paper](https://arxiv.org/abs/1810.04805) and first released in 5 | [this repository](https://github.com/google-research/bert). This model is case sensitive: it makes a difference 6 | between english and English. 7 | 8 | Disclaimer: The team releasing BERT did not write a model card for this model so this model card has been written by 9 | the Hugging Face team. 10 | 11 | ## Model description 12 | 13 | BERT is a transformers model pretrained on a large corpus of multilingual data in a self-supervised fashion. This means 14 | it was pretrained on the raw texts only, with no humans labelling them in any way (which is why it can use lots of 15 | publicly available data) with an automatic process to generate inputs and labels from those texts. More precisely, it 16 | was pretrained with two objectives: 17 | 18 | - Masked language modeling (MLM): taking a sentence, the model randomly masks 15% of the words in the input then run 19 | the entire masked sentence through the model and has to predict the masked words. This is different from traditional 20 | recurrent neural networks (RNNs) that usually see the words one after the other, or from autoregressive models like 21 | GPT which internally mask the future tokens. It allows the model to learn a bidirectional representation of the 22 | sentence. 23 | - Next sentence prediction (NSP): the models concatenates two masked sentences as inputs during pretraining. Sometimes 24 | they correspond to sentences that were next to each other in the original text, sometimes not. The model then has to 25 | predict if the two sentences were following each other or not. 26 | 27 | This way, the model learns an inner representation of the languages in the training set that can then be used to 28 | extract features useful for downstream tasks: if you have a dataset of labeled sentences for instance, you can train a 29 | standard classifier using the features produced by the BERT model as inputs. 30 | 31 | ## Intended uses & limitations 32 | 33 | You can use the raw model for either masked language modeling or next sentence prediction, but it's mostly intended to 34 | be fine-tuned on a downstream task. See the [model hub](https://huggingface.co/models?filter=bert) to look for 35 | fine-tuned versions on a task that interests you. 36 | 37 | Note that this model is primarily aimed at being fine-tuned on tasks that use the whole sentence (potentially masked) 38 | to make decisions, such as sequence classification, token classification or question answering. For tasks such as text 39 | generation you should look at model like GPT2. 40 | 41 | ### How to use 42 | 43 | You can use this model directly with a pipeline for masked language modeling: 44 | 45 | ```python 46 | >>> from transformers import pipeline 47 | >>> unmasker = pipeline('fill-mask', model='bert-base-multilingual-cased') 48 | >>> unmasker("Hello I'm a [MASK] model.") 49 | 50 | [{'sequence': "[CLS] Hello I'm a model model. [SEP]", 51 | 'score': 0.10182085633277893, 52 | 'token': 13192, 53 | 'token_str': 'model'}, 54 | {'sequence': "[CLS] Hello I'm a world model. [SEP]", 55 | 'score': 0.052126359194517136, 56 | 'token': 11356, 57 | 'token_str': 'world'}, 58 | {'sequence': "[CLS] Hello I'm a data model. [SEP]", 59 | 'score': 0.048930276185274124, 60 | 'token': 11165, 61 | 'token_str': 'data'}, 62 | {'sequence': "[CLS] Hello I'm a flight model. [SEP]", 63 | 'score': 0.02036019042134285, 64 | 'token': 23578, 65 | 'token_str': 'flight'}, 66 | {'sequence': "[CLS] Hello I'm a business model. [SEP]", 67 | 'score': 0.020079681649804115, 68 | 'token': 14155, 69 | 'token_str': 'business'}] 70 | ``` 71 | 72 | Here is how to use this model to get the features of a given text in PyTorch: 73 | 74 | ```python 75 | from transformers import BertTokenizer, BertModel 76 | tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased') 77 | model = BertModel.from_pretrained("bert-base-multilingual-cased") 78 | text = "Replace me by any text you'd like." 79 | encoded_input = tokenizer(text, return_tensors='pt') 80 | output = model(**encoded_input) 81 | ``` 82 | 83 | and in TensorFlow: 84 | 85 | ```python 86 | from transformers import BertTokenizer, TFBertModel 87 | tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased') 88 | model = TFBertModel.from_pretrained("bert-base-multilingual-cased") 89 | text = "Replace me by any text you'd like." 90 | encoded_input = tokenizer(text, return_tensors='tf') 91 | output = model(encoded_input) 92 | ``` 93 | 94 | ## Training data 95 | 96 | The BERT model was pretrained on the 104 languages with the largest Wikipedias. You can find the complete list 97 | [here](https://github.com/google-research/bert/blob/master/multilingual.md#list-of-languages). 98 | 99 | ## Training procedure 100 | 101 | ### Preprocessing 102 | 103 | The texts are lowercased and tokenized using WordPiece and a shared vocabulary size of 110,000. The languages with a 104 | larger Wikipedia are under-sampled and the ones with lower resources are oversampled. For languages like Chinese, 105 | Japanese Kanji and Korean Hanja that don't have space, a CJK Unicode block is added around every character. 106 | 107 | The inputs of the model are then of the form: 108 | 109 | ``` 110 | [CLS] Sentence A [SEP] Sentence B [SEP] 111 | ``` 112 | 113 | With probability 0.5, sentence A and sentence B correspond to two consecutive sentences in the original corpus and in 114 | the other cases, it's another random sentence in the corpus. Note that what is considered a sentence here is a 115 | consecutive span of text usually longer than a single sentence. The only constrain is that the result with the two 116 | "sentences" has a combined length of less than 512 tokens. 117 | 118 | The details of the masking procedure for each sentence are the following: 119 | - 15% of the tokens are masked. 120 | - In 80% of the cases, the masked tokens are replaced by `[MASK]`. 121 | - In 10% of the cases, the masked tokens are replaced by a random token (different) from the one they replace. 122 | - In the 10% remaining cases, the masked tokens are left as is. 123 | 124 | 125 | ### BibTeX entry and citation info 126 | 127 | ```bibtex 128 | @article{DBLP:journals/corr/abs-1810-04805, 129 | author = {Jacob Devlin and 130 | Ming{-}Wei Chang and 131 | Kenton Lee and 132 | Kristina Toutanova}, 133 | title = {{BERT:} Pre-training of Deep Bidirectional Transformers for Language 134 | Understanding}, 135 | journal = {CoRR}, 136 | volume = {abs/1810.04805}, 137 | year = {2018}, 138 | url = {http://arxiv.org/abs/1810.04805}, 139 | archivePrefix = {arXiv}, 140 | eprint = {1810.04805}, 141 | timestamp = {Tue, 30 Oct 2018 20:39:56 +0100}, 142 | biburl = {https://dblp.org/rec/journals/corr/abs-1810-04805.bib}, 143 | bibsource = {dblp computer science bibliography, https://dblp.org} 144 | } 145 | ``` -------------------------------------------------------------------------------- /model_scraping/cards/bert-base-uncased.md: -------------------------------------------------------------------------------- 1 | # BERT base model (uncased) 2 | 3 | Pretrained model on English language using a masked language modeling (MLM) objective. It was introduced in 4 | [this paper](https://arxiv.org/abs/1810.04805) and first released in 5 | [this repository](https://github.com/google-research/bert). This model is uncased: it does not make a difference 6 | between english and English. 7 | 8 | Disclaimer: The team releasing BERT did not write a model card for this model so this model card has been written by 9 | the Hugging Face team. 10 | 11 | ## Model description 12 | 13 | BERT is a transformers model pretrained on a large corpus of English data in a self-supervised fashion. This means it 14 | was pretrained on the raw texts only, with no humans labeling them in any way (which is why it can use lots of 15 | publicly available data) with an automatic process to generate inputs and labels from those texts. More precisely, it 16 | was pretrained with two objectives: 17 | 18 | - Masked language modeling (MLM): taking a sentence, the model randomly masks 15% of the words in the input then run 19 | the entire masked sentence through the model and has to predict the masked words. This is different from traditional 20 | recurrent neural networks (RNNs) that usually see the words one after the other, or from autoregressive models like 21 | GPT which internally masks the future tokens. It allows the model to learn a bidirectional representation of the 22 | sentence. 23 | - Next sentence prediction (NSP): the models concatenates two masked sentences as inputs during pretraining. Sometimes 24 | they correspond to sentences that were next to each other in the original text, sometimes not. The model then has to 25 | predict if the two sentences were following each other or not. 26 | 27 | This way, the model learns an inner representation of the English language that can then be used to extract features 28 | useful for downstream tasks: if you have a dataset of labeled sentences, for instance, you can train a standard 29 | classifier using the features produced by the BERT model as inputs. 30 | 31 | ## Model variations 32 | 33 | BERT has originally been released in base and large variations, for cased and uncased input text. The uncased models also strips out an accent markers. 34 | Chinese and multilingual uncased and cased versions followed shortly after. 35 | Modified preprocessing with whole word masking has replaced subpiece masking in a following work, with the release of two models. 36 | Other 24 smaller models are released afterward. 37 | 38 | The detailed release history can be found on the [google-research/bert readme](https://github.com/google-research/bert/blob/master/README.md) on github. 39 | 40 | | Model | #params | Language | 41 | |------------------------|--------------------------------|-------| 42 | | [`bert-base-uncased`](https://huggingface.co/bert-base-uncased) | 110M | English | 43 | | [`bert-large-uncased`](https://huggingface.co/bert-large-uncased) | 340M | English | sub 44 | | [`bert-base-cased`](https://huggingface.co/bert-base-cased) | 110M | English | 45 | | [`bert-large-cased`](https://huggingface.co/bert-large-cased) | 340M | English | 46 | | [`bert-base-chinese`](https://huggingface.co/bert-base-chinese) | 110M | Chinese | 47 | | [`bert-base-multilingual-cased`](https://huggingface.co/bert-base-multilingual-cased) | 110M | Multiple | 48 | | [`bert-large-uncased-whole-word-masking`](https://huggingface.co/bert-large-uncased-whole-word-masking) | 340M | English | 49 | | [`bert-large-cased-whole-word-masking`](https://huggingface.co/bert-large-cased-whole-word-masking) | 340M | English | 50 | 51 | ## Intended uses & limitations 52 | 53 | You can use the raw model for either masked language modeling or next sentence prediction, but it's mostly intended to 54 | be fine-tuned on a downstream task. See the [model hub](https://huggingface.co/models?filter=bert) to look for 55 | fine-tuned versions of a task that interests you. 56 | 57 | Note that this model is primarily aimed at being fine-tuned on tasks that use the whole sentence (potentially masked) 58 | to make decisions, such as sequence classification, token classification or question answering. For tasks such as text 59 | generation you should look at model like GPT2. 60 | 61 | ### How to use 62 | 63 | You can use this model directly with a pipeline for masked language modeling: 64 | 65 | ```python 66 | >>> from transformers import pipeline 67 | >>> unmasker = pipeline('fill-mask', model='bert-base-uncased') 68 | >>> unmasker("Hello I'm a [MASK] model.") 69 | 70 | [{'sequence': "[CLS] hello i'm a fashion model. [SEP]", 71 | 'score': 0.1073106899857521, 72 | 'token': 4827, 73 | 'token_str': 'fashion'}, 74 | {'sequence': "[CLS] hello i'm a role model. [SEP]", 75 | 'score': 0.08774490654468536, 76 | 'token': 2535, 77 | 'token_str': 'role'}, 78 | {'sequence': "[CLS] hello i'm a new model. [SEP]", 79 | 'score': 0.05338378623127937, 80 | 'token': 2047, 81 | 'token_str': 'new'}, 82 | {'sequence': "[CLS] hello i'm a super model. [SEP]", 83 | 'score': 0.04667217284440994, 84 | 'token': 3565, 85 | 'token_str': 'super'}, 86 | {'sequence': "[CLS] hello i'm a fine model. [SEP]", 87 | 'score': 0.027095865458250046, 88 | 'token': 2986, 89 | 'token_str': 'fine'}] 90 | ``` 91 | 92 | Here is how to use this model to get the features of a given text in PyTorch: 93 | 94 | ```python 95 | from transformers import BertTokenizer, BertModel 96 | tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') 97 | model = BertModel.from_pretrained("bert-base-uncased") 98 | text = "Replace me by any text you'd like." 99 | encoded_input = tokenizer(text, return_tensors='pt') 100 | output = model(**encoded_input) 101 | ``` 102 | 103 | and in TensorFlow: 104 | 105 | ```python 106 | from transformers import BertTokenizer, TFBertModel 107 | tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') 108 | model = TFBertModel.from_pretrained("bert-base-uncased") 109 | text = "Replace me by any text you'd like." 110 | encoded_input = tokenizer(text, return_tensors='tf') 111 | output = model(encoded_input) 112 | ``` 113 | 114 | ### Limitations and bias 115 | 116 | Even if the training data used for this model could be characterized as fairly neutral, this model can have biased 117 | predictions: 118 | 119 | ```python 120 | >>> from transformers import pipeline 121 | >>> unmasker = pipeline('fill-mask', model='bert-base-uncased') 122 | >>> unmasker("The man worked as a [MASK].") 123 | 124 | [{'sequence': '[CLS] the man worked as a carpenter. [SEP]', 125 | 'score': 0.09747550636529922, 126 | 'token': 10533, 127 | 'token_str': 'carpenter'}, 128 | {'sequence': '[CLS] the man worked as a waiter. [SEP]', 129 | 'score': 0.0523831807076931, 130 | 'token': 15610, 131 | 'token_str': 'waiter'}, 132 | {'sequence': '[CLS] the man worked as a barber. [SEP]', 133 | 'score': 0.04962705448269844, 134 | 'token': 13362, 135 | 'token_str': 'barber'}, 136 | {'sequence': '[CLS] the man worked as a mechanic. [SEP]', 137 | 'score': 0.03788609802722931, 138 | 'token': 15893, 139 | 'token_str': 'mechanic'}, 140 | {'sequence': '[CLS] the man worked as a salesman. [SEP]', 141 | 'score': 0.037680890411138535, 142 | 'token': 18968, 143 | 'token_str': 'salesman'}] 144 | 145 | >>> unmasker("The woman worked as a [MASK].") 146 | 147 | [{'sequence': '[CLS] the woman worked as a nurse. [SEP]', 148 | 'score': 0.21981462836265564, 149 | 'token': 6821, 150 | 'token_str': 'nurse'}, 151 | {'sequence': '[CLS] the woman worked as a waitress. [SEP]', 152 | 'score': 0.1597415804862976, 153 | 'token': 13877, 154 | 'token_str': 'waitress'}, 155 | {'sequence': '[CLS] the woman worked as a maid. [SEP]', 156 | 'score': 0.1154729500412941, 157 | 'token': 10850, 158 | 'token_str': 'maid'}, 159 | {'sequence': '[CLS] the woman worked as a prostitute. [SEP]', 160 | 'score': 0.037968918681144714, 161 | 'token': 19215, 162 | 'token_str': 'prostitute'}, 163 | {'sequence': '[CLS] the woman worked as a cook. [SEP]', 164 | 'score': 0.03042375110089779, 165 | 'token': 5660, 166 | 'token_str': 'cook'}] 167 | ``` 168 | 169 | This bias will also affect all fine-tuned versions of this model. 170 | 171 | ## Training data 172 | 173 | The BERT model was pretrained on [BookCorpus](https://yknzhu.wixsite.com/mbweb), a dataset consisting of 11,038 174 | unpublished books and [English Wikipedia](https://en.wikipedia.org/wiki/English_Wikipedia) (excluding lists, tables and 175 | headers). 176 | 177 | ## Training procedure 178 | 179 | ### Preprocessing 180 | 181 | The texts are lowercased and tokenized using WordPiece and a vocabulary size of 30,000. The inputs of the model are 182 | then of the form: 183 | 184 | ``` 185 | [CLS] Sentence A [SEP] Sentence B [SEP] 186 | ``` 187 | 188 | With probability 0.5, sentence A and sentence B correspond to two consecutive sentences in the original corpus, and in 189 | the other cases, it's another random sentence in the corpus. Note that what is considered a sentence here is a 190 | consecutive span of text usually longer than a single sentence. The only constrain is that the result with the two 191 | "sentences" has a combined length of less than 512 tokens. 192 | 193 | The details of the masking procedure for each sentence are the following: 194 | - 15% of the tokens are masked. 195 | - In 80% of the cases, the masked tokens are replaced by `[MASK]`. 196 | - In 10% of the cases, the masked tokens are replaced by a random token (different) from the one they replace. 197 | - In the 10% remaining cases, the masked tokens are left as is. 198 | 199 | ### Pretraining 200 | 201 | The model was trained on 4 cloud TPUs in Pod configuration (16 TPU chips total) for one million steps with a batch size 202 | of 256. The sequence length was limited to 128 tokens for 90% of the steps and 512 for the remaining 10%. The optimizer 203 | used is Adam with a learning rate of 1e-4, \\(\beta_{1} = 0.9\\) and \\(\beta_{2} = 0.999\\), a weight decay of 0.01, 204 | learning rate warmup for 10,000 steps and linear decay of the learning rate after. 205 | 206 | ## Evaluation results 207 | 208 | When fine-tuned on downstream tasks, this model achieves the following results: 209 | 210 | Glue test results: 211 | 212 | | Task | MNLI-(m/mm) | QQP | QNLI | SST-2 | CoLA | STS-B | MRPC | RTE | Average | 213 | |:----:|:-----------:|:----:|:----:|:-----:|:----:|:-----:|:----:|:----:|:-------:| 214 | | | 84.6/83.4 | 71.2 | 90.5 | 93.5 | 52.1 | 85.8 | 88.9 | 66.4 | 79.6 | 215 | 216 | 217 | ### BibTeX entry and citation info 218 | 219 | ```bibtex 220 | @article{DBLP:journals/corr/abs-1810-04805, 221 | author = {Jacob Devlin and 222 | Ming{-}Wei Chang and 223 | Kenton Lee and 224 | Kristina Toutanova}, 225 | title = {{BERT:} Pre-training of Deep Bidirectional Transformers for Language 226 | Understanding}, 227 | journal = {CoRR}, 228 | volume = {abs/1810.04805}, 229 | year = {2018}, 230 | url = {http://arxiv.org/abs/1810.04805}, 231 | archivePrefix = {arXiv}, 232 | eprint = {1810.04805}, 233 | timestamp = {Tue, 30 Oct 2018 20:39:56 +0100}, 234 | biburl = {https://dblp.org/rec/journals/corr/abs-1810-04805.bib}, 235 | bibsource = {dblp computer science bibliography, https://dblp.org} 236 | } 237 | ``` 238 | 239 | 240 | 241 | -------------------------------------------------------------------------------- /model_scraping/cards/cl-tohoku___bert-base-japanese-whole-word-masking.md: -------------------------------------------------------------------------------- 1 | # BERT base Japanese (IPA dictionary, whole word masking enabled) 2 | 3 | This is a [BERT](https://github.com/google-research/bert) model pretrained on texts in the Japanese language. 4 | 5 | This version of the model processes input texts with word-level tokenization based on the IPA dictionary, followed by the WordPiece subword tokenization. 6 | Additionally, the model is trained with the whole word masking enabled for the masked language modeling (MLM) objective. 7 | 8 | The codes for the pretraining are available at [cl-tohoku/bert-japanese](https://github.com/cl-tohoku/bert-japanese/tree/v1.0). 9 | 10 | ## Model architecture 11 | 12 | The model architecture is the same as the original BERT base model; 12 layers, 768 dimensions of hidden states, and 12 attention heads. 13 | 14 | ## Training Data 15 | 16 | The model is trained on Japanese Wikipedia as of September 1, 2019. 17 | To generate the training corpus, [WikiExtractor](https://github.com/attardi/wikiextractor) is used to extract plain texts from a dump file of Wikipedia articles. 18 | The text files used for the training are 2.6GB in size, consisting of approximately 17M sentences. 19 | 20 | ## Tokenization 21 | 22 | The texts are first tokenized by [MeCab](https://taku910.github.io/mecab/) morphological parser with the IPA dictionary and then split into subwords by the WordPiece algorithm. 23 | The vocabulary size is 32000. 24 | 25 | ## Training 26 | 27 | The model is trained with the same configuration as the original BERT; 512 tokens per instance, 256 instances per batch, and 1M training steps. 28 | 29 | For the training of the MLM (masked language modeling) objective, we introduced the **Whole Word Masking** in which all of the subword tokens corresponding to a single word (tokenized by MeCab) are masked at once. 30 | 31 | ## Licenses 32 | 33 | The pretrained models are distributed under the terms of the [Creative Commons Attribution-ShareAlike 3.0](https://creativecommons.org/licenses/by-sa/3.0/). 34 | 35 | ## Acknowledgments 36 | 37 | For training models, we used Cloud TPUs provided by [TensorFlow Research Cloud](https://www.tensorflow.org/tfrc/) program. -------------------------------------------------------------------------------- /model_scraping/cards/distilbert-base-cased-distilled-squad.md: -------------------------------------------------------------------------------- 1 | # DistilBERT base cased distilled SQuAD 2 | 3 | ## Table of Contents 4 | - [Model Details](#model-details) 5 | - [How To Get Started With the Model](#how-to-get-started-with-the-model) 6 | - [Uses](#uses) 7 | - [Risks, Limitations and Biases](#risks-limitations-and-biases) 8 | - [Training](#training) 9 | - [Evaluation](#evaluation) 10 | - [Environmental Impact](#environmental-impact) 11 | - [Technical Specifications](#technical-specifications) 12 | - [Citation Information](#citation-information) 13 | - [Model Card Authors](#model-card-authors) 14 | 15 | ## Model Details 16 | 17 | **Model Description:** The DistilBERT model was proposed in the blog post [Smaller, faster, cheaper, lighter: Introducing DistilBERT, adistilled version of BERT](https://medium.com/huggingface/distilbert-8cf3380435b5), and the paper [DistilBERT, adistilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108). DistilBERT is a small, fast, cheap and light Transformer model trained by distilling BERT base. It has 40% less parameters than *bert-base-uncased*, runs 60% faster while preserving over 95% of BERT's performances as measured on the GLUE language understanding benchmark. 18 | 19 | This model is a fine-tune checkpoint of [DistilBERT-base-cased](https://huggingface.co/distilbert-base-cased), fine-tuned using (a second step of) knowledge distillation on [SQuAD v1.1](https://huggingface.co/datasets/squad). 20 | 21 | - **Developed by:** Hugging Face 22 | - **Model Type:** Transformer-based language model 23 | - **Language(s):** English 24 | - **License:** Apache 2.0 25 | - **Related Models:** [DistilBERT-base-cased](https://huggingface.co/distilbert-base-cased) 26 | - **Resources for more information:** 27 | - See [this repository](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation) for more about Distil\* (a class of compressed models including this model) 28 | - See [Sanh et al. (2019)](https://arxiv.org/abs/1910.01108) for more information about knowledge distillation and the training procedure 29 | 30 | ## How to Get Started with the Model 31 | 32 | Use the code below to get started with the model. 33 | 34 | ```python 35 | >>> from transformers import pipeline 36 | >>> question_answerer = pipeline("question-answering", model='distilbert-base-cased-distilled-squad') 37 | 38 | >>> context = r""" 39 | ... Extractive Question Answering is the task of extracting an answer from a text given a question. An example of a 40 | ... question answering dataset is the SQuAD dataset, which is entirely based on that task. If you would like to fine-tune 41 | ... a model on a SQuAD task, you may leverage the examples/pytorch/question-answering/run_squad.py script. 42 | ... """ 43 | 44 | >>> result = question_answerer(question="What is a good example of a question answering dataset?", context=context) 45 | >>> print( 46 | ... f"Answer: '{result['answer']}', score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}" 47 | ...) 48 | 49 | Answer: 'SQuAD dataset', score: 0.5152, start: 147, end: 160 50 | ``` 51 | 52 | Here is how to use this model in PyTorch: 53 | 54 | ```python 55 | from transformers import DistilBertTokenizer, DistilBertModel 56 | import torch 57 | tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased-distilled-squad') 58 | model = DistilBertModel.from_pretrained('distilbert-base-cased-distilled-squad') 59 | 60 | question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet" 61 | 62 | inputs = tokenizer(question, text, return_tensors="pt") 63 | with torch.no_grad(): 64 | outputs = model(**inputs) 65 | 66 | print(outputs) 67 | ``` 68 | 69 | And in TensorFlow: 70 | 71 | ```python 72 | from transformers import DistilBertTokenizer, TFDistilBertForQuestionAnswering 73 | import tensorflow as tf 74 | 75 | tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-cased-distilled-squad") 76 | model = TFDistilBertForQuestionAnswering.from_pretrained("distilbert-base-cased-distilled-squad") 77 | 78 | question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet" 79 | 80 | inputs = tokenizer(question, text, return_tensors="tf") 81 | outputs = model(**inputs) 82 | 83 | answer_start_index = int(tf.math.argmax(outputs.start_logits, axis=-1)[0]) 84 | answer_end_index = int(tf.math.argmax(outputs.end_logits, axis=-1)[0]) 85 | 86 | predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1] 87 | tokenizer.decode(predict_answer_tokens) 88 | ``` 89 | 90 | ## Uses 91 | 92 | This model can be used for question answering. 93 | 94 | #### Misuse and Out-of-scope Use 95 | 96 | The model should not be used to intentionally create hostile or alienating environments for people. In addition, the model was not trained to be factual or true representations of people or events, and therefore using the model to generate such content is out-of-scope for the abilities of this model. 97 | 98 | ## Risks, Limitations and Biases 99 | 100 | **CONTENT WARNING: Readers should be aware that language generated by this model can be disturbing or offensive to some and can propagate historical and current stereotypes.** 101 | 102 | Significant research has explored bias and fairness issues with language models (see, e.g., [Sheng et al. (2021)](https://aclanthology.org/2021.acl-long.330.pdf) and [Bender et al. (2021)](https://dl.acm.org/doi/pdf/10.1145/3442188.3445922)). Predictions generated by the model can include disturbing and harmful stereotypes across protected classes; identity characteristics; and sensitive, social, and occupational groups. For example: 103 | 104 | 105 | ```python 106 | >>> from transformers import pipeline 107 | >>> question_answerer = pipeline("question-answering", model='distilbert-base-cased-distilled-squad') 108 | 109 | >>> context = r""" 110 | ... Alice is sitting on the bench. Bob is sitting next to her. 111 | ... """ 112 | 113 | >>> result = question_answerer(question="Who is the CEO?", context=context) 114 | >>> print( 115 | ... f"Answer: '{result['answer']}', score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}" 116 | ...) 117 | 118 | Answer: 'Bob', score: 0.7527, start: 32, end: 35 119 | ``` 120 | 121 | Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. 122 | 123 | ## Training 124 | 125 | #### Training Data 126 | 127 | The [distilbert-base-cased model](https://huggingface.co/distilbert-base-cased) was trained using the same data as the [distilbert-base-uncased model](https://huggingface.co/distilbert-base-uncased). The [distilbert-base-uncased model](https://huggingface.co/distilbert-base-uncased) model describes it's training data as: 128 | 129 | > DistilBERT pretrained on the same data as BERT, which is [BookCorpus](https://yknzhu.wixsite.com/mbweb), a dataset consisting of 11,038 unpublished books and [English Wikipedia](https://en.wikipedia.org/wiki/English_Wikipedia) (excluding lists, tables and headers). 130 | 131 | To learn more about the SQuAD v1.1 dataset, see the [SQuAD v1.1 data card](https://huggingface.co/datasets/squad). 132 | 133 | #### Training Procedure 134 | 135 | ##### Preprocessing 136 | 137 | See the [distilbert-base-cased model card](https://huggingface.co/distilbert-base-cased) for further details. 138 | 139 | ##### Pretraining 140 | 141 | See the [distilbert-base-cased model card](https://huggingface.co/distilbert-base-cased) for further details. 142 | 143 | ## Evaluation 144 | 145 | As discussed in the [model repository](https://github.com/huggingface/transformers/blob/main/examples/research_projects/distillation/README.md) 146 | 147 | > This model reaches a F1 score of 87.1 on the [SQuAD v1.1] dev set (for comparison, BERT bert-base-cased version reaches a F1 score of 88.7). 148 | 149 | ## Environmental Impact 150 | 151 | Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). We present the hardware type and hours used based on the [associated paper](https://arxiv.org/pdf/1910.01108.pdf). Note that these details are just for training DistilBERT, not including the fine-tuning with SQuAD. 152 | 153 | - **Hardware Type:** 8 16GB V100 GPUs 154 | - **Hours used:** 90 hours 155 | - **Cloud Provider:** Unknown 156 | - **Compute Region:** Unknown 157 | - **Carbon Emitted:** Unknown 158 | 159 | ## Technical Specifications 160 | 161 | See the [associated paper](https://arxiv.org/abs/1910.01108) for details on the modeling architecture, objective, compute infrastructure, and training details. 162 | 163 | ## Citation Information 164 | 165 | ```bibtex 166 | @inproceedings{sanh2019distilbert, 167 | title={DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter}, 168 | author={Sanh, Victor and Debut, Lysandre and Chaumond, Julien and Wolf, Thomas}, 169 | booktitle={NeurIPS EMC^2 Workshop}, 170 | year={2019} 171 | } 172 | ``` 173 | 174 | APA: 175 | - Sanh, V., Debut, L., Chaumond, J., & Wolf, T. (2019). DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter. arXiv preprint arXiv:1910.01108. 176 | 177 | ## Model Card Authors 178 | 179 | This model card was written by the Hugging Face team. -------------------------------------------------------------------------------- /model_scraping/cards/distilbert-base-uncased-finetuned-sst-2-english.md: -------------------------------------------------------------------------------- 1 | # DistilBERT base uncased finetuned SST-2 2 | 3 | ## Table of Contents 4 | - [Model Details](#model-details) 5 | - [How to Get Started With the Model](#how-to-get-started-with-the-model) 6 | - [Uses](#uses) 7 | - [Risks, Limitations and Biases](#risks-limitations-and-biases) 8 | - [Training](#training) 9 | 10 | ## Model Details 11 | **Model Description:** This model is a fine-tune checkpoint of [DistilBERT-base-uncased](https://huggingface.co/distilbert-base-uncased), fine-tuned on SST-2. 12 | This model reaches an accuracy of 91.3 on the dev set (for comparison, Bert bert-base-uncased version reaches an accuracy of 92.7). 13 | - **Developed by:** Hugging Face 14 | - **Model Type:** Text Classification 15 | - **Language(s):** English 16 | - **License:** Apache-2.0 17 | - **Parent Model:** For more details about DistilBERT, we encourage users to check out [this model card](https://huggingface.co/distilbert-base-uncased). 18 | - **Resources for more information:** 19 | - [Model Documentation](https://huggingface.co/docs/transformers/main/en/model_doc/distilbert#transformers.DistilBertForSequenceClassification) 20 | 21 | ## How to Get Started With the Model 22 | 23 | Example of single-label classification: 24 | ​​ 25 | ```python 26 | import torch 27 | from transformers import DistilBertTokenizer, DistilBertForSequenceClassification 28 | 29 | tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased") 30 | model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased") 31 | 32 | inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") 33 | with torch.no_grad(): 34 | logits = model(**inputs).logits 35 | 36 | predicted_class_id = logits.argmax().item() 37 | model.config.id2label[predicted_class_id] 38 | 39 | ``` 40 | 41 | ## Uses 42 | 43 | #### Direct Use 44 | 45 | This model can be used for topic classification. You can use the raw model for either masked language modeling or next sentence prediction, but it's mostly intended to be fine-tuned on a downstream task. See the model hub to look for fine-tuned versions on a task that interests you. 46 | 47 | #### Misuse and Out-of-scope Use 48 | The model should not be used to intentionally create hostile or alienating environments for people. In addition, the model was not trained to be factual or true representations of people or events, and therefore using the model to generate such content is out-of-scope for the abilities of this model. 49 | 50 | 51 | ## Risks, Limitations and Biases 52 | 53 | Based on a few experimentations, we observed that this model could produce biased predictions that target underrepresented populations. 54 | 55 | For instance, for sentences like `This film was filmed in COUNTRY`, this binary classification model will give radically different probabilities for the positive label depending on the country (0.89 if the country is France, but 0.08 if the country is Afghanistan) when nothing in the input indicates such a strong semantic shift. In this [colab](https://colab.research.google.com/gist/ageron/fb2f64fb145b4bc7c49efc97e5f114d3/biasmap.ipynb), [Aurélien Géron](https://twitter.com/aureliengeron) made an interesting map plotting these probabilities for each country. 56 | 57 | Map of positive probabilities per country. 58 | 59 | We strongly advise users to thoroughly probe these aspects on their use-cases in order to evaluate the risks of this model. We recommend looking at the following bias evaluation datasets as a place to start: [WinoBias](https://huggingface.co/datasets/wino_bias), [WinoGender](https://huggingface.co/datasets/super_glue), [Stereoset](https://huggingface.co/datasets/stereoset). 60 | 61 | 62 | 63 | # Training 64 | 65 | 66 | #### Training Data 67 | 68 | 69 | The authors use the following Stanford Sentiment Treebank([sst2](https://huggingface.co/datasets/sst2)) corpora for the model. 70 | 71 | #### Training Procedure 72 | 73 | ###### Fine-tuning hyper-parameters 74 | 75 | 76 | - learning_rate = 1e-5 77 | - batch_size = 32 78 | - warmup = 600 79 | - max_seq_length = 128 80 | - num_train_epochs = 3.0 -------------------------------------------------------------------------------- /model_scraping/cards/distilbert-base-uncased.md: -------------------------------------------------------------------------------- 1 | # DistilBERT base model (uncased) 2 | 3 | This model is a distilled version of the [BERT base model](https://huggingface.co/bert-base-uncased). It was 4 | introduced in [this paper](https://arxiv.org/abs/1910.01108). The code for the distillation process can be found 5 | [here](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation). This model is uncased: it does 6 | not make a difference between english and English. 7 | 8 | ## Model description 9 | 10 | DistilBERT is a transformers model, smaller and faster than BERT, which was pretrained on the same corpus in a 11 | self-supervised fashion, using the BERT base model as a teacher. This means it was pretrained on the raw texts only, 12 | with no humans labelling them in any way (which is why it can use lots of publicly available data) with an automatic 13 | process to generate inputs and labels from those texts using the BERT base model. More precisely, it was pretrained 14 | with three objectives: 15 | 16 | - Distillation loss: the model was trained to return the same probabilities as the BERT base model. 17 | - Masked language modeling (MLM): this is part of the original training loss of the BERT base model. When taking a 18 | sentence, the model randomly masks 15% of the words in the input then run the entire masked sentence through the 19 | model and has to predict the masked words. This is different from traditional recurrent neural networks (RNNs) that 20 | usually see the words one after the other, or from autoregressive models like GPT which internally mask the future 21 | tokens. It allows the model to learn a bidirectional representation of the sentence. 22 | - Cosine embedding loss: the model was also trained to generate hidden states as close as possible as the BERT base 23 | model. 24 | 25 | This way, the model learns the same inner representation of the English language than its teacher model, while being 26 | faster for inference or downstream tasks. 27 | 28 | ## Intended uses & limitations 29 | 30 | You can use the raw model for either masked language modeling or next sentence prediction, but it's mostly intended to 31 | be fine-tuned on a downstream task. See the [model hub](https://huggingface.co/models?filter=distilbert) to look for 32 | fine-tuned versions on a task that interests you. 33 | 34 | Note that this model is primarily aimed at being fine-tuned on tasks that use the whole sentence (potentially masked) 35 | to make decisions, such as sequence classification, token classification or question answering. For tasks such as text 36 | generation you should look at model like GPT2. 37 | 38 | ### How to use 39 | 40 | You can use this model directly with a pipeline for masked language modeling: 41 | 42 | ```python 43 | >>> from transformers import pipeline 44 | >>> unmasker = pipeline('fill-mask', model='distilbert-base-uncased') 45 | >>> unmasker("Hello I'm a [MASK] model.") 46 | 47 | [{'sequence': "[CLS] hello i'm a role model. [SEP]", 48 | 'score': 0.05292855575680733, 49 | 'token': 2535, 50 | 'token_str': 'role'}, 51 | {'sequence': "[CLS] hello i'm a fashion model. [SEP]", 52 | 'score': 0.03968575969338417, 53 | 'token': 4827, 54 | 'token_str': 'fashion'}, 55 | {'sequence': "[CLS] hello i'm a business model. [SEP]", 56 | 'score': 0.034743521362543106, 57 | 'token': 2449, 58 | 'token_str': 'business'}, 59 | {'sequence': "[CLS] hello i'm a model model. [SEP]", 60 | 'score': 0.03462274372577667, 61 | 'token': 2944, 62 | 'token_str': 'model'}, 63 | {'sequence': "[CLS] hello i'm a modeling model. [SEP]", 64 | 'score': 0.018145186826586723, 65 | 'token': 11643, 66 | 'token_str': 'modeling'}] 67 | ``` 68 | 69 | Here is how to use this model to get the features of a given text in PyTorch: 70 | 71 | ```python 72 | from transformers import DistilBertTokenizer, DistilBertModel 73 | tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased') 74 | model = DistilBertModel.from_pretrained("distilbert-base-uncased") 75 | text = "Replace me by any text you'd like." 76 | encoded_input = tokenizer(text, return_tensors='pt') 77 | output = model(**encoded_input) 78 | ``` 79 | 80 | and in TensorFlow: 81 | 82 | ```python 83 | from transformers import DistilBertTokenizer, TFDistilBertModel 84 | tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased') 85 | model = TFDistilBertModel.from_pretrained("distilbert-base-uncased") 86 | text = "Replace me by any text you'd like." 87 | encoded_input = tokenizer(text, return_tensors='tf') 88 | output = model(encoded_input) 89 | ``` 90 | 91 | ### Limitations and bias 92 | 93 | Even if the training data used for this model could be characterized as fairly neutral, this model can have biased 94 | predictions. It also inherits some of 95 | [the bias of its teacher model](https://huggingface.co/bert-base-uncased#limitations-and-bias). 96 | 97 | ```python 98 | >>> from transformers import pipeline 99 | >>> unmasker = pipeline('fill-mask', model='distilbert-base-uncased') 100 | >>> unmasker("The White man worked as a [MASK].") 101 | 102 | [{'sequence': '[CLS] the white man worked as a blacksmith. [SEP]', 103 | 'score': 0.1235365942120552, 104 | 'token': 20987, 105 | 'token_str': 'blacksmith'}, 106 | {'sequence': '[CLS] the white man worked as a carpenter. [SEP]', 107 | 'score': 0.10142576694488525, 108 | 'token': 10533, 109 | 'token_str': 'carpenter'}, 110 | {'sequence': '[CLS] the white man worked as a farmer. [SEP]', 111 | 'score': 0.04985016956925392, 112 | 'token': 7500, 113 | 'token_str': 'farmer'}, 114 | {'sequence': '[CLS] the white man worked as a miner. [SEP]', 115 | 'score': 0.03932540491223335, 116 | 'token': 18594, 117 | 'token_str': 'miner'}, 118 | {'sequence': '[CLS] the white man worked as a butcher. [SEP]', 119 | 'score': 0.03351764753460884, 120 | 'token': 14998, 121 | 'token_str': 'butcher'}] 122 | 123 | >>> unmasker("The Black woman worked as a [MASK].") 124 | 125 | [{'sequence': '[CLS] the black woman worked as a waitress. [SEP]', 126 | 'score': 0.13283951580524445, 127 | 'token': 13877, 128 | 'token_str': 'waitress'}, 129 | {'sequence': '[CLS] the black woman worked as a nurse. [SEP]', 130 | 'score': 0.12586183845996857, 131 | 'token': 6821, 132 | 'token_str': 'nurse'}, 133 | {'sequence': '[CLS] the black woman worked as a maid. [SEP]', 134 | 'score': 0.11708822101354599, 135 | 'token': 10850, 136 | 'token_str': 'maid'}, 137 | {'sequence': '[CLS] the black woman worked as a prostitute. [SEP]', 138 | 'score': 0.11499975621700287, 139 | 'token': 19215, 140 | 'token_str': 'prostitute'}, 141 | {'sequence': '[CLS] the black woman worked as a housekeeper. [SEP]', 142 | 'score': 0.04722772538661957, 143 | 'token': 22583, 144 | 'token_str': 'housekeeper'}] 145 | ``` 146 | 147 | This bias will also affect all fine-tuned versions of this model. 148 | 149 | ## Training data 150 | 151 | DistilBERT pretrained on the same data as BERT, which is [BookCorpus](https://yknzhu.wixsite.com/mbweb), a dataset 152 | consisting of 11,038 unpublished books and [English Wikipedia](https://en.wikipedia.org/wiki/English_Wikipedia) 153 | (excluding lists, tables and headers). 154 | 155 | ## Training procedure 156 | 157 | ### Preprocessing 158 | 159 | The texts are lowercased and tokenized using WordPiece and a vocabulary size of 30,000. The inputs of the model are 160 | then of the form: 161 | 162 | ``` 163 | [CLS] Sentence A [SEP] Sentence B [SEP] 164 | ``` 165 | 166 | With probability 0.5, sentence A and sentence B correspond to two consecutive sentences in the original corpus and in 167 | the other cases, it's another random sentence in the corpus. Note that what is considered a sentence here is a 168 | consecutive span of text usually longer than a single sentence. The only constrain is that the result with the two 169 | "sentences" has a combined length of less than 512 tokens. 170 | 171 | The details of the masking procedure for each sentence are the following: 172 | - 15% of the tokens are masked. 173 | - In 80% of the cases, the masked tokens are replaced by `[MASK]`. 174 | - In 10% of the cases, the masked tokens are replaced by a random token (different) from the one they replace. 175 | - In the 10% remaining cases, the masked tokens are left as is. 176 | 177 | ### Pretraining 178 | 179 | The model was trained on 8 16 GB V100 for 90 hours. See the 180 | [training code](https://github.com/huggingface/transformers/tree/master/examples/distillation) for all hyperparameters 181 | details. 182 | 183 | ## Evaluation results 184 | 185 | When fine-tuned on downstream tasks, this model achieves the following results: 186 | 187 | Glue test results: 188 | 189 | | Task | MNLI | QQP | QNLI | SST-2 | CoLA | STS-B | MRPC | RTE | 190 | |:----:|:----:|:----:|:----:|:-----:|:----:|:-----:|:----:|:----:| 191 | | | 82.2 | 88.5 | 89.2 | 91.3 | 51.3 | 85.8 | 87.5 | 59.9 | 192 | 193 | 194 | ### BibTeX entry and citation info 195 | 196 | ```bibtex 197 | @article{Sanh2019DistilBERTAD, 198 | title={DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter}, 199 | author={Victor Sanh and Lysandre Debut and Julien Chaumond and Thomas Wolf}, 200 | journal={ArXiv}, 201 | year={2019}, 202 | volume={abs/1910.01108} 203 | } 204 | ``` 205 | 206 | 207 | 208 | -------------------------------------------------------------------------------- /model_scraping/cards/distilroberta-base.md: -------------------------------------------------------------------------------- 1 | # Model Card for DistilRoBERTa base 2 | 3 | # Table of Contents 4 | 5 | 1. [Model Details](#model-details) 6 | 2. [Uses](#uses) 7 | 3. [Bias, Risks, and Limitations](#bias-risks-and-limitations) 8 | 4. [Training Details](#training-details) 9 | 5. [Evaluation](#evaluation) 10 | 6. [Environmental Impact](#environmental-impact) 11 | 7. [Citation](#citation) 12 | 8. [How To Get Started With the Model](#how-to-get-started-with-the-model) 13 | 14 | # Model Details 15 | 16 | ## Model Description 17 | 18 | This model is a distilled version of the [RoBERTa-base model](https://huggingface.co/roberta-base). It follows the same training procedure as [DistilBERT](https://huggingface.co/distilbert-base-uncased). 19 | The code for the distillation process can be found [here](https://github.com/huggingface/transformers/tree/master/examples/distillation). 20 | This model is case-sensitive: it makes a difference between english and English. 21 | 22 | The model has 6 layers, 768 dimension and 12 heads, totalizing 82M parameters (compared to 125M parameters for RoBERTa-base). 23 | On average DistilRoBERTa is twice as fast as Roberta-base. 24 | 25 | We encourage users of this model card to check out the [RoBERTa-base model card](https://huggingface.co/roberta-base) to learn more about usage, limitations and potential biases. 26 | 27 | - **Developed by:** Victor Sanh, Lysandre Debut, Julien Chaumond, Thomas Wolf (Hugging Face) 28 | - **Model type:** Transformer-based language model 29 | - **Language(s) (NLP):** English 30 | - **License:** Apache 2.0 31 | - **Related Models:** [RoBERTa-base model card](https://huggingface.co/roberta-base) 32 | - **Resources for more information:** 33 | - [GitHub Repository](https://github.com/huggingface/transformers/blob/main/examples/research_projects/distillation/README.md) 34 | - [Associated Paper](https://arxiv.org/abs/1910.01108) 35 | 36 | # Uses 37 | 38 | ## Direct Use and Downstream Use 39 | 40 | You can use the raw model for masked language modeling, but it's mostly intended to be fine-tuned on a downstream task. See the [model hub](https://huggingface.co/models?filter=roberta) to look for fine-tuned versions on a task that interests you. 41 | 42 | Note that this model is primarily aimed at being fine-tuned on tasks that use the whole sentence (potentially masked) to make decisions, such as sequence classification, token classification or question answering. For tasks such as text generation you should look at model like GPT2. 43 | 44 | ## Out of Scope Use 45 | 46 | The model should not be used to intentionally create hostile or alienating environments for people. The model was not trained to be factual or true representations of people or events, and therefore using the models to generate such content is out-of-scope for the abilities of this model. 47 | 48 | # Bias, Risks, and Limitations 49 | 50 | Significant research has explored bias and fairness issues with language models (see, e.g., [Sheng et al. (2021)](https://aclanthology.org/2021.acl-long.330.pdf) and [Bender et al. (2021)](https://dl.acm.org/doi/pdf/10.1145/3442188.3445922)). Predictions generated by the model may include disturbing and harmful stereotypes across protected classes; identity characteristics; and sensitive, social, and occupational groups. For example: 51 | 52 | ```python 53 | >>> from transformers import pipeline 54 | >>> unmasker = pipeline('fill-mask', model='distilroberta-base') 55 | >>> unmasker("The man worked as a .") 56 | [{'score': 0.1237526461482048, 57 | 'sequence': 'The man worked as a waiter.', 58 | 'token': 38233, 59 | 'token_str': ' waiter'}, 60 | {'score': 0.08968018740415573, 61 | 'sequence': 'The man worked as a waitress.', 62 | 'token': 35698, 63 | 'token_str': ' waitress'}, 64 | {'score': 0.08387645334005356, 65 | 'sequence': 'The man worked as a bartender.', 66 | 'token': 33080, 67 | 'token_str': ' bartender'}, 68 | {'score': 0.061059024184942245, 69 | 'sequence': 'The man worked as a mechanic.', 70 | 'token': 25682, 71 | 'token_str': ' mechanic'}, 72 | {'score': 0.03804653510451317, 73 | 'sequence': 'The man worked as a courier.', 74 | 'token': 37171, 75 | 'token_str': ' courier'}] 76 | 77 | >>> unmasker("The woman worked as a .") 78 | [{'score': 0.23149248957633972, 79 | 'sequence': 'The woman worked as a waitress.', 80 | 'token': 35698, 81 | 'token_str': ' waitress'}, 82 | {'score': 0.07563332468271255, 83 | 'sequence': 'The woman worked as a waiter.', 84 | 'token': 38233, 85 | 'token_str': ' waiter'}, 86 | {'score': 0.06983394920825958, 87 | 'sequence': 'The woman worked as a bartender.', 88 | 'token': 33080, 89 | 'token_str': ' bartender'}, 90 | {'score': 0.05411609262228012, 91 | 'sequence': 'The woman worked as a nurse.', 92 | 'token': 9008, 93 | 'token_str': ' nurse'}, 94 | {'score': 0.04995106905698776, 95 | 'sequence': 'The woman worked as a maid.', 96 | 'token': 29754, 97 | 'token_str': ' maid'}] 98 | ``` 99 | 100 | ## Recommendations 101 | 102 | Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. 103 | 104 | # Training Details 105 | 106 | DistilRoBERTa was pre-trained on [OpenWebTextCorpus](https://skylion007.github.io/OpenWebTextCorpus/), a reproduction of OpenAI's WebText dataset (it is ~4 times less training data than the teacher RoBERTa). See the [roberta-base model card](https://huggingface.co/roberta-base/blob/main/README.md) for further details on training. 107 | 108 | # Evaluation 109 | 110 | When fine-tuned on downstream tasks, this model achieves the following results (see [GitHub Repo](https://github.com/huggingface/transformers/blob/main/examples/research_projects/distillation/README.md)): 111 | 112 | Glue test results: 113 | 114 | | Task | MNLI | QQP | QNLI | SST-2 | CoLA | STS-B | MRPC | RTE | 115 | |:----:|:----:|:----:|:----:|:-----:|:----:|:-----:|:----:|:----:| 116 | | | 84.0 | 89.4 | 90.8 | 92.5 | 59.3 | 88.3 | 86.6 | 67.9 | 117 | 118 | # Environmental Impact 119 | 120 | Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 121 | 122 | - **Hardware Type:** More information needed 123 | - **Hours used:** More information needed 124 | - **Cloud Provider:** More information needed 125 | - **Compute Region:** More information needed 126 | - **Carbon Emitted:** More information needed 127 | 128 | # Citation 129 | 130 | ```bibtex 131 | @article{Sanh2019DistilBERTAD, 132 | title={DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter}, 133 | author={Victor Sanh and Lysandre Debut and Julien Chaumond and Thomas Wolf}, 134 | journal={ArXiv}, 135 | year={2019}, 136 | volume={abs/1910.01108} 137 | } 138 | ``` 139 | 140 | APA 141 | - Sanh, V., Debut, L., Chaumond, J., & Wolf, T. (2019). DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter. arXiv preprint arXiv:1910.01108. 142 | 143 | # How to Get Started With the Model 144 | 145 | You can use the model directly with a pipeline for masked language modeling: 146 | 147 | ```python 148 | >>> from transformers import pipeline 149 | >>> unmasker = pipeline('fill-mask', model='distilroberta-base') 150 | >>> unmasker("Hello I'm a model.") 151 | [{'score': 0.04673689603805542, 152 | 'sequence': "Hello I'm a business model.", 153 | 'token': 265, 154 | 'token_str': ' business'}, 155 | {'score': 0.03846118599176407, 156 | 'sequence': "Hello I'm a freelance model.", 157 | 'token': 18150, 158 | 'token_str': ' freelance'}, 159 | {'score': 0.03308931365609169, 160 | 'sequence': "Hello I'm a fashion model.", 161 | 'token': 2734, 162 | 'token_str': ' fashion'}, 163 | {'score': 0.03018997237086296, 164 | 'sequence': "Hello I'm a role model.", 165 | 'token': 774, 166 | 'token_str': ' role'}, 167 | {'score': 0.02111748233437538, 168 | 'sequence': "Hello I'm a Playboy model.", 169 | 'token': 24526, 170 | 'token_str': ' Playboy'}] 171 | ``` 172 | 173 | 174 | 175 | -------------------------------------------------------------------------------- /model_scraping/cards/emilyalsentzer___Bio_ClinicalBERT.md: -------------------------------------------------------------------------------- 1 | # ClinicalBERT - Bio + Clinical BERT Model 2 | 3 | The [Publicly Available Clinical BERT Embeddings](https://arxiv.org/abs/1904.03323) paper contains four unique clinicalBERT models: initialized with BERT-Base (`cased_L-12_H-768_A-12`) or BioBERT (`BioBERT-Base v1.0 + PubMed 200K + PMC 270K`) & trained on either all MIMIC notes or only discharge summaries. 4 | 5 | This model card describes the Bio+Clinical BERT model, which was initialized from [BioBERT](https://arxiv.org/abs/1901.08746) & trained on all MIMIC notes. 6 | 7 | ## Pretraining Data 8 | The `Bio_ClinicalBERT` model was trained on all notes from [MIMIC III](https://www.nature.com/articles/sdata201635), a database containing electronic health records from ICU patients at the Beth Israel Hospital in Boston, MA. For more details on MIMIC, see [here](https://mimic.physionet.org/). All notes from the `NOTEEVENTS` table were included (~880M words). 9 | 10 | ## Model Pretraining 11 | 12 | ### Note Preprocessing 13 | Each note in MIMIC was first split into sections using a rules-based section splitter (e.g. discharge summary notes were split into "History of Present Illness", "Family History", "Brief Hospital Course", etc. sections). Then each section was split into sentences using SciSpacy (`en core sci md` tokenizer). 14 | 15 | ### Pretraining Procedures 16 | The model was trained using code from [Google's BERT repository](https://github.com/google-research/bert) on a GeForce GTX TITAN X 12 GB GPU. Model parameters were initialized with BioBERT (`BioBERT-Base v1.0 + PubMed 200K + PMC 270K`). 17 | 18 | ### Pretraining Hyperparameters 19 | We used a batch size of 32, a maximum sequence length of 128, and a learning rate of 5 · 10−5 for pre-training our models. The models trained on all MIMIC notes were trained for 150,000 steps. The dup factor for duplicating input data with different masks was set to 5. All other default parameters were used (specifically, masked language model probability = 0.15 20 | and max predictions per sequence = 20). 21 | 22 | ## How to use the model 23 | 24 | Load the model via the transformers library: 25 | ``` 26 | from transformers import AutoTokenizer, AutoModel 27 | tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT") 28 | model = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT") 29 | ``` 30 | 31 | ## More Information 32 | 33 | Refer to the original paper, [Publicly Available Clinical BERT Embeddings](https://arxiv.org/abs/1904.03323) (NAACL Clinical NLP Workshop 2019) for additional details and performance on NLI and NER tasks. 34 | 35 | ## Questions? 36 | 37 | Post a Github issue on the [clinicalBERT repo](https://github.com/EmilyAlsentzer/clinicalBERT) or email emilya@mit.edu with any questions. -------------------------------------------------------------------------------- /model_scraping/cards/facebook___bart-large-mnli.md: -------------------------------------------------------------------------------- 1 | # bart-large-mnli 2 | 3 | This is the checkpoint for [bart-large](https://huggingface.co/facebook/bart-large) after being trained on the [MultiNLI (MNLI)](https://huggingface.co/datasets/multi_nli) dataset. 4 | 5 | Additional information about this model: 6 | - The [bart-large](https://huggingface.co/facebook/bart-large) model page 7 | - [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension 8 | ](https://arxiv.org/abs/1910.13461) 9 | - [BART fairseq implementation](https://github.com/pytorch/fairseq/tree/master/fairseq/models/bart) 10 | 11 | ## NLI-based Zero Shot Text Classification 12 | 13 | [Yin et al.](https://arxiv.org/abs/1909.00161) proposed a method for using pre-trained NLI models as a ready-made zero-shot sequence classifiers. The method works by posing the sequence to be classified as the NLI premise and to construct a hypothesis from each candidate label. For example, if we want to evaluate whether a sequence belongs to the class "politics", we could construct a hypothesis of `This text is about politics.`. The probabilities for entailment and contradiction are then converted to label probabilities. 14 | 15 | This method is surprisingly effective in many cases, particularly when used with larger pre-trained models like BART and Roberta. See [this blog post](https://joeddav.github.io/blog/2020/05/29/ZSL.html) for a more expansive introduction to this and other zero shot methods, and see the code snippets below for examples of using this model for zero-shot classification both with Hugging Face's built-in pipeline and with native Transformers/PyTorch code. 16 | 17 | #### With the zero-shot classification pipeline 18 | 19 | The model can be loaded with the `zero-shot-classification` pipeline like so: 20 | 21 | ```python 22 | from transformers import pipeline 23 | classifier = pipeline("zero-shot-classification", 24 | model="facebook/bart-large-mnli") 25 | ``` 26 | 27 | You can then use this pipeline to classify sequences into any of the class names you specify. 28 | 29 | ```python 30 | sequence_to_classify = "one day I will see the world" 31 | candidate_labels = ['travel', 'cooking', 'dancing'] 32 | classifier(sequence_to_classify, candidate_labels) 33 | #{'labels': ['travel', 'dancing', 'cooking'], 34 | # 'scores': [0.9938651323318481, 0.0032737774308770895, 0.002861034357920289], 35 | # 'sequence': 'one day I will see the world'} 36 | ``` 37 | 38 | If more than one candidate label can be correct, pass `multi_class=True` to calculate each class independently: 39 | 40 | ```python 41 | candidate_labels = ['travel', 'cooking', 'dancing', 'exploration'] 42 | classifier(sequence_to_classify, candidate_labels, multi_class=True) 43 | #{'labels': ['travel', 'exploration', 'dancing', 'cooking'], 44 | # 'scores': [0.9945111274719238, 45 | # 0.9383890628814697, 46 | # 0.0057061901316046715, 47 | # 0.0018193122232332826], 48 | # 'sequence': 'one day I will see the world'} 49 | ``` 50 | 51 | 52 | #### With manual PyTorch 53 | 54 | ```python 55 | # pose sequence as a NLI premise and label as a hypothesis 56 | from transformers import AutoModelForSequenceClassification, AutoTokenizer 57 | nli_model = AutoModelForSequenceClassification.from_pretrained('facebook/bart-large-mnli') 58 | tokenizer = AutoTokenizer.from_pretrained('facebook/bart-large-mnli') 59 | 60 | premise = sequence 61 | hypothesis = f'This example is {label}.' 62 | 63 | # run through model pre-trained on MNLI 64 | x = tokenizer.encode(premise, hypothesis, return_tensors='pt', 65 | truncation_strategy='only_first') 66 | logits = nli_model(x.to(device))[0] 67 | 68 | # we throw away "neutral" (dim 1) and take the probability of 69 | # "entailment" (2) as the probability of the label being true 70 | entail_contradiction_logits = logits[:,[0,2]] 71 | probs = entail_contradiction_logits.softmax(dim=1) 72 | prob_label_is_true = probs[:,1] 73 | ``` -------------------------------------------------------------------------------- /model_scraping/cards/google___electra-base-discriminator.md: -------------------------------------------------------------------------------- 1 | ## ELECTRA: Pre-training Text Encoders as Discriminators Rather Than Generators 2 | 3 | **ELECTRA** is a new method for self-supervised language representation learning. It can be used to pre-train transformer networks using relatively little compute. ELECTRA models are trained to distinguish "real" input tokens vs "fake" input tokens generated by another neural network, similar to the discriminator of a [GAN](https://arxiv.org/pdf/1406.2661.pdf). At small scale, ELECTRA achieves strong results even when trained on a single GPU. At large scale, ELECTRA achieves state-of-the-art results on the [SQuAD 2.0](https://rajpurkar.github.io/SQuAD-explorer/) dataset. 4 | 5 | For a detailed description and experimental results, please refer to our paper [ELECTRA: Pre-training Text Encoders as Discriminators Rather Than Generators](https://openreview.net/pdf?id=r1xMH1BtvB). 6 | 7 | This repository contains code to pre-train ELECTRA, including small ELECTRA models on a single GPU. It also supports fine-tuning ELECTRA on downstream tasks including classification tasks (e.g,. [GLUE](https://gluebenchmark.com/)), QA tasks (e.g., [SQuAD](https://rajpurkar.github.io/SQuAD-explorer/)), and sequence tagging tasks (e.g., [text chunking](https://www.clips.uantwerpen.be/conll2000/chunking/)). 8 | 9 | ## How to use the discriminator in `transformers` 10 | 11 | ```python 12 | from transformers import ElectraForPreTraining, ElectraTokenizerFast 13 | import torch 14 | 15 | discriminator = ElectraForPreTraining.from_pretrained("google/electra-base-discriminator") 16 | tokenizer = ElectraTokenizerFast.from_pretrained("google/electra-base-discriminator") 17 | 18 | sentence = "The quick brown fox jumps over the lazy dog" 19 | fake_sentence = "The quick brown fox fake over the lazy dog" 20 | 21 | fake_tokens = tokenizer.tokenize(fake_sentence) 22 | fake_inputs = tokenizer.encode(fake_sentence, return_tensors="pt") 23 | discriminator_outputs = discriminator(fake_inputs) 24 | predictions = torch.round((torch.sign(discriminator_outputs[0]) + 1) / 2) 25 | 26 | [print("%7s" % token, end="") for token in fake_tokens] 27 | 28 | [print("%7s" % int(prediction), end="") for prediction in predictions.tolist()] 29 | ``` -------------------------------------------------------------------------------- /model_scraping/cards/gpt2.md: -------------------------------------------------------------------------------- 1 | # GPT-2 2 | 3 | Test the whole generation capabilities here: https://transformer.huggingface.co/doc/gpt2-large 4 | 5 | Pretrained model on English language using a causal language modeling (CLM) objective. It was introduced in 6 | [this paper](https://d4mucfpksywv.cloudfront.net/better-language-models/language_models_are_unsupervised_multitask_learners.pdf) 7 | and first released at [this page](https://openai.com/blog/better-language-models/). 8 | 9 | Disclaimer: The team releasing GPT-2 also wrote a 10 | [model card](https://github.com/openai/gpt-2/blob/master/model_card.md) for their model. Content from this model card 11 | has been written by the Hugging Face team to complete the information they provided and give specific examples of bias. 12 | 13 | ## Model description 14 | 15 | GPT-2 is a transformers model pretrained on a very large corpus of English data in a self-supervised fashion. This 16 | means it was pretrained on the raw texts only, with no humans labelling them in any way (which is why it can use lots 17 | of publicly available data) with an automatic process to generate inputs and labels from those texts. More precisely, 18 | it was trained to guess the next word in sentences. 19 | 20 | More precisely, inputs are sequences of continuous text of a certain length and the targets are the same sequence, 21 | shifted one token (word or piece of word) to the right. The model uses internally a mask-mechanism to make sure the 22 | predictions for the token `i` only uses the inputs from `1` to `i` but not the future tokens. 23 | 24 | This way, the model learns an inner representation of the English language that can then be used to extract features 25 | useful for downstream tasks. The model is best at what it was pretrained for however, which is generating texts from a 26 | prompt. 27 | 28 | This is the **smallest** version of GPT-2, with 124M parameters. 29 | 30 | **Related Models:** [GPT-Large](https://huggingface.co/gpt2-large), [GPT-Medium](https://huggingface.co/gpt2-medium) and [GPT-XL](https://huggingface.co/gpt2-xl) 31 | 32 | ## Intended uses & limitations 33 | 34 | You can use the raw model for text generation or fine-tune it to a downstream task. See the 35 | [model hub](https://huggingface.co/models?filter=gpt2) to look for fine-tuned versions on a task that interests you. 36 | 37 | ### How to use 38 | 39 | You can use this model directly with a pipeline for text generation. Since the generation relies on some randomness, we 40 | set a seed for reproducibility: 41 | 42 | ```python 43 | >>> from transformers import pipeline, set_seed 44 | >>> generator = pipeline('text-generation', model='gpt2') 45 | >>> set_seed(42) 46 | >>> generator("Hello, I'm a language model,", max_length=30, num_return_sequences=5) 47 | 48 | [{'generated_text': "Hello, I'm a language model, a language for thinking, a language for expressing thoughts."}, 49 | {'generated_text': "Hello, I'm a language model, a compiler, a compiler library, I just want to know how I build this kind of stuff. I don"}, 50 | {'generated_text': "Hello, I'm a language model, and also have more than a few of your own, but I understand that they're going to need some help"}, 51 | {'generated_text': "Hello, I'm a language model, a system model. I want to know my language so that it might be more interesting, more user-friendly"}, 52 | {'generated_text': 'Hello, I\'m a language model, not a language model"\n\nThe concept of "no-tricks" comes in handy later with new'}] 53 | ``` 54 | 55 | Here is how to use this model to get the features of a given text in PyTorch: 56 | 57 | ```python 58 | from transformers import GPT2Tokenizer, GPT2Model 59 | tokenizer = GPT2Tokenizer.from_pretrained('gpt2') 60 | model = GPT2Model.from_pretrained('gpt2') 61 | text = "Replace me by any text you'd like." 62 | encoded_input = tokenizer(text, return_tensors='pt') 63 | output = model(**encoded_input) 64 | ``` 65 | 66 | and in TensorFlow: 67 | 68 | ```python 69 | from transformers import GPT2Tokenizer, TFGPT2Model 70 | tokenizer = GPT2Tokenizer.from_pretrained('gpt2') 71 | model = TFGPT2Model.from_pretrained('gpt2') 72 | text = "Replace me by any text you'd like." 73 | encoded_input = tokenizer(text, return_tensors='tf') 74 | output = model(encoded_input) 75 | ``` 76 | 77 | ### Limitations and bias 78 | 79 | The training data used for this model has not been released as a dataset one can browse. We know it contains a lot of 80 | unfiltered content from the internet, which is far from neutral. As the openAI team themselves point out in their 81 | [model card](https://github.com/openai/gpt-2/blob/master/model_card.md#out-of-scope-use-cases): 82 | 83 | > Because large-scale language models like GPT-2 do not distinguish fact from fiction, we don’t support use-cases 84 | > that require the generated text to be true. 85 | > 86 | > Additionally, language models like GPT-2 reflect the biases inherent to the systems they were trained on, so we do 87 | > not recommend that they be deployed into systems that interact with humans > unless the deployers first carry out a 88 | > study of biases relevant to the intended use-case. We found no statistically significant difference in gender, race, 89 | > and religious bias probes between 774M and 1.5B, implying all versions of GPT-2 should be approached with similar 90 | > levels of caution around use cases that are sensitive to biases around human attributes. 91 | 92 | Here's an example of how the model can have biased predictions: 93 | 94 | ```python 95 | >>> from transformers import pipeline, set_seed 96 | >>> generator = pipeline('text-generation', model='gpt2') 97 | >>> set_seed(42) 98 | >>> generator("The White man worked as a", max_length=10, num_return_sequences=5) 99 | 100 | [{'generated_text': 'The White man worked as a mannequin for'}, 101 | {'generated_text': 'The White man worked as a maniser of the'}, 102 | {'generated_text': 'The White man worked as a bus conductor by day'}, 103 | {'generated_text': 'The White man worked as a plumber at the'}, 104 | {'generated_text': 'The White man worked as a journalist. He had'}] 105 | 106 | >>> set_seed(42) 107 | >>> generator("The Black man worked as a", max_length=10, num_return_sequences=5) 108 | 109 | [{'generated_text': 'The Black man worked as a man at a restaurant'}, 110 | {'generated_text': 'The Black man worked as a car salesman in a'}, 111 | {'generated_text': 'The Black man worked as a police sergeant at the'}, 112 | {'generated_text': 'The Black man worked as a man-eating monster'}, 113 | {'generated_text': 'The Black man worked as a slave, and was'}] 114 | ``` 115 | 116 | This bias will also affect all fine-tuned versions of this model. 117 | 118 | ## Training data 119 | 120 | The OpenAI team wanted to train this model on a corpus as large as possible. To build it, they scraped all the web 121 | pages from outbound links on Reddit which received at least 3 karma. Note that all Wikipedia pages were removed from 122 | this dataset, so the model was not trained on any part of Wikipedia. The resulting dataset (called WebText) weights 123 | 40GB of texts but has not been publicly released. You can find a list of the top 1,000 domains present in WebText 124 | [here](https://github.com/openai/gpt-2/blob/master/domains.txt). 125 | 126 | ## Training procedure 127 | 128 | ### Preprocessing 129 | 130 | The texts are tokenized using a byte-level version of Byte Pair Encoding (BPE) (for unicode characters) and a 131 | vocabulary size of 50,257. The inputs are sequences of 1024 consecutive tokens. 132 | 133 | The larger model was trained on 256 cloud TPU v3 cores. The training duration was not disclosed, nor were the exact 134 | details of training. 135 | 136 | ## Evaluation results 137 | 138 | The model achieves the following results without any fine-tuning (zero-shot): 139 | 140 | | Dataset | LAMBADA | LAMBADA | CBT-CN | CBT-NE | WikiText2 | PTB | enwiki8 | text8 | WikiText103 | 1BW | 141 | |:--------:|:-------:|:-------:|:------:|:------:|:---------:|:------:|:-------:|:------:|:-----------:|:-----:| 142 | | (metric) | (PPL) | (ACC) | (ACC) | (ACC) | (PPL) | (PPL) | (BPB) | (BPC) | (PPL) | (PPL) | 143 | | | 35.13 | 45.99 | 87.65 | 83.4 | 29.41 | 65.85 | 1.16 | 1,17 | 37.50 | 75.20 | 144 | 145 | 146 | ### BibTeX entry and citation info 147 | 148 | ```bibtex 149 | @article{radford2019language, 150 | title={Language Models are Unsupervised Multitask Learners}, 151 | author={Radford, Alec and Wu, Jeff and Child, Rewon and Luan, David and Amodei, Dario and Sutskever, Ilya}, 152 | year={2019} 153 | } 154 | ``` 155 | 156 | 157 | 158 | -------------------------------------------------------------------------------- /model_scraping/cards/jonatasgrosman___wav2vec2-large-xlsr-53-english.md: -------------------------------------------------------------------------------- 1 | # Fine-tuned XLSR-53 large model for speech recognition in English 2 | 3 | Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) on English using the train and validation splits of [Common Voice 6.1](https://huggingface.co/datasets/common_voice). 4 | When using this model, make sure that your speech input is sampled at 16kHz. 5 | 6 | This model has been fine-tuned thanks to the GPU credits generously given by the [OVHcloud](https://www.ovhcloud.com/en/public-cloud/ai-training/) :) 7 | 8 | The script used for training can be found here: https://github.com/jonatasgrosman/wav2vec2-sprint 9 | 10 | ## Usage 11 | 12 | The model can be used directly (without a language model) as follows... 13 | 14 | Using the [HuggingSound](https://github.com/jonatasgrosman/huggingsound) library: 15 | 16 | ```python 17 | from huggingsound import SpeechRecognitionModel 18 | 19 | model = SpeechRecognitionModel("jonatasgrosman/wav2vec2-large-xlsr-53-english") 20 | audio_paths = ["/path/to/file.mp3", "/path/to/another_file.wav"] 21 | 22 | transcriptions = model.transcribe(audio_paths) 23 | ``` 24 | 25 | Writing your own inference script: 26 | 27 | ```python 28 | import torch 29 | import librosa 30 | from datasets import load_dataset 31 | from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor 32 | 33 | LANG_ID = "en" 34 | MODEL_ID = "jonatasgrosman/wav2vec2-large-xlsr-53-english" 35 | SAMPLES = 10 36 | 37 | test_dataset = load_dataset("common_voice", LANG_ID, split=f"test[:{SAMPLES}]") 38 | 39 | processor = Wav2Vec2Processor.from_pretrained(MODEL_ID) 40 | model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID) 41 | 42 | # Preprocessing the datasets. 43 | # We need to read the audio files as arrays 44 | def speech_file_to_array_fn(batch): 45 | speech_array, sampling_rate = librosa.load(batch["path"], sr=16_000) 46 | batch["speech"] = speech_array 47 | batch["sentence"] = batch["sentence"].upper() 48 | return batch 49 | 50 | test_dataset = test_dataset.map(speech_file_to_array_fn) 51 | inputs = processor(test_dataset["speech"], sampling_rate=16_000, return_tensors="pt", padding=True) 52 | 53 | with torch.no_grad(): 54 | logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits 55 | 56 | predicted_ids = torch.argmax(logits, dim=-1) 57 | predicted_sentences = processor.batch_decode(predicted_ids) 58 | 59 | for i, predicted_sentence in enumerate(predicted_sentences): 60 | print("-" * 100) 61 | print("Reference:", test_dataset[i]["sentence"]) 62 | print("Prediction:", predicted_sentence) 63 | ``` 64 | 65 | | Reference | Prediction | 66 | | ------------- | ------------- | 67 | | "SHE'LL BE ALL RIGHT." | SHE'LL BE ALL RIGHT | 68 | | SIX | SIX | 69 | | "ALL'S WELL THAT ENDS WELL." | ALL AS WELL THAT ENDS WELL | 70 | | DO YOU MEAN IT? | DO YOU MEAN IT | 71 | | THE NEW PATCH IS LESS INVASIVE THAN THE OLD ONE, BUT STILL CAUSES REGRESSIONS. | THE NEW PATCH IS LESS INVASIVE THAN THE OLD ONE BUT STILL CAUSES REGRESSION | 72 | | HOW IS MOZILLA GOING TO HANDLE AMBIGUITIES LIKE QUEUE AND CUE? | HOW IS MOSLILLAR GOING TO HANDLE ANDBEWOOTH HIS LIKE Q AND Q | 73 | | "I GUESS YOU MUST THINK I'M KINDA BATTY." | RUSTIAN WASTIN PAN ONTE BATTLY | 74 | | NO ONE NEAR THE REMOTE MACHINE YOU COULD RING? | NO ONE NEAR THE REMOTE MACHINE YOU COULD RING | 75 | | SAUCE FOR THE GOOSE IS SAUCE FOR THE GANDER. | SAUCE FOR THE GUICE IS SAUCE FOR THE GONDER | 76 | | GROVES STARTED WRITING SONGS WHEN SHE WAS FOUR YEARS OLD. | GRAFS STARTED WRITING SONGS WHEN SHE WAS FOUR YEARS OLD | 77 | 78 | ## Evaluation 79 | 80 | 1. To evaluate on `mozilla-foundation/common_voice_6_0` with split `test` 81 | 82 | ```bash 83 | python eval.py --model_id jonatasgrosman/wav2vec2-large-xlsr-53-english --dataset mozilla-foundation/common_voice_6_0 --config en --split test 84 | ``` 85 | 86 | 2. To evaluate on `speech-recognition-community-v2/dev_data` 87 | 88 | ```bash 89 | python eval.py --model_id jonatasgrosman/wav2vec2-large-xlsr-53-english --dataset speech-recognition-community-v2/dev_data --config en --split validation --chunk_length_s 5.0 --stride_length_s 1.0 90 | ``` 91 | 92 | ## Citation 93 | If you want to cite this model you can use this: 94 | 95 | ```bibtex 96 | @misc{grosman2021xlsr53-large-english, 97 | title={Fine-tuned {XLSR}-53 large model for speech recognition in {E}nglish}, 98 | author={Grosman, Jonatas}, 99 | howpublished={\url{https://huggingface.co/jonatasgrosman/wav2vec2-large-xlsr-53-english}}, 100 | year={2021} 101 | } 102 | ``` -------------------------------------------------------------------------------- /model_scraping/cards/microsoft___layoutlmv3-base.md: -------------------------------------------------------------------------------- 1 | # LayoutLMv3 2 | 3 | [Microsoft Document AI](https://www.microsoft.com/en-us/research/project/document-ai/) | [GitHub](https://aka.ms/layoutlmv3) 4 | 5 | ## Model description 6 | 7 | LayoutLMv3 is a pre-trained multimodal Transformer for Document AI with unified text and image masking. The simple unified architecture and training objectives make LayoutLMv3 a general-purpose pre-trained model. For example, LayoutLMv3 can be fine-tuned for both text-centric tasks, including form understanding, receipt understanding, and document visual question answering, and image-centric tasks such as document image classification and document layout analysis. 8 | 9 | [LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking](https://arxiv.org/abs/2204.08387) 10 | Yupan Huang, Tengchao Lv, Lei Cui, Yutong Lu, Furu Wei, ACM Multimedia 2022. 11 | 12 | ## Citation 13 | 14 | If you find LayoutLM useful in your research, please cite the following paper: 15 | 16 | ``` 17 | @inproceedings{huang2022layoutlmv3, 18 | author={Yupan Huang and Tengchao Lv and Lei Cui and Yutong Lu and Furu Wei}, 19 | title={LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking}, 20 | booktitle={Proceedings of the 30th ACM International Conference on Multimedia}, 21 | year={2022} 22 | } 23 | ``` 24 | 25 | ## License 26 | 27 | The content of this project itself is licensed under the [Attribution-NonCommercial-ShareAlike 4.0 International (CC BY-NC-SA 4.0)](https://creativecommons.org/licenses/by-nc-sa/4.0/). 28 | Portions of the source code are based on the [transformers](https://github.com/huggingface/transformers) project. 29 | [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct) -------------------------------------------------------------------------------- /model_scraping/cards/openai___clip-vit-base-patch32.md: -------------------------------------------------------------------------------- 1 | # Model Card: CLIP 2 | 3 | Disclaimer: The model card is taken and modified from the official CLIP repository, it can be found [here](https://github.com/openai/CLIP/blob/main/model-card.md). 4 | 5 | ## Model Details 6 | 7 | The CLIP model was developed by researchers at OpenAI to learn about what contributes to robustness in computer vision tasks. The model was also developed to test the ability of models to generalize to arbitrary image classification tasks in a zero-shot manner. It was not developed for general model deployment - to deploy models like CLIP, researchers will first need to carefully study their capabilities in relation to the specific context they’re being deployed within. 8 | 9 | ### Model Date 10 | 11 | January 2021 12 | 13 | ### Model Type 14 | 15 | The model uses a ViT-B/32 Transformer architecture as an image encoder and uses a masked self-attention Transformer as a text encoder. These encoders are trained to maximize the similarity of (image, text) pairs via a contrastive loss. 16 | 17 | The original implementation had two variants: one using a ResNet image encoder and the other using a Vision Transformer. This repository has the variant with the Vision Transformer. 18 | 19 | 20 | ### Documents 21 | 22 | - [Blog Post](https://openai.com/blog/clip/) 23 | - [CLIP Paper](https://arxiv.org/abs/2103.00020) 24 | 25 | 26 | ### Use with Transformers 27 | 28 | ```python3 29 | from PIL import Image 30 | import requests 31 | 32 | from transformers import CLIPProcessor, CLIPModel 33 | 34 | model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32") 35 | processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32") 36 | 37 | url = "http://images.cocodataset.org/val2017/000000039769.jpg" 38 | image = Image.open(requests.get(url, stream=True).raw) 39 | 40 | inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True) 41 | 42 | outputs = model(**inputs) 43 | logits_per_image = outputs.logits_per_image # this is the image-text similarity score 44 | probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities 45 | ``` 46 | 47 | 48 | ## Model Use 49 | 50 | ### Intended Use 51 | 52 | The model is intended as a research output for research communities. We hope that this model will enable researchers to better understand and explore zero-shot, arbitrary image classification. We also hope it can be used for interdisciplinary studies of the potential impact of such models - the CLIP paper includes a discussion of potential downstream impacts to provide an example for this sort of analysis. 53 | 54 | #### Primary intended uses 55 | 56 | The primary intended users of these models are AI researchers. 57 | 58 | We primarily imagine the model will be used by researchers to better understand robustness, generalization, and other capabilities, biases, and constraints of computer vision models. 59 | 60 | ### Out-of-Scope Use Cases 61 | 62 | **Any** deployed use case of the model - whether commercial or not - is currently out of scope. Non-deployed use cases such as image search in a constrained environment, are also not recommended unless there is thorough in-domain testing of the model with a specific, fixed class taxonomy. This is because our safety assessment demonstrated a high need for task specific testing especially given the variability of CLIP’s performance with different class taxonomies. This makes untested and unconstrained deployment of the model in any use case currently potentially harmful. 63 | 64 | Certain use cases which would fall under the domain of surveillance and facial recognition are always out-of-scope regardless of performance of the model. This is because the use of artificial intelligence for tasks such as these can be premature currently given the lack of testing norms and checks to ensure its fair use. 65 | 66 | Since the model has not been purposefully trained in or evaluated on any languages other than English, its use should be limited to English language use cases. 67 | 68 | 69 | 70 | ## Data 71 | 72 | The model was trained on publicly available image-caption data. This was done through a combination of crawling a handful of websites and using commonly-used pre-existing image datasets such as [YFCC100M](http://projects.dfki.uni-kl.de/yfcc100m/). A large portion of the data comes from our crawling of the internet. This means that the data is more representative of people and societies most connected to the internet which tend to skew towards more developed nations, and younger, male users. 73 | 74 | ### Data Mission Statement 75 | 76 | Our goal with building this dataset was to test out robustness and generalizability in computer vision tasks. As a result, the focus was on gathering large quantities of data from different publicly-available internet data sources. The data was gathered in a mostly non-interventionist manner. However, we only crawled websites that had policies against excessively violent and adult images and allowed us to filter out such content. We do not intend for this dataset to be used as the basis for any commercial or deployed model and will not be releasing the dataset. 77 | 78 | 79 | 80 | ## Performance and Limitations 81 | 82 | ### Performance 83 | 84 | We have evaluated the performance of CLIP on a wide range of benchmarks across a variety of computer vision datasets such as OCR to texture recognition to fine-grained classification. The paper describes model performance on the following datasets: 85 | 86 | - Food101 87 | - CIFAR10 88 | - CIFAR100 89 | - Birdsnap 90 | - SUN397 91 | - Stanford Cars 92 | - FGVC Aircraft 93 | - VOC2007 94 | - DTD 95 | - Oxford-IIIT Pet dataset 96 | - Caltech101 97 | - Flowers102 98 | - MNIST 99 | - SVHN 100 | - IIIT5K 101 | - Hateful Memes 102 | - SST-2 103 | - UCF101 104 | - Kinetics700 105 | - Country211 106 | - CLEVR Counting 107 | - KITTI Distance 108 | - STL-10 109 | - RareAct 110 | - Flickr30 111 | - MSCOCO 112 | - ImageNet 113 | - ImageNet-A 114 | - ImageNet-R 115 | - ImageNet Sketch 116 | - ObjectNet (ImageNet Overlap) 117 | - Youtube-BB 118 | - ImageNet-Vid 119 | 120 | ## Limitations 121 | 122 | CLIP and our analysis of it have a number of limitations. CLIP currently struggles with respect to certain tasks such as fine grained classification and counting objects. CLIP also poses issues with regards to fairness and bias which we discuss in the paper and briefly in the next section. Additionally, our approach to testing CLIP also has an important limitation- in many cases we have used linear probes to evaluate the performance of CLIP and there is evidence suggesting that linear probes can underestimate model performance. 123 | 124 | ### Bias and Fairness 125 | 126 | We find that the performance of CLIP - and the specific biases it exhibits - can depend significantly on class design and the choices one makes for categories to include and exclude. We tested the risk of certain kinds of denigration with CLIP by classifying images of people from [Fairface](https://arxiv.org/abs/1908.04913) into crime-related and non-human animal categories. We found significant disparities with respect to race and gender. Additionally, we found that these disparities could shift based on how the classes were constructed. (Details captured in the Broader Impacts Section in the paper). 127 | 128 | We also tested the performance of CLIP on gender, race and age classification using the Fairface dataset (We default to using race categories as they are constructed in the Fairface dataset.) in order to assess quality of performance across different demographics. We found accuracy >96% across all races for gender classification with ‘Middle Eastern’ having the highest accuracy (98.4%) and ‘White’ having the lowest (96.5%). Additionally, CLIP averaged ~93% for racial classification and ~63% for age classification. Our use of evaluations to test for gender, race and age classification as well as denigration harms is simply to evaluate performance of the model across people and surface potential risks and not to demonstrate an endorsement/enthusiasm for such tasks. 129 | 130 | 131 | 132 | ## Feedback 133 | 134 | ### Where to send questions or comments about the model 135 | 136 | Please use [this Google Form](https://forms.gle/Uv7afRH5dvY34ZEs9) -------------------------------------------------------------------------------- /model_scraping/cards/openai___clip-vit-large-patch14.md: -------------------------------------------------------------------------------- 1 | # Model Card: CLIP 2 | 3 | Disclaimer: The model card is taken and modified from the official CLIP repository, it can be found [here](https://github.com/openai/CLIP/blob/main/model-card.md). 4 | 5 | ## Model Details 6 | 7 | The CLIP model was developed by researchers at OpenAI to learn about what contributes to robustness in computer vision tasks. The model was also developed to test the ability of models to generalize to arbitrary image classification tasks in a zero-shot manner. It was not developed for general model deployment - to deploy models like CLIP, researchers will first need to carefully study their capabilities in relation to the specific context they’re being deployed within. 8 | 9 | ### Model Date 10 | 11 | January 2021 12 | 13 | ### Model Type 14 | 15 | The base model uses a ViT-L/14 Transformer architecture as an image encoder and uses a masked self-attention Transformer as a text encoder. These encoders are trained to maximize the similarity of (image, text) pairs via a contrastive loss. 16 | 17 | The original implementation had two variants: one using a ResNet image encoder and the other using a Vision Transformer. This repository has the variant with the Vision Transformer. 18 | 19 | 20 | ### Documents 21 | 22 | - [Blog Post](https://openai.com/blog/clip/) 23 | - [CLIP Paper](https://arxiv.org/abs/2103.00020) 24 | 25 | 26 | ### Use with Transformers 27 | 28 | ```python 29 | from PIL import Image 30 | import requests 31 | 32 | from transformers import CLIPProcessor, CLIPModel 33 | 34 | model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14") 35 | processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14") 36 | 37 | url = "http://images.cocodataset.org/val2017/000000039769.jpg" 38 | image = Image.open(requests.get(url, stream=True).raw) 39 | 40 | inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True) 41 | 42 | outputs = model(**inputs) 43 | logits_per_image = outputs.logits_per_image # this is the image-text similarity score 44 | probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities 45 | ``` 46 | 47 | 48 | ## Model Use 49 | 50 | ### Intended Use 51 | 52 | The model is intended as a research output for research communities. We hope that this model will enable researchers to better understand and explore zero-shot, arbitrary image classification. We also hope it can be used for interdisciplinary studies of the potential impact of such models - the CLIP paper includes a discussion of potential downstream impacts to provide an example for this sort of analysis. 53 | 54 | #### Primary intended uses 55 | 56 | The primary intended users of these models are AI researchers. 57 | 58 | We primarily imagine the model will be used by researchers to better understand robustness, generalization, and other capabilities, biases, and constraints of computer vision models. 59 | 60 | ### Out-of-Scope Use Cases 61 | 62 | **Any** deployed use case of the model - whether commercial or not - is currently out of scope. Non-deployed use cases such as image search in a constrained environment, are also not recommended unless there is thorough in-domain testing of the model with a specific, fixed class taxonomy. This is because our safety assessment demonstrated a high need for task specific testing especially given the variability of CLIP’s performance with different class taxonomies. This makes untested and unconstrained deployment of the model in any use case currently potentially harmful. 63 | 64 | Certain use cases which would fall under the domain of surveillance and facial recognition are always out-of-scope regardless of performance of the model. This is because the use of artificial intelligence for tasks such as these can be premature currently given the lack of testing norms and checks to ensure its fair use. 65 | 66 | Since the model has not been purposefully trained in or evaluated on any languages other than English, its use should be limited to English language use cases. 67 | 68 | 69 | 70 | ## Data 71 | 72 | The model was trained on publicly available image-caption data. This was done through a combination of crawling a handful of websites and using commonly-used pre-existing image datasets such as [YFCC100M](http://projects.dfki.uni-kl.de/yfcc100m/). A large portion of the data comes from our crawling of the internet. This means that the data is more representative of people and societies most connected to the internet which tend to skew towards more developed nations, and younger, male users. 73 | 74 | ### Data Mission Statement 75 | 76 | Our goal with building this dataset was to test out robustness and generalizability in computer vision tasks. As a result, the focus was on gathering large quantities of data from different publicly-available internet data sources. The data was gathered in a mostly non-interventionist manner. However, we only crawled websites that had policies against excessively violent and adult images and allowed us to filter out such content. We do not intend for this dataset to be used as the basis for any commercial or deployed model and will not be releasing the dataset. 77 | 78 | 79 | 80 | ## Performance and Limitations 81 | 82 | ### Performance 83 | 84 | We have evaluated the performance of CLIP on a wide range of benchmarks across a variety of computer vision datasets such as OCR to texture recognition to fine-grained classification. The paper describes model performance on the following datasets: 85 | 86 | - Food101 87 | - CIFAR10 88 | - CIFAR100 89 | - Birdsnap 90 | - SUN397 91 | - Stanford Cars 92 | - FGVC Aircraft 93 | - VOC2007 94 | - DTD 95 | - Oxford-IIIT Pet dataset 96 | - Caltech101 97 | - Flowers102 98 | - MNIST 99 | - SVHN 100 | - IIIT5K 101 | - Hateful Memes 102 | - SST-2 103 | - UCF101 104 | - Kinetics700 105 | - Country211 106 | - CLEVR Counting 107 | - KITTI Distance 108 | - STL-10 109 | - RareAct 110 | - Flickr30 111 | - MSCOCO 112 | - ImageNet 113 | - ImageNet-A 114 | - ImageNet-R 115 | - ImageNet Sketch 116 | - ObjectNet (ImageNet Overlap) 117 | - Youtube-BB 118 | - ImageNet-Vid 119 | 120 | ## Limitations 121 | 122 | CLIP and our analysis of it have a number of limitations. CLIP currently struggles with respect to certain tasks such as fine grained classification and counting objects. CLIP also poses issues with regards to fairness and bias which we discuss in the paper and briefly in the next section. Additionally, our approach to testing CLIP also has an important limitation- in many cases we have used linear probes to evaluate the performance of CLIP and there is evidence suggesting that linear probes can underestimate model performance. 123 | 124 | ### Bias and Fairness 125 | 126 | We find that the performance of CLIP - and the specific biases it exhibits - can depend significantly on class design and the choices one makes for categories to include and exclude. We tested the risk of certain kinds of denigration with CLIP by classifying images of people from [Fairface](https://arxiv.org/abs/1908.04913) into crime-related and non-human animal categories. We found significant disparities with respect to race and gender. Additionally, we found that these disparities could shift based on how the classes were constructed. (Details captured in the Broader Impacts Section in the paper). 127 | 128 | We also tested the performance of CLIP on gender, race and age classification using the Fairface dataset (We default to using race categories as they are constructed in the Fairface dataset.) in order to assess quality of performance across different demographics. We found accuracy >96% across all races for gender classification with ‘Middle Eastern’ having the highest accuracy (98.4%) and ‘White’ having the lowest (96.5%). Additionally, CLIP averaged ~93% for racial classification and ~63% for age classification. Our use of evaluations to test for gender, race and age classification as well as denigration harms is simply to evaluate performance of the model across people and surface potential risks and not to demonstrate an endorsement/enthusiasm for such tasks. 129 | 130 | 131 | 132 | ## Feedback 133 | 134 | ### Where to send questions or comments about the model 135 | 136 | Please use [this Google Form](https://forms.gle/Uv7afRH5dvY34ZEs9) -------------------------------------------------------------------------------- /model_scraping/cards/philschmid___bart-large-cnn-samsum.md: -------------------------------------------------------------------------------- 1 | ## `bart-large-cnn-samsum` 2 | 3 | > If you want to use the model you should try a newer fine-tuned FLAN-T5 version [philschmid/flan-t5-base-samsum](https://huggingface.co/philschmid/flan-t5-base-samsum) out socring the BART version with `+6` on `ROGUE1` achieving `47.24`. 4 | 5 | # TRY [philschmid/flan-t5-base-samsum](https://huggingface.co/philschmid/flan-t5-base-samsum) 6 | 7 | 8 | This model was trained using Amazon SageMaker and the new Hugging Face Deep Learning container. 9 | 10 | For more information look at: 11 | - [🤗 Transformers Documentation: Amazon SageMaker](https://huggingface.co/transformers/sagemaker.html) 12 | - [Example Notebooks](https://github.com/huggingface/notebooks/tree/master/sagemaker) 13 | - [Amazon SageMaker documentation for Hugging Face](https://docs.aws.amazon.com/sagemaker/latest/dg/hugging-face.html) 14 | - [Python SDK SageMaker documentation for Hugging Face](https://sagemaker.readthedocs.io/en/stable/frameworks/huggingface/index.html) 15 | - [Deep Learning Container](https://github.com/aws/deep-learning-containers/blob/master/available_images.md#huggingface-training-containers) 16 | 17 | ## Hyperparameters 18 | ```json 19 | { 20 | "dataset_name": "samsum", 21 | "do_eval": true, 22 | "do_predict": true, 23 | "do_train": true, 24 | "fp16": true, 25 | "learning_rate": 5e-05, 26 | "model_name_or_path": "facebook/bart-large-cnn", 27 | "num_train_epochs": 3, 28 | "output_dir": "/opt/ml/model", 29 | "per_device_eval_batch_size": 4, 30 | "per_device_train_batch_size": 4, 31 | "predict_with_generate": true, 32 | "seed": 7 33 | } 34 | ``` 35 | 36 | ## Usage 37 | ```python 38 | from transformers import pipeline 39 | summarizer = pipeline("summarization", model="philschmid/bart-large-cnn-samsum") 40 | 41 | conversation = '''Jeff: Can I train a 🤗 Transformers model on Amazon SageMaker? 42 | Philipp: Sure you can use the new Hugging Face Deep Learning Container. 43 | Jeff: ok. 44 | Jeff: and how can I get started? 45 | Jeff: where can I find documentation? 46 | Philipp: ok, ok you can find everything here. https://huggingface.co/blog/the-partnership-amazon-sagemaker-and-hugging-face 47 | ''' 48 | summarizer(conversation) 49 | ``` 50 | 51 | ## Results 52 | 53 | | key | value | 54 | | --- | ----- | 55 | | eval_rouge1 | 42.621 | 56 | | eval_rouge2 | 21.9825 | 57 | | eval_rougeL | 33.034 | 58 | | eval_rougeLsum | 39.6783 | 59 | | test_rouge1 | 41.3174 | 60 | | test_rouge2 | 20.8716 | 61 | | test_rougeL | 32.1337 | 62 | | test_rougeLsum | 38.4149 | -------------------------------------------------------------------------------- /model_scraping/cards/prajjwal1___bert-tiny.md: -------------------------------------------------------------------------------- 1 | The following model is a Pytorch pre-trained model obtained from converting Tensorflow checkpoint found in the [official Google BERT repository](https://github.com/google-research/bert). 2 | 3 | This is one of the smaller pre-trained BERT variants, together with [bert-mini](https://huggingface.co/prajjwal1/bert-mini) [bert-small](https://huggingface.co/prajjwal1/bert-small) and [bert-medium](https://huggingface.co/prajjwal1/bert-medium). They were introduced in the study `Well-Read Students Learn Better: On the Importance of Pre-training Compact Models` ([arxiv](https://arxiv.org/abs/1908.08962)), and ported to HF for the study `Generalization in NLI: Ways (Not) To Go Beyond Simple Heuristics` ([arXiv](https://arxiv.org/abs/2110.01518)). These models are supposed to be trained on a downstream task. 4 | 5 | If you use the model, please consider citing both the papers: 6 | ``` 7 | @misc{bhargava2021generalization, 8 | title={Generalization in NLI: Ways (Not) To Go Beyond Simple Heuristics}, 9 | author={Prajjwal Bhargava and Aleksandr Drozd and Anna Rogers}, 10 | year={2021}, 11 | eprint={2110.01518}, 12 | archivePrefix={arXiv}, 13 | primaryClass={cs.CL} 14 | } 15 | 16 | @article{DBLP:journals/corr/abs-1908-08962, 17 | author = {Iulia Turc and 18 | Ming{-}Wei Chang and 19 | Kenton Lee and 20 | Kristina Toutanova}, 21 | title = {Well-Read Students Learn Better: The Impact of Student Initialization 22 | on Knowledge Distillation}, 23 | journal = {CoRR}, 24 | volume = {abs/1908.08962}, 25 | year = {2019}, 26 | url = {http://arxiv.org/abs/1908.08962}, 27 | eprinttype = {arXiv}, 28 | eprint = {1908.08962}, 29 | timestamp = {Thu, 29 Aug 2019 16:32:34 +0200}, 30 | biburl = {https://dblp.org/rec/journals/corr/abs-1908-08962.bib}, 31 | bibsource = {dblp computer science bibliography, https://dblp.org} 32 | } 33 | 34 | ``` 35 | Config of this model: 36 | - `prajjwal1/bert-tiny` (L=2, H=128) [Model Link](https://huggingface.co/prajjwal1/bert-tiny) 37 | 38 | 39 | Other models to check out: 40 | - `prajjwal1/bert-mini` (L=4, H=256) [Model Link](https://huggingface.co/prajjwal1/bert-mini) 41 | - `prajjwal1/bert-small` (L=4, H=512) [Model Link](https://huggingface.co/prajjwal1/bert-small) 42 | - `prajjwal1/bert-medium` (L=8, H=512) [Model Link](https://huggingface.co/prajjwal1/bert-medium) 43 | 44 | Original Implementation and more info can be found in [this Github repository](https://github.com/prajjwal1/generalize_lm_nli). 45 | 46 | Twitter: [@prajjwal_1](https://twitter.com/prajjwal_1) -------------------------------------------------------------------------------- /model_scraping/cards/roberta-base.md: -------------------------------------------------------------------------------- 1 | # RoBERTa base model 2 | 3 | Pretrained model on English language using a masked language modeling (MLM) objective. It was introduced in 4 | [this paper](https://arxiv.org/abs/1907.11692) and first released in 5 | [this repository](https://github.com/pytorch/fairseq/tree/master/examples/roberta). This model is case-sensitive: it 6 | makes a difference between english and English. 7 | 8 | Disclaimer: The team releasing RoBERTa did not write a model card for this model so this model card has been written by 9 | the Hugging Face team. 10 | 11 | ## Model description 12 | 13 | RoBERTa is a transformers model pretrained on a large corpus of English data in a self-supervised fashion. This means 14 | it was pretrained on the raw texts only, with no humans labelling them in any way (which is why it can use lots of 15 | publicly available data) with an automatic process to generate inputs and labels from those texts. 16 | 17 | More precisely, it was pretrained with the Masked language modeling (MLM) objective. Taking a sentence, the model 18 | randomly masks 15% of the words in the input then run the entire masked sentence through the model and has to predict 19 | the masked words. This is different from traditional recurrent neural networks (RNNs) that usually see the words one 20 | after the other, or from autoregressive models like GPT which internally mask the future tokens. It allows the model to 21 | learn a bidirectional representation of the sentence. 22 | 23 | This way, the model learns an inner representation of the English language that can then be used to extract features 24 | useful for downstream tasks: if you have a dataset of labeled sentences for instance, you can train a standard 25 | classifier using the features produced by the BERT model as inputs. 26 | 27 | ## Intended uses & limitations 28 | 29 | You can use the raw model for masked language modeling, but it's mostly intended to be fine-tuned on a downstream task. 30 | See the [model hub](https://huggingface.co/models?filter=roberta) to look for fine-tuned versions on a task that 31 | interests you. 32 | 33 | Note that this model is primarily aimed at being fine-tuned on tasks that use the whole sentence (potentially masked) 34 | to make decisions, such as sequence classification, token classification or question answering. For tasks such as text 35 | generation you should look at a model like GPT2. 36 | 37 | ### How to use 38 | 39 | You can use this model directly with a pipeline for masked language modeling: 40 | 41 | ```python 42 | >>> from transformers import pipeline 43 | >>> unmasker = pipeline('fill-mask', model='roberta-base') 44 | >>> unmasker("Hello I'm a model.") 45 | 46 | [{'sequence': "Hello I'm a male model.", 47 | 'score': 0.3306540250778198, 48 | 'token': 2943, 49 | 'token_str': 'Ġmale'}, 50 | {'sequence': "Hello I'm a female model.", 51 | 'score': 0.04655390977859497, 52 | 'token': 2182, 53 | 'token_str': 'Ġfemale'}, 54 | {'sequence': "Hello I'm a professional model.", 55 | 'score': 0.04232972860336304, 56 | 'token': 2038, 57 | 'token_str': 'Ġprofessional'}, 58 | {'sequence': "Hello I'm a fashion model.", 59 | 'score': 0.037216778844594955, 60 | 'token': 2734, 61 | 'token_str': 'Ġfashion'}, 62 | {'sequence': "Hello I'm a Russian model.", 63 | 'score': 0.03253649175167084, 64 | 'token': 1083, 65 | 'token_str': 'ĠRussian'}] 66 | ``` 67 | 68 | Here is how to use this model to get the features of a given text in PyTorch: 69 | 70 | ```python 71 | from transformers import RobertaTokenizer, RobertaModel 72 | tokenizer = RobertaTokenizer.from_pretrained('roberta-base') 73 | model = RobertaModel.from_pretrained('roberta-base') 74 | text = "Replace me by any text you'd like." 75 | encoded_input = tokenizer(text, return_tensors='pt') 76 | output = model(**encoded_input) 77 | ``` 78 | 79 | and in TensorFlow: 80 | 81 | ```python 82 | from transformers import RobertaTokenizer, TFRobertaModel 83 | tokenizer = RobertaTokenizer.from_pretrained('roberta-base') 84 | model = TFRobertaModel.from_pretrained('roberta-base') 85 | text = "Replace me by any text you'd like." 86 | encoded_input = tokenizer(text, return_tensors='tf') 87 | output = model(encoded_input) 88 | ``` 89 | 90 | ### Limitations and bias 91 | 92 | The training data used for this model contains a lot of unfiltered content from the internet, which is far from 93 | neutral. Therefore, the model can have biased predictions: 94 | 95 | ```python 96 | >>> from transformers import pipeline 97 | >>> unmasker = pipeline('fill-mask', model='roberta-base') 98 | >>> unmasker("The man worked as a .") 99 | 100 | [{'sequence': 'The man worked as a mechanic.', 101 | 'score': 0.08702439814805984, 102 | 'token': 25682, 103 | 'token_str': 'Ġmechanic'}, 104 | {'sequence': 'The man worked as a waiter.', 105 | 'score': 0.0819653645157814, 106 | 'token': 38233, 107 | 'token_str': 'Ġwaiter'}, 108 | {'sequence': 'The man worked as a butcher.', 109 | 'score': 0.073323555290699, 110 | 'token': 32364, 111 | 'token_str': 'Ġbutcher'}, 112 | {'sequence': 'The man worked as a miner.', 113 | 'score': 0.046322137117385864, 114 | 'token': 18678, 115 | 'token_str': 'Ġminer'}, 116 | {'sequence': 'The man worked as a guard.', 117 | 'score': 0.040150221437215805, 118 | 'token': 2510, 119 | 'token_str': 'Ġguard'}] 120 | 121 | >>> unmasker("The Black woman worked as a .") 122 | 123 | [{'sequence': 'The Black woman worked as a waitress.', 124 | 'score': 0.22177888453006744, 125 | 'token': 35698, 126 | 'token_str': 'Ġwaitress'}, 127 | {'sequence': 'The Black woman worked as a prostitute.', 128 | 'score': 0.19288744032382965, 129 | 'token': 36289, 130 | 'token_str': 'Ġprostitute'}, 131 | {'sequence': 'The Black woman worked as a maid.', 132 | 'score': 0.06498628109693527, 133 | 'token': 29754, 134 | 'token_str': 'Ġmaid'}, 135 | {'sequence': 'The Black woman worked as a secretary.', 136 | 'score': 0.05375480651855469, 137 | 'token': 2971, 138 | 'token_str': 'Ġsecretary'}, 139 | {'sequence': 'The Black woman worked as a nurse.', 140 | 'score': 0.05245552211999893, 141 | 'token': 9008, 142 | 'token_str': 'Ġnurse'}] 143 | ``` 144 | 145 | This bias will also affect all fine-tuned versions of this model. 146 | 147 | ## Training data 148 | 149 | The RoBERTa model was pretrained on the reunion of five datasets: 150 | - [BookCorpus](https://yknzhu.wixsite.com/mbweb), a dataset consisting of 11,038 unpublished books; 151 | - [English Wikipedia](https://en.wikipedia.org/wiki/English_Wikipedia) (excluding lists, tables and headers) ; 152 | - [CC-News](https://commoncrawl.org/2016/10/news-dataset-available/), a dataset containing 63 millions English news 153 | articles crawled between September 2016 and February 2019. 154 | - [OpenWebText](https://github.com/jcpeterson/openwebtext), an opensource recreation of the WebText dataset used to 155 | train GPT-2, 156 | - [Stories](https://arxiv.org/abs/1806.02847) a dataset containing a subset of CommonCrawl data filtered to match the 157 | story-like style of Winograd schemas. 158 | 159 | Together these datasets weigh 160GB of text. 160 | 161 | ## Training procedure 162 | 163 | ### Preprocessing 164 | 165 | The texts are tokenized using a byte version of Byte-Pair Encoding (BPE) and a vocabulary size of 50,000. The inputs of 166 | the model take pieces of 512 contiguous tokens that may span over documents. The beginning of a new document is marked 167 | with `` and the end of one by `` 168 | 169 | The details of the masking procedure for each sentence are the following: 170 | - 15% of the tokens are masked. 171 | - In 80% of the cases, the masked tokens are replaced by ``. 172 | - In 10% of the cases, the masked tokens are replaced by a random token (different) from the one they replace. 173 | - In the 10% remaining cases, the masked tokens are left as is. 174 | 175 | Contrary to BERT, the masking is done dynamically during pretraining (e.g., it changes at each epoch and is not fixed). 176 | 177 | ### Pretraining 178 | 179 | The model was trained on 1024 V100 GPUs for 500K steps with a batch size of 8K and a sequence length of 512. The 180 | optimizer used is Adam with a learning rate of 6e-4, \\(\beta_{1} = 0.9\\), \\(\beta_{2} = 0.98\\) and 181 | \\(\epsilon = 1e-6\\), a weight decay of 0.01, learning rate warmup for 24,000 steps and linear decay of the learning 182 | rate after. 183 | 184 | ## Evaluation results 185 | 186 | When fine-tuned on downstream tasks, this model achieves the following results: 187 | 188 | Glue test results: 189 | 190 | | Task | MNLI | QQP | QNLI | SST-2 | CoLA | STS-B | MRPC | RTE | 191 | |:----:|:----:|:----:|:----:|:-----:|:----:|:-----:|:----:|:----:| 192 | | | 87.6 | 91.9 | 92.8 | 94.8 | 63.6 | 91.2 | 90.2 | 78.7 | 193 | 194 | 195 | ### BibTeX entry and citation info 196 | 197 | ```bibtex 198 | @article{DBLP:journals/corr/abs-1907-11692, 199 | author = {Yinhan Liu and 200 | Myle Ott and 201 | Naman Goyal and 202 | Jingfei Du and 203 | Mandar Joshi and 204 | Danqi Chen and 205 | Omer Levy and 206 | Mike Lewis and 207 | Luke Zettlemoyer and 208 | Veselin Stoyanov}, 209 | title = {RoBERTa: {A} Robustly Optimized {BERT} Pretraining Approach}, 210 | journal = {CoRR}, 211 | volume = {abs/1907.11692}, 212 | year = {2019}, 213 | url = {http://arxiv.org/abs/1907.11692}, 214 | archivePrefix = {arXiv}, 215 | eprint = {1907.11692}, 216 | timestamp = {Thu, 01 Aug 2019 08:59:33 +0200}, 217 | biburl = {https://dblp.org/rec/journals/corr/abs-1907-11692.bib}, 218 | bibsource = {dblp computer science bibliography, https://dblp.org} 219 | } 220 | ``` 221 | 222 | 223 | 224 | -------------------------------------------------------------------------------- /model_scraping/cards/roberta-large.md: -------------------------------------------------------------------------------- 1 | # RoBERTa large model 2 | 3 | Pretrained model on English language using a masked language modeling (MLM) objective. It was introduced in 4 | [this paper](https://arxiv.org/abs/1907.11692) and first released in 5 | [this repository](https://github.com/pytorch/fairseq/tree/master/examples/roberta). This model is case-sensitive: it 6 | makes a difference between english and English. 7 | 8 | Disclaimer: The team releasing RoBERTa did not write a model card for this model so this model card has been written by 9 | the Hugging Face team. 10 | 11 | ## Model description 12 | 13 | RoBERTa is a transformers model pretrained on a large corpus of English data in a self-supervised fashion. This means 14 | it was pretrained on the raw texts only, with no humans labelling them in any way (which is why it can use lots of 15 | publicly available data) with an automatic process to generate inputs and labels from those texts. 16 | 17 | More precisely, it was pretrained with the Masked language modeling (MLM) objective. Taking a sentence, the model 18 | randomly masks 15% of the words in the input then run the entire masked sentence through the model and has to predict 19 | the masked words. This is different from traditional recurrent neural networks (RNNs) that usually see the words one 20 | after the other, or from autoregressive models like GPT which internally mask the future tokens. It allows the model to 21 | learn a bidirectional representation of the sentence. 22 | 23 | This way, the model learns an inner representation of the English language that can then be used to extract features 24 | useful for downstream tasks: if you have a dataset of labeled sentences for instance, you can train a standard 25 | classifier using the features produced by the BERT model as inputs. 26 | 27 | ## Intended uses & limitations 28 | 29 | You can use the raw model for masked language modeling, but it's mostly intended to be fine-tuned on a downstream task. 30 | See the [model hub](https://huggingface.co/models?filter=roberta) to look for fine-tuned versions on a task that 31 | interests you. 32 | 33 | Note that this model is primarily aimed at being fine-tuned on tasks that use the whole sentence (potentially masked) 34 | to make decisions, such as sequence classification, token classification or question answering. For tasks such as text 35 | generation you should look at model like GPT2. 36 | 37 | ### How to use 38 | 39 | You can use this model directly with a pipeline for masked language modeling: 40 | 41 | ```python 42 | >>> from transformers import pipeline 43 | >>> unmasker = pipeline('fill-mask', model='roberta-large') 44 | >>> unmasker("Hello I'm a model.") 45 | 46 | [{'sequence': "Hello I'm a male model.", 47 | 'score': 0.3317350447177887, 48 | 'token': 2943, 49 | 'token_str': 'Ġmale'}, 50 | {'sequence': "Hello I'm a fashion model.", 51 | 'score': 0.14171843230724335, 52 | 'token': 2734, 53 | 'token_str': 'Ġfashion'}, 54 | {'sequence': "Hello I'm a professional model.", 55 | 'score': 0.04291723668575287, 56 | 'token': 2038, 57 | 'token_str': 'Ġprofessional'}, 58 | {'sequence': "Hello I'm a freelance model.", 59 | 'score': 0.02134818211197853, 60 | 'token': 18150, 61 | 'token_str': 'Ġfreelance'}, 62 | {'sequence': "Hello I'm a young model.", 63 | 'score': 0.021098261699080467, 64 | 'token': 664, 65 | 'token_str': 'Ġyoung'}] 66 | ``` 67 | 68 | Here is how to use this model to get the features of a given text in PyTorch: 69 | 70 | ```python 71 | from transformers import RobertaTokenizer, RobertaModel 72 | tokenizer = RobertaTokenizer.from_pretrained('roberta-large') 73 | model = RobertaModel.from_pretrained('roberta-large') 74 | text = "Replace me by any text you'd like." 75 | encoded_input = tokenizer(text, return_tensors='pt') 76 | output = model(**encoded_input) 77 | ``` 78 | 79 | and in TensorFlow: 80 | 81 | ```python 82 | from transformers import RobertaTokenizer, TFRobertaModel 83 | tokenizer = RobertaTokenizer.from_pretrained('roberta-large') 84 | model = TFRobertaModel.from_pretrained('roberta-large') 85 | text = "Replace me by any text you'd like." 86 | encoded_input = tokenizer(text, return_tensors='tf') 87 | output = model(encoded_input) 88 | ``` 89 | 90 | ### Limitations and bias 91 | 92 | The training data used for this model contains a lot of unfiltered content from the internet, which is far from 93 | neutral. Therefore, the model can have biased predictions: 94 | 95 | ```python 96 | >>> from transformers import pipeline 97 | >>> unmasker = pipeline('fill-mask', model='roberta-large') 98 | >>> unmasker("The man worked as a .") 99 | 100 | [{'sequence': 'The man worked as a mechanic.', 101 | 'score': 0.08260300755500793, 102 | 'token': 25682, 103 | 'token_str': 'Ġmechanic'}, 104 | {'sequence': 'The man worked as a driver.', 105 | 'score': 0.05736079439520836, 106 | 'token': 1393, 107 | 'token_str': 'Ġdriver'}, 108 | {'sequence': 'The man worked as a teacher.', 109 | 'score': 0.04709019884467125, 110 | 'token': 3254, 111 | 'token_str': 'Ġteacher'}, 112 | {'sequence': 'The man worked as a bartender.', 113 | 'score': 0.04641604796051979, 114 | 'token': 33080, 115 | 'token_str': 'Ġbartender'}, 116 | {'sequence': 'The man worked as a waiter.', 117 | 'score': 0.04239227622747421, 118 | 'token': 38233, 119 | 'token_str': 'Ġwaiter'}] 120 | 121 | >>> unmasker("The woman worked as a .") 122 | 123 | [{'sequence': 'The woman worked as a nurse.', 124 | 'score': 0.2667474150657654, 125 | 'token': 9008, 126 | 'token_str': 'Ġnurse'}, 127 | {'sequence': 'The woman worked as a waitress.', 128 | 'score': 0.12280137836933136, 129 | 'token': 35698, 130 | 'token_str': 'Ġwaitress'}, 131 | {'sequence': 'The woman worked as a teacher.', 132 | 'score': 0.09747499972581863, 133 | 'token': 3254, 134 | 'token_str': 'Ġteacher'}, 135 | {'sequence': 'The woman worked as a secretary.', 136 | 'score': 0.05783602222800255, 137 | 'token': 2971, 138 | 'token_str': 'Ġsecretary'}, 139 | {'sequence': 'The woman worked as a cleaner.', 140 | 'score': 0.05576248839497566, 141 | 'token': 16126, 142 | 'token_str': 'Ġcleaner'}] 143 | ``` 144 | 145 | This bias will also affect all fine-tuned versions of this model. 146 | 147 | ## Training data 148 | 149 | The RoBERTa model was pretrained on the reunion of five datasets: 150 | - [BookCorpus](https://yknzhu.wixsite.com/mbweb), a dataset consisting of 11,038 unpublished books; 151 | - [English Wikipedia](https://en.wikipedia.org/wiki/English_Wikipedia) (excluding lists, tables and headers) ; 152 | - [CC-News](https://commoncrawl.org/2016/10/news-dataset-available/), a dataset containing 63 millions English news 153 | articles crawled between September 2016 and February 2019. 154 | - [OpenWebText](https://github.com/jcpeterson/openwebtext), an opensource recreation of the WebText dataset used to 155 | train GPT-2, 156 | - [Stories](https://arxiv.org/abs/1806.02847) a dataset containing a subset of CommonCrawl data filtered to match the 157 | story-like style of Winograd schemas. 158 | 159 | Together theses datasets weight 160GB of text. 160 | 161 | ## Training procedure 162 | 163 | ### Preprocessing 164 | 165 | The texts are tokenized using a byte version of Byte-Pair Encoding (BPE) and a vocabulary size of 50,000. The inputs of 166 | the model take pieces of 512 contiguous token that may span over documents. The beginning of a new document is marked 167 | with `` and the end of one by `` 168 | 169 | The details of the masking procedure for each sentence are the following: 170 | - 15% of the tokens are masked. 171 | - In 80% of the cases, the masked tokens are replaced by ``. 172 | 173 | - In 10% of the cases, the masked tokens are replaced by a random token (different) from the one they replace. 174 | - In the 10% remaining cases, the masked tokens are left as is. 175 | 176 | Contrary to BERT, the masking is done dynamically during pretraining (e.g., it changes at each epoch and is not fixed). 177 | 178 | ### Pretraining 179 | 180 | The model was trained on 1024 V100 GPUs for 500K steps with a batch size of 8K and a sequence length of 512. The 181 | optimizer used is Adam with a learning rate of 4e-4, \\(\beta_{1} = 0.9\\), \\(\beta_{2} = 0.98\\) and 182 | \\(\epsilon = 1e-6\\), a weight decay of 0.01, learning rate warmup for 30,000 steps and linear decay of the learning 183 | rate after. 184 | 185 | ## Evaluation results 186 | 187 | When fine-tuned on downstream tasks, this model achieves the following results: 188 | 189 | Glue test results: 190 | 191 | | Task | MNLI | QQP | QNLI | SST-2 | CoLA | STS-B | MRPC | RTE | 192 | |:----:|:----:|:----:|:----:|:-----:|:----:|:-----:|:----:|:----:| 193 | | | 90.2 | 92.2 | 94.7 | 96.4 | 68.0 | 96.4 | 90.9 | 86.6 | 194 | 195 | 196 | ### BibTeX entry and citation info 197 | 198 | ```bibtex 199 | @article{DBLP:journals/corr/abs-1907-11692, 200 | author = {Yinhan Liu and 201 | Myle Ott and 202 | Naman Goyal and 203 | Jingfei Du and 204 | Mandar Joshi and 205 | Danqi Chen and 206 | Omer Levy and 207 | Mike Lewis and 208 | Luke Zettlemoyer and 209 | Veselin Stoyanov}, 210 | title = {RoBERTa: {A} Robustly Optimized {BERT} Pretraining Approach}, 211 | journal = {CoRR}, 212 | volume = {abs/1907.11692}, 213 | year = {2019}, 214 | url = {http://arxiv.org/abs/1907.11692}, 215 | archivePrefix = {arXiv}, 216 | eprint = {1907.11692}, 217 | timestamp = {Thu, 01 Aug 2019 08:59:33 +0200}, 218 | biburl = {https://dblp.org/rec/journals/corr/abs-1907-11692.bib}, 219 | bibsource = {dblp computer science bibliography, https://dblp.org} 220 | } 221 | ``` 222 | 223 | 224 | 225 | -------------------------------------------------------------------------------- /model_scraping/cards/runwayml___stable-diffusion-v1-5.md: -------------------------------------------------------------------------------- 1 | # Stable Diffusion v1-5 Model Card 2 | 3 | Stable Diffusion is a latent text-to-image diffusion model capable of generating photo-realistic images given any text input. 4 | For more information about how Stable Diffusion functions, please have a look at [🤗's Stable Diffusion blog](https://huggingface.co/blog/stable_diffusion). 5 | 6 | The **Stable-Diffusion-v1-5** checkpoint was initialized with the weights of the [Stable-Diffusion-v1-2](https:/steps/huggingface.co/CompVis/stable-diffusion-v1-2) 7 | checkpoint and subsequently fine-tuned on 595k steps at resolution 512x512 on "laion-aesthetics v2 5+" and 10% dropping of the text-conditioning to improve [classifier-free guidance sampling](https://arxiv.org/abs/2207.12598). 8 | 9 | You can use this both with the [🧨Diffusers library](https://github.com/huggingface/diffusers) and the [RunwayML GitHub repository](https://github.com/runwayml/stable-diffusion). 10 | 11 | ### Diffusers 12 | ```py 13 | from diffusers import StableDiffusionPipeline 14 | import torch 15 | 16 | model_id = "runwayml/stable-diffusion-v1-5" 17 | pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16) 18 | pipe = pipe.to("cuda") 19 | 20 | prompt = "a photo of an astronaut riding a horse on mars" 21 | image = pipe(prompt).images[0] 22 | 23 | image.save("astronaut_rides_horse.png") 24 | ``` 25 | For more detailed instructions, use-cases and examples in JAX follow the instructions [here](https://github.com/huggingface/diffusers#text-to-image-generation-with-stable-diffusion) 26 | 27 | ### Original GitHub Repository 28 | 29 | 1. Download the weights 30 | - [v1-5-pruned-emaonly.ckpt](https://huggingface.co/runwayml/stable-diffusion-v1-5/resolve/main/v1-5-pruned-emaonly.ckpt) - 4.27GB, ema-only weight. uses less VRAM - suitable for inference 31 | - [v1-5-pruned.ckpt](https://huggingface.co/runwayml/stable-diffusion-v1-5/resolve/main/v1-5-pruned.ckpt) - 7.7GB, ema+non-ema weights. uses more VRAM - suitable for fine-tuning 32 | 33 | 2. Follow instructions [here](https://github.com/runwayml/stable-diffusion). 34 | 35 | ## Model Details 36 | - **Developed by:** Robin Rombach, Patrick Esser 37 | - **Model type:** Diffusion-based text-to-image generation model 38 | - **Language(s):** English 39 | - **License:** [The CreativeML OpenRAIL M license](https://huggingface.co/spaces/CompVis/stable-diffusion-license) is an [Open RAIL M license](https://www.licenses.ai/blog/2022/8/18/naming-convention-of-responsible-ai-licenses), adapted from the work that [BigScience](https://bigscience.huggingface.co/) and [the RAIL Initiative](https://www.licenses.ai/) are jointly carrying in the area of responsible AI licensing. See also [the article about the BLOOM Open RAIL license](https://bigscience.huggingface.co/blog/the-bigscience-rail-license) on which our license is based. 40 | - **Model Description:** This is a model that can be used to generate and modify images based on text prompts. It is a [Latent Diffusion Model](https://arxiv.org/abs/2112.10752) that uses a fixed, pretrained text encoder ([CLIP ViT-L/14](https://arxiv.org/abs/2103.00020)) as suggested in the [Imagen paper](https://arxiv.org/abs/2205.11487). 41 | - **Resources for more information:** [GitHub Repository](https://github.com/CompVis/stable-diffusion), [Paper](https://arxiv.org/abs/2112.10752). 42 | - **Cite as:** 43 | 44 | @InProceedings{Rombach_2022_CVPR, 45 | author = {Rombach, Robin and Blattmann, Andreas and Lorenz, Dominik and Esser, Patrick and Ommer, Bj\"orn}, 46 | title = {High-Resolution Image Synthesis With Latent Diffusion Models}, 47 | booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, 48 | month = {June}, 49 | year = {2022}, 50 | pages = {10684-10695} 51 | } 52 | 53 | # Uses 54 | 55 | ## Direct Use 56 | The model is intended for research purposes only. Possible research areas and 57 | tasks include 58 | 59 | - Safe deployment of models which have the potential to generate harmful content. 60 | - Probing and understanding the limitations and biases of generative models. 61 | - Generation of artworks and use in design and other artistic processes. 62 | - Applications in educational or creative tools. 63 | - Research on generative models. 64 | 65 | Excluded uses are described below. 66 | 67 | ### Misuse, Malicious Use, and Out-of-Scope Use 68 | _Note: This section is taken from the [DALLE-MINI model card](https://huggingface.co/dalle-mini/dalle-mini), but applies in the same way to Stable Diffusion v1_. 69 | 70 | 71 | The model should not be used to intentionally create or disseminate images that create hostile or alienating environments for people. This includes generating images that people would foreseeably find disturbing, distressing, or offensive; or content that propagates historical or current stereotypes. 72 | 73 | #### Out-of-Scope Use 74 | The model was not trained to be factual or true representations of people or events, and therefore using the model to generate such content is out-of-scope for the abilities of this model. 75 | 76 | #### Misuse and Malicious Use 77 | Using the model to generate content that is cruel to individuals is a misuse of this model. This includes, but is not limited to: 78 | 79 | - Generating demeaning, dehumanizing, or otherwise harmful representations of people or their environments, cultures, religions, etc. 80 | - Intentionally promoting or propagating discriminatory content or harmful stereotypes. 81 | - Impersonating individuals without their consent. 82 | - Sexual content without consent of the people who might see it. 83 | - Mis- and disinformation 84 | - Representations of egregious violence and gore 85 | - Sharing of copyrighted or licensed material in violation of its terms of use. 86 | - Sharing content that is an alteration of copyrighted or licensed material in violation of its terms of use. 87 | 88 | ## Limitations and Bias 89 | 90 | ### Limitations 91 | 92 | - The model does not achieve perfect photorealism 93 | - The model cannot render legible text 94 | - The model does not perform well on more difficult tasks which involve compositionality, such as rendering an image corresponding to “A red cube on top of a blue sphere” 95 | - Faces and people in general may not be generated properly. 96 | - The model was trained mainly with English captions and will not work as well in other languages. 97 | - The autoencoding part of the model is lossy 98 | - The model was trained on a large-scale dataset 99 | [LAION-5B](https://laion.ai/blog/laion-5b/) which contains adult material 100 | and is not fit for product use without additional safety mechanisms and 101 | considerations. 102 | - No additional measures were used to deduplicate the dataset. As a result, we observe some degree of memorization for images that are duplicated in the training data. 103 | The training data can be searched at [https://rom1504.github.io/clip-retrieval/](https://rom1504.github.io/clip-retrieval/) to possibly assist in the detection of memorized images. 104 | 105 | ### Bias 106 | 107 | While the capabilities of image generation models are impressive, they can also reinforce or exacerbate social biases. 108 | Stable Diffusion v1 was trained on subsets of [LAION-2B(en)](https://laion.ai/blog/laion-5b/), 109 | which consists of images that are primarily limited to English descriptions. 110 | Texts and images from communities and cultures that use other languages are likely to be insufficiently accounted for. 111 | This affects the overall output of the model, as white and western cultures are often set as the default. Further, the 112 | ability of the model to generate content with non-English prompts is significantly worse than with English-language prompts. 113 | 114 | ### Safety Module 115 | 116 | The intended use of this model is with the [Safety Checker](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/safety_checker.py) in Diffusers. 117 | This checker works by checking model outputs against known hard-coded NSFW concepts. 118 | The concepts are intentionally hidden to reduce the likelihood of reverse-engineering this filter. 119 | Specifically, the checker compares the class probability of harmful concepts in the embedding space of the `CLIPTextModel` *after generation* of the images. 120 | The concepts are passed into the model with the generated image and compared to a hand-engineered weight for each NSFW concept. 121 | 122 | 123 | ## Training 124 | 125 | **Training Data** 126 | The model developers used the following dataset for training the model: 127 | 128 | - LAION-2B (en) and subsets thereof (see next section) 129 | 130 | **Training Procedure** 131 | Stable Diffusion v1-5 is a latent diffusion model which combines an autoencoder with a diffusion model that is trained in the latent space of the autoencoder. During training, 132 | 133 | - Images are encoded through an encoder, which turns images into latent representations. The autoencoder uses a relative downsampling factor of 8 and maps images of shape H x W x 3 to latents of shape H/f x W/f x 4 134 | - Text prompts are encoded through a ViT-L/14 text-encoder. 135 | - The non-pooled output of the text encoder is fed into the UNet backbone of the latent diffusion model via cross-attention. 136 | - The loss is a reconstruction objective between the noise that was added to the latent and the prediction made by the UNet. 137 | 138 | Currently six Stable Diffusion checkpoints are provided, which were trained as follows. 139 | - [`stable-diffusion-v1-1`](https://huggingface.co/CompVis/stable-diffusion-v1-1): 237,000 steps at resolution `256x256` on [laion2B-en](https://huggingface.co/datasets/laion/laion2B-en). 140 | 194,000 steps at resolution `512x512` on [laion-high-resolution](https://huggingface.co/datasets/laion/laion-high-resolution) (170M examples from LAION-5B with resolution `>= 1024x1024`). 141 | - [`stable-diffusion-v1-2`](https://huggingface.co/CompVis/stable-diffusion-v1-2): Resumed from `stable-diffusion-v1-1`. 142 | 515,000 steps at resolution `512x512` on "laion-improved-aesthetics" (a subset of laion2B-en, 143 | filtered to images with an original size `>= 512x512`, estimated aesthetics score `> 5.0`, and an estimated watermark probability `< 0.5`. The watermark estimate is from the LAION-5B metadata, the aesthetics score is estimated using an [improved aesthetics estimator](https://github.com/christophschuhmann/improved-aesthetic-predictor)). 144 | - [`stable-diffusion-v1-3`](https://huggingface.co/CompVis/stable-diffusion-v1-3): Resumed from `stable-diffusion-v1-2` - 195,000 steps at resolution `512x512` on "laion-improved-aesthetics" and 10 % dropping of the text-conditioning to improve [classifier-free guidance sampling](https://arxiv.org/abs/2207.12598). 145 | - [`stable-diffusion-v1-4`](https://huggingface.co/CompVis/stable-diffusion-v1-4) Resumed from `stable-diffusion-v1-2` - 225,000 steps at resolution `512x512` on "laion-aesthetics v2 5+" and 10 % dropping of the text-conditioning to improve [classifier-free guidance sampling](https://arxiv.org/abs/2207.12598). 146 | - [`stable-diffusion-v1-5`](https://huggingface.co/runwayml/stable-diffusion-v1-5) Resumed from `stable-diffusion-v1-2` - 595,000 steps at resolution `512x512` on "laion-aesthetics v2 5+" and 10 % dropping of the text-conditioning to improve [classifier-free guidance sampling](https://arxiv.org/abs/2207.12598). 147 | - [`stable-diffusion-inpainting`](https://huggingface.co/runwayml/stable-diffusion-inpainting) Resumed from `stable-diffusion-v1-5` - then 440,000 steps of inpainting training at resolution 512x512 on “laion-aesthetics v2 5+” and 10% dropping of the text-conditioning. For inpainting, the UNet has 5 additional input channels (4 for the encoded masked-image and 1 for the mask itself) whose weights were zero-initialized after restoring the non-inpainting checkpoint. During training, we generate synthetic masks and in 25% mask everything. 148 | 149 | - **Hardware:** 32 x 8 x A100 GPUs 150 | - **Optimizer:** AdamW 151 | - **Gradient Accumulations**: 2 152 | - **Batch:** 32 x 8 x 2 x 4 = 2048 153 | - **Learning rate:** warmup to 0.0001 for 10,000 steps and then kept constant 154 | 155 | ## Evaluation Results 156 | Evaluations with different classifier-free guidance scales (1.5, 2.0, 3.0, 4.0, 157 | 5.0, 6.0, 7.0, 8.0) and 50 PNDM/PLMS sampling 158 | steps show the relative improvements of the checkpoints: 159 | 160 | ![pareto](https://huggingface.co/CompVis/stable-diffusion/resolve/main/v1-1-to-v1-5.png) 161 | 162 | Evaluated using 50 PLMS steps and 10000 random prompts from the COCO2017 validation set, evaluated at 512x512 resolution. Not optimized for FID scores. 163 | ## Environmental Impact 164 | 165 | **Stable Diffusion v1** **Estimated Emissions** 166 | Based on that information, we estimate the following CO2 emissions using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). The hardware, runtime, cloud provider, and compute region were utilized to estimate the carbon impact. 167 | 168 | - **Hardware Type:** A100 PCIe 40GB 169 | - **Hours used:** 150000 170 | - **Cloud Provider:** AWS 171 | - **Compute Region:** US-east 172 | - **Carbon Emitted (Power consumption x Time x Carbon produced based on location of power grid):** 11250 kg CO2 eq. 173 | 174 | 175 | ## Citation 176 | 177 | ```bibtex 178 | @InProceedings{Rombach_2022_CVPR, 179 | author = {Rombach, Robin and Blattmann, Andreas and Lorenz, Dominik and Esser, Patrick and Ommer, Bj\"orn}, 180 | title = {High-Resolution Image Synthesis With Latent Diffusion Models}, 181 | booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, 182 | month = {June}, 183 | year = {2022}, 184 | pages = {10684-10695} 185 | } 186 | ``` 187 | 188 | *This model card was written by: Robin Rombach and Patrick Esser and is based on the [DALL-E Mini model card](https://huggingface.co/dalle-mini/dalle-mini).* -------------------------------------------------------------------------------- /model_scraping/cards/sentence-transformers___all-MiniLM-L6-v2.md: -------------------------------------------------------------------------------- 1 | # all-MiniLM-L6-v2 2 | This is a [sentence-transformers](https://www.SBERT.net) model: It maps sentences & paragraphs to a 384 dimensional dense vector space and can be used for tasks like clustering or semantic search. 3 | 4 | ## Usage (Sentence-Transformers) 5 | Using this model becomes easy when you have [sentence-transformers](https://www.SBERT.net) installed: 6 | 7 | ``` 8 | pip install -U sentence-transformers 9 | ``` 10 | 11 | Then you can use the model like this: 12 | ```python 13 | from sentence_transformers import SentenceTransformer 14 | sentences = ["This is an example sentence", "Each sentence is converted"] 15 | 16 | model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2') 17 | embeddings = model.encode(sentences) 18 | print(embeddings) 19 | ``` 20 | 21 | ## Usage (HuggingFace Transformers) 22 | Without [sentence-transformers](https://www.SBERT.net), you can use the model like this: First, you pass your input through the transformer model, then you have to apply the right pooling-operation on-top of the contextualized word embeddings. 23 | 24 | ```python 25 | from transformers import AutoTokenizer, AutoModel 26 | import torch 27 | import torch.nn.functional as F 28 | 29 | #Mean Pooling - Take attention mask into account for correct averaging 30 | def mean_pooling(model_output, attention_mask): 31 | token_embeddings = model_output[0] #First element of model_output contains all token embeddings 32 | input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() 33 | return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9) 34 | 35 | 36 | # Sentences we want sentence embeddings for 37 | sentences = ['This is an example sentence', 'Each sentence is converted'] 38 | 39 | # Load model from HuggingFace Hub 40 | tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2') 41 | model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2') 42 | 43 | # Tokenize sentences 44 | encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt') 45 | 46 | # Compute token embeddings 47 | with torch.no_grad(): 48 | model_output = model(**encoded_input) 49 | 50 | # Perform pooling 51 | sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask']) 52 | 53 | # Normalize embeddings 54 | sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1) 55 | 56 | print("Sentence embeddings:") 57 | print(sentence_embeddings) 58 | ``` 59 | 60 | ## Evaluation Results 61 | 62 | For an automated evaluation of this model, see the *Sentence Embeddings Benchmark*: [https://seb.sbert.net](https://seb.sbert.net?model_name=sentence-transformers/all-MiniLM-L6-v2) 63 | 64 | ------ 65 | 66 | ## Background 67 | 68 | The project aims to train sentence embedding models on very large sentence level datasets using a self-supervised 69 | contrastive learning objective. We used the pretrained [`nreimers/MiniLM-L6-H384-uncased`](https://huggingface.co/nreimers/MiniLM-L6-H384-uncased) model and fine-tuned in on a 70 | 1B sentence pairs dataset. We use a contrastive learning objective: given a sentence from the pair, the model should predict which out of a set of randomly sampled other sentences, was actually paired with it in our dataset. 71 | 72 | We developped this model during the 73 | [Community week using JAX/Flax for NLP & CV](https://discuss.huggingface.co/t/open-to-the-community-community-week-using-jax-flax-for-nlp-cv/7104), 74 | organized by Hugging Face. We developped this model as part of the project: 75 | [Train the Best Sentence Embedding Model Ever with 1B Training Pairs](https://discuss.huggingface.co/t/train-the-best-sentence-embedding-model-ever-with-1b-training-pairs/7354). We benefited from efficient hardware infrastructure to run the project: 7 TPUs v3-8, as well as intervention from Googles Flax, JAX, and Cloud team member about efficient deep learning frameworks. 76 | 77 | ## Intended uses 78 | 79 | Our model is intented to be used as a sentence and short paragraph encoder. Given an input text, it ouptuts a vector which captures 80 | the semantic information. The sentence vector may be used for information retrieval, clustering or sentence similarity tasks. 81 | 82 | By default, input text longer than 256 word pieces is truncated. 83 | 84 | 85 | ## Training procedure 86 | 87 | ### Pre-training 88 | 89 | We use the pretrained [`nreimers/MiniLM-L6-H384-uncased`](https://huggingface.co/nreimers/MiniLM-L6-H384-uncased) model. Please refer to the model card for more detailed information about the pre-training procedure. 90 | 91 | ### Fine-tuning 92 | 93 | We fine-tune the model using a contrastive objective. Formally, we compute the cosine similarity from each possible sentence pairs from the batch. 94 | We then apply the cross entropy loss by comparing with true pairs. 95 | 96 | #### Hyper parameters 97 | 98 | We trained ou model on a TPU v3-8. We train the model during 100k steps using a batch size of 1024 (128 per TPU core). 99 | We use a learning rate warm up of 500. The sequence length was limited to 128 tokens. We used the AdamW optimizer with 100 | a 2e-5 learning rate. The full training script is accessible in this current repository: `train_script.py`. 101 | 102 | #### Training data 103 | 104 | We use the concatenation from multiple datasets to fine-tune our model. The total number of sentence pairs is above 1 billion sentences. 105 | We sampled each dataset given a weighted probability which configuration is detailed in the `data_config.json` file. 106 | 107 | 108 | | Dataset | Paper | Number of training tuples | 109 | |--------------------------------------------------------|:----------------------------------------:|:--------------------------:| 110 | | [Reddit comments (2015-2018)](https://github.com/PolyAI-LDN/conversational-datasets/tree/master/reddit) | [paper](https://arxiv.org/abs/1904.06472) | 726,484,430 | 111 | | [S2ORC](https://github.com/allenai/s2orc) Citation pairs (Abstracts) | [paper](https://aclanthology.org/2020.acl-main.447/) | 116,288,806 | 112 | | [WikiAnswers](https://github.com/afader/oqa#wikianswers-corpus) Duplicate question pairs | [paper](https://doi.org/10.1145/2623330.2623677) | 77,427,422 | 113 | | [PAQ](https://github.com/facebookresearch/PAQ) (Question, Answer) pairs | [paper](https://arxiv.org/abs/2102.07033) | 64,371,441 | 114 | | [S2ORC](https://github.com/allenai/s2orc) Citation pairs (Titles) | [paper](https://aclanthology.org/2020.acl-main.447/) | 52,603,982 | 115 | | [S2ORC](https://github.com/allenai/s2orc) (Title, Abstract) | [paper](https://aclanthology.org/2020.acl-main.447/) | 41,769,185 | 116 | | [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) (Title, Body) pairs | - | 25,316,456 | 117 | | [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) (Title+Body, Answer) pairs | - | 21,396,559 | 118 | | [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) (Title, Answer) pairs | - | 21,396,559 | 119 | | [MS MARCO](https://microsoft.github.io/msmarco/) triplets | [paper](https://doi.org/10.1145/3404835.3462804) | 9,144,553 | 120 | | [GOOAQ: Open Question Answering with Diverse Answer Types](https://github.com/allenai/gooaq) | [paper](https://arxiv.org/pdf/2104.08727.pdf) | 3,012,496 | 121 | | [Yahoo Answers](https://www.kaggle.com/soumikrakshit/yahoo-answers-dataset) (Title, Answer) | [paper](https://proceedings.neurips.cc/paper/2015/hash/250cf8b51c773f3f8dc8b4be867a9a02-Abstract.html) | 1,198,260 | 122 | | [Code Search](https://huggingface.co/datasets/code_search_net) | - | 1,151,414 | 123 | | [COCO](https://cocodataset.org/#home) Image captions | [paper](https://link.springer.com/chapter/10.1007%2F978-3-319-10602-1_48) | 828,395| 124 | | [SPECTER](https://github.com/allenai/specter) citation triplets | [paper](https://doi.org/10.18653/v1/2020.acl-main.207) | 684,100 | 125 | | [Yahoo Answers](https://www.kaggle.com/soumikrakshit/yahoo-answers-dataset) (Question, Answer) | [paper](https://proceedings.neurips.cc/paper/2015/hash/250cf8b51c773f3f8dc8b4be867a9a02-Abstract.html) | 681,164 | 126 | | [Yahoo Answers](https://www.kaggle.com/soumikrakshit/yahoo-answers-dataset) (Title, Question) | [paper](https://proceedings.neurips.cc/paper/2015/hash/250cf8b51c773f3f8dc8b4be867a9a02-Abstract.html) | 659,896 | 127 | | [SearchQA](https://huggingface.co/datasets/search_qa) | [paper](https://arxiv.org/abs/1704.05179) | 582,261 | 128 | | [Eli5](https://huggingface.co/datasets/eli5) | [paper](https://doi.org/10.18653/v1/p19-1346) | 325,475 | 129 | | [Flickr 30k](https://shannon.cs.illinois.edu/DenotationGraph/) | [paper](https://transacl.org/ojs/index.php/tacl/article/view/229/33) | 317,695 | 130 | | [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) Duplicate questions (titles) | | 304,525 | 131 | | AllNLI ([SNLI](https://nlp.stanford.edu/projects/snli/) and [MultiNLI](https://cims.nyu.edu/~sbowman/multinli/) | [paper SNLI](https://doi.org/10.18653/v1/d15-1075), [paper MultiNLI](https://doi.org/10.18653/v1/n18-1101) | 277,230 | 132 | | [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) Duplicate questions (bodies) | | 250,519 | 133 | | [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) Duplicate questions (titles+bodies) | | 250,460 | 134 | | [Sentence Compression](https://github.com/google-research-datasets/sentence-compression) | [paper](https://www.aclweb.org/anthology/D13-1155/) | 180,000 | 135 | | [Wikihow](https://github.com/pvl/wikihow_pairs_dataset) | [paper](https://arxiv.org/abs/1810.09305) | 128,542 | 136 | | [Altlex](https://github.com/chridey/altlex/) | [paper](https://aclanthology.org/P16-1135.pdf) | 112,696 | 137 | | [Quora Question Triplets](https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs) | - | 103,663 | 138 | | [Simple Wikipedia](https://cs.pomona.edu/~dkauchak/simplification/) | [paper](https://www.aclweb.org/anthology/P11-2117/) | 102,225 | 139 | | [Natural Questions (NQ)](https://ai.google.com/research/NaturalQuestions) | [paper](https://transacl.org/ojs/index.php/tacl/article/view/1455) | 100,231 | 140 | | [SQuAD2.0](https://rajpurkar.github.io/SQuAD-explorer/) | [paper](https://aclanthology.org/P18-2124.pdf) | 87,599 | 141 | | [TriviaQA](https://huggingface.co/datasets/trivia_qa) | - | 73,346 | 142 | | **Total** | | **1,170,060,424** | -------------------------------------------------------------------------------- /model_scraping/cards/t5-base.md: -------------------------------------------------------------------------------- 1 | # Model Card for T5 Base 2 | 3 | ![model image](https://camo.githubusercontent.com/623b4dea0b653f2ad3f36c71ebfe749a677ac0a1/68747470733a2f2f6d69726f2e6d656469756d2e636f6d2f6d61782f343030362f312a44304a31674e51663876727255704b657944387750412e706e67) 4 | 5 | # Table of Contents 6 | 7 | 1. [Model Details](#model-details) 8 | 2. [Uses](#uses) 9 | 3. [Bias, Risks, and Limitations](#bias-risks-and-limitations) 10 | 4. [Training Details](#training-details) 11 | 5. [Evaluation](#evaluation) 12 | 6. [Environmental Impact](#environmental-impact) 13 | 7. [Citation](#citation) 14 | 8. [Model Card Authors](#model-card-authors) 15 | 9. [How To Get Started With the Model](#how-to-get-started-with-the-model) 16 | 17 | # Model Details 18 | 19 | ## Model Description 20 | 21 | The developers of the Text-To-Text Transfer Transformer (T5) [write](https://ai.googleblog.com/2020/02/exploring-transfer-learning-with-t5.html): 22 | 23 | > With T5, we propose reframing all NLP tasks into a unified text-to-text-format where the input and output are always text strings, in contrast to BERT-style models that can only output either a class label or a span of the input. Our text-to-text framework allows us to use the same model, loss function, and hyperparameters on any NLP task. 24 | 25 | T5-Base is the checkpoint with 220 million parameters. 26 | 27 | - **Developed by:** Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, Peter J. Liu. See [associated paper](https://jmlr.org/papers/volume21/20-074/20-074.pdf) and [GitHub repo](https://github.com/google-research/text-to-text-transfer-transformer#released-model-checkpoints) 28 | - **Model type:** Language model 29 | - **Language(s) (NLP):** English, French, Romanian, German 30 | - **License:** Apache 2.0 31 | - **Related Models:** [All T5 Checkpoints](https://huggingface.co/models?search=t5) 32 | - **Resources for more information:** 33 | - [Research paper](https://jmlr.org/papers/volume21/20-074/20-074.pdf) 34 | - [Google's T5 Blog Post](https://ai.googleblog.com/2020/02/exploring-transfer-learning-with-t5.html) 35 | - [GitHub Repo](https://github.com/google-research/text-to-text-transfer-transformer) 36 | - [Hugging Face T5 Docs](https://huggingface.co/docs/transformers/model_doc/t5) 37 | 38 | # Uses 39 | 40 | ## Direct Use and Downstream Use 41 | 42 | The developers write in a [blog post](https://ai.googleblog.com/2020/02/exploring-transfer-learning-with-t5.html) that the model: 43 | 44 | > Our text-to-text framework allows us to use the same model, loss function, and hyperparameters on any NLP task, including machine translation, document summarization, question answering, and classification tasks (e.g., sentiment analysis). We can even apply T5 to regression tasks by training it to predict the string representation of a number instead of the number itself. 45 | 46 | See the [blog post](https://ai.googleblog.com/2020/02/exploring-transfer-learning-with-t5.html) and [research paper](https://jmlr.org/papers/volume21/20-074/20-074.pdf) for further details. 47 | 48 | ## Out-of-Scope Use 49 | 50 | More information needed. 51 | 52 | # Bias, Risks, and Limitations 53 | 54 | More information needed. 55 | 56 | ## Recommendations 57 | 58 | More information needed. 59 | 60 | # Training Details 61 | 62 | ## Training Data 63 | 64 | The model is pre-trained on the [Colossal Clean Crawled Corpus (C4)](https://www.tensorflow.org/datasets/catalog/c4), which was developed and released in the context of the same [research paper](https://jmlr.org/papers/volume21/20-074/20-074.pdf) as T5. 65 | 66 | The model was pre-trained on a on a **multi-task mixture of unsupervised (1.) and supervised tasks (2.)**. 67 | Thereby, the following datasets were being used for (1.) and (2.): 68 | 69 | 1. **Datasets used for Unsupervised denoising objective**: 70 | 71 | - [C4](https://huggingface.co/datasets/c4) 72 | - [Wiki-DPR](https://huggingface.co/datasets/wiki_dpr) 73 | 74 | 75 | 2. **Datasets used for Supervised text-to-text language modeling objective** 76 | 77 | - Sentence acceptability judgment 78 | - CoLA [Warstadt et al., 2018](https://arxiv.org/abs/1805.12471) 79 | - Sentiment analysis 80 | - SST-2 [Socher et al., 2013](https://nlp.stanford.edu/~socherr/EMNLP2013_RNTN.pdf) 81 | - Paraphrasing/sentence similarity 82 | - MRPC [Dolan and Brockett, 2005](https://aclanthology.org/I05-5002) 83 | - STS-B [Ceret al., 2017](https://arxiv.org/abs/1708.00055) 84 | - QQP [Iyer et al., 2017](https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs) 85 | - Natural language inference 86 | - MNLI [Williams et al., 2017](https://arxiv.org/abs/1704.05426) 87 | - QNLI [Rajpurkar et al.,2016](https://arxiv.org/abs/1606.05250) 88 | - RTE [Dagan et al., 2005](https://link.springer.com/chapter/10.1007/11736790_9) 89 | - CB [De Marneff et al., 2019](https://semanticsarchive.net/Archive/Tg3ZGI2M/Marneffe.pdf) 90 | - Sentence completion 91 | - COPA [Roemmele et al., 2011](https://www.researchgate.net/publication/221251392_Choice_of_Plausible_Alternatives_An_Evaluation_of_Commonsense_Causal_Reasoning) 92 | - Word sense disambiguation 93 | - WIC [Pilehvar and Camacho-Collados, 2018](https://arxiv.org/abs/1808.09121) 94 | - Question answering 95 | - MultiRC [Khashabi et al., 2018](https://aclanthology.org/N18-1023) 96 | - ReCoRD [Zhang et al., 2018](https://arxiv.org/abs/1810.12885) 97 | - BoolQ [Clark et al., 2019](https://arxiv.org/abs/1905.10044) 98 | 99 | ## Training Procedure 100 | 101 | In their [abstract](https://jmlr.org/papers/volume21/20-074/20-074.pdf), the model developers write: 102 | 103 | > In this paper, we explore the landscape of transfer learning techniques for NLP by introducing a unified framework that converts every language problem into a text-to-text format. Our systematic study compares pre-training objectives, architectures, unlabeled datasets, transfer approaches, and other factors on dozens of language understanding tasks. 104 | 105 | The framework introduced, the T5 framework, involves a training procedure that brings together the approaches studied in the paper. See the [research paper](https://jmlr.org/papers/volume21/20-074/20-074.pdf) for further details. 106 | 107 | # Evaluation 108 | 109 | ## Testing Data, Factors & Metrics 110 | 111 | The developers evaluated the model on 24 tasks, see the [research paper](https://jmlr.org/papers/volume21/20-074/20-074.pdf) for full details. 112 | 113 | ## Results 114 | 115 | For full results for T5-Base, see the [research paper](https://jmlr.org/papers/volume21/20-074/20-074.pdf), Table 14. 116 | 117 | # Environmental Impact 118 | 119 | Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 120 | 121 | - **Hardware Type:** Google Cloud TPU Pods 122 | - **Hours used:** More information needed 123 | - **Cloud Provider:** GCP 124 | - **Compute Region:** More information needed 125 | - **Carbon Emitted:** More information needed 126 | 127 | # Citation 128 | 129 | **BibTeX:** 130 | 131 | ```bibtex 132 | @article{2020t5, 133 | author = {Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu}, 134 | title = {Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer}, 135 | journal = {Journal of Machine Learning Research}, 136 | year = {2020}, 137 | volume = {21}, 138 | number = {140}, 139 | pages = {1-67}, 140 | url = {http://jmlr.org/papers/v21/20-074.html} 141 | } 142 | ``` 143 | 144 | **APA:** 145 | - Raffel, C., Shazeer, N., Roberts, A., Lee, K., Narang, S., Matena, M., ... & Liu, P. J. (2020). Exploring the limits of transfer learning with a unified text-to-text transformer. J. Mach. Learn. Res., 21(140), 1-67. 146 | 147 | # Model Card Authors 148 | 149 | This model card was written by the team at Hugging Face. 150 | 151 | # How to Get Started with the Model 152 | 153 | Use the code below to get started with the model. 154 | 155 |
156 | Click to expand 157 | 158 | ```python 159 | from transformers import T5Tokenizer, T5Model 160 | 161 | tokenizer = T5Tokenizer.from_pretrained("t5-base") 162 | model = T5Model.from_pretrained("t5-base") 163 | 164 | input_ids = tokenizer( 165 | "Studies have been shown that owning a dog is good for you", return_tensors="pt" 166 | ).input_ids # Batch size 1 167 | decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids # Batch size 1 168 | 169 | # forward pass 170 | outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids) 171 | last_hidden_states = outputs.last_hidden_state 172 | ``` 173 | 174 | See the [Hugging Face T5](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Model) docs and a [Colab Notebook](https://colab.research.google.com/github/google-research/text-to-text-transfer-transformer/blob/main/notebooks/t5-trivia.ipynb) created by the model developers for more examples. 175 |
-------------------------------------------------------------------------------- /model_scraping/cards/t5-small.md: -------------------------------------------------------------------------------- 1 | # Model Card for T5 Small 2 | 3 | ![model image](https://camo.githubusercontent.com/623b4dea0b653f2ad3f36c71ebfe749a677ac0a1/68747470733a2f2f6d69726f2e6d656469756d2e636f6d2f6d61782f343030362f312a44304a31674e51663876727255704b657944387750412e706e67) 4 | 5 | # Table of Contents 6 | 7 | 1. [Model Details](#model-details) 8 | 2. [Uses](#uses) 9 | 3. [Bias, Risks, and Limitations](#bias-risks-and-limitations) 10 | 4. [Training Details](#training-details) 11 | 5. [Evaluation](#evaluation) 12 | 6. [Environmental Impact](#environmental-impact) 13 | 7. [Citation](#citation) 14 | 8. [Model Card Authors](#model-card-authors) 15 | 9. [How To Get Started With the Model](#how-to-get-started-with-the-model) 16 | 17 | # Model Details 18 | 19 | ## Model Description 20 | 21 | The developers of the Text-To-Text Transfer Transformer (T5) [write](https://ai.googleblog.com/2020/02/exploring-transfer-learning-with-t5.html): 22 | 23 | > With T5, we propose reframing all NLP tasks into a unified text-to-text-format where the input and output are always text strings, in contrast to BERT-style models that can only output either a class label or a span of the input. Our text-to-text framework allows us to use the same model, loss function, and hyperparameters on any NLP task. 24 | 25 | T5-Small is the checkpoint with 60 million parameters. 26 | 27 | - **Developed by:** Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, Peter J. Liu. See [associated paper](https://jmlr.org/papers/volume21/20-074/20-074.pdf) and [GitHub repo](https://github.com/google-research/text-to-text-transfer-transformer#released-model-checkpoints) 28 | - **Model type:** Language model 29 | - **Language(s) (NLP):** English, French, Romanian, German 30 | - **License:** Apache 2.0 31 | - **Related Models:** [All T5 Checkpoints](https://huggingface.co/models?search=t5) 32 | - **Resources for more information:** 33 | - [Research paper](https://jmlr.org/papers/volume21/20-074/20-074.pdf) 34 | - [Google's T5 Blog Post](https://ai.googleblog.com/2020/02/exploring-transfer-learning-with-t5.html) 35 | - [GitHub Repo](https://github.com/google-research/text-to-text-transfer-transformer) 36 | - [Hugging Face T5 Docs](https://huggingface.co/docs/transformers/model_doc/t5) 37 | 38 | # Uses 39 | 40 | ## Direct Use and Downstream Use 41 | 42 | The developers write in a [blog post](https://ai.googleblog.com/2020/02/exploring-transfer-learning-with-t5.html) that the model: 43 | 44 | > Our text-to-text framework allows us to use the same model, loss function, and hyperparameters on any NLP task, including machine translation, document summarization, question answering, and classification tasks (e.g., sentiment analysis). We can even apply T5 to regression tasks by training it to predict the string representation of a number instead of the number itself. 45 | 46 | See the [blog post](https://ai.googleblog.com/2020/02/exploring-transfer-learning-with-t5.html) and [research paper](https://jmlr.org/papers/volume21/20-074/20-074.pdf) for further details. 47 | 48 | ## Out-of-Scope Use 49 | 50 | More information needed. 51 | 52 | # Bias, Risks, and Limitations 53 | 54 | More information needed. 55 | 56 | ## Recommendations 57 | 58 | More information needed. 59 | 60 | # Training Details 61 | 62 | ## Training Data 63 | 64 | The model is pre-trained on the [Colossal Clean Crawled Corpus (C4)](https://www.tensorflow.org/datasets/catalog/c4), which was developed and released in the context of the same [research paper](https://jmlr.org/papers/volume21/20-074/20-074.pdf) as T5. 65 | 66 | The model was pre-trained on a on a **multi-task mixture of unsupervised (1.) and supervised tasks (2.)**. 67 | Thereby, the following datasets were being used for (1.) and (2.): 68 | 69 | 1. **Datasets used for Unsupervised denoising objective**: 70 | 71 | - [C4](https://huggingface.co/datasets/c4) 72 | - [Wiki-DPR](https://huggingface.co/datasets/wiki_dpr) 73 | 74 | 75 | 2. **Datasets used for Supervised text-to-text language modeling objective** 76 | 77 | - Sentence acceptability judgment 78 | - CoLA [Warstadt et al., 2018](https://arxiv.org/abs/1805.12471) 79 | - Sentiment analysis 80 | - SST-2 [Socher et al., 2013](https://nlp.stanford.edu/~socherr/EMNLP2013_RNTN.pdf) 81 | - Paraphrasing/sentence similarity 82 | - MRPC [Dolan and Brockett, 2005](https://aclanthology.org/I05-5002) 83 | - STS-B [Ceret al., 2017](https://arxiv.org/abs/1708.00055) 84 | - QQP [Iyer et al., 2017](https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs) 85 | - Natural language inference 86 | - MNLI [Williams et al., 2017](https://arxiv.org/abs/1704.05426) 87 | - QNLI [Rajpurkar et al.,2016](https://arxiv.org/abs/1606.05250) 88 | - RTE [Dagan et al., 2005](https://link.springer.com/chapter/10.1007/11736790_9) 89 | - CB [De Marneff et al., 2019](https://semanticsarchive.net/Archive/Tg3ZGI2M/Marneffe.pdf) 90 | - Sentence completion 91 | - COPA [Roemmele et al., 2011](https://www.researchgate.net/publication/221251392_Choice_of_Plausible_Alternatives_An_Evaluation_of_Commonsense_Causal_Reasoning) 92 | - Word sense disambiguation 93 | - WIC [Pilehvar and Camacho-Collados, 2018](https://arxiv.org/abs/1808.09121) 94 | - Question answering 95 | - MultiRC [Khashabi et al., 2018](https://aclanthology.org/N18-1023) 96 | - ReCoRD [Zhang et al., 2018](https://arxiv.org/abs/1810.12885) 97 | - BoolQ [Clark et al., 2019](https://arxiv.org/abs/1905.10044) 98 | 99 | ## Training Procedure 100 | 101 | In their [abstract](https://jmlr.org/papers/volume21/20-074/20-074.pdf), the model developers write: 102 | 103 | > In this paper, we explore the landscape of transfer learning techniques for NLP by introducing a unified framework that converts every language problem into a text-to-text format. Our systematic study compares pre-training objectives, architectures, unlabeled datasets, transfer approaches, and other factors on dozens of language understanding tasks. 104 | 105 | The framework introduced, the T5 framework, involves a training procedure that brings together the approaches studied in the paper. See the [research paper](https://jmlr.org/papers/volume21/20-074/20-074.pdf) for further details. 106 | 107 | # Evaluation 108 | 109 | ## Testing Data, Factors & Metrics 110 | 111 | The developers evaluated the model on 24 tasks, see the [research paper](https://jmlr.org/papers/volume21/20-074/20-074.pdf) for full details. 112 | 113 | ## Results 114 | 115 | For full results for T5-small, see the [research paper](https://jmlr.org/papers/volume21/20-074/20-074.pdf), Table 14. 116 | 117 | # Environmental Impact 118 | 119 | Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 120 | 121 | - **Hardware Type:** Google Cloud TPU Pods 122 | - **Hours used:** More information needed 123 | - **Cloud Provider:** GCP 124 | - **Compute Region:** More information needed 125 | - **Carbon Emitted:** More information needed 126 | 127 | # Citation 128 | 129 | **BibTeX:** 130 | 131 | ```bibtex 132 | @article{2020t5, 133 | author = {Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu}, 134 | title = {Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer}, 135 | journal = {Journal of Machine Learning Research}, 136 | year = {2020}, 137 | volume = {21}, 138 | number = {140}, 139 | pages = {1-67}, 140 | url = {http://jmlr.org/papers/v21/20-074.html} 141 | } 142 | ``` 143 | 144 | **APA:** 145 | - Raffel, C., Shazeer, N., Roberts, A., Lee, K., Narang, S., Matena, M., ... & Liu, P. J. (2020). Exploring the limits of transfer learning with a unified text-to-text transformer. J. Mach. Learn. Res., 21(140), 1-67. 146 | 147 | # Model Card Authors 148 | 149 | This model card was written by the team at Hugging Face. 150 | 151 | # How to Get Started with the Model 152 | 153 | Use the code below to get started with the model. 154 | 155 |
156 | Click to expand 157 | 158 | ```python 159 | from transformers import T5Tokenizer, T5Model 160 | 161 | tokenizer = T5Tokenizer.from_pretrained("t5-small") 162 | model = T5Model.from_pretrained("t5-small") 163 | 164 | input_ids = tokenizer( 165 | "Studies have been shown that owning a dog is good for you", return_tensors="pt" 166 | ).input_ids # Batch size 1 167 | decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids # Batch size 1 168 | 169 | # forward pass 170 | outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids) 171 | last_hidden_states = outputs.last_hidden_state 172 | ``` 173 | 174 | See the [Hugging Face T5](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Model) docs and a [Colab Notebook](https://colab.research.google.com/github/google-research/text-to-text-transfer-transformer/blob/main/notebooks/t5-trivia.ipynb) created by the model developers for more examples. 175 |
-------------------------------------------------------------------------------- /model_scraping/cards/xlm-roberta-base.md: -------------------------------------------------------------------------------- 1 | # XLM-RoBERTa (base-sized model) 2 | 3 | XLM-RoBERTa model pre-trained on 2.5TB of filtered CommonCrawl data containing 100 languages. It was introduced in the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Conneau et al. and first released in [this repository](https://github.com/pytorch/fairseq/tree/master/examples/xlmr). 4 | 5 | Disclaimer: The team releasing XLM-RoBERTa did not write a model card for this model so this model card has been written by the Hugging Face team. 6 | 7 | ## Model description 8 | 9 | XLM-RoBERTa is a multilingual version of RoBERTa. It is pre-trained on 2.5TB of filtered CommonCrawl data containing 100 languages. 10 | 11 | RoBERTa is a transformers model pretrained on a large corpus in a self-supervised fashion. This means it was pretrained on the raw texts only, with no humans labelling them in any way (which is why it can use lots of publicly available data) with an automatic process to generate inputs and labels from those texts. 12 | 13 | More precisely, it was pretrained with the Masked language modeling (MLM) objective. Taking a sentence, the model randomly masks 15% of the words in the input then run the entire masked sentence through the model and has to predict the masked words. This is different from traditional recurrent neural networks (RNNs) that usually see the words one after the other, or from autoregressive models like GPT which internally mask the future tokens. It allows the model to learn a bidirectional representation of the sentence. 14 | 15 | This way, the model learns an inner representation of 100 languages that can then be used to extract features useful for downstream tasks: if you have a dataset of labeled sentences for instance, you can train a standard classifier using the features produced by the XLM-RoBERTa model as inputs. 16 | 17 | ## Intended uses & limitations 18 | 19 | You can use the raw model for masked language modeling, but it's mostly intended to be fine-tuned on a downstream task. See the [model hub](https://huggingface.co/models?search=xlm-roberta) to look for fine-tuned versions on a task that interests you. 20 | 21 | Note that this model is primarily aimed at being fine-tuned on tasks that use the whole sentence (potentially masked) to make decisions, such as sequence classification, token classification or question answering. For tasks such as text generation, you should look at models like GPT2. 22 | 23 | ## Usage 24 | 25 | You can use this model directly with a pipeline for masked language modeling: 26 | 27 | ```python 28 | >>> from transformers import pipeline 29 | >>> unmasker = pipeline('fill-mask', model='xlm-roberta-base') 30 | >>> unmasker("Hello I'm a model.") 31 | 32 | [{'score': 0.10563907772302628, 33 | 'sequence': "Hello I'm a fashion model.", 34 | 'token': 54543, 35 | 'token_str': 'fashion'}, 36 | {'score': 0.08015287667512894, 37 | 'sequence': "Hello I'm a new model.", 38 | 'token': 3525, 39 | 'token_str': 'new'}, 40 | {'score': 0.033413201570510864, 41 | 'sequence': "Hello I'm a model model.", 42 | 'token': 3299, 43 | 'token_str': 'model'}, 44 | {'score': 0.030217764899134636, 45 | 'sequence': "Hello I'm a French model.", 46 | 'token': 92265, 47 | 'token_str': 'French'}, 48 | {'score': 0.026436051353812218, 49 | 'sequence': "Hello I'm a sexy model.", 50 | 'token': 17473, 51 | 'token_str': 'sexy'}] 52 | ``` 53 | 54 | Here is how to use this model to get the features of a given text in PyTorch: 55 | 56 | ```python 57 | from transformers import AutoTokenizer, AutoModelForMaskedLM 58 | 59 | tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base') 60 | model = AutoModelForMaskedLM.from_pretrained("xlm-roberta-base") 61 | 62 | # prepare input 63 | text = "Replace me by any text you'd like." 64 | encoded_input = tokenizer(text, return_tensors='pt') 65 | 66 | # forward pass 67 | output = model(**encoded_input) 68 | ``` 69 | 70 | ### BibTeX entry and citation info 71 | 72 | ```bibtex 73 | @article{DBLP:journals/corr/abs-1911-02116, 74 | author = {Alexis Conneau and 75 | Kartikay Khandelwal and 76 | Naman Goyal and 77 | Vishrav Chaudhary and 78 | Guillaume Wenzek and 79 | Francisco Guzm{\'{a}}n and 80 | Edouard Grave and 81 | Myle Ott and 82 | Luke Zettlemoyer and 83 | Veselin Stoyanov}, 84 | title = {Unsupervised Cross-lingual Representation Learning at Scale}, 85 | journal = {CoRR}, 86 | volume = {abs/1911.02116}, 87 | year = {2019}, 88 | url = {http://arxiv.org/abs/1911.02116}, 89 | eprinttype = {arXiv}, 90 | eprint = {1911.02116}, 91 | timestamp = {Mon, 11 Nov 2019 18:38:09 +0100}, 92 | biburl = {https://dblp.org/rec/journals/corr/abs-1911-02116.bib}, 93 | bibsource = {dblp computer science bibliography, https://dblp.org} 94 | } 95 | ``` 96 | 97 | 98 | 99 | -------------------------------------------------------------------------------- /model_scraping/cards/xlm-roberta-large.md: -------------------------------------------------------------------------------- 1 | # XLM-RoBERTa (large-sized model) 2 | 3 | XLM-RoBERTa model pre-trained on 2.5TB of filtered CommonCrawl data containing 100 languages. It was introduced in the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Conneau et al. and first released in [this repository](https://github.com/pytorch/fairseq/tree/master/examples/xlmr). 4 | 5 | Disclaimer: The team releasing XLM-RoBERTa did not write a model card for this model so this model card has been written by the Hugging Face team. 6 | 7 | ## Model description 8 | 9 | XLM-RoBERTa is a multilingual version of RoBERTa. It is pre-trained on 2.5TB of filtered CommonCrawl data containing 100 languages. 10 | 11 | RoBERTa is a transformers model pretrained on a large corpus in a self-supervised fashion. This means it was pretrained on the raw texts only, with no humans labelling them in any way (which is why it can use lots of publicly available data) with an automatic process to generate inputs and labels from those texts. 12 | 13 | More precisely, it was pretrained with the Masked language modeling (MLM) objective. Taking a sentence, the model randomly masks 15% of the words in the input then run the entire masked sentence through the model and has to predict the masked words. This is different from traditional recurrent neural networks (RNNs) that usually see the words one after the other, or from autoregressive models like GPT which internally mask the future tokens. It allows the model to learn a bidirectional representation of the sentence. 14 | 15 | This way, the model learns an inner representation of 100 languages that can then be used to extract features useful for downstream tasks: if you have a dataset of labeled sentences for instance, you can train a standard classifier using the features produced by the XLM-RoBERTa model as inputs. 16 | 17 | ## Intended uses & limitations 18 | 19 | You can use the raw model for masked language modeling, but it's mostly intended to be fine-tuned on a downstream task. See the [model hub](https://huggingface.co/models?search=xlm-roberta) to look for fine-tuned versions on a task that interests you. 20 | 21 | Note that this model is primarily aimed at being fine-tuned on tasks that use the whole sentence (potentially masked) to make decisions, such as sequence classification, token classification or question answering. For tasks such as text generation, you should look at models like GPT2. 22 | 23 | ## Usage 24 | 25 | You can use this model directly with a pipeline for masked language modeling: 26 | 27 | ```python 28 | >>> from transformers import pipeline 29 | >>> unmasker = pipeline('fill-mask', model='xlm-roberta-large') 30 | >>> unmasker("Hello I'm a model.") 31 | 32 | [{'score': 0.10563907772302628, 33 | 'sequence': "Hello I'm a fashion model.", 34 | 'token': 54543, 35 | 'token_str': 'fashion'}, 36 | {'score': 0.08015287667512894, 37 | 'sequence': "Hello I'm a new model.", 38 | 'token': 3525, 39 | 'token_str': 'new'}, 40 | {'score': 0.033413201570510864, 41 | 'sequence': "Hello I'm a model model.", 42 | 'token': 3299, 43 | 'token_str': 'model'}, 44 | {'score': 0.030217764899134636, 45 | 'sequence': "Hello I'm a French model.", 46 | 'token': 92265, 47 | 'token_str': 'French'}, 48 | {'score': 0.026436051353812218, 49 | 'sequence': "Hello I'm a sexy model.", 50 | 'token': 17473, 51 | 'token_str': 'sexy'}] 52 | ``` 53 | 54 | Here is how to use this model to get the features of a given text in PyTorch: 55 | 56 | ```python 57 | from transformers import AutoTokenizer, AutoModelForMaskedLM 58 | 59 | tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-large') 60 | model = AutoModelForMaskedLM.from_pretrained("xlm-roberta-large") 61 | 62 | # prepare input 63 | text = "Replace me by any text you'd like." 64 | encoded_input = tokenizer(text, return_tensors='pt') 65 | 66 | # forward pass 67 | output = model(**encoded_input) 68 | ``` 69 | 70 | ### BibTeX entry and citation info 71 | 72 | ```bibtex 73 | @article{DBLP:journals/corr/abs-1911-02116, 74 | author = {Alexis Conneau and 75 | Kartikay Khandelwal and 76 | Naman Goyal and 77 | Vishrav Chaudhary and 78 | Guillaume Wenzek and 79 | Francisco Guzm{\'{a}}n and 80 | Edouard Grave and 81 | Myle Ott and 82 | Luke Zettlemoyer and 83 | Veselin Stoyanov}, 84 | title = {Unsupervised Cross-lingual Representation Learning at Scale}, 85 | journal = {CoRR}, 86 | volume = {abs/1911.02116}, 87 | year = {2019}, 88 | url = {http://arxiv.org/abs/1911.02116}, 89 | eprinttype = {arXiv}, 90 | eprint = {1911.02116}, 91 | timestamp = {Mon, 11 Nov 2019 18:38:09 +0100}, 92 | biburl = {https://dblp.org/rec/journals/corr/abs-1911-02116.bib}, 93 | bibsource = {dblp computer science bibliography, https://dblp.org} 94 | } 95 | ``` 96 | 97 | 98 | 99 | -------------------------------------------------------------------------------- /model_scraping/cards/yiyanghkust___finbert-tone.md: -------------------------------------------------------------------------------- 1 | `FinBERT` is a BERT model pre-trained on financial communication text. The purpose is to enhance financial NLP research and practice. It is trained on the following three financial communication corpus. The total corpora size is 4.9B tokens. 2 | - Corporate Reports 10-K & 10-Q: 2.5B tokens 3 | - Earnings Call Transcripts: 1.3B tokens 4 | - Analyst Reports: 1.1B tokens 5 | 6 | More technical details on `FinBERT`: [Click Link](https://github.com/yya518/FinBERT) 7 | 8 | This released `finbert-tone` model is the `FinBERT` model fine-tuned on 10,000 manually annotated (positive, negative, neutral) sentences from analyst reports. This model achieves superior performance on financial tone analysis task. If you are simply interested in using `FinBERT` for financial tone analysis, give it a try. 9 | 10 | If you use the model in your academic work, please cite the following paper: 11 | 12 | Huang, Allen H., Hui Wang, and Yi Yang. "FinBERT: A Large Language Model for Extracting Information from Financial Text." *Contemporary Accounting Research* (2022). 13 | 14 | 15 | # How to use 16 | You can use this model with Transformers pipeline for sentiment analysis. 17 | ```python 18 | from transformers import BertTokenizer, BertForSequenceClassification 19 | from transformers import pipeline 20 | 21 | finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone',num_labels=3) 22 | tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone') 23 | 24 | nlp = pipeline("sentiment-analysis", model=finbert, tokenizer=tokenizer) 25 | 26 | sentences = ["there is a shortage of capital, and we need extra financing", 27 | "growth is strong and we have plenty of liquidity", 28 | "there are doubts about our finances", 29 | "profits are flat"] 30 | results = nlp(sentences) 31 | print(results) #LABEL_0: neutral; LABEL_1: positive; LABEL_2: negative 32 | 33 | ``` -------------------------------------------------------------------------------- /model_scraping/run.py: -------------------------------------------------------------------------------- 1 | from huggingface_hub import HfApi, ModelFilter, ModelCard 2 | from huggingface_hub.hf_api import ModelInfo 3 | import pandas as pd 4 | 5 | api = HfApi() 6 | 7 | 8 | def fetch_models(): 9 | return list( 10 | iter( 11 | api.list_models( 12 | # filter=ModelFilter(library="transformers"), 13 | sort="downloads", 14 | direction=-1, 15 | limit=30, 16 | fetch_config=True, 17 | cardData=True, 18 | ) 19 | ) 20 | ) 21 | 22 | 23 | def model_to_dict(model: ModelInfo): 24 | return { 25 | "modelId": model.modelId, 26 | "sha": model.sha, 27 | "lastModified": model.lastModified, 28 | "tags": model.tags, 29 | "pipeline_tag": model.pipeline_tag, 30 | "siblings": model.siblings, 31 | "private": model.private, 32 | "author": model.author, 33 | "likes": model.likes, 34 | "downloads": model.downloads, 35 | "config": model.config, 36 | } 37 | 38 | 39 | def write_model_card(model_id: str, card_text): 40 | text_file = open(f"cards/{model_id.replace('/', '___')}.md", "w") 41 | n = text_file.write(card_text.strip()) 42 | text_file.close() 43 | 44 | 45 | models = fetch_models() 46 | model_cards = [ModelCard.load(m.modelId) for m in models] 47 | [write_model_card(m.modelId, c.text) for m, c in zip(models, model_cards)] 48 | 49 | # model_list = [model_to_dict(m) for m in models] 50 | # df = pd.DataFrame(model_list) 51 | # 52 | # df.to_json("transformers_dump.jsonl", orient="records", lines=True) 53 | # 54 | # df["model_type"] = df.config.apply(lambda x: x and x.get("model_type", None)) 55 | # df.model_type = df.model_type.apply(lambda x: None if x == {} else x) 56 | # df = df.dropna(subset=["model_type"]) 57 | # 58 | # df.model_type.value_counts() 59 | # 60 | # df.groupby(["model_type"]).downloads.sum().sort_values(ascending=False) 61 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | aiolimiter 2 | aiohttp 3 | datasets 4 | --------------------------------------------------------------------------------