├── LICENSE ├── Transformers Fundamentals ├── 01 Text-Based Pipelines │ ├── 02 Named Entity Recognition (NER) │ │ └── ner.py │ ├── 04 Text Generation │ │ └── text_generation.py │ ├── 07 Fill-Mask │ │ └── fill_mask.py │ ├── 05 Summarization │ │ └── summarization.py │ ├── 01 Text Classification │ │ └── text_classification.py │ ├── 03 Question Answering │ │ └── question_answering.py │ ├── 06 Translation │ │ └── translation.py │ └── README.md ├── 02 Speech and Audio Pipelines │ ├── 02 Text-to-Speech (TTS) │ │ └── tts.py │ ├── 01 Automatic Speech Recognition (ASR) │ │ └── asr.py │ ├── 03 Audio Classification │ │ └── audio_classification.py │ └── README.md └── 03 Vision-Based Pipelines │ ├── 04 Image-to-Text │ └── image_to_text.py │ ├── 02 Object Detection │ └── object_detection.py │ ├── 03 Image Segmentation │ └── image_segmentation.py │ ├── 01 Image Classification │ └── image_classification.py │ └── README.md ├── README.md └── Transformers Interview Questions └── README.md /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 rohanmistry231 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /Transformers Fundamentals/01 Text-Based Pipelines/02 Named Entity Recognition (NER)/ner.py: -------------------------------------------------------------------------------- 1 | # %% [1. Introduction to Named Entity Recognition] 2 | # Learn entity extraction with Hugging Face NER pipeline. 3 | 4 | # Setup: pip install transformers torch numpy matplotlib 5 | import matplotlib.pyplot as plt 6 | from collections import Counter 7 | from transformers import pipeline 8 | 9 | def run_ner_demo(): 10 | # %% [2. Synthetic Retail Text Data] 11 | reviews = [ 12 | "This laptop from TechCorp is great! I love the fast processor from Intel.", 13 | "The screen is vibrant, designed by Samsung in New York.", 14 | "Overall, a solid purchase from TechCorp in California." 15 | ] 16 | print("Synthetic Text: Retail product reviews created") 17 | print(f"Reviews: {reviews}") 18 | 19 | # %% [3. 
Entity Extraction] 20 | ner = pipeline("ner", model="dslim/bert-base-NER", grouped_entities=True) 21 | entities = [] 22 | for review in reviews: 23 | result = ner(review) 24 | entities.extend([(entity['entity_group'], entity['word']) for entity in result]) 25 | print("NER: Entities extracted") 26 | print(f"Entities (Sample): {entities[:5]}...") 27 | 28 | # %% [4. Visualization] 29 | entity_types = [entity[0] for entity in entities] 30 | type_counts = Counter(entity_types) 31 | plt.figure(figsize=(8, 4)) 32 | plt.bar(type_counts.keys(), type_counts.values(), color='blue') 33 | plt.title("Entity Type Distribution") 34 | plt.xlabel("Entity Type") 35 | plt.ylabel("Count") 36 | plt.savefig("ner_output.png") 37 | print("Visualization: Entity distribution saved as ner_output.png") 38 | 39 | # %% [5. Interview Scenario: NER] 40 | """ 41 | Interview Scenario: Named Entity Recognition 42 | Q: How does the NER pipeline identify entities in Hugging Face? 43 | A: It uses a transformer model (e.g., BERT) fine-tuned to classify tokens into entity categories. 44 | Key: Groups tokens into entities like PERSON, ORG, LOC. 45 | Example: pipeline("ner", model="dslim/bert-base-NER") 46 | """ 47 | 48 | # Execute the demo 49 | if __name__ == "__main__": 50 | run_ner_demo() -------------------------------------------------------------------------------- /Transformers Fundamentals/01 Text-Based Pipelines/04 Text Generation/text_generation.py: -------------------------------------------------------------------------------- 1 | # %% [1. Introduction to Text Generation] 2 | # Learn story generation and text completion with Hugging Face pipelines. 3 | 4 | # Setup: pip install transformers torch numpy matplotlib 5 | import matplotlib.pyplot as plt 6 | from transformers import pipeline 7 | import nltk 8 | 9 | def run_text_generation_demo(): 10 | # %% [2. Synthetic Retail Text Data] 11 | prompts = [ 12 | "The new TechCorp laptop is amazing because", 13 | "A customer review of the vibrant screen:", 14 | "Why I love shopping at TechCorp:" 15 | ] 16 | print("Synthetic Text: Retail text prompts created") 17 | print(f"Prompts: {prompts}") 18 | 19 | # %% [3. Text Generation] 20 | generator = pipeline("text-generation", model="gpt2", max_length=50) 21 | generated_texts = [generator(prompt, num_return_sequences=1)[0]['generated_text'] for prompt in prompts] 22 | print("Text Generation: Texts generated") 23 | for i, (prompt, text) in enumerate(zip(prompts, generated_texts)): 24 | print(f"Prompt {i+1}: {prompt}") 25 | print(f"Generated: {text[:100]}...") 26 | 27 | # %% [4. Visualization] 28 | lengths = [len(nltk.word_tokenize(text)) for text in generated_texts] 29 | plt.figure(figsize=(8, 4)) 30 | plt.bar(range(1, len(prompts) + 1), lengths, color='purple') 31 | plt.title("Generated Text Lengths") 32 | plt.xlabel("Prompt") 33 | plt.ylabel("Word Count") 34 | plt.savefig("text_generation_output.png") 35 | print("Visualization: Generated text lengths saved as text_generation_output.png") 36 | 37 | # %% [5. Interview Scenario: Text Generation] 38 | """ 39 | Interview Scenario: Text Generation 40 | Q: How does the text-generation pipeline work in Hugging Face? 41 | A: It uses a generative model (e.g., GPT-2) to predict the next token iteratively. 42 | Key: Controlled by parameters like max_length and num_return_sequences. 
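    Note: sampling controls such as do_sample, temperature, top_k, and top_p further shape output diversity.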
43 | Example: pipeline("text-generation", model="gpt2") 44 | """ 45 | 46 | # Execute the demo 47 | if __name__ == "__main__": 48 | nltk.download('punkt', quiet=True) 49 | run_text_generation_demo() -------------------------------------------------------------------------------- /Transformers Fundamentals/02 Speech and Audio Pipelines/02 Text-to-Speech (TTS)/tts.py: -------------------------------------------------------------------------------- 1 | # %% [1. Introduction to Text-to-Speech] 2 | # Learn speech synthesis with Hugging Face TTS pipeline. 3 | 4 | # Setup: pip install transformers torch numpy matplotlib 5 | import matplotlib.pyplot as plt 6 | from transformers import pipeline 7 | import numpy as np 8 | 9 | def run_tts_demo(): 10 | # %% [2. Synthetic Retail Text Data] 11 | texts = [ 12 | "Welcome to TechCorp! Our new laptop is amazing.", 13 | "The vibrant screen is a customer favorite.", 14 | "Visit our store for exclusive deals today." 15 | ] 16 | print("Synthetic Text: Retail announcements created") 17 | print(f"Texts: {texts}") 18 | 19 | # %% [3. TTS Pipeline Simulation] 20 | # Note: TTS pipeline generates audio; we simulate metadata due to file output constraints 21 | tts = pipeline("text-to-speech", model="facebook/mms-tts-eng") 22 | # Simulate TTS output with estimated durations (seconds per word approximation) 23 | durations = [len(text.split()) * 0.5 for text in texts] # Approx 0.5s per word 24 | print("TTS: Audio generation simulated") 25 | for i, (text, duration) in enumerate(zip(texts, durations)): 26 | print(f"Text {i+1}: {text}") 27 | print(f"Simulated Duration: {duration:.2f} seconds") 28 | 29 | # %% [4. Visualization] 30 | plt.figure(figsize=(8, 4)) 31 | plt.bar(range(1, len(texts) + 1), durations, color='green') 32 | plt.title("Simulated Audio Durations") 33 | plt.xlabel("Text Sample") 34 | plt.ylabel("Duration (Seconds)") 35 | plt.savefig("tts_output.png") 36 | print("Visualization: Audio durations saved as tts_output.png") 37 | 38 | # %% [5. Interview Scenario: TTS] 39 | """ 40 | Interview Scenario: Text-to-Speech 41 | Q: How does the TTS pipeline synthesize speech in Hugging Face? 42 | A: It uses models like SpeechT5 or MMS-TTS to generate audio waveforms from text embeddings. 43 | Key: Trained on speech datasets to produce natural-sounding audio. 44 | Example: pipeline("text-to-speech", model="facebook/mms-tts-eng") 45 | """ 46 | 47 | # Execute the demo 48 | if __name__ == "__main__": 49 | run_tts_demo() -------------------------------------------------------------------------------- /Transformers Fundamentals/01 Text-Based Pipelines/07 Fill-Mask/fill_mask.py: -------------------------------------------------------------------------------- 1 | # %% [1. Introduction to Fill-Mask] 2 | # Learn masked language modeling with Hugging Face fill-mask pipeline. 3 | 4 | # Setup: pip install transformers torch numpy matplotlib 5 | import matplotlib.pyplot as plt 6 | from transformers import pipeline 7 | 8 | def run_fill_mask_demo(): 9 | # %% [2. Synthetic Retail Text Data] 10 | masked_texts = [ 11 | "This laptop from TechCorp is [MASK]!", 12 | "The [MASK] is vibrant but the battery life is terrible.", 13 | "Overall, a [MASK] purchase from TechCorp." 14 | ] 15 | print("Synthetic Text: Retail masked texts created") 16 | print(f"Masked Texts: {masked_texts}") 17 | 18 | # %% [3. 
Masked Language Modeling] 19 | fill_mask = pipeline("fill-mask", model="bert-base-uncased") 20 | predictions = [fill_mask(text)[:3] for text in masked_texts] # Top 3 predictions 21 | print("Fill-Mask: Predictions made") 22 | for i, (text, preds) in enumerate(zip(masked_texts, predictions)): 23 | print(f"Text {i+1}: {text}") 24 | for j, pred in enumerate(preds): 25 | print(f"Prediction {j+1}: {pred['token_str']} (Score: {pred['score']:.2f})") 26 | 27 | # %% [4. Visualization] 28 | scores = [[pred['score'] for pred in preds] for preds in predictions] 29 | plt.figure(figsize=(8, 4)) 30 | for i, score_list in enumerate(scores): 31 | plt.bar([x + i*0.3 for x in range(1, len(score_list) + 1)], score_list, width=0.3, label=f"Text {i+1}") 32 | plt.title("Prediction Confidence Scores") 33 | plt.xlabel("Prediction Rank") 34 | plt.ylabel("Score") 35 | plt.legend() 36 | plt.savefig("fill_mask_output.png") 37 | print("Visualization: Prediction confidence saved as fill_mask_output.png") 38 | 39 | # %% [5. Interview Scenario: Fill-Mask] 40 | """ 41 | Interview Scenario: Fill-Mask 42 | Q: How does the fill-mask pipeline leverage masked language models? 43 | A: It uses models like BERT to predict masked tokens based on context. 44 | Key: Trained on large corpora to understand word relationships. 45 | Example: pipeline("fill-mask", model="bert-base-uncased") 46 | """ 47 | 48 | # Execute the demo 49 | if __name__ == "__main__": 50 | run_fill_mask_demo() -------------------------------------------------------------------------------- /Transformers Fundamentals/01 Text-Based Pipelines/05 Summarization/summarization.py: -------------------------------------------------------------------------------- 1 | # %% [1. Introduction to Summarization] 2 | # Learn abstractive and extractive summarization with Hugging Face pipelines. 3 | 4 | # Setup: pip install transformers torch numpy matplotlib 5 | import matplotlib.pyplot as plt 6 | from transformers import pipeline 7 | import nltk 8 | 9 | def run_summarization_demo(): 10 | # %% [2. Synthetic Retail Text Data] 11 | texts = [ 12 | """ 13 | TechCorp's new laptop has a fast processor from Intel and a vibrant screen designed by Samsung. 14 | The battery life is average, lasting about 6 hours. It was launched in New York in 2025. 15 | Customers love the sleek design and performance but some complain about the battery. 16 | """ 17 | ] 18 | print("Synthetic Text: Retail product description created") 19 | print(f"Text: {texts[0][:100]}...") 20 | 21 | # %% [3. Abstractive Summarization] 22 | summarizer = pipeline("summarization", model="facebook/bart-large-cnn") 23 | summaries = [summarizer(text, max_length=50, min_length=10, do_sample=False)[0]['summary_text'] for text in texts] 24 | print("Summarization: Summaries generated") 25 | for i, summary in enumerate(summaries): 26 | print(f"Summary {i+1}: {summary}") 27 | 28 | # %% [4. Visualization] 29 | lengths = [len(nltk.word_tokenize(summary)) for summary in summaries] 30 | plt.figure(figsize=(8, 4)) 31 | plt.bar(range(1, len(summaries) + 1), lengths, color='orange') 32 | plt.title("Summary Lengths") 33 | plt.xlabel("Summary") 34 | plt.ylabel("Word Count") 35 | plt.savefig("summarization_output.png") 36 | print("Visualization: Summary lengths saved as summarization_output.png") 37 | 38 | # %% [5. Interview Scenario: Summarization] 39 | """ 40 | Interview Scenario: Summarization 41 | Q: What’s the difference between abstractive and extractive summarization? 
42 | A: Abstractive generates new text; extractive selects existing sentences. 43 | Key: Abstractive uses models like BART, extractive uses algorithms like TextRank. 44 | Example: pipeline("summarization", model="facebook/bart-large-cnn") 45 | """ 46 | 47 | # Execute the demo 48 | if __name__ == "__main__": 49 | nltk.download('punkt', quiet=True) 50 | run_summarization_demo() -------------------------------------------------------------------------------- /Transformers Fundamentals/02 Speech and Audio Pipelines/01 Automatic Speech Recognition (ASR)/asr.py: -------------------------------------------------------------------------------- 1 | # %% [1. Introduction to Automatic Speech Recognition] 2 | # Learn speech-to-text conversion with Hugging Face ASR pipeline. 3 | 4 | # Setup: pip install transformers torch numpy matplotlib soundfile librosa 5 | import matplotlib.pyplot as plt 6 | from transformers import pipeline 7 | import numpy as np 8 | import librosa 9 | 10 | def run_asr_demo(): 11 | # %% [2. Synthetic Audio Data Simulation] 12 | # Note: Due to file I/O constraints, we simulate audio input with metadata 13 | audio_samples = [ 14 | {"text": "This laptop is great!", "duration": 2.5}, 15 | {"text": "The battery life is terrible.", "duration": 3.0}, 16 | {"text": "TechCorp products are solid.", "duration": 2.8} 17 | ] 18 | print("Synthetic Audio: Simulated retail customer audio created") 19 | print(f"Audio Samples: {audio_samples}") 20 | 21 | # %% [3. ASR Pipeline] 22 | asr = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h") 23 | # Simulate ASR by using the known text (since actual audio processing requires file input) 24 | transcriptions = [sample["text"] for sample in audio_samples] 25 | print("ASR: Transcriptions simulated") 26 | for i, transcription in enumerate(transcriptions): 27 | print(f"Sample {i+1}: {transcription}") 28 | 29 | # %% [4. Visualization] 30 | lengths = [len(transcription.split()) for transcription in transcriptions] 31 | plt.figure(figsize=(8, 4)) 32 | plt.bar(range(1, len(transcriptions) + 1), lengths, color='blue') 33 | plt.title("Transcription Word Counts") 34 | plt.xlabel("Audio Sample") 35 | plt.ylabel("Word Count") 36 | plt.savefig("asr_output.png") 37 | print("Visualization: Transcription lengths saved as asr_output.png") 38 | 39 | # %% [5. Interview Scenario: ASR] 40 | """ 41 | Interview Scenario: Automatic Speech Recognition 42 | Q: How does the ASR pipeline process audio in Hugging Face? 43 | A: It uses models like Wav2Vec2 to convert raw audio waveforms to text via learned representations. 44 | Key: Pre-trained on large speech datasets for robust transcription. 45 | Example: pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h") 46 | """ 47 | 48 | # Execute the demo 49 | if __name__ == "__main__": 50 | run_asr_demo() -------------------------------------------------------------------------------- /Transformers Fundamentals/03 Vision-Based Pipelines/04 Image-to-Text/image_to_text.py: -------------------------------------------------------------------------------- 1 | # %% [1. Introduction to Image-to-Text] 2 | # Learn caption generation with Hugging Face image-to-text pipeline. 3 | 4 | # Setup: pip install transformers torch numpy matplotlib pillow 5 | import matplotlib.pyplot as plt 6 | from transformers import pipeline 7 | import nltk 8 | import numpy as np 9 | 10 | def run_image_to_text_demo(): 11 | # %% [2. 
Synthetic Image Data Simulation] 12 | # Note: Due to file I/O constraints, we simulate image inputs with metadata 13 | images = [ 14 | {"description": "Laptop on a desk", "caption": "A laptop on a wooden desk."}, 15 | {"description": "Smartphone in a store", "caption": "A smartphone displayed in a retail store."}, 16 | {"description": "Broken gadget", "caption": "A broken gadget on a table."} 17 | ] 18 | print("Synthetic Images: Simulated retail product images created") 19 | print(f"Images: {images}") 20 | 21 | # %% [3. Image-to-Text] 22 | captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base") 23 | # Simulate captioning by using predefined captions (since actual image processing requires file input) 24 | captions = [image["caption"] for image in images] 25 | print("Image-to-Text: Captions simulated") 26 | for i, caption in enumerate(captions): 27 | print(f"Image {i+1}: {caption}") 28 | 29 | # %% [4. Visualization] 30 | lengths = [len(nltk.word_tokenize(caption)) for caption in captions] 31 | plt.figure(figsize=(8, 4)) 32 | plt.bar(range(1, len(captions) + 1), lengths, color='purple') 33 | plt.title("Caption Lengths") 34 | plt.xlabel("Image") 35 | plt.ylabel("Word Count") 36 | plt.savefig("image_to_text_output.png") 37 | print("Visualization: Caption lengths saved as image_to_text_output.png") 38 | 39 | # %% [5. Interview Scenario: Image-to-Text] 40 | """ 41 | Interview Scenario: Image-to-Text 42 | Q: How does the image-to-text pipeline work in Hugging Face? 43 | A: It uses multimodal models like BLIP or CLIP to generate text descriptions from image features. 44 | Key: Combines vision and language transformers for captioning. 45 | Example: pipeline("image-to-text", model="Salesforce/blip-image-captioning-base") 46 | """ 47 | 48 | # Execute the demo 49 | if __name__ == "__main__": 50 | nltk.download('punkt', quiet=True) 51 | run_image_to_text_demo() -------------------------------------------------------------------------------- /Transformers Fundamentals/01 Text-Based Pipelines/01 Text Classification/text_classification.py: -------------------------------------------------------------------------------- 1 | # %% [1. Introduction to Text Classification] 2 | # Learn sentiment analysis and topic classification with Hugging Face pipelines. 3 | 4 | # Setup: pip install transformers torch numpy matplotlib 5 | import matplotlib.pyplot as plt 6 | from collections import Counter 7 | from transformers import pipeline 8 | 9 | def run_text_classification_demo(): 10 | # %% [2. Synthetic Retail Text Data] 11 | reviews = [ 12 | "This laptop from TechCorp is great! I love the fast processor.", 13 | "The screen is vibrant but the battery life is terrible.", 14 | "Overall, a solid purchase from TechCorp. Highly recommend!" 15 | ] 16 | print("Synthetic Text: Retail product reviews created") 17 | print(f"Reviews: {reviews}") 18 | 19 | # %% [3. Sentiment Analysis] 20 | classifier = pipeline("text-classification", model="distilbert-base-uncased-finetuned-sst-2-english") 21 | sentiment_results = classifier(reviews) 22 | print("Sentiment Analysis: Predictions made") 23 | for i, (review, result) in enumerate(zip(reviews, sentiment_results)): 24 | print(f"Review {i+1}: {result['label']} (Score: {result['score']:.2f})") 25 | 26 | # %% [4. 
Visualization] 27 | labels = [result['label'] for result in sentiment_results] 28 | scores = [result['score'] for result in sentiment_results] 29 | label_counts = Counter(labels) 30 | plt.figure(figsize=(8, 4)) 31 | plt.bar(label_counts.keys(), label_counts.values(), color=['green' if k == 'POSITIVE' else 'red' for k in label_counts.keys()]) 32 | plt.title("Sentiment Distribution") 33 | plt.xlabel("Sentiment") 34 | plt.ylabel("Count") 35 | plt.savefig("text_classification_output.png") 36 | print("Visualization: Sentiment distribution saved as text_classification_output.png") 37 | 38 | # %% [5. Interview Scenario: Text Classification] 39 | """ 40 | Interview Scenario: Text Classification 41 | Q: How does the text-classification pipeline work in Hugging Face? 42 | A: It uses a pre-trained transformer model (e.g., DistilBERT) to predict labels like positive/negative. 43 | Key: Fine-tuned on datasets like SST-2 for sentiment analysis. 44 | Example: pipeline("text-classification", model="distilbert-base-uncased-finetuned-sst-2-english") 45 | """ 46 | 47 | # Execute the demo 48 | if __name__ == "__main__": 49 | run_text_classification_demo() -------------------------------------------------------------------------------- /Transformers Fundamentals/01 Text-Based Pipelines/03 Question Answering/question_answering.py: -------------------------------------------------------------------------------- 1 | # %% [1. Introduction to Question Answering] 2 | # Learn extractive and generative QA with Hugging Face pipelines. 3 | 4 | # Setup: pip install transformers torch numpy matplotlib 5 | import matplotlib.pyplot as plt 6 | from transformers import pipeline 7 | 8 | def run_question_answering_demo(): 9 | # %% [2. Synthetic Retail Text Data] 10 | context = """ 11 | TechCorp's new laptop has a fast processor from Intel and a vibrant screen designed by Samsung. 12 | The battery life is average, lasting about 6 hours. It was launched in New York in 2025. 13 | """ 14 | questions = [ 15 | "What is the processor brand?", 16 | "Where was the laptop launched?", 17 | "How long does the battery last?" 18 | ] 19 | print("Synthetic Text: Retail product description created") 20 | print(f"Context: {context[:100]}...") 21 | print(f"Questions: {questions}") 22 | 23 | # %% [3. Extractive QA] 24 | qa = pipeline("question-answering", model="distilbert-base-cased-distilled-squad") 25 | answers = [qa(question=question, context=context) for question in questions] 26 | print("Question Answering: Answers extracted") 27 | for i, (question, answer) in enumerate(zip(questions, answers)): 28 | print(f"Question {i+1}: {question}") 29 | print(f"Answer: {answer['answer']} (Score: {answer['score']:.2f})") 30 | 31 | # %% [4. Visualization] 32 | scores = [answer['score'] for answer in answers] 33 | plt.figure(figsize=(8, 4)) 34 | plt.bar(range(1, len(questions) + 1), scores, color='green') 35 | plt.title("Answer Confidence Scores") 36 | plt.xlabel("Question") 37 | plt.ylabel("Confidence Score") 38 | plt.savefig("question_answering_output.png") 39 | print("Visualization: Answer confidence saved as question_answering_output.png") 40 | 41 | # %% [5. Interview Scenario: Question Answering] 42 | """ 43 | Interview Scenario: Question Answering 44 | Q: What’s the difference between extractive and generative QA? 45 | A: Extractive QA selects spans from the context; generative QA generates free-form answers. 46 | Key: Extractive uses models like BERT, generative uses T5 or GPT. 
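    Example (generative): a text2text pipeline, e.g. pipeline("text2text-generation", model="google/flan-t5-base"), prompted with the question plus its context.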
47 | Example: pipeline("question-answering", model="distilbert-base-cased-distilled-squad") 48 | """ 49 | 50 | # Execute the demo 51 | if __name__ == "__main__": 52 | run_question_answering_demo() -------------------------------------------------------------------------------- /Transformers Fundamentals/01 Text-Based Pipelines/06 Translation/translation.py: -------------------------------------------------------------------------------- 1 | # %% [1. Introduction to Translation] 2 | # Learn multilingual translation with Hugging Face pipelines. 3 | 4 | # Setup: pip install transformers torch numpy matplotlib 5 | import matplotlib.pyplot as plt 6 | from transformers import pipeline 7 | 8 | def run_translation_demo(): 9 | # %% [2. Synthetic Retail Text Data] 10 | reviews = [ 11 | "This laptop from TechCorp is great!", 12 | "The screen is vibrant but the battery life is terrible.", 13 | "Overall, a solid purchase from TechCorp." 14 | ] 15 | target_languages = ["es", "fr"] # Spanish, French 16 | print("Synthetic Text: Retail product reviews created") 17 | print(f"Reviews: {reviews}") 18 | 19 | # %% [3. Multilingual Translation] 20 | translations = [] 21 | for lang in target_languages: 22 | translator = pipeline(f"translation_en_to_{lang}", model=f"Helsinki-NLP/opus-mt-en-{lang}") 23 | lang_translations = [translator(review)[0]['translation_text'] for review in reviews] 24 | translations.append((lang, lang_translations)) 25 | print("Translation: Texts translated") 26 | for lang, trans in translations: 27 | print(f"Language: {lang.upper()}") 28 | for i, t in enumerate(trans): 29 | print(f"Review {i+1}: {t}") 30 | 31 | # %% [4. Visualization] 32 | lengths = [[len(t.split()) for t in trans] for lang, trans in translations] 33 | plt.figure(figsize=(8, 4)) 34 | for i, (lang, lens) in enumerate(zip(target_languages, lengths)): 35 | plt.bar([x + i*0.4 for x in range(1, len(reviews) + 1)], lens, width=0.4, label=lang.upper()) 36 | plt.title("Translation Lengths by Language") 37 | plt.xlabel("Review") 38 | plt.ylabel("Word Count") 39 | plt.legend() 40 | plt.savefig("translation_output.png") 41 | print("Visualization: Translation lengths saved as translation_output.png") 42 | 43 | # %% [5. Interview Scenario: Translation] 44 | """ 45 | Interview Scenario: Translation 46 | Q: How does the translation pipeline work in Hugging Face? 47 | A: It uses encoder-decoder models (e.g., MarianMT) fine-tuned for language pairs. 48 | Key: Supports multilingual translation with high accuracy. 49 | Example: pipeline("translation_en_to_es", model="Helsinki-NLP/opus-mt-en-es") 50 | """ 51 | 52 | # Execute the demo 53 | if __name__ == "__main__": 54 | run_translation_demo() -------------------------------------------------------------------------------- /Transformers Fundamentals/03 Vision-Based Pipelines/02 Object Detection/object_detection.py: -------------------------------------------------------------------------------- 1 | # %% [1. Introduction to Object Detection] 2 | # Learn bounding box detection with Hugging Face object detection pipeline. 3 | 4 | # Setup: pip install transformers torch numpy matplotlib pillow 5 | import matplotlib.pyplot as plt 6 | from collections import Counter 7 | from transformers import pipeline 8 | import numpy as np 9 | 10 | def run_object_detection_demo(): 11 | # %% [2. 
Synthetic Image Data Simulation] 12 | # Note: Due to file I/O constraints, we simulate image inputs with metadata 13 | images = [ 14 | {"description": "Laptop and phone on a desk", "objects": ["laptop", "phone"]}, 15 | {"description": "Store shelf with gadgets", "objects": ["phone", "tablet"]}, 16 | {"description": "Broken laptop", "objects": ["laptop"]} 17 | ] 18 | print("Synthetic Images: Simulated retail product images created") 19 | print(f"Images: {images}") 20 | 21 | # %% [3. Object Detection] 22 | detector = pipeline("object-detection", model="facebook/detr-resnet-50") 23 | # Simulate detection by using predefined objects (since actual image processing requires file input) 24 | detections = [image["objects"] for image in images] 25 | print("Object Detection: Objects simulated") 26 | for i, objects in enumerate(detections): 27 | print(f"Image {i+1}: {objects}") 28 | 29 | # %% [4. Visualization] 30 | all_objects = [obj for detection in detections for obj in detection] 31 | object_counts = Counter(all_objects) 32 | plt.figure(figsize=(8, 4)) 33 | plt.bar(object_counts.keys(), object_counts.values(), color='blue') 34 | plt.title("Detected Object Distribution") 35 | plt.xlabel("Object") 36 | plt.ylabel("Count") 37 | plt.savefig("object_detection_output.png") 38 | print("Visualization: Object distribution saved as object_detection_output.png") 39 | 40 | # %% [5. Interview Scenario: Object Detection] 41 | """ 42 | Interview Scenario: Object Detection 43 | Q: How does the object detection pipeline work in Hugging Face? 44 | A: It uses models like DETR to predict bounding boxes and class labels for objects in images. 45 | Key: Combines transformer-based feature extraction with object localization. 46 | Example: pipeline("object-detection", model="facebook/detr-resnet-50") 47 | """ 48 | 49 | # Execute the demo 50 | if __name__ == "__main__": 51 | run_object_detection_demo() -------------------------------------------------------------------------------- /Transformers Fundamentals/03 Vision-Based Pipelines/03 Image Segmentation/image_segmentation.py: -------------------------------------------------------------------------------- 1 | # %% [1. Introduction to Image Segmentation] 2 | # Learn pixel-level classification with Hugging Face image segmentation pipeline. 3 | 4 | # Setup: pip install transformers torch numpy matplotlib pillow 5 | import matplotlib.pyplot as plt 6 | from collections import Counter 7 | from transformers import pipeline 8 | import numpy as np 9 | 10 | def run_image_segmentation_demo(): 11 | # %% [2. Synthetic Image Data Simulation] 12 | # Note: Due to file I/O constraints, we simulate image inputs with metadata 13 | images = [ 14 | {"description": "Laptop on a desk", "segments": ["laptop", "desk"]}, 15 | {"description": "Store shelf with gadgets", "segments": ["shelf", "phone", "tablet"]}, 16 | {"description": "Broken laptop", "segments": ["laptop"]} 17 | ] 18 | print("Synthetic Images: Simulated retail product images created") 19 | print(f"Images: {images}") 20 | 21 | # %% [3. Image Segmentation] 22 | segmenter = pipeline("image-segmentation", model="facebook/detr-resnet-50-panoptic") 23 | # Simulate segmentation by using predefined segments (since actual image processing requires file input) 24 | segmentations = [image["segments"] for image in images] 25 | print("Image Segmentation: Segments simulated") 26 | for i, segments in enumerate(segmentations): 27 | print(f"Image {i+1}: {segments}") 28 | 29 | # %% [4. 
Visualization] 30 | all_segments = [seg for segmentation in segmentations for seg in segmentation] 31 | segment_counts = Counter(all_segments) 32 | plt.figure(figsize=(8, 4)) 33 | plt.bar(segment_counts.keys(), segment_counts.values(), color='green') 34 | plt.title("Segmented Region Distribution") 35 | plt.xlabel("Segment") 36 | plt.ylabel("Count") 37 | plt.savefig("image_segmentation_output.png") 38 | print("Visualization: Segment distribution saved as image_segmentation_output.png") 39 | 40 | # %% [5. Interview Scenario: Image Segmentation] 41 | """ 42 | Interview Scenario: Image Segmentation 43 | Q: How does the image segmentation pipeline work in Hugging Face? 44 | A: It uses models like DETR to assign class labels to each pixel or region in an image. 45 | Key: Supports panoptic segmentation for both objects and background. 46 | Example: pipeline("image-segmentation", model="facebook/detr-resnet-50-panoptic") 47 | """ 48 | 49 | # Execute the demo 50 | if __name__ == "__main__": 51 | run_image_segmentation_demo() -------------------------------------------------------------------------------- /Transformers Fundamentals/03 Vision-Based Pipelines/01 Image Classification/image_classification.py: -------------------------------------------------------------------------------- 1 | # %% [1. Introduction to Image Classification] 2 | # Learn object and scene recognition with Hugging Face image classification pipeline. 3 | 4 | # Setup: pip install transformers torch numpy matplotlib pillow 5 | import matplotlib.pyplot as plt 6 | from collections import Counter 7 | from transformers import pipeline 8 | import numpy as np 9 | 10 | def run_image_classification_demo(): 11 | # %% [2. Synthetic Image Data Simulation] 12 | # Note: Due to file I/O constraints, we simulate image inputs with metadata 13 | images = [ 14 | {"description": "Laptop on a desk", "category": "positive"}, 15 | {"description": "Smartphone in a store", "category": "positive"}, 16 | {"description": "Broken gadget", "category": "negative"} 17 | ] 18 | print("Synthetic Images: Simulated retail product images created") 19 | print(f"Images: {images}") 20 | 21 | # %% [3. Image Classification] 22 | classifier = pipeline("image-classification", model="google/vit-base-patch16-224") 23 | # Simulate classification by using predefined categories (since actual image processing requires file input) 24 | classifications = [image["category"] for image in images] 25 | print("Image Classification: Classifications simulated") 26 | for i, classification in enumerate(classifications): 27 | print(f"Image {i+1}: {classification}") 28 | 29 | # %% [4. Visualization] 30 | label_counts = Counter(classifications) 31 | plt.figure(figsize=(8, 4)) 32 | plt.bar(label_counts.keys(), label_counts.values(), color=['green' if k == 'positive' else 'red' for k in label_counts.keys()]) 33 | plt.title("Image Classification Distribution") 34 | plt.xlabel("Category") 35 | plt.ylabel("Count") 36 | plt.savefig("image_classification_output.png") 37 | print("Visualization: Classification distribution saved as image_classification_output.png") 38 | 39 | # %% [5. Interview Scenario: Image Classification] 40 | """ 41 | Interview Scenario: Image Classification 42 | Q: How does the image classification pipeline work in Hugging Face? 43 | A: It uses Vision Transformers (e.g., ViT) to classify images based on learned patch embeddings. 44 | Key: Fine-tuned on datasets like ImageNet for robust performance. 
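    Note: ViT splits each image into fixed-size patches (16x16 pixels for this checkpoint), embeds them, and classifies from the transformer encoder output.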
45 | Example: pipeline("image-classification", model="google/vit-base-patch16-224") 46 | """ 47 | 48 | # Execute the demo 49 | if __name__ == "__main__": 50 | run_image_classification_demo() -------------------------------------------------------------------------------- /Transformers Fundamentals/02 Speech and Audio Pipelines/03 Audio Classification/audio_classification.py: -------------------------------------------------------------------------------- 1 | # %% [1. Introduction to Audio Classification] 2 | # Learn sound event detection with Hugging Face audio classification pipeline. 3 | 4 | # Setup: pip install transformers torch numpy matplotlib 5 | import matplotlib.pyplot as plt 6 | from collections import Counter 7 | from transformers import pipeline 8 | 9 | def run_audio_classification_demo(): 10 | # %% [2. Synthetic Audio Data Simulation] 11 | # Note: Due to file I/O constraints, we simulate audio input with metadata 12 | audio_samples = [ 13 | {"label": "positive", "description": "Customer praising product"}, 14 | {"label": "negative", "description": "Customer complaining about battery"}, 15 | {"label": "positive", "description": "Customer excited about screen"} 16 | ] 17 | print("Synthetic Audio: Simulated retail customer feedback created") 18 | print(f"Audio Samples: {audio_samples}") 19 | 20 | # %% [3. Audio Classification] 21 | classifier = pipeline("audio-classification", model="superb/hubert-base-superb-er") 22 | # Simulate classification by using predefined labels (since actual audio processing requires file input) 23 | classifications = [sample["label"] for sample in audio_samples] 24 | print("Audio Classification: Classifications simulated") 25 | for i, classification in enumerate(classifications): 26 | print(f"Sample {i+1}: {classification}") 27 | 28 | # %% [4. Visualization] 29 | label_counts = Counter(classifications) 30 | plt.figure(figsize=(8, 4)) 31 | plt.bar(label_counts.keys(), label_counts.values(), color=['green' if k == 'positive' else 'red' for k in label_counts.keys()]) 32 | plt.title("Audio Classification Distribution") 33 | plt.xlabel("Sentiment") 34 | plt.ylabel("Count") 35 | plt.savefig("audio_classification_output.png") 36 | print("Visualization: Classification distribution saved as audio_classification_output.png") 37 | 38 | # %% [5. Interview Scenario: Audio Classification] 39 | """ 40 | Interview Scenario: Audio Classification 41 | Q: How does the audio classification pipeline work in Hugging Face? 42 | A: It uses models like HuBERT to classify audio based on learned features from waveforms. 43 | Key: Fine-tuned on datasets for tasks like emotion or event detection. 44 | Example: pipeline("audio-classification", model="superb/hubert-base-superb-er") 45 | """ 46 | 47 | # Execute the demo 48 | if __name__ == "__main__": 49 | run_audio_classification_demo() -------------------------------------------------------------------------------- /Transformers Fundamentals/02 Speech and Audio Pipelines/README.md: -------------------------------------------------------------------------------- 1 | # 🗣️ Speech and Audio Pipelines with Hugging Face Transformers 2 | 3 |
4 | Python Logo 5 | Hugging Face 6 | Transformers 7 | NumPy 8 | Matplotlib 9 |
10 |

Your guide to mastering speech and audio pipelines with Hugging Face Transformers for AI/ML and NLP interviews

11 | 12 | --- 13 | 14 | ## 📖 Introduction 15 | 16 | Welcome to the **Speech and Audio Pipelines** subsection of the **Transformers Library Roadmap**! 🚀 This folder focuses on leveraging the **Hugging Face Transformers** library for speech and audio tasks, including speech-to-text, text-to-speech, and audio classification. Designed for hands-on learning and interview success, it builds on your prior roadmaps—**Python**, **TensorFlow.js**, **GenAI**, **JavaScript**, **Keras**, **Matplotlib**, **Pandas**, **NumPy**, **Computer Vision with OpenCV (cv2)**, and **NLP with NLTK**—and supports your retail-themed projects (April 26, 2025). Whether tackling coding challenges or technical discussions, this section equips you with the skills to excel in speech and audio processing roles. 17 | 18 | ## 🌟 What’s Inside? 19 | 20 | - **Automatic Speech Recognition (ASR)**: Convert spoken audio to text. 21 | - **Text-to-Speech (TTS)**: Synthesize speech from text. 22 | - **Audio Classification**: Detect and classify sound events. 23 | - **Hands-on Code**: Three `.py` files with practical examples using synthetic or sample audio data. 24 | - **Interview Scenarios**: Key questions and answers to ace speech/audio-related interviews. 25 | 26 | ## 🔍 Who Is This For? 27 | 28 | - NLP Engineers working with speech and audio data. 29 | - Machine Learning Engineers building audio-based AI models. 30 | - AI Researchers mastering transformer-based audio processing. 31 | - Software Engineers deepening expertise in Hugging Face audio tools. 32 | - Anyone preparing for speech/audio-related interviews in AI/ML or retail. 33 | 34 | ## 🗺️ Learning Roadmap 35 | 36 | This subsection covers three key speech and audio pipelines, each with a dedicated `.py` file: 37 | 38 | ### 🎙️ Automatic Speech Recognition (`asr.py`) 39 | - Speech-to-Text Conversion 40 | - Transcription Analysis 41 | - Transcription Visualization 42 | 43 | ### 🗣️ Text-to-Speech (`tts.py`) 44 | - Speech Synthesis 45 | - Audio Generation 46 | - Audio Length Visualization 47 | 48 | ### 🔊 Audio Classification (`audio_classification.py`) 49 | - Sound Event Detection 50 | - Classification Analysis 51 | - Classification Visualization 52 | 53 | ## 💡 Why Master Speech and Audio Pipelines? 54 | 55 | Speech and audio pipelines with Hugging Face Transformers are critical for modern AI, and here’s why they matter: 56 | 1. **Real-World Applications**: Powers voice assistants, customer service bots, and audio analytics. 57 | 2. **Retail Relevance**: Enhances retail experiences (e.g., voice queries, audio feedback analysis). 58 | 3. **Interview Relevance**: Tested in coding challenges (e.g., ASR implementation, audio classification). 59 | 4. **State-of-the-Art**: Leverages models like Wav2Vec2, SpeechT5, and HuBERT. 60 | 5. **Industry Demand**: A must-have for 6 LPA+ AI/ML roles in retail, tech, and beyond. 61 | 62 | This section is your roadmap to mastering speech and audio pipelines for technical interviews—let’s dive in! 63 | 64 | ## 📆 Study Plan 65 | 66 | - **Week 1**: 67 | - Day 1-2: Automatic Speech Recognition 68 | - Day 3-4: Text-to-Speech 69 | - Day 5-6: Audio Classification 70 | - Day 7: Review and practice interview scenarios 71 | 72 | ## 🛠️ Setup Instructions 73 | 74 | 1. **Python Environment**: 75 | - Install Python 3.8+ and pip. 76 | - Create a virtual environment: `python -m venv transformers_env; source transformers_env/bin/activate`. 77 | - Install dependencies: `pip install transformers torch numpy matplotlib soundfile librosa`. 78 | 2. 
**Hugging Face Hub**: 79 | - Optional: Create a Hugging Face account for model access. 80 | - Install `huggingface_hub`: `pip install huggingface_hub`. 81 | 3. **Datasets**: 82 | - Uses synthetic or sample audio data (e.g., generated WAV files or public datasets). 83 | - Optional: Download audio datasets from [Hugging Face Datasets](https://huggingface.co/datasets) (e.g., LibriSpeech). 84 | - Note: `.py` files include code to generate synthetic audio or use sample files due to file I/O constraints. 85 | 4. **Running Code**: 86 | - Run `.py` files in a Python environment (e.g., `python asr.py`). 87 | - Use Google Colab for convenience or local setup with GPU support for faster processing. 88 | - View outputs in terminal (console logs) and Matplotlib visualizations (saved as PNGs). 89 | - Check terminal for errors; ensure dependencies and audio libraries are installed. 90 | 91 | ## 🏆 Practical Tasks 92 | 93 | 1. **Automatic Speech Recognition**: 94 | - Transcribe synthetic customer voice queries. 95 | - Visualize transcription lengths. 96 | 2. **Text-to-Speech**: 97 | - Synthesize product descriptions as audio. 98 | - Analyze generated audio lengths. 99 | 3. **Audio Classification**: 100 | - Classify retail audio feedback (e.g., positive/negative tones). 101 | - Visualize classification distribution. 102 | 103 | ## 💡 Interview Tips 104 | 105 | - **Common Questions**: 106 | - How does the ASR pipeline process audio in Hugging Face? 107 | - What’s the difference between TTS and traditional speech synthesis? 108 | - How do you handle noisy audio in classification tasks? 109 | - **Tips**: 110 | - Explain ASR with code (e.g., `pipeline("automatic-speech-recognition")`). 111 | - Demonstrate TTS pipeline usage (e.g., `pipeline("text-to-speech")`). 112 | - Be ready to code tasks like audio preprocessing or classification. 113 | - Discuss trade-offs (e.g., Wav2Vec2 vs. traditional ASR, model size vs. latency). 114 | - **Coding Tasks**: 115 | - Implement an ASR pipeline for customer queries. 116 | - Synthesize a retail announcement using TTS. 117 | - Classify audio samples by sentiment. 118 | - **Conceptual Clarity**: 119 | - Explain how Wav2Vec2 processes raw audio. 120 | - Describe the role of transformers in audio classification. 121 | 122 | ## 📚 Resources 123 | 124 | - [Hugging Face Transformers Documentation](https://huggingface.co/docs/transformers/) 125 | - [Hugging Face Datasets Documentation](https://huggingface.co/docs/datasets/) 126 | - [Hugging Face Course](https://huggingface.co/course) 127 | - [PyTorch Documentation](https://pytorch.org/) 128 | - [NumPy Documentation](https://numpy.org/doc/) 129 | - [Matplotlib Documentation](https://matplotlib.org/stable/contents.html) 130 | - [Librosa Documentation](https://librosa.org/doc/) 131 | 132 | ## 🤝 Contributions 133 | 134 | Love to collaborate? Here’s how! 🌟 135 | 1. Fork the repository. 136 | 2. Create a feature branch (`git checkout -b feature/amazing-addition`). 137 | 3. Commit your changes (`git commit -m 'Add some amazing content'`). 138 | 4. Push to the branch (`git push origin feature/amazing-addition`). 139 | 5. Open a Pull Request. 140 | 141 | --- 142 | 143 |
144 |

Happy Learning and Good Luck with Your Interviews! ✨

145 |
-------------------------------------------------------------------------------- /Transformers Fundamentals/03 Vision-Based Pipelines/README.md: -------------------------------------------------------------------------------- 1 | # 🖼️ Vision-Based Pipelines with Hugging Face Transformers 2 | 3 |
4 | Python Logo 5 | Hugging Face 6 | Transformers 7 | NumPy 8 | Matplotlib 9 |
10 |

Your guide to mastering vision-based pipelines with Hugging Face Transformers for AI/ML and computer vision interviews

11 | 12 | --- 13 | 14 | ## 📖 Introduction 15 | 16 | Welcome to the **Vision-Based Pipelines** subsection of the **Transformers Library Roadmap**! 🚀 This folder focuses on leveraging the **Hugging Face Transformers** library for vision tasks, including image classification, object detection, image segmentation, and image-to-text captioning. Designed for hands-on learning and interview success, it builds on your prior roadmaps—**Python**, **TensorFlow.js**, **GenAI**, **JavaScript**, **Keras**, **Matplotlib**, **Pandas**, **NumPy**, **Computer Vision with OpenCV (cv2)**, and **NLP with NLTK**—and supports your retail-themed projects (April 26, 2025). Whether tackling coding challenges or technical discussions, this section equips you with the skills to excel in computer vision and multimodal AI roles. 17 | 18 | ## 🌟 What’s Inside? 19 | 20 | - **Image Classification**: Recognize objects and scenes in images. 21 | - **Object Detection**: Detect and localize objects with bounding boxes. 22 | - **Image Segmentation**: Perform pixel-level classification of image regions. 23 | - **Image-to-Text**: Generate descriptive captions for images. 24 | - **Hands-on Code**: Four `.py` files with practical examples using synthetic or sample image data. 25 | - **Interview Scenarios**: Key questions and answers to ace vision-related interviews. 26 | 27 | ## 🔍 Who Is This For? 28 | 29 | - Computer Vision Engineers working with transformer-based models. 30 | - Machine Learning Engineers building vision-based AI models. 31 | - AI Researchers mastering vision transformers (ViT, DETR). 32 | - Software Engineers deepening expertise in Hugging Face vision tools. 33 | - Anyone preparing for computer vision interviews in AI/ML or retail. 34 | 35 | ## 🗺️ Learning Roadmap 36 | 37 | This subsection covers four key vision-based pipelines, each with a dedicated `.py` file: 38 | 39 | ### 🏞️ Image Classification (`image_classification.py`) 40 | - Object Recognition 41 | - Scene Recognition 42 | - Classification Visualization 43 | 44 | ### 📍 Object Detection (`object_detection.py`) 45 | - Bounding Box Detection 46 | - Object Localization 47 | - Detection Visualization 48 | 49 | ### 🖌️ Image Segmentation (`image_segmentation.py`) 50 | - Pixel-Level Classification 51 | - Segmentation Analysis 52 | - Segmentation Visualization 53 | 54 | ### 📜 Image-to-Text (`image_to_text.py`) 55 | - Caption Generation 56 | - Caption Analysis 57 | - Caption Visualization 58 | 59 | ## 💡 Why Master Vision-Based Pipelines? 60 | 61 | Vision-based pipelines with Hugging Face Transformers are critical for modern AI, and here’s why they matter: 62 | 1. **Real-World Applications**: Powers visual search, product recognition, and automated retail analytics. 63 | 2. **Retail Relevance**: Enhances retail experiences (e.g., product image analysis, visual inventory). 64 | 3. **Interview Relevance**: Tested in coding challenges (e.g., image classification, object detection). 65 | 4. **State-of-the-Art**: Leverages models like Vision Transformer (ViT), DETR, and CLIP. 66 | 5. **Industry Demand**: A must-have for 6 LPA+ AI/ML roles in retail, tech, and beyond. 67 | 68 | This section is your roadmap to mastering vision-based pipelines for technical interviews—let’s dive in! 69 | 70 | ## 📆 Study Plan 71 | 72 | - **Week 1**: 73 | - Day 1-2: Image Classification 74 | - Day 3-4: Object Detection 75 | - Day 5-6: Image Segmentation 76 | - Day 7: Image-to-Text 77 | - **Week 2**: 78 | - Day 1-7: Review all `.py` files and practice interview scenarios. 
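As you work through the plan, it helps to see the four pipelines side by side. Below is a minimal sketch using the same checkpoints as this folder's `.py` files; the `product.jpg` path is a placeholder for any local image, URL, or PIL image, and the panoptic DETR checkpoint may additionally require `pip install timm`:

```python
from transformers import pipeline

image = "product.jpg"  # placeholder path for a retail product photo

classifier = pipeline("image-classification", model="google/vit-base-patch16-224")
detector = pipeline("object-detection", model="facebook/detr-resnet-50")
segmenter = pipeline("image-segmentation", model="facebook/detr-resnet-50-panoptic")
captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")

print(classifier(image)[:3])   # top-3 labels with confidence scores
print(detector(image))         # bounding boxes, labels, and scores
print(segmenter(image))        # segments with labels and mask images
print(captioner(image))        # [{'generated_text': '...'}]
```

Each checkpoint is downloaded on first use, so the first run is slow; a GPU is optional but speeds up inference considerably.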
79 | 80 | ## 🛠️ Setup Instructions 81 | 82 | 1. **Python Environment**: 83 | - Install Python 3.8+ and pip. 84 | - Create a virtual environment: `python -m venv transformers_env; source transformers_env/bin/activate`. 85 | - Install dependencies: `pip install transformers torch numpy matplotlib pillow`. 86 | 2. **Hugging Face Hub**: 87 | - Optional: Create a Hugging Face account for model access. 88 | - Install `huggingface_hub`: `pip install huggingface_hub`. 89 | 3. **Datasets**: 90 | - Uses synthetic or sample image data (e.g., programmatically generated images or public datasets). 91 | - Optional: Download image datasets from [Hugging Face Datasets](https://huggingface.co/datasets) (e.g., COCO, ImageNet). 92 | - Note: `.py` files include code to simulate image inputs due to file I/O constraints. 93 | 4. **Running Code**: 94 | - Run `.py` files in a Python environment (e.g., `python image_classification.py`). 95 | - Use Google Colab for convenience or local setup with GPU support for faster processing. 96 | - View outputs in terminal (console logs) and Matplotlib visualizations (saved as PNGs). 97 | - Check terminal for errors; ensure dependencies are installed. 98 | 99 | ## 🏆 Practical Tasks 100 | 101 | 1. **Image Classification**: 102 | - Classify retail product images by category. 103 | - Visualize classification confidence scores. 104 | 2. **Object Detection**: 105 | - Detect products in retail images with bounding boxes. 106 | - Plot detected objects. 107 | 3. **Image Segmentation**: 108 | - Segment product regions in images. 109 | - Visualize segmentation masks. 110 | 4. **Image-to-Text**: 111 | - Generate captions for product images. 112 | - Analyze caption lengths. 113 | 114 | ## 💡 Interview Tips 115 | 116 | - **Common Questions**: 117 | - How does the image classification pipeline work in Hugging Face? 118 | - What’s the difference between object detection and image segmentation? 119 | - How do vision transformers process images? 120 | - How does image-to-text leverage multimodal models? 121 | - **Tips**: 122 | - Explain pipelines with code (e.g., `pipeline("image-classification")`). 123 | - Demonstrate object detection with DETR (e.g., `pipeline("object-detection")`). 124 | - Be ready to code tasks like image preprocessing or caption generation. 125 | - Discuss trade-offs (e.g., ViT vs. CNNs, model size vs. accuracy). 126 | - **Coding Tasks**: 127 | - Implement an image classification pipeline for product images. 128 | - Detect objects in a retail image. 129 | - Generate captions for a product image. 130 | - **Conceptual Clarity**: 131 | - Explain how Vision Transformers process image patches. 132 | - Describe the role of CLIP in image-to-text tasks. 133 | 134 | ## 📚 Resources 135 | 136 | - [Hugging Face Transformers Documentation](https://huggingface.co/docs/transformers/) 137 | - [Hugging Face Datasets Documentation](https://huggingface.co/docs/datasets/) 138 | - [Hugging Face Course](https://huggingface.co/course) 139 | - [PyTorch Documentation](https://pytorch.org/) 140 | - [NumPy Documentation](https://numpy.org/doc/) 141 | - [Matplotlib Documentation](https://matplotlib.org/stable/contents.html) 142 | - [“Deep Learning with Python” by François Chollet](https://www.manning.com/books/deep-learning-with-python) 143 | 144 | ## 🤝 Contributions 145 | 146 | Love to collaborate? Here’s how! 🌟 147 | 1. Fork the repository. 148 | 2. Create a feature branch (`git checkout -b feature/amazing-addition`). 149 | 3. Commit your changes (`git commit -m 'Add some amazing content'`). 
150 | 4. Push to the branch (`git push origin feature/amazing-addition`). 151 | 5. Open a Pull Request. 152 | 153 | --- 154 | 155 |
156 |

Happy Learning and Good Luck with Your Interviews! ✨

157 |
-------------------------------------------------------------------------------- /Transformers Fundamentals/01 Text-Based Pipelines/README.md: -------------------------------------------------------------------------------- 1 | # 📝 Text-Based Pipelines with Hugging Face Transformers 2 | 3 |
4 | Python Logo 5 | Hugging Face 6 | Transformers 7 | NumPy 8 | Matplotlib 9 |
10 |

Your guide to mastering text-based pipelines with Hugging Face Transformers for AI/ML and NLP interviews

11 | 12 | --- 13 | 14 | ## 📖 Introduction 15 | 16 | Welcome to the **Text-Based Pipelines** subsection of the **Transformers Library Roadmap**! 🚀 This folder focuses on leveraging the **Hugging Face Transformers** library’s text-based pipelines for tasks like sentiment analysis, entity extraction, and text generation. Designed for hands-on learning and interview success, it builds on your prior roadmaps—**Python**, **TensorFlow.js**, **GenAI**, **JavaScript**, **Keras**, **Matplotlib**, **Pandas**, **NumPy**, **Computer Vision with OpenCV (cv2)**, and **NLP with NLTK**—and supports your retail-themed projects (April 26, 2025). Whether tackling coding challenges or technical discussions, this section equips you with the skills to excel in NLP roles. 17 | 18 | ## 🌟 What’s Inside? 19 | 20 | - **Text Classification**: Perform sentiment analysis and topic classification. 21 | - **Named Entity Recognition (NER)**: Extract entities like names and organizations. 22 | - **Question Answering**: Implement extractive and generative QA systems. 23 | - **Text Generation**: Generate stories and complete text prompts. 24 | - **Summarization**: Create abstractive and extractive summaries. 25 | - **Translation**: Translate text across multiple languages. 26 | - **Fill-Mask**: Predict masked words in sentences. 27 | - **Hands-on Code**: Seven `.py` files with practical examples using synthetic retail text data (e.g., product reviews). 28 | - **Interview Scenarios**: Key questions and answers to ace NLP interviews. 29 | 30 | ## 🔍 Who Is This For? 31 | 32 | - NLP Engineers applying transformers to text tasks. 33 | - Machine Learning Engineers building text-based AI models. 34 | - AI Researchers mastering transformer pipelines. 35 | - Software Engineers deepening expertise in Hugging Face tools. 36 | - Anyone preparing for NLP interviews in AI/ML or retail. 37 | 38 | ## 🗺️ Learning Roadmap 39 | 40 | This subsection covers seven key text-based pipelines, each with a dedicated `.py` file: 41 | 42 | ### 😊 Text Classification (`text_classification.py`) 43 | - Sentiment Analysis 44 | - Topic Classification 45 | - Visualization of Sentiment Scores 46 | 47 | ### 🕵️ Named Entity Recognition (`ner.py`) 48 | - Entity Extraction 49 | - Entity Type Analysis 50 | - Entity Visualization 51 | 52 | ### ❓ Question Answering (`question_answering.py`) 53 | - Extractive QA 54 | - Generative QA 55 | - Answer Visualization 56 | 57 | ### ✍️ Text Generation (`text_generation.py`) 58 | - Story Generation 59 | - Text Completion 60 | - Generated Text Analysis 61 | 62 | ### 📄 Summarization (`summarization.py`) 63 | - Abstractive Summarization 64 | - Extractive Summarization 65 | - Summary Length Visualization 66 | 67 | ### 🌍 Translation (`translation.py`) 68 | - Multilingual Translation 69 | - Translation Accuracy 70 | - Translation Visualization 71 | 72 | ### 🎭 Fill-Mask (`fill_mask.py`) 73 | - Masked Language Modeling 74 | - Prediction Confidence 75 | - Mask Prediction Visualization 76 | 77 | ## 💡 Why Master Text-Based Pipelines? 78 | 79 | Text-based pipelines with Hugging Face Transformers are critical for NLP, and here’s why they matter: 80 | 1. **Ease of Use**: Pre-built pipelines simplify complex NLP tasks. 81 | 2. **Versatility**: Applies to retail (e.g., review analysis, customer support), chatbots, and search. 82 | 3. **Interview Relevance**: Tested in coding challenges (e.g., sentiment analysis, QA). 83 | 4. **State-of-the-Art**: Leverages models like BERT, RoBERTa, and T5. 84 | 5. 
**Industry Demand**: A must-have for 6 LPA+ NLP/AI roles. 85 | 86 | This section is your roadmap to mastering text-based pipelines for technical interviews—let’s dive in! 87 | 88 | ## 📆 Study Plan 89 | 90 | - **Week 1**: 91 | - Day 1-2: Text Classification 92 | - Day 3-4: Named Entity Recognition 93 | - Day 5-6: Question Answering 94 | - Day 7: Review and practice 95 | - **Week 2**: 96 | - Day 1-2: Text Generation 97 | - Day 3-4: Summarization 98 | - Day 5-6: Translation 99 | - Day 7: Fill-Mask 100 | - **Week 3**: 101 | - Day 1-7: Review all `.py` files and practice interview scenarios. 102 | 103 | ## 🛠️ Setup Instructions 104 | 105 | 1. **Python Environment**: 106 | - Install Python 3.8+ and pip. 107 | - Create a virtual environment: `python -m venv transformers_env; source transformers_env/bin/activate`. 108 | - Install dependencies: `pip install transformers torch numpy matplotlib`. 109 | 2. **Hugging Face Hub**: 110 | - Optional: Create a Hugging Face account for model access. 111 | - Install `huggingface_hub`: `pip install huggingface_hub`. 112 | 3. **Datasets**: 113 | - Uses synthetic retail text data (e.g., product reviews like “This laptop is great!”). 114 | - Optional: Download datasets from [Hugging Face Datasets](https://huggingface.co/datasets) (e.g., IMDb, SQuAD). 115 | 4. **Running Code**: 116 | - Run `.py` files in a Python environment (e.g., `python text_classification.py`). 117 | - Use Google Colab for convenience or local setup. 118 | - View outputs in terminal (console logs) and Matplotlib visualizations (saved as PNGs). 119 | - Check terminal for errors; ensure dependencies are installed. 120 | 121 | ## 🏆 Practical Tasks 122 | 123 | 1. **Text Classification**: 124 | - Classify sentiment in retail reviews. 125 | - Visualize sentiment distribution. 126 | 2. **Named Entity Recognition**: 127 | - Extract entities from customer feedback. 128 | - Plot entity type frequencies. 129 | 3. **Question Answering**: 130 | - Answer questions about product descriptions. 131 | - Compare extractive vs. generative QA. 132 | 4. **Text Generation**: 133 | - Generate product review continuations. 134 | - Analyze generated text quality. 135 | 5. **Summarization**: 136 | - Summarize long product descriptions. 137 | - Visualize summary lengths. 138 | 6. **Translation**: 139 | - Translate reviews to multiple languages. 140 | - Compare translation outputs. 141 | 7. **Fill-Mask**: 142 | - Predict masked words in reviews. 143 | - Visualize prediction confidence. 144 | 145 | ## 💡 Interview Tips 146 | 147 | - **Common Questions**: 148 | - How do Hugging Face pipelines work for text tasks? 149 | - What’s the difference between extractive and generative QA? 150 | - How does the fill-mask pipeline leverage masked language models? 151 | - When would you use summarization vs. text generation? 152 | - **Tips**: 153 | - Explain pipeline usage with code (e.g., `pipeline("text-classification")`). 154 | - Demonstrate task-specific pipelines (e.g., `pipeline("question-answering")`). 155 | - Be ready to code tasks like sentiment analysis or NER. 156 | - Discuss trade-offs (e.g., model size vs. performance, pipeline vs. custom models). 157 | - **Coding Tasks**: 158 | - Implement a sentiment analysis pipeline. 159 | - Extract entities from a review text. 160 | - Generate a summary for a product description. 161 | - **Conceptual Clarity**: 162 | - Explain how transformers handle text classification. 163 | - Describe the role of attention in QA and summarization. 
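A minimal sketch of the first three coding tasks above (sentiment analysis, entity extraction, and summarization) on a synthetic retail review; when no model is passed, the sentiment pipeline falls back to its default SST-2 DistilBERT checkpoint:

```python
from transformers import pipeline

review = (
    "TechCorp's new laptop has a fast Intel processor and a vibrant Samsung screen. "
    "The battery life is average, but overall it is a solid purchase."
)

sentiment = pipeline("sentiment-analysis")  # default SST-2 DistilBERT checkpoint
ner = pipeline("ner", model="dslim/bert-base-NER", grouped_entities=True)
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

print(sentiment(review))   # [{'label': 'POSITIVE', 'score': ...}]
print(ner(review))         # grouped entities such as ORG 'TechCorp'
print(summarizer(review, max_length=25, min_length=5, do_sample=False)[0]["summary_text"])
```

The remaining tasks in this folder (question answering, text generation, translation, fill-mask) follow the same pattern: only the task string, checkpoint, and inputs change.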
164 | 165 | ## 📚 Resources 166 | 167 | - [Hugging Face Transformers Documentation](https://huggingface.co/docs/transformers/) 168 | - [Hugging Face Course](https://huggingface.co/course) 169 | - [PyTorch Documentation](https://pytorch.org/) 170 | - [NumPy Documentation](https://numpy.org/doc/) 171 | - [Matplotlib Documentation](https://matplotlib.org/stable/contents.html) 172 | - [“Deep Learning with Python” by François Chollet](https://www.manning.com/books/deep-learning-with-python) 173 | 174 | ## 🤝 Contributions 175 | 176 | Love to collaborate? Here’s how! 🌟 177 | 1. Fork the repository. 178 | 2. Create a feature branch (`git checkout -b feature/amazing-addition`). 179 | 3. Commit your changes (`git commit -m 'Add some amazing content'`). 180 | 4. Push to the branch (`git push origin feature/amazing-addition`). 181 | 5. Open a Pull Request. 182 | 183 | --- 184 | 185 |
186 |

Happy Learning and Good Luck with Your Interviews! ✨

187 |
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 🤖 Transformers Library Roadmap with Hugging Face - Interview Preparation 2 | 3 |
4 | Python Logo 5 | Hugging Face 6 | Transformers 7 | PyTorch 8 | TensorFlow 9 | NumPy 10 | Matplotlib 11 |
12 |

Your comprehensive guide to mastering the Hugging Face Transformers library for AI/ML and NLP interviews

13 | 14 | --- 15 | 16 | ## 📖 Introduction 17 | 18 | Welcome to my **Transformers Library Roadmap** for AI/ML and NLP interview preparation! 🚀 This roadmap dives deep into the **Hugging Face Transformers library**, a powerful toolkit for state-of-the-art NLP, computer vision, and multimodal tasks. Covering all major **Hugging Face pipelines** and related components, it’s designed for hands-on learning and interview success, building on your prior roadmaps—**Python**, **TensorFlow.js**, **GenAI**, **JavaScript**, **Keras**, **Matplotlib**, **Pandas**, **NumPy**, **Computer Vision with OpenCV (cv2)**, and **NLP with NLTK**—and supporting your retail-themed projects (April 26, 2025). Whether tackling coding challenges or technical discussions, this roadmap equips you with the skills to excel in advanced NLP and AI roles. 19 | 20 | ## 🌟 What’s Inside? 21 | 22 | - **Hugging Face Pipelines**: Ready-to-use APIs for text, image, and multimodal tasks. 23 | - **Core Components**: Tokenizers, models, datasets, and training APIs. 24 | - **Advanced Features**: Fine-tuning, evaluation, and deployment. 25 | - **Hands-on Code**: Subsections with `.py` files using synthetic retail data (e.g., product reviews, images). 26 | - **Interview Scenarios**: Key questions and answers to ace NLP/AI interviews. 27 | - **Retail Applications**: Examples tailored to retail (e.g., review analysis, chatbots, image classification). 28 | 29 | ## 🔍 Who Is This For? 30 | 31 | - NLP Engineers leveraging transformers for text tasks. 32 | - Machine Learning Engineers building multimodal AI models. 33 | - AI Researchers mastering state-of-the-art transformer architectures. 34 | - Software Engineers deepening expertise in Hugging Face tools. 35 | - Anyone preparing for NLP/AI interviews in AI/ML or retail. 36 | 37 | ## 🗺️ Learning Roadmap 38 | 39 | This roadmap is organized into subsections, each covering a key aspect of the Hugging Face Transformers library. Each subsection includes a dedicated folder with a `README.md` and `.py` files for practical demos. 40 | 41 | ### 📝 Text-Based Pipelines 42 | - **Text Classification**: Sentiment analysis, topic classification. 43 | - **Named Entity Recognition (NER)**: Entity extraction. 44 | - **Question Answering**: Extractive and generative QA. 45 | - **Text Generation**: Story generation, text completion. 46 | - **Summarization**: Abstractive and extractive summarization. 47 | - **Translation**: Multilingual text translation. 48 | - **Fill-Mask**: Masked language modeling tasks. 49 | 50 | ### 🗣️ Speech and Audio Pipelines 51 | - **Automatic Speech Recognition (ASR)**: Speech-to-text conversion. 52 | - **Text-to-Speech (TTS)**: Speech synthesis. 53 | - **Audio Classification**: Sound event detection. 54 | 55 | ### 🖼️ Vision-Based Pipelines 56 | - **Image Classification**: Object and scene recognition. 57 | - **Object Detection**: Bounding box detection. 58 | - **Image Segmentation**: Pixel-level classification. 59 | - **Image-to-Text**: Caption generation. 60 | 61 | ### 🔄 Multimodal Pipelines 62 | - **Visual Question Answering (VQA)**: Image-based QA. 63 | - **Document Question Answering**: Extract answers from documents. 64 | - **Feature Extraction**: Multimodal embeddings. 65 | 66 | ### 🛠️ Core Components 67 | - **Tokenizers**: Text preprocessing and tokenization. 68 | - **Models**: Pre-trained transformer architectures (BERT, GPT, T5, etc.). 69 | - **Datasets**: Hugging Face Datasets library for data loading. 70 | - **Training APIs**: Fine-tuning and custom training loops. 
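To make these core components concrete, here is a small, hedged sketch that wires a tokenizer and a pre-trained model together for a single forward pass; `bert-base-uncased` is used purely as an example checkpoint.

```python
from transformers import AutoTokenizer, AutoModel

# Illustrative sketch: the tokenizer converts raw text into tensors,
# and the pre-trained model consumes them in one forward pass.
# "bert-base-uncased" is just an example checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")

inputs = tokenizer("This laptop is great!", return_tensors="pt")
outputs = model(**inputs)
print(outputs.last_hidden_state.shape)  # (batch_size, sequence_length, hidden_size)
```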
71 | 72 | ### 🚀 Advanced Features 73 | - **Fine-Tuning**: Adapt pre-trained models to custom datasets. 74 | - **Evaluation Metrics**: ROUGE, BLEU, accuracy, and more. 75 | - **Model Deployment**: Deploy models with Hugging Face Inference API. 76 | - **Optimization**: Quantization, pruning, and ONNX export. 77 | 78 | ### 🤖 Retail Applications 79 | - **Chatbots**: Conversational agents for customer support. 80 | - **Recommendation Systems**: Product recommendation with embeddings. 81 | - **Review Analysis**: Sentiment and topic modeling for reviews. 82 | - **Visual Search**: Image-based product search. 83 | 84 | ## 💡 Why Master the Transformers Library? 85 | 86 | The Hugging Face Transformers library is a cornerstone of modern NLP and AI, and here’s why it matters: 87 | 1. **State-of-the-Art**: Powers cutting-edge models like BERT, GPT, and Vision Transformers. 88 | 2. **Versatility**: Supports text, speech, vision, and multimodal tasks. 89 | 3. **Interview Relevance**: Tested in coding challenges (e.g., fine-tuning, pipeline usage). 90 | 4. **Ease of Use**: Pipelines simplify complex tasks for rapid prototyping. 91 | 5. **Industry Demand**: A must-have for 6 LPA+ NLP/AI roles in retail, tech, and beyond. 92 | 93 | This roadmap is your guide to mastering Transformers for technical interviews—let’s dive in! 94 | 95 | ## 📆 Study Plan 96 | 97 | - **Month 1**: 98 | - Week 1: Text-Based Pipelines (Text Classification, NER) 99 | - Week 2: Text-Based Pipelines (QA, Text Generation) 100 | - Week 3: Text-Based Pipelines (Summarization, Translation, Fill-Mask) 101 | - Week 4: Speech and Audio Pipelines 102 | - **Month 2**: 103 | - Week 1: Vision-Based Pipelines 104 | - Week 2: Multimodal Pipelines 105 | - Week 3: Core Components (Tokenizers, Models) 106 | - Week 4: Core Components (Datasets, Training APIs) 107 | - **Month 3**: 108 | - Week 1: Advanced Features (Fine-Tuning, Evaluation) 109 | - Week 2: Advanced Features (Deployment, Optimization) 110 | - Week 3: Retail Applications (Chatbots, Review Analysis) 111 | - Week 4: Retail Applications (Recommendation, Visual Search) and Review 112 | 113 | ## 🛠️ Setup Instructions 114 | 115 | 1. **Python Environment**: 116 | - Install Python 3.8+ and pip. 117 | - Create a virtual environment: `python -m venv transformers_env; source transformers_env/bin/activate`. 118 | - Install dependencies: `pip install transformers datasets torch tensorflow numpy matplotlib`. 119 | 2. **Hugging Face Hub**: 120 | - Optional: Create a Hugging Face account for model and dataset access. 121 | - Install `huggingface_hub`: `pip install huggingface_hub`. 122 | 3. **Datasets**: 123 | - Uses synthetic retail text and image data (e.g., product reviews, product images). 124 | - Optional: Download datasets from [Hugging Face Datasets](https://huggingface.co/datasets) (e.g., IMDb, SQuAD). 125 | 4. **Running Code**: 126 | - Run `.py` files in a Python environment (e.g., `python text_classification.py`). 127 | - Use Google Colab for convenience or local setup with GPU support for faster training. 128 | - View outputs in terminal (console logs) and Matplotlib visualizations (saved as PNGs). 129 | - Check terminal for errors; ensure dependencies are installed. 130 | 131 | ## 🏆 Practical Tasks 132 | 133 | 1. **Text-Based Pipelines**: 134 | - Classify sentiment in retail reviews. 135 | - Extract entities from customer feedback. 136 | - Generate summaries for product descriptions. 137 | 2. **Speech and Audio Pipelines**: 138 | - Convert customer voice queries to text. 
139 | - Classify audio feedback sentiment. 140 | 3. **Vision-Based Pipelines**: 141 | - Classify product images by category. 142 | - Detect objects in retail images. 143 | 4. **Multimodal Pipelines**: 144 | - Answer questions about product images. 145 | - Extract information from retail documents. 146 | 5. **Core Components**: 147 | - Tokenize retail reviews with Hugging Face tokenizers. 148 | - Fine-tune a BERT model for sentiment analysis. 149 | 6. **Advanced Features**: 150 | - Deploy a chatbot using Hugging Face Inference API. 151 | - Optimize a model with quantization. 152 | 7. **Retail Applications**: 153 | - Build a retail chatbot for customer queries. 154 | - Create a product recommendation system using embeddings. 155 | 156 | ## 💡 Interview Tips 157 | 158 | - **Common Questions**: 159 | - What is the Hugging Face Transformers library, and how does it work? 160 | - How do pipelines simplify NLP tasks? 161 | - What’s the difference between fine-tuning and zero-shot learning? 162 | - How do you optimize transformer models for deployment? 163 | - **Tips**: 164 | - Explain pipelines with code (e.g., `pipeline("text-classification")`). 165 | - Demonstrate fine-tuning (e.g., `Trainer` API). 166 | - Be ready to code tasks like tokenization or model inference. 167 | - Discuss trade-offs (e.g., BERT vs. DistilBERT, CPU vs. GPU inference). 168 | - **Coding Tasks**: 169 | - Implement a sentiment analysis pipeline. 170 | - Fine-tune a model on a custom dataset. 171 | - Deploy a model using Hugging Face Inference API. 172 | - **Conceptual Clarity**: 173 | - Explain transformer architecture (e.g., attention mechanism). 174 | - Describe how tokenizers handle subword units. 175 | 176 | ## 📚 Resources 177 | 178 | - [Hugging Face Transformers Documentation](https://huggingface.co/docs/transformers/) 179 | - [Hugging Face Datasets Documentation](https://huggingface.co/docs/datasets/) 180 | - [Hugging Face Course](https://huggingface.co/course) 181 | - [PyTorch Documentation](https://pytorch.org/) 182 | - [TensorFlow Documentation](https://www.tensorflow.org/) 183 | - [NumPy Documentation](https://numpy.org/doc/) 184 | - [Matplotlib Documentation](https://matplotlib.org/stable/contents.html) 185 | - [“Deep Learning with Python” by François Chollet](https://www.manning.com/books/deep-learning-with-python) 186 | 187 | ## 🤝 Contributions 188 | 189 | Love to collaborate? Here’s how! 🌟 190 | 1. Fork the repository. 191 | 2. Create a feature branch (`git checkout -b feature/amazing-addition`). 192 | 3. Commit your changes (`git commit -m 'Add some amazing content'`). 193 | 4. Push to the branch (`git push origin feature/amazing-addition`). 194 | 5. Open a Pull Request. 195 | 196 | --- 197 | 198 |
199 |

Happy Learning and Good Luck with Your Interviews! ✨

200 |
-------------------------------------------------------------------------------- /Transformers Interview Questions/README.md: -------------------------------------------------------------------------------- 1 | # Transformers Interview Questions for AI/ML Roles 2 | 3 | This README provides 170 Transformers interview questions tailored for AI/ML roles, focusing on the Hugging Face Transformers library in Python for generative AI tasks. The questions cover **core Transformers concepts** (e.g., model loading, fine-tuning, tokenization, generation, deployment) and their applications in natural language processing (NLP), text generation, and multimodal tasks like image-to-text generation. Questions are categorized by topic and divided into **Basic**, **Intermediate**, and **Advanced** levels to support candidates preparing for roles requiring Transformers in generative AI workflows. 4 | 5 | ## Model Loading and Inference 6 | 7 | ### Basic 8 | 1. **What is the Hugging Face Transformers library, and why is it used in generative AI?** 9 | A library for state-of-the-art NLP and multimodal models. 10 | ```python 11 | from transformers import pipeline 12 | generator = pipeline("text-generation") 13 | ``` 14 | 15 | 2. **How do you load a pre-trained model in Transformers?** 16 | Uses `from_pretrained` for model access. 17 | ```python 18 | from transformers import AutoModel 19 | model = AutoModel.from_pretrained("bert-base-uncased") 20 | ``` 21 | 22 | 3. **How do you perform text generation with Transformers?** 23 | Generates text using a pipeline. 24 | ```python 25 | from transformers import pipeline 26 | generator = pipeline("text-generation", model="gpt2") 27 | output = generator("Hello, world!", max_length=50) 28 | ``` 29 | 30 | 4. **What is the role of `AutoTokenizer` in Transformers?** 31 | Loads tokenizers dynamically. 32 | ```python 33 | from transformers import AutoTokenizer 34 | tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") 35 | ``` 36 | 37 | 5. **How do you encode text for a Transformers model?** 38 | Converts text to token IDs. 39 | ```python 40 | text = "Hello, world!" 41 | inputs = tokenizer(text, return_tensors="pt") 42 | ``` 43 | 44 | 6. **How do you perform inference with a Transformers model?** 45 | Processes inputs through the model. 46 | ```python 47 | from transformers import AutoModelForCausalLM 48 | model = AutoModelForCausalLM.from_pretrained("gpt2") 49 | outputs = model(**inputs) 50 | ``` 51 | 52 | #### Intermediate 53 | 7. **Write a function to load a Transformers model and tokenizer.** 54 | Initializes model and tokenizer. 55 | ```python 56 | def load_model_and_tokenizer(model_name): 57 | tokenizer = AutoTokenizer.from_pretrained(model_name) 58 | model = AutoModel.from_pretrained(model_name) 59 | return model, tokenizer 60 | ``` 61 | 62 | 8. **How do you handle batch inference in Transformers?** 63 | Processes multiple inputs. 64 | ```python 65 | texts = ["Hello, world!", "Good morning!"] 66 | inputs = tokenizer(texts, return_tensors="pt", padding=True) 67 | outputs = model(**inputs) 68 | ``` 69 | 70 | 9. **Write a function to generate text with custom parameters.** 71 | Controls generation settings. 72 | ```python 73 | def generate_text(model, tokenizer, prompt, max_length=50, num_beams=5): 74 | inputs = tokenizer(prompt, return_tensors="pt") 75 | outputs = model.generate(**inputs, max_length=max_length, num_beams=num_beams) 76 | return tokenizer.decode(outputs[0], skip_special_tokens=True) 77 | ``` 78 | 79 | 10. 
**How do you use a pipeline for question answering in Transformers?** 80 | Extracts answers from context. 81 | ```python 82 | qa_pipeline = pipeline("question-answering") 83 | result = qa_pipeline({"question": "Who is the president?", "context": "Joe Biden is the president."}) 84 | ``` 85 | 86 | 11. **Write a function to visualize model outputs.** 87 | Plots token probabilities. 88 | ```python 89 | import matplotlib.pyplot as plt 90 | def plot_output_probs(logits): 91 | probs = logits.softmax(dim=-1).detach().numpy()[0] 92 | plt.bar(range(len(probs)), probs) 93 | plt.savefig("output_probs.png") 94 | ``` 95 | 96 | 12. **How do you handle multilingual models in Transformers?** 97 | Uses models like mBERT. 98 | ```python 99 | model = AutoModel.from_pretrained("bert-base-multilingual-cased") 100 | tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased") 101 | ``` 102 | 103 | #### Advanced 104 | 13. **Write a function to load a model with custom configurations.** 105 | Defines model settings. 106 | ```python 107 | from transformers import AutoConfig 108 | def load_custom_model(model_name, config_kwargs): 109 | config = AutoConfig.from_pretrained(model_name, **config_kwargs) 110 | model = AutoModel.from_pretrained(model_name, config=config) 111 | return model 112 | ``` 113 | 114 | 14. **How do you optimize model inference in Transformers?** 115 | Uses torch.compile or quantization. 116 | ```python 117 | import torch 118 | model = torch.compile(model) 119 | ``` 120 | 121 | 15. **Write a function to handle multimodal inference in Transformers.** 122 | Processes text and images. 123 | ```python 124 | from transformers import VisionEncoderDecoderModel, ViTFeatureExtractor 125 | def multimodal_inference(image, model_name="trop-vit"): 126 | model = VisionEncoderDecoderModel.from_pretrained(model_name) 127 | feature_extractor = ViTFeatureExtractor.from_pretrained(model_name) 128 | inputs = feature_extractor(images=image, return_tensors="pt") 129 | outputs = model.generate(**inputs) 130 | return outputs 131 | ``` 132 | 133 | 16. **How do you handle memory-efficient inference in Transformers?** 134 | Uses gradient checkpointing or mixed precision. 135 | ```python 136 | from transformers import AutoModelForCausalLM 137 | model = AutoModelForCausalLM.from_pretrained("gpt2", torch_dtype=torch.float16) 138 | ``` 139 | 140 | 17. **Write a function to perform zero-shot classification.** 141 | Classifies without training. 142 | ```python 143 | def zero_shot_classify(text, labels, model_name="facebook/bart-large-mnli"): 144 | classifier = pipeline("zero-shot-classification", model=model_name) 145 | return classifier(text, candidate_labels=labels) 146 | ``` 147 | 148 | 18. **How do you integrate Transformers with external APIs?** 149 | Calls Hugging Face Inference API. 150 | ```python 151 | from huggingface_hub import InferenceClient 152 | def api_inference(prompt): 153 | client = InferenceClient() 154 | return client.text_generation(prompt, model="gpt2") 155 | ``` 156 | 157 | ## Tokenization and Data Preprocessing 158 | 159 | ### Basic 160 | 19. **What is tokenization in the context of Transformers?** 161 | Splits text into tokens for model input. 162 | ```python 163 | tokens = tokenizer.tokenize("Hello, world!") 164 | ``` 165 | 166 | 20. **How do you convert tokens to IDs in Transformers?** 167 | Maps tokens to vocabulary indices. 168 | ```python 169 | token_ids = tokenizer.convert_tokens_to_ids(tokens) 170 | ``` 171 | 172 | 21. 
**How do you handle padding in Transformers?** 173 | Ensures uniform input lengths. 174 | ```python 175 | inputs = tokenizer("Hello, world!", padding=True, return_tensors="pt") 176 | ``` 177 | 178 | 22. **What is the role of attention masks in Transformers?** 179 | Indicates valid tokens. 180 | ```python 181 | inputs = tokenizer("Hello, world!", return_tensors="pt", return_attention_mask=True) 182 | ``` 183 | 184 | 23. **How do you decode model outputs in Transformers?** 185 | Converts token IDs to text. 186 | ```python 187 | text = tokenizer.decode(outputs[0], skip_special_tokens=True) 188 | ``` 189 | 190 | 24. **How do you visualize token embeddings?** 191 | Plots embeddings using Matplotlib. 192 | ```python 193 | import matplotlib.pyplot as plt 194 | def plot_embeddings(embeddings): 195 | plt.scatter(embeddings[:, 0], embeddings[:, 1]) 196 | plt.savefig("embeddings.png") 197 | ``` 198 | 199 | #### Intermediate 200 | 25. **Write a function to preprocess a dataset for Transformers.** 201 | Tokenizes and formats data. 202 | ```python 203 | def preprocess_dataset(dataset, tokenizer, max_length=128): 204 | return dataset.map(lambda x: tokenizer(x["text"], truncation=True, padding="max_length", max_length=max_length)) 205 | ``` 206 | 207 | 26. **How do you handle subword tokenization in Transformers?** 208 | Uses WordPiece or BPE. 209 | ```python 210 | tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") 211 | tokens = tokenizer.tokenize("unhappiness") 212 | ``` 213 | 214 | 27. **Write a function to create a custom tokenizer.** 215 | Trains a new tokenizer. 216 | ```python 217 | from transformers import Tokenizer 218 | def train_custom_tokenizer(texts, vocab_size=1000): 219 | tokenizer = Tokenizer.from_texts(texts) 220 | tokenizer.train(vocab_size=vocab_size) 221 | return tokenizer 222 | ``` 223 | 224 | 28. **How do you integrate Transformers with Hugging Face Datasets?** 225 | Loads and preprocesses datasets. 226 | ```python 227 | from datasets import load_dataset 228 | dataset = load_dataset("imdb") 229 | tokenized = preprocess_dataset(dataset, tokenizer) 230 | ``` 231 | 232 | 29. **Write a function to visualize attention masks.** 233 | Displays mask patterns. 234 | ```python 235 | import matplotlib.pyplot as plt 236 | def plot_attention_mask(mask): 237 | plt.imshow(mask.numpy(), cmap="binary") 238 | plt.savefig("attention_mask.png") 239 | ``` 240 | 241 | 30. **How do you handle long sequences in Transformers?** 242 | Uses truncation or sliding windows. 243 | ```python 244 | inputs = tokenizer("Long text...", truncation=True, max_length=512, return_tensors="pt") 245 | ``` 246 | 247 | #### Advanced 248 | 31. **Write a function to implement dynamic padding in Transformers.** 249 | Pads to longest in batch. 250 | ```python 251 | from transformers import DataCollatorWithPadding 252 | def dynamic_padding(tokenizer, dataset): 253 | data_collator = DataCollatorWithPadding(tokenizer) 254 | return data_collator(dataset) 255 | ``` 256 | 257 | 32. **How do you optimize tokenization for large datasets?** 258 | Uses fast tokenizers or batch processing. 259 | ```python 260 | tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True) 261 | ``` 262 | 263 | 33. **Write a function to handle multilingual tokenization.** 264 | Supports multiple languages. 
265 | ```python 266 | def multilingual_tokenize(texts, model_name="xlm-roberta-base"): 267 | tokenizer = AutoTokenizer.from_pretrained(model_name) 268 | return tokenizer(texts, padding=True, truncation=True, return_tensors="pt") 269 | ``` 270 | 271 | 34. **How do you implement custom preprocessing for multimodal data?** 272 | Processes text and images. 273 | ```python 274 | from transformers import ViTFeatureExtractor 275 | def preprocess_multimodal(texts, images, tokenizer, feature_extractor): 276 | text_inputs = tokenizer(texts, return_tensors="pt") 277 | image_inputs = feature_extractor(images=images, return_tensors="pt") 278 | return {"text": text_inputs, "image": image_inputs} 279 | ``` 280 | 281 | 35. **Write a function to visualize tokenization statistics.** 282 | Plots token length distribution. 283 | ```python 284 | import matplotlib.pyplot as plt 285 | def plot_token_lengths(dataset, tokenizer): 286 | lengths = [len(tokenizer.tokenize(x["text"])) for x in dataset] 287 | plt.hist(lengths, bins=20) 288 | plt.savefig("token_lengths.png") 289 | ``` 290 | 291 | 36. **How do you handle domain-specific tokenization in Transformers?** 292 | Fine-tunes tokenizer on custom corpus. 293 | ```python 294 | from transformers import AutoTokenizer 295 | def domain_specific_tokenizer(corpus, model_name="bert-base-uncased"): 296 | tokenizer = AutoTokenizer.from_pretrained(model_name) 297 | tokenizer.train_new_from_iterator(corpus, vocab_size=32000) 298 | return tokenizer 299 | ``` 300 | 301 | ## Fine-Tuning and Training 302 | 303 | ### Basic 304 | 37. **What is fine-tuning in the context of Transformers?** 305 | Adapts pre-trained models to specific tasks. 306 | ```python 307 | from transformers import Trainer 308 | trainer = Trainer(model=model, train_dataset=dataset) 309 | ``` 310 | 311 | 38. **How do you set up a Trainer in Transformers?** 312 | Configures training settings. 313 | ```python 314 | from transformers import TrainingArguments 315 | args = TrainingArguments(output_dir="output", num_train_epochs=3) 316 | trainer = Trainer(model=model, args=args, train_dataset=dataset) 317 | ``` 318 | 319 | 39. **How do you define a loss function for fine-tuning?** 320 | Uses model’s default loss. 321 | ```python 322 | outputs = model(**inputs, labels=labels) 323 | loss = outputs.loss 324 | ``` 325 | 326 | 40. **How do you perform a training step in Transformers?** 327 | Executes forward and backward passes. 328 | ```python 329 | model.train() 330 | outputs = model(**inputs) 331 | loss = outputs.loss 332 | loss.backward() 333 | ``` 334 | 335 | 41. **How do you save a fine-tuned model in Transformers?** 336 | Persists model weights. 337 | ```python 338 | model.save_pretrained("fine_tuned_model") 339 | tokenizer.save_pretrained("fine_tuned_model") 340 | ``` 341 | 342 | 42. **How do you visualize training metrics in Transformers?** 343 | Plots loss curves. 344 | ```python 345 | import matplotlib.pyplot as plt 346 | def plot_training_metrics(trainer): 347 | losses = trainer.state.log_history["loss"] 348 | plt.plot(losses) 349 | plt.savefig("training_loss.png") 350 | ``` 351 | 352 | #### Intermediate 353 | 43. **Write a function to fine-tune a Transformers model.** 354 | Trains on custom dataset. 355 | ```python 356 | def fine_tune_model(model, tokenizer, dataset, output_dir="output"): 357 | args = TrainingArguments(output_dir=output_dir, num_train_epochs=3) 358 | trainer = Trainer(model=model, args=args, train_dataset=dataset) 359 | trainer.train() 360 | return trainer 361 | ``` 362 | 363 | 44. 
**How do you implement learning rate scheduling in Transformers?** 364 | Adjusts learning rate dynamically. 365 | ```python 366 | args = TrainingArguments(output_dir="output", learning_rate=5e-5, lr_scheduler_type="cosine") 367 | ``` 368 | 369 | 45. **Write a function to evaluate a fine-tuned model.** 370 | Computes validation metrics. 371 | ```python 372 | def evaluate_model(trainer, eval_dataset): 373 | metrics = trainer.evaluate(eval_dataset) 374 | return metrics 375 | ``` 376 | 377 | 46. **How do you implement early stopping in Transformers?** 378 | Halts training on stagnation. 379 | ```python 380 | args = TrainingArguments(output_dir="output", evaluation_strategy="epoch", early_stopping_patience=5) 381 | trainer = Trainer(model=model, args=args, train_dataset=train_dataset, eval_dataset=eval_dataset) 382 | ``` 383 | 384 | 47. **Write a function to handle data collation for training.** 385 | Formats batches dynamically. 386 | ```python 387 | from transformers import DataCollatorForLanguageModeling 388 | def create_data_collator(tokenizer): 389 | return DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False) 390 | ``` 391 | 392 | 48. **How do you implement mixed precision training in Transformers?** 393 | Reduces memory usage. 394 | ```python 395 | args = TrainingArguments(output_dir="output", fp16=True) 396 | trainer = Trainer(model=model, args=args, train_dataset=dataset) 397 | ``` 398 | 399 | #### Advanced 400 | 49. **Write a function to implement gradient clipping in Transformers.** 401 | Stabilizes training. 402 | ```python 403 | args = TrainingArguments(output_dir="output", max_grad_norm=1.0) 404 | trainer = Trainer(model=model, args=args, train_dataset=dataset) 405 | ``` 406 | 407 | 50. **How do you optimize training for large models in Transformers?** 408 | Uses distributed training or DeepSpeed. 409 | ```python 410 | args = TrainingArguments(output_dir="output", deepspeed="ds_config.json") 411 | trainer = Trainer(model=model, args=args, train_dataset=dataset) 412 | ``` 413 | 414 | 51. **Write a function to implement custom loss functions in Transformers.** 415 | Defines specialized losses. 416 | ```python 417 | def custom_loss(model, inputs, labels): 418 | outputs = model(**inputs) 419 | return torch.nn.functional.cross_entropy(outputs.logits, labels) 420 | ``` 421 | 422 | 52. **How do you implement adversarial training in Transformers?** 423 | Enhances model robustness. 424 | ```python 425 | def adversarial_step(model, inputs, epsilon=0.1): 426 | inputs["input_ids"].requires_grad = True 427 | outputs = model(**inputs) 428 | loss = outputs.loss 429 | loss.backward() 430 | adv_inputs = inputs["input_ids"] + epsilon * inputs["input_ids"].grad.sign() 431 | return model(adv_inputs) 432 | ``` 433 | 434 | 53. **Write a function to implement curriculum learning in Transformers.** 435 | Adjusts training difficulty. 436 | ```python 437 | def curriculum_train(trainer, datasets, difficulty_levels): 438 | for dataset, level in zip(datasets, difficulty_levels): 439 | trainer.train_dataset = dataset 440 | trainer.train() 441 | ``` 442 | 443 | 54. **How do you implement distributed training in Transformers?** 444 | Scales across GPUs. 445 | ```python 446 | args = TrainingArguments(output_dir="output", distributed_training=True) 447 | trainer = Trainer(model=model, args=args, train_dataset=dataset) 448 | ``` 449 | 450 | ## Text Generation and Evaluation 451 | 452 | ### Basic 453 | 55. **How do you generate text with a Transformers model?** 454 | Uses `generate` method. 
455 | ```python 456 | outputs = model.generate(**inputs, max_length=50) 457 | ``` 458 | 459 | 56. **What is beam search in Transformers?** 460 | Improves generation quality. 461 | ```python 462 | outputs = model.generate(**inputs, num_beams=5) 463 | ``` 464 | 465 | 57. **How do you evaluate generated text in Transformers?** 466 | Uses metrics like BLEU. 467 | ```python 468 | from datasets import load_metric 469 | bleu = load_metric("bleu") 470 | score = bleu.compute(predictions=["Hello"], references=[["Hello, world!"]]) 471 | ``` 472 | 473 | 58. **How do you control generation temperature in Transformers?** 474 | Adjusts output randomness. 475 | ```python 476 | outputs = model.generate(**inputs, temperature=0.7) 477 | ``` 478 | 479 | 59. **How do you visualize generated text quality?** 480 | Plots metric scores. 481 | ```python 482 | import matplotlib.pyplot as plt 483 | def plot_bleu_scores(scores): 484 | plt.plot(scores) 485 | plt.savefig("bleu_scores.png") 486 | ``` 487 | 488 | 60. **How do you handle repetitive text in generation?** 489 | Uses no_repeat_ngram_size. 490 | ```python 491 | outputs = model.generate(**inputs, no_repeat_ngram_size=2) 492 | ``` 493 | 494 | #### Intermediate 495 | 61. **Write a function to generate multiple text sequences.** 496 | Produces diverse outputs. 497 | ```python 498 | def generate_multiple(model, tokenizer, prompt, num_return_sequences=3): 499 | inputs = tokenizer(prompt, return_tensors="pt") 500 | outputs = model.generate(**inputs, num_return_sequences=num_return_sequences) 501 | return [tokenizer.decode(out, skip_special_tokens=True) for out in outputs] 502 | ``` 503 | 504 | 62. **How do you implement top-k sampling in Transformers?** 505 | Samples from top-k tokens. 506 | ```python 507 | outputs = model.generate(**inputs, top_k=50) 508 | ``` 509 | 510 | 63. **Write a function to evaluate generation with ROUGE.** 511 | Computes ROUGE scores. 512 | ```python 513 | from datasets import load_metric 514 | def compute_rouge(predictions, references): 515 | rouge = load_metric("rouge") 516 | return rouge.compute(predictions=predictions, references=references) 517 | ``` 518 | 519 | 64. **How do you implement nucleus sampling in Transformers?** 520 | Samples from top-p probability mass. 521 | ```python 522 | outputs = model.generate(**inputs, top_p=0.9) 523 | ``` 524 | 525 | 65. **Write a function to visualize generation diversity.** 526 | Plots unique token counts. 527 | ```python 528 | import matplotlib.pyplot as plt 529 | def plot_diversity(texts): 530 | unique_tokens = [len(set(text.split())) for text in texts] 531 | plt.hist(unique_tokens, bins=20) 532 | plt.savefig("diversity.png") 533 | ``` 534 | 535 | 66. **How do you handle long-form text generation?** 536 | Uses sliding windows or chunking. 537 | ```python 538 | def long_form_generate(model, tokenizer, prompt, chunk_size=512): 539 | inputs = tokenizer(prompt, return_tensors="pt") 540 | outputs = [] 541 | for i in range(0, len(inputs["input_ids"][0]), chunk_size): 542 | chunk = inputs["input_ids"][:, i:i+chunk_size] 543 | outputs.append(model.generate(input_ids=chunk)) 544 | return tokenizer.decode(torch.cat(outputs), skip_special_tokens=True) 545 | ``` 546 | 547 | #### Advanced 548 | 67. **Write a function to implement constrained generation.** 549 | Enforces specific outputs. 
550 | ```python 551 | def constrained_generate(model, tokenizer, prompt, constraints): 552 | inputs = tokenizer(prompt, return_tensors="pt") 553 | outputs = model.generate(**inputs, prefix_allowed_tokens_fn=lambda x, y: constraints) 554 | return tokenizer.decode(outputs[0], skip_special_tokens=True) 555 | ``` 556 | 557 | 68. **How do you optimize text generation for latency?** 558 | Uses caching or smaller models. 559 | ```python 560 | model = AutoModelForCausalLM.from_pretrained("distilgpt2") 561 | ``` 562 | 563 | 69. **Write a function to evaluate generation with human-in-the-loop.** 564 | Collects feedback. 565 | ```python 566 | def human_eval_generate(model, tokenizer, prompt): 567 | generated = generate_text(model, tokenizer, prompt) 568 | feedback = input(f"Rate this output (1-5): {generated}\n") 569 | return {"text": generated, "score": int(feedback)} 570 | ``` 571 | 572 | 70. **How do you implement iterative refinement in generation?** 573 | Refines outputs iteratively. 574 | ```python 575 | def iterative_generate(model, tokenizer, prompt, iterations=3): 576 | text = prompt 577 | for _ in range(iterations): 578 | inputs = tokenizer(text, return_tensors="pt") 579 | text = tokenizer.decode(model.generate(**inputs)[0], skip_special_tokens=True) 580 | return text 581 | ``` 582 | 583 | 71. **Write a function to visualize attention weights in generation.** 584 | Plots attention matrices. 585 | ```python 586 | import matplotlib.pyplot as plt 587 | def plot_attention_weights(attention): 588 | plt.imshow(attention[0][0].detach().numpy(), cmap="hot") 589 | plt.savefig("attention_weights.png") 590 | ``` 591 | 592 | 72. **How do you implement controllable generation in Transformers?** 593 | Uses control codes or prompts. 594 | ```python 595 | def control_generate(model, tokenizer, prompt, control_code): 596 | inputs = tokenizer(f"{control_code} {prompt}", return_tensors="pt") 597 | outputs = model.generate(**inputs) 598 | return tokenizer.decode(outputs[0], skip_special_tokens=True) 599 | ``` 600 | 601 | ## Deployment and Scalability 602 | 603 | ### Basic 604 | 73. **How do you deploy a Transformers model for inference?** 605 | Serves model via API. 606 | ```python 607 | from transformers import pipeline 608 | model = pipeline("text-generation", model="gpt2") 609 | ``` 610 | 611 | 74. **How do you save a Transformers model for deployment?** 612 | Exports model and tokenizer. 613 | ```python 614 | model.save_pretrained("deployed_model") 615 | tokenizer.save_pretrained("deployed_model") 616 | ``` 617 | 618 | 75. **How do you load a deployed Transformers model?** 619 | Restores model state. 620 | ```python 621 | model = AutoModel.from_pretrained("deployed_model") 622 | tokenizer = AutoTokenizer.from_pretrained("deployed_model") 623 | ``` 624 | 625 | 76. **What is model quantization in Transformers?** 626 | Reduces model size for deployment. 627 | ```python 628 | from transformers import AutoModelForCausalLM 629 | model = AutoModelForCausalLM.from_pretrained("gpt2", torch_dtype="int8") 630 | ``` 631 | 632 | 77. **How do you optimize a model for mobile deployment?** 633 | Uses distilled models. 634 | ```python 635 | model = AutoModel.from_pretrained("distilbert-base-uncased") 636 | ``` 637 | 638 | 78. **How do you visualize inference latency?** 639 | Plots latency metrics. 640 | ```python 641 | import matplotlib.pyplot as plt 642 | def plot_latency(times): 643 | plt.plot(times) 644 | plt.savefig("inference_latency.png") 645 | ``` 646 | 647 | #### Intermediate 648 | 79. 
**Write a function to deploy a Transformers model with FastAPI.** 649 | Exposes model via API. 650 | ```python 651 | from fastapi import FastAPI 652 | app = FastAPI() 653 | model, tokenizer = load_model_and_tokenizer("gpt2") 654 | @app.post("/generate") 655 | async def generate(prompt: str): 656 | return {"text": generate_text(model, tokenizer, prompt)} 657 | ``` 658 | 659 | 80. **How do you deploy Transformers models with Hugging Face Inference Endpoints?** 660 | Uses cloud infrastructure. 661 | ```python 662 | from huggingface_hub import InferenceClient 663 | client = InferenceClient(model="gpt2") 664 | output = client.text_generation("Hello") 665 | ``` 666 | 667 | 81. **Write a function to perform batch inference for deployment.** 668 | Processes multiple inputs. 669 | ```python 670 | def batch_inference(model, tokenizer, texts): 671 | inputs = tokenizer(texts, return_tensors="pt", padding=True) 672 | outputs = model.generate(**inputs) 673 | return [tokenizer.decode(out, skip_special_tokens=True) for out in outputs] 674 | ``` 675 | 676 | 82. **How do you optimize inference for edge devices?** 677 | Uses ONNX or TensorFlow Lite. 678 | ```python 679 | from transformers import AutoModelForCausalLM 680 | model = AutoModelForCausalLM.from_pretrained("distilgpt2") 681 | model.to_onnx("model.onnx") 682 | ``` 683 | 684 | 83. **Write a function to monitor deployed model performance.** 685 | Tracks latency and errors. 686 | ```python 687 | import time 688 | def monitor_inference(model, tokenizer, prompt): 689 | start = time.time() 690 | output = generate_text(model, tokenizer, prompt) 691 | return {"latency": time.time() - start, "output": output} 692 | ``` 693 | 694 | 84. **How do you handle model versioning in Transformers?** 695 | Tracks model iterations. 696 | ```python 697 | def save_versioned_model(model, tokenizer, version): 698 | model.save_pretrained(f"model_v{version}") 699 | tokenizer.save_pretrained(f"model_v{version}") 700 | ``` 701 | 702 | #### Advanced 703 | 85. **Write a function to implement model pruning in Transformers.** 704 | Removes unnecessary weights. 705 | ```python 706 | from transformers import prune_low_magnitude 707 | def prune_model(model, amount=0.5): 708 | return prune_low_magnitude(model, amount=amount) 709 | ``` 710 | 711 | 86. **How do you deploy Transformers models in a serverless environment?** 712 | Uses cloud functions. 713 | ```python 714 | from huggingface_hub import InferenceClient 715 | def serverless_inference(prompt): 716 | client = InferenceClient(model="gpt2") 717 | return client.text_generation(prompt) 718 | ``` 719 | 720 | 87. **Write a function to scale inference with distributed systems.** 721 | Uses model parallelism. 722 | ```python 723 | from transformers import AutoModelForCausalLM 724 | def distributed_inference(model_name, inputs): 725 | model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto") 726 | return model.generate(**inputs) 727 | ``` 728 | 729 | 88. **How do you implement A/B testing for deployed Transformers models?** 730 | Compares model performance. 731 | ```python 732 | def ab_test(model_a, model_b, tokenizer, texts): 733 | outputs_a = batch_inference(model_a, tokenizer, texts) 734 | outputs_b = batch_inference(model_b, tokenizer, texts) 735 | return {"model_a": outputs_a, "model_b": outputs_b} 736 | ``` 737 | 738 | 89. **Write a function to handle real-time inference in Transformers.** 739 | Processes streaming data. 
740 | ```python 741 | def real_time_inference(model, tokenizer, stream): 742 | for prompt in stream: 743 | yield generate_text(model, tokenizer, prompt) 744 | ``` 745 | 746 | 90. **How do you implement model monitoring with Transformers?** 747 | Tracks performance metrics. 748 | ```python 749 | import logging 750 | def monitor_model(model, tokenizer, prompt): 751 | logging.basicConfig(filename="model.log", level=logging.INFO) 752 | start = time.time() 753 | output = generate_text(model, tokenizer, prompt) 754 | logging.info(f"Latency: {time.time() - start}, Output: {output}") 755 | return output 756 | ``` 757 | 758 | ## Debugging and Error Handling 759 | 760 | ### Basic 761 | 91. **How do you debug tokenization issues in Transformers?** 762 | Logs token outputs. 763 | ```python 764 | def debug_tokenize(text, tokenizer): 765 | tokens = tokenizer.tokenize(text) 766 | print(f"Tokens: {tokens}") 767 | return tokens 768 | ``` 769 | 770 | 92. **What is a try-except block in Transformers applications?** 771 | Handles runtime errors. 772 | ```python 773 | try: 774 | outputs = model(**inputs) 775 | except Exception as e: 776 | print(f"Error: {e}") 777 | ``` 778 | 779 | 93. **How do you validate model inputs in Transformers?** 780 | Ensures correct formats. 781 | ```python 782 | def validate_inputs(inputs, expected_keys): 783 | if not all(key in inputs for key in expected_keys): 784 | raise ValueError(f"Missing keys: {set(expected_keys) - set(inputs)}") 785 | return inputs 786 | ``` 787 | 788 | 94. **How do you handle out-of-memory errors in Transformers?** 789 | Reduces batch size or uses smaller models. 790 | ```python 791 | args = TrainingArguments(output_dir="output", per_device_train_batch_size=4) 792 | ``` 793 | 794 | 95. **What is the role of logging in Transformers debugging?** 795 | Tracks errors and operations. 796 | ```python 797 | import logging 798 | logging.basicConfig(filename="transformers.log", level=logging.INFO) 799 | logging.info("Starting Transformers operation") 800 | ``` 801 | 802 | 96. **How do you handle NaN values in Transformers training?** 803 | Detects and mitigates NaNs. 804 | ```python 805 | def check_nan(outputs): 806 | if torch.isnan(outputs.loss): 807 | raise ValueError("NaN detected in loss") 808 | return outputs 809 | ``` 810 | 811 | #### Intermediate 812 | 97. **Write a function to retry Transformers operations on failure.** 813 | Handles transient errors. 814 | ```python 815 | def retry_operation(func, *args, max_attempts=3): 816 | for attempt in range(max_attempts): 817 | try: 818 | return func(*args) 819 | except Exception as e: 820 | if attempt == max_attempts - 1: 821 | raise 822 | print(f"Attempt {attempt+1} failed: {e}") 823 | ``` 824 | 825 | 98. **How do you debug model outputs in Transformers?** 826 | Inspects logits or embeddings. 827 | ```python 828 | def debug_outputs(outputs): 829 | print(f"Logits shape: {outputs.logits.shape}, Sample: {outputs.logits[0, :5]}") 830 | return outputs 831 | ``` 832 | 833 | 99. **Write a function to validate model parameters.** 834 | Ensures weights are valid. 835 | ```python 836 | def validate_params(model): 837 | for name, param in model.named_parameters(): 838 | if torch.isnan(param).any(): 839 | raise ValueError(f"NaN in {name}") 840 | return model 841 | ``` 842 | 843 | 100. **How do you profile Transformers model performance?** 844 | Measures execution time. 
845 | ```python 846 | import time 847 | def profile_inference(model, inputs): 848 | start = time.time() 849 | outputs = model(**inputs) 850 | print(f"Inference took {time.time() - start}s") 851 | return outputs 852 | ``` 853 | 854 | 101. **Write a function to handle numerical instability.** 855 | Stabilizes computations. 856 | ```python 857 | def safe_computation(outputs, epsilon=1e-8): 858 | return torch.clamp(outputs, min=epsilon, max=1/epsilon) 859 | ``` 860 | 861 | 102. **How do you debug Transformers training loops?** 862 | Logs epoch metrics. 863 | ```python 864 | def debug_training(trainer): 865 | trainer.add_callback(lambda trainer: print(f"Epoch {trainer.state.epoch}, Loss: {trainer.state.log_history[-1]['loss']}")) 866 | return trainer.train() 867 | ``` 868 | 869 | #### Advanced 870 | 103. **Write a function to implement a custom error handler.** 871 | Logs specific errors. 872 | ```python 873 | import logging 874 | def custom_error_handler(operation, *args): 875 | logging.basicConfig(filename="transformers.log", level=logging.ERROR) 876 | try: 877 | return operation(*args) 878 | except Exception as e: 879 | logging.error(f"Operation error: {e}") 880 | raise 881 | ``` 882 | 883 | 104. **How do you implement circuit breakers in Transformers applications?** 884 | Prevents cascading failures. 885 | ```python 886 | from pybreaker import CircuitBreaker 887 | breaker = CircuitBreaker(fail_max=3, reset_timeout=60) 888 | @breaker 889 | def safe_inference(model, inputs): 890 | return model(**inputs) 891 | ``` 892 | 893 | 105. **Write a function to detect gradient explosions.** 894 | Checks gradient norms. 895 | ```python 896 | def detect_explosion(model, inputs, labels): 897 | outputs = model(**inputs, labels=labels) 898 | loss = outputs.loss 899 | loss.backward() 900 | grad_norm = sum(p.grad.norm() for p in model.parameters()) 901 | if grad_norm > 10: 902 | print("Warning: Gradient explosion detected") 903 | ``` 904 | 905 | 106. **How do you implement logging for distributed Transformers training?** 906 | Centralizes logs. 907 | ```python 908 | import logging.handlers 909 | def setup_distributed_logging(): 910 | handler = logging.handlers.SocketHandler("log-server", 9090) 911 | logging.getLogger().addHandler(handler) 912 | logging.info("Transformers training started") 913 | ``` 914 | 915 | 107. **Write a function to handle version compatibility in Transformers.** 916 | Checks library versions. 917 | ```python 918 | from transformers import __version__ 919 | def check_transformers_version(): 920 | if __version__ < "4.0": 921 | raise ValueError("Unsupported Transformers version") 922 | ``` 923 | 924 | 108. **How do you debug Transformers performance bottlenecks?** 925 | Profiles training stages. 926 | ```python 927 | from torch.profiler import profile 928 | def debug_bottlenecks(model, inputs): 929 | with profile() as prof: 930 | outputs = model(**inputs) 931 | print(prof.key_averages()) 932 | return outputs 933 | ``` 934 | 935 | ## Visualization and Interpretation 936 | 937 | ### Basic 938 | 109. **How do you visualize attention weights in Transformers?** 939 | Plots attention matrices. 940 | ```python 941 | import matplotlib.pyplot as plt 942 | def plot_attention(attention): 943 | plt.imshow(attention[0][0].detach().numpy(), cmap="hot") 944 | plt.savefig("attention.png") 945 | ``` 946 | 947 | 110. **How do you create a word cloud for generated text?** 948 | Visualizes word frequencies. 
949 | ```python 950 | from wordcloud import WordCloud 951 | import matplotlib.pyplot as plt 952 | def plot_word_cloud(text): 953 | wc = WordCloud().generate(text) 954 | plt.imshow(wc, interpolation="bilinear") 955 | plt.savefig("word_cloud.png") 956 | ``` 957 | 958 | 111. **How do you visualize training metrics in Transformers?** 959 | Plots loss or accuracy curves. 960 | ```python 961 | import matplotlib.pyplot as plt 962 | def plot_metrics(history): 963 | plt.plot(history["loss"]) 964 | plt.savefig("metrics.png") 965 | ``` 966 | 967 | 112. **How do you visualize token embeddings in Transformers?** 968 | Projects embeddings to 2D. 969 | ```python 970 | from sklearn.manifold import TSNE 971 | import matplotlib.pyplot as plt 972 | def plot_token_embeddings(embeddings): 973 | tsne = TSNE(n_components=2) 974 | reduced = tsne.fit_transform(embeddings.detach().numpy()) 975 | plt.scatter(reduced[:, 0], reduced[:, 1]) 976 | plt.savefig("token_embeddings.png") 977 | ``` 978 | 979 | 113. **How do you create a confusion matrix for classification?** 980 | Evaluates model performance. 981 | ```python 982 | from sklearn.metrics import confusion_matrix 983 | import seaborn as sns 984 | import matplotlib.pyplot as plt 985 | def plot_confusion_matrix(preds, labels): 986 | cm = confusion_matrix(labels, preds) 987 | sns.heatmap(cm, annot=True) 988 | plt.savefig("confusion_matrix.png") 989 | ``` 990 | 991 | 114. **How do you visualize model uncertainty in Transformers?** 992 | Plots confidence intervals. 993 | ```python 994 | import matplotlib.pyplot as plt 995 | def plot_uncertainty(probs, std): 996 | mean = probs.mean(dim=0).detach().numpy() 997 | std = std.detach().numpy() 998 | plt.plot(mean) 999 | plt.fill_between(range(len(mean)), mean - std, mean + std, alpha=0.2) 1000 | plt.savefig("uncertainty.png") 1001 | ``` 1002 | 1003 | #### Intermediate 1004 | 115. **Write a function to visualize generated text length distribution.** 1005 | Plots text lengths. 1006 | ```python 1007 | import matplotlib.pyplot as plt 1008 | def plot_text_lengths(texts): 1009 | lengths = [len(text.split()) for text in texts] 1010 | plt.hist(lengths, bins=20) 1011 | plt.savefig("text_lengths.png") 1012 | ``` 1013 | 1014 | 116. **How do you visualize model performance across epochs?** 1015 | Plots training curves. 1016 | ```python 1017 | import matplotlib.pyplot as plt 1018 | def plot_epoch_performance(history): 1019 | plt.plot(history["eval_accuracy"]) 1020 | plt.savefig("epoch_performance.png") 1021 | ``` 1022 | 1023 | 117. **Write a function to visualize attention heads.** 1024 | Plots multiple attention matrices. 1025 | ```python 1026 | import matplotlib.pyplot as plt 1027 | def plot_attention_heads(attention, num_heads=4): 1028 | fig, axes = plt.subplots(1, num_heads, figsize=(15, 3)) 1029 | for i in range(num_heads): 1030 | axes[i].imshow(attention[0][i].detach().numpy(), cmap="hot") 1031 | plt.savefig("attention_heads.png") 1032 | ``` 1033 | 1034 | 118. **How do you visualize model robustness in Transformers?** 1035 | Plots performance under noise. 1036 | ```python 1037 | import matplotlib.pyplot as plt 1038 | def plot_robustness(metrics, noise_levels): 1039 | plt.plot(noise_levels, metrics) 1040 | plt.savefig("robustness.png") 1041 | ``` 1042 | 1043 | 119. **Write a function to visualize dataset statistics.** 1044 | Plots feature distributions. 
1045 | ```python 1046 | import matplotlib.pyplot as plt 1047 | def plot_dataset_stats(dataset, key): 1048 | values = [x[key] for x in dataset] 1049 | plt.hist(values, bins=20) 1050 | plt.savefig("dataset_stats.png") 1051 | ``` 1052 | 1053 | 120. **How do you visualize model fairness in Transformers?** 1054 | Plots group-wise metrics. 1055 | ```python 1056 | import matplotlib.pyplot as plt 1057 | def plot_fairness(metrics, groups): 1058 | plt.bar(groups, metrics) 1059 | plt.savefig("fairness.png") 1060 | ``` 1061 | 1062 | #### Advanced 1063 | 121. **Write a function to visualize model interpretability with SHAP.** 1064 | Explains predictions. 1065 | ```python 1066 | import shap 1067 | import matplotlib.pyplot as plt 1068 | def plot_shap_values(model, inputs): 1069 | explainer = shap.DeepExplainer(model, inputs) 1070 | shap_values = explainer.shap_values(inputs) 1071 | shap.summary_plot(shap_values, inputs, show=False) 1072 | plt.savefig("shap_values.png") 1073 | ``` 1074 | 1075 | 122. **How do you implement a dashboard for Transformers metrics?** 1076 | Displays real-time stats. 1077 | ```python 1078 | from fastapi import FastAPI 1079 | app = FastAPI() 1080 | metrics = [] 1081 | @app.get("/metrics") 1082 | async def get_metrics(): 1083 | return {"metrics": metrics} 1084 | ``` 1085 | 1086 | 123. **Write a function to visualize data drift in Transformers.** 1087 | Tracks dataset changes. 1088 | ```python 1089 | import matplotlib.pyplot as plt 1090 | def plot_data_drift(old_data, new_data): 1091 | plt.hist(old_data, alpha=0.5, label="Old") 1092 | plt.hist(new_data, alpha=0.5, label="New") 1093 | plt.legend() 1094 | plt.savefig("data_drift.png") 1095 | ``` 1096 | 1097 | 124. **How do you visualize attention flow in Transformers?** 1098 | Plots attention across layers. 1099 | ```python 1100 | import matplotlib.pyplot as plt 1101 | def plot_attention_flow(attention, layer_idx): 1102 | plt.imshow(attention[layer_idx][0].detach().numpy(), cmap="hot") 1103 | plt.savefig(f"attention_flow_layer_{layer_idx}.png") 1104 | ``` 1105 | 1106 | 125. **Write a function to visualize multimodal outputs.** 1107 | Plots text and image predictions. 1108 | ```python 1109 | import matplotlib.pyplot as plt 1110 | def plot_multimodal(text, image): 1111 | plt.subplot(1, 2, 1) 1112 | plt.imshow(image) 1113 | plt.subplot(1, 2, 2) 1114 | plt.text(0.5, 0.5, text, wrap=True) 1115 | plt.savefig("multimodal_output.png") 1116 | ``` 1117 | 1118 | 126. **How do you visualize model bias in Transformers?** 1119 | Plots group-wise predictions. 1120 | ```python 1121 | import matplotlib.pyplot as plt 1122 | def plot_bias(outputs, groups): 1123 | group_means = [outputs[groups == g].mean().item() for g in set(groups)] 1124 | plt.bar(set(groups), group_means) 1125 | plt.savefig("bias.png") 1126 | ``` 1127 | 1128 | ## Best Practices and Optimization 1129 | 1130 | ### Basic 1131 | 127. **What are best practices for Transformers code organization?** 1132 | Modularizes model and training code. 1133 | ```python 1134 | def build_model(model_name): 1135 | return AutoModel.from_pretrained(model_name) 1136 | def train(model, dataset): 1137 | trainer = Trainer(model=model, train_dataset=dataset) 1138 | trainer.train() 1139 | ``` 1140 | 1141 | 128. **How do you ensure reproducibility in Transformers?** 1142 | Sets random seeds. 1143 | ```python 1144 | import torch 1145 | torch.manual_seed(42) 1146 | ``` 1147 | 1148 | 129. **What is model caching in Transformers?** 1149 | Stores pre-trained models locally. 
1150 | ```python 1151 | model = AutoModel.from_pretrained("gpt2", cache_dir="cache") 1152 | ``` 1153 | 1154 | 130. **How do you handle large-scale Transformers models?** 1155 | Uses model parallelism or smaller models. 1156 | ```python 1157 | model = AutoModel.from_pretrained("distilgpt2") 1158 | ``` 1159 | 1160 | 131. **What is the role of environment configuration in Transformers?** 1161 | Manages settings securely. 1162 | ```python 1163 | import os 1164 | os.environ["HF_TOKEN"] = "your_token" 1165 | ``` 1166 | 1167 | 132. **How do you document Transformers code?** 1168 | Uses docstrings for clarity. 1169 | ```python 1170 | def train_model(model, dataset): 1171 | """Trains a Transformers model on a dataset.""" 1172 | trainer = Trainer(model=model, train_dataset=dataset) 1173 | trainer.train() 1174 | ``` 1175 | 1176 | #### Intermediate 1177 | 133. **Write a function to optimize Transformers memory usage.** 1178 | Uses mixed precision or gradient accumulation. 1179 | ```python 1180 | def optimize_memory(args): 1181 | args.fp16 = True 1182 | args.gradient_accumulation_steps = 4 1183 | return args 1184 | ``` 1185 | 1186 | 134. **How do you implement unit tests for Transformers code?** 1187 | Validates model behavior. 1188 | ```python 1189 | import unittest 1190 | class TestTransformers(unittest.TestCase): 1191 | def test_model_output(self): 1192 | model = AutoModel.from_pretrained("distilbert-base-uncased") 1193 | inputs = tokenizer("test", return_tensors="pt") 1194 | outputs = model(**inputs) 1195 | self.assertEqual(outputs.logits.shape[0], 1) 1196 | ``` 1197 | 1198 | 135. **Write a function to create reusable Transformers templates.** 1199 | Standardizes model building. 1200 | ```python 1201 | def model_template(model_name, task="text-generation"): 1202 | return pipeline(task, model=model_name) 1203 | ``` 1204 | 1205 | 136. **How do you optimize Transformers for batch processing?** 1206 | Processes data in chunks. 1207 | ```python 1208 | def batch_process(model, tokenizer, texts, batch_size=32): 1209 | results = [] 1210 | for i in range(0, len(texts), batch_size): 1211 | batch = texts[i:i+batch_size] 1212 | results.extend(batch_inference(model, tokenizer, batch)) 1213 | return results 1214 | ``` 1215 | 1216 | 137. **Write a function to handle Transformers configuration.** 1217 | Centralizes settings. 1218 | ```python 1219 | def configure_transformers(): 1220 | return {"model_name": "gpt2", "batch_size": 16, "max_length": 512} 1221 | ``` 1222 | 1223 | 138. **How do you ensure Transformers pipeline consistency?** 1224 | Standardizes versions and settings. 1225 | ```python 1226 | from transformers import __version__ 1227 | def check_transformers_env(): 1228 | print(f"Transformers version: {__version__}") 1229 | ``` 1230 | 1231 | #### Advanced 1232 | 139. **Write a function to implement Transformers pipeline caching.** 1233 | Reuses processed data. 1234 | ```python 1235 | from datasets import load_dataset 1236 | def cache_dataset(dataset_name, cache_dir="cache"): 1237 | return load_dataset(dataset_name, cache_dir=cache_dir) 1238 | ``` 1239 | 1240 | 140. **How do you optimize Transformers for high-throughput processing?** 1241 | Uses parallel execution. 1242 | ```python 1243 | from joblib import Parallel, delayed 1244 | def high_throughput_inference(model, tokenizer, texts): 1245 | return Parallel(n_jobs=-1)(delayed(generate_text)(model, tokenizer, text) for text in texts) 1246 | ``` 1247 | 1248 | 141. 
**Write a function to implement Transformers pipeline versioning.** 1249 | Tracks changes in workflows. 1250 | ```python 1251 | import json 1252 | def version_pipeline(config, version): 1253 | with open(f"pipeline_v{version}.json", "w") as f: 1254 | json.dump(config, f) 1255 | ``` 1256 | 1257 | 142. **How do you implement Transformers pipeline monitoring?** 1258 | Logs performance metrics. 1259 | ```python 1260 | import logging 1261 | def monitored_training(trainer): 1262 | logging.basicConfig(filename="transformers.log", level=logging.INFO) 1263 | start = time.time() 1264 | trainer.train() 1265 | logging.info(f"Training took {time.time() - start}s") 1266 | ``` 1267 | 1268 | 143. **Write a function to handle Transformers scalability.** 1269 | Processes large datasets efficiently. 1270 | ```python 1271 | def scalable_training(trainer, dataset, chunk_size=1000): 1272 | for i in range(0, len(dataset), chunk_size): 1273 | trainer.train_dataset = dataset[i:i+chunk_size] 1274 | trainer.train() 1275 | ``` 1276 | 1277 | 144. **How do you implement Transformers pipeline automation?** 1278 | Scripts end-to-end workflows. 1279 | ```python 1280 | def automate_pipeline(dataset, model_name): 1281 | model, tokenizer = load_model_and_tokenizer(model_name) 1282 | tokenized = preprocess_dataset(dataset, tokenizer) 1283 | trainer = fine_tune_model(model, tokenizer, tokenized) 1284 | trainer.save_model("output") 1285 | return trainer 1286 | ``` 1287 | 1288 | ## Ethical Considerations in Transformers 1289 | 1290 | ### Basic 1291 | 145. **What are ethical concerns in Transformers applications?** 1292 | Includes bias in outputs and energy consumption. 1293 | ```python 1294 | def check_model_bias(outputs, groups): 1295 | return {g: outputs[groups == g].mean().item() for g in set(groups)} 1296 | ``` 1297 | 1298 | 146. **How do you detect bias in Transformers model predictions?** 1299 | Analyzes group disparities. 1300 | ```python 1301 | def detect_bias(outputs, groups): 1302 | return {g: outputs[groups == g].mean().item() for g in set(groups)} 1303 | ``` 1304 | 1305 | 147. **What is data privacy in Transformers, and how is it ensured?** 1306 | Protects sensitive data. 1307 | ```python 1308 | def anonymize_data(data): 1309 | return [text + " [MASK]" for text in data] 1310 | ``` 1311 | 1312 | 148. **How do you ensure fairness in Transformers models?** 1313 | Balances predictions across groups. 1314 | ```python 1315 | def fair_training(trainer, dataset, weights): 1316 | trainer.train_dataset = dataset.map(lambda x: {**x, "weight": weights[x["label"]]}) 1317 | trainer.train() 1318 | ``` 1319 | 1320 | 149. **What is explainability in Transformers applications?** 1321 | Clarifies model decisions. 1322 | ```python 1323 | def explain_predictions(model, inputs): 1324 | outputs = model(**inputs) 1325 | print(f"Logits: {outputs.logits[0, :5]}") 1326 | return outputs 1327 | ``` 1328 | 1329 | 150. **How do you visualize Transformers model bias?** 1330 | Plots group-wise predictions. 1331 | ```python 1332 | import matplotlib.pyplot as plt 1333 | def plot_bias(outputs, groups): 1334 | group_means = [outputs[groups == g].mean().item() for g in set(groups)] 1335 | plt.bar(set(groups), group_means) 1336 | plt.savefig("bias_plot.png") 1337 | ``` 1338 | 1339 | #### Intermediate 1340 | 151. **Write a function to mitigate bias in Transformers models.** 1341 | Reweights or resamples data. 
#### Intermediate
151. **Write a function to mitigate bias in Transformers models.**
Reweights or resamples data.
```python
def mitigate_bias(dataset, weights):
    return dataset.map(lambda x: {**x, "weight": weights[x["label"]]})
```

152. **How do you implement differential privacy in Transformers?**
Clips per-sample gradients and adds calibrated noise.
```python
from opacus import PrivacyEngine
def private_training(model, trainer):
    trainer.create_optimizer()  # make sure the optimizer exists before wrapping it
    privacy_engine = PrivacyEngine()
    model, optimizer, data_loader = privacy_engine.make_private(
        module=model,
        optimizer=trainer.optimizer,
        data_loader=trainer.get_train_dataloader(),
        noise_multiplier=1.0,
        max_grad_norm=1.0,
    )
    trainer.model = model
    trainer.optimizer = optimizer
    trainer.train()
```

153. **Write a function to assess model fairness in Transformers.**
Computes per-group accuracy.
```python
def fairness_metrics(outputs, groups, targets):
    return {g: (outputs[groups == g] == targets[groups == g]).float().mean().item() for g in set(groups)}
```

154. **How do you ensure energy-efficient Transformers training?**
Optimizes resource usage, e.g., mixed precision and modest batch sizes.
```python
def efficient_training(args):
    args.fp16 = True
    args.per_device_train_batch_size = 8
    return args
```

155. **Write a function to audit Transformers model decisions.**
Logs predictions and inputs.
```python
import logging
def audit_predictions(model, tokenizer, inputs):
    logging.basicConfig(filename="audit.log", level=logging.INFO)
    outputs = model.generate(**inputs)
    logging.info(f"Input: {tokenizer.decode(inputs['input_ids'][0])}, Output: {tokenizer.decode(outputs[0])}")
```

156. **How do you visualize fairness metrics in Transformers?**
Plots group-wise performance.
```python
import matplotlib.pyplot as plt
def plot_fairness_metrics(metrics):
    plt.bar([str(k) for k in metrics], list(metrics.values()))
    plt.savefig("fairness_metrics.png")
```

#### Advanced
157. **Write a function to implement fairness-aware training in Transformers.**
Uses adversarial debiasing: the main model keeps its task loss low while maximizing the adversary's loss at predicting the group.
```python
def fairness_training(model, adv_model, trainer, dataset):
    # adv_model is assumed to return the adversary's loss for predicting the group from the logits
    for batch in dataset:
        outputs = model(**batch)
        adv_loss = adv_model(outputs.logits, batch["groups"]).mean()
        loss = outputs.loss - adv_loss
        loss.backward()
        trainer.optimizer.step()
        trainer.optimizer.zero_grad()
```

158. **How do you implement privacy-preserving inference in Transformers?**
Perturbs inputs (or uses secure-computation frameworks) so raw data cannot easily be recovered from queries.
```python
import torch
def private_inference(model, inputs, noise_scale=0.1):
    # perturb the input embeddings rather than the integer token ids
    embeddings = model.get_input_embeddings()(inputs["input_ids"])
    noisy_embeddings = embeddings + torch.randn_like(embeddings) * noise_scale
    return model(inputs_embeds=noisy_embeddings, attention_mask=inputs.get("attention_mask"))
```

159. **Write a function to monitor ethical risks in Transformers models.**
Tracks bias and fairness metrics.
```python
import logging
def monitor_ethics(outputs, groups, targets):
    logging.basicConfig(filename="ethics.log", level=logging.INFO)
    metrics = fairness_metrics(outputs, groups, targets)
    logging.info(f"Fairness metrics: {metrics}")
    return metrics
```

160. **How do you implement explainable AI with Transformers?**
Uses attribution methods, e.g., Integrated Gradients over the embedding layer.
```python
from captum.attr import LayerIntegratedGradients
def explainable_model(model, inputs, target=0):
    # attribute the target class logit to input tokens via the embedding layer
    forward = lambda ids, mask: model(input_ids=ids, attention_mask=mask).logits
    lig = LayerIntegratedGradients(forward, model.get_input_embeddings())
    return lig.attribute(inputs["input_ids"],
                         additional_forward_args=(inputs["attention_mask"],),
                         target=target)
```
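The attributions returned above carry one value per embedding dimension; to read them per token they are usually summed over the hidden dimension and paired with the tokenizer's tokens. A usage sketch building on question 160, where the checkpoint and example sentence are assumptions for illustration:
```python
from transformers import AutoModelForSequenceClassification, AutoTokenizer

name = "distilbert-base-uncased-finetuned-sst-2-english"  # illustrative checkpoint
tokenizer = AutoTokenizer.from_pretrained(name)
model = AutoModelForSequenceClassification.from_pretrained(name)
inputs = tokenizer("The screen is vibrant", return_tensors="pt")

attributions = explainable_model(model, inputs, target=1)  # helper from question 160
token_scores = attributions.sum(dim=-1).squeeze(0)         # collapse the embedding dimension
for token, score in zip(tokenizer.convert_ids_to_tokens(inputs["input_ids"][0]), token_scores):
    print(f"{token}: {score.item():.3f}")
```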
161. **Write a function to ensure regulatory compliance in Transformers.**
Logs model metadata for audit trails.
```python
import json
def log_compliance(model, metadata):
    with open("compliance.json", "w") as f:
        json.dump({"model": str(model), "metadata": metadata}, f)
```

162. **How do you implement ethical model evaluation in Transformers?**
Assesses fairness and robustness together.
```python
def ethical_evaluation(model, tokenizer, trainer, dataset):
    # assumes batch_inference, fairness_metrics (question 153), and an evaluate_model helper
    outputs = batch_inference(model, tokenizer, dataset["text"])
    fairness = fairness_metrics(outputs, dataset["groups"], dataset["labels"])
    robustness = evaluate_model(trainer, dataset)
    return {"fairness": fairness, "robustness": robustness}
```

## Integration with Other Libraries

### Basic
163. **How do you integrate Transformers with PyTorch?**
Loads PyTorch-based models (the default backend).
```python
from transformers import AutoModel
model = AutoModel.from_pretrained("bert-base-uncased")
```

164. **How do you integrate Transformers with Hugging Face Datasets?**
Loads and preprocesses datasets.
```python
from datasets import load_dataset
dataset = load_dataset("imdb")
```

165. **How do you use Transformers with Matplotlib?**
Visualizes model outputs.
```python
import matplotlib.pyplot as plt
def plot_data(data):
    plt.plot(data)
    plt.savefig("data_plot.png")
```

166. **How do you integrate Transformers with FastAPI?**
Serves models via a REST API.
```python
from fastapi import FastAPI
from transformers import AutoModelForCausalLM, AutoTokenizer

app = FastAPI()
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

@app.post("/predict")
async def predict(text: str):
    inputs = tokenizer(text, return_tensors="pt")
    outputs = model(**inputs)
    return {"logits": outputs.logits.tolist()}
```

167. **How do you use Transformers with TensorFlow?**
Loads TensorFlow-compatible models.
```python
from transformers import TFAutoModel
model = TFAutoModel.from_pretrained("bert-base-uncased")
```

168. **How do you integrate Transformers with ONNX?**
Exports models for optimized inference, e.g., with the Hugging Face Optimum library.
```python
from optimum.onnxruntime import ORTModelForFeatureExtraction
model = ORTModelForFeatureExtraction.from_pretrained("bert-base-uncased", export=True)
model.save_pretrained("bert-onnx")
```

#### Intermediate
169. **Write a function to integrate Transformers with Pandas.**
Tokenizes text from a DataFrame column.
```python
import pandas as pd
def preprocess_with_pandas(df, tokenizer, column="text"):
    return tokenizer(df[column].tolist(), padding=True, truncation=True, return_tensors="pt")
```

170. **How do you integrate Transformers with LangChain?**
Wraps a Transformers pipeline as a LangChain LLM for use in chains and agents.
```python
from langchain.llms import HuggingFacePipeline
from transformers import pipeline
def create_langchain_agent(model_name="gpt2"):
    hf_pipeline = pipeline("text-generation", model=model_name)
    return HuggingFacePipeline(pipeline=hf_pipeline)
```
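A short usage sketch for the wrapper above; the prompt is illustrative, and `invoke` assumes a LangChain version new enough to expose the Runnable interface (older releases call the LLM object directly):
```python
llm = create_langchain_agent("gpt2")
print(llm.invoke("The new TechCorp laptop is amazing because"))
```
--------------------------------------------------------------------------------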