├── README.md
├── config.json
├── model.safetensors
├── special_tokens_map.json
├── spiece.model
├── tokenizer.json
└── tokenizer_config.json


/README.md:
--------------------------------------------------------------------------------
  1 | # Duplicate Sentence Detection with ALBERT-base-v2
  2 | 
  3 | ## 📌 Overview
  4 | 
  5 | This repository hosts a quantized (FP16) version of the ALBERT-base-v2 model fine-tuned for Duplicate Sentence Detection. The model determines whether two sentences convey the same meaning: if they do, it outputs "Duplicate" with a confidence score; otherwise, it outputs "Not Duplicate" with a confidence score. The model has been optimized for efficient deployment while maintaining reasonable accuracy, making it suitable for real-time applications.
  6 | 
  7 | ## 🏗 Model Details
  8 | 
  9 | - **Model Architecture:** ALBERT-base-v2  
 10 | - **Task:** Duplicate Sentence Detection  
 11 | - **Dataset:** Hugging Face's `quora-question-pairs`  
 12 | - **Quantization:** Float16 (FP16) for optimized inference  
 13 | - **Fine-tuning Framework:** Hugging Face Transformers  
 14 | 
 15 | ## 🚀 Usage
 16 | 
 17 | ### Installation
 18 | 
 19 | ```bash
 20 | pip install transformers torch
 21 | ```
 22 | 
 23 | ### Loading the Model
 24 | 
 25 | ```python
 26 | from transformers import AlbertTokenizer, AlbertForSequenceClassification
 27 | import torch
 28 | 
 29 | device = "cuda" if torch.cuda.is_available() else "cpu"
 30 | 
 31 | model_name = "AventIQ-AI/albert-duplicate-sentence-detection"
 32 | model = AlbertForSequenceClassification.from_pretrained(model_name).to(device)
 33 | tokenizer = AlbertTokenizer.from_pretrained(model_name)
 34 | ```
 35 | 
 36 | ### Duplicate Sentence Detection Inference
 37 | 
 38 | ```python
 39 | def predict_duplicate(question1, question2, model):
 40 |     inputs = tokenizer(question1, question2, truncation=True, padding="max_length", max_length=128, return_tensors="pt")
 41 |     
 42 |     # ✅ Move inputs to the same device as the model
 43 |     inputs = {key: value.to(device) for key, value in inputs.items()}
 44 |     
 45 |     with torch.no_grad():  # Disable gradient calculation
 46 |         outputs = model(**inputs)
 47 |         logits = outputs.logits
 48 |     
 49 |     # ✅ Get prediction
 50 |     probs = torch.softmax(logits, dim=1)
 51 |     prediction = torch.argmax(probs, dim=1).item()
 52 |  
 53 |     # ✅ Output the results
 54 |     label_map = {0: "Not Duplicate", 1: "Duplicate"}
 55 |     print(f"Q1: {question1}")
 56 |     print(f"Q2: {question2}")
 57 |     print(f"Prediction: {label_map[prediction]} (Confidence: {probs.max().item():.4f})\n")
 58 | 
 59 | # 🔍 Test Example
 60 | test_samples = [
 61 |     ("How can I learn Python quickly?", "What is the fastest way to learn Python?"),  # Duplicate
 62 |     ("What is the capital of India?", "Where is New Delhi located?"),  # Duplicate
 63 |     ("How to lose weight fast?", "What is the best programming language to learn?"),  # Not Duplicate
 64 |     ("Who is the CEO of Tesla?", "What is the net worth of Elon Musk?"),  # Not Duplicate
 65 |     ("What is machine learning?", "How does AI work?"),  # Duplicate
 66 | ]
 67 | for q1, q2 in test_samples:
 68 |     predict_duplicate(q1, q2, model)
 69 | ```
 70 | 
 71 | ## 📊 Quantized Model Evaluation Results
 72 | 
 73 | ### 🔥 Evaluation Metrics 🔥
 74 | 
 75 | - ✅ **Accuracy:**  0.7215  
 76 | - ✅ **Precision:** 0.6497  
 77 | - ✅ **Recall:**    0.5440  
 78 | - ✅ **F1-score:**  0.5922  
 79 | 
 80 | ## ⚡ Quantization Details
 81 | 
 82 | Post-training quantization was applied using PyTorch's built-in quantization framework. The model was quantized to Float16 (FP16) to reduce model size and improve inference efficiency while balancing accuracy.
 83 | 
 84 | ## 📂 Repository Structure
 85 | 
 86 | ```
 87 | .
 88 | ├── config.json             # Model configuration
 89 | ├── tokenizer_config.json   # Tokenizer configuration (vocabulary files: spiece.model, tokenizer.json, special_tokens_map.json)
 90 | ├── model.safetensors       # Quantized model weights
 91 | └── README.md               # Model documentation
 92 | ```
 93 | 
 94 | ## ⚠️ Limitations
 95 | 
 96 | - The model may struggle with highly nuanced paraphrases.
 97 | - Quantization may lead to slight degradation in accuracy compared to full-precision models.
 98 | - Performance may vary across different domains and sentence structures.
 99 | 
100 | ## 🤝 Contributing
101 | 
102 | Contributions are welcome! Feel free to open an issue or submit a pull request if you have suggestions or improvements.
103 | 


--------------------------------------------------------------------------------
/config.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "_name_or_path": "path_to_save_model",
 3 |   "architectures": [
 4 |     "AlbertForSequenceClassification"
 5 |   ],
 6 |   "attention_probs_dropout_prob": 0,
 7 |   "bos_token_id": 2,
 8 |   "classifier_dropout_prob": 0.1,
 9 |   "down_scale_factor": 1,
10 |   "embedding_size": 128,
11 |   "eos_token_id": 3,
12 |   "gap_size": 0,
13 |   "hidden_act": "gelu_new",
14 |   "hidden_dropout_prob": 0,
15 |   "hidden_size": 768,
16 |   "initializer_range": 0.02,
17 |   "inner_group_num": 1,
18 |   "intermediate_size": 3072,
19 |   "layer_norm_eps": 1e-12,
20 |   "max_position_embeddings": 512,
21 |   "model_type": "albert",
22 |   "net_structure_type": 0,
23 |   "num_attention_heads": 12,
24 |   "num_hidden_groups": 1,
25 |   "num_hidden_layers": 12,
26 |   "num_memory_blocks": 0,
27 |   "pad_token_id": 0,
28 |   "position_embedding_type": "absolute",
29 |   "problem_type": "single_label_classification",
30 |   "torch_dtype": "float16",
31 |   "transformers_version": "4.48.2",
32 |   "type_vocab_size": 2,
33 |   "vocab_size": 30000
34 | }
35 | 


--------------------------------------------------------------------------------
/model.safetensors:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Avent-IQ/albert-duplicate-sentence-detection/c97e46b171d8ae110610c7af8bd9390cdfde760f/model.safetensors


--------------------------------------------------------------------------------
/special_tokens_map.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "bos_token": {
 3 |     "content": "[CLS]",
 4 |     "lstrip": false,
 5 |     "normalized": false,
 6 |     "rstrip": false,
 7 |     "single_word": false
 8 |   },
 9 |   "cls_token": {
10 |     "content": "[CLS]",
11 |     "lstrip": false,
12 |     "normalized": false,
13 |     "rstrip": false,
14 |     "single_word": false
15 |   },
16 |   "eos_token": {
17 |     "content": "[SEP]",
18 |     "lstrip": false,
19 |     "normalized": false,
20 |     "rstrip": false,
21 |     "single_word": false
22 |   },
23 |   "mask_token": {
24 |     "content": "[MASK]",
25 |     "lstrip": true,
26 |     "normalized": false,
27 |     "rstrip": false,
28 |     "single_word": false
29 |   },
30 |   "pad_token": {
31 |     "content": "<pad>",
32 |     "lstrip": false,
33 |     "normalized": false,
34 |     "rstrip": false,
35 |     "single_word": false
36 |   },
37 |   "sep_token": {
38 |     "content": "[SEP]",
39 |     "lstrip": false,
40 |     "normalized": false,
41 |     "rstrip": false,
42 |     "single_word": false
43 |   },
44 |   "unk_token": {
45 |     "content": "<unk>",
46 |     "lstrip": false,
47 |     "normalized": false,
48 |     "rstrip": false,
49 |     "single_word": false
50 |   }
51 | }
52 | 


--------------------------------------------------------------------------------
/spiece.model:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Avent-IQ/albert-duplicate-sentence-detection/c97e46b171d8ae110610c7af8bd9390cdfde760f/spiece.model


--------------------------------------------------------------------------------
/tokenizer_config.json:
--------------------------------------------------------------------------------
  1 | {
  2 |   "added_tokens_decoder": {
  3 |     "0": {
  4 |       "content": "<pad>",
  5 |       "lstrip": false,
  6 |       "normalized": false,
  7 |       "rstrip": false,
  8 |       "single_word": false,
  9 |       "special": true
 10 |     },
 11 |     "1": {
 12 |       "content": "<unk>",
 13 |       "lstrip": false,
 14 |       "normalized": false,
 15 |       "rstrip": false,
 16 |       "single_word": false,
 17 |       "special": true
 18 |     },
 19 |     "2": {
 20 |       "content": "[CLS]",
 21 |       "lstrip": false,
 22 |       "normalized": false,
 23 |       "rstrip": false,
 24 |       "single_word": false,
 25 |       "special": true
 26 |     },
 27 |     "3": {
 28 |       "content": "[SEP]",
 29 |       "lstrip": false,
 30 |       "normalized": false,
 31 |       "rstrip": false,
 32 |       "single_word": false,
 33 |       "special": true
 34 |     },
 35 |     "4": {
 36 |       "content": "[MASK]",
 37 |       "lstrip": true,
 38 |       "normalized": false,
 39 |       "rstrip": false,
 40 |       "single_word": false,
 41 |       "special": true
 42 |     },
 43 |     "5": {
 44 |       "content": "(",
 45 |       "lstrip": false,
 46 |       "normalized": false,
 47 |       "rstrip": false,
 48 |       "single_word": false,
 49 |       "special": false
 50 |     },
 51 |     "6": {
 52 |       "content": ")",
 53 |       "lstrip": false,
 54 |       "normalized": false,
 55 |       "rstrip": false,
 56 |       "single_word": false,
 57 |       "special": false
 58 |     },
 59 |     "7": {
 60 |       "content": "\"",
 61 |       "lstrip": false,
 62 |       "normalized": false,
 63 |       "rstrip": false,
 64 |       "single_word": false,
 65 |       "special": false
 66 |     },
 67 |     "8": {
 68 |       "content": "-",
 69 |       "lstrip": false,
 70 |       "normalized": false,
 71 |       "rstrip": false,
 72 |       "single_word": false,
 73 |       "special": false
 74 |     },
 75 |     "9": {
 76 |       "content": ".",
 77 |       "lstrip": false,
 78 |       "normalized": false,
 79 |       "rstrip": false,
 80 |       "single_word": false,
 81 |       "special": false
 82 |     },
 83 |     "10": {
 84 |       "content": "–",
 85 |       "lstrip": false,
 86 |       "normalized": false,
 87 |       "rstrip": false,
 88 |       "single_word": false,
 89 |       "special": false
 90 |     },
 91 |     "11": {
 92 |       "content": "£",
 93 |       "lstrip": false,
 94 |       "normalized": false,
 95 |       "rstrip": false,
 96 |       "single_word": false,
 97 |       "special": false
 98 |     },
 99 |     "12": {
100 |       "content": "€",
101 |       "lstrip": false,
102 |       "normalized": false,
103 |       "rstrip": false,
104 |       "single_word": false,
105 |       "special": false
106 |     }
107 |   },
108 |   "bos_token": "[CLS]",
109 |   "clean_up_tokenization_spaces": false,
110 |   "cls_token": "[CLS]",
111 |   "do_lower_case": true,
112 |   "eos_token": "[SEP]",
113 |   "extra_special_tokens": {},
114 |   "keep_accents": false,
115 |   "mask_token": "[MASK]",
116 |   "model_max_length": 512,
117 |   "pad_token": "<pad>",
118 |   "remove_space": true,
119 |   "sep_token": "[SEP]",
120 |   "sp_model_kwargs": {},
121 |   "tokenizer_class": "AlbertTokenizer",
122 |   "unk_token": "<unk>"
123 | }
124 | 


--------------------------------------------------------------------------------