├── .gitignore
└── README.md

/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Desklib AI Text Detector

This repository contains the code and resources for an AI-generated text detection model developed by Desklib. The model classifies English text as either human-written or AI-generated.

## Overview

The model is a fine-tuned version of **microsoft/deberta-v3-large**, leveraging a transformer-based architecture to achieve high accuracy in identifying AI-generated content. It is robust against a wide range of adversarial attacks across different text domains, making it a reliable detector in practice. The model is particularly useful in content moderation, academic integrity, journalism, and other applications where the authenticity of text is crucial.

**Key Features:**

* **Robust Detection:** Effectively identifies AI-generated text, even with adversarial modifications.
* **High Accuracy:** Achieved leading performance on the [RAID Benchmark for AI Detection](https://raid-bench.xyz/leaderboard?domain=all&decoding=all&repetition=all&attack=all) at the time of submission.
* **Easy to Use:** Integrates with the Hugging Face `transformers` library in a few lines of code.
* **Based on DeBERTa:** Leverages the powerful `microsoft/deberta-v3-large` transformer model.
* **Developed by Desklib:** Desklib provides AI-based tools for students, educators, and universities.

**Links:**

* **Hugging Face Model Hub:** [https://huggingface.co/desklib/ai-text-detector-v1.01](https://huggingface.co/desklib/ai-text-detector-v1.01)
* **Try the model online:** [Desklib AI Detector](https://desklib.com/ai-content-detector/)
* **RAID Benchmark Leaderboard:** [Visit RAID Leaderboard](https://raid-bench.xyz/leaderboard?domain=all&decoding=all&repetition=all&attack=all)
* **GitHub Repo:** [https://github.com/desklib/ai-text-detector](https://github.com/desklib/ai-text-detector)

## Installation

This project requires Python 3.7+ and PyTorch.

1. **Install dependencies:**

   ```bash
   pip install torch transformers
   ```

   It is highly recommended to use a virtual environment such as `venv` or `conda` to avoid dependency conflicts with other projects (see the sketch below).
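A minimal environment setup might look like the following. This is a sketch, not a project-mandated workflow: the environment name `.venv` is just a convention, and the commented-out `sentencepiece` install is only needed if the DeBERTa-v3 tokenizer requests it in your environment.

```bash
# Create and activate an isolated environment
# (macOS/Linux shown; on Windows, run .venv\Scripts\activate instead of `source`).
python -m venv .venv
source .venv/bin/activate

# Install the project dependencies inside the environment.
pip install torch transformers

# If tokenizer loading complains about SentencePiece, also install:
# pip install sentencepiece
```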
## Usage

The following script is a simple example of how to use the model to predict whether a given text is AI-generated. The core logic is encapsulated in the `predict_single_text` function.

```python
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoConfig, AutoModel, PreTrainedModel

class DesklibAIDetectionModel(PreTrainedModel):
    config_class = AutoConfig

    def __init__(self, config):
        super().__init__(config)
        # Initialize the base transformer model.
        self.model = AutoModel.from_config(config)
        # Define a classifier head (a single logit for binary classification).
        self.classifier = nn.Linear(config.hidden_size, 1)
        # Initialize weights (handled by PreTrainedModel).
        self.init_weights()

    def forward(self, input_ids, attention_mask=None, labels=None):
        # Forward pass through the transformer.
        outputs = self.model(input_ids, attention_mask=attention_mask)
        last_hidden_state = outputs[0]
        # Mean pooling over token embeddings, weighted by the attention mask
        # so that padding tokens do not contribute.
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, dim=1)
        sum_mask = torch.clamp(input_mask_expanded.sum(dim=1), min=1e-9)
        pooled_output = sum_embeddings / sum_mask

        # Classifier head produces one logit per input.
        logits = self.classifier(pooled_output)
        loss = None
        if labels is not None:
            loss_fct = nn.BCEWithLogitsLoss()
            loss = loss_fct(logits.view(-1), labels.float())

        output = {"logits": logits}
        if loss is not None:
            output["loss"] = loss
        return output

def predict_single_text(text, model, tokenizer, device, max_len=768, threshold=0.5):
    """
    Predicts whether the given text is AI-generated.

    Returns a (probability, label) pair, where the probability is the sigmoid
    of the model's logit and the label is 1 (AI-generated) if the probability
    meets the threshold, else 0.
    """
    encoded = tokenizer(
        text,
        padding='max_length',
        truncation=True,
        max_length=max_len,
        return_tensors='pt'
    )
    input_ids = encoded['input_ids'].to(device)
    attention_mask = encoded['attention_mask'].to(device)

    model.eval()
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs["logits"]
        probability = torch.sigmoid(logits).item()

    label = 1 if probability >= threshold else 0
    return probability, label

def main():
    # --- Model and tokenizer directory ---
    model_directory = "desklib/ai-text-detector-v1.01"

    # --- Load tokenizer and model ---
    tokenizer = AutoTokenizer.from_pretrained(model_directory)
    model = DesklibAIDetectionModel.from_pretrained(model_directory)

    # --- Set up device ---
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # --- Example input text ---
    text_ai = "AI detection refers to the process of identifying whether a given piece of content, such as text, images, or audio, has been generated by artificial intelligence. This is achieved using various machine learning techniques, including perplexity analysis, entropy measurements, linguistic pattern recognition, and neural network classifiers trained on human and AI-generated data. Advanced AI detection tools assess writing style, coherence, and statistical properties to determine the likelihood of AI involvement. These tools are widely used in academia, journalism, and content moderation to ensure originality, prevent misinformation, and maintain ethical standards. As AI-generated content becomes increasingly sophisticated, AI detection methods continue to evolve, integrating deep learning models and ensemble techniques for improved accuracy."
    text_human = "It is estimated that a major part of the content in the internet will be generated by AI / LLMs by 2025. This leads to a lot of misinformation and credibility related issues. That is why if is important to have accurate tools to identify if a content is AI generated or human written"

    # --- Run prediction ---
    probability, predicted_label = predict_single_text(text_ai, model, tokenizer, device)
    print(f"Probability of being AI generated: {probability:.4f}")
    print(f"Predicted label: {'AI Generated' if predicted_label == 1 else 'Not AI Generated'}")

    probability, predicted_label = predict_single_text(text_human, model, tokenizer, device)
    print(f"Probability of being AI generated: {probability:.4f}")
    print(f"Predicted label: {'AI Generated' if predicted_label == 1 else 'Not AI Generated'}")

if __name__ == "__main__":
    main()
```
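On the first run, `from_pretrained` downloads the model weights and tokenizer from the Hugging Face Hub and caches them locally, so network access is only needed once. For each example, the script prints the sigmoid probability that the text is AI-generated together with the thresholded label; the `max_len` (768 tokens) and `threshold` (0.5) defaults of `predict_single_text` can be adjusted for your use case.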
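If you need to score many texts, the same forward pass works on batches. The `predict_batch` helper below is a minimal sketch, not part of the repository: the name and signature are illustrative, and it assumes the `DesklibAIDetectionModel`, `tokenizer`, and `device` set up in the example above.

```python
def predict_batch(texts, model, tokenizer, device, max_len=768, threshold=0.5):
    """Batched variant of predict_single_text (illustrative sketch).

    Returns one (probability, label) pair per input text.
    """
    # Tokenize all texts together; padding/truncation yields a rectangular batch.
    encoded = tokenizer(
        texts,
        padding='max_length',
        truncation=True,
        max_length=max_len,
        return_tensors='pt'
    )
    input_ids = encoded['input_ids'].to(device)
    attention_mask = encoded['attention_mask'].to(device)

    model.eval()
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        # logits has shape (batch, 1); squeeze it to a flat list of probabilities.
        probabilities = torch.sigmoid(outputs["logits"]).squeeze(-1).tolist()

    return [(p, 1 if p >= threshold else 0) for p in probabilities]
```

Usage mirrors the single-text case, e.g. `predict_batch([text_ai, text_human], model, tokenizer, device)`. For large workloads, chunk the input list so that each batch fits in memory.
--------------------------------------------------------------------------------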