# Semi-supervised text classification: TF-IDF features + LabelPropagation.
#
# A handful of labeled documents and a few unlabeled ones are vectorized
# together; the unlabeled rows are passed to the classifier with the target
# value -1 so that label propagation can assign them a class.

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.semi_supervised import LabelPropagation
from sklearn.metrics import accuracy_score, precision_score, recall_score
import numpy as np

# Target value scikit-learn's semi-supervised estimators use for "no label".
UNLABELED = -1

labeled_data = [
    ("This is a document about sports", "Sports"),
    ("This is a news article", "News"),
    ("Another document about sports", "Sports"),
    ("A text sample about politics", "Politics"),
    ("A document discussing music", "Music")
]
unlabeled_data = [
    "This document discusses machine learning",
    "Another document about music",
    "A short text sample"
]

# Labeled texts come first, so rows [0, len(texts)) of the feature matrix are
# the labeled documents and the remaining rows are the unlabeled ones.
texts, labels = zip(*labeled_data)
all_data = list(texts) + unlabeled_data

# Fit the vocabulary on labeled and unlabeled text together.
vectorizer = TfidfVectorizer(max_features=500)
features = vectorizer.fit_transform(all_data)
features_dense = features.toarray()

# Stable class ordering; class names are encoded as their position in it.
all_labels = sorted(set(labels))
label_to_index = {label: index for index, label in enumerate(all_labels)}

X_train = features_dense[:len(texts)]
y_train = labels
y_train_indices = np.array([label_to_index[label] for label in y_train])

# Training input for the semi-supervised fit: every document, with the
# unlabeled ones marked by UNLABELED so the model propagates labels to them.
X_all = features_dense
y_all_indices = np.concatenate(
    [y_train_indices, np.full(len(unlabeled_data), UNLABELED)]
)

semi_clf = LabelPropagation()
semi_clf.fit(X_all, y_all_indices)

# NOTE: there is no held-out labeled set, so the metrics below are computed on
# the labeled training documents. They are a sanity check, not an estimate of
# generalization performance.
X_test = X_train
y_test = y_train
y_test_indices = np.array([label_to_index[label] for label in y_test])

predictions = semi_clf.predict(X_test)
accuracy = accuracy_score(y_test_indices, predictions)
precision = precision_score(y_test_indices, predictions, average='weighted')
recall = recall_score(y_test_indices, predictions, average='weighted')
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")

# Labels assigned to the unlabeled documents during fit (transductive result).
unlabeled_predictions = semi_clf.transduction_[len(texts):]
for text, label_index in zip(unlabeled_data, unlabeled_predictions):
    print(f"{text!r} -> {all_labels[label_index]}")