├── CLI Model Building.ipynb ├── Data Preprocessing.ipynb ├── LICENSE ├── Model Building.ipynb ├── README.md ├── app.py ├── base_config.cfg ├── data.json ├── data ├── test │ ├── Alice Clark CV.docx │ ├── Alice Clark CV.pdf │ ├── Alice Clark CV.txt │ ├── Smith Resume.docx │ └── Smith Resume.pdf └── train │ └── train_data.json ├── ner_model ├── config.cfg ├── meta.json ├── ner │ ├── cfg │ ├── model │ └── moves ├── tokenizer └── vocab │ ├── key2row │ ├── lookups.bin │ ├── strings.json │ ├── vectors │ └── vectors.cfg └── requirement.txt /CLI Model Building.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# *Required Libraries*" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import spacy\n", 17 | "import json\n", 18 | "from spacy.util import filter_spans\n", 19 | "from spacy.tokens import DocBin\n", 20 | "from tqdm import tqdm" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "# *Loading Data*" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "with open('data/train/train_data.json','rb') as f:\n", 37 | "    train_data=json.load(f)" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "# *Model Building*" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "nlp = spacy.blank('en')\n", 54 | "doc_bin = DocBin()\n", 55 | "for training_example in tqdm(train_data):\n", 56 | "    text = training_example['text']\n", 57 | "    entities = training_example['entities']\n", 58 | "    doc = nlp.make_doc(text)\n", 59 | "    ents = []\n", 60 | "    for start, end, label in entities:\n", 61 | "        span = doc.char_span(start, end, label=label, alignment_mode=\"contract\")\n",
62 | "        if span is not None:  # None when the offsets cannot be aligned to tokens\n", 63 | "            ents.append(span)\n", 64 | "    doc.ents = filter_spans(ents)\n", 65 | "    doc_bin.add(doc)\n", 66 | "doc_bin.to_disk(\"train.spacy\")" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "! python -m spacy init fill-config base_config.cfg config.cfg" 76 | ] 77 | }, 78 | { 79 | "cell_type": "markdown", 80 | "metadata": {}, 81 | "source": [ 82 | "## *Model Training*" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "! python -m spacy train config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./train.spacy" 92 | ] 93 | } 94 | ], 95 | "metadata": { 96 | "kernelspec": { 97 | "display_name": "Python 3", 98 | "language": "python", 99 | "name": "python3" 100 | }, 101 | "language_info": { 102 | "codemirror_mode": { 103 | "name": "ipython", 104 | "version": 3 105 | }, 106 | "file_extension": ".py", 107 | "mimetype": "text/x-python", 108 | "name": "python", 109 | "nbconvert_exporter": "python", 110 | "pygments_lexer": "ipython3", 111 | "version": "3.11.7" 112 | } 113 | }, 114 | "nbformat": 4, 115 | "nbformat_minor": 2 116 | } 117 | -------------------------------------------------------------------------------- /Data Preprocessing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 34, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import json" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 35, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "with open('data.json','rb') as f:\n", 19 | " data=json.load(f)" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 38, 25 | "metadata": {}, 26 | "outputs": [], 27 | 
"source": [ 28 | "train_data = []\n", 29 | "for i in data:\n", 30 | "    content = i['content']\n", 31 | "    entities = []\n", 32 | "    seen = set()\n", 33 | "    for j in i['annotation']:\n", 34 | "        # label names are normalised to UPPER_SNAKE_CASE\n", 35 | "        label = j['label'][0].replace(\" \", \"_\").upper()\n", 36 | "        text = j['text'][0].strip()\n", 37 | "        # each distinct text is located once; blank texts carry no span\n", 38 | "        if not text or text in seen:\n", 39 | "            continue\n", 40 | "        seen.add(text)\n", 41 | "        # find() yields -1 rather than raising when the stripped text is absent\n", 42 | "        start = content.find(text)\n", 43 | "        if start == -1:\n", 44 | "            continue\n", 45 | "        end = start + len(text)\n", 46 | "        # offsets are half-open [start, end): spans that merely touch do not overlap\n", 47 | "        overlaps = any(start < e and s < end for s, e, _ in entities)\n", 48 | "        if not overlaps:\n", 49 | "            entities.append([start, end, label])\n", 50 | "\n", 51 | "    item = {}\n", 52 | "    item['text'] = content\n", 53 | "    item['entities'] = entities\n", 54 | "    train_data.append(item)" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 39, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "with open('./data/train/train_data.json','w') as file:\n", 64 | "    json.dump(train_data,file)" 65 | ] 66 | } 67 | ], 68 | "metadata": { 69 | "kernelspec": { 70 | "display_name": "Python 3", 71 | "language": "python", 72 | "name": "python3" 73 | }, 74 | "language_info": { 75 | "codemirror_mode": { 76 | "name": "ipython", 77 | "version": 3 78 | }, 79 | "file_extension": ".py", 80 | "mimetype": "text/x-python", 81 | "name": "python", 82 | "nbconvert_exporter": "python", 83 | "pygments_lexer": "ipython3", 84 | "version": "3.11.7" 85 | } 86 | }, 87 | "nbformat": 4, 88 | "nbformat_minor": 2 89 | } 90 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Mitesh Gupta 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files 
(the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Model Building.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import spacy\n", 10 | "from spacy.training import Example\n", 11 | "from spacy.util import minibatch, compounding\n", 12 | "import random\n", 13 | "import json\n", 14 | "from spacy.util import filter_spans" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "with open('data/train/train_data.json','rb') as f:\n", 24 | "    train_data=json.load(f)" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "# create a blank English NLP model\n", 34 | "nlp = spacy.blank('en')\n", 35 | "\n", 36 | "# Create the NER component and add it to 
the pipeline\n", 37 | "if \"ner\" not in nlp.pipe_names:\n", 38 | "    ner = nlp.add_pipe(\"ner\", last=True)\n", 39 | "else:\n", 40 | "    ner = nlp.get_pipe(\"ner\")\n", 41 | "\n", 42 | "# Add labels to the NER component\n", 43 | "for item in train_data:\n", 44 | "    for _, _, label in item['entities']:\n", 45 | "        ner.add_label(label)\n", 46 | "\n", 47 | "# Prepare training data in the format required by spaCy 3.x\n", 48 | "train_examples = []\n", 49 | "for item in train_data:\n", 50 | "    doc = nlp.make_doc(item[\"text\"])\n", 51 | "    ents = []\n", 52 | "    for start, end, label in item['entities']:\n", 53 | "        span = doc.char_span(start, end, label=label, alignment_mode=\"contract\")\n", 54 | "        if span is not None:\n", 55 | "            ents.append(span)\n", 56 | "\n", 57 | "    # Gold offsets come from the token-aligned, de-overlapped spans. The doc handed\n", 58 | "    # to Example.from_dict is the prediction side and must stay unannotated: pre-set\n", 59 | "    # doc.ents are kept by the ner pipe, leaking the gold entities into evaluation.\n", 60 | "    gold = [(span.start_char, span.end_char, span.label_) for span in filter_spans(ents)]\n", 61 | "    train_examples.append(Example.from_dict(doc, {\"entities\": gold}))\n", 62 | "\n" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 4, 68 | "metadata": {}, 69 | "outputs": [ 70 | { 71 | "name": "stdout", 72 | "output_type": "stream", 73 | "text": [ 74 | "Iteration 1: Losses: 26612.209, Precision: 0.997, Recall: 1.000, F1-score: 0.999\n", 75 | "Iteration 2: Losses: 5704.469, Precision: 0.934, Recall: 1.000, F1-score: 0.966\n", 76 | "Iteration 3: Losses: 3952.015, Precision: 0.823, Recall: 1.000, F1-score: 0.903\n", 77 | "Iteration 4: Losses: 3180.710, Precision: 0.900, Recall: 1.000, F1-score: 0.947\n", 78 | "Iteration 5: Losses: 2740.532, Precision: 0.851, Recall: 1.000, F1-score: 0.919\n", 79 | "Iteration 6: Losses: 2647.418, Precision: 0.884, Recall: 1.000, F1-score: 0.938\n", 80 | "Iteration 7: Losses: 2200.027, Precision: 0.885, Recall: 1.000, F1-score: 0.939\n", 81 | "Iteration 8: Losses: 2086.462, Precision: 0.881, Recall: 1.000, F1-score: 0.937\n", 82 | "Iteration 9: Losses: 1896.823, Precision: 0.893, Recall: 1.000, F1-score: 0.944\n", 83 | 
"Iteration 10: Losses: 1817.931, Precision: 0.931, Recall: 1.000, F1-score: 0.964\n", 84 | "Iteration 11: Losses: 1680.189, Precision: 0.936, Recall: 1.000, F1-score: 0.967\n", 85 | "Iteration 12: Losses: 1707.834, Precision: 0.907, Recall: 1.000, F1-score: 0.951\n", 86 | "Iteration 13: Losses: 1603.189, Precision: 0.928, Recall: 1.000, F1-score: 0.963\n", 87 | "Iteration 14: Losses: 1530.553, Precision: 0.931, Recall: 1.000, F1-score: 0.964\n", 88 | "Iteration 15: Losses: 1418.975, Precision: 0.943, Recall: 1.000, F1-score: 0.970\n", 89 | "Iteration 16: Losses: 1448.126, Precision: 0.904, Recall: 1.000, F1-score: 0.950\n", 90 | "Iteration 17: Losses: 1277.593, Precision: 0.947, Recall: 1.000, F1-score: 0.973\n", 91 | "Iteration 18: Losses: 1275.569, Precision: 0.966, Recall: 1.000, F1-score: 0.983\n", 92 | "Iteration 19: Losses: 1246.902, Precision: 0.954, Recall: 1.000, F1-score: 0.976\n", 93 | "Iteration 20: Losses: 1147.991, Precision: 0.890, Recall: 1.000, F1-score: 0.942\n", 94 | "Iteration 21: Losses: 1112.616, Precision: 0.918, Recall: 1.000, F1-score: 0.958\n", 95 | "Iteration 22: Losses: 980.910, Precision: 0.916, Recall: 1.000, F1-score: 0.956\n", 96 | "Iteration 23: Losses: 986.483, Precision: 0.909, Recall: 1.000, F1-score: 0.952\n", 97 | "Iteration 24: Losses: 1034.985, Precision: 0.943, Recall: 1.000, F1-score: 0.970\n", 98 | "Iteration 25: Losses: 987.256, Precision: 0.945, Recall: 1.000, F1-score: 0.972\n", 99 | "Iteration 26: Losses: 983.464, Precision: 0.912, Recall: 1.000, F1-score: 0.954\n", 100 | "Iteration 27: Losses: 987.474, Precision: 0.955, Recall: 1.000, F1-score: 0.977\n", 101 | "Iteration 28: Losses: 976.155, Precision: 0.957, Recall: 1.000, F1-score: 0.978\n", 102 | "Iteration 29: Losses: 951.699, Precision: 0.947, Recall: 1.000, F1-score: 0.973\n", 103 | "Iteration 30: Losses: 851.523, Precision: 0.927, Recall: 1.000, F1-score: 0.962\n", 104 | "Iteration 31: Losses: 817.878, Precision: 0.969, Recall: 1.000, F1-score: 0.984\n", 105 | 
"Iteration 32: Losses: 866.606, Precision: 0.940, Recall: 1.000, F1-score: 0.969\n", 106 | "Iteration 33: Losses: 823.616, Precision: 0.914, Recall: 1.000, F1-score: 0.955\n", 107 | "Iteration 34: Losses: 820.650, Precision: 0.950, Recall: 1.000, F1-score: 0.974\n", 108 | "Iteration 35: Losses: 789.587, Precision: 0.946, Recall: 1.000, F1-score: 0.972\n", 109 | "Iteration 36: Losses: 763.384, Precision: 0.934, Recall: 1.000, F1-score: 0.966\n", 110 | "Iteration 37: Losses: 753.573, Precision: 0.962, Recall: 1.000, F1-score: 0.981\n", 111 | "Iteration 38: Losses: 735.642, Precision: 0.953, Recall: 1.000, F1-score: 0.976\n", 112 | "Iteration 39: Losses: 694.879, Precision: 0.979, Recall: 1.000, F1-score: 0.989\n", 113 | "Iteration 40: Losses: 733.312, Precision: 0.970, Recall: 1.000, F1-score: 0.985\n", 114 | "Iteration 41: Losses: 707.260, Precision: 0.963, Recall: 1.000, F1-score: 0.981\n", 115 | "Iteration 42: Losses: 686.931, Precision: 0.946, Recall: 1.000, F1-score: 0.972\n", 116 | "Iteration 43: Losses: 637.460, Precision: 0.956, Recall: 1.000, F1-score: 0.978\n", 117 | "Iteration 44: Losses: 720.086, Precision: 0.972, Recall: 1.000, F1-score: 0.986\n", 118 | "Iteration 45: Losses: 602.563, Precision: 0.978, Recall: 1.000, F1-score: 0.989\n", 119 | "Iteration 46: Losses: 609.871, Precision: 0.955, Recall: 1.000, F1-score: 0.977\n", 120 | "Iteration 47: Losses: 614.682, Precision: 0.986, Recall: 1.000, F1-score: 0.993\n", 121 | "Iteration 48: Losses: 593.069, Precision: 0.973, Recall: 1.000, F1-score: 0.986\n", 122 | "Iteration 49: Losses: 675.111, Precision: 0.960, Recall: 1.000, F1-score: 0.979\n", 123 | "Iteration 50: Losses: 574.115, Precision: 0.969, Recall: 1.000, F1-score: 0.984\n", 124 | "Iteration 51: Losses: 574.475, Precision: 0.965, Recall: 1.000, F1-score: 0.982\n", 125 | "Iteration 52: Losses: 542.209, Precision: 0.964, Recall: 1.000, F1-score: 0.982\n", 126 | "Iteration 53: Losses: 533.706, Precision: 0.955, Recall: 1.000, F1-score: 0.977\n", 127 
| "Iteration 54: Losses: 607.607, Precision: 0.961, Recall: 1.000, F1-score: 0.980\n", 128 | "Iteration 55: Losses: 558.791, Precision: 0.982, Recall: 1.000, F1-score: 0.991\n", 129 | "Iteration 56: Losses: 539.896, Precision: 0.959, Recall: 1.000, F1-score: 0.979\n", 130 | "Iteration 57: Losses: 531.988, Precision: 0.974, Recall: 1.000, F1-score: 0.987\n", 131 | "Iteration 58: Losses: 561.542, Precision: 0.978, Recall: 1.000, F1-score: 0.989\n", 132 | "Iteration 59: Losses: 533.384, Precision: 0.962, Recall: 1.000, F1-score: 0.981\n", 133 | "Iteration 60: Losses: 516.781, Precision: 0.979, Recall: 1.000, F1-score: 0.989\n", 134 | "Iteration 61: Losses: 474.448, Precision: 0.961, Recall: 1.000, F1-score: 0.980\n", 135 | "Iteration 62: Losses: 468.161, Precision: 0.990, Recall: 1.000, F1-score: 0.995\n", 136 | "Iteration 63: Losses: 502.683, Precision: 0.970, Recall: 1.000, F1-score: 0.985\n", 137 | "Iteration 64: Losses: 457.712, Precision: 0.974, Recall: 1.000, F1-score: 0.987\n", 138 | "Iteration 65: Losses: 492.248, Precision: 0.973, Recall: 1.000, F1-score: 0.987\n", 139 | "Iteration 66: Losses: 501.086, Precision: 0.972, Recall: 1.000, F1-score: 0.986\n", 140 | "Iteration 67: Losses: 499.547, Precision: 0.976, Recall: 1.000, F1-score: 0.988\n", 141 | "Iteration 68: Losses: 448.287, Precision: 0.986, Recall: 1.000, F1-score: 0.993\n", 142 | "Iteration 69: Losses: 450.033, Precision: 0.973, Recall: 1.000, F1-score: 0.987\n", 143 | "Iteration 70: Losses: 436.626, Precision: 0.974, Recall: 1.000, F1-score: 0.987\n", 144 | "Iteration 71: Losses: 443.753, Precision: 0.962, Recall: 1.000, F1-score: 0.981\n", 145 | "Iteration 72: Losses: 452.542, Precision: 0.947, Recall: 1.000, F1-score: 0.973\n", 146 | "Iteration 73: Losses: 450.764, Precision: 0.984, Recall: 1.000, F1-score: 0.992\n", 147 | "Iteration 74: Losses: 451.362, Precision: 0.975, Recall: 1.000, F1-score: 0.987\n", 148 | "Iteration 75: Losses: 391.670, Precision: 0.980, Recall: 1.000, F1-score: 0.990\n", 
149 | "Iteration 76: Losses: 428.624, Precision: 0.971, Recall: 1.000, F1-score: 0.985\n", 150 | "Iteration 77: Losses: 436.983, Precision: 0.986, Recall: 1.000, F1-score: 0.993\n", 151 | "Iteration 78: Losses: 443.188, Precision: 0.984, Recall: 1.000, F1-score: 0.992\n", 152 | "Iteration 79: Losses: 381.292, Precision: 0.985, Recall: 1.000, F1-score: 0.992\n", 153 | "Iteration 80: Losses: 369.396, Precision: 0.976, Recall: 1.000, F1-score: 0.988\n", 154 | "Iteration 81: Losses: 371.702, Precision: 0.982, Recall: 1.000, F1-score: 0.991\n", 155 | "Iteration 82: Losses: 389.415, Precision: 0.983, Recall: 1.000, F1-score: 0.992\n", 156 | "Iteration 83: Losses: 348.948, Precision: 0.973, Recall: 1.000, F1-score: 0.987\n", 157 | "Iteration 84: Losses: 380.144, Precision: 0.969, Recall: 1.000, F1-score: 0.984\n", 158 | "Iteration 85: Losses: 405.729, Precision: 0.965, Recall: 1.000, F1-score: 0.982\n", 159 | "Iteration 86: Losses: 375.188, Precision: 0.968, Recall: 1.000, F1-score: 0.984\n", 160 | "Iteration 87: Losses: 387.213, Precision: 0.972, Recall: 1.000, F1-score: 0.986\n", 161 | "Iteration 88: Losses: 370.844, Precision: 0.985, Recall: 1.000, F1-score: 0.993\n", 162 | "Iteration 89: Losses: 355.205, Precision: 0.979, Recall: 1.000, F1-score: 0.989\n", 163 | "Iteration 90: Losses: 360.518, Precision: 0.979, Recall: 1.000, F1-score: 0.989\n", 164 | "Iteration 91: Losses: 357.224, Precision: 0.957, Recall: 1.000, F1-score: 0.978\n", 165 | "Iteration 92: Losses: 367.662, Precision: 0.984, Recall: 1.000, F1-score: 0.992\n", 166 | "Iteration 93: Losses: 344.491, Precision: 0.985, Recall: 1.000, F1-score: 0.992\n", 167 | "Iteration 94: Losses: 354.471, Precision: 0.983, Recall: 1.000, F1-score: 0.991\n", 168 | "Iteration 95: Losses: 307.208, Precision: 0.980, Recall: 1.000, F1-score: 0.990\n", 169 | "Iteration 96: Losses: 319.637, Precision: 0.970, Recall: 1.000, F1-score: 0.985\n", 170 | "Iteration 97: Losses: 332.045, Precision: 0.991, Recall: 1.000, F1-score: 
0.995\n", 171 | "Iteration 98: Losses: 330.712, Precision: 0.982, Recall: 1.000, F1-score: 0.991\n", 172 | "Iteration 99: Losses: 357.091, Precision: 0.967, Recall: 1.000, F1-score: 0.983\n", 173 | "Iteration 100: Losses: 338.934, Precision: 0.964, Recall: 1.000, F1-score: 0.982\n", 174 | "Iteration 101: Losses: 324.915, Precision: 0.983, Recall: 1.000, F1-score: 0.991\n", 175 | "Iteration 102: Losses: 355.408, Precision: 0.977, Recall: 1.000, F1-score: 0.989\n", 176 | "Iteration 103: Losses: 367.118, Precision: 0.981, Recall: 1.000, F1-score: 0.990\n", 177 | "Iteration 104: Losses: 347.553, Precision: 0.987, Recall: 1.000, F1-score: 0.994\n", 178 | "Iteration 105: Losses: 325.273, Precision: 0.969, Recall: 1.000, F1-score: 0.984\n", 179 | "Iteration 106: Losses: 328.377, Precision: 0.986, Recall: 1.000, F1-score: 0.993\n", 180 | "Iteration 107: Losses: 292.850, Precision: 0.980, Recall: 1.000, F1-score: 0.990\n", 181 | "Iteration 108: Losses: 304.283, Precision: 0.970, Recall: 1.000, F1-score: 0.985\n", 182 | "Iteration 109: Losses: 313.686, Precision: 0.970, Recall: 1.000, F1-score: 0.985\n", 183 | "Iteration 110: Losses: 323.467, Precision: 0.979, Recall: 1.000, F1-score: 0.990\n", 184 | "Iteration 111: Losses: 312.275, Precision: 0.984, Recall: 1.000, F1-score: 0.992\n", 185 | "Iteration 112: Losses: 283.199, Precision: 0.983, Recall: 1.000, F1-score: 0.992\n", 186 | "Iteration 113: Losses: 290.470, Precision: 0.982, Recall: 1.000, F1-score: 0.991\n", 187 | "Iteration 114: Losses: 311.933, Precision: 0.979, Recall: 1.000, F1-score: 0.990\n", 188 | "Iteration 115: Losses: 341.079, Precision: 0.982, Recall: 1.000, F1-score: 0.991\n", 189 | "Iteration 116: Losses: 278.338, Precision: 0.984, Recall: 1.000, F1-score: 0.992\n", 190 | "Iteration 117: Losses: 332.445, Precision: 0.975, Recall: 1.000, F1-score: 0.987\n", 191 | "Iteration 118: Losses: 305.459, Precision: 0.977, Recall: 1.000, F1-score: 0.988\n", 192 | "Iteration 119: Losses: 305.398, Precision: 0.984, 
Recall: 1.000, F1-score: 0.992\n", 193 | "Iteration 120: Losses: 284.236, Precision: 0.987, Recall: 1.000, F1-score: 0.993\n", 194 | "Iteration 121: Losses: 311.096, Precision: 0.985, Recall: 1.000, F1-score: 0.992\n", 195 | "Iteration 122: Losses: 279.543, Precision: 0.978, Recall: 1.000, F1-score: 0.989\n", 196 | "Iteration 123: Losses: 281.207, Precision: 0.987, Recall: 1.000, F1-score: 0.993\n", 197 | "Iteration 124: Losses: 268.616, Precision: 0.970, Recall: 1.000, F1-score: 0.985\n", 198 | "Iteration 125: Losses: 265.647, Precision: 0.980, Recall: 1.000, F1-score: 0.990\n", 199 | "Iteration 126: Losses: 256.806, Precision: 0.970, Recall: 1.000, F1-score: 0.985\n", 200 | "Iteration 127: Losses: 279.410, Precision: 0.975, Recall: 1.000, F1-score: 0.987\n", 201 | "Iteration 128: Losses: 275.790, Precision: 0.981, Recall: 1.000, F1-score: 0.991\n", 202 | "Iteration 129: Losses: 261.689, Precision: 0.985, Recall: 1.000, F1-score: 0.992\n", 203 | "Iteration 130: Losses: 229.473, Precision: 0.977, Recall: 1.000, F1-score: 0.989\n", 204 | "Iteration 131: Losses: 269.852, Precision: 0.981, Recall: 1.000, F1-score: 0.991\n", 205 | "Iteration 132: Losses: 254.804, Precision: 0.982, Recall: 1.000, F1-score: 0.991\n", 206 | "Iteration 133: Losses: 292.561, Precision: 0.986, Recall: 1.000, F1-score: 0.993\n", 207 | "Iteration 134: Losses: 299.291, Precision: 0.988, Recall: 1.000, F1-score: 0.994\n", 208 | "Iteration 135: Losses: 263.173, Precision: 0.987, Recall: 1.000, F1-score: 0.993\n", 209 | "Iteration 136: Losses: 261.140, Precision: 0.985, Recall: 1.000, F1-score: 0.992\n", 210 | "Iteration 137: Losses: 275.786, Precision: 0.987, Recall: 1.000, F1-score: 0.993\n", 211 | "Iteration 138: Losses: 242.394, Precision: 0.985, Recall: 1.000, F1-score: 0.993\n", 212 | "Iteration 139: Losses: 272.513, Precision: 0.983, Recall: 1.000, F1-score: 0.991\n", 213 | "Iteration 140: Losses: 258.453, Precision: 0.976, Recall: 1.000, F1-score: 0.988\n", 214 | "Iteration 141: Losses: 
248.512, Precision: 0.972, Recall: 1.000, F1-score: 0.986\n", 215 | "Iteration 142: Losses: 242.651, Precision: 0.991, Recall: 1.000, F1-score: 0.996\n", 216 | "Iteration 143: Losses: 256.711, Precision: 0.977, Recall: 1.000, F1-score: 0.988\n", 217 | "Iteration 144: Losses: 247.515, Precision: 0.980, Recall: 1.000, F1-score: 0.990\n", 218 | "Iteration 145: Losses: 258.765, Precision: 0.991, Recall: 1.000, F1-score: 0.995\n", 219 | "Iteration 146: Losses: 243.136, Precision: 0.985, Recall: 1.000, F1-score: 0.993\n", 220 | "Iteration 147: Losses: 269.303, Precision: 0.984, Recall: 1.000, F1-score: 0.992\n", 221 | "Iteration 148: Losses: 267.897, Precision: 0.987, Recall: 1.000, F1-score: 0.993\n", 222 | "Iteration 149: Losses: 260.078, Precision: 0.990, Recall: 1.000, F1-score: 0.995\n", 223 | "Iteration 150: Losses: 234.311, Precision: 0.985, Recall: 1.000, F1-score: 0.993\n", 224 | "Iteration 151: Losses: 227.250, Precision: 0.992, Recall: 1.000, F1-score: 0.996\n", 225 | "Iteration 152: Losses: 242.148, Precision: 0.986, Recall: 1.000, F1-score: 0.993\n", 226 | "Iteration 153: Losses: 235.717, Precision: 0.973, Recall: 1.000, F1-score: 0.986\n", 227 | "Iteration 154: Losses: 362.445, Precision: 0.988, Recall: 1.000, F1-score: 0.994\n", 228 | "Iteration 155: Losses: 267.372, Precision: 0.985, Recall: 1.000, F1-score: 0.992\n", 229 | "Iteration 156: Losses: 236.276, Precision: 0.985, Recall: 1.000, F1-score: 0.993\n", 230 | "Iteration 157: Losses: 256.003, Precision: 0.985, Recall: 1.000, F1-score: 0.992\n", 231 | "Iteration 158: Losses: 219.543, Precision: 0.983, Recall: 1.000, F1-score: 0.992\n", 232 | "Iteration 159: Losses: 228.927, Precision: 0.991, Recall: 1.000, F1-score: 0.995\n", 233 | "Iteration 160: Losses: 198.271, Precision: 0.978, Recall: 1.000, F1-score: 0.989\n", 234 | "Iteration 161: Losses: 247.528, Precision: 0.988, Recall: 1.000, F1-score: 0.994\n", 235 | "Iteration 162: Losses: 228.176, Precision: 0.988, Recall: 1.000, F1-score: 0.994\n", 236 | 
"Iteration 163: Losses: 225.315, Precision: 0.991, Recall: 1.000, F1-score: 0.995\n", 237 | "Iteration 164: Losses: 214.318, Precision: 0.981, Recall: 1.000, F1-score: 0.991\n", 238 | "Iteration 165: Losses: 196.020, Precision: 0.987, Recall: 1.000, F1-score: 0.993\n", 239 | "Iteration 166: Losses: 234.131, Precision: 0.988, Recall: 1.000, F1-score: 0.994\n", 240 | "Iteration 167: Losses: 219.748, Precision: 0.985, Recall: 1.000, F1-score: 0.993\n", 241 | "Iteration 168: Losses: 214.162, Precision: 0.989, Recall: 1.000, F1-score: 0.994\n", 242 | "Iteration 169: Losses: 243.845, Precision: 0.991, Recall: 1.000, F1-score: 0.995\n", 243 | "Iteration 170: Losses: 177.961, Precision: 0.984, Recall: 1.000, F1-score: 0.992\n", 244 | "Iteration 171: Losses: 189.173, Precision: 0.980, Recall: 1.000, F1-score: 0.990\n", 245 | "Iteration 172: Losses: 220.402, Precision: 0.987, Recall: 1.000, F1-score: 0.993\n", 246 | "Iteration 173: Losses: 243.295, Precision: 0.988, Recall: 1.000, F1-score: 0.994\n", 247 | "Iteration 174: Losses: 227.411, Precision: 0.986, Recall: 1.000, F1-score: 0.993\n", 248 | "Iteration 175: Losses: 219.293, Precision: 0.987, Recall: 1.000, F1-score: 0.994\n", 249 | "Iteration 176: Losses: 190.667, Precision: 0.989, Recall: 1.000, F1-score: 0.994\n", 250 | "Iteration 177: Losses: 204.839, Precision: 0.991, Recall: 1.000, F1-score: 0.996\n", 251 | "Iteration 178: Losses: 207.279, Precision: 0.975, Recall: 1.000, F1-score: 0.987\n", 252 | "Iteration 179: Losses: 227.621, Precision: 0.982, Recall: 1.000, F1-score: 0.991\n", 253 | "Iteration 180: Losses: 192.607, Precision: 0.989, Recall: 1.000, F1-score: 0.994\n", 254 | "Iteration 181: Losses: 207.633, Precision: 0.991, Recall: 1.000, F1-score: 0.996\n", 255 | "Iteration 182: Losses: 210.554, Precision: 0.989, Recall: 1.000, F1-score: 0.995\n", 256 | "Iteration 183: Losses: 209.294, Precision: 0.987, Recall: 1.000, F1-score: 0.993\n", 257 | "Iteration 184: Losses: 195.729, Precision: 0.983, Recall: 1.000, 
F1-score: 0.991\n", 258 | "Iteration 185: Losses: 238.936, Precision: 0.995, Recall: 1.000, F1-score: 0.997\n", 259 | "Iteration 186: Losses: 196.003, Precision: 0.989, Recall: 1.000, F1-score: 0.994\n", 260 | "Iteration 187: Losses: 184.401, Precision: 0.985, Recall: 1.000, F1-score: 0.993\n", 261 | "Iteration 188: Losses: 278.631, Precision: 0.991, Recall: 1.000, F1-score: 0.995\n", 262 | "Iteration 189: Losses: 206.244, Precision: 0.988, Recall: 1.000, F1-score: 0.994\n", 263 | "Iteration 190: Losses: 206.792, Precision: 0.988, Recall: 1.000, F1-score: 0.994\n", 264 | "Iteration 191: Losses: 216.515, Precision: 0.983, Recall: 1.000, F1-score: 0.991\n", 265 | "Iteration 192: Losses: 178.613, Precision: 0.991, Recall: 1.000, F1-score: 0.995\n", 266 | "Iteration 193: Losses: 206.766, Precision: 0.985, Recall: 1.000, F1-score: 0.992\n", 267 | "Iteration 194: Losses: 192.838, Precision: 0.994, Recall: 1.000, F1-score: 0.997\n", 268 | "Iteration 195: Losses: 167.583, Precision: 0.988, Recall: 1.000, F1-score: 0.994\n", 269 | "Iteration 196: Losses: 174.022, Precision: 0.992, Recall: 1.000, F1-score: 0.996\n", 270 | "Iteration 197: Losses: 177.738, Precision: 0.991, Recall: 1.000, F1-score: 0.995\n", 271 | "Iteration 198: Losses: 206.007, Precision: 0.986, Recall: 1.000, F1-score: 0.993\n", 272 | "Iteration 199: Losses: 194.892, Precision: 0.990, Recall: 1.000, F1-score: 0.995\n", 273 | "Iteration 200: Losses: 182.221, Precision: 0.990, Recall: 1.000, F1-score: 0.995\n", 274 | "Iteration 201: Losses: 186.950, Precision: 0.992, Recall: 1.000, F1-score: 0.996\n", 275 | "Iteration 202: Losses: 203.436, Precision: 0.990, Recall: 1.000, F1-score: 0.995\n", 276 | "Iteration 203: Losses: 180.020, Precision: 0.990, Recall: 1.000, F1-score: 0.995\n", 277 | "Iteration 204: Losses: 156.148, Precision: 0.987, Recall: 1.000, F1-score: 0.993\n", 278 | "Iteration 205: Losses: 192.183, Precision: 0.989, Recall: 1.000, F1-score: 0.994\n", 279 | "Iteration 206: Losses: 179.260, 
Precision: 0.990, Recall: 1.000, F1-score: 0.995\n", 280 | "Iteration 207: Losses: 159.405, Precision: 0.985, Recall: 1.000, F1-score: 0.992\n", 281 | "Iteration 208: Losses: 171.970, Precision: 0.995, Recall: 1.000, F1-score: 0.998\n", 282 | "Iteration 209: Losses: 189.327, Precision: 0.986, Recall: 1.000, F1-score: 0.993\n", 283 | "Iteration 210: Losses: 186.480, Precision: 0.991, Recall: 1.000, F1-score: 0.996\n", 284 | "Iteration 211: Losses: 161.660, Precision: 0.991, Recall: 1.000, F1-score: 0.996\n", 285 | "Iteration 212: Losses: 200.974, Precision: 0.987, Recall: 1.000, F1-score: 0.994\n", 286 | "Iteration 213: Losses: 179.066, Precision: 0.988, Recall: 1.000, F1-score: 0.994\n", 287 | "Iteration 214: Losses: 179.727, Precision: 0.992, Recall: 1.000, F1-score: 0.996\n", 288 | "Iteration 215: Losses: 196.743, Precision: 0.990, Recall: 1.000, F1-score: 0.995\n", 289 | "Iteration 216: Losses: 179.379, Precision: 0.991, Recall: 1.000, F1-score: 0.995\n", 290 | "Iteration 217: Losses: 154.307, Precision: 0.984, Recall: 1.000, F1-score: 0.992\n", 291 | "Iteration 218: Losses: 182.968, Precision: 0.987, Recall: 1.000, F1-score: 0.994\n", 292 | "Iteration 219: Losses: 171.672, Precision: 0.988, Recall: 1.000, F1-score: 0.994\n", 293 | "Iteration 220: Losses: 193.198, Precision: 0.986, Recall: 1.000, F1-score: 0.993\n", 294 | "Iteration 221: Losses: 173.074, Precision: 0.992, Recall: 1.000, F1-score: 0.996\n", 295 | "Iteration 222: Losses: 190.361, Precision: 0.990, Recall: 1.000, F1-score: 0.995\n", 296 | "Iteration 223: Losses: 180.024, Precision: 0.985, Recall: 1.000, F1-score: 0.993\n", 297 | "Iteration 224: Losses: 177.070, Precision: 0.991, Recall: 1.000, F1-score: 0.996\n", 298 | "Iteration 225: Losses: 178.802, Precision: 0.992, Recall: 1.000, F1-score: 0.996\n", 299 | "Iteration 226: Losses: 159.339, Precision: 0.989, Recall: 1.000, F1-score: 0.994\n", 300 | "Iteration 227: Losses: 166.651, Precision: 0.993, Recall: 1.000, F1-score: 0.996\n", 301 | 
"Iteration 228: Losses: 154.083, Precision: 0.991, Recall: 1.000, F1-score: 0.996\n", 302 | "Iteration 229: Losses: 152.684, Precision: 0.990, Recall: 1.000, F1-score: 0.995\n", 303 | "Iteration 230: Losses: 179.814, Precision: 0.988, Recall: 1.000, F1-score: 0.994\n", 304 | "Iteration 231: Losses: 168.555, Precision: 0.985, Recall: 1.000, F1-score: 0.993\n", 305 | "Iteration 232: Losses: 182.348, Precision: 0.988, Recall: 1.000, F1-score: 0.994\n", 306 | "Iteration 233: Losses: 175.933, Precision: 0.985, Recall: 1.000, F1-score: 0.993\n", 307 | "Iteration 234: Losses: 168.575, Precision: 0.992, Recall: 1.000, F1-score: 0.996\n", 308 | "Iteration 235: Losses: 143.038, Precision: 0.991, Recall: 1.000, F1-score: 0.995\n", 309 | "Iteration 236: Losses: 161.567, Precision: 0.990, Recall: 1.000, F1-score: 0.995\n", 310 | "Iteration 237: Losses: 168.450, Precision: 0.994, Recall: 1.000, F1-score: 0.997\n", 311 | "Iteration 238: Losses: 166.903, Precision: 0.993, Recall: 1.000, F1-score: 0.997\n", 312 | "Iteration 239: Losses: 162.112, Precision: 0.988, Recall: 1.000, F1-score: 0.994\n", 313 | "Iteration 240: Losses: 158.231, Precision: 0.990, Recall: 1.000, F1-score: 0.995\n", 314 | "Iteration 241: Losses: 155.303, Precision: 0.991, Recall: 1.000, F1-score: 0.995\n", 315 | "Iteration 242: Losses: 152.772, Precision: 0.989, Recall: 1.000, F1-score: 0.995\n", 316 | "Iteration 243: Losses: 159.833, Precision: 0.991, Recall: 1.000, F1-score: 0.996\n", 317 | "Iteration 244: Losses: 143.507, Precision: 0.991, Recall: 1.000, F1-score: 0.995\n", 318 | "Iteration 245: Losses: 182.271, Precision: 0.991, Recall: 1.000, F1-score: 0.996\n", 319 | "Iteration 246: Losses: 158.287, Precision: 0.989, Recall: 1.000, F1-score: 0.995\n", 320 | "Iteration 247: Losses: 174.304, Precision: 0.991, Recall: 1.000, F1-score: 0.995\n", 321 | "Iteration 248: Losses: 166.270, Precision: 0.991, Recall: 1.000, F1-score: 0.995\n", 322 | "Iteration 249: Losses: 171.564, Precision: 0.993, Recall: 1.000, 
F1-score: 0.997\n", 323 | "Iteration 250: Losses: 158.631, Precision: 0.991, Recall: 1.000, F1-score: 0.996\n", 324 | "Iteration 251: Losses: 161.692, Precision: 0.992, Recall: 1.000, F1-score: 0.996\n", 325 | "Iteration 252: Losses: 154.421, Precision: 0.991, Recall: 1.000, F1-score: 0.995\n", 326 | "Iteration 253: Losses: 164.905, Precision: 0.992, Recall: 1.000, F1-score: 0.996\n", 327 | "Iteration 254: Losses: 152.181, Precision: 0.990, Recall: 1.000, F1-score: 0.995\n", 328 | "Iteration 255: Losses: 141.292, Precision: 0.983, Recall: 1.000, F1-score: 0.992\n", 329 | "Iteration 256: Losses: 169.921, Precision: 0.995, Recall: 1.000, F1-score: 0.997\n", 330 | "Iteration 257: Losses: 149.973, Precision: 0.988, Recall: 1.000, F1-score: 0.994\n", 331 | "Iteration 258: Losses: 124.414, Precision: 0.991, Recall: 1.000, F1-score: 0.995\n", 332 | "Iteration 259: Losses: 151.241, Precision: 0.993, Recall: 1.000, F1-score: 0.997\n", 333 | "Iteration 260: Losses: 172.780, Precision: 0.992, Recall: 1.000, F1-score: 0.996\n", 334 | "Iteration 261: Losses: 159.021, Precision: 0.993, Recall: 1.000, F1-score: 0.996\n", 335 | "Iteration 262: Losses: 144.074, Precision: 0.986, Recall: 1.000, F1-score: 0.993\n", 336 | "Iteration 263: Losses: 161.403, Precision: 0.987, Recall: 1.000, F1-score: 0.993\n", 337 | "Iteration 264: Losses: 128.211, Precision: 0.991, Recall: 1.000, F1-score: 0.995\n", 338 | "Iteration 265: Losses: 146.785, Precision: 0.993, Recall: 1.000, F1-score: 0.997\n", 339 | "Iteration 266: Losses: 149.536, Precision: 0.990, Recall: 1.000, F1-score: 0.995\n", 340 | "Iteration 267: Losses: 160.178, Precision: 0.991, Recall: 1.000, F1-score: 0.995\n", 341 | "Iteration 268: Losses: 137.646, Precision: 0.992, Recall: 1.000, F1-score: 0.996\n", 342 | "Iteration 269: Losses: 158.576, Precision: 0.989, Recall: 1.000, F1-score: 0.995\n", 343 | "Iteration 270: Losses: 155.475, Precision: 0.990, Recall: 1.000, F1-score: 0.995\n", 344 | "Iteration 271: Losses: 175.135, 
Precision: 0.996, Recall: 1.000, F1-score: 0.998\n", 345 | "Iteration 272: Losses: 146.622, Precision: 0.991, Recall: 1.000, F1-score: 0.996\n", 346 | "Iteration 273: Losses: 162.552, Precision: 0.985, Recall: 1.000, F1-score: 0.993\n", 347 | "Iteration 274: Losses: 132.417, Precision: 0.987, Recall: 1.000, F1-score: 0.993\n", 348 | "Iteration 275: Losses: 162.806, Precision: 0.993, Recall: 1.000, F1-score: 0.996\n", 349 | "Iteration 276: Losses: 148.515, Precision: 0.990, Recall: 1.000, F1-score: 0.995\n", 350 | "Iteration 277: Losses: 150.103, Precision: 0.991, Recall: 1.000, F1-score: 0.995\n", 351 | "Iteration 278: Losses: 128.127, Precision: 0.991, Recall: 1.000, F1-score: 0.995\n", 352 | "Iteration 279: Losses: 164.356, Precision: 0.988, Recall: 1.000, F1-score: 0.994\n", 353 | "Iteration 280: Losses: 130.868, Precision: 0.989, Recall: 1.000, F1-score: 0.995\n", 354 | "Iteration 281: Losses: 166.330, Precision: 0.991, Recall: 1.000, F1-score: 0.996\n", 355 | "Iteration 282: Losses: 144.873, Precision: 0.995, Recall: 1.000, F1-score: 0.998\n", 356 | "Iteration 283: Losses: 179.280, Precision: 0.993, Recall: 1.000, F1-score: 0.996\n", 357 | "Iteration 284: Losses: 145.830, Precision: 0.989, Recall: 1.000, F1-score: 0.995\n", 358 | "Iteration 285: Losses: 147.223, Precision: 0.993, Recall: 1.000, F1-score: 0.996\n", 359 | "Iteration 286: Losses: 134.719, Precision: 0.995, Recall: 1.000, F1-score: 0.997\n", 360 | "Iteration 287: Losses: 148.663, Precision: 0.989, Recall: 1.000, F1-score: 0.994\n", 361 | "Iteration 288: Losses: 116.841, Precision: 0.996, Recall: 1.000, F1-score: 0.998\n", 362 | "Iteration 289: Losses: 150.505, Precision: 0.991, Recall: 1.000, F1-score: 0.995\n", 363 | "Iteration 290: Losses: 117.468, Precision: 0.990, Recall: 1.000, F1-score: 0.995\n", 364 | "Iteration 291: Losses: 151.356, Precision: 0.990, Recall: 1.000, F1-score: 0.995\n", 365 | "Iteration 292: Losses: 137.203, Precision: 0.995, Recall: 1.000, F1-score: 0.997\n", 366 | 
"Iteration 293: Losses: 151.406, Precision: 0.990, Recall: 1.000, F1-score: 0.995\n", 367 | "Iteration 294: Losses: 163.338, Precision: 0.995, Recall: 1.000, F1-score: 0.998\n", 368 | "Iteration 295: Losses: 146.029, Precision: 0.987, Recall: 1.000, F1-score: 0.994\n", 369 | "Iteration 296: Losses: 111.953, Precision: 0.995, Recall: 1.000, F1-score: 0.998\n", 370 | "Iteration 297: Losses: 128.676, Precision: 0.996, Recall: 1.000, F1-score: 0.998\n", 371 | "Iteration 298: Losses: 161.734, Precision: 0.988, Recall: 1.000, F1-score: 0.994\n", 372 | "Iteration 299: Losses: 152.918, Precision: 0.990, Recall: 1.000, F1-score: 0.995\n", 373 | "Iteration 300: Losses: 176.403, Precision: 0.991, Recall: 1.000, F1-score: 0.995\n" 374 | ] 375 | } 376 | ], 377 | "source": [ 378 | "# Initialize the optimizer\n", 379 | "optimizer = nlp.begin_training()\n", 380 | "\n", 381 | "# Training loop\n", 382 | "n_iter = 300\n", 383 | "for itn in range(n_iter):\n", 384 | " random.shuffle(train_examples)\n", 385 | " losses = {}\n", 386 | " # Batch up the examples using spaCy's minibatch\n", 387 | " batches = minibatch(train_examples, size=compounding(4.0, 32.0, 1.001))\n", 388 | " for batch in batches:\n", 389 | " nlp.update(\n", 390 | " batch, # batch of Example objects\n", 391 | " drop=0.2, # dropout - make it harder to memorise data\n", 392 | " sgd=optimizer, # callable to update weights\n", 393 | " losses=losses\n", 394 | " )\n", 395 | " scores = nlp.evaluate(train_examples)\n", 396 | " ents_p = scores[\"ents_p\"]\n", 397 | " ents_r = scores[\"ents_r\"]\n", 398 | " ents_f = scores[\"ents_f\"]\n", 399 | "\n", 400 | " print(f\"Iteration {itn+1}: Losses: {losses['ner']:.3f}, Precision: {ents_p:.3f}, Recall: {ents_r:.3f}, F1-score: {ents_f:.3f}\")\n", 401 | "\n", 402 | "# Save the model\n", 403 | "nlp.to_disk(\"ner_model\")" 404 | ] 405 | } 406 | ], 407 | "metadata": { 408 | "kernelspec": { 409 | "display_name": "Python 3", 410 | "language": "python", 411 | "name": "python3" 412 | }, 413 | 
"language_info": { 414 | "codemirror_mode": { 415 | "name": "ipython", 416 | "version": 3 417 | }, 418 | "file_extension": ".py", 419 | "mimetype": "text/x-python", 420 | "name": "python", 421 | "nbconvert_exporter": "python", 422 | "pygments_lexer": "ipython3", 423 | "version": "3.11.7" 424 | } 425 | }, 426 | "nbformat": 4, 427 | "nbformat_minor": 2 428 | } 429 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 |

📑ATS Scoring System📑

3 |
4 | 5 | ## 🌟 Project Overview 6 | The ATS Scoring System is designed to parse resumes, extract entities and keywords, and score resumes based on the found keywords. The system utilizes a spaCy model trained on a [Kaggle dataset](https://www.kaggle.com/datasets/dataturks/resume-entities-for-ner) to identify and score important keywords in resumes. Additionally, it provides suggestions for improvement and displays extracted entities. The entire application is built using Streamlit, allowing users to interact with the system through a web interface. 7 | 8 | ## ✨ Features 9 | - 📄 **Resume Parsing**: Extract text from various formats (PDF, DOCX, TXT). 10 | - 🔍 **Entity Extraction**: Identify and extract entities from resumes using a spaCy model. 11 | - 💯 **Keyword Scoring**: Evaluate the resume based on the presence of relevant keywords and provide a score. 12 | - 💡 **Suggestions**: Offer suggestions to improve the resume based on the extracted entities and keywords. 13 | - 📊 **Visualization**: Display the extracted entities and keywords in a user-friendly format. 14 | 15 | ## 🛠️ Technologies Used 16 | - 🧠 **spaCy**: For natural language processing and entity recognition. 17 | - 🐼 **pandas**: For handling and processing data. 18 | - 📚 **pdfplumber**: To extract text from PDF files. 19 | - 📝 **docx**: To extract text from DOCX files. 20 | - 🌐 **Streamlit**: To create a web-based interface for interacting with the ATS scoring system. 21 | 22 | ## 🚀 Installation 23 | 1. Clone the repository: 24 | ```bash 25 | git clone https://github.com/miteshgupta07/ATS-Scoring-System.git 26 | ``` 27 | 2. Navigate to the project directory: 28 | 29 | ```bash 30 | cd ATS-Scoring-System 31 | ``` 32 | 3. Install the required packages: 33 | ```bash 34 | pip install -r requirement.txt 35 | ``` 36 | ## 🖥️ Usage 37 | 1. Run the Streamlit app: 38 | 39 | ```bash 40 | streamlit run app.py 41 | ``` 42 | 43 | 2. Upload your resume in PDF, DOCX, or TXT format. 44 | 45 | 3. 
Enter the job description for comparison. 46 | 47 | 4. View the ATS score, Suggestions, and extracted entities. 48 | 49 | ## 📜License 50 | This project is licensed under the MIT License - see the [LICENSE](https://github.com/miteshgupta07/ATS-Scoring-System/blob/main/LICENSE) file for details. 51 | -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import pdfplumber 3 | from docx import Document 4 | import spacy 5 | import re 6 | import pandas as pd 7 | import time 8 | 9 | st.write("# **Applicant Tracking System (ATS)📑**") 10 | 11 | uploaded_resume=st.file_uploader("Upload CV/Resume",['pdf','docx','txt']) 12 | job_description=st.text_area('Enter Job Description') 13 | 14 | def extract_text_from_pdf(pdf_file): 15 | text = "" 16 | with pdfplumber.open(pdf_file) as pdf: 17 | 18 | for page in pdf.pages: 19 | text += page.extract_text() 20 | return text 21 | 22 | def extract_text_from_txt(text_file): 23 | with open(text_file,'r') as txt_file: 24 | text=txt_file.read() 25 | return text 26 | 27 | def extract_text_from_docx(word_file): 28 | doc = Document(word_file) 29 | text = "" 30 | for para in doc.paragraphs: 31 | text += para.text 32 | return text 33 | 34 | def preprocess_text(text): 35 | replacements = { 36 | '–': '–', # en dash 37 | '—': '—', # em dash 38 | '‘': '‘', # left single quotation mark 39 | '’': '’', # right single quotation mark 40 | '“': '“', # left double quotation mark 41 | 'â€�': '”', # right double quotation mark 42 | '•': '•', # bullet point 43 | '…': '…', # ellipsis 44 | 'é': 'é', # é 45 | 'è': 'è', # è 46 | 'â': 'â', # â 47 | 'ô': 'ô', # ô 48 | 'ü': 'ü', # ü 49 | 'ñ': 'ñ', # ñ 50 | 'Ë': 'Ë', # Ë 51 | 'á': 'á', # á 52 | 'ú': 'ú', # ú 53 | 'î': 'î', # î 54 | 'À': 'À', # À 55 | 'ì': 'ì', # ì 56 | 'Ù': 'Ù', # Ù 57 | 'Í': 'Í', # Í 58 | 'Ö': 'Ö', # Ö 59 | 'Á': 'Á', # Á 60 | 'ÃŒ': 'Ì', # Ì 61 | 'É': 'É', # 
É 62 | 'Ï': 'Ï', # Ï 63 | 'ë': 'ë', # ë 64 | 'ó': 'ó', # ó 65 | 'Þ': 'Þ', # Þ 66 | 'Ú': 'Ú', # Ú 67 | 'æ': 'æ', # æ 68 | 'Ø': 'Ø', # Ø 69 | 'ß': 'ß', # ß 70 | 'ð': 'ð', # ð 71 | 'í': 'í', # í 72 | 'õ': 'õ', # õ 73 | 'Ã¥': 'å', # å 74 | 'ï': 'ï', # ï 75 | 'ã': 'ã', # ã 76 | 'ä': 'ä', # ä 77 | 'ö': 'ö', # ö 78 | 'ü': 'ü', # ü 79 | '€': '€', # Euro sign 80 | 'â„¢': '™', # Trademark sign 81 | '∂': '∂', # Partial differential 82 | '∀': '∀', # For all 83 | '∈': '∈', # Element of 84 | '∃': '∃', # There exists 85 | '∅': '∅', # Empty set 86 | '∆': '∆', # Increment 87 | '∇': '∇', # Nabla 88 | '∑': '∑', # N-ary summation 89 | '∗': '∗', # Asterisk operator 90 | '∘': '∘', # Ring operator 91 | '∙': '∙', # Bullet operator 92 | '√': '√', # Square root 93 | '∛': '∧', # Logical and 94 | '∥': '∥', # Parallel to 95 | '∼': '∼', # Tilde operator 96 | '∾': '≀', # Wreath product 97 | '∿': '≁', # Not tilde 98 | '∋': '⊂', # Subset of 99 | '∛': '⊃', # Superset of 100 | 'â‰': '≠', # Not equal to 101 | '≤': '≤', # Less-than or equal to 102 | '≥': '≥', # Greater-than or equal to 103 | '≤': '≤', # Less-than or equal to 104 | 'â‰≥': '≥', # Greater-than or equal to 105 | '≲': '²', # Superscript two 106 | '≳': '³', # Superscript three 107 | '≮': '≡', # Identical to 108 | '≳': '≥', # Greater-than or equal to 109 | '≯': '≣', # Equivalent to 110 | '≤': '≤', # Less-than or equal to 111 | '≳': '≥', # Greater-than or equal to 112 | '≮': '≡', # Identical to 113 | } 114 | 115 | # Replace the characters in the text 116 | for wrong_char, correct_char in replacements.items(): 117 | text = text.replace(wrong_char, correct_char) 118 | 119 | # Remove newline characters and any extraneous whitespace 120 | text = re.sub(r'\n+', ' ', text) # Replace multiple newlines with a single space 121 | text = re.sub(r'\s+', ' ', text) # Replace multiple spaces with a single space 122 | processed_text = text.strip() # Remove leading and trailing whitespace 123 | 124 | return processed_text 125 | 126 | def 
extract_entities(text): 127 | nlp=spacy.load('./ner_model') 128 | doc=nlp(text) 129 | data = [] 130 | for ent in doc.ents: 131 | data.append([ent.label_, ent.text]) 132 | 133 | return data 134 | 135 | def find_not_found_keywords(resume_keywords,job_keywords): 136 | nf_keywords=[] 137 | 138 | for keywords in resume_keywords: 139 | if keywords[0] not in job_keywords: 140 | nf_keywords.append(keywords[0]) 141 | return nf_keywords 142 | 143 | def show_ATS_score(text,job_description): 144 | resume_keywords=extract_entities(text) 145 | job_keywords=extract_entities(job_description) 146 | resumek_count=len(resume_keywords) 147 | jobk_count=len(job_keywords) 148 | if jobk_count!=0: 149 | score=(resumek_count/jobk_count)*100 150 | score=round(score,2) 151 | else: 152 | score=-1 153 | not_found_keywords=find_not_found_keywords(resume_keywords,job_keywords) 154 | return score,not_found_keywords 155 | 156 | def show_entities(text): 157 | data=extract_entities(text) 158 | df = pd.DataFrame(data, columns=["Label", "Entity"]) 159 | st.table(df) 160 | 161 | def show_suggestion(keywords): 162 | improvement_suggestions = { 163 | "NAME": "• **Name:** Including your full name helps recruiters easily identify you.", 164 | "EMAIL_ADDRESS": "• **Email Address:** Providing an email address makes it easier for recruiters to contact you.", 165 | "LOCATION": "• **Location:** Including your location helps recruiters find candidates in specific geographic areas.", 166 | "DEGREE": "• **Degree:** Listing your degree demonstrates your educational qualifications.", 167 | "GRADUATION_YEAR": "• **Graduation Year:** Including your graduation year helps recruiters understand your experience level.", 168 | "COLLEGE_NAME": "• **College/University Name:** Mentioning your college or university name adds credibility and context to your educational background.", 169 | "SKILLS": "• **Skills:** Highlighting your skills shows your expertise and can make your resume stand out.", 170 | "WORK_EXPERIENCE": "• 
**Work Experience:** Detailing your previous job roles and responsibilities provides insight into your professional background.", 171 | "CERTIFICATIONS": "• **Certifications:** Adding relevant certifications can showcase your additional qualifications and specialized knowledge.", 172 | "PROJECTS": "• **Projects:** Describing significant projects you have worked on can demonstrate your practical experience and problem-solving abilities.", 173 | "LANGUAGES": "• **Languages:** Including languages you are proficient in can be advantageous, especially for roles requiring multilingual skills.", 174 | "LINKEDIN_PROFILE": "• **LinkedIn Profile:** Providing a link to your LinkedIn profile can give recruiters a more comprehensive view of your professional network and endorsements.", 175 | "PROFESSIONAL_SUMMARY": "• **Professional Summary:** Writing a concise professional summary at the top of your resume can quickly convey your key strengths and career objectives.", 176 | "DESIGNATION": "• **Designation:** Specifying your current or desired job title helps recruiters understand your career level and aspirations." 
177 | } 178 | 179 | st.write("#### You can add the following details to improve your score:") 180 | for i in keywords: 181 | st.write(improvement_suggestions[i]) 182 | 183 | def main(uploaded_resume,job_description): 184 | if uploaded_resume is not None: 185 | doc_type=uploaded_resume.type 186 | if doc_type=='invalid': 187 | st.error("CV/Resume should be in PDF/DOCX/TXT format") 188 | else: 189 | if doc_type=='application/pdf': 190 | text=extract_text_from_pdf(uploaded_resume) 191 | elif doc_type=='text/plain': 192 | text=extract_text_from_docx(uploaded_resume) 193 | else: 194 | text=extract_text_from_docx(uploaded_resume) 195 | 196 | if text and not job_description: 197 | if st.checkbox('Show Entities'): 198 | st.write("### Resume Keywords") 199 | show_entities(text) 200 | 201 | if text and job_description: 202 | text=preprocess_text(text) 203 | job_description=preprocess_text(job_description) 204 | 205 | progress_bar=st.progress(0) 206 | for i in range(100): 207 | time.sleep(0.01) 208 | progress_bar.progress(i + 1) 209 | time.sleep(1) 210 | 211 | ats_score,not_found_keywords=show_ATS_score(text,job_description) 212 | 213 | if ats_score==-1: 214 | st.warning("No Keywords Found in Job Description") 215 | else: 216 | if 0=3.7.5,<3.8.0", 6 | "description":"", 7 | "author":"", 8 | "email":"", 9 | "url":"", 10 | "license":"", 11 | "spacy_git_version":"a6d0fc360", 12 | "vectors":{ 13 | "width":0, 14 | "vectors":0, 15 | "keys":0, 16 | "name":null, 17 | "mode":"default" 18 | }, 19 | "labels":{ 20 | "ner":[ 21 | "COLLEGE_NAME", 22 | "COMPANIES_WORKED_AT", 23 | "DEGREE", 24 | "DESIGNATION", 25 | "EMAIL_ADDRESS", 26 | "GRADUATION_YEAR", 27 | "LOCATION", 28 | "NAME", 29 | "SKILLS", 30 | "UNKNOWN", 31 | "YEARS_OF_EXPERIENCE" 32 | ] 33 | }, 34 | "pipeline":[ 35 | "ner" 36 | ], 37 | "components":[ 38 | "ner" 39 | ], 40 | "disabled":[ 41 | 42 | ] 43 | } -------------------------------------------------------------------------------- /ner_model/ner/cfg: 
-------------------------------------------------------------------------------- 1 | { 2 | "moves":null, 3 | "update_with_oracle_cut_size":100, 4 | "multitasks":[ 5 | 6 | ], 7 | "min_action_freq":1, 8 | "learn_tokens":false, 9 | "beam_width":1, 10 | "beam_density":0.0, 11 | "beam_update_prob":0.0, 12 | "incorrect_spans_key":null 13 | } -------------------------------------------------------------------------------- /ner_model/ner/model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/miteshgupta07/ATS-Scoring-System/49ec0e353cde20189c49cd4c6a4cb64a3b926f2d/ner_model/ner/model -------------------------------------------------------------------------------- /ner_model/ner/moves: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/miteshgupta07/ATS-Scoring-System/49ec0e353cde20189c49cd4c6a4cb64a3b926f2d/ner_model/ner/moves -------------------------------------------------------------------------------- /ner_model/tokenizer: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/miteshgupta07/ATS-Scoring-System/49ec0e353cde20189c49cd4c6a4cb64a3b926f2d/ner_model/tokenizer -------------------------------------------------------------------------------- /ner_model/vocab/key2row: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/miteshgupta07/ATS-Scoring-System/49ec0e353cde20189c49cd4c6a4cb64a3b926f2d/ner_model/vocab/key2row -------------------------------------------------------------------------------- /ner_model/vocab/lookups.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/miteshgupta07/ATS-Scoring-System/49ec0e353cde20189c49cd4c6a4cb64a3b926f2d/ner_model/vocab/lookups.bin 
-------------------------------------------------------------------------------- /ner_model/vocab/vectors: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/miteshgupta07/ATS-Scoring-System/49ec0e353cde20189c49cd4c6a4cb64a3b926f2d/ner_model/vocab/vectors -------------------------------------------------------------------------------- /ner_model/vocab/vectors.cfg: -------------------------------------------------------------------------------- 1 | { 2 | "mode":"default" 3 | } -------------------------------------------------------------------------------- /requirement.txt: -------------------------------------------------------------------------------- 1 | streamlit==1.36.0 2 | pdfplumber==0.11.2 3 | python-docx==1.1.2 4 | spacy==3.7.5 5 | pandas==2.2.2 6 | --------------------------------------------------------------------------------