├── .gitignore
├── README.md
├── requirements.txt
└── run.py


/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# dotenv
.env

# virtualenv
.venv
venv/
ENV/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
mypy_cache/

*.xml
*.model

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Performing Sequence Labelling using CRF in Python

This is an example of using [pycrfsuite](https://github.com/scrapinghub/python-crfsuite) to train a CRF classifier for named entity recognition. For a more detailed explanation, refer to the blog post at [http://www.albertauyeung.com/post/python-sequence-labelling-with-crf/](http://www.albertauyeung.com/post/python-sequence-labelling-with-crf/).

Before running the script, download the data file `reuters.xml` from [https://github.com/AKSW/n3-collection/blob/master/reuters.xml](https://github.com/AKSW/n3-collection/blob/master/reuters.xml), and install the dependencies:

    $ sudo pip install -r requirements.txt
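The script also relies on NLTK's part-of-speech tagger, whose model data is not installed by `pip`. If it is missing, it can be fetched once before the first run (this assumes the default `averaged_perceptron_tagger` resource used by `nltk.pos_tag`):

    $ python -c "import nltk; nltk.download('averaged_perceptron_tagger')"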
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
nltk == 3.2.2
numpy == 1.12.1
beautifulsoup4 == 4.6.0
python_crfsuite == 0.9.2
scikit_learn == 0.18.1

--------------------------------------------------------------------------------
/run.py:
--------------------------------------------------------------------------------
import codecs
import numpy as np
import nltk
import pycrfsuite
from bs4 import BeautifulSoup as bs
from bs4.element import Tag
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Read the data file and parse the XML
with codecs.open("reuters.xml", "r", "utf-8") as infile:
    soup = bs(infile, "html5lib")

docs = []
for elem in soup.find_all("document"):
    texts = []

    # Loop through each child of the element under "textwithnamedentities"
    for c in elem.find("textwithnamedentities").children:
        if isinstance(c, Tag):
            if c.name == "namedentityintext":
                label = "N"  # part of a named entity
            else:
                label = "I"  # irrelevant word
            for w in c.text.split(" "):
                if len(w) > 0:
                    texts.append((w, label))
    docs.append(texts)


data = []
for i, doc in enumerate(docs):

    # Obtain the list of tokens in the document
    tokens = [t for t, label in doc]

    # Perform POS tagging
    tagged = nltk.pos_tag(tokens)

    # Take the word, the POS tag, and its label
    data.append([(w, pos, label) for (w, label), (word, pos) in zip(doc, tagged)])


def word2features(doc, i):
    word = doc[i][0]
    postag = doc[i][1]

    # Common features for all words
    features = [
        'bias',
        'word.lower=' + word.lower(),
        'word[-3:]=' + word[-3:],
        'word[-2:]=' + word[-2:],
        'word.isupper=%s' % word.isupper(),
        'word.istitle=%s' % word.istitle(),
        'word.isdigit=%s' % word.isdigit(),
        'postag=' + postag
    ]

    # Features for words that are not
    # at the beginning of a document
    if i > 0:
        word1 = doc[i-1][0]
        postag1 = doc[i-1][1]
        features.extend([
            '-1:word.lower=' + word1.lower(),
            '-1:word.istitle=%s' % word1.istitle(),
            '-1:word.isupper=%s' % word1.isupper(),
            '-1:word.isdigit=%s' % word1.isdigit(),
            '-1:postag=' + postag1
        ])
    else:
        # Indicate that it is the 'beginning of a document'
        features.append('BOS')

    # Features for words that are not
    # at the end of a document
    if i < len(doc)-1:
        word1 = doc[i+1][0]
        postag1 = doc[i+1][1]
        features.extend([
            '+1:word.lower=' + word1.lower(),
            '+1:word.istitle=%s' % word1.istitle(),
            '+1:word.isupper=%s' % word1.isupper(),
            '+1:word.isdigit=%s' % word1.isdigit(),
            '+1:postag=' + postag1
        ])
    else:
        # Indicate that it is the 'end of a document'
        features.append('EOS')

    return features


# A function for extracting features from documents
def extract_features(doc):
    return [word2features(doc, i) for i in range(len(doc))]

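# For illustration, a hypothetical two-token document (not taken from the
# dataset) and the beginning of the feature list that word2features
# produces for its first token:
#
#   word2features([("Apple", "NNP", "N"), ("shares", "NNS", "I")], 0)
#   => ['bias', 'word.lower=apple', 'word[-3:]=ple', 'word[-2:]=le',
#       'word.isupper=False', 'word.istitle=True', 'word.isdigit=False',
#       'postag=NNP', 'BOS', '+1:word.lower=shares', ...]
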
# A function for generating the list of labels for each document
def get_labels(doc):
    return [label for (token, postag, label) in doc]


X = [extract_features(doc) for doc in data]
y = [get_labels(doc) for doc in data]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

trainer = pycrfsuite.Trainer(verbose=True)

# Submit training data to the trainer
for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)

# Set the parameters of the model
trainer.set_params({
    # coefficient for L1 penalty
    'c1': 0.1,

    # coefficient for L2 penalty
    'c2': 0.01,

    # maximum number of iterations
    'max_iterations': 200,

    # whether to include transitions that
    # are possible, but not observed
    'feature.possible_transitions': True
})

# Provide a file name as a parameter to the train function, such that
# the model will be saved to the file when training is finished
trainer.train('crf.model')

# Generate predictions
tagger = pycrfsuite.Tagger()
tagger.open('crf.model')
y_pred = [tagger.tag(xseq) for xseq in X_test]

# Let's take a look at one example sequence in the testing set
# (each word is recovered from the 'word.lower=' feature of its token)
i = 12
for pred, word in zip(y_pred[i], [f[1].split("=")[1] for f in X_test[i]]):
    print("%s (%s)" % (word, pred))

# Create a mapping of labels to indices
labels = {"N": 1, "I": 0}

# Convert the sequences of tags into a 1-dimensional array
predictions = np.array([labels[tag] for row in y_pred for tag in row])
truths = np.array([labels[tag] for row in y_test for tag in row])

# Print out the classification report
print(classification_report(
    truths, predictions,
    target_names=["I", "N"]))

--------------------------------------------------------------------------------
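As a quick follow-up, the saved `crf.model` file can later be reloaded and applied to unseen text. Below is a minimal, hypothetical sketch (the sentence is made up, and it assumes `extract_features` from `run.py` is in scope or imported):

    import nltk
    import pycrfsuite

    # Tokenize and POS-tag a new sentence; the label slot is unknown at
    # prediction time, so a placeholder is used (word2features never reads it)
    sentence = "Reuters reported that Apple opened a new office in London ."
    tokens = sentence.split(" ")
    doc = [(w, pos, "?") for w, pos in nltk.pos_tag(tokens)]

    # Reload the trained model and tag the feature sequence
    tagger = pycrfsuite.Tagger()
    tagger.open('crf.model')
    for w, tag in zip(tokens, tagger.tag(extract_features(doc))):
        print("%s (%s)" % (w, tag))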