├── .gitignore
├── README.md
├── requirements.txt
└── run.py


/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# dotenv
.env

# virtualenv
.venv
venv/
ENV/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
mypy_cache/

*.xml
*.model

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Performing Sequence Labelling using CRF in Python

This is an example of using [pycrfsuite](https://github.com/scrapinghub/python-crfsuite) to train a CRF classifier for named entity recognition. For a more detailed explanation, refer to the blog post at [http://www.albertauyeung.com/post/python-sequence-labelling-with-crf/](http://www.albertauyeung.com/post/python-sequence-labelling-with-crf/).

Before running the script, download the data file `reuters.xml` from [https://github.com/AKSW/n3-collection/blob/master/reuters.xml](https://github.com/AKSW/n3-collection/blob/master/reuters.xml), and install the dependencies:

    $ sudo pip install -r requirements.txt
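The script also relies on NLTK's part-of-speech tagger, whose model data is not installed by `pip`. If it is missing, it can be fetched once before the first run (this assumes the default `averaged_perceptron_tagger` resource used by `nltk.pos_tag`):

    $ python -c "import nltk; nltk.download('averaged_perceptron_tagger')"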
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
nltk == 3.2.2
numpy == 1.12.1
beautifulsoup4 == 4.6.0
python_crfsuite == 0.9.2
scikit_learn == 0.18.1

--------------------------------------------------------------------------------
/run.py:
--------------------------------------------------------------------------------
import codecs
import numpy as np
import nltk
import pycrfsuite
from bs4 import BeautifulSoup as bs
from bs4.element import Tag
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Read the data file and parse the XML
with codecs.open("reuters.xml", "r", "utf-8") as infile:
    soup = bs(infile, "html5lib")

docs = []
for elem in soup.find_all("document"):
    texts = []

    # Loop through each child of the element under "textwithnamedentities"
    for c in elem.find("textwithnamedentities").children:
        if isinstance(c, Tag):
            if c.name == "namedentityintext":
                label = "N"  # part of a named entity
            else:
                label = "I"  # irrelevant word
            for w in c.text.split(" "):
                if len(w) > 0:
                    texts.append((w, label))
    docs.append(texts)


data = []
for i, doc in enumerate(docs):

    # Obtain the list of tokens in the document
    tokens = [t for t, label in doc]

    # Perform POS tagging
    tagged = nltk.pos_tag(tokens)

    # Take the word, the POS tag, and its label
    data.append([(w, pos, label) for (w, label), (word, pos) in zip(doc, tagged)])


def word2features(doc, i):
    word = doc[i][0]
    postag = doc[i][1]

    # Common features for all words
    features = [
        'bias',
        'word.lower=' + word.lower(),
        'word[-3:]=' + word[-3:],
        'word[-2:]=' + word[-2:],
        'word.isupper=%s' % word.isupper(),
        'word.istitle=%s' % word.istitle(),
        'word.isdigit=%s' % word.isdigit(),
        'postag=' + postag
    ]

    # Features for words that are not
    # at the beginning of a document
    if i > 0:
        word1 = doc[i-1][0]
        postag1 = doc[i-1][1]
        features.extend([
            '-1:word.lower=' + word1.lower(),
            '-1:word.istitle=%s' % word1.istitle(),
            '-1:word.isupper=%s' % word1.isupper(),
            '-1:word.isdigit=%s' % word1.isdigit(),
            '-1:postag=' + postag1
        ])
    else:
        # Indicate that it is the 'beginning of a document'
        features.append('BOS')

    # Features for words that are not
    # at the end of a document
    if i < len(doc)-1:
        word1 = doc[i+1][0]
        postag1 = doc[i+1][1]
        features.extend([
            '+1:word.lower=' + word1.lower(),
            '+1:word.istitle=%s' % word1.istitle(),
            '+1:word.isupper=%s' % word1.isupper(),
            '+1:word.isdigit=%s' % word1.isdigit(),
            '+1:postag=' + postag1
        ])
    else:
        # Indicate that it is the 'end of a document'
        features.append('EOS')

    return features


# A function for extracting features from documents
def extract_features(doc):
    return [word2features(doc, i) for i in range(len(doc))]

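# For illustration, a hypothetical two-token document (not taken from the
# dataset) and the beginning of the feature list that word2features
# produces for its first token:
#
#   word2features([("Apple", "NNP", "N"), ("shares", "NNS", "I")], 0)
#   => ['bias', 'word.lower=apple', 'word[-3:]=ple', 'word[-2:]=le',
#       'word.isupper=False', 'word.istitle=True', 'word.isdigit=False',
#       'postag=NNP', 'BOS', '+1:word.lower=shares', ...]
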
# A function for generating the list of labels for each document
def get_labels(doc):
    return [label for (token, postag, label) in doc]


X = [extract_features(doc) for doc in data]
y = [get_labels(doc) for doc in data]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

trainer = pycrfsuite.Trainer(verbose=True)

# Submit training data to the trainer
for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)

# Set the parameters of the model
trainer.set_params({
    # coefficient for L1 penalty
    'c1': 0.1,

    # coefficient for L2 penalty
    'c2': 0.01,

    # maximum number of iterations
    'max_iterations': 200,

    # whether to include transitions that
    # are possible, but not observed
    'feature.possible_transitions': True
})

# Provide a file name as a parameter to the train function, such that
# the model will be saved to the file when training is finished
trainer.train('crf.model')

# Generate predictions
tagger = pycrfsuite.Tagger()
tagger.open('crf.model')
y_pred = [tagger.tag(xseq) for xseq in X_test]

# Let's take a look at one example sequence in the testing set
# (each word is recovered from the 'word.lower=' feature of its token)
i = 12
for pred, word in zip(y_pred[i], [f[1].split("=")[1] for f in X_test[i]]):
    print("%s (%s)" % (word, pred))

# Create a mapping of labels to indices
labels = {"N": 1, "I": 0}

# Convert the sequences of tags into a 1-dimensional array
predictions = np.array([labels[tag] for row in y_pred for tag in row])
truths = np.array([labels[tag] for row in y_test for tag in row])

# Print out the classification report
print(classification_report(
    truths, predictions,
    target_names=["I", "N"]))

--------------------------------------------------------------------------------
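As a quick follow-up, the saved `crf.model` file can later be reloaded and applied to unseen text. Below is a minimal, hypothetical sketch (the sentence is made up, and it assumes `extract_features` from `run.py` is in scope or imported):

    import nltk
    import pycrfsuite

    # Tokenize and POS-tag a new sentence; the label slot is unknown at
    # prediction time, so a placeholder is used (word2features never reads it)
    sentence = "Reuters reported that Apple opened a new office in London ."
    tokens = sentence.split(" ")
    doc = [(w, pos, "?") for w, pos in nltk.pos_tag(tokens)]

    # Reload the trained model and tag the feature sequence
    tagger = pycrfsuite.Tagger()
    tagger.open('crf.model')
    for w, tag in zip(tokens, tagger.tag(extract_features(doc))):
        print("%s (%s)" % (w, tag))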