├── .gitignore
├── deepmultilingualpunctuation
    ├── __init__.py
    └── punctuationmodel.py
├── dev-readme.md
├── setup.py
├── LICENSE
└── README.md


/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | dist/
3 | *.egg-info/
4 | models/
5 | *.zip
6 | __pycache__/


--------------------------------------------------------------------------------
/deepmultilingualpunctuation/__init__.py:
--------------------------------------------------------------------------------
1 | from .punctuationmodel import PunctuationModel


--------------------------------------------------------------------------------
/dev-readme.md:
--------------------------------------------------------------------------------
 1 | 
 2 | ## Install the package on your local machine
 3 | 
 4 | ```
 5 | pip3 install -e ./
 6 | ```
 7 | 
 8 | ## Build the package
 9 | 
10 | ```
11 | python3 setup.py sdist bdist_wheel
12 | ```
13 | 
14 | ## Upload the package
15 | 
16 | ```
17 | python3 -m twine upload dist/* --verbose
18 | ```
19 | 
20 | ## Run the unit tests
21 | 
22 | ```
23 | python3 -m pytest tests/test.py
24 | ```


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | import setuptools
 2 | 
 3 | with open("README.md", "r") as fh:
 4 |     long_description = fh.read()
 5 | 
 6 | setuptools.setup(
 7 |     name="deepmultilingualpunctuation",
 8 |     version="1.0.1",
 9 |     author="Oliver Guhr",
10 |     author_email="oliver.guhr@htw-dresden.de",
11 |     description="A python package for deep multilingual punctuation prediction.",
12 |     long_description=long_description,
13 |     long_description_content_type="text/markdown",
14 |     url="https://github.com/oliverguhr/deepmultilingualpunctuation",
15 |     packages=setuptools.find_packages(),    
16 |     classifiers=[
17 |         "Programming Language :: Python :: 3",
18 |         "License :: OSI Approved :: MIT License",
19 |         "Operating System :: OS Independent",
20 |     ],
21 |     install_requires=[
22 |        "transformers",
23 |        "torch>=1.8.1",
24 |     ],
25 |     python_requires='>=3.6',
26 | )
27 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2022 Oliver Guhr
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.


--------------------------------------------------------------------------------
/deepmultilingualpunctuation/punctuationmodel.py:
--------------------------------------------------------------------------------
 1 | from concurrent.futures import process
 2 | from transformers import pipeline
 3 | import re
 4 | import torch
 5 | 
 6 | class PunctuationModel():
 7 |     def __init__(self, model = "oliverguhr/fullstop-punctuation-multilang-large") -> None:
 8 |         if torch.cuda.is_available():
 9 |             self.pipe = pipeline("ner",model, aggregation_strategy="none", device=1)
10 |         else:
11 |             self.pipe = pipeline("ner",model, aggregation_strategy="none")
12 | 
13 |     def preprocess(self,text):
14 |         #remove markers except for markers in numbers
15 |         text = re.sub(r"(?<!\d)[.,;:!?](?!\d)","",text)
16 |         #todo: match acronyms https://stackoverflow.com/questions/35076016/regex-to-match-acronyms
17 |         text = text.split()
18 |         return text
19 | 
20 |     def restore_punctuation(self, text, chunk_size=230):
21 |         result = self.predict(self.preprocess(text), chunk_size)
22 |         return self.prediction_to_text(result)
23 | 
24 |     def overlap_chunks(self,lst, n, stride=0):
25 |         """Yield successive n-sized chunks from lst with stride length of overlap."""
26 |         for i in range(0, len(lst), n-stride):
27 |                 yield lst[i:i + n]
28 | 
29 |     def predict(self, words, chunk_size=230):
30 |         overlap = 5
31 |         if len(words) <= chunk_size:
32 |             overlap = 0
33 | 
34 |         batches = list(self.overlap_chunks(words,chunk_size,overlap))
35 | 
36 |         # if the last batch is smaller than the overlap,
37 |         # we can just remove it
38 |         if len(batches[-1]) <= overlap:
39 |             batches.pop()
40 | 
41 |         tagged_words = []
42 |         for batch in batches:
43 |             # use last batch completely
44 |             if batch == batches[-1]:
45 |                 overlap = 0
46 |             text = " ".join(batch)
47 |             result = self.pipe(text)
48 |             assert len(text) == result[-1]["end"], "chunk size too large, text got clipped"
49 | 
50 |             char_index = 0
51 |             result_index = 0
52 |             for word in batch[:len(batch)-overlap]:
53 |                 char_index += len(word) + 1
54 |                 # if any subtoken of an word is labled as sentence end
55 |                 # we label the whole word as sentence end
56 |                 label = "0"
57 |                 while result_index < len(result) and char_index > result[result_index]["end"] :
58 |                     label = result[result_index]['entity']
59 |                     score = result[result_index]['score']
60 |                     result_index += 1
61 |                 tagged_words.append([word,label, score])
62 | 
63 |         assert len(tagged_words) == len(words)
64 |         return tagged_words
65 | 
66 |     def prediction_to_text(self,prediction):
67 |         result = ""
68 |         for word, label, _ in prediction:
69 |             result += word
70 |             if label == "0":
71 |                 result += " "
72 |             if label in ".,?-:":
73 |                 result += label+" "
74 |         return result.strip()
75 | 
if __name__ == "__main__":
    # Manual smoke test: loads the default model and prints its predictions.
    punctuator = PunctuationModel()

    sample = "das , ist fies "

    # One-step API: strip the existing punctuation and restore the predicted one.
    print(punctuator.restore_punctuation(sample))

    # Step-by-step API: the raw [word, label, score] entries for the same text.
    sample_words = punctuator.preprocess(sample)
    print(punctuator.predict(sample_words))
87 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Deep Multilingual Punctuation Prediction
  2 | 
  3 | This python library predicts the punctuation of English, Italian, French and German texts. We developed it to restore the punctuation of transcribed spoken language. 
  4 | 
  5 | This uses [our "FullStop" model](https://huggingface.co/oliverguhr/fullstop-punctuation-multilang-large/) that we trained on the [Europarl Dataset](https://huggingface.co/datasets/wmt/europarl). *Please note that this dataset consists of political speeches. Therefore the model might perform differently on texts from other domains.*
  6 | 
  7 | The code restores the following punctuation markers: **"." "," "?" "-" ":"**
  8 | 
  9 | 
 10 | [Video Sample](https://user-images.githubusercontent.com/3495355/150677531-13f2037d-8673-4e34-8769-0da1784c2fe7.mp4)
 11 | 
 12 | ## Install 
 13 | 
 14 | To get started install the package from [pypi](https://pypi.org/project/deepmultilingualpunctuation/):
 15 | 
 16 | ```bash
 17 | pip install deepmultilingualpunctuation
 18 | ```
 19 | 
 20 | ## Usage
 21 | The ```PunctuationModel``` class can process texts of any length. Note that processing very long texts can be time-consuming.
 22 | 
 23 | ### Restore Punctuation
 24 | ```python
 25 | from deepmultilingualpunctuation import PunctuationModel
 26 | 
 27 | model = PunctuationModel()
 28 | text = "My name is Clara and I live in Berkeley California Ist das eine Frage Frau Müller"
 29 | result = model.restore_punctuation(text)
 30 | print(result)
 31 | ```
 32 | 
 33 | **output**
 34 | > My name is Clara and I live in Berkeley, California. Ist das eine Frage, Frau Müller?
 35 | 
 36 | 
 37 | ### Predict Labels 
 38 | ```python
 39 | from deepmultilingualpunctuation import PunctuationModel
 40 | 
 41 | model = PunctuationModel()
 42 | text = "My name is Clara and I live in Berkeley California Ist das eine Frage Frau Müller"
 43 | clean_text = model.preprocess(text)
 44 | labeled_words = model.predict(clean_text)
 45 | print(labeled_words)
 46 | ```
 47 | 
 48 | **output**
 49 | 
 50 | > [['My', '0', 0.9999887], ['name', '0', 0.99998665], ['is', '0', 0.9998579], ['Clara', '0', 0.6752215], ['and', '0', 0.99990904], ['I', '0', 0.9999877], ['live', '0', 0.9999839], ['in', '0', 0.9999515], ['Berkeley', ',', 0.99800044], ['California', '.', 0.99534047], ['Ist', '0', 0.99998784], ['das', '0', 0.99999154], ['eine', '0', 0.9999918], ['Frage', ',', 0.99622655], ['Frau', '0', 0.9999889], ['Müller', '?', 0.99863917]]
 51 | 
 52 | ## Languages
 53 | 
 54 | ### Models
 55 | 
 56 | | Languages                                  | Model                                                        |
 57 | | ------------------------------------------ | ------------------------------------------------------------ |
 58 | | English, Italian, French and German        | [oliverguhr/fullstop-punctuation-multilang-large](https://huggingface.co/oliverguhr/fullstop-punctuation-multilang-large) |
 59 | | English, Italian, French, German and Dutch | [oliverguhr/fullstop-punctuation-multilingual-sonar-base](https://huggingface.co/oliverguhr/fullstop-punctuation-multilingual-sonar-base) |
 60 | | Dutch                                      | [oliverguhr/fullstop-dutch-sonar-punctuation-prediction](https://huggingface.co/oliverguhr/fullstop-dutch-sonar-punctuation-prediction) |
 61 | 
 62 | ### Community Models
 63 | 
 64 | | Languages                                  | Model                                                        |
 65 | | ------------------------------------------ | ------------------------------------------------------------ |
 66 | |English, German, French, Spanish, Bulgarian, Italian, Polish, Dutch, Czech, Portuguese, Slovak, Slovenian| [kredor/punctuate-all](https://huggingface.co/kredor/punctuate-all)                                                             |
 67 | | Catalan                                    | [softcatala/fullstop-catalan-punctuation-prediction](https://huggingface.co/softcatala/fullstop-catalan-punctuation-prediction) |
 68 | 
 69 | You can use different models by setting the model parameter:
 70 | 
 71 | ```python
 72 | model = PunctuationModel(model = "oliverguhr/fullstop-dutch-punctuation-prediction")
 73 | ```
 74 | 
 75 | ## Where do I find the code and can I train my own model?
 76 | 
 77 | Yes you can! For the complete code of the research project, take a look at [this repository](https://github.com/oliverguhr/fullstop-deep-punctuation-prediction).
 78 | 
 79 | There is also a guide on [how to fine-tune this model for your data / language](https://github.com/oliverguhr/fullstop-deep-punctuation-prediction/blob/main/other_languages/readme.md).
 80 | 
 81 | 
 82 | ## Results 
 83 | 
 84 | The performance differs for the single punctuation markers as hyphens and colons, in many cases, are optional and can be substituted by either a comma or a full stop. The model achieves the following F1 scores for the different languages:
 85 | 
 86 | | Label         | EN    | DE    | FR    | IT    |
 87 | | ------------- | ----- | ----- | ----- | ----- |
 88 | | 0             | 0.991 | 0.997 | 0.992 | 0.989 |
 89 | | .             | 0.948 | 0.961 | 0.945 | 0.942 |
 90 | | ?             | 0.890 | 0.893 | 0.871 | 0.832 |
 91 | | ,             | 0.819 | 0.945 | 0.831 | 0.798 |
 92 | | :             | 0.575 | 0.652 | 0.620 | 0.588 |
 93 | | -             | 0.425 | 0.435 | 0.431 | 0.421 |
 94 | | macro average | 0.775 | 0.814 | 0.782 | 0.762 |
 95 | 
 96 | 
 97 | 
 98 | ## References
 99 | Please cite us if you found this useful:
100 | 
101 | ```
102 | @article{guhr-EtAl:2021:fullstop,
103 |   title={FullStop: Multilingual Deep Models for Punctuation Prediction},
104 |   author    = {Guhr, Oliver  and  Schumann, Anne-Kathrin  and  Bahrmann, Frank  and  Böhme, Hans Joachim},
105 |   booktitle      = {Proceedings of the Swiss Text Analytics Conference 2021},
106 |   month          = {June},
107 |   year           = {2021},
108 |   address        = {Winterthur, Switzerland},
109 |   publisher      = {CEUR Workshop Proceedings},  
110 |   url       = {http://ceur-ws.org/Vol-2957/sepp_paper4.pdf}
111 | }
112 | ```
113 | 


--------------------------------------------------------------------------------