├── .gitignore ├── LICENSE ├── README.md ├── scilk ├── __init__.py ├── collections │ ├── __init__.py │ ├── _collections.py │ └── common.py ├── corpora │ ├── __init__.py │ ├── chemdner.py │ ├── corpus.py │ └── genia.py └── util │ ├── __init__.py │ ├── binning.py │ ├── intervals.py │ ├── networks │ ├── __init__.py │ ├── blocks.py │ ├── callbacks.py │ ├── metrics.py │ └── wrappers.py │ ├── patterns.py │ ├── preprocessing.py │ └── segments.py ├── setup.py └── test.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by .ignore support plugin (hsz.mobi) 2 | ### macOS template 3 | *.DS_Store 4 | .AppleDouble 5 | .LSOverride 6 | 7 | # Icon must end with two \r 8 | Icon 9 | 10 | 11 | # Thumbnails 12 | ._* 13 | 14 | # Files that might appear in the root of a volume 15 | .DocumentRevisions-V100 16 | .fseventsd 17 | .Spotlight-V100 18 | .TemporaryItems 19 | .Trashes 20 | .VolumeIcon.icns 21 | .com.apple.timemachine.donotpresent 22 | 23 | # Directories potentially created on remote AFP share 24 | .AppleDB 25 | .AppleDesktop 26 | Network Trash Folder 27 | Temporary Items 28 | .apdisk 29 | ### Python template 30 | # Byte-compiled / optimized / DLL files 31 | __pycache__/ 32 | *.py[cod] 33 | *$py.class 34 | 35 | # C extensions 36 | *.so 37 | 38 | # Distribution / packaging 39 | .Python 40 | env/ 41 | build/ 42 | develop-eggs/ 43 | dist/ 44 | downloads/ 45 | eggs/ 46 | .eggs/ 47 | lib/ 48 | lib64/ 49 | parts/ 50 | sdist/ 51 | var/ 52 | wheels/ 53 | *.egg-info/ 54 | .installed.cfg 55 | *.egg 56 | 57 | # PyInstaller 58 | # Usually these files are written by a python script from a template 59 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 60 | *.manifest 61 | *.spec 62 | 63 | # Installer logs 64 | pip-log.txt 65 | pip-delete-this-directory.txt 66 | 67 | # Unit test / coverage reports 68 | htmlcov/ 69 | .tox/ 70 | .coverage 71 | .coverage.* 72 | .cache 73 | nosetests.xml 74 | coverage.xml 75 | *,cover 76 | .hypothesis/ 77 | 78 | # Translations 79 | *.mo 80 | *.pot 81 | 82 | # Django stuff: 83 | *.log 84 | local_settings.py 85 | 86 | # Flask stuff: 87 | instance/ 88 | .webassets-cache 89 | 90 | # Scrapy stuff: 91 | .scrapy 92 | 93 | # Sphinx documentation 94 | docs/_build/ 95 | 96 | # PyBuilder 97 | target/ 98 | 99 | # Jupyter Notebook 100 | .ipynb_checkpoints 101 | 102 | # pyenv 103 | .python-version 104 | 105 | # celery beat schedule file 106 | celerybeat-schedule 107 | 108 | # SageMath parsed files 109 | *.sage.py 110 | 111 | # dotenv 112 | .env 113 | 114 | # virtualenv 115 | .venv 116 | venv/ 117 | ENV/ 118 | 119 | # Spyder project settings 120 | .spyderproject 121 | 122 | # Rope project settings 123 | .ropeproject 124 | ### JetBrains template 125 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm 126 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 127 | 128 | # User-specific stuff: 129 | .idea/**/workspace.xml 130 | .idea/**/tasks.xml 131 | .idea/dictionaries 132 | 133 | # Sensitive or high-churn files: 134 | .idea/**/dataSources/ 135 | .idea/**/dataSources.ids 136 | .idea/**/dataSources.xml 137 | .idea/**/dataSources.local.xml 138 | .idea/**/sqlDataSources.xml 139 | .idea/**/dynamic.xml 140 | .idea/**/uiDesigner.xml 141 | 142 | # Gradle: 143 | .idea/**/gradle.xml 144 | .idea/**/libraries 145 | 146 | # Mongo Explorer plugin: 147 | .idea/**/mongoSettings.xml 148 | 149 | ## File-based project format: 150 | *.iws 151 | 152 | 
## Plugin-specific files: 153 | 154 | # IntelliJ 155 | /out/ 156 | 157 | # mpeltonen/sbt-idea plugin 158 | .idea_modules/ 159 | 160 | # JIRA plugin 161 | atlassian-ide-plugin.xml 162 | 163 | # Crashlytics plugin (for Android Studio and IntelliJ) 164 | com_crashlytics_export_strings.xml 165 | crashlytics.properties 166 | crashlytics-build.properties 167 | fabric.properties 168 | 169 | .idea/ 170 | genia_corpus/ 171 | 172 | ### macOS template 173 | 174 | # Icon must end with two \r 175 | 176 | 177 | # Thumbnails 178 | 179 | # Files that might appear in the root of a volume 180 | 181 | # Directories potentially created on remote AFP share 182 | ### C template 183 | # Prerequisites 184 | *.d 185 | 186 | # Object files 187 | *.o 188 | *.ko 189 | *.obj 190 | *.elf 191 | 192 | # Linker output 193 | *.ilk 194 | *.map 195 | *.exp 196 | 197 | # Precompiled Headers 198 | *.gch 199 | *.pch 200 | 201 | # Libraries 202 | *.lib 203 | *.a 204 | *.la 205 | *.lo 206 | 207 | # Shared objects (inc. Windows DLLs) 208 | *.dll 209 | *.so.* 210 | *.dylib 211 | 212 | # Executables 213 | *.exe 214 | *.out 215 | *.app 216 | *.i*86 217 | *.x86_64 218 | *.hex 219 | 220 | # Debug files 221 | *.dSYM/ 222 | *.su 223 | *.idb 224 | *.pdb 225 | 226 | # Kernel Module Compile Results 227 | *.mod* 228 | *.cmd 229 | modules.order 230 | Module.symvers 231 | Mkfile.old 232 | dkms.conf 233 | ### C++ template 234 | # Prerequisites 235 | 236 | # Compiled Object files 237 | *.slo 238 | 239 | # Precompiled Headers 240 | 241 | # Compiled Dynamic libraries 242 | 243 | # Fortran module files 244 | *.mod 245 | *.smod 246 | 247 | # Compiled Static libraries 248 | *.lai 249 | 250 | # Executables 251 | ### JetBrains template 252 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm 253 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 254 | 255 | # User-specific stuff: 256 | 257 | # Sensitive or high-churn files: 258 | 259 | # Gradle: 260 | 261 | # Mongo Explorer plugin: 262 | 263 | ## File-based project format: 264 | 265 | ## Plugin-specific files: 266 | 267 | # IntelliJ 268 | 269 | # mpeltonen/sbt-idea plugin 270 | 271 | # JIRA plugin 272 | 273 | # Crashlytics plugin (for Android Studio and IntelliJ) 274 | ### Linux template 275 | *~ 276 | 277 | # temporary files which can be created if a process still has a handle open of a deleted file 278 | .fuse_hidden* 279 | 280 | # KDE directory preferences 281 | .directory 282 | 283 | # Linux trash folder which might appear on any partition or disk 284 | .Trash-* 285 | 286 | # .nfs files are created when an open file is removed but is still being accessed 287 | .nfs* 288 | 289 | local/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 skoblov-lab 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SciLK: a Scientific natural Language Toolkit 2 | SciLK (pronounced as "silk") is a natural language toolkit created and 3 | optimised specifically for text-mining applications in natural sciences 4 | (primarily biology and chemistry). As of this moment, this package is purely 5 | experimental and is bound to be unstable for some time to come. Stable published 6 | models will be stored in separate stale branches until the master branch has 7 | matured. The list of such branches: 8 | 9 | - `chemdner-pub` - a text tokeniser and chemical named entity recognition model 10 | trained on the CHEMDNER corpus (publication pending). **Update** We've added Windows support. 11 | -------------------------------------------------------------------------------- /scilk/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | 4 | 5 | """ 6 | 7 | 8 | import pathlib 9 | import sys 10 | import os 11 | 12 | 13 | if sys.version_info < (3, 5, 2): 14 | print("SciLK requires Python >= 3.5.2") 15 | sys.exit(1) 16 | 17 | 18 | SCILK_ROOT = os.path.abspath(os.environ.get('SCILK_ROOT') or 19 | os.path.expanduser('~/.scilk')) 20 | os.makedirs(SCILK_ROOT, exist_ok=True) 21 | 22 | 23 | if __name__ == '__main__': 24 | raise RuntimeError 25 | -------------------------------------------------------------------------------- /scilk/collections/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skoblov-lab/SciLK/b3509b7d3839462ab415e9f2cfd0ad8033f8034d/scilk/collections/__init__.py -------------------------------------------------------------------------------- /scilk/collections/_collections.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | This module is purely experimental. We are going to use it for IO prototyping.
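A Collection groups named entries (models, encoders, supporting data) behind
lazy, attribute-style access: each entry is built on first use by a loader
callable registered via `Collection.add`. A minimal usage sketch (the loader
`load_embeddings` and the file name below are hypothetical; see `Collection.add`
for the loader contract):

    collection = Collection()
    collection.add('embeddings', load_embeddings, data={'vectors': 'glove.txt'})
    collection.save('demo')              # serialised under SCILK_ROOT/demo
    restored = Collection.load('demo')   # entries are re-registered postponed
    restored.embeddings                  # built by its loader on first access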
4 | 5 | """ 6 | 7 | 8 | from typing import Tuple, Mapping, List, Optional, Callable, Any 9 | from importlib import import_module 10 | import joblib 11 | import inspect 12 | import keyword 13 | import copy 14 | import glob 15 | import shutil 16 | import os 17 | 18 | import scilk 19 | 20 | 21 | LOADER_EXT = 'loader' 22 | DATA_EXT = 'entrydata' 23 | COLL_EXT = 'collection' 24 | 25 | 26 | class Collection: 27 | 28 | def __init__(self): 29 | self._entries = {} 30 | self._loaders = {} 31 | self._data = {} 32 | self._status = {} 33 | 34 | def __getattr__(self, entry: str) -> Any: 35 | if entry not in self._loaders: 36 | raise AttributeError('no entry named {}'.format(entry)) 37 | # uninvoked dependencies are False, loading dependencies are None, 38 | # loaded dependencies are True 39 | if self._status[entry] is None: 40 | raise RuntimeError("'{}' was accessed while loading".format(entry)) 41 | if not self._status[entry]: 42 | self._activate_entry(entry) 43 | return self._entries[entry] 44 | 45 | @property 46 | def entries(self) -> List[str]: 47 | return list(self._loaders) 48 | 49 | def add(self, entry: str, loader: Callable[['Collection', Mapping], Any], 50 | data: Optional[Mapping[str, str]]=None, postpone: bool=False): 51 | """ 52 | Add a model to the collection. 53 | :param entry: entry name; it must be a valid python identifier, 54 | because it will be used to access the entry via the attribute lookup 55 | mechanism, i.e. 56 | >>> assert isidentifier(entry) 57 | should pass. 58 | :param loader: a callable responsible for loading an entry. A loader 59 | must accept two arguments: (1) a Collection instance (this would allow 60 | the loader to access other models in the same Collection) and (2) a data 61 | mapping (see argument 'data'); take note that cyclic dependencies 62 | between entries are not allowed and will result in a RuntimeError 63 | error. There are two additional requirements: 64 | - The loader must be defined in an importable module 65 | - The loader must be accessible via its __name__ attribute from the 66 | module's global namespace. 67 | If both requirements are met, the following code will work just fine: 68 | >>> import inspect 69 | >>> from importlib import import_module 70 | >>> module = import_module(inspect.getmodule(loader).__name__) 71 | >>> assert getattr(module, loader.__name__) is loader 72 | The method will try to validate your loader and will raise a ValueError 73 | if the validation fails. 74 | :param data: a Mapping between labels and file paths (symlinks are not 75 | allowed). When a Collection is serialised, all data mappings associated 76 | with underlying entries are copied into the Collection's destination 77 | directory under appropriate subdirectories; nevertheless, all data keys 78 | remain the same and it is thus safe to rely on them in loaders. 79 | :param postpone: do not load the entry at once. This option is useful if 80 | you don't want to work out the correct order of adding entries without 81 | running into missing dependencies. 
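        A sketch of postponed registration (the entry and loader names below
        are hypothetical); a loader may access other entries, e.g.
        `collection.embeddings`, as long as the dependency graph stays acyclic:
            collection.add('embeddings', load_embeddings, postpone=True)
            collection.add('tagger', load_tagger, postpone=True)
            collection.tagger  # 'tagger' and anything it depends on load on first access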
82 | :raises SyntaxError: invalid entry name 83 | :raises ImportError: can't import loader from its module 84 | :raises ValueError: invalid data 85 | """ 86 | if not isidentifier(entry): 87 | raise SyntaxError("'{}' is not a valid identifier".format(entry)) 88 | if not importable(loader): 89 | raise ImportError("can't import the loader from its module") 90 | # check the data mapping 91 | if not (data is None or isinstance(data, Mapping)): 92 | raise ValueError('data argument must be a Mapping instance or None') 93 | if not (data is None or all(map(os.path.isfile, data.values()))): 94 | raise ValueError('all values in data must be valid file paths') 95 | self._loaders[entry] = loader 96 | self._data[entry] = copy.deepcopy(dict(data or {})) 97 | self._status[entry] = False 98 | if not postpone: 99 | self._activate_entry(entry) 100 | 101 | def _activate_entry(self, entry: str): 102 | if self._status[entry]: 103 | raise RuntimeError('trying to reload an entry') 104 | # set entry status to None to show that it is currently loading 105 | self._status[entry] = None 106 | # load the entry 107 | self._entries[entry] = self._loaders[entry](self, self._data[entry]) 108 | # show that the entry is available 109 | self._status[entry] = True 110 | 111 | @classmethod 112 | def load(cls, name: str) -> 'Collection': 113 | """ 114 | Load a serialised Collection from your SciLK root 115 | :param name: Collection's name 116 | :return: a loaded Collection 117 | :raises FileNotFoundError: missing files 118 | :raises ModuleNotFoundError: can't load a loader's module 119 | :raises AttributeError: can't find a loader in its module 120 | """ 121 | collection = cls() 122 | base = os.path.join(scilk.SCILK_ROOT, name) 123 | entries = joblib.load(os.path.join(base, '{}.{}'.format(name, COLL_EXT))) 124 | for entry in entries: 125 | collection.add(entry, *cls._load_entry(base, entry), postpone=True) 126 | return collection 127 | 128 | def save(self, name): 129 | """ 130 | Save a Collection to your SciLK root in a distributable form: 131 | - create a directory named after the Collection under the SciLK root 132 | directory and inflate it with subdirectories named after entries 133 | - save everything necessary to load the entries 134 | - save specifications 135 | :raises FileExistsError: there already is a saved Collection with 136 | identical name 137 | """ 138 | destination = os.path.join(scilk.SCILK_ROOT, name) 139 | try: 140 | os.makedirs(destination) 141 | except FileExistsError: 142 | raise FileExistsError("there is a collection named '{}' in your " 143 | "SciLK root directory".format(name)) 144 | # save individual entries 145 | for entry in self._loaders: 146 | self._save_entry(destination, entry) 147 | # save collection spec to prevent data corruption 148 | collection_spec_path = os.path.join(destination, 149 | '{}.{}'.format(name, COLL_EXT)) 150 | joblib.dump(self.entries, collection_spec_path, 1) 151 | 152 | @staticmethod 153 | def _load_entry(base: str, entry: str) -> Tuple[Callable, Mapping]: 154 | # load data 155 | data_spec_path = os.path.join(base, entry, '{}.{}'.format(entry, DATA_EXT)) 156 | try: 157 | data_spec = joblib.load(data_spec_path) 158 | except FileNotFoundError: 159 | raise FileNotFoundError("missing data for entry '{}'".format(entry)) 160 | data = {k: os.path.join(base, entry, value) for k, value in data_spec.items()} 161 | # load loader 162 | loader_spec_path = os.path.join(base, entry, '{}.{}'.format(entry, LOADER_EXT)) 163 | try: 164 | module, name = joblib.load(loader_spec_path) 165 | 
except FileNotFoundError: 166 | raise FileNotFoundError("missing loader for entry '{}'".format(entry)) 167 | try: 168 | loader = getattr(import_module(module), name) 169 | except ModuleNotFoundError: 170 | raise ModuleNotFoundError("can't import module '{}' to access " 171 | "the loader specified by " 172 | "'{}'".format(module, entry)) 173 | except AttributeError: 174 | raise AttributeError("module '{}' contains no global name " 175 | "'{}' specified as loader in entry " 176 | "'{}'".format(module, name, entry)) 177 | return loader, data 178 | 179 | def _save_entry(self, base, entry): 180 | destination = os.path.join(base, entry) 181 | os.mkdir(destination) 182 | # save data and data spec 183 | data = self._data[entry] 184 | for _, path in data.items(): 185 | shutil.copy(path, os.path.join(destination, os.path.basename(path))) 186 | data_spec = {item: os.path.basename(path) for item, path in data.items()} 187 | data_spec_path = os.path.join(destination, '{}.{}'.format(entry, DATA_EXT)) 188 | joblib.dump(data_spec, data_spec_path, 1) 189 | # save loader spec 190 | loader = self._loaders[entry] 191 | loader_spec = (inspect.getmodule(loader).__name__, loader.__name__) 192 | loader_spec_path = os.path.join(destination, '{}.{}'.format(entry, LOADER_EXT)) 193 | joblib.dump(loader_spec, loader_spec_path, 1) 194 | 195 | 196 | def importable(item) -> bool: 197 | """ 198 | Check whether 'item' is accessible from its module's global namespace under 199 | 'item.__name__'. 200 | :param item: 201 | :return: 202 | """ 203 | try: 204 | module = import_module(inspect.getmodule(item).__name__) 205 | assert getattr(module, item.__name__) is item 206 | except (AssertionError, ImportError, ValueError, AttributeError): 207 | return False 208 | return True 209 | 210 | 211 | def isidentifier(name: str) -> bool: 212 | """ 213 | Determines if string is valid Python identifier. 214 | """ 215 | if not isinstance(name, str): 216 | raise TypeError("expected str, but got {!r}".format(type(name))) 217 | return name.isidentifier() and not keyword.iskeyword(name) 218 | 219 | 220 | if __name__ == '__main__': 221 | raise RuntimeError 222 | -------------------------------------------------------------------------------- /scilk/collections/common.py: -------------------------------------------------------------------------------- 1 | import csv 2 | from itertools import chain 3 | from typing import Sequence, Iterable, TypeVar, List, Tuple, Callable, Mapping, Union 4 | 5 | import numpy as np 6 | from fn import F 7 | import pandas as pd 8 | 9 | from scilk.util import preprocessing 10 | from scilk.util.binning import unbin, unmerge_bins 11 | 12 | T = TypeVar('T') 13 | 14 | TextEncoder = Callable[[Union[str, Iterable[str]]], np.ndarray] 15 | 16 | 17 | def asciicharset(strings: Iterable[str]) -> List[str]: 18 | """ 19 | Return a sorted list of unique ascii characters 20 | :param strings: an iterable of strings to extract characters from 21 | :return: 22 | """ 23 | characters = chain.from_iterable(strings) 24 | return sorted(set(filter(lambda x: ord(x) < 128, characters))) 25 | 26 | 27 | # TODO specify all exception in the docs 28 | 29 | def build_charencoder(corpus: Iterable[str], wordlen: int=None) \ 30 | -> Tuple[int, Mapping[str, int], TextEncoder]: 31 | """ 32 | Create a char-level encoder: a Callable, mapping strings into integer arrays. 
33 | Encoders dispatch on input type: if you pass a single string, you will get 34 | a 1D array, if you pass an Iterable of strings, you will get a 2D array 35 | where row i encodes the i-th string in the Iterable. 36 | :param corpus: an Iterable of strings to extract characters from. The 37 | encoder will map any non-ASCII character into the OOV code. 38 | :param wordlen: when `wordlen` is None and an encoder receives an Iterable of 39 | strings, the second dimension in the output array will be as long as the 40 | longest string, otherwise it will be `wordlen` long. In the latter case 41 | words exceeding `wordlen` will be trimmed. In both cases empty-spaces are 42 | filled with zeros. 43 | An encoder raises a ValueError when passed an empty string. 44 | :return: the OOV code, a character mapping representing non-OOV character 45 | encodings, an encoder 46 | """ 47 | if wordlen and wordlen < 1: 48 | raise ValueError('`wordlen` must be positive') 49 | try: 50 | charmap = {char: i + 1 for i, char in enumerate(asciicharset(corpus))} 51 | except TypeError: 52 | raise ValueError('`corpus` can be either a string or an Iterable of ' 53 | 'strings') 54 | if not charmap: 55 | raise ValueError('the `corpus` is empty') 56 | oov = len(charmap) + 1 57 | 58 | def encode_string(string: str) -> np.ndarray: 59 | if not string: 60 | raise ValueError("can't encode empty strings") 61 | return np.fromiter((charmap.get(char, oov) for char in string), np.int32, 62 | len(string)) 63 | 64 | def charencoder(target: Union[str, Iterable[str]]): 65 | if isinstance(target, str): 66 | return encode_string(target) 67 | encoded_strings = list(map(encode_string, target)) 68 | if not encoded_strings: 69 | raise ValueError('there are no `target`s') 70 | return preprocessing.stack( 71 | encoded_strings, [wordlen or -1], np.int32, 0, True)[0] 72 | 73 | return oov, charmap, charencoder 74 | 75 | 76 | def build_wordencoder(embeddings: pd.DataFrame, transform: Callable[[str], str]) \ 77 | -> TextEncoder: 78 | """ 79 | Create a word-level encoder: a Callable, mapping strings into embedding arrays. 80 | Encoders dispatch on input type: if you pass a single string, you will get 81 | a 1D array, if you pass an Iterable of strings, you will get a 2D array, 82 | where row i encodes the i-th string in the Iterable. 83 | :param embeddings: a dataframe of word vectors indexed by words. The last 84 | vector (row) is used to encode OOV words. 85 | :return: 86 | """ 87 | wordmap = {word: i for i, word in enumerate(embeddings.index)} 88 | if not wordmap: 89 | raise ValueError('empty `embeddings`') 90 | if not all(isinstance(word, str) for word in wordmap): 91 | raise ValueError('`embeddings` can be indexed by strings alone') 92 | oov = wordmap[embeddings.index[-1]] 93 | vectors = embeddings.values 94 | 95 | def index(word: str) -> int: 96 | if not word: 97 | raise ValueError("can't encode empty words") 98 | return wordmap.get(transform(word), oov) 99 | 100 | def wordencoder(target: Union[str, Iterable[str]]) -> np.ndarray: 101 | if isinstance(target, str): 102 | return vectors[index(target)] 103 | indices = list(map(index, target)) 104 | if not indices: 105 | raise ValueError('there are no `target`s') 106 | return np.vstack(vectors[indices]) 107 | 108 | return wordencoder 109 | 110 | 111 | def read_glove(path: str) -> pd.DataFrame: 112 | """ 113 | Read Glove embeddings in text format. The file can be compressed.
114 | :param path: 115 | :return: 116 | """ 117 | return pd.read_table( 118 | path, sep=' ', index_col=0, header=None, quoting=csv.QUOTE_NONE, 119 | na_values=None, keep_default_na=False 120 | ).astype(np.float32) 121 | 122 | 123 | def decode_merged_predictions(merged: np.ndarray, bins: Sequence[Sequence[int]], 124 | lengths: Sequence[int]) -> List[Sequence[int]]: 125 | """ 126 | :param merged: merged predictions 127 | :param bins: bins 128 | :param lengths: text lengths 129 | """ 130 | unmerged = unmerge_bins(merged, bins, lengths) 131 | unbined = (F(map, preprocessing.reverse) >> list)(unbin(unmerged, bins)) 132 | return [np.nonzero(anno > 0.5)[0] for anno in unbined] 133 | 134 | 135 | if __name__ == '__main__': 136 | raise RuntimeError 137 | -------------------------------------------------------------------------------- /scilk/corpora/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skoblov-lab/SciLK/b3509b7d3839462ab415e9f2cfd0ad8033f8034d/scilk/corpora/__init__.py -------------------------------------------------------------------------------- /scilk/corpora/chemdner.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Parsers, preprocessors and type annotations for the chemdner dataset. 4 | 5 | """ 6 | 7 | import operator as op 8 | from itertools import groupby 9 | from typing import List, Tuple, Text, Iterable, Iterator 10 | 11 | import pandas as pd 12 | from fn import F 13 | 14 | from scilk.corpora.corpus import TITLE, BODY, Abstract, AbstractAnnotation, \ 15 | AbstractText, AbstractSentenceBorders 16 | from scilk.util.intervals import Interval 17 | 18 | 19 | def parse_abstracts(path: Text) -> List[AbstractText]: 20 | """ 21 | Read chemdner abstracts 22 | :return: list[(abstract id, title, body)] 23 | >>> path = "testdata/abstracts.txt" 24 | >>> abstracts = parse_abstracts(path) 25 | >>> ids = {21826085, 22080034, 22080035, 22080037} 26 | >>> all(id_ in ids for id_, *_ in abstracts) 27 | True 28 | """ 29 | with open(path) as buffer: 30 | parsed_buffer = (line.strip().split('\t') for line in buffer) 31 | return [AbstractText(int(id_), title.rstrip(), body.rstrip()) 32 | for id_, title, body in parsed_buffer] 33 | 34 | 35 | def parse_annotations(path: Text) -> List[AbstractAnnotation]: 36 | # TODO log empty annotations 37 | # TODO more tests 38 | """ 39 | Read chemdner annotations 40 | :param path: path to a CHEMDNER-formatted annotation files 41 | >>> path = "testdata/annotations.txt" 42 | >>> anno = parse_annotations(path) 43 | >>> ids = {21826085, 22080034, 22080035, 22080037} 44 | >>> all(id_ in ids for id_, *_ in anno) 45 | True 46 | >>> nonempty_anno = [id_ for id_, title, _ in anno if title] 47 | >>> nonempty_anno 48 | [22080037] 49 | >>> [len(title) for _, title, _ in anno] 50 | [0, 0, 0, 2] 51 | >>> [len(body) for _, _, body in anno] 52 | [1, 6, 9, 5] 53 | """ 54 | def wrap_interval(record: Tuple[str, str, str, str, str, str]) \ 55 | -> Interval: 56 | _, _, start, stop, text, label = record 57 | return Interval(int(start), int(stop), label) 58 | 59 | def parse_line(line): 60 | id_, src, start, stop, text, label = line.split('\t') 61 | return int(id_), src, int(start), int(stop), text, label 62 | 63 | with open(path) as buffer: 64 | parsed_lines = map(parse_line, map(str.strip, buffer)) 65 | lines_sorted = sorted( 66 | parsed_lines, key=lambda x: (-x[0], x[1], -x[2]), reverse=True) 67 | # separate abstracts 68 | abstract_groups = 
groupby(lines_sorted, op.itemgetter(0)) 69 | # separate parts (title and body) 70 | part_groups = ((id_, groupby(group, op.itemgetter(1))) 71 | for id_, group in abstract_groups) 72 | # filter zero-length intervals and `None`s 73 | wrapper = F(map, wrap_interval) >> (filter, bool) >> list 74 | mapped_parts = ((id_, {part: wrapper(recs) for part, recs in parts}) 75 | for id_, parts in part_groups) 76 | return [AbstractAnnotation(int(id_), 77 | list(parts.get(TITLE, [])), 78 | list(parts.get(BODY, []))) 79 | for id_, parts in mapped_parts] 80 | 81 | 82 | def parse_borders(path: Text) -> List[AbstractSentenceBorders]: 83 | def pack_borders(id_: int, borders_: pd.DataFrame): 84 | src_mapped = { 85 | src: [Interval(*map(int, b_str.split(':'))) for b_str in bs[2]] 86 | for src, bs in borders_.groupby(1) 87 | } 88 | title_borders = src_mapped.get(TITLE, []) 89 | body_borders = src_mapped.get(BODY, []) 90 | return AbstractSentenceBorders(id_, title_borders, body_borders) 91 | 92 | borders = pd.read_csv(path, sep='\t', header=None) 93 | return ([] if not len(borders) else 94 | [pack_borders(id_, bs) for id_, bs in borders.groupby(0)]) 95 | 96 | 97 | def align_abstracts(abstracts: Iterable[AbstractText], 98 | annotations: Iterable[AbstractAnnotation]=None, 99 | borders: Iterable[AbstractSentenceBorders]=None) \ 100 | -> Iterator[Abstract]: 101 | # TODO tests 102 | """ 103 | Align abstracts and annotations (i.e. match abstract ids) 104 | :param abstracts: parsed abstracts (e.g. produces by `read_abstracts`) 105 | :param annotations: parsed annotations (e.g. produces by `read_annotations`) 106 | :return: Iterator[(parsed abstract, parsed annotation)] 107 | """ 108 | def empty_anno(id_: int) -> AbstractAnnotation: 109 | return AbstractAnnotation(id_, [], []) 110 | 111 | def empty_borders(id_: int) -> AbstractSentenceBorders: 112 | return AbstractSentenceBorders(id_, [], []) 113 | 114 | anno_mapping = {anno.id: anno for anno in annotations or []} 115 | borders_mapping = {b.id: b for b in borders or []} 116 | 117 | return ((abstract, 118 | anno_mapping.get(abstract.id, empty_anno(abstract.id)), 119 | borders_mapping.get(abstract.id, empty_borders(abstract.id))) 120 | for abstract in abstracts) 121 | 122 | 123 | def parse(abstracts: str, annotations: str, borders: str) -> List[Abstract]: 124 | return list(align_abstracts(parse_abstracts(abstracts), 125 | parse_annotations(annotations), 126 | parse_borders(borders))) 127 | 128 | 129 | if __name__ == '__main__': 130 | raise RuntimeError 131 | -------------------------------------------------------------------------------- /scilk/corpora/corpus.py: -------------------------------------------------------------------------------- 1 | from numbers import Integral 2 | from typing import Sequence, NamedTuple, Text, Iterable, Tuple, List, \ 3 | Mapping, Optional 4 | from itertools import chain 5 | 6 | from fn import F 7 | 8 | from scilk.util import intervals 9 | 10 | 11 | OTHER = "OTHER" 12 | TITLE = "T" 13 | BODY = "A" 14 | ClassMapping = Mapping[Text, Integral] 15 | LabeledInterval = intervals.Interval[Text] 16 | Annotation = Sequence[LabeledInterval] 17 | SentenceBorders = Sequence[intervals.Interval] 18 | 19 | AbstractText = NamedTuple("Abstract", 20 | [("id", int), ("title", Text), ("body", Text)]) 21 | AbstractAnnotation = NamedTuple("AbstractAnnotation", [("id", int), 22 | ("title", Annotation), 23 | ("body", Annotation)]) 24 | AbstractSentenceBorders = NamedTuple("AbstractSentenceBorders", 25 | [("id", int), ("title", SentenceBorders), 26 | ("body", 
SentenceBorders)]) 27 | Abstract = Tuple[AbstractText, AbstractAnnotation, AbstractSentenceBorders] 28 | # Record: (abstract id, part type, text, annotation, sentence borders) 29 | Record = Tuple[int, Text, Text, Optional[Annotation], Optional[SentenceBorders]] 30 | 31 | 32 | class AnnotationError(ValueError): 33 | pass 34 | 35 | 36 | def records(abstract: Abstract) -> List[Record]: 37 | """ 38 | :return: list[(abstract id, source, text, annotation)] 39 | """ 40 | abstract_id, title, body = abstract[0] 41 | anno_id, title_anno, body_anno = abstract[1] 42 | borders_id, title_borders, body_borders = abstract[2] 43 | if abstract_id != anno_id: 44 | raise AnnotationError("Abstract ids do not match") 45 | return [(abstract_id, TITLE, title, title_anno, title_borders), 46 | (abstract_id, BODY, body, body_anno, body_borders)] 47 | 48 | 49 | def parse_mapping(classmaps: Iterable[str]) -> ClassMapping: 50 | """ 51 | :param classmaps: 52 | :return: 53 | >>> classmaps = ["a:1", "b:1", "c:2"] 54 | >>> parse_mapping(classmaps) == dict(a=1, b=1, c=2) 55 | True 56 | """ 57 | try: 58 | return {cls: int(val) 59 | for cls, val in [classmap.split(":") for classmap in classmaps]} 60 | except ValueError as err: 61 | raise AnnotationError("Badly formatted mapping: {}".format(err)) 62 | 63 | 64 | def flatten_abstracts(abstracts: Iterable[Abstract]) -> \ 65 | List[Tuple[str, List[intervals.Interval], List[intervals.Interval]]]: 66 | """ 67 | Flatten abstracts into a stream of tuples of form (text, annotations, 68 | sentence borders) 69 | :param abstracts: 70 | :return: 71 | """ 72 | return (F(chain.from_iterable) >> list)([ 73 | ((abstract.title, annotations.title, borders.title), 74 | (abstract.body, annotations.body, borders.body)) 75 | for abstract, annotations, borders in abstracts 76 | ]) 77 | 78 | 79 | if __name__ == '__main__': 80 | raise RuntimeError 81 | -------------------------------------------------------------------------------- /scilk/corpora/genia.py: -------------------------------------------------------------------------------- 1 | from itertools import starmap 2 | from typing import Sequence, NamedTuple, Tuple, Iterable, Text, Optional, List, \ 3 | Iterator 4 | from xml.etree import ElementTree as ETree 5 | 6 | import operator as op 7 | import re 8 | from functools import reduce 9 | from pyrsistent import v, pvector 10 | 11 | from scilk.corpora.corpus import AbstractAnnotation, AbstractText, AbstractSentenceBorders, \ 12 | AnnotationError, LabeledInterval, Abstract, SentenceBorders 13 | from scilk.util.intervals import Interval 14 | 15 | ANNO_PATT = re.compile('G#(\w+)') 16 | SENTENCE_TAG = 'sentence' 17 | ANNO_TAG = 'sem' 18 | ARTICLE_TAG = 'article' 19 | 20 | LevelAnnotation = NamedTuple('Annotation', [('level', int), 21 | ('anno', Sequence[Optional[Text]]), 22 | ('terminal', bool)]) 23 | 24 | 25 | def _flatten_sentence(sentence: ETree.Element) \ 26 | -> List[Tuple[Text, Sequence[LevelAnnotation]]]: 27 | # TODO docs 28 | """ 29 | Convert a `sentence` XML Element object into normal text. 
30 | :param sentence: an sentence XML node 31 | :return: a list of strings with corresponding annotations 32 | """ 33 | 34 | def isterminal(element: ETree.Element): 35 | return next(iter(element), None) is None 36 | 37 | def getanno(element: ETree.Element): 38 | return element.get(ANNO_TAG, None) 39 | 40 | stack = [(sentence, iter(sentence), v())] 41 | texts = [sentence.text] 42 | annotations = [stack[0][2]] 43 | while stack: 44 | node, children, anno = stack[-1] 45 | child = next(children, None) 46 | if child is None: 47 | stack.pop() 48 | texts.append(node.tail) 49 | annotations.append(anno[:-1]) 50 | continue 51 | child_anno = anno.append( 52 | LevelAnnotation(len(anno), getanno(child), isterminal(child))) 53 | texts.append(child.text) 54 | annotations.append(child_anno) 55 | stack.append((child, iter(child), child_anno)) 56 | 57 | return list(zip(texts, annotations)) 58 | 59 | 60 | def _segment_borders(texts: Iterable[Text]) -> List[Tuple[int, int]]: 61 | # TODO docs 62 | """ 63 | Returns a list of cummulative start/stop positions for segments in `texts`. 64 | :param texts: a list of strings 65 | :return: list of (start position, stop position) 66 | >>> _segment_borders(['amino acid', 'is any']) == [(0, 10), (10, 16)] 67 | True 68 | """ 69 | 70 | def aggregate_boundaries(boundaries: pvector, text): 71 | return ( 72 | boundaries + [(boundaries[-1][1], boundaries[-1][1] + len(text))] 73 | if boundaries else v((0, len(text))) 74 | ) 75 | 76 | return list(reduce(aggregate_boundaries, texts, v())) 77 | 78 | 79 | def _sentences_borders(sentences: Iterable[ETree.Element]) -> SentenceBorders: 80 | """ 81 | Applies _segment_borders to sentences and corrects intervals to handle 82 | end-of-a-sentence symbol at the ends of sentences in AbstractText returned 83 | by _parse_sentences 84 | :param sentences: Iterable of ETree.Element objects each containing 85 | sentence's text. Correct sentence segmentation is assumed 86 | :return: List of Intervals with sentence borders and no Interval.data. 87 | ~List[Interval[start, stop, None]] 88 | """ 89 | sent_j = [''.join(x.itertext()) for x in sentences] 90 | borders = ((start+i, stop+i) 91 | for i, (start, stop) in enumerate(_segment_borders(sent_j))) 92 | return [Interval(start, stop) for (start, stop), _ in zip(borders, sent_j)] 93 | 94 | 95 | def _parse_sentences(root: ETree.Element) \ 96 | -> Tuple[Text, List[LabeledInterval], SentenceBorders]: 97 | # TODO docs 98 | """ 99 | Get text from `root` Element with given mapping dictionary. 100 | :param root: 101 | :return: joined text along with its annotations 102 | """ 103 | 104 | def wrap_iv(start: int, stop: int, levels: Sequence[LevelAnnotation]) \ 105 | -> LabeledInterval: 106 | """ 107 | Wrap `start`, `stop` and `levels` into an Interval. 
108 | :param start: start position 109 | :param stop: stop position 110 | :param levels: list of annotations 111 | :return: Interval(`start`, `stop`, mappings) 112 | """ 113 | # get the first nonempty annotation bottom to top 114 | anno = next(filter(bool, (l.anno for l in reversed(levels))), '') 115 | codes = set(ANNO_PATT.findall(anno)) 116 | if not len(codes) == 1: 117 | raise AnnotationError( 118 | 'The annotation is either ambiguous or empty: {}'.format(codes)) 119 | return Interval(start, stop, codes.pop()) 120 | 121 | sentences = root.findall(SENTENCE_TAG) 122 | flattened = reduce(op.iadd, map(_flatten_sentence, sentences), []) 123 | texts, annotations = zip(*((txt, anno) for txt, anno in flattened 124 | if txt is not None)) 125 | boundaries = _segment_borders(texts) 126 | intervals = [wrap_iv(start, stop, levels) 127 | for (start, stop), levels in zip(boundaries, annotations) 128 | if levels and levels[-1].terminal] 129 | text = ''.join(texts).replace('\n', ' ').rstrip() 130 | annotation = [iv for iv in intervals if iv] 131 | borders = _sentences_borders(sentences) 132 | return text, annotation, borders 133 | 134 | 135 | def parse(path: Text) -> List[Abstract]: 136 | """ 137 | Extract text from xml file `path`. 138 | :param path: xml file's path 139 | :return: 140 | """ 141 | 142 | def getid(article: ETree.Element) -> int: 143 | raw = article.find('articleinfo').find('bibliomisc').text 144 | return int(raw.replace('MEDLINE:', '')) 145 | 146 | def accumulate_articles(root: ETree.Element) \ 147 | -> Iterator[Tuple[int, ETree.Element, ETree.Element]]: 148 | """ 149 | Collects articles inside `root`. 150 | :param root: 151 | :return: 152 | """ 153 | articles_ = root.findall(ARTICLE_TAG) 154 | ids = map(getid, articles_) 155 | title_roots = [article.find('title') for article in articles_] 156 | body_roots = [article.find('abstract') for article in articles_] 157 | return zip(ids, title_roots, body_roots) 158 | 159 | def parse_article(id_: int, title_root: ETree.Element, 160 | body_root: ETree.Element) \ 161 | -> Tuple[AbstractText, AbstractAnnotation, AbstractSentenceBorders]: 162 | """ 163 | Extract title and body texts from `title_root` and `body_root`. 
164 | :param id_: article's id 165 | :param title_root: 166 | :param body_root: 167 | :return: 168 | """ 169 | title_text, title_anno, title_sent = _parse_sentences(title_root) 170 | body_text, body_anno, body_sent = _parse_sentences(body_root) 171 | abstract = AbstractText(id_, title_text, body_text) 172 | annotation = AbstractAnnotation(id_, title_anno, body_anno) 173 | sent_borders = AbstractSentenceBorders(id_, title_sent, body_sent) 174 | return abstract, annotation, sent_borders 175 | 176 | corpus = ETree.parse(path) 177 | articles = accumulate_articles(corpus) 178 | return list(starmap(parse_article, articles)) 179 | 180 | 181 | if __name__ == '__main__': 182 | raise RuntimeError 183 | -------------------------------------------------------------------------------- /scilk/util/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skoblov-lab/SciLK/b3509b7d3839462ab415e9f2cfd0ad8033f8034d/scilk/util/__init__.py -------------------------------------------------------------------------------- /scilk/util/binning.py: -------------------------------------------------------------------------------- 1 | import operator as op 2 | from itertools import chain 3 | from numbers import Number 4 | from typing import Union, Sequence, Iterable, List, TypeVar, Callable 5 | 6 | import numpy as np 7 | from binpacking import to_constant_bin_number 8 | from fn import F 9 | 10 | from scilk.util import preprocessing 11 | 12 | 13 | T = TypeVar('T') 14 | 15 | 16 | def binpack(nbins: int, weight: Callable[[T], Number], items: Sequence[T]) \ 17 | -> List[List[int]]: 18 | """ 19 | Pack items into n bins while minimising the variance of weight accumulated 20 | in each bin. The function uses a greedy algorithm, which doesn't 21 | guarantee a perfect result. 22 | :param nbins: the number of bins to create 23 | :param weight: a weight function 24 | :param items: items to pack; since the function returns bins packed with 25 | positions inferred from iteration order, iteration over `items` must be 26 | stable for the output to be useful. 27 | :return: a nested list of integers representing positions in `items` 28 | """ 29 | if len(items) < nbins: 30 | raise ValueError('There should be at least `nbins` items') 31 | weighted = [(i, weight(item)) for i, item in enumerate(items)] 32 | return (F(map, F(map, op.itemgetter(0)) >> list) >> list)( 33 | to_constant_bin_number(weighted, nbins, weight_pos=1) 34 | ) 35 | 36 | 37 | def binextract(source: Union[Sequence[T], np.ndarray], bins: Sequence[Sequence[int]]) \ 38 | -> Union[List[List[T]], List[np.ndarray]]: 39 | """ 40 | 'Materialise' bins, i.e. transform a nested list of indices into bins of 41 | source items. See `binpack` for additional info. 42 | :param source: source items 43 | :param bins: a nested sequence of integers - indices referring to objects 44 | from `source` 45 | :return: 46 | """ 47 | if not isinstance(source, (Sequence, np.ndarray)): 48 | raise ValueError('`source` must be either a Sequence or a numpy array') 49 | try: 50 | return ( 51 | [source[bin_] for bin_ in bins] if isinstance(source, np.ndarray) else 52 | [[source[i] for i in bin_] for bin_ in bins] 53 | ) 54 | except IndexError: 55 | raise ValueError('`bins` contain indices outside of the `source` range') 56 | 57 | 58 | def merge_bins(sources: Union[np.ndarray, Sequence[np.ndarray]], 59 | bins: Sequence[Sequence[int]], dtype=None) -> np.ndarray: 60 | """ 61 | Merge sources within bins and stack them on top of each other.
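    A worked sketch (the values below are purely illustrative):
        sources = [np.array([1, 2]), np.array([3]), np.array([4, 5, 6])]
        bins = [[0, 2], [1]]
        merged = merge_bins(sources, bins)
        # row 0 -> concatenation of sources[0] and sources[2]: [1, 2, 4, 5, 6]
        # row 1 -> sources[1] ([3]), padded up to the longest row by preprocessing.stack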
62 | :param sources: a Sequence of source arrays. 63 | :param bins: a Sequence of bins: Sequences of indices referencing 64 | arrays in `sources`. 65 | :param dtype: numpy data type; if None `sources[0].dtype` will be used 66 | instead 67 | :return: a merged arrays 68 | """ 69 | if not len(sources): 70 | raise ValueError('no `sources`') 71 | extracted = ( 72 | F(binextract) >> (map, np.concatenate) >> list 73 | )(sources, bins) 74 | return preprocessing.stack(extracted, None, 75 | dtype=(dtype or sources[0].dtype))[0] 76 | 77 | 78 | def unbin(binned: Iterable[Iterable[T]], bins: Iterable[Iterable[int]]) \ 79 | -> List[T]: 80 | """ 81 | Revert binning: transform a nested Iterable of objects (i.e. objects packed 82 | into bins) into a list of objects ordered the same way as the original 83 | Sequence 84 | :param binned: a nested Iterable of binned objects 85 | :param bins: a nested Iterable of bins: Iterables of indices referencing 86 | objects in the original Sequence 87 | :return: 88 | """ 89 | return (F(map, chain.from_iterable) >> 90 | (lambda x: zip(*x)) >> 91 | F(sorted, key=op.itemgetter(0)) >> 92 | (map, op.itemgetter(1)) >> list)([bins, binned]) 93 | 94 | 95 | def unmerge_bins(merged: np.ndarray, bins: Sequence[Sequence[int]], 96 | lengths: Sequence[int]) -> List[List[np.ndarray]]: 97 | """ 98 | Breaks `merged` into binned objects corresponding to the original objects 99 | in a binned Sequence 100 | :param merged: a merged representation of binned data 101 | :param bins: a Sequence of bins: Sequences of indices referencing 102 | :param lengths: lengths of the original source objects 103 | :return: 104 | """ 105 | lengths_ = np.array(lengths) 106 | indices = [lengths_[bin_] for bin_ in bins] 107 | return [list(np.split(line, np.cumsum(l_indices)))[:-1] 108 | for line, l_indices in zip(merged, indices)] 109 | 110 | 111 | if __name__ == '__main__': 112 | raise RuntimeError 113 | -------------------------------------------------------------------------------- /scilk/util/intervals.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from typing import TypeVar, Container, Generic, Optional, Sequence, Iterable, \ 3 | List, Iterator, Union, overload 4 | from numbers import Number 5 | 6 | import numpy as np 7 | 8 | _slots_supported = (sys.version_info >= (3, 6, 2) or 9 | (3, 5, 3) <= sys.version_info < (3, 6)) 10 | T = TypeVar("T") 11 | 12 | 13 | class Interval(Container, Generic[T]): 14 | 15 | if _slots_supported: 16 | __slots__ = ("start", "stop", "data") 17 | 18 | def __init__(self, start: int, stop: int, data: Optional[T]=None): 19 | self.start = start 20 | self.stop = stop 21 | self.data = data 22 | 23 | def __contains__(self, item: T) -> bool: 24 | return False if self.data is None or item is None else self.data == item 25 | 26 | def __iter__(self): 27 | return iter(range(self.start, self.stop)) 28 | 29 | def __eq__(self, other: 'Interval'): 30 | return (self.start, self.stop, self.data) == (other.start, other.stop, other.data) 31 | 32 | def __hash__(self): 33 | return hash((self.start, self.stop, self.data)) 34 | 35 | def __len__(self): 36 | return self.stop - self.start 37 | 38 | def __bool__(self): 39 | return bool(len(self)) 40 | 41 | def __and__(self, other: 'Interval') -> 'Interval[List]': 42 | # TODO docs 43 | first, second = sorted([self, other], key=lambda iv: iv.start) 44 | return type(self)(first.start, second.stop, [first.data, second.data]) 45 | 46 | def __repr__(self): 47 | return '{}(start={}, stop={}, 
data={})'.format(type(self).__name__, 48 | self.start, 49 | self.stop, 50 | self.data) 51 | 52 | def reload(self, value: T) -> 'Interval[T]': 53 | return type(self)(self.start, self.stop, value) 54 | 55 | def intersects(self, other: Union['Interval', Number]) -> bool: 56 | if isinstance(other, type(self)): 57 | return (other.start <= self.start < other.stop or 58 | self.start <= other.start < self.stop) 59 | if isinstance(other, Number): 60 | return self.start <= other < self.stop 61 | raise ValueError('method argument `other` must be an instance of {} ' 62 | 'or a Number'.format(type(self).__name__)) 63 | 64 | 65 | def extract(sequence: Sequence[T], ivs: Iterable[Interval], offset=0) \ 66 | -> List[Sequence[T]]: 67 | return [sequence[iv.start-offset:iv.stop-offset] for iv in ivs] 68 | 69 | 70 | def span(ivs: Sequence[Interval]) -> Optional[Interval]: 71 | """ 72 | Intervals must be presorted 73 | :param ivs: 74 | :return: 75 | """ 76 | return Interval(ivs[0].start, ivs[-1].stop) if len(ivs) else None 77 | 78 | 79 | def unload(intervals: Iterable[Interval[T]]) -> Iterator[T]: 80 | return (iv.data for iv in intervals) 81 | 82 | 83 | @overload 84 | def unextract(ivs: Sequence[Interval], extracted: Sequence[Sequence[T]], fill: T) \ 85 | -> Sequence[T]: 86 | pass 87 | 88 | 89 | @overload 90 | def unextract(ivs: Sequence[Interval], extracted: Sequence[np.ndarray], fill) \ 91 | -> Sequence[T]: 92 | pass 93 | 94 | 95 | def unextract(ivs, extracted, fill): 96 | if not len(ivs) or not len(extracted): 97 | return None 98 | if all(isinstance(ext, np.ndarray) for ext in extracted): 99 | return _unextract_arr(ivs, extracted, fill) 100 | if isinstance(extracted, Sequence): 101 | return _unextract_sequence(ivs, extracted, fill) 102 | raise ValueError("Extracted must be either a sequence of numpy arrays or " 103 | "a sequence of Sequence objects") 104 | 105 | 106 | def _unextract_sequence(ivs: Sequence[Interval], 107 | extracted: Sequence[Sequence[T]], 108 | fill: T) -> Sequence[T]: 109 | sorted_ivs = sorted(ivs, key=lambda x: x.start) 110 | res = [fill] * len(span(sorted_ivs)) 111 | offset = sorted_ivs[0].start 112 | for iv, ext in zip(ivs, extracted): 113 | if len(iv) != len(ext): 114 | raise ValueError("Intervals and extracted data are not aligned " 115 | "with respect to length") 116 | for i, val in zip(iv, ext): 117 | res[i-offset] = val 118 | return res 119 | 120 | 121 | def _unextract_arr(ivs: Sequence[Interval], extracted: Sequence[np.ndarray], fill) \ 122 | -> Optional[np.ndarray]: 123 | ndims = set(map(np.ndim, extracted)) 124 | dtypes = set(ext.dtype for ext in extracted) 125 | if not len(ndims) == len(dtypes) == 1: 126 | raise ValueError("Arrays must be homogeneous") 127 | if isinstance(fill, np.ndarray) and fill.shape != extracted[0].shape[1:]: 128 | raise ValueError("fill is incompatible with extracted arrays") 129 | sorted_ivs = sorted(ivs, key=lambda x: x.start) 130 | res = np.array([fill]*len(span(sorted_ivs)), dtype=dtypes.pop()) 131 | offset = sorted_ivs[0].start 132 | for iv, ext in zip(ivs, extracted): 133 | if len(iv) != len(ext): 134 | raise ValueError("Intervals and extracted data are not aligned " 135 | "with respect to length") 136 | res[iv.start-offset:iv.stop-offset] = ext 137 | return res 138 | 139 | 140 | if __name__ == "__main__": 141 | raise ValueError 142 | -------------------------------------------------------------------------------- /scilk/util/networks/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/skoblov-lab/SciLK/b3509b7d3839462ab415e9f2cfd0ad8033f8034d/scilk/util/networks/__init__.py -------------------------------------------------------------------------------- /scilk/util/networks/blocks.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Utility functions for creating ChemPred deep learning models and working with 4 | their predictions 5 | 6 | """ 7 | from functools import reduce 8 | from typing import Sequence, Tuple, Optional, Union, Callable 9 | 10 | import numpy as np 11 | from keras import layers, backend as K 12 | 13 | 14 | def cnn(nfilters: Sequence[int], 15 | filter_width: Union[int, Sequence[int]], 16 | dropout: Union[Optional[float], Sequence[Optional[float]]]=None, 17 | padding: Union[str, Sequence[str]]='same', 18 | name_template: str='conv{}') \ 19 | -> Callable: 20 | # TODO extend documentation 21 | # TODO more tests 22 | # TODO make name_template Optional 23 | """ 24 | 25 | :param nfilters: 26 | :param filter_width: 27 | :return: 28 | """ 29 | def stack_conv(prev, param: Tuple[str, int, int, float, str]): 30 | name, nfilt, kern_size, drop_p, pad = param 31 | l = layers.Convolution1D( 32 | nfilt, kern_size, activation='relu', name=name, padding=pad 33 | )(prev) 34 | return layers.Dropout(drop_p)(l) if drop_p else l 35 | 36 | filter_width = (filter_width if isinstance(filter_width, Sequence) else 37 | [filter_width] * len(nfilters)) 38 | dropout = (dropout if isinstance(dropout, Sequence) else 39 | [dropout] * len(nfilters)) 40 | padding = (padding if isinstance(padding, Sequence) and not isinstance(padding, str) 41 | else [padding] * len(nfilters)) 42 | 43 | if not len(nfilters) == len(filter_width) == len(dropout) == len(padding): 44 | raise ValueError('Parameter sequences have different lengths') 45 | 46 | def conv(incomming): 47 | conv_names = (name_template.format(i+1) for i in range(0, len(nfilters))) 48 | parameters = zip(conv_names, nfilters, filter_width, dropout, padding) 49 | cnn = reduce(stack_conv, parameters, incomming) 50 | return cnn 51 | 52 | return conv 53 | 54 | 55 | def rnn(nsteps: Sequence[int], 56 | inp_drop: Optional[Union[float, Sequence[float]]]=None, 57 | rec_drop: Optional[Union[float, Sequence[float]]]=None, 58 | bidirectional: Union[Optional[str], Sequence[Optional[str]]]=None, 59 | stateful=False, layer=layers.LSTM) -> Callable: 60 | # TODO extend documentation 61 | # TODO add name template argument 62 | # TODO tests 63 | """ 64 | :param nsteps: 65 | :param inp_drop: 66 | :param rec_drop: 67 | :param bidirectional: 68 | :param stateful: use stateful RNN-cells 69 | :param layer: a recurrent layer to use 70 | :return: 71 | """ 72 | 73 | def stack_layers(prev, param: Tuple[str, int, float, float, str]): 74 | """ 75 | :param prev: incomming keras layer 76 | :param param: [layer name, steps, input dropout, recurrent dropout, 77 | bidirectional] 78 | """ 79 | name, steps, indrop, recdrop, bidir = param 80 | layer_ = layer(steps, dropout=indrop, recurrent_dropout=recdrop, 81 | return_sequences=True, stateful=stateful) 82 | return (layers.Bidirectional(layer_, bidir) if bidir else layer_)(prev) 83 | 84 | bidir_is_seq = (isinstance(bidirectional, Sequence) 85 | and not isinstance(bidirectional, str)) 86 | bi = (bidirectional if bidir_is_seq else [bidirectional] * len(nsteps)) 87 | inp_drop = (inp_drop if isinstance(inp_drop, Sequence) else 88 | [inp_drop or 0] * len(nsteps)) 89 | rec_drop = (rec_drop if isinstance(rec_drop, Sequence) else 90 | [rec_drop or 0] * 
len(nsteps)) 91 | 92 | if not len(nsteps) == len(rec_drop) == len(inp_drop) == len(bi): 93 | raise ValueError('Parameter sequences have different length') 94 | 95 | def rec(incomming): 96 | rec_names = ('rec{}'.format(i) for i in range(1, len(nsteps) + 1)) 97 | parameters = zip(rec_names, nsteps, inp_drop, rec_drop, bi) 98 | rnn = reduce(stack_layers, parameters, incomming) 99 | return rnn 100 | 101 | return rec 102 | 103 | 104 | def wordemb(nwords: int, vectors: np.ndarray, mask: bool): 105 | # TODO docs 106 | def wordemb(incomming): 107 | emb = layers.embeddings.Embedding(input_dim=nwords, 108 | output_dim=vectors.shape[-1], 109 | mask_zero=mask, 110 | weights=[vectors])(incomming) 111 | return emb 112 | 113 | return wordemb 114 | 115 | 116 | def charemb(input_dim: int, maxlen: int, embsize: int, nunits: int, 117 | indrop: float, recdrop: float, mask: bool, layer=layers.LSTM): 118 | # TODO docs 119 | def charemb(incomming): 120 | emb = layers.embeddings.Embedding(input_dim=input_dim, 121 | output_dim=embsize, 122 | mask_zero=mask)(incomming) 123 | shape = (K.shape(incomming)[0], maxlen, K.shape(incomming)[2], embsize) 124 | emb = layers.Lambda( 125 | lambda x: K.reshape(x, shape=(-1, shape[-2], embsize)))(emb) 126 | 127 | forward = layer(nunits, 128 | return_state=True, 129 | dropout=indrop, 130 | recurrent_dropout=recdrop)(emb)[-2] 131 | reverse = layer(nunits, 132 | return_state=True, 133 | recurrent_dropout=recdrop, 134 | dropout=indrop, 135 | go_backwards=True)(emb)[-2] 136 | emb = layers.concatenate([forward, reverse], axis=-1) 137 | # shape = (batch size, max sentence length, char hidden size) 138 | embshape = [incomming.shape[0].value or -1, shape[1], 2 * nunits] 139 | return layers.Lambda(lambda x: K.reshape(x, shape=embshape))(emb) 140 | 141 | return charemb 142 | 143 | 144 | if __name__ == '__main__': 145 | raise RuntimeError 146 | -------------------------------------------------------------------------------- /scilk/util/networks/callbacks.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from itertools import starmap 3 | from typing import Sequence, Mapping, Text, Callable, Optional, IO, Any, Iterable 4 | import copy 5 | 6 | import numpy as np 7 | from fn.op import identity 8 | from keras import callbacks 9 | from keras.models import Model 10 | 11 | 12 | class Validator(callbacks.Callback): 13 | modes = ('max', 'min') 14 | 15 | # TODO docs 16 | 17 | def __init__(self, 18 | inputs: Sequence[np.ndarray], 19 | output: np.ndarray, 20 | batchsize: int, 21 | metrics: Mapping[Text, Callable[[np.ndarray, np.ndarray], float]], 22 | transform: Callable[[np.ndarray], np.ndarray]=identity, 23 | monitor: Optional[Text]=None, 24 | mode: Text='max', 25 | prefix: Text=None, 26 | stream: IO=sys.stderr): 27 | """ 28 | :param inputs: 29 | :param output: 30 | :param batchsize: 31 | :param metrics: a mapping between names and functions; the functions 32 | must have the following signature: f(true, predicted) -> float 33 | :param transform: 34 | :param monitor: 35 | :param mode: 36 | :param prefix: 37 | """ 38 | super().__init__() 39 | if mode not in self.modes: 40 | raise ValueError('`mode` must be either "max" or "min"') 41 | if monitor and monitor not in metrics: 42 | raise ValueError('`monitor` is not in metrics') 43 | if monitor and not prefix: 44 | raise ValueError('you must provide a path prefix when monitoring') 45 | self.inputs = inputs 46 | self.output = output 47 | self.epoch = None 48 | self.batchsize = batchsize 49 | self.metrics 
= metrics 50 | self.mode = mode 51 | self.transform = transform 52 | self.monitor = monitor 53 | self.best = float('-inf') if mode == 'max' else float('inf') 54 | self.prefix = prefix 55 | self.stream = stream 56 | 57 | def _estimate_metrics(self): 58 | pred = self.transform(self.model.predict(self.inputs, self.batchsize)) 59 | return {name: f(self.output, pred) for name, f in self.metrics.items()} 60 | 61 | @staticmethod 62 | def _format_score_log(scores: Mapping[Text, float]): 63 | template = '{} - {:.3f}' 64 | return " | ".join(starmap(template.format, scores.items())) 65 | 66 | def _improved(self, score: float): 67 | return score > self.best if self.mode == 'max' else score < self.best 68 | 69 | def on_epoch_end(self, epoch, logs=None): 70 | self.epoch = epoch 71 | scores = self._estimate_metrics() 72 | log = self._format_score_log(scores) 73 | print("\n" + log, file=self.stream) 74 | if self.monitor and self._improved(scores[self.monitor]): 75 | path = '{}-{:02d}-{:.3f}.hdf5'.format(self.prefix, self.epoch, scores[self.monitor]) 76 | print('{} improved from {} to {}; saving weights to {}'.format( 77 | self.monitor, self.best, scores[self.monitor], path), 78 | end='\n\n', file=self.stream) 79 | self.best = scores[self.monitor] 80 | self.model.save_weights(path) 81 | elif self.monitor: 82 | print("{} didn't improve".format(self.monitor), end='\n\n', file=self.stream) 83 | self.stream.flush() 84 | 85 | 86 | class Caller(callbacks.Callback): 87 | 88 | def __init__(self, callables: Mapping[str, Iterable[Callable[[Model], Any]]]): 89 | """ 90 | Call some callables on epoch/batch end/begin. Valid dictionary keys: 91 | - on_batch_begin 92 | - on_batch_end 93 | - on_epoch_begin 94 | - on_epoch_end 95 | """ 96 | super().__init__() 97 | self.callables = {key: list(val) for key, val in callables.items()} 98 | 99 | def call(self, when): 100 | for f in self.callables.get(when, []): 101 | f(self.model) 102 | 103 | def on_batch_begin(self, batch, logs=None): 104 | self.call('on_batch_begin') 105 | 106 | def on_batch_end(self, batch, logs=None): 107 | self.call('on_batch_end') 108 | 109 | def on_epoch_begin(self, epoch, logs=None): 110 | self.call('on_epoch_begin') 111 | 112 | def on_epoch_end(self, epoch, logs=None): 113 | self.call('on_epoch_end') 114 | 115 | 116 | if __name__ == '__main__': 117 | raise RuntimeError 118 | -------------------------------------------------------------------------------- /scilk/util/networks/metrics.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | 4 | """ 5 | 6 | from keras import backend as K 7 | 8 | 9 | def precision(y_true, y_pred): 10 | true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1))) 11 | predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1))) 12 | precision = true_positives / (predicted_positives + K.epsilon()) 13 | return precision 14 | 15 | 16 | def recall(y_true, y_pred): 17 | true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1))) 18 | possible_positives = K.sum(K.round(K.clip(y_true, 0, 1))) 19 | recall = true_positives / (possible_positives + K.epsilon()) 20 | return recall 21 | 22 | 23 | def fbeta_score(y_true, y_pred, beta): 24 | """ 25 | Calculates the F score, the weighted harmonic mean of precision and recall. 26 | 27 | This is useful for multi-label classification, where input samples can be 28 | classified as sets of labels. By only using accuracy (precision) a model 29 | would achieve a perfect score by simply assigning every class to every 30 | input. 
In order to avoid this, a metric should penalize incorrect class
31 |     assignments as well (recall). The F-beta score (ranging from 0.0 to 1.0)
32 |     computes this as a weighted mean of the proportion of correct class
33 |     assignments vs. the proportion of incorrect class assignments.
34 | 
35 |     With beta = 1, this is equivalent to an F-measure. With beta < 1, assigning
36 |     correct classes becomes more important, and with beta > 1 the metric is
37 |     instead weighted towards penalizing incorrect class assignments.
38 |     """
39 |     if beta < 0:
40 |         raise ValueError('The lowest allowed beta is zero (precision only).')
41 | 
42 |     # If there are no true positives, fix the F score at 0 like sklearn.
43 |     if K.sum(K.round(K.clip(y_true, 0, 1))) == 0:
44 |         return 0
45 | 
46 |     p = precision(y_true, y_pred)
47 |     r = recall(y_true, y_pred)
48 |     bb = beta ** 2
49 |     fbeta_score = (1 + bb) * (p * r) / (bb * p + r + K.epsilon())
50 |     return fbeta_score
51 | 
52 | 
53 | def fmeasure(y_true, y_pred):
54 |     """
55 |     Calculates the F-measure, the harmonic mean of precision and recall.
56 |     """
57 |     return fbeta_score(y_true, y_pred, beta=1)
58 | 
59 | 
60 | def recall_softmax(y_true, y_pred):
61 |     labels_true = K.argmax(y_true, axis=-1)
62 |     labels_pred = K.argmax(y_pred, axis=-1)
63 |     positive_true = K.cast(K.equal(labels_true, 1), dtype=K.floatx())
64 |     positive_pred = K.cast(K.equal(labels_pred, 1), dtype=K.floatx())
65 |     true_positives = K.sum(positive_true * positive_pred) + K.epsilon()
66 |     return true_positives / (K.sum(positive_true) + K.epsilon())
67 | 
68 | 
69 | def precision_softmax(y_true, y_pred):
70 |     labels_true = K.argmax(y_true, axis=-1)
71 |     labels_pred = K.argmax(y_pred, axis=-1)
72 |     positive_true = K.cast(K.equal(labels_true, 1), dtype=K.floatx())
73 |     positive_pred = K.cast(K.equal(labels_pred, 1), dtype=K.floatx())
74 |     true_positives = K.sum(positive_true * positive_pred) + K.epsilon()
75 |     return true_positives / (K.sum(positive_pred) + K.epsilon())
76 | 
77 | 
78 | def fmeasure_softmax(y_true, y_pred):
79 |     p = precision_softmax(y_true, y_pred)
80 |     r = recall_softmax(y_true, y_pred)
81 |     return 2 * p * r / (p + r + K.epsilon())
82 | 
83 | 
84 | if __name__ == "__main__":
85 |     raise RuntimeError
86 | 
--------------------------------------------------------------------------------
/scilk/util/networks/wrappers.py:
--------------------------------------------------------------------------------
1 | from keras import layers
2 | from keras.layers import wrappers
3 | from keras import backend as K
4 | import copy
5 | 
6 | 
7 | class HalfStatefulBidirectional(wrappers.Wrapper):
8 |     """
9 |     Unlike the built-in keras.wrappers.Bidirectional, this wrapper only makes
10 |     the forward reading layer stateful if an incoming layer is stateful. The
11 |     backwards reading layer is always stateless, because it makes no sense to
12 |     transfer state between batches evolving forward in time in a reversed
13 |     layer.
14 |     """
15 |     def __init__(self, layer: layers.RNN, merge_mode='concat', weights=None, **kwargs):
16 |         super().__init__(layer, **kwargs)
17 |         if merge_mode not in ['sum', 'mul', 'ave', 'concat', None]:
18 |             raise ValueError('Invalid merge mode. 
' 19 | 'Merge mode should be one of ' 20 | '{"sum", "mul", "ave", "concat", None}') 21 | config = layer.get_config() 22 | forward_conf = {**config, 'go_backwards': False} 23 | backward_conf = {**config, 'go_backwards': True, 'stateful': False} 24 | self.forward_layer = layer.__class__.from_config(forward_conf) 25 | self.backward_layer = layer.__class__.from_config(backward_conf) 26 | self.forward_layer.name = 'forward_' + self.forward_layer.name 27 | self.backward_layer.name = 'backward_' + self.backward_layer.name 28 | self.merge_mode = merge_mode 29 | if weights: 30 | self.forward_layer.initial_weights = weights[:len(weights) // 2] 31 | self.backward_layer.initial_weights = weights[len(weights) // 2:] 32 | self.stateful = layer.stateful 33 | self.return_sequences = layer.return_sequences 34 | self.return_state = layer.return_state 35 | self.supports_masking = True 36 | 37 | def get_weights(self): 38 | return self.forward_layer.get_weights() + self.backward_layer.get_weights() 39 | 40 | def set_weights(self, weights): 41 | self.forward_layer.set_weights(weights[:len(weights) // 2]) 42 | self.backward_layer.set_weights(weights[len(weights) // 2:]) 43 | 44 | def compute_output_shape(self, input_shape): 45 | output_shape = self.forward_layer.compute_output_shape(input_shape) 46 | if self.return_state: 47 | state_shape = output_shape[1:] 48 | output_shape = output_shape[0] 49 | 50 | if self.merge_mode == 'concat': 51 | output_shape = list(output_shape) 52 | output_shape[-1] *= 2 53 | output_shape = tuple(output_shape) 54 | elif self.merge_mode is None: 55 | output_shape = [output_shape, copy.copy(output_shape)] 56 | 57 | if self.return_state: 58 | if self.merge_mode is None: 59 | return output_shape + state_shape + copy.copy(state_shape) 60 | return [output_shape] + state_shape + copy.copy(state_shape) 61 | return output_shape 62 | 63 | def call(self, inputs, training=None, mask=None, initial_state=None): 64 | kwargs = {} 65 | if wrappers.has_arg(self.layer.call, 'training'): 66 | kwargs['training'] = training 67 | if wrappers.has_arg(self.layer.call, 'mask'): 68 | kwargs['mask'] = mask 69 | 70 | if initial_state is not None and wrappers.has_arg(self.layer.call, 'initial_state'): 71 | if not isinstance(initial_state, list): 72 | raise ValueError( 73 | 'When passing `initial_state` to a Bidirectional RNN, the state ' 74 | 'should be a list containing the states of the underlying RNNs. 
' 75 | 'Found: ' + str(initial_state)) 76 | forward_state = initial_state[:len(initial_state) // 2] 77 | y = self.forward_layer.call(inputs, initial_state=forward_state, **kwargs) 78 | y_rev = self.backward_layer.call(inputs, **kwargs) 79 | else: 80 | y = self.forward_layer.call(inputs, **kwargs) 81 | y_rev = self.backward_layer.call(inputs, **kwargs) 82 | 83 | if self.return_state: 84 | states = y[1:] + y_rev[1:] 85 | y = y[0] 86 | y_rev = y_rev[0] 87 | 88 | if self.return_sequences: 89 | y_rev = K.reverse(y_rev, 1) 90 | if self.merge_mode == 'concat': 91 | output = K.concatenate([y, y_rev]) 92 | elif self.merge_mode == 'sum': 93 | output = y + y_rev 94 | elif self.merge_mode == 'ave': 95 | output = (y + y_rev) / 2 96 | elif self.merge_mode == 'mul': 97 | output = y * y_rev 98 | elif self.merge_mode is None: 99 | output = [y, y_rev] 100 | 101 | # Properly set learning phase 102 | if (getattr(y, '_uses_learning_phase', False) or 103 | getattr(y_rev, '_uses_learning_phase', False)): 104 | if self.merge_mode is None: 105 | for out in output: 106 | out._uses_learning_phase = True 107 | else: 108 | output._uses_learning_phase = True 109 | 110 | if self.return_state: 111 | if self.merge_mode is None: 112 | return output + states 113 | return [output] + states 114 | return output 115 | 116 | def reset_states(self): 117 | self.forward_layer.reset_states() 118 | 119 | def build(self, input_shape): 120 | with K.name_scope(self.forward_layer.name): 121 | self.forward_layer.build(input_shape) 122 | with K.name_scope(self.backward_layer.name): 123 | self.backward_layer.build(input_shape) 124 | self.built = True 125 | 126 | def compute_mask(self, inputs, mask): 127 | if self.return_sequences: 128 | if not self.merge_mode: 129 | return [mask, mask] 130 | else: 131 | return mask 132 | else: 133 | return None 134 | 135 | @property 136 | def trainable_weights(self): 137 | if hasattr(self.forward_layer, 'trainable_weights'): 138 | return (self.forward_layer.trainable_weights + 139 | self.backward_layer.trainable_weights) 140 | return [] 141 | 142 | @property 143 | def non_trainable_weights(self): 144 | if hasattr(self.forward_layer, 'non_trainable_weights'): 145 | return (self.forward_layer.non_trainable_weights + 146 | self.backward_layer.non_trainable_weights) 147 | return [] 148 | 149 | @property 150 | def updates(self): 151 | if hasattr(self.forward_layer, 'updates'): 152 | return self.forward_layer.updates + self.backward_layer.updates 153 | return [] 154 | 155 | @property 156 | def losses(self): 157 | if hasattr(self.forward_layer, 'losses'): 158 | return self.forward_layer.losses + self.backward_layer.losses 159 | return [] 160 | 161 | @property 162 | def constraints(self): 163 | constraints = {} 164 | if hasattr(self.forward_layer, 'constraints'): 165 | constraints.update(self.forward_layer.constraints) 166 | constraints.update(self.backward_layer.constraints) 167 | return constraints 168 | 169 | def get_config(self): 170 | return {**super().get_config(), 'merge_mode': self.merge_mode} 171 | 172 | 173 | if __name__ == '__main__': 174 | raise RuntimeError 175 | -------------------------------------------------------------------------------- /scilk/util/patterns.py: -------------------------------------------------------------------------------- 1 | import re 2 | from functools import reduce 3 | from typing import Iterable, Tuple, Pattern, Union, Text, Callable, List 4 | 5 | from scilk.util.intervals import Interval 6 | 7 | numeric = re.compile('[0-9]*\.?[0-9]+') 8 | wordlike = re.compile('\w+') 9 | 
misc = re.compile('[^\s\w]') 10 | 11 | 12 | def ptransform(transformations: Iterable[Tuple[Pattern, Union[Text, Callable]]], 13 | text: Text) -> Text: 14 | """ 15 | Pattern transform. The patterns are applied in iteration order with no 16 | intermediate masking. 17 | :param transformations: pairs of patterns and replacements (refer to 18 | `re.sub`'s documentation for more information on possible replacements); 19 | :param text: text to transform 20 | :return: transformed text 21 | """ 22 | return reduce(lambda s, t: t[0].sub(t[1], s), transformations, text) 23 | 24 | 25 | def ptokenise(patterns: List[Pattern], text: Text, mask=' ') \ 26 | -> List[Interval[Text]]: 27 | """ 28 | Return intervals matched by `patterns`. The patterns are applied 29 | in iteration order. Before applying pattern `i+1`, the function replaces 30 | each region `r` matched by pattern `i` with `mask * len(r)`. This means 31 | the output might be sensitive to pattern order. 32 | :param patterns: a list of patterns to search for 33 | :param text: a unicode string 34 | :param mask: the masking value 35 | :return: a list of intervals storing the corresponding string 36 | """ 37 | def repl(match) -> Text: 38 | return mask * (match.end() - match.start()) 39 | 40 | def match_mask(acc: Tuple[List[Tuple[int, int]], Text], 41 | patt: Pattern) -> Tuple[List[Tuple[int, int]], Text]: 42 | spans, s = acc 43 | spans.extend(m.span() for m in patt.finditer(s)) 44 | return spans, patt.sub(repl, s) 45 | 46 | return [Interval(start, stop, text[start:stop]) for start, stop in 47 | sorted(reduce(match_mask, patterns, ([], text))[0])] 48 | 49 | 50 | if __name__ == '__main__': 51 | raise RuntimeError 52 | -------------------------------------------------------------------------------- /scilk/util/preprocessing.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | 4 | 5 | """ 6 | 7 | 8 | import operator as op 9 | from itertools import chain, repeat, count 10 | from math import ceil 11 | from typing import List, Tuple, Optional, TypeVar, Sequence 12 | 13 | import numpy as np 14 | from fn import F 15 | 16 | 17 | T = TypeVar('T') 18 | 19 | 20 | homogenous = F(map) >> set >> len >> F(op.contains, [0, 1]) 21 | flatmap = F(map) >> chain.from_iterable 22 | strictmap = F(map) >> list 23 | 24 | 25 | def flatzip(flat, nested): 26 | flatrep = map(F(map, repeat), flat) 27 | iterables = (*flatrep, *nested) 28 | return (F(zip) >> F(map, lambda x: zip(*x)) >> chain.from_iterable)(*iterables) 29 | 30 | 31 | def maxshape(arrays: Sequence[np.ndarray]) -> Tuple[int]: 32 | """ 33 | :param arrays: a nonempty sequence of arrays; the sequence must be 34 | homogeneous with respect to dimensionality. 35 | :raises ValueError: if `arrays` sequence is empty; if arrays have different 36 | dimensionality. 37 | """ 38 | if not arrays: 39 | raise ValueError('`arrays` should not be empty') 40 | if not homogenous(np.ndim, arrays): 41 | raise ValueError('`arrays` must have homogeneous dimensionality') 42 | return tuple(np.array([array.shape for array in arrays]).max(axis=0)) 43 | 44 | 45 | def stack(arrays: Sequence[np.ndarray], shape: Optional[Sequence[int]], dtype, 46 | filler=0, trim=False) -> Tuple[np.ndarray, np.ndarray]: 47 | """ 48 | Stack N-dimensional arrays with variable sizes across dimensions. 49 | :param arrays: a nonempty sequence of arrays; the sequence must be 50 | homogeneous with respect to dimensionality. 51 | :param shape: target shape to broadcast each array to. 
The shape must 52 | specify one integer per dimension – the output will thus have shape 53 | `[len(arrays), *shape]`. If None the function will infer the maximal size 54 | per dimension from `arrays`. To infer size for individual dimension(s) 55 | use -1. 56 | :param dtype: output data type 57 | :param filler: a value to fill in the empty space. 58 | :param trim: trim arrays to fit the `shape`. 59 | :raises ValueError: if `len(shape)` doesn't match the dimensionality of 60 | arrays in `arrays`; if an array can't be broadcasted to `shape` without 61 | trimming, while trimming is disabled; + all cases specified in function 62 | `maxshape` 63 | :return: stacked arrays, a boolean mask (empty positions are False). 64 | >>> from random import choice 65 | >>> maxlen = 100 66 | >>> ntests = 10000 67 | >>> lengths = range(10, maxlen+1, 2) 68 | >>> arrays = [ 69 | ... np.random.randint(0, 127, size=choice(lengths)).reshape((2, -1)) 70 | ... for _ in range(ntests) 71 | ... ] 72 | >>> stacked, masks = stack(arrays, [-1, maxlen], np.int) 73 | >>> all((arr.flatten() == s[m].flatten()).all() 74 | ... for arr, s, m in zip(arrays, stacked, masks)) 75 | True 76 | >>> stacked, masks = stack(arrays, [2, -1], np.int) 77 | >>> all((arr.flatten() == s[m].flatten()).all() 78 | ... for arr, s, m in zip(arrays, stacked, masks)) 79 | True 80 | """ 81 | def slices(limits: Tuple[int], array: np.ndarray) -> List[slice]: 82 | stops = [min(limit, size) for limit, size in zip(limits, array.shape)] 83 | return [slice(0, stop) for stop in stops] 84 | 85 | if not isinstance(arrays, Sequence): 86 | raise ValueError('`arrays` must be a Sequence object') 87 | ndim = arrays[0].ndim 88 | if shape is not None and len(shape) != ndim: 89 | raise ValueError("`shape`'s dimensionality doesn't match that of " 90 | "`arrays`") 91 | if shape is not None and any(s < 1 and s != -1 for s in shape): 92 | raise ValueError('the only allowed non-positive value in `shape` is -1') 93 | # infer size across all dimensions 94 | inferred = np.array(maxshape(arrays)) 95 | # mix inferred and requested sizes where requested 96 | limits = (inferred if shape is None else 97 | np.where(np.array(shape) == -1, inferred, shape)) 98 | # make sure everything fits fine 99 | if not (shape is None or trim or (inferred <= limits).all()): 100 | raise ValueError("can't broadcast all arrays to `shape` without " 101 | "trimming") 102 | stacked = np.full([len(arrays), *limits], filler, dtype=dtype) 103 | mask = np.zeros([len(arrays), *limits], dtype=bool) 104 | for i, arr, slices_ in zip(count(), arrays, map(F(slices, limits), arrays)): 105 | op.setitem(stacked, [i, *slices_], op.getitem(arr, slices_)) 106 | op.setitem(mask, [i, *slices_], True) 107 | stacked[~mask] = filler 108 | return stacked, mask 109 | 110 | 111 | def maskfalse(array: np.ndarray, mask: np.ndarray) -> np.ndarray: 112 | """ 113 | Replace False-masked items with zeros. 
114 |     >>> array = np.arange(10)
115 |     >>> mask = np.random.binomial(1, 0.5, len(array)).astype(bool)
116 |     >>> masked = maskfalse(array, mask)
117 |     >>> (masked[mask] == array[mask]).all()
118 |     True
119 |     >>> (masked[~mask] == 0).all()
120 |     True
121 |     """
122 |     if not np.issubdtype(mask.dtype, np.bool):
123 |         raise ValueError("Masks are supposed to be boolean")
124 |     copy = array.copy()
125 |     copy[~mask] = 0
126 |     return copy
127 | 
128 | 
129 | def chunksteps(size: int, array: np.ndarray, filler=0) -> np.ndarray:
130 |     """
131 |     Chunk time steps, that is, break an array into fixed-size slices along the
132 |     second dimension (array.shape[1]).
133 |     :param size: chunk size
134 |     :param array: an array to chunk. The array must have at least two dimensions
135 |     :param filler: a value to fill in the empty space in the last chunk if
136 |     `array.shape[1] % size != 0`
137 |     :return: the chunks stacked along a new leading dimension
138 |     """
139 |     nchunks = int(ceil(array.shape[1] / size))
140 |     chunks = [array[:, start:start+size] for start in range(0, size*nchunks, size)]
141 |     assert chunks[-1].shape[1] <= size
142 |     if chunks[-1].shape[1] < size:
143 |         chunk = np.full((array.shape[0], size, *array.shape[2:]), filler,
144 |                         dtype=array.dtype)
145 |         chunk[:, :chunks[-1].shape[1]] = chunks[-1]
146 |         chunks[-1] = chunk
147 |     return np.array(chunks)
148 | 
149 | 
150 | reverse = op.itemgetter(slice(None, None, -1))  # reverse a Sequence or an array
151 | 
152 | 
153 | if __name__ == '__main__':
154 |     raise RuntimeError
155 | 
--------------------------------------------------------------------------------
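The padding helpers above are typically chained when batching variable-length sequences. Below is a minimal sketch of how stack, maskfalse and chunksteps compose; the toy token ids and sizes are invented for illustration.

import numpy as np
from scilk.util.preprocessing import stack, maskfalse, chunksteps

# three "sentences" of different lengths, encoded as integer token ids
sentences = [np.array([1, 2, 3]), np.array([4, 5]), np.array([6, 7, 8, 9])]

# pad to the longest sentence (-1 lets stack infer the size) and keep a boolean mask
padded, mask = stack(sentences, [-1], np.int64, filler=0)   # padded.shape == (3, 4)

# zero out whatever falls outside the mask (a no-op here, since filler=0)
cleaned = maskfalse(padded, mask)

# split the batch into windows of two time steps along axis 1
windows = chunksteps(2, cleaned)                            # windows.shape == (2, 3, 2)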
/scilk/util/segments.py:
--------------------------------------------------------------------------------
1 | from typing import Sequence, List, Iterable, Tuple
2 | from itertools import dropwhile
3 | from functools import reduce
4 | import operator as op
5 | 
6 | from fn import F
7 | from fn.iters import splitby, droplast
8 | import numpy as np
9 | 
10 | from .intervals import Interval
11 | 
12 | 
13 | def breakpoints(intervals: Iterable[Interval]) -> List[int]:
14 |     """
15 |     Find breakpoints between intervals.
16 |     :param intervals:
17 |     :return:
18 |     """
19 |     return [iv.stop - 1 for iv in intervals]
20 | 
21 | 
22 | def stitchpoints(intervals: Sequence[Interval], targets: Sequence[Interval]):
23 |     """
24 |     Find breakpoints that have to be stitched in order to recover target
25 |     intervals from finer subintervals. For a set of intervals [iv_1, ..., iv_n]
26 |     that must be stitched to obtain a target t1, the function returns
27 |     [(iv_1).stop-1, ..., (iv_n-1).stop-1]. The function groups all intervals
28 |     intersecting a target together and merges them. Note that an ideal
29 |     reconstruction might not be achievable. In that case it is only guaranteed
30 |     that each merged interval will contain the entire span of the corresponding
31 |     target, but not the other way around.
32 |     :param intervals:
33 |     :param targets:
34 |     :return:
35 |     """
36 |     intervals_ = sorted(intervals, key=lambda iv: iv.start)
37 |     stitched_ = sorted(targets, key=lambda iv: iv.start)
38 |     inbreaks = F(droplast, 1) >> breakpoints
39 | 
40 |     def grouper(acc: Tuple[List[int], Iterable[Interval]], iv: Interval):
41 |         # find breakpoints to stitch
42 |         breaks, ivs = acc
43 |         grouped, remainder = splitby(
44 |             iv.intersects, dropwhile(lambda x: x.stop <= iv.start, ivs)
45 |         )
46 |         return breaks.extend(inbreaks(grouped)) or breaks, list(remainder)
47 | 
48 |     return reduce(grouper, stitched_, ([], intervals_))[0]
49 | 
50 | 
51 | def stitch(intervals: Sequence[Interval], points: Sequence[int]) \
52 |         -> List[Interval]:
53 |     """
54 |     Stitch intervals. If any point in `points` falls into the interval at
55 |     position i, that interval will be stitched to the interval at position i+1.
56 |     :param intervals:
57 |     :param points:
58 |     :return:
59 |     """
60 |     # extract annotations
61 |     ivs = sorted(intervals, key=lambda iv: iv.start)
62 |     length = max(iv.stop for iv in ivs)
63 |     annotations = np.zeros(length, dtype=np.int32)
64 |     annotations[points] = 1
65 |     iv_anno = [annotations[iv.start:iv.stop].any() for iv in ivs]
66 | 
67 |     # group intervals to stitch
68 |     def group(acc: Tuple[List[List[Interval]], bool],
69 |               step: Tuple[Interval, bool]) \
70 |             -> Tuple[List[List[Interval]], bool]:
71 |         groups, takethis = acc
72 |         iv, takenext = step
73 |         if takethis:
74 |             groups[-1].append(iv)
75 |         else:
76 |             groups.append([iv])
77 |         return groups, takenext
78 | 
79 |     grouped = reduce(group, zip(ivs, iv_anno), ([], False))[0]
80 |     # stitch intervals
81 |     return [reduce(op.and_, group) for group in grouped]
82 | 
83 | 
84 | if __name__ == '__main__':
85 |     raise RuntimeError
86 | 
--------------------------------------------------------------------------------
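A quick sketch of how the stitching helpers above fit together: given fine token intervals and a coarser target annotation over the same text, stitchpoints finds the positions that must be glued back and stitch merges the tokens. The offsets below are invented, and the sketch assumes scilk.util.intervals.Interval exposes the intersects/& behaviour that segments.py relies on.

from scilk.util.intervals import Interval
from scilk.util.segments import stitchpoints, stitch

# fine-grained token intervals over the string "New York City"
tokens = [Interval(0, 3, 'New'), Interval(4, 8, 'York'), Interval(9, 13, 'City')]
# one coarser target annotation spanning the whole name
targets = [Interval(0, 13, 'New York City')]

points = stitchpoints(tokens, targets)   # positions to glue, e.g. [2, 7] here
merged = stitch(tokens, points)          # a single interval spanning 0..13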
/setup.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from setuptools import setup, find_packages
3 | 
4 | 
5 | # TODO add loggers and warnings
6 | # TODO lazy module importing (https://github.com/bwesterb/py-demandimport)
7 | 
8 | if sys.version_info < (3, 5, 2):
9 |     print("SciLK requires Python >= 3.5.2")
10 |     sys.exit(1)
11 | 
12 | # from Cython.Build import cythonize
13 | #
14 | # os.environ['CFLAGS'] = '-O3 -Wall'
15 | 
16 | setup(
17 |     name="scilk",
18 |     version="0.1a1",
19 |     packages=find_packages("./"),
20 |     scripts=[],
21 |     install_requires=["numpy",
22 |                       "h5py",
23 |                       "fn",
24 |                       "pyrsistent",
25 |                       "keras",
26 |                       "scikit-learn",
27 |                       "pandas",
28 |                       "hypothesis",
29 |                       "frozendict",
30 |                       "tensorflow", "multipledispatch"]
31 | )
32 | 
--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | from typing import Sequence, Iterable, cast, Mapping
3 | import tempfile
4 | import os
5 | 
6 | import numpy as np
7 | import joblib
8 | from hypothesis import given, note
9 | from hypothesis import settings, strategies as st
10 | 
11 | from scilk.corpora import genia
12 | from scilk.util import intervals
13 | from scilk.collections import _collections
14 | import scilk
15 | 
16 | MAX_TESTS = 1000
17 | 
18 | 
19 | # strategies
20 | 
21 | texts = st.text(st.characters(min_codepoint=32, max_codepoint=255), 0, 500, 1000)
22 | 
23 | 
24 | def loader_caller(collection: _collections.Collection, data=None):
25 | 
26 |     def caller(value: str):
27 |         return collection.translate(value)
28 | 
29 |     return caller
30 | 
31 | 
32 | def loader_translate(collection: _collections.Collection, data: dict):
33 |     mapping = joblib.load(data['mapping'])
34 | 
35 |     def translate(value: str):
36 |         return mapping.get(value)
37 | 
38 |     return translate
39 | 
40 | 
41 | # test cases
42 | 
43 | class TestText(unittest.TestCase):
44 | 
45 |     @staticmethod
46 |     def unparse(txt, intervals_: Sequence[intervals.Interval]):
47 |         if not len(intervals_):
48 |             return ""
49 |         codes = np.repeat([ord(" ")], intervals_[-1].stop)
50 |         for iv in intervals_:
51 |             token = intervals.extract(txt, [iv])[0]
52 |             codes[iv.start:iv.stop] = list(map(ord, token))
53 |         return "".join(map(chr, codes))
54 | 
55 |     # @given(texts)
56 |     # @settings(max_examples=MAX_TESTS)
57 |     # def test_parse_text(self, txt):
58 |     #     parsed = text.tointervals(text.fine_tokeniser, txt)
59 |     #     mod_text = re.sub("\s", " ", txt)
60 |     #     self.assertEqual(self.unparse(txt, parsed), mod_text.rstrip())
61 | 
62 | 
63 | class TestGenia(unittest.TestCase):
64 | 
65 |     @given(st.lists(st.text()))
66 |     @settings(max_examples=MAX_TESTS)
67 |     def test_text_boundaries(self, texts: list):
68 |         """
69 |         Test of the _segment_borders() function.
70 |         :return:
71 |         """
72 |         boundaries = genia._segment_borders(texts)
73 |         note(boundaries)
74 | 
75 |         self.assertTrue(all([boundaries[i][1] == boundaries[i + 1][0] for i in
76 |                              range(len(boundaries) - 1)]))
77 |         self.assertTrue(all([boundaries[i][0] <= boundaries[i][1] for i in
78 |                              range(len(boundaries) - 1)]))
79 |         if boundaries:
80 |             self.assertTrue(boundaries[0][0] == 0)
81 | 
82 | 
83 | class TestCollection(unittest.TestCase):
84 |     def test_collection(self):
85 |         with tempfile.TemporaryDirectory() as dirpath:
86 |             scilk.SCILK_ROOT = dirpath
87 |             mapping = dict(test='OK')
88 |             mapping_path = os.path.join(dirpath, 'mapping.joblib')
89 |             joblib.dump(mapping, mapping_path)
90 |             collection = _collections.Collection()
91 |             collection.add('translate', loader_translate, dict(mapping=mapping_path))
92 |             collection.add('caller', loader_caller)
93 |             self.assertEqual(collection.caller('test'), 'OK')
94 |             collection.save(name='test')
95 |             collection = _collections.Collection.load('test')
96 |             self.assertEqual(collection.caller('test'), 'OK')
97 |             self.assertEqual({'translate', 'caller'}, set(collection.entries))
98 | 
99 | 
100 | if __name__ == '__main__':
101 |     unittest.main()
102 | 
--------------------------------------------------------------------------------
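Putting a few of the pieces above together: the Validator callback from scilk/util/networks/callbacks.py plugs into an ordinary Keras training run and checkpoints weights whenever the monitored validation score improves. This is only a sketch: the validation shapes, the sklearn-based F1 metric and the checkpoint prefix are invented, and model stands for whatever Keras model is being trained.

import numpy as np
from sklearn.metrics import f1_score

from scilk.util.networks.callbacks import Validator

# toy validation split for a binary token tagger (shapes are invented)
x_val = np.random.randint(0, 100, size=(32, 50))
y_val = np.random.randint(0, 2, size=(32, 50, 1))

validator = Validator(
    inputs=[x_val],
    output=y_val,
    batchsize=8,
    metrics={'f1': lambda true, pred: f1_score(true.ravel(), pred.ravel().round())},
    monitor='f1',                    # save weights whenever validation F1 improves
    mode='max',
    prefix='checkpoints/tagger')     # -> e.g. checkpoints/tagger-03-0.812.hdf5

# model.fit(x_train, y_train, epochs=10, callbacks=[validator])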