├── .gitignore ├── LICENSE ├── README.md ├── scilk ├── __init__.py ├── collections │ ├── __init__.py │ ├── _collections.py │ └── common.py ├── corpora │ ├── __init__.py │ ├── chemdner.py │ ├── corpus.py │ └── genia.py └── util │ ├── __init__.py │ ├── binning.py │ ├── intervals.py │ ├── networks │ ├── __init__.py │ ├── blocks.py │ ├── callbacks.py │ ├── metrics.py │ └── wrappers.py │ ├── patterns.py │ ├── preprocessing.py │ └── segments.py ├── setup.py └── test.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by .ignore support plugin (hsz.mobi) 2 | ### macOS template 3 | *.DS_Store 4 | .AppleDouble 5 | .LSOverride 6 | 7 | # Icon must end with two \r 8 | Icon 9 | 10 | 11 | # Thumbnails 12 | ._* 13 | 14 | # Files that might appear in the root of a volume 15 | .DocumentRevisions-V100 16 | .fseventsd 17 | .Spotlight-V100 18 | .TemporaryItems 19 | .Trashes 20 | .VolumeIcon.icns 21 | .com.apple.timemachine.donotpresent 22 | 23 | # Directories potentially created on remote AFP share 24 | .AppleDB 25 | .AppleDesktop 26 | Network Trash Folder 27 | Temporary Items 28 | .apdisk 29 | ### Python template 30 | # Byte-compiled / optimized / DLL files 31 | __pycache__/ 32 | *.py[cod] 33 | *$py.class 34 | 35 | # C extensions 36 | *.so 37 | 38 | # Distribution / packaging 39 | .Python 40 | env/ 41 | build/ 42 | develop-eggs/ 43 | dist/ 44 | downloads/ 45 | eggs/ 46 | .eggs/ 47 | lib/ 48 | lib64/ 49 | parts/ 50 | sdist/ 51 | var/ 52 | wheels/ 53 | *.egg-info/ 54 | .installed.cfg 55 | *.egg 56 | 57 | # PyInstaller 58 | # Usually these files are written by a python script from a template 59 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 60 | *.manifest 61 | *.spec 62 | 63 | # Installer logs 64 | pip-log.txt 65 | pip-delete-this-directory.txt 66 | 67 | # Unit test / coverage reports 68 | htmlcov/ 69 | .tox/ 70 | .coverage 71 | .coverage.* 72 | .cache 73 | nosetests.xml 74 | coverage.xml 75 | *,cover 76 | .hypothesis/ 77 | 78 | # Translations 79 | *.mo 80 | *.pot 81 | 82 | # Django stuff: 83 | *.log 84 | local_settings.py 85 | 86 | # Flask stuff: 87 | instance/ 88 | .webassets-cache 89 | 90 | # Scrapy stuff: 91 | .scrapy 92 | 93 | # Sphinx documentation 94 | docs/_build/ 95 | 96 | # PyBuilder 97 | target/ 98 | 99 | # Jupyter Notebook 100 | .ipynb_checkpoints 101 | 102 | # pyenv 103 | .python-version 104 | 105 | # celery beat schedule file 106 | celerybeat-schedule 107 | 108 | # SageMath parsed files 109 | *.sage.py 110 | 111 | # dotenv 112 | .env 113 | 114 | # virtualenv 115 | .venv 116 | venv/ 117 | ENV/ 118 | 119 | # Spyder project settings 120 | .spyderproject 121 | 122 | # Rope project settings 123 | .ropeproject 124 | ### JetBrains template 125 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm 126 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 127 | 128 | # User-specific stuff: 129 | .idea/**/workspace.xml 130 | .idea/**/tasks.xml 131 | .idea/dictionaries 132 | 133 | # Sensitive or high-churn files: 134 | .idea/**/dataSources/ 135 | .idea/**/dataSources.ids 136 | .idea/**/dataSources.xml 137 | .idea/**/dataSources.local.xml 138 | .idea/**/sqlDataSources.xml 139 | .idea/**/dynamic.xml 140 | .idea/**/uiDesigner.xml 141 | 142 | # Gradle: 143 | .idea/**/gradle.xml 144 | .idea/**/libraries 145 | 146 | # Mongo Explorer plugin: 147 | .idea/**/mongoSettings.xml 148 | 149 | ## File-based project format: 150 | *.iws 151 | 152 | 
## Plugin-specific files: 153 | 154 | # IntelliJ 155 | /out/ 156 | 157 | # mpeltonen/sbt-idea plugin 158 | .idea_modules/ 159 | 160 | # JIRA plugin 161 | atlassian-ide-plugin.xml 162 | 163 | # Crashlytics plugin (for Android Studio and IntelliJ) 164 | com_crashlytics_export_strings.xml 165 | crashlytics.properties 166 | crashlytics-build.properties 167 | fabric.properties 168 | 169 | .idea/ 170 | genia_corpus/ 171 | 172 | ### macOS template 173 | 174 | # Icon must end with two \r 175 | 176 | 177 | # Thumbnails 178 | 179 | # Files that might appear in the root of a volume 180 | 181 | # Directories potentially created on remote AFP share 182 | ### C template 183 | # Prerequisites 184 | *.d 185 | 186 | # Object files 187 | *.o 188 | *.ko 189 | *.obj 190 | *.elf 191 | 192 | # Linker output 193 | *.ilk 194 | *.map 195 | *.exp 196 | 197 | # Precompiled Headers 198 | *.gch 199 | *.pch 200 | 201 | # Libraries 202 | *.lib 203 | *.a 204 | *.la 205 | *.lo 206 | 207 | # Shared objects (inc. Windows DLLs) 208 | *.dll 209 | *.so.* 210 | *.dylib 211 | 212 | # Executables 213 | *.exe 214 | *.out 215 | *.app 216 | *.i*86 217 | *.x86_64 218 | *.hex 219 | 220 | # Debug files 221 | *.dSYM/ 222 | *.su 223 | *.idb 224 | *.pdb 225 | 226 | # Kernel Module Compile Results 227 | *.mod* 228 | *.cmd 229 | modules.order 230 | Module.symvers 231 | Mkfile.old 232 | dkms.conf 233 | ### C++ template 234 | # Prerequisites 235 | 236 | # Compiled Object files 237 | *.slo 238 | 239 | # Precompiled Headers 240 | 241 | # Compiled Dynamic libraries 242 | 243 | # Fortran module files 244 | *.mod 245 | *.smod 246 | 247 | # Compiled Static libraries 248 | *.lai 249 | 250 | # Executables 251 | ### JetBrains template 252 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm 253 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 254 | 255 | # User-specific stuff: 256 | 257 | # Sensitive or high-churn files: 258 | 259 | # Gradle: 260 | 261 | # Mongo Explorer plugin: 262 | 263 | ## File-based project format: 264 | 265 | ## Plugin-specific files: 266 | 267 | # IntelliJ 268 | 269 | # mpeltonen/sbt-idea plugin 270 | 271 | # JIRA plugin 272 | 273 | # Crashlytics plugin (for Android Studio and IntelliJ) 274 | ### Linux template 275 | *~ 276 | 277 | # temporary files which can be created if a process still has a handle open of a deleted file 278 | .fuse_hidden* 279 | 280 | # KDE directory preferences 281 | .directory 282 | 283 | # Linux trash folder which might appear on any partition or disk 284 | .Trash-* 285 | 286 | # .nfs files are created when an open file is removed but is still being accessed 287 | .nfs* 288 | 289 | local/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 skoblov-lab 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SciLK: a Scientific natural Language Toolkit 2 | SciLK (pronounced as "silk") is a natural language toolkit created and 3 | optimised specifically for text-mining applications in natural sciences 4 | (primarily biology and chemistry). As of this moment, this package is purely 5 | experimental and is bound to be unstable for some time to come. Stable published 6 | models will be stored in separate stale branches until the master branch has 7 | matured. The list of such branches: 8 | 9 | - `chemdner-pub` - a text tokeniser and chemical named entity recognition model 10 | trained on the CHEMDNER corpus (publication pending). **Update** We've added Windows support. 11 | -------------------------------------------------------------------------------- /scilk/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | 4 | 5 | """ 6 | 7 | 8 | import pathlib 9 | import sys 10 | import os 11 | 12 | 13 | if sys.version_info < (3, 5, 2): 14 | print("SciLK requires Python >= 3.5.2") 15 | sys.exit(1) 16 | 17 | 18 | SCILK_ROOT = os.path.abspath(os.environ.get('SCILK_ROOT') or 19 | os.path.expanduser('~/.scilk')) 20 | os.makedirs(SCILK_ROOT, exist_ok=True) 21 | 22 | 23 | if __name__ == '__main__': 24 | raise RuntimeError 25 | -------------------------------------------------------------------------------- /scilk/collections/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skoblov-lab/SciLK/b3509b7d3839462ab415e9f2cfd0ad8033f8034d/scilk/collections/__init__.py -------------------------------------------------------------------------------- /scilk/collections/_collections.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | This module is purely experimental. We are going to use it for IO prototyping.
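A Collection groups named entries (models, encoders, supporting data) behind
lazy, attribute-style access: each entry is built on first use by a loader
callable registered via `Collection.add`. A minimal usage sketch (the loader
`load_embeddings` and the file name below are hypothetical; see `Collection.add`
for the loader contract):

    collection = Collection()
    collection.add('embeddings', load_embeddings, data={'vectors': 'glove.txt'})
    collection.save('demo')              # serialised under SCILK_ROOT/demo
    restored = Collection.load('demo')   # entries are re-registered postponed
    restored.embeddings                  # built by its loader on first access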
4 | 5 | """ 6 | 7 | 8 | from typing import Tuple, Mapping, List, Optional, Callable, Any 9 | from importlib import import_module 10 | import joblib 11 | import inspect 12 | import keyword 13 | import copy 14 | import glob 15 | import shutil 16 | import os 17 | 18 | import scilk 19 | 20 | 21 | LOADER_EXT = 'loader' 22 | DATA_EXT = 'entrydata' 23 | COLL_EXT = 'collection' 24 | 25 | 26 | class Collection: 27 | 28 | def __init__(self): 29 | self._entries = {} 30 | self._loaders = {} 31 | self._data = {} 32 | self._status = {} 33 | 34 | def __getattr__(self, entry: str) -> Any: 35 | if entry not in self._loaders: 36 | raise AttributeError('no entry named {}'.format(entry)) 37 | # uninvoked dependencies are False, loading dependencies are None, 38 | # loaded dependencies are True 39 | if self._status[entry] is None: 40 | raise RuntimeError("'{}' was accessed while loading".format(entry)) 41 | if not self._status[entry]: 42 | self._activate_entry(entry) 43 | return self._entries[entry] 44 | 45 | @property 46 | def entries(self) -> List[str]: 47 | return list(self._loaders) 48 | 49 | def add(self, entry: str, loader: Callable[['Collection', Mapping], Any], 50 | data: Optional[Mapping[str, str]]=None, postpone: bool=False): 51 | """ 52 | Add a model to the collection. 53 | :param entry: entry name; it must be a valid python identifier, 54 | because it will be used to access the entry via the attribute lookup 55 | mechanism, i.e. 56 | >>> assert isidentifier(entry) 57 | should pass. 58 | :param loader: a callable responsible for loading an entry. A loader 59 | must accept two arguments: (1) a Collection instance (this would allow 60 | the loader to access other models in the same Collection) and (2) a data 61 | mapping (see argument 'data'); take note that cyclic dependencies 62 | between entries are not allowed and will result in a RuntimeError 63 | error. There are two additional requirements: 64 | - The loader must be defined in an importable module 65 | - The loader must be accessible via its __name__ attribute from the 66 | module's global namespace. 67 | If both requirements are met, the following code will work just fine: 68 | >>> import inspect 69 | >>> from importlib import import_module 70 | >>> module = import_module(inspect.getmodule(loader).__name__) 71 | >>> assert getattr(module, loader.__name__) is loader 72 | The method will try to validate your loader and will raise a ValueError 73 | if the validation fails. 74 | :param data: a Mapping between labels and file paths (symlinks are not 75 | allowed). When a Collection is serialised, all data mappings associated 76 | with underlying entries are copied into the Collection's destination 77 | directory under appropriate subdirectories; nevertheless, all data keys 78 | remain the same and it is thus safe to rely on them in loaders. 79 | :param postpone: do not load the entry at once. This option is useful if 80 | you don't want to work out the correct order of adding entries without 81 | running into missing dependencies. 
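        A sketch of postponed registration (the entry and loader names below
        are hypothetical); a loader may access other entries, e.g.
        `collection.embeddings`, as long as the dependency graph stays acyclic:
            collection.add('embeddings', load_embeddings, postpone=True)
            collection.add('tagger', load_tagger, postpone=True)
            collection.tagger  # 'tagger' and anything it depends on load on first access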
82 | :raises SyntaxError: invalid entry name 83 | :raises ImportError: can't import loader from its module 84 | :raises ValueError: invalid data 85 | """ 86 | if not isidentifier(entry): 87 | raise SyntaxError("'{}' is not a valid identifier".format(entry)) 88 | if not importable(loader): 89 | raise ImportError("can't import the loader from its module") 90 | # check the data mapping 91 | if not (data is None or isinstance(data, Mapping)): 92 | raise ValueError('data argument must be a Mapping instance or None') 93 | if not (data is None or all(map(os.path.isfile, data.values()))): 94 | raise ValueError('all values in data must be valid file paths') 95 | self._loaders[entry] = loader 96 | self._data[entry] = copy.deepcopy(dict(data or {})) 97 | self._status[entry] = False 98 | if not postpone: 99 | self._activate_entry(entry) 100 | 101 | def _activate_entry(self, entry: str): 102 | if self._status[entry]: 103 | raise RuntimeError('trying to reload an entry') 104 | # set entry status to None to show that it is currently loading 105 | self._status[entry] = None 106 | # load the entry 107 | self._entries[entry] = self._loaders[entry](self, self._data[entry]) 108 | # show that the entry is available 109 | self._status[entry] = True 110 | 111 | @classmethod 112 | def load(cls, name: str) -> 'Collection': 113 | """ 114 | Load a serialised Collection from your SciLK root 115 | :param name: Collection's name 116 | :return: a loaded Collection 117 | :raises FileNotFoundError: missing files 118 | :raises ModuleNotFoundError: can't load a loader's module 119 | :raises AttributeError: can't find a loader in its module 120 | """ 121 | collection = cls() 122 | base = os.path.join(scilk.SCILK_ROOT, name) 123 | entries = joblib.load(os.path.join(base, '{}.{}'.format(name, COLL_EXT))) 124 | for entry in entries: 125 | collection.add(entry, *cls._load_entry(base, entry), postpone=True) 126 | return collection 127 | 128 | def save(self, name): 129 | """ 130 | Save a Collection to your SciLK root in a distributable form: 131 | - create a directory named after the Collection under the SciLK root 132 | directory and inflate it with subdirectories named after entries 133 | - save everything necessary to load the entries 134 | - save specifications 135 | :raises FileExistsError: there already is a saved Collection with 136 | identical name 137 | """ 138 | destination = os.path.join(scilk.SCILK_ROOT, name) 139 | try: 140 | os.makedirs(destination) 141 | except FileExistsError: 142 | raise FileExistsError("there is a collection named '{}' in your " 143 | "SciLK root directory".format(name)) 144 | # save individual entries 145 | for entry in self._loaders: 146 | self._save_entry(destination, entry) 147 | # save collection spec to prevent data corruption 148 | collection_spec_path = os.path.join(destination, 149 | '{}.{}'.format(name, COLL_EXT)) 150 | joblib.dump(self.entries, collection_spec_path, 1) 151 | 152 | @staticmethod 153 | def _load_entry(base: str, entry: str) -> Tuple[Callable, Mapping]: 154 | # load data 155 | data_spec_path = os.path.join(base, entry, '{}.{}'.format(entry, DATA_EXT)) 156 | try: 157 | data_spec = joblib.load(data_spec_path) 158 | except FileNotFoundError: 159 | raise FileNotFoundError("missing data for entry '{}'".format(entry)) 160 | data = {k: os.path.join(base, entry, value) for k, value in data_spec.items()} 161 | # load loader 162 | loader_spec_path = os.path.join(base, entry, '{}.{}'.format(entry, LOADER_EXT)) 163 | try: 164 | module, name = joblib.load(loader_spec_path) 165 | 
except FileNotFoundError: 166 | raise FileNotFoundError("missing loader for entry '{}'".format(entry)) 167 | try: 168 | loader = getattr(import_module(module), name) 169 | except ModuleNotFoundError: 170 | raise ModuleNotFoundError("can't import module '{}' to access " 171 | "the loader specified by " 172 | "'{}'".format(module, entry)) 173 | except AttributeError: 174 | raise AttributeError("module '{}' contains no global name " 175 | "'{}' specified as loader in entry " 176 | "'{}'".format(module, name, entry)) 177 | return loader, data 178 | 179 | def _save_entry(self, base, entry): 180 | destination = os.path.join(base, entry) 181 | os.mkdir(destination) 182 | # save data and data spec 183 | data = self._data[entry] 184 | for _, path in data.items(): 185 | shutil.copy(path, os.path.join(destination, os.path.basename(path))) 186 | data_spec = {item: os.path.basename(path) for item, path in data.items()} 187 | data_spec_path = os.path.join(destination, '{}.{}'.format(entry, DATA_EXT)) 188 | joblib.dump(data_spec, data_spec_path, 1) 189 | # save loader spec 190 | loader = self._loaders[entry] 191 | loader_spec = (inspect.getmodule(loader).__name__, loader.__name__) 192 | loader_spec_path = os.path.join(destination, '{}.{}'.format(entry, LOADER_EXT)) 193 | joblib.dump(loader_spec, loader_spec_path, 1) 194 | 195 | 196 | def importable(item) -> bool: 197 | """ 198 | Check whether 'item' is accessible from its module's global namespace under 199 | 'item.__name__'. 200 | :param item: 201 | :return: 202 | """ 203 | try: 204 | module = import_module(inspect.getmodule(item).__name__) 205 | assert getattr(module, item.__name__) is item 206 | except (AssertionError, ImportError, ValueError, AttributeError): 207 | return False 208 | return True 209 | 210 | 211 | def isidentifier(name: str) -> bool: 212 | """ 213 | Determines if string is valid Python identifier. 214 | """ 215 | if not isinstance(name, str): 216 | raise TypeError("expected str, but got {!r}".format(type(name))) 217 | return name.isidentifier() and not keyword.iskeyword(name) 218 | 219 | 220 | if __name__ == '__main__': 221 | raise RuntimeError 222 | -------------------------------------------------------------------------------- /scilk/collections/common.py: -------------------------------------------------------------------------------- 1 | import csv 2 | from itertools import chain 3 | from typing import Sequence, Iterable, TypeVar, List, Tuple, Callable, Mapping, Union 4 | 5 | import numpy as np 6 | from fn import F 7 | import pandas as pd 8 | 9 | from scilk.util import preprocessing 10 | from scilk.util.binning import unbin, unmerge_bins 11 | 12 | T = TypeVar('T') 13 | 14 | TextEncoder = Callable[[Union[str, Iterable[str]]], np.ndarray] 15 | 16 | 17 | def asciicharset(strings: Iterable[str]) -> List[str]: 18 | """ 19 | Return a sorted list of unique ascii characters 20 | :param strings: an iterable of strings to extract characters from 21 | :return: 22 | """ 23 | characters = chain.from_iterable(strings) 24 | return sorted(set(filter(lambda x: ord(x) < 128, characters))) 25 | 26 | 27 | # TODO specify all exception in the docs 28 | 29 | def build_charencoder(corpus: Iterable[str], wordlen: int=None) \ 30 | -> Tuple[int, Mapping[str, int], TextEncoder]: 31 | """ 32 | Create a char-level encoder: a Callable, mapping strings into integer arrays. 
33 | Encoders dispatch on input type: if you pass a single string, you will get 34 | a 1D array, if you pass an Iterable of strings, you will get a 2D array 35 | where row i encodes the i-th string in the Iterable. 36 | :param corpus: an Iterable of strings to extract characters from. The 37 | encoder will map any non-ASCII character into the OOV code. 38 | :param wordlen: when `wordlen` is None and an encoder receives an Iterable of 39 | strings, the second dimension in the output array will be as long as the 40 | longest string, otherwise it will be `wordlen` long. In the latter case 41 | words exceeding `wordlen` will be trimmed. In both cases empty-spaces are 42 | filled with zeros. 43 | An encoder raises a ValueError when passed an empty string. 44 | :return: the OOV code, a character mapping representing non-OOV character 45 | encodings, an encoder 46 | """ 47 | if wordlen and wordlen < 1: 48 | raise ValueError('`wordlen` must be positive') 49 | try: 50 | charmap = {char: i + 1 for i, char in enumerate(asciicharset(corpus))} 51 | except TypeError: 52 | raise ValueError('`corpus` can be either a string or an Iterable of ' 53 | 'strings') 54 | if not charmap: 55 | raise ValueError('the `corpus` is empty') 56 | oov = len(charmap) + 1 57 | 58 | def encode_string(string: str) -> np.ndarray: 59 | if not string: 60 | raise ValueError("can't encode empty strings") 61 | return np.fromiter((charmap.get(char, oov) for char in string), np.int32, 62 | len(string)) 63 | 64 | def charencoder(target: Union[str, Iterable[str]]): 65 | if isinstance(target, str): 66 | return encode_string(target) 67 | encoded_strings = list(map(encode_string, target)) 68 | if not encoded_strings: 69 | raise ValueError('there are no `target`s') 70 | return preprocessing.stack( 71 | encoded_strings, [wordlen or -1], np.int32, 0, True)[0] 72 | 73 | return oov, charmap, charencoder 74 | 75 | 76 | def build_wordencoder(embeddings: pd.DataFrame, transform: Callable[[str], str]) \ 77 | -> TextEncoder: 78 | """ 79 | Create a word-level encoder: a Callable, mapping strings into embedding arrays. 80 | Encoders dispatch on input type: if you pass a single string, you will get 81 | a 1D array, if you pass an Iterable of strings, you will get a 2D array, 82 | where row i encodes the i-th string in the Iterable. 83 | :param embeddings: a dataframe of word vectors indexed by words. The last 84 | vector (row) is used to encode OOV words. 85 | :return: 86 | """ 87 | wordmap = {word: i for i, word in enumerate(embeddings.index)} 88 | if not wordmap: 89 | raise ValueError('empty `embeddings`') 90 | if not all(isinstance(word, str) for word in wordmap): 91 | raise ValueError('`embeddings` can be indexed by strings alone') 92 | oov = wordmap[embeddings.index[-1]] 93 | vectors = embeddings.values 94 | 95 | def index(word: str) -> int: 96 | if not word: 97 | raise ValueError("can't encode empty words") 98 | return wordmap.get(transform(word), oov) 99 | 100 | def wordencoder(target: Union[str, Iterable[str]]) -> np.ndarray: 101 | if isinstance(target, str): 102 | return vectors[index(target)] 103 | indices = list(map(index, target)) 104 | if not indices: 105 | raise ValueError('there are no `target`s') 106 | return np.vstack(vectors[indices]) 107 | 108 | return wordencoder 109 | 110 | 111 | def read_glove(path: str) -> pd.DataFrame: 112 | """ 113 | Read Glove embeddings in text format. The file can be compressed.
114 | :param path: 115 | :return: 116 | """ 117 | return pd.read_table( 118 | path, sep=' ', index_col=0, header=None, quoting=csv.QUOTE_NONE, 119 | na_values=None, keep_default_na=False 120 | ).astype(np.float32) 121 | 122 | 123 | def decode_merged_predictions(merged: np.ndarray, bins: Sequence[Sequence[int]], 124 | lengths: Sequence[int]) -> List[Sequence[int]]: 125 | """ 126 | :param merged: merged predictions 127 | :param bins: bins 128 | :param lengths: text lengths 129 | """ 130 | unmerged = unmerge_bins(merged, bins, lengths) 131 | unbined = (F(map, preprocessing.reverse) >> list)(unbin(unmerged, bins)) 132 | return [np.nonzero(anno > 0.5)[0] for anno in unbined] 133 | 134 | 135 | if __name__ == '__main__': 136 | raise RuntimeError 137 | -------------------------------------------------------------------------------- /scilk/corpora/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skoblov-lab/SciLK/b3509b7d3839462ab415e9f2cfd0ad8033f8034d/scilk/corpora/__init__.py -------------------------------------------------------------------------------- /scilk/corpora/chemdner.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Parsers, preprocessors and type annotations for the chemdner dataset. 4 | 5 | """ 6 | 7 | import operator as op 8 | from itertools import groupby 9 | from typing import List, Tuple, Text, Iterable, Iterator 10 | 11 | import pandas as pd 12 | from fn import F 13 | 14 | from scilk.corpora.corpus import TITLE, BODY, Abstract, AbstractAnnotation, \ 15 | AbstractText, AbstractSentenceBorders 16 | from scilk.util.intervals import Interval 17 | 18 | 19 | def parse_abstracts(path: Text) -> List[AbstractText]: 20 | """ 21 | Read chemdner abstracts 22 | :return: list[(abstract id, title, body)] 23 | >>> path = "testdata/abstracts.txt" 24 | >>> abstracts = parse_abstracts(path) 25 | >>> ids = {21826085, 22080034, 22080035, 22080037} 26 | >>> all(id_ in ids for id_, *_ in abstracts) 27 | True 28 | """ 29 | with open(path) as buffer: 30 | parsed_buffer = (line.strip().split('\t') for line in buffer) 31 | return [AbstractText(int(id_), title.rstrip(), body.rstrip()) 32 | for id_, title, body in parsed_buffer] 33 | 34 | 35 | def parse_annotations(path: Text) -> List[AbstractAnnotation]: 36 | # TODO log empty annotations 37 | # TODO more tests 38 | """ 39 | Read chemdner annotations 40 | :param path: path to a CHEMDNER-formatted annotation files 41 | >>> path = "testdata/annotations.txt" 42 | >>> anno = parse_annotations(path) 43 | >>> ids = {21826085, 22080034, 22080035, 22080037} 44 | >>> all(id_ in ids for id_, *_ in anno) 45 | True 46 | >>> nonempty_anno = [id_ for id_, title, _ in anno if title] 47 | >>> nonempty_anno 48 | [22080037] 49 | >>> [len(title) for _, title, _ in anno] 50 | [0, 0, 0, 2] 51 | >>> [len(body) for _, _, body in anno] 52 | [1, 6, 9, 5] 53 | """ 54 | def wrap_interval(record: Tuple[str, str, str, str, str, str]) \ 55 | -> Interval: 56 | _, _, start, stop, text, label = record 57 | return Interval(int(start), int(stop), label) 58 | 59 | def parse_line(line): 60 | id_, src, start, stop, text, label = line.split('\t') 61 | return int(id_), src, int(start), int(stop), text, label 62 | 63 | with open(path) as buffer: 64 | parsed_lines = map(parse_line, map(str.strip, buffer)) 65 | lines_sorted = sorted( 66 | parsed_lines, key=lambda x: (-x[0], x[1], -x[2]), reverse=True) 67 | # separate abstracts 68 | abstract_groups = 
groupby(lines_sorted, op.itemgetter(0)) 69 | # separate parts (title and body) 70 | part_groups = ((id_, groupby(group, op.itemgetter(1))) 71 | for id_, group in abstract_groups) 72 | # filter zero-length intervals and `None`s 73 | wrapper = F(map, wrap_interval) >> (filter, bool) >> list 74 | mapped_parts = ((id_, {part: wrapper(recs) for part, recs in parts}) 75 | for id_, parts in part_groups) 76 | return [AbstractAnnotation(int(id_), 77 | list(parts.get(TITLE, [])), 78 | list(parts.get(BODY, []))) 79 | for id_, parts in mapped_parts] 80 | 81 | 82 | def parse_borders(path: Text) -> List[AbstractSentenceBorders]: 83 | def pack_borders(id_: int, borders_: pd.DataFrame): 84 | src_mapped = { 85 | src: [Interval(*map(int, b_str.split(':'))) for b_str in bs[2]] 86 | for src, bs in borders_.groupby(1) 87 | } 88 | title_borders = src_mapped.get(TITLE, []) 89 | body_borders = src_mapped.get(BODY, []) 90 | return AbstractSentenceBorders(id_, title_borders, body_borders) 91 | 92 | borders = pd.read_csv(path, sep='\t', header=None) 93 | return ([] if not len(borders) else 94 | [pack_borders(id_, bs) for id_, bs in borders.groupby(0)]) 95 | 96 | 97 | def align_abstracts(abstracts: Iterable[AbstractText], 98 | annotations: Iterable[AbstractAnnotation]=None, 99 | borders: Iterable[AbstractSentenceBorders]=None) \ 100 | -> Iterator[Abstract]: 101 | # TODO tests 102 | """ 103 | Align abstracts and annotations (i.e. match abstract ids) 104 | :param abstracts: parsed abstracts (e.g. produces by `read_abstracts`) 105 | :param annotations: parsed annotations (e.g. produces by `read_annotations`) 106 | :return: Iterator[(parsed abstract, parsed annotation)] 107 | """ 108 | def empty_anno(id_: int) -> AbstractAnnotation: 109 | return AbstractAnnotation(id_, [], []) 110 | 111 | def empty_borders(id_: int) -> AbstractSentenceBorders: 112 | return AbstractSentenceBorders(id_, [], []) 113 | 114 | anno_mapping = {anno.id: anno for anno in annotations or []} 115 | borders_mapping = {b.id: b for b in borders or []} 116 | 117 | return ((abstract, 118 | anno_mapping.get(abstract.id, empty_anno(abstract.id)), 119 | borders_mapping.get(abstract.id, empty_borders(abstract.id))) 120 | for abstract in abstracts) 121 | 122 | 123 | def parse(abstracts: str, annotations: str, borders: str) -> List[Abstract]: 124 | return list(align_abstracts(parse_abstracts(abstracts), 125 | parse_annotations(annotations), 126 | parse_borders(borders))) 127 | 128 | 129 | if __name__ == '__main__': 130 | raise RuntimeError 131 | -------------------------------------------------------------------------------- /scilk/corpora/corpus.py: -------------------------------------------------------------------------------- 1 | from numbers import Integral 2 | from typing import Sequence, NamedTuple, Text, Iterable, Tuple, List, \ 3 | Mapping, Optional 4 | from itertools import chain 5 | 6 | from fn import F 7 | 8 | from scilk.util import intervals 9 | 10 | 11 | OTHER = "OTHER" 12 | TITLE = "T" 13 | BODY = "A" 14 | ClassMapping = Mapping[Text, Integral] 15 | LabeledInterval = intervals.Interval[Text] 16 | Annotation = Sequence[LabeledInterval] 17 | SentenceBorders = Sequence[intervals.Interval] 18 | 19 | AbstractText = NamedTuple("Abstract", 20 | [("id", int), ("title", Text), ("body", Text)]) 21 | AbstractAnnotation = NamedTuple("AbstractAnnotation", [("id", int), 22 | ("title", Annotation), 23 | ("body", Annotation)]) 24 | AbstractSentenceBorders = NamedTuple("AbstractSentenceBorders", 25 | [("id", int), ("title", SentenceBorders), 26 | ("body", 
SentenceBorders)]) 27 | Abstract = Tuple[AbstractText, AbstractAnnotation, AbstractSentenceBorders] 28 | # Record: (abstract id, part type, text, annotation, sentence borders) 29 | Record = Tuple[int, Text, Text, Optional[Annotation], Optional[SentenceBorders]] 30 | 31 | 32 | class AnnotationError(ValueError): 33 | pass 34 | 35 | 36 | def records(abstract: Abstract) -> List[Record]: 37 | """ 38 | :return: list[(abstract id, source, text, annotation)] 39 | """ 40 | abstract_id, title, body = abstract[0] 41 | anno_id, title_anno, body_anno = abstract[1] 42 | borders_id, title_borders, body_borders = abstract[2] 43 | if abstract_id != anno_id: 44 | raise AnnotationError("Abstract ids do not match") 45 | return [(abstract_id, TITLE, title, title_anno, title_borders), 46 | (abstract_id, BODY, body, body_anno, body_borders)] 47 | 48 | 49 | def parse_mapping(classmaps: Iterable[str]) -> ClassMapping: 50 | """ 51 | :param classmaps: 52 | :return: 53 | >>> classmaps = ["a:1", "b:1", "c:2"] 54 | >>> parse_mapping(classmaps) == dict(a=1, b=1, c=2) 55 | True 56 | """ 57 | try: 58 | return {cls: int(val) 59 | for cls, val in [classmap.split(":") for classmap in classmaps]} 60 | except ValueError as err: 61 | raise AnnotationError("Badly formatted mapping: {}".format(err)) 62 | 63 | 64 | def flatten_abstracts(abstracts: Iterable[Abstract]) -> \ 65 | List[Tuple[str, List[intervals.Interval], List[intervals.Interval]]]: 66 | """ 67 | Flatten abstracts into a stream of tuples of form (text, annotations, 68 | sentence borders) 69 | :param abstracts: 70 | :return: 71 | """ 72 | return (F(chain.from_iterable) >> list)([ 73 | ((abstract.title, annotations.title, borders.title), 74 | (abstract.body, annotations.body, borders.body)) 75 | for abstract, annotations, borders in abstracts 76 | ]) 77 | 78 | 79 | if __name__ == '__main__': 80 | raise RuntimeError 81 | -------------------------------------------------------------------------------- /scilk/corpora/genia.py: -------------------------------------------------------------------------------- 1 | from itertools import starmap 2 | from typing import Sequence, NamedTuple, Tuple, Iterable, Text, Optional, List, \ 3 | Iterator 4 | from xml.etree import ElementTree as ETree 5 | 6 | import operator as op 7 | import re 8 | from functools import reduce 9 | from pyrsistent import v, pvector 10 | 11 | from scilk.corpora.corpus import AbstractAnnotation, AbstractText, AbstractSentenceBorders, \ 12 | AnnotationError, LabeledInterval, Abstract, SentenceBorders 13 | from scilk.util.intervals import Interval 14 | 15 | ANNO_PATT = re.compile('G#(\w+)') 16 | SENTENCE_TAG = 'sentence' 17 | ANNO_TAG = 'sem' 18 | ARTICLE_TAG = 'article' 19 | 20 | LevelAnnotation = NamedTuple('Annotation', [('level', int), 21 | ('anno', Sequence[Optional[Text]]), 22 | ('terminal', bool)]) 23 | 24 | 25 | def _flatten_sentence(sentence: ETree.Element) \ 26 | -> List[Tuple[Text, Sequence[LevelAnnotation]]]: 27 | # TODO docs 28 | """ 29 | Convert a `sentence` XML Element object into normal text. 
30 | :param sentence: an sentence XML node 31 | :return: a list of strings with corresponding annotations 32 | """ 33 | 34 | def isterminal(element: ETree.Element): 35 | return next(iter(element), None) is None 36 | 37 | def getanno(element: ETree.Element): 38 | return element.get(ANNO_TAG, None) 39 | 40 | stack = [(sentence, iter(sentence), v())] 41 | texts = [sentence.text] 42 | annotations = [stack[0][2]] 43 | while stack: 44 | node, children, anno = stack[-1] 45 | child = next(children, None) 46 | if child is None: 47 | stack.pop() 48 | texts.append(node.tail) 49 | annotations.append(anno[:-1]) 50 | continue 51 | child_anno = anno.append( 52 | LevelAnnotation(len(anno), getanno(child), isterminal(child))) 53 | texts.append(child.text) 54 | annotations.append(child_anno) 55 | stack.append((child, iter(child), child_anno)) 56 | 57 | return list(zip(texts, annotations)) 58 | 59 | 60 | def _segment_borders(texts: Iterable[Text]) -> List[Tuple[int, int]]: 61 | # TODO docs 62 | """ 63 | Returns a list of cummulative start/stop positions for segments in `texts`. 64 | :param texts: a list of strings 65 | :return: list of (start position, stop position) 66 | >>> _segment_borders(['amino acid', 'is any']) == [(0, 10), (10, 16)] 67 | True 68 | """ 69 | 70 | def aggregate_boundaries(boundaries: pvector, text): 71 | return ( 72 | boundaries + [(boundaries[-1][1], boundaries[-1][1] + len(text))] 73 | if boundaries else v((0, len(text))) 74 | ) 75 | 76 | return list(reduce(aggregate_boundaries, texts, v())) 77 | 78 | 79 | def _sentences_borders(sentences: Iterable[ETree.Element]) -> SentenceBorders: 80 | """ 81 | Applies _segment_borders to sentences and corrects intervals to handle 82 | end-of-a-sentence symbol at the ends of sentences in AbstractText returned 83 | by _parse_sentences 84 | :param sentences: Iterable of ETree.Element objects each containing 85 | sentence's text. Correct sentence segmentation is assumed 86 | :return: List of Intervals with sentence borders and no Interval.data. 87 | ~List[Interval[start, stop, None]] 88 | """ 89 | sent_j = [''.join(x.itertext()) for x in sentences] 90 | borders = ((start+i, stop+i) 91 | for i, (start, stop) in enumerate(_segment_borders(sent_j))) 92 | return [Interval(start, stop) for (start, stop), _ in zip(borders, sent_j)] 93 | 94 | 95 | def _parse_sentences(root: ETree.Element) \ 96 | -> Tuple[Text, List[LabeledInterval], SentenceBorders]: 97 | # TODO docs 98 | """ 99 | Get text from `root` Element with given mapping dictionary. 100 | :param root: 101 | :return: joined text along with its annotations 102 | """ 103 | 104 | def wrap_iv(start: int, stop: int, levels: Sequence[LevelAnnotation]) \ 105 | -> LabeledInterval: 106 | """ 107 | Wrap `start`, `stop` and `levels` into an Interval. 
108 | :param start: start position 109 | :param stop: stop position 110 | :param levels: list of annotations 111 | :return: Interval(`start`, `stop`, mappings) 112 | """ 113 | # get the first nonempty annotation bottom to top 114 | anno = next(filter(bool, (l.anno for l in reversed(levels))), '') 115 | codes = set(ANNO_PATT.findall(anno)) 116 | if not len(codes) == 1: 117 | raise AnnotationError( 118 | 'The annotation is either ambiguous or empty: {}'.format(codes)) 119 | return Interval(start, stop, codes.pop()) 120 | 121 | sentences = root.findall(SENTENCE_TAG) 122 | flattened = reduce(op.iadd, map(_flatten_sentence, sentences), []) 123 | texts, annotations = zip(*((txt, anno) for txt, anno in flattened 124 | if txt is not None)) 125 | boundaries = _segment_borders(texts) 126 | intervals = [wrap_iv(start, stop, levels) 127 | for (start, stop), levels in zip(boundaries, annotations) 128 | if levels and levels[-1].terminal] 129 | text = ''.join(texts).replace('\n', ' ').rstrip() 130 | annotation = [iv for iv in intervals if iv] 131 | borders = _sentences_borders(sentences) 132 | return text, annotation, borders 133 | 134 | 135 | def parse(path: Text) -> List[Abstract]: 136 | """ 137 | Extract text from xml file `path`. 138 | :param path: xml file's path 139 | :return: 140 | """ 141 | 142 | def getid(article: ETree.Element) -> int: 143 | raw = article.find('articleinfo').find('bibliomisc').text 144 | return int(raw.replace('MEDLINE:', '')) 145 | 146 | def accumulate_articles(root: ETree.Element) \ 147 | -> Iterator[Tuple[int, ETree.Element, ETree.Element]]: 148 | """ 149 | Collects articles inside `root`. 150 | :param root: 151 | :return: 152 | """ 153 | articles_ = root.findall(ARTICLE_TAG) 154 | ids = map(getid, articles_) 155 | title_roots = [article.find('title') for article in articles_] 156 | body_roots = [article.find('abstract') for article in articles_] 157 | return zip(ids, title_roots, body_roots) 158 | 159 | def parse_article(id_: int, title_root: ETree.Element, 160 | body_root: ETree.Element) \ 161 | -> Tuple[AbstractText, AbstractAnnotation, AbstractSentenceBorders]: 162 | """ 163 | Extract title and body texts from `title_root` and `body_root`. 
164 | :param id_: article's id 165 | :param title_root: 166 | :param body_root: 167 | :return: 168 | """ 169 | title_text, title_anno, title_sent = _parse_sentences(title_root) 170 | body_text, body_anno, body_sent = _parse_sentences(body_root) 171 | abstract = AbstractText(id_, title_text, body_text) 172 | annotation = AbstractAnnotation(id_, title_anno, body_anno) 173 | sent_borders = AbstractSentenceBorders(id_, title_sent, body_sent) 174 | return abstract, annotation, sent_borders 175 | 176 | corpus = ETree.parse(path) 177 | articles = accumulate_articles(corpus) 178 | return list(starmap(parse_article, articles)) 179 | 180 | 181 | if __name__ == '__main__': 182 | raise RuntimeError 183 | -------------------------------------------------------------------------------- /scilk/util/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skoblov-lab/SciLK/b3509b7d3839462ab415e9f2cfd0ad8033f8034d/scilk/util/__init__.py -------------------------------------------------------------------------------- /scilk/util/binning.py: -------------------------------------------------------------------------------- 1 | import operator as op 2 | from itertools import chain 3 | from numbers import Number 4 | from typing import Union, Sequence, Iterable, List, TypeVar, Callable 5 | 6 | import numpy as np 7 | from binpacking import to_constant_bin_number 8 | from fn import F 9 | 10 | from scilk.util import preprocessing 11 | 12 | 13 | T = TypeVar('T') 14 | 15 | 16 | def binpack(nbins: int, weight: Callable[[T], Number], items: Sequence[T]) \ 17 | -> List[List[int]]: 18 | """ 19 | Pack items into n bins while minimising the variance of weight accumulated 20 | in each bin. The function uses a greedy algorithm, which doesn't 21 | guarantee a perfect result. 22 | :param nbins: the number of bins to create 23 | :param weight: a weight function 24 | :param items: items to pack; since the function returns bins packed with 25 | positions inferred from iteration order, iteration over `items` must be 26 | stable for the output to be useful. 27 | :return: a nested list of integers representing positions in `items` 28 | """ 29 | if len(items) < nbins: 30 | raise ValueError('There should be at least `nbins` items') 31 | weighted = [(i, weight(item)) for i, item in enumerate(items)] 32 | return (F(map, F(map, op.itemgetter(0)) >> list) >> list)( 33 | to_constant_bin_number(weighted, nbins, weight_pos=1) 34 | ) 35 | 36 | 37 | def binextract(source: Union[Sequence[T], np.ndarray], bins: Sequence[Sequence[int]]) \ 38 | -> Union[List[List[T]], List[np.ndarray]]: 39 | """ 40 | 'Materialise' bins, i.e. transform a nested list of indices into bins of 41 | source items. See `binpack` for additional info. 42 | :param source: source items 43 | :param bins: a nested sequence of integers - indices referring to objects 44 | from `source` 45 | :return: 46 | """ 47 | if not isinstance(source, (Sequence, np.ndarray)): 48 | raise ValueError('`source` must be either a Sequence or a numpy array') 49 | try: 50 | return ( 51 | [source[bin_] for bin_ in bins] if isinstance(source, np.ndarray) else 52 | [[source[i] for i in bin_] for bin_ in bins] 53 | ) 54 | except IndexError: 55 | raise ValueError('`bins` contain indices outside of the `source` range') 56 | 57 | 58 | def merge_bins(sources: Union[np.ndarray, Sequence[np.ndarray]], 59 | bins: Sequence[Sequence[int]], dtype=None) -> np.ndarray: 60 | """ 61 | Merge sources within bins and stack them on top of each other.
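    A worked sketch (the values below are purely illustrative):
        sources = [np.array([1, 2]), np.array([3]), np.array([4, 5, 6])]
        bins = [[0, 2], [1]]
        merged = merge_bins(sources, bins)
        # row 0 -> concatenation of sources[0] and sources[2]: [1, 2, 4, 5, 6]
        # row 1 -> sources[1] ([3]), padded up to the longest row by preprocessing.stack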
62 | :param sources: a Sequence of source arrays. 63 | :param bins: a Sequence of bins: Sequences of indices referencing 64 | arrays in `sources`. 65 | :param dtype: numpy data type; if None `sources[0].dtype` will be used 66 | instead 67 | :return: a merged arrays 68 | """ 69 | if not len(sources): 70 | raise ValueError('no `sources`') 71 | extracted = ( 72 | F(binextract) >> (map, np.concatenate) >> list 73 | )(sources, bins) 74 | return preprocessing.stack(extracted, None, 75 | dtype=(dtype or sources[0].dtype))[0] 76 | 77 | 78 | def unbin(binned: Iterable[Iterable[T]], bins: Iterable[Iterable[int]]) \ 79 | -> List[T]: 80 | """ 81 | Revert binning: transform a nested Iterable of objects (i.e. objects packed 82 | into bins) into a list of objects ordered the same way as the original 83 | Sequence 84 | :param binned: a nested Iterable of binned objects 85 | :param bins: a nested Iterable of bins: Iterables of indices referencing 86 | objects in the original Sequence 87 | :return: 88 | """ 89 | return (F(map, chain.from_iterable) >> 90 | (lambda x: zip(*x)) >> 91 | F(sorted, key=op.itemgetter(0)) >> 92 | (map, op.itemgetter(1)) >> list)([bins, binned]) 93 | 94 | 95 | def unmerge_bins(merged: np.ndarray, bins: Sequence[Sequence[int]], 96 | lengths: Sequence[int]) -> List[List[np.ndarray]]: 97 | """ 98 | Breaks `merged` into binned objects corresponding to the original objects 99 | in a binned Sequence 100 | :param merged: a merged representation of binned data 101 | :param bins: a Sequence of bins: Sequences of indices referencing 102 | :param lengths: lengths of the original source objects 103 | :return: 104 | """ 105 | lengths_ = np.array(lengths) 106 | indices = [lengths_[bin_] for bin_ in bins] 107 | return [list(np.split(line, np.cumsum(l_indices)))[:-1] 108 | for line, l_indices in zip(merged, indices)] 109 | 110 | 111 | if __name__ == '__main__': 112 | raise RuntimeError 113 | -------------------------------------------------------------------------------- /scilk/util/intervals.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from typing import TypeVar, Container, Generic, Optional, Sequence, Iterable, \ 3 | List, Iterator, Union, overload 4 | from numbers import Number 5 | 6 | import numpy as np 7 | 8 | _slots_supported = (sys.version_info >= (3, 6, 2) or 9 | (3, 5, 3) <= sys.version_info < (3, 6)) 10 | T = TypeVar("T") 11 | 12 | 13 | class Interval(Container, Generic[T]): 14 | 15 | if _slots_supported: 16 | __slots__ = ("start", "stop", "data") 17 | 18 | def __init__(self, start: int, stop: int, data: Optional[T]=None): 19 | self.start = start 20 | self.stop = stop 21 | self.data = data 22 | 23 | def __contains__(self, item: T) -> bool: 24 | return False if self.data is None or item is None else self.data == item 25 | 26 | def __iter__(self): 27 | return iter(range(self.start, self.stop)) 28 | 29 | def __eq__(self, other: 'Interval'): 30 | return (self.start, self.stop, self.data) == (other.start, other.stop, other.data) 31 | 32 | def __hash__(self): 33 | return hash((self.start, self.stop, self.data)) 34 | 35 | def __len__(self): 36 | return self.stop - self.start 37 | 38 | def __bool__(self): 39 | return bool(len(self)) 40 | 41 | def __and__(self, other: 'Interval') -> 'Interval[List]': 42 | # TODO docs 43 | first, second = sorted([self, other], key=lambda iv: iv.start) 44 | return type(self)(first.start, second.stop, [first.data, second.data]) 45 | 46 | def __repr__(self): 47 | return '{}(start={}, stop={}, 
data={})'.format(type(self).__name__, 48 | self.start, 49 | self.stop, 50 | self.data) 51 | 52 | def reload(self, value: T) -> 'Interval[T]': 53 | return type(self)(self.start, self.stop, value) 54 | 55 | def intersects(self, other: Union['Interval', Number]) -> bool: 56 | if isinstance(other, type(self)): 57 | return (other.start <= self.start < other.stop or 58 | self.start <= other.start < self.stop) 59 | if isinstance(other, Number): 60 | return self.start <= other < self.stop 61 | raise ValueError('method argument `other` must be an instance of {} ' 62 | 'or a Number'.format(type(self).__name__)) 63 | 64 | 65 | def extract(sequence: Sequence[T], ivs: Iterable[Interval], offset=0) \ 66 | -> List[Sequence[T]]: 67 | return [sequence[iv.start-offset:iv.stop-offset] for iv in ivs] 68 | 69 | 70 | def span(ivs: Sequence[Interval]) -> Optional[Interval]: 71 | """ 72 | Intervals must be presorted 73 | :param ivs: 74 | :return: 75 | """ 76 | return Interval(ivs[0].start, ivs[-1].stop) if len(ivs) else None 77 | 78 | 79 | def unload(intervals: Iterable[Interval[T]]) -> Iterator[T]: 80 | return (iv.data for iv in intervals) 81 | 82 | 83 | @overload 84 | def unextract(ivs: Sequence[Interval], extracted: Sequence[Sequence[T]], fill: T) \ 85 | -> Sequence[T]: 86 | pass 87 | 88 | 89 | @overload 90 | def unextract(ivs: Sequence[Interval], extracted: Sequence[np.ndarray], fill) \ 91 | -> Sequence[T]: 92 | pass 93 | 94 | 95 | def unextract(ivs, extracted, fill): 96 | if not len(ivs) or not len(extracted): 97 | return None 98 | if all(isinstance(ext, np.ndarray) for ext in extracted): 99 | return _unextract_arr(ivs, extracted, fill) 100 | if isinstance(extracted, Sequence): 101 | return _unextract_sequence(ivs, extracted, fill) 102 | raise ValueError("Extracted must be either a sequence of numpy arrays or " 103 | "a sequence of Sequence objects") 104 | 105 | 106 | def _unextract_sequence(ivs: Sequence[Interval], 107 | extracted: Sequence[Sequence[T]], 108 | fill: T) -> Sequence[T]: 109 | sorted_ivs = sorted(ivs, key=lambda x: x.start) 110 | res = [fill] * len(span(sorted_ivs)) 111 | offset = sorted_ivs[0].start 112 | for iv, ext in zip(ivs, extracted): 113 | if len(iv) != len(ext): 114 | raise ValueError("Intervals and extracted data are not aligned " 115 | "with respect to length") 116 | for i, val in zip(iv, ext): 117 | res[i-offset] = val 118 | return res 119 | 120 | 121 | def _unextract_arr(ivs: Sequence[Interval], extracted: Sequence[np.ndarray], fill) \ 122 | -> Optional[np.ndarray]: 123 | ndims = set(map(np.ndim, extracted)) 124 | dtypes = set(ext.dtype for ext in extracted) 125 | if not len(ndims) == len(dtypes) == 1: 126 | raise ValueError("Arrays must be homogeneous") 127 | if isinstance(fill, np.ndarray) and fill.shape != extracted[0].shape[1:]: 128 | raise ValueError("fill is incompatible with extracted arrays") 129 | sorted_ivs = sorted(ivs, key=lambda x: x.start) 130 | res = np.array([fill]*len(span(sorted_ivs)), dtype=dtypes.pop()) 131 | offset = sorted_ivs[0].start 132 | for iv, ext in zip(ivs, extracted): 133 | if len(iv) != len(ext): 134 | raise ValueError("Intervals and extracted data are not aligned " 135 | "with respect to length") 136 | res[iv.start-offset:iv.stop-offset] = ext 137 | return res 138 | 139 | 140 | if __name__ == "__main__": 141 | raise ValueError 142 | -------------------------------------------------------------------------------- /scilk/util/networks/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/skoblov-lab/SciLK/b3509b7d3839462ab415e9f2cfd0ad8033f8034d/scilk/util/networks/__init__.py -------------------------------------------------------------------------------- /scilk/util/networks/blocks.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Utility functions for creating ChemPred deep learning models and working with 4 | their predictions 5 | 6 | """ 7 | from functools import reduce 8 | from typing import Sequence, Tuple, Optional, Union, Callable 9 | 10 | import numpy as np 11 | from keras import layers, backend as K 12 | 13 | 14 | def cnn(nfilters: Sequence[int], 15 | filter_width: Union[int, Sequence[int]], 16 | dropout: Union[Optional[float], Sequence[Optional[float]]]=None, 17 | padding: Union[str, Sequence[str]]='same', 18 | name_template: str='conv{}') \ 19 | -> Callable: 20 | # TODO extend documentation 21 | # TODO more tests 22 | # TODO make name_template Optional 23 | """ 24 | 25 | :param nfilters: 26 | :param filter_width: 27 | :return: 28 | """ 29 | def stack_conv(prev, param: Tuple[str, int, int, float, str]): 30 | name, nfilt, kern_size, drop_p, pad = param 31 | l = layers.Convolution1D( 32 | nfilt, kern_size, activation='relu', name=name, padding=pad 33 | )(prev) 34 | return layers.Dropout(drop_p)(l) if drop_p else l 35 | 36 | filter_width = (filter_width if isinstance(filter_width, Sequence) else 37 | [filter_width] * len(nfilters)) 38 | dropout = (dropout if isinstance(dropout, Sequence) else 39 | [dropout] * len(nfilters)) 40 | padding = (padding if isinstance(padding, Sequence) and not isinstance(padding, str) 41 | else [padding] * len(nfilters)) 42 | 43 | if not len(nfilters) == len(filter_width) == len(dropout) == len(padding): 44 | raise ValueError('Parameter sequences have different lengths') 45 | 46 | def conv(incomming): 47 | conv_names = (name_template.format(i+1) for i in range(0, len(nfilters))) 48 | parameters = zip(conv_names, nfilters, filter_width, dropout, padding) 49 | cnn = reduce(stack_conv, parameters, incomming) 50 | return cnn 51 | 52 | return conv 53 | 54 | 55 | def rnn(nsteps: Sequence[int], 56 | inp_drop: Optional[Union[float, Sequence[float]]]=None, 57 | rec_drop: Optional[Union[float, Sequence[float]]]=None, 58 | bidirectional: Union[Optional[str], Sequence[Optional[str]]]=None, 59 | stateful=False, layer=layers.LSTM) -> Callable: 60 | # TODO extend documentation 61 | # TODO add name template argument 62 | # TODO tests 63 | """ 64 | :param nsteps: 65 | :param inp_drop: 66 | :param rec_drop: 67 | :param bidirectional: 68 | :param stateful: use stateful RNN-cells 69 | :param layer: a recurrent layer to use 70 | :return: 71 | """ 72 | 73 | def stack_layers(prev, param: Tuple[str, int, float, float, str]): 74 | """ 75 | :param prev: incomming keras layer 76 | :param param: [layer name, steps, input dropout, recurrent dropout, 77 | bidirectional] 78 | """ 79 | name, steps, indrop, recdrop, bidir = param 80 | layer_ = layer(steps, dropout=indrop, recurrent_dropout=recdrop, 81 | return_sequences=True, stateful=stateful) 82 | return (layers.Bidirectional(layer_, bidir) if bidir else layer_)(prev) 83 | 84 | bidir_is_seq = (isinstance(bidirectional, Sequence) 85 | and not isinstance(bidirectional, str)) 86 | bi = (bidirectional if bidir_is_seq else [bidirectional] * len(nsteps)) 87 | inp_drop = (inp_drop if isinstance(inp_drop, Sequence) else 88 | [inp_drop or 0] * len(nsteps)) 89 | rec_drop = (rec_drop if isinstance(rec_drop, Sequence) else 90 | [rec_drop or 0] * 
len(nsteps)) 91 | 92 | if not len(nsteps) == len(rec_drop) == len(inp_drop) == len(bi): 93 | raise ValueError('Parameter sequences have different length') 94 | 95 | def rec(incomming): 96 | rec_names = ('rec{}'.format(i) for i in range(1, len(nsteps) + 1)) 97 | parameters = zip(rec_names, nsteps, inp_drop, rec_drop, bi) 98 | rnn = reduce(stack_layers, parameters, incomming) 99 | return rnn 100 | 101 | return rec 102 | 103 | 104 | def wordemb(nwords: int, vectors: np.ndarray, mask: bool): 105 | # TODO docs 106 | def wordemb(incomming): 107 | emb = layers.embeddings.Embedding(input_dim=nwords, 108 | output_dim=vectors.shape[-1], 109 | mask_zero=mask, 110 | weights=[vectors])(incomming) 111 | return emb 112 | 113 | return wordemb 114 | 115 | 116 | def charemb(input_dim: int, maxlen: int, embsize: int, nunits: int, 117 | indrop: float, recdrop: float, mask: bool, layer=layers.LSTM): 118 | # TODO docs 119 | def charemb(incomming): 120 | emb = layers.embeddings.Embedding(input_dim=input_dim, 121 | output_dim=embsize, 122 | mask_zero=mask)(incomming) 123 | shape = (K.shape(incomming)[0], maxlen, K.shape(incomming)[2], embsize) 124 | emb = layers.Lambda( 125 | lambda x: K.reshape(x, shape=(-1, shape[-2], embsize)))(emb) 126 | 127 | forward = layer(nunits, 128 | return_state=True, 129 | dropout=indrop, 130 | recurrent_dropout=recdrop)(emb)[-2] 131 | reverse = layer(nunits, 132 | return_state=True, 133 | recurrent_dropout=recdrop, 134 | dropout=indrop, 135 | go_backwards=True)(emb)[-2] 136 | emb = layers.concatenate([forward, reverse], axis=-1) 137 | # shape = (batch size, max sentence length, char hidden size) 138 | embshape = [incomming.shape[0].value or -1, shape[1], 2 * nunits] 139 | return layers.Lambda(lambda x: K.reshape(x, shape=embshape))(emb) 140 | 141 | return charemb 142 | 143 | 144 | if __name__ == '__main__': 145 | raise RuntimeError 146 | -------------------------------------------------------------------------------- /scilk/util/networks/callbacks.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from itertools import starmap 3 | from typing import Sequence, Mapping, Text, Callable, Optional, IO, Any, Iterable 4 | import copy 5 | 6 | import numpy as np 7 | from fn.op import identity 8 | from keras import callbacks 9 | from keras.models import Model 10 | 11 | 12 | class Validator(callbacks.Callback): 13 | modes = ('max', 'min') 14 | 15 | # TODO docs 16 | 17 | def __init__(self, 18 | inputs: Sequence[np.ndarray], 19 | output: np.ndarray, 20 | batchsize: int, 21 | metrics: Mapping[Text, Callable[[np.ndarray, np.ndarray], float]], 22 | transform: Callable[[np.ndarray], np.ndarray]=identity, 23 | monitor: Optional[Text]=None, 24 | mode: Text='max', 25 | prefix: Text=None, 26 | stream: IO=sys.stderr): 27 | """ 28 | :param inputs: 29 | :param output: 30 | :param batchsize: 31 | :param metrics: a mapping between names and functions; the functions 32 | must have the following signature: f(true, predicted) -> float 33 | :param transform: 34 | :param monitor: 35 | :param mode: 36 | :param prefix: 37 | """ 38 | super().__init__() 39 | if mode not in self.modes: 40 | raise ValueError('`mode` must be either "max" or "min"') 41 | if monitor and monitor not in metrics: 42 | raise ValueError('`monitor` is not in metrics') 43 | if monitor and not prefix: 44 | raise ValueError('you must provide a path prefix when monitoring') 45 | self.inputs = inputs 46 | self.output = output 47 | self.epoch = None 48 | self.batchsize = batchsize 49 | self.metrics 
= metrics 50 | self.mode = mode 51 | self.transform = transform 52 | self.monitor = monitor 53 | self.best = float('-inf') if mode == 'max' else float('inf') 54 | self.prefix = prefix 55 | self.stream = stream 56 | 57 | def _estimate_metrics(self): 58 | pred = self.transform(self.model.predict(self.inputs, self.batchsize)) 59 | return {name: f(self.output, pred) for name, f in self.metrics.items()} 60 | 61 | @staticmethod 62 | def _format_score_log(scores: Mapping[Text, float]): 63 | template = '{} - {:.3f}' 64 | return " | ".join(starmap(template.format, scores.items())) 65 | 66 | def _improved(self, score: float): 67 | return score > self.best if self.mode == 'max' else score < self.best 68 | 69 | def on_epoch_end(self, epoch, logs=None): 70 | self.epoch = epoch 71 | scores = self._estimate_metrics() 72 | log = self._format_score_log(scores) 73 | print("\n" + log, file=self.stream) 74 | if self.monitor and self._improved(scores[self.monitor]): 75 | path = '{}-{:02d}-{:.3f}.hdf5'.format(self.prefix, self.epoch, scores[self.monitor]) 76 | print('{} improved from {} to {}; saving weights to {}'.format( 77 | self.monitor, self.best, scores[self.monitor], path), 78 | end='\n\n', file=self.stream) 79 | self.best = scores[self.monitor] 80 | self.model.save_weights(path) 81 | elif self.monitor: 82 | print("{} didn't improve".format(self.monitor), end='\n\n', file=self.stream) 83 | self.stream.flush() 84 | 85 | 86 | class Caller(callbacks.Callback): 87 | 88 | def __init__(self, callables: Mapping[str, Iterable[Callable[[Model], Any]]]): 89 | """ 90 | Call some callables on epoch/batch end/begin. Valid dictionary keys: 91 | - on_batch_begin 92 | - on_batch_end 93 | - on_epoch_begin 94 | - on_epoch_end 95 | """ 96 | super().__init__() 97 | self.callables = {key: list(val) for key, val in callables.items()} 98 | 99 | def call(self, when): 100 | for f in self.callables.get(when, []): 101 | f(self.model) 102 | 103 | def on_batch_begin(self, batch, logs=None): 104 | self.call('on_batch_begin') 105 | 106 | def on_batch_end(self, batch, logs=None): 107 | self.call('on_batch_end') 108 | 109 | def on_epoch_begin(self, epoch, logs=None): 110 | self.call('on_epoch_begin') 111 | 112 | def on_epoch_end(self, epoch, logs=None): 113 | self.call('on_epoch_end') 114 | 115 | 116 | if __name__ == '__main__': 117 | raise RuntimeError 118 | -------------------------------------------------------------------------------- /scilk/util/networks/metrics.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | 4 | """ 5 | 6 | from keras import backend as K 7 | 8 | 9 | def precision(y_true, y_pred): 10 | true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1))) 11 | predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1))) 12 | precision = true_positives / (predicted_positives + K.epsilon()) 13 | return precision 14 | 15 | 16 | def recall(y_true, y_pred): 17 | true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1))) 18 | possible_positives = K.sum(K.round(K.clip(y_true, 0, 1))) 19 | recall = true_positives / (possible_positives + K.epsilon()) 20 | return recall 21 | 22 | 23 | def fbeta_score(y_true, y_pred, beta): 24 | """ 25 | Calculates the F score, the weighted harmonic mean of precision and recall. 26 | 27 | This is useful for multi-label classification, where input samples can be 28 | classified as sets of labels. By only using accuracy (precision) a model 29 | would achieve a perfect score by simply assigning every class to every 30 | input. 
In order to avoid this, a metric should penalize incorrect class
31 |     assignments as well (recall). The F-beta score (ranging from 0.0 to 1.0)
32 |     computes this as a weighted mean of the proportion of correct class
33 |     assignments vs. the proportion of incorrect class assignments.
34 | 
35 |     With beta = 1, this is equivalent to an F-measure. With beta < 1, assigning
36 |     correct classes becomes more important, and with beta > 1 the metric is
37 |     instead weighted towards penalizing incorrect class assignments.
38 |     """
39 |     if beta < 0:
40 |         raise ValueError('The lowest allowed beta is zero (precision only).')
41 | 
42 |     # If there are no true positives, fix the F score at 0 like sklearn.
43 |     if K.sum(K.round(K.clip(y_true, 0, 1))) == 0:
44 |         return 0
45 | 
46 |     p = precision(y_true, y_pred)
47 |     r = recall(y_true, y_pred)
48 |     bb = beta ** 2
49 |     fbeta_score = (1 + bb) * (p * r) / (bb * p + r + K.epsilon())
50 |     return fbeta_score
51 | 
52 | 
53 | def fmeasure(y_true, y_pred):
54 |     """
55 |     Calculates the F-measure, the harmonic mean of precision and recall.
56 |     """
57 |     return fbeta_score(y_true, y_pred, beta=1)
58 | 
59 | 
60 | def recall_softmax(y_true, y_pred):
61 |     labels_true = K.argmax(y_true, axis=-1)
62 |     labels_pred = K.argmax(y_pred, axis=-1)
63 |     positive_true = K.cast(K.equal(labels_true, 1), dtype=K.floatx())
64 |     positive_pred = K.cast(K.equal(labels_pred, 1), dtype=K.floatx())
65 |     true_positives = K.sum(positive_true * positive_pred) + K.epsilon()
66 |     return true_positives / (K.sum(positive_true) + K.epsilon())
67 | 
68 | 
69 | def precision_softmax(y_true, y_pred):
70 |     labels_true = K.argmax(y_true, axis=-1)
71 |     labels_pred = K.argmax(y_pred, axis=-1)
72 |     positive_true = K.cast(K.equal(labels_true, 1), dtype=K.floatx())
73 |     positive_pred = K.cast(K.equal(labels_pred, 1), dtype=K.floatx())
74 |     true_positives = K.sum(positive_true * positive_pred) + K.epsilon()
75 |     return true_positives / (K.sum(positive_pred) + K.epsilon())
76 | 
77 | 
78 | def fmeasure_softmax(y_true, y_pred):
79 |     p = precision_softmax(y_true, y_pred)
80 |     r = recall_softmax(y_true, y_pred)
81 |     return 2 * p * r / (p + r + K.epsilon())
82 | 
83 | 
84 | if __name__ == "__main__":
85 |     raise RuntimeError
86 | 
--------------------------------------------------------------------------------
/scilk/util/networks/wrappers.py:
--------------------------------------------------------------------------------
1 | from keras import layers
2 | from keras.layers import wrappers
3 | from keras import backend as K
4 | import copy
5 | 
6 | 
7 | class HalfStatefulBidirectional(wrappers.Wrapper):
8 |     """
9 |     Unlike the built-in keras.wrappers.Bidirectional, this wrapper only makes
10 |     the forward reading layer stateful if an incoming layer is stateful. The
11 |     backwards reading layer is always stateless, because it makes no sense to
12 |     transfer state between batches evolving forward in time in a reversed
13 |     layer.
14 |     """
15 |     def __init__(self, layer: layers.RNN, merge_mode='concat', weights=None, **kwargs):
16 |         super().__init__(layer, **kwargs)
17 |         if merge_mode not in ['sum', 'mul', 'ave', 'concat', None]:
18 |             raise ValueError('Invalid merge mode. 
' 19 | 'Merge mode should be one of ' 20 | '{"sum", "mul", "ave", "concat", None}') 21 | config = layer.get_config() 22 | forward_conf = {**config, 'go_backwards': False} 23 | backward_conf = {**config, 'go_backwards': True, 'stateful': False} 24 | self.forward_layer = layer.__class__.from_config(forward_conf) 25 | self.backward_layer = layer.__class__.from_config(backward_conf) 26 | self.forward_layer.name = 'forward_' + self.forward_layer.name 27 | self.backward_layer.name = 'backward_' + self.backward_layer.name 28 | self.merge_mode = merge_mode 29 | if weights: 30 | self.forward_layer.initial_weights = weights[:len(weights) // 2] 31 | self.backward_layer.initial_weights = weights[len(weights) // 2:] 32 | self.stateful = layer.stateful 33 | self.return_sequences = layer.return_sequences 34 | self.return_state = layer.return_state 35 | self.supports_masking = True 36 | 37 | def get_weights(self): 38 | return self.forward_layer.get_weights() + self.backward_layer.get_weights() 39 | 40 | def set_weights(self, weights): 41 | self.forward_layer.set_weights(weights[:len(weights) // 2]) 42 | self.backward_layer.set_weights(weights[len(weights) // 2:]) 43 | 44 | def compute_output_shape(self, input_shape): 45 | output_shape = self.forward_layer.compute_output_shape(input_shape) 46 | if self.return_state: 47 | state_shape = output_shape[1:] 48 | output_shape = output_shape[0] 49 | 50 | if self.merge_mode == 'concat': 51 | output_shape = list(output_shape) 52 | output_shape[-1] *= 2 53 | output_shape = tuple(output_shape) 54 | elif self.merge_mode is None: 55 | output_shape = [output_shape, copy.copy(output_shape)] 56 | 57 | if self.return_state: 58 | if self.merge_mode is None: 59 | return output_shape + state_shape + copy.copy(state_shape) 60 | return [output_shape] + state_shape + copy.copy(state_shape) 61 | return output_shape 62 | 63 | def call(self, inputs, training=None, mask=None, initial_state=None): 64 | kwargs = {} 65 | if wrappers.has_arg(self.layer.call, 'training'): 66 | kwargs['training'] = training 67 | if wrappers.has_arg(self.layer.call, 'mask'): 68 | kwargs['mask'] = mask 69 | 70 | if initial_state is not None and wrappers.has_arg(self.layer.call, 'initial_state'): 71 | if not isinstance(initial_state, list): 72 | raise ValueError( 73 | 'When passing `initial_state` to a Bidirectional RNN, the state ' 74 | 'should be a list containing the states of the underlying RNNs. 
' 75 | 'Found: ' + str(initial_state)) 76 | forward_state = initial_state[:len(initial_state) // 2] 77 | y = self.forward_layer.call(inputs, initial_state=forward_state, **kwargs) 78 | y_rev = self.backward_layer.call(inputs, **kwargs) 79 | else: 80 | y = self.forward_layer.call(inputs, **kwargs) 81 | y_rev = self.backward_layer.call(inputs, **kwargs) 82 | 83 | if self.return_state: 84 | states = y[1:] + y_rev[1:] 85 | y = y[0] 86 | y_rev = y_rev[0] 87 | 88 | if self.return_sequences: 89 | y_rev = K.reverse(y_rev, 1) 90 | if self.merge_mode == 'concat': 91 | output = K.concatenate([y, y_rev]) 92 | elif self.merge_mode == 'sum': 93 | output = y + y_rev 94 | elif self.merge_mode == 'ave': 95 | output = (y + y_rev) / 2 96 | elif self.merge_mode == 'mul': 97 | output = y * y_rev 98 | elif self.merge_mode is None: 99 | output = [y, y_rev] 100 | 101 | # Properly set learning phase 102 | if (getattr(y, '_uses_learning_phase', False) or 103 | getattr(y_rev, '_uses_learning_phase', False)): 104 | if self.merge_mode is None: 105 | for out in output: 106 | out._uses_learning_phase = True 107 | else: 108 | output._uses_learning_phase = True 109 | 110 | if self.return_state: 111 | if self.merge_mode is None: 112 | return output + states 113 | return [output] + states 114 | return output 115 | 116 | def reset_states(self): 117 | self.forward_layer.reset_states() 118 | 119 | def build(self, input_shape): 120 | with K.name_scope(self.forward_layer.name): 121 | self.forward_layer.build(input_shape) 122 | with K.name_scope(self.backward_layer.name): 123 | self.backward_layer.build(input_shape) 124 | self.built = True 125 | 126 | def compute_mask(self, inputs, mask): 127 | if self.return_sequences: 128 | if not self.merge_mode: 129 | return [mask, mask] 130 | else: 131 | return mask 132 | else: 133 | return None 134 | 135 | @property 136 | def trainable_weights(self): 137 | if hasattr(self.forward_layer, 'trainable_weights'): 138 | return (self.forward_layer.trainable_weights + 139 | self.backward_layer.trainable_weights) 140 | return [] 141 | 142 | @property 143 | def non_trainable_weights(self): 144 | if hasattr(self.forward_layer, 'non_trainable_weights'): 145 | return (self.forward_layer.non_trainable_weights + 146 | self.backward_layer.non_trainable_weights) 147 | return [] 148 | 149 | @property 150 | def updates(self): 151 | if hasattr(self.forward_layer, 'updates'): 152 | return self.forward_layer.updates + self.backward_layer.updates 153 | return [] 154 | 155 | @property 156 | def losses(self): 157 | if hasattr(self.forward_layer, 'losses'): 158 | return self.forward_layer.losses + self.backward_layer.losses 159 | return [] 160 | 161 | @property 162 | def constraints(self): 163 | constraints = {} 164 | if hasattr(self.forward_layer, 'constraints'): 165 | constraints.update(self.forward_layer.constraints) 166 | constraints.update(self.backward_layer.constraints) 167 | return constraints 168 | 169 | def get_config(self): 170 | return {**super().get_config(), 'merge_mode': self.merge_mode} 171 | 172 | 173 | if __name__ == '__main__': 174 | raise RuntimeError 175 | -------------------------------------------------------------------------------- /scilk/util/patterns.py: -------------------------------------------------------------------------------- 1 | import re 2 | from functools import reduce 3 | from typing import Iterable, Tuple, Pattern, Union, Text, Callable, List 4 | 5 | from scilk.util.intervals import Interval 6 | 7 | numeric = re.compile('[0-9]*\.?[0-9]+') 8 | wordlike = re.compile('\w+') 9 | 
misc = re.compile('[^\s\w]') 10 | 11 | 12 | def ptransform(transformations: Iterable[Tuple[Pattern, Union[Text, Callable]]], 13 | text: Text) -> Text: 14 | """ 15 | Pattern transform. The patterns are applied in iteration order with no 16 | intermediate masking. 17 | :param transformations: pairs of patterns and replacements (refer to 18 | `re.sub`'s documentation for more information on possible replacements); 19 | :param text: text to transform 20 | :return: transformed text 21 | """ 22 | return reduce(lambda s, t: t[0].sub(t[1], s), transformations, text) 23 | 24 | 25 | def ptokenise(patterns: List[Pattern], text: Text, mask=' ') \ 26 | -> List[Interval[Text]]: 27 | """ 28 | Return intervals matched by `patterns`. The patterns are applied 29 | in iteration order. Before applying pattern `i+1`, the function replaces 30 | each region `r` matched by pattern `i` with `mask * len(r)`. This means 31 | the output might be sensitive to pattern order. 32 | :param patterns: a list of patterns to search for 33 | :param text: a unicode string 34 | :param mask: the masking value 35 | :return: a list of intervals storing the corresponding string 36 | """ 37 | def repl(match) -> Text: 38 | return mask * (match.end() - match.start()) 39 | 40 | def match_mask(acc: Tuple[List[Tuple[int, int]], Text], 41 | patt: Pattern) -> Tuple[List[Tuple[int, int]], Text]: 42 | spans, s = acc 43 | spans.extend(m.span() for m in patt.finditer(s)) 44 | return spans, patt.sub(repl, s) 45 | 46 | return [Interval(start, stop, text[start:stop]) for start, stop in 47 | sorted(reduce(match_mask, patterns, ([], text))[0])] 48 | 49 | 50 | if __name__ == '__main__': 51 | raise RuntimeError 52 | -------------------------------------------------------------------------------- /scilk/util/preprocessing.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | 4 | 5 | """ 6 | 7 | 8 | import operator as op 9 | from itertools import chain, repeat, count 10 | from math import ceil 11 | from typing import List, Tuple, Optional, TypeVar, Sequence 12 | 13 | import numpy as np 14 | from fn import F 15 | 16 | 17 | T = TypeVar('T') 18 | 19 | 20 | homogenous = F(map) >> set >> len >> F(op.contains, [0, 1]) 21 | flatmap = F(map) >> chain.from_iterable 22 | strictmap = F(map) >> list 23 | 24 | 25 | def flatzip(flat, nested): 26 | flatrep = map(F(map, repeat), flat) 27 | iterables = (*flatrep, *nested) 28 | return (F(zip) >> F(map, lambda x: zip(*x)) >> chain.from_iterable)(*iterables) 29 | 30 | 31 | def maxshape(arrays: Sequence[np.ndarray]) -> Tuple[int]: 32 | """ 33 | :param arrays: a nonempty sequence of arrays; the sequence must be 34 | homogeneous with respect to dimensionality. 35 | :raises ValueError: if `arrays` sequence is empty; if arrays have different 36 | dimensionality. 37 | """ 38 | if not arrays: 39 | raise ValueError('`arrays` should not be empty') 40 | if not homogenous(np.ndim, arrays): 41 | raise ValueError('`arrays` must have homogeneous dimensionality') 42 | return tuple(np.array([array.shape for array in arrays]).max(axis=0)) 43 | 44 | 45 | def stack(arrays: Sequence[np.ndarray], shape: Optional[Sequence[int]], dtype, 46 | filler=0, trim=False) -> Tuple[np.ndarray, np.ndarray]: 47 | """ 48 | Stack N-dimensional arrays with variable sizes across dimensions. 49 | :param arrays: a nonempty sequence of arrays; the sequence must be 50 | homogeneous with respect to dimensionality. 51 | :param shape: target shape to broadcast each array to. 
The shape must 52 | specify one integer per dimension – the output will thus have shape 53 | `[len(arrays), *shape]`. If None the function will infer the maximal size 54 | per dimension from `arrays`. To infer size for individual dimension(s) 55 | use -1. 56 | :param dtype: output data type 57 | :param filler: a value to fill in the empty space. 58 | :param trim: trim arrays to fit the `shape`. 59 | :raises ValueError: if `len(shape)` doesn't match the dimensionality of 60 | arrays in `arrays`; if an array can't be broadcasted to `shape` without 61 | trimming, while trimming is disabled; + all cases specified in function 62 | `maxshape` 63 | :return: stacked arrays, a boolean mask (empty positions are False). 64 | >>> from random import choice 65 | >>> maxlen = 100 66 | >>> ntests = 10000 67 | >>> lengths = range(10, maxlen+1, 2) 68 | >>> arrays = [ 69 | ... np.random.randint(0, 127, size=choice(lengths)).reshape((2, -1)) 70 | ... for _ in range(ntests) 71 | ... ] 72 | >>> stacked, masks = stack(arrays, [-1, maxlen], np.int) 73 | >>> all((arr.flatten() == s[m].flatten()).all() 74 | ... for arr, s, m in zip(arrays, stacked, masks)) 75 | True 76 | >>> stacked, masks = stack(arrays, [2, -1], np.int) 77 | >>> all((arr.flatten() == s[m].flatten()).all() 78 | ... for arr, s, m in zip(arrays, stacked, masks)) 79 | True 80 | """ 81 | def slices(limits: Tuple[int], array: np.ndarray) -> List[slice]: 82 | stops = [min(limit, size) for limit, size in zip(limits, array.shape)] 83 | return [slice(0, stop) for stop in stops] 84 | 85 | if not isinstance(arrays, Sequence): 86 | raise ValueError('`arrays` must be a Sequence object') 87 | ndim = arrays[0].ndim 88 | if shape is not None and len(shape) != ndim: 89 | raise ValueError("`shape`'s dimensionality doesn't match that of " 90 | "`arrays`") 91 | if shape is not None and any(s < 1 and s != -1 for s in shape): 92 | raise ValueError('the only allowed non-positive value in `shape` is -1') 93 | # infer size across all dimensions 94 | inferred = np.array(maxshape(arrays)) 95 | # mix inferred and requested sizes where requested 96 | limits = (inferred if shape is None else 97 | np.where(np.array(shape) == -1, inferred, shape)) 98 | # make sure everything fits fine 99 | if not (shape is None or trim or (inferred <= limits).all()): 100 | raise ValueError("can't broadcast all arrays to `shape` without " 101 | "trimming") 102 | stacked = np.full([len(arrays), *limits], filler, dtype=dtype) 103 | mask = np.zeros([len(arrays), *limits], dtype=bool) 104 | for i, arr, slices_ in zip(count(), arrays, map(F(slices, limits), arrays)): 105 | op.setitem(stacked, [i, *slices_], op.getitem(arr, slices_)) 106 | op.setitem(mask, [i, *slices_], True) 107 | stacked[~mask] = filler 108 | return stacked, mask 109 | 110 | 111 | def maskfalse(array: np.ndarray, mask: np.ndarray) -> np.ndarray: 112 | """ 113 | Replace False-masked items with zeros. 
114 |     >>> array = np.arange(10)
115 |     >>> mask = np.random.binomial(1, 0.5, len(array)).astype(bool)
116 |     >>> masked = maskfalse(array, mask)
117 |     >>> (masked[mask] == array[mask]).all()
118 |     True
119 |     >>> (masked[~mask] == 0).all()
120 |     True
121 |     """
122 |     if not np.issubdtype(mask.dtype, np.bool):
123 |         raise ValueError("Masks are supposed to be boolean")
124 |     copy = array.copy()
125 |     copy[~mask] = 0
126 |     return copy
127 | 
128 | 
129 | def chunksteps(size: int, array: np.ndarray, filler=0) -> np.ndarray:
130 |     """
131 |     Chunk time steps, that is, break an array into fixed-size slices along the
132 |     second dimension (array.shape[1]).
133 |     :param size: chunk size
134 |     :param array: an array to chunk. The array must have at least two dimensions
135 |     :param filler: a value to fill in the empty space in the last chunk if
136 |     `array.shape[1] % size != 0`
137 |     :return: the chunks stacked along a new leading dimension
138 |     """
139 |     nchunks = int(ceil(array.shape[1] / size))
140 |     chunks = [array[:, start:start+size] for start in range(0, size*nchunks, size)]
141 |     assert chunks[-1].shape[1] <= size
142 |     if chunks[-1].shape[1] < size:
143 |         chunk = np.full((array.shape[0], size, *array.shape[2:]), filler,
144 |                         dtype=array.dtype)
145 |         chunk[:, :chunks[-1].shape[1]] = chunks[-1]
146 |         chunks[-1] = chunk
147 |     return np.array(chunks)
148 | 
149 | 
150 | reverse = op.itemgetter(slice(None, None, -1))  # reverse a Sequence or an array
151 | 
152 | 
153 | if __name__ == '__main__':
154 |     raise RuntimeError
155 | 
--------------------------------------------------------------------------------
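The padding helpers above are typically chained when batching variable-length sequences. Below is a minimal sketch of how stack, maskfalse and chunksteps compose; the toy token ids and sizes are invented for illustration.

import numpy as np
from scilk.util.preprocessing import stack, maskfalse, chunksteps

# three "sentences" of different lengths, encoded as integer token ids
sentences = [np.array([1, 2, 3]), np.array([4, 5]), np.array([6, 7, 8, 9])]

# pad to the longest sentence (-1 lets stack infer the size) and keep a boolean mask
padded, mask = stack(sentences, [-1], np.int64, filler=0)   # padded.shape == (3, 4)

# zero out whatever falls outside the mask (a no-op here, since filler=0)
cleaned = maskfalse(padded, mask)

# split the batch into windows of two time steps along axis 1
windows = chunksteps(2, cleaned)                            # windows.shape == (2, 3, 2)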
/scilk/util/segments.py:
--------------------------------------------------------------------------------
1 | from typing import Sequence, List, Iterable, Tuple
2 | from itertools import dropwhile
3 | from functools import reduce
4 | import operator as op
5 | 
6 | from fn import F
7 | from fn.iters import splitby, droplast
8 | import numpy as np
9 | 
10 | from .intervals import Interval
11 | 
12 | 
13 | def breakpoints(intervals: Iterable[Interval]) -> List[int]:
14 |     """
15 |     Find breakpoints between intervals.
16 |     :param intervals:
17 |     :return:
18 |     """
19 |     return [iv.stop - 1 for iv in intervals]
20 | 
21 | 
22 | def stitchpoints(intervals: Sequence[Interval], targets: Sequence[Interval]):
23 |     """
24 |     Find breakpoints that have to be stitched in order to recover target
25 |     intervals from finer subintervals. For a set of intervals [iv_1, ..., iv_n]
26 |     that must be stitched to obtain a target t1, the function returns
27 |     [(iv_1).stop-1, ..., (iv_n-1).stop-1]. The function groups all intervals
28 |     intersecting a target together and merges them. Note that an ideal
29 |     reconstruction might not be achievable. In that case it is only guaranteed
30 |     that each merged interval will contain the entire span of the corresponding
31 |     target, but not the other way around.
32 |     :param intervals:
33 |     :param targets:
34 |     :return:
35 |     """
36 |     intervals_ = sorted(intervals, key=lambda iv: iv.start)
37 |     stitched_ = sorted(targets, key=lambda iv: iv.start)
38 |     inbreaks = F(droplast, 1) >> breakpoints
39 | 
40 |     def grouper(acc: Tuple[List[int], Iterable[Interval]], iv: Interval):
41 |         # find breakpoints to stitch
42 |         breaks, ivs = acc
43 |         grouped, remainder = splitby(
44 |             iv.intersects, dropwhile(lambda x: x.stop <= iv.start, ivs)
45 |         )
46 |         return breaks.extend(inbreaks(grouped)) or breaks, list(remainder)
47 | 
48 |     return reduce(grouper, stitched_, ([], intervals_))[0]
49 | 
50 | 
51 | def stitch(intervals: Sequence[Interval], points: Sequence[int]) \
52 |         -> List[Interval]:
53 |     """
54 |     Stitch intervals. If any point in `points` falls into the interval at
55 |     position i, that interval will be stitched to the interval at position i+1.
56 |     :param intervals:
57 |     :param points:
58 |     :return:
59 |     """
60 |     # extract annotations
61 |     ivs = sorted(intervals, key=lambda iv: iv.start)
62 |     length = max(iv.stop for iv in ivs)
63 |     annotations = np.zeros(length, dtype=np.int32)
64 |     annotations[points] = 1
65 |     iv_anno = [annotations[iv.start:iv.stop].any() for iv in ivs]
66 | 
67 |     # group intervals to stitch
68 |     def group(acc: Tuple[List[List[Interval]], bool],
69 |               step: Tuple[Interval, bool]) \
70 |             -> Tuple[List[List[Interval]], bool]:
71 |         groups, takethis = acc
72 |         iv, takenext = step
73 |         if takethis:
74 |             groups[-1].append(iv)
75 |         else:
76 |             groups.append([iv])
77 |         return groups, takenext
78 | 
79 |     grouped = reduce(group, zip(ivs, iv_anno), ([], False))[0]
80 |     # stitch intervals
81 |     return [reduce(op.and_, group) for group in grouped]
82 | 
83 | 
84 | if __name__ == '__main__':
85 |     raise RuntimeError
86 | 
--------------------------------------------------------------------------------
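A quick sketch of how the stitching helpers above fit together: given fine token intervals and a coarser target annotation over the same text, stitchpoints finds the positions that must be glued back and stitch merges the tokens. The offsets below are invented, and the sketch assumes scilk.util.intervals.Interval exposes the intersects/& behaviour that segments.py relies on.

from scilk.util.intervals import Interval
from scilk.util.segments import stitchpoints, stitch

# fine-grained token intervals over the string "New York City"
tokens = [Interval(0, 3, 'New'), Interval(4, 8, 'York'), Interval(9, 13, 'City')]
# one coarser target annotation spanning the whole name
targets = [Interval(0, 13, 'New York City')]

points = stitchpoints(tokens, targets)   # positions to glue, e.g. [2, 7] here
merged = stitch(tokens, points)          # a single interval spanning 0..13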
/setup.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from setuptools import setup, find_packages
3 | 
4 | 
5 | # TODO add loggers and warnings
6 | # TODO lazy module importing (https://github.com/bwesterb/py-demandimport)
7 | 
8 | if sys.version_info < (3, 5, 2):
9 |     print("SciLK requires Python >= 3.5.2")
10 |     sys.exit(1)
11 | 
12 | # from Cython.Build import cythonize
13 | #
14 | # os.environ['CFLAGS'] = '-O3 -Wall'
15 | 
16 | setup(
17 |     name="scilk",
18 |     version="0.1a1",
19 |     packages=find_packages("./"),
20 |     scripts=[],
21 |     install_requires=["numpy",
22 |                       "h5py",
23 |                       "fn",
24 |                       "pyrsistent",
25 |                       "keras",
26 |                       "scikit-learn",
27 |                       "pandas",
28 |                       "hypothesis",
29 |                       "frozendict",
30 |                       "tensorflow", "multipledispatch"]
31 | )
32 | 
--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | from typing import Sequence, Iterable, cast, Mapping
3 | import tempfile
4 | import os
5 | 
6 | import numpy as np
7 | import joblib
8 | from hypothesis import given, note
9 | from hypothesis import settings, strategies as st
10 | 
11 | from scilk.corpora import genia
12 | from scilk.util import intervals
13 | from scilk.collections import _collections
14 | import scilk
15 | 
16 | MAX_TESTS = 1000
17 | 
18 | 
19 | # strategies
20 | 
21 | texts = st.text(st.characters(min_codepoint=32, max_codepoint=255), 0, 500, 1000)
22 | 
23 | 
24 | def loader_caller(collection: _collections.Collection, data=None):
25 | 
26 |     def caller(value: str):
27 |         return collection.translate(value)
28 | 
29 |     return caller
30 | 
31 | 
32 | def loader_translate(collection: _collections.Collection, data: dict):
33 |     mapping = joblib.load(data['mapping'])
34 | 
35 |     def translate(value: str):
36 |         return mapping.get(value)
37 | 
38 |     return translate
39 | 
40 | 
41 | # test cases
42 | 
43 | class TestText(unittest.TestCase):
44 | 
45 |     @staticmethod
46 |     def unparse(txt, intervals_: Sequence[intervals.Interval]):
47 |         if not len(intervals_):
48 |             return ""
49 |         codes = np.repeat([ord(" ")], intervals_[-1].stop)
50 |         for iv in intervals_:
51 |             token = intervals.extract(txt, [iv])[0]
52 |             codes[iv.start:iv.stop] = list(map(ord, token))
53 |         return "".join(map(chr, codes))
54 | 
55 |     # @given(texts)
56 |     # @settings(max_examples=MAX_TESTS)
57 |     # def test_parse_text(self, txt):
58 |     #     parsed = text.tointervals(text.fine_tokeniser, txt)
59 |     #     mod_text = re.sub("\s", " ", txt)
60 |     #     self.assertEqual(self.unparse(txt, parsed), mod_text.rstrip())
61 | 
62 | 
63 | class TestGenia(unittest.TestCase):
64 | 
65 |     @given(st.lists(st.text()))
66 |     @settings(max_examples=MAX_TESTS)
67 |     def test_text_boundaries(self, texts: list):
68 |         """
69 |         Test of the _segment_borders() function.
70 |         :return:
71 |         """
72 |         boundaries = genia._segment_borders(texts)
73 |         note(boundaries)
74 | 
75 |         self.assertTrue(all([boundaries[i][1] == boundaries[i + 1][0] for i in
76 |                              range(len(boundaries) - 1)]))
77 |         self.assertTrue(all([boundaries[i][0] <= boundaries[i][1] for i in
78 |                              range(len(boundaries) - 1)]))
79 |         if boundaries:
80 |             self.assertTrue(boundaries[0][0] == 0)
81 | 
82 | 
83 | class TestCollection(unittest.TestCase):
84 |     def test_collection(self):
85 |         with tempfile.TemporaryDirectory() as dirpath:
86 |             scilk.SCILK_ROOT = dirpath
87 |             mapping = dict(test='OK')
88 |             mapping_path = os.path.join(dirpath, 'mapping.joblib')
89 |             joblib.dump(mapping, mapping_path)
90 |             collection = _collections.Collection()
91 |             collection.add('translate', loader_translate, dict(mapping=mapping_path))
92 |             collection.add('caller', loader_caller)
93 |             self.assertEqual(collection.caller('test'), 'OK')
94 |             collection.save(name='test')
95 |             collection = _collections.Collection.load('test')
96 |             self.assertEqual(collection.caller('test'), 'OK')
97 |             self.assertEqual({'translate', 'caller'}, set(collection.entries))
98 | 
99 | 
100 | if __name__ == '__main__':
101 |     unittest.main()
102 | 
--------------------------------------------------------------------------------
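Putting a few of the pieces above together: the Validator callback from scilk/util/networks/callbacks.py plugs into an ordinary Keras training run and checkpoints weights whenever the monitored validation score improves. This is only a sketch: the validation shapes, the sklearn-based F1 metric and the checkpoint prefix are invented, and model stands for whatever Keras model is being trained.

import numpy as np
from sklearn.metrics import f1_score

from scilk.util.networks.callbacks import Validator

# toy validation split for a binary token tagger (shapes are invented)
x_val = np.random.randint(0, 100, size=(32, 50))
y_val = np.random.randint(0, 2, size=(32, 50, 1))

validator = Validator(
    inputs=[x_val],
    output=y_val,
    batchsize=8,
    metrics={'f1': lambda true, pred: f1_score(true.ravel(), pred.ravel().round())},
    monitor='f1',                    # save weights whenever validation F1 improves
    mode='max',
    prefix='checkpoints/tagger')     # -> e.g. checkpoints/tagger-03-0.812.hdf5

# model.fit(x_train, y_train, epochs=10, callbacks=[validator])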