├── downcast
├── __init__.py
├── db
│ ├── __init__.py
│ ├── bcp
│ │ ├── __init__.py
│ │ ├── util.py
│ │ ├── types.py
│ │ └── cursor.py
│ ├── exceptions.py
│ ├── query.py
│ └── dwcbcp.py
├── output
│ ├── __init__.py
│ ├── mapping.py
│ ├── process.py
│ ├── patients.py
│ ├── log.py
│ ├── files.py
│ ├── enums.py
│ ├── numerics.py
│ ├── alerts.py
│ └── timemap.py
├── util.py
├── timeconv.py
├── bcpmerge.py
├── timestamp.py
├── attributes.py
├── main.py
├── server.py
├── shell.py
├── subprocess.py
└── messages.py
├── .gitattributes
├── .gitignore
├── server.conf.example
├── downcast.py
├── dwcsql.py
├── downcast-bcpdstfix.py
├── downcast-bcpmerge.py
├── dwctimeconv.py
├── test-extractor
├── test-archive
├── README
├── test-wave-message
├── test-dispatcher
├── INTERNALS
├── test-parsers
└── bcp-scripts
└── bulk-verify
/downcast/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/downcast/db/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/downcast/output/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | *.py diff=python
2 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *~
2 | *#*
3 | *.orig
4 | *.rej
5 | *.patch
6 |
7 | *.pyc
8 | *.conf
9 |
--------------------------------------------------------------------------------
/server.conf.example:
--------------------------------------------------------------------------------
1 | # Example server.conf file - edit as needed.
2 |
3 | # Read input from a running SQL Server instance
4 | # (requires password authentication)
5 | [example-live]
6 | hostname = 192.168.123.45
7 | username = somebody
8 | password = 12341234
9 | database = Philips.PatientData
10 |
11 | # Read input from a single-day BCP data dump
12 | [example-bcp]
13 | type = bcp
14 | bcp-path = /data/dwc/2001-05-01
15 |
16 | # Read input from a multi-day BCP data dump
17 | # (each day should be stored in a separate directory, listed in order)
18 | [example-bcp-multiple]
19 | type = bcp
20 | bcp-path = /data/dwc/2001-05-01:/data/dwc/2001-05-02:/data/dwc/2001-05-03
21 |
--------------------------------------------------------------------------------
/downcast.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python3
2 | #
3 | # downcast - tools for unpacking patient data from DWC
4 | #
5 | # Copyright (c) 2018 Laboratory for Computational Physiology
6 | #
7 | # This program is free software: you can redistribute it and/or modify
8 | # it under the terms of the GNU General Public License as published by
9 | # the Free Software Foundation, either version 3 of the License, or
10 | # (at your option) any later version.
11 | #
12 | # This program is distributed in the hope that it will be useful,
13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 | # GNU General Public License for more details.
16 | #
17 | # You should have received a copy of the GNU General Public License
18 | # along with this program.  If not, see <https://www.gnu.org/licenses/>.
19 |
from downcast.main import main

# Run only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    main()
22 |
--------------------------------------------------------------------------------
/dwcsql.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python3
2 | #
3 | # dwcsql - simple interactive frontend for the DWC SQL database
4 | #
5 | # Copyright (c) 2018 Laboratory for Computational Physiology
6 | #
7 | # This program is free software: you can redistribute it and/or modify
8 | # it under the terms of the GNU General Public License as published by
9 | # the Free Software Foundation, either version 3 of the License, or
10 | # (at your option) any later version.
11 | #
12 | # This program is distributed in the hope that it will be useful,
13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 | # GNU General Public License for more details.
16 | #
17 | # You should have received a copy of the GNU General Public License
18 | # along with this program.  If not, see <https://www.gnu.org/licenses/>.
19 |
from downcast.shell import main

# Run only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    main()
22 |
--------------------------------------------------------------------------------
/downcast-bcpdstfix.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python3
2 | #
3 | # downcast - tools for unpacking patient data from DWC
4 | #
5 | # Copyright (c) 2021 Laboratory for Computational Physiology
6 | #
7 | # This program is free software: you can redistribute it and/or modify
8 | # it under the terms of the GNU General Public License as published by
9 | # the Free Software Foundation, either version 3 of the License, or
10 | # (at your option) any later version.
11 | #
12 | # This program is distributed in the hope that it will be useful,
13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 | # GNU General Public License for more details.
16 | #
17 | # You should have received a copy of the GNU General Public License
18 | # along with this program.  If not, see <https://www.gnu.org/licenses/>.
19 |
# NOTE(review): downcast/bcpdstfix.py does not appear in the repository
# listing -- confirm that this module actually exists.
from downcast.bcpdstfix import main

# Run only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    main()
22 |
--------------------------------------------------------------------------------
/downcast-bcpmerge.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python3
2 | #
3 | # downcast - tools for unpacking patient data from DWC
4 | #
5 | # Copyright (c) 2021 Laboratory for Computational Physiology
6 | #
7 | # This program is free software: you can redistribute it and/or modify
8 | # it under the terms of the GNU General Public License as published by
9 | # the Free Software Foundation, either version 3 of the License, or
10 | # (at your option) any later version.
11 | #
12 | # This program is distributed in the hope that it will be useful,
13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 | # GNU General Public License for more details.
16 | #
17 | # You should have received a copy of the GNU General Public License
18 | # along with this program.  If not, see <https://www.gnu.org/licenses/>.
19 |
from downcast.bcpmerge import main

# Run only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    main()
22 |
--------------------------------------------------------------------------------
/dwctimeconv.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python3
2 | #
3 | # dwctimeconv - convert between time formats in a converted record
4 | #
5 | # Copyright (c) 2020 Laboratory for Computational Physiology
6 | #
7 | # This program is free software: you can redistribute it and/or modify
8 | # it under the terms of the GNU General Public License as published by
9 | # the Free Software Foundation, either version 3 of the License, or
10 | # (at your option) any later version.
11 | #
12 | # This program is distributed in the hope that it will be useful,
13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 | # GNU General Public License for more details.
16 | #
17 | # You should have received a copy of the GNU General Public License
18 | # along with this program.  If not, see <https://www.gnu.org/licenses/>.
19 |
from downcast.timeconv import main

# Run only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    main()
22 |
--------------------------------------------------------------------------------
/downcast/db/bcp/__init__.py:
--------------------------------------------------------------------------------
1 | #
2 | # downcast - tools for unpacking patient data from DWC
3 | #
4 | # Copyright (c) 2018 Laboratory for Computational Physiology
5 | #
6 | # This program is free software: you can redistribute it and/or modify
7 | # it under the terms of the GNU General Public License as published by
8 | # the Free Software Foundation, either version 3 of the License, or
9 | # (at your option) any later version.
10 | #
11 | # This program is distributed in the hope that it will be useful,
12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | # GNU General Public License for more details.
15 | #
16 | # You should have received a copy of the GNU General Public License
17 | # along with this program.  If not, see <https://www.gnu.org/licenses/>.
18 |
19 | import os
20 | import re
21 | import bisect
22 | import struct
23 |
24 | from ..exceptions import *
25 | from .types import *
26 | from .connection import BCPConnection
27 |
# Module-level attributes required by the DB-API 2.0 specification (PEP 249).
apilevel = '2.0'
paramstyle = 'qmark'
threadsafety = 1

def connect():
    """Connect to a database consisting of a set of bcp-format files."""
    return BCPConnection()
35 |
--------------------------------------------------------------------------------
/downcast/output/mapping.py:
--------------------------------------------------------------------------------
1 | #
2 | # downcast - tools for unpacking patient data from DWC
3 | #
4 | # Copyright (c) 2018 Laboratory for Computational Physiology
5 | #
6 | # This program is free software: you can redistribute it and/or modify
7 | # it under the terms of the GNU General Public License as published by
8 | # the Free Software Foundation, either version 3 of the License, or
9 | # (at your option) any later version.
10 | #
11 | # This program is distributed in the hope that it will be useful,
12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | # GNU General Public License for more details.
15 | #
16 | # You should have received a copy of the GNU General Public License
17 | # along with this program.  If not, see <https://www.gnu.org/licenses/>.
18 |
19 | from ..messages import PatientMappingMessage
20 |
class PatientMappingHandler:
    """Handler that records patient mapping messages into an archive."""

    def __init__(self, archive):
        # Archive that mapping information is associated with.
        self.archive = archive

    def send_message(self, chn, msg, source, ttl):
        """Handle a PatientMappingMessage; all other types are ignored."""
        if isinstance(msg, PatientMappingMessage):
            # Hold the message while recording the mapping, then confirm.
            source.nack_message(chn, msg, self)
            msg.origin.set_patient_id(msg.mapping_id, msg.patient_id)
            source.ack_message(chn, msg, self)

    def flush(self):
        """No-op: nothing is buffered by this handler."""
        pass
35 |
--------------------------------------------------------------------------------
/downcast/output/process.py:
--------------------------------------------------------------------------------
1 | #
2 | # downcast - tools for unpacking patient data from DWC
3 | #
4 | # Copyright (c) 2018 Laboratory for Computational Physiology
5 | #
6 | # This program is free software: you can redistribute it and/or modify
7 | # it under the terms of the GNU General Public License as published by
8 | # the Free Software Foundation, either version 3 of the License, or
9 | # (at your option) any later version.
10 | #
11 | # This program is distributed in the hope that it will be useful,
12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | # GNU General Public License for more details.
15 | #
16 | # You should have received a copy of the GNU General Public License
17 | # along with this program.  If not, see <https://www.gnu.org/licenses/>.
18 |
19 | import os
20 | import sys
21 | import cProfile
22 | from multiprocessing import Process
23 |
24 | from ..util import setproctitle
25 |
class WorkerProcess(Process):
    """Process subclass that sets a descriptive title and can profile itself."""

    def run(self):
        name = self.name
        if name is not None:
            setproctitle('downcast:%s' % (name,))

        # If DOWNCAST_PROFILE_OUT is set, profile the target function and
        # write the stats to a per-process file named after this worker.
        profile_base = os.environ.get('DOWNCAST_PROFILE_OUT')
        if profile_base is None or name is None:
            Process.run(self)
        else:
            outfile = '%s.%s' % (profile_base, name)
            cProfile.runctx('Process.run(self)', globals(), locals(), outfile)
39 |
--------------------------------------------------------------------------------
/test-extractor:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python3
2 |
3 | from downcast.server import DWCDB
4 | from downcast.extractor import (Extractor, WaveSampleQueue, NumericValueQueue,
5 | EnumerationValueQueue, AlertQueue,
6 | PatientMappingQueue, PatientBasicInfoQueue,
7 | PatientDateAttributeQueue,
8 | PatientStringAttributeQueue, BedTagQueue)
9 | from downcast.timestamp import T
10 |
class TestHandler:
    """Prints a one-line summary for each message, then acknowledges it."""

    def send_message(self, channel, message, source, ttl):
        summary = '%s\t%s\t%s' % (message.timestamp, channel,
                                  type(message).__name__)
        print(summary)
        source.ack_message(channel, message, self)
16 |
DWCDB.load_config('server.conf')
db = DWCDB('demo')
ex = Extractor(db, '/tmp/downcast-extractor-test', fatal_exceptions = True)

ex.add_handler(TestHandler())

# All queues share the same start time and batch size.
st = None
queue_specs = [
    (WaveSampleQueue, 'waves'),
    (NumericValueQueue, 'numerics'),
    (EnumerationValueQueue, 'enums'),
    (AlertQueue, 'alerts'),
    (PatientMappingQueue, 'mapping'),
    (PatientBasicInfoQueue, 'patients'),
    (PatientStringAttributeQueue, 'strings'),
    (PatientDateAttributeQueue, 'dates'),
    (BedTagQueue, 'beds'),
]
for queue_class, queue_name in queue_specs:
    ex.add_queue(queue_class(queue_name, start_time = st,
                             messages_per_batch = 10))

for _ in range(100):
    ex.run()
45 |
--------------------------------------------------------------------------------
/downcast/db/exceptions.py:
--------------------------------------------------------------------------------
1 | #
2 | # downcast - tools for unpacking patient data from DWC
3 | #
4 | # Copyright (c) 2018 Laboratory for Computational Physiology
5 | #
6 | # This program is free software: you can redistribute it and/or modify
7 | # it under the terms of the GNU General Public License as published by
8 | # the Free Software Foundation, either version 3 of the License, or
9 | # (at your option) any later version.
10 | #
11 | # This program is distributed in the hope that it will be useful,
12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | # GNU General Public License for more details.
15 | #
16 | # You should have received a copy of the GNU General Public License
17 | # along with this program.  If not, see <https://www.gnu.org/licenses/>.
18 |
# Exception hierarchy following the DB-API 2.0 layout (PEP 249).

class Error(Exception):
    """Base exception type for database errors."""

class InterfaceError(Error):
    """Base exception type relating to the database interface."""

class DatabaseError(Error):
    """Base exception type relating to the database."""

class OperationalError(DatabaseError):
    """Exception caused by an error in database operation."""

class DataSyntaxError(OperationalError):
    """Exception caused by a malformed entry in the data file."""

class ProgrammingError(DatabaseError):
    """Exception caused by errors in the query syntax."""

class ParameterCountError(ProgrammingError):
    """Exception caused by supplying the wrong number of query parameters.

    The optional context argument identifies where the mismatch occurred.
    """
    def __init__(self, message, context = None):
        super().__init__(message)
        self.context = context

class DataError(DatabaseError):
    """Exception caused by an error in processed data."""

class IntegrityError(DatabaseError):
    """Exception caused by an error in database integrity."""

class InternalError(DatabaseError):
    """Exception caused by an internal database error."""

class NotSupportedError(DatabaseError):
    """Exception caused by an unsupported operation."""
64 |
--------------------------------------------------------------------------------
/test-archive:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python3
2 |
3 | import shutil
4 | from datetime import timedelta
5 |
6 | from downcast.server import DWCDB
7 | from downcast.extractor import (Extractor, WaveSampleQueue, NumericValueQueue,
8 | EnumerationValueQueue, AlertQueue,
9 | PatientMappingQueue, PatientBasicInfoQueue,
10 | PatientStringAttributeQueue,
11 | PatientDateAttributeQueue)
12 | from downcast.timestamp import T
13 | from downcast.output.archive import Archive
14 | from downcast.output.numerics import NumericValueHandler
15 | from downcast.output.waveforms import WaveSampleHandler
16 | from downcast.output.mapping import PatientMappingHandler
17 | from downcast.output.patients import PatientHandler
18 |
19 | DWCDB.load_config('server.conf')
20 |
def test(dest_dir, iterations):
    """Run a short extraction into dest_dir, resuming any saved state there."""
    db = DWCDB('demo')
    ex = Extractor(db, dest_dir, fatal_exceptions = True, debug = True)
    archive = Archive(dest_dir)
    # Attach one handler of each type, all writing to the same archive.
    for handler_class in (NumericValueHandler, PatientMappingHandler,
                          PatientHandler, WaveSampleHandler):
        ex.add_handler(handler_class(archive))

    pmq = PatientMappingQueue('mapping')
    pmdelay = timedelta(minutes = 30)
    ex.add_queue(pmq)

    # ex.add_queue(PatientBasicInfoQueue('patients'))
    # ex.add_queue(PatientStringAttributeQueue('strings'))
    # ex.add_queue(PatientDateAttributeQueue('dates'))

    st = T('2016-01-28 14:00:00.000 -05:00')
    ex.add_queue(NumericValueQueue('numerics', start_time = st,
                                   messages_per_batch = 100))
    ex.add_queue(WaveSampleQueue('waves', start_time = st,
                                 messages_per_batch = 100))
    for _ in range(iterations):
        ex.run()
    ex.flush()
50 |
# Several short runs in a row: later runs should resume from the state
# saved in the output directory by earlier ones.
shutil.rmtree('/tmp/downcast-extractor-test', ignore_errors = True)
for _ in range(4):
    test('/tmp/downcast-extractor-test', 5)

# One long run over the same span, into a fresh directory.
shutil.rmtree('/tmp/downcast-extractor-test2', ignore_errors = True)
test('/tmp/downcast-extractor-test2', 20)
59 |
--------------------------------------------------------------------------------
/downcast/util.py:
--------------------------------------------------------------------------------
1 | #
2 | # downcast - tools for unpacking patient data from DWC
3 | #
4 | # Copyright (c) 2018 Laboratory for Computational Physiology
5 | #
6 | # This program is free software: you can redistribute it and/or modify
7 | # it under the terms of the GNU General Public License as published by
8 | # the Free Software Foundation, either version 3 of the License, or
9 | # (at your option) any later version.
10 | #
11 | # This program is distributed in the hope that it will be useful,
12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | # GNU General Public License for more details.
15 | #
16 | # You should have received a copy of the GNU General Public License
17 | # along with this program.  If not, see <https://www.gnu.org/licenses/>.
18 |
19 | import os
20 | import tempfile
21 |
# setproctitle is an optional third-party package; if it is not
# installed, fall back to a no-op so callers need not care.
try:
    from setproctitle import setproctitle
except ImportError:
    def setproctitle(title):
        pass
28 |
# Choose an implementation of fdatasync (flush a file descriptor's data
# to disk).  Probe os.fdatasync on a scratch file once at import time;
# if it is missing or unusable on this OS, fall back to os.fsync, which
# does strictly more work.
with tempfile.TemporaryFile() as _probe:
    try:
        os.fdatasync(_probe.fileno())
    except Exception:
        fdatasync = os.fsync
    else:
        fdatasync = os.fdatasync
38 |
# Translation table: control characters (and DEL) become spaces, and a
# handful of common non-ASCII symbols become ASCII approximations.
_substitutions = dict.fromkeys(range(32), ' ')
_substitutions[127] = ' '
_substitutions.update({
    '\N{HEAVY ASTERISK}': '*',                  # ✱
    '\N{MICRO SIGN}': 'u',                      # µ
    '\N{DEGREE SIGN}': 'deg',                   # °
    '\N{SUBSCRIPT TWO}': '2',                   # ₂
    '\N{SUPERSCRIPT TWO}': '^2',                # ²
    '\N{GREEK CAPITAL LETTER DELTA}': 'Delta',  # Δ
})
_ascii_substitutions = str.maketrans(_substitutions)

def string_to_ascii(string):
    """
    Convert various characters to approximate ASCII equivalents.

    >>> string_to_ascii('✱✱✱ VTach')
    '*** VTach'
    >>> string_to_ascii('µV')
    'uV'
    >>> string_to_ascii('°C')
    'degC'
    >>> string_to_ascii('SpO₂')
    'SpO2'
    >>> string_to_ascii('ml/m²')
    'ml/m^2'
    >>> string_to_ascii('ΔTemp')
    'DeltaTemp'
    """
    return string.translate(_ascii_substitutions)
69 |
--------------------------------------------------------------------------------
/README:
--------------------------------------------------------------------------------
1 | Downcast
2 | --------
3 |
4 | This repository contains tools for processing and converting data from
5 | the DWC system into WFDB and other open formats.
6 |
7 |
8 | Requirements
9 | ------------
10 |
11 | Python 3.4 or later is required. A Unix-like platform is required -
12 | Debian and CentOS have been tested; Mac OS might work as well. This
13 | package will not work on Windows.
14 |
15 | For processing data in BCP format, the ply package is required.
16 |
17 | For processing data directly from SQL Server, the pymssql package is
18 | required. (This package is now mostly abandoned and should probably
19 | be replaced with a different backend.)
20 |
21 |
22 | Quick start
23 | -----------
24 |
25 | If you have access to the demo DWC database, download and unpack these
26 | files (about 30 GB uncompressed).  You will then need to create a
27 | "server.conf" file, which should look like this:
28 |
29 | [demo]
30 | type = bcp
31 | bcp-path = /home/user/dwc-demo
32 |
33 | (where /home/user/dwc-demo is the directory containing "Alert.dat",
34 | "Alert.fmt", etc.) See server.conf.example for other examples.
35 |
36 | The demo database spans the time period from 1:00 AM EDT on October
37 | 31, 2004, to midnight EST on November 1. To parse and convert a slice
38 | of the data (say, from 10:00 to 10:05 AM), first we initialize an
39 | output directory and set the starting time:
40 |
41 | $ ./downcast.py --init --server demo \
42 | --output-dir /home/user/dwc-test-output \
43 | --start "2004-10-31 10:00:00.000 -05:00"
44 |
45 | Then run a batch conversion while specifying the end time:
46 |
47 | $ ./downcast.py --batch --server demo \
48 | --output-dir /home/user/dwc-test-output \
49 | --end "2004-10-31 10:05:00.000 -05:00"
50 |
51 | If we wanted to keep going, we could run the same --batch command
52 | again, increasing the end timestamp each time. We don't need to
53 | specify the starting timestamp for --batch, since the "current"
54 | timestamp is saved automatically.
55 |
56 | To "finalize" the output (and forcibly truncate all patient records at
57 | the specified end time), we use the --terminate option. This wouldn't
58 | be done for a real database conversion, but it's useful for a simple
59 | test:
60 |
61 | $ ./downcast.py --batch --server demo \
62 | --output-dir /home/user/dwc-test-output \
63 | --end "2004-10-31 10:05:00.000 -05:00" \
64 | --terminate
65 |
66 | This should result in a bunch of patient records in WFDB format,
67 | stored in /home/user/dwc-test-output.
68 |
--------------------------------------------------------------------------------
/test-wave-message:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python3
2 |
3 | from collections import OrderedDict
4 | import pymssql
5 | import sys
6 |
7 | from downcast.dispatcher import Dispatcher
8 | from downcast.server import DWCDB
9 | from downcast.parser import WaveSampleParser
10 | from downcast.messages import WaveSampleMessage
11 |
class TestHandler:
    """Groups wave sample messages by sequence number and prints, for each
    sequence number, the set of wave IDs observed at that time.

    Messages for the current sequence number are held (nacked) until a
    message with a later sequence number arrives, at which point the
    batch is flushed and replayed for acknowledgement.
    """

    def __init__(self):
        # Sequence number whose batch was most recently flushed.
        self.prev_sequence_number = None
        # Sequence number currently being accumulated.
        self.cur_sequence_number = None
        # Wave IDs seen at cur_sequence_number (used as an ordered set).
        self.cur_wave_ids = OrderedDict()

    def send_message(self, chn, msg, source, ttl):
        if isinstance(msg, WaveSampleMessage):
            # Hold the message; it is acked below only once its batch
            # is complete.
            source.nack_message(chn, msg, self)

            # Deliberate crash to exercise the dispatcher's error path.
            if msg.wave_id == 27 and msg.sequence_number == 507278718464:
                raise Exception('we crash now')

            if self.cur_sequence_number is None:
                self.cur_sequence_number = msg.sequence_number

            if msg.sequence_number == self.cur_sequence_number:
                self.cur_wave_ids[msg.wave_id] = 1

            # A later sequence number (or an expiring message) closes the
            # current batch: request replay so held messages come back
            # and can be acked against prev_sequence_number.
            if msg.sequence_number > self.cur_sequence_number or ttl == 0:
                source.nack_message(chn, msg, self, replay = True)
                sys.stdout.write('\nT=%d: ' % self.cur_sequence_number)
                self.prev_sequence_number = self.cur_sequence_number
                self.cur_sequence_number = msg.sequence_number
                self.cur_wave_ids.clear()

            if msg.sequence_number == self.prev_sequence_number:
                sys.stdout.write(' [%d]' % msg.wave_id)
                source.ack_message(chn, msg, self)
            elif msg.sequence_number < self.cur_sequence_number:
                # NOTE(review): presumably indicates a dispatcher ordering
                # bug when printed -- confirm against dispatcher semantics.
                print('*** message out of sequence (%d < %d)'
                      % (msg.sequence_number, self.cur_sequence_number))

    def flush(self):
        return
47 |
class TestDeadLetterHandler:
    """Reports wave messages that were never acknowledged by any handler."""

    def send_message(self, chn, msg, source, ttl):
        print('*** Dead letter (T=%d, W=%d)' % (msg.sequence_number, msg.wave_id))
51 |
class TestGenerator:
    """Feeds parsed messages from the demo database into a dispatcher."""

    def __init__(self):
        DWCDB.load_config('server.conf')
        self.db = DWCDB('demo')
        self.dispatcher = Dispatcher()

    def parse(self, parser):
        """Dispatch every message produced by the given parser."""
        for message in self.db.get_messages(parser):
            self.dispatcher.send_message(message.mapping_id, message, self, 100)

    # This generator does not need to track acknowledgements.
    def ack_message(self, chn, msg, recipient):
        return

    def nack_message(self, chn, msg, recipient):
        return
66 |
generator = TestGenerator()
generator.dispatcher.add_handler(TestHandler())
generator.dispatcher.add_dead_letter_handler(TestDeadLetterHandler())

parser = WaveSampleParser(limit = 500,
                          mapping_id = '85965f09-e8c2-4e79-8c1c-cb1775bd2550')
generator.parse(parser)

print('\n--- terminating ---')
generator.dispatcher.terminate()
generator.dispatcher.flush()
print('')
80 |
--------------------------------------------------------------------------------
/downcast/db/bcp/util.py:
--------------------------------------------------------------------------------
1 | #
2 | # downcast - tools for unpacking patient data from DWC
3 | #
4 | # Copyright (c) 2021 Laboratory for Computational Physiology
5 | #
6 | # This program is free software: you can redistribute it and/or modify
7 | # it under the terms of the GNU General Public License as published by
8 | # the Free Software Foundation, either version 3 of the License, or
9 | # (at your option) any later version.
10 | #
11 | # This program is distributed in the hope that it will be useful,
12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | # GNU General Public License for more details.
15 | #
16 | # You should have received a copy of the GNU General Public License
17 | # along with this program.  If not, see <https://www.gnu.org/licenses/>.
18 |
19 | import io
20 | import os
21 | import tempfile
22 |
# Probe, once at import time, whether opening /dev/fd/N yields an
# *independent* open file description (OFD) rather than dup()-like
# sharing of the original descriptor's OFD.
#
# Method: write to a temp file so its offset is nonzero, then reopen it
# via /dev/fd.  If the reopened handle refers to the same file
# (samefile) but reports a *different* offset, the OFD must be
# independent (presumably Linux-style /dev/fd, which reopens the
# underlying file; BSD-style /dev/fd dups the descriptor, so the
# offsets would match -- TODO confirm per-OS behavior).
_dev_fd_yields_independent_ofd = False
with tempfile.TemporaryFile() as f1:
    f1.write(b'what hath god wrought')
    f1.flush()
    try:
        with open('/dev/fd/%d' % f1.fileno(), 'rb') as f2:
            if (os.path.samefile(f1.fileno(), f2.fileno())
                and (os.lseek(f1.fileno(), 0, os.SEEK_CUR)
                     != os.lseek(f2.fileno(), 0, os.SEEK_CUR))):
                _dev_fd_yields_independent_ofd = True
    except OSError:
        # No usable /dev/fd filesystem; leave the flag False.
        pass
35 |
def open_copy(fileobj, *args, **kwargs):
    """
    Open a new file object that refers to the same underlying file.

    The input must be a Python file object.  The result is an
    independent file object referring to the same file.

    If the operating system provides a /dev/fd filesystem and that
    filesystem yields independent OFDs, this works even if the original
    file has been deleted or renamed.  Otherwise, the original filename
    (fileobj.name) is reopened instead, which fails if the file has
    been deleted or renamed.
    """
    # Unwrap text and buffering layers to reach the raw FileIO object.
    if isinstance(fileobj, io.TextIOWrapper):
        fileobj = fileobj.buffer
    if isinstance(fileobj, io.BufferedReader):
        fileobj = fileobj.raw
    if not isinstance(fileobj, io.FileIO):
        raise TypeError('not a native file object')

    if _dev_fd_yields_independent_ofd:
        # Reopening via /dev/fd gives a fresh OFD for the same file.
        return open('/dev/fd/%d' % fileobj.fileno(), *args, **kwargs)

    # Fall back to reopening by name, then verify it is still the
    # same underlying file (i.e. it was not renamed or replaced).
    original_path = fileobj.name
    original_fd = fileobj.fileno()
    reopened = open(original_path, *args, **kwargs)
    try:
        if not os.path.samefile(original_fd, reopened.fileno()):
            raise FileNotFoundError(0, 'File has been renamed', original_path)
        return reopened
    except OSError:
        reopened.close()
        raise
73 |
--------------------------------------------------------------------------------
/test-dispatcher:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python3
2 |
3 | from weakref import WeakSet
4 | from downcast.dispatcher import Dispatcher
5 |
class TestMessage:
    """Trivial message carrying a sequence number and a text payload."""

    def __init__(self, seqnum, msgstr):
        self.seqnum = seqnum
        self.msgstr = msgstr

    def __str__(self):
        return '[%d:%s]' % (self.seqnum, self.msgstr)
13 |
14 | # Note this is NOT meant as an example of how you should write a
15 | # handler, it's just to exercise the dispatcher logic
class TestHandler:
    """Consumes test messages strictly in sequence-number order, relying on
    the dispatcher's replay mechanism to reorder out-of-order arrivals.

    (Deliberately NOT an example of how to write a handler; it only
    exercises the dispatcher logic.)
    """

    def __init__(self):
        # Next sequence number this handler is willing to accept.
        self.seqnum = 0

    def send_message(self, channel, msg, dispatcher, ttl):
        print("  (%d)" % msg.seqnum)
        if msg.seqnum < self.seqnum:
            print("  ignored out-of-date %d" % msg.seqnum)
        else:
            # An expiring message forces us to skip ahead to it.
            if ttl == 0:
                self.seqnum = msg.seqnum

            # Defer by default; the dispatcher will replay held messages.
            dispatcher.nack_message(channel, msg, self, replay = True)
            if msg.seqnum == self.seqnum:
                print("  >> RECEIVED %d: %s" % (msg.seqnum, msg.msgstr))
                self.seqnum = msg.seqnum + 1
                dispatcher.ack_message(channel, msg, self)

    def flush(self):
        return
36 |
class TestDeadLetterHandler:
    """Reports messages that expired without being acknowledged."""

    def send_message(self, channel, msg, dispatcher, ttl):
        print("  Dead letter: %s" % msg)
40 |
class TestGenerator:
    """Creates test messages and checks that none are leaked after dispatch."""

    def __init__(self):
        self.dispatcher = Dispatcher(fatal_exceptions = True)
        # Weak references to acknowledged messages: anything still alive
        # in here after dispatch indicates a reference leak.
        self.dead_messages = WeakSet()

    def gen_message(self, channel, seqnum, msgstr):
        message = TestMessage(seqnum, msgstr)
        print("created %d" % seqnum)
        self.dispatcher.send_message(channel, message, self, 10)
        message = None
        leaked = len(self.dead_messages)
        if leaked != 0:
            print("*** LEAKED: %d" % leaked)

    def ack_message(self, channel, msg, recipient):
        self.dead_messages.add(msg)
        print("deleted %d" % msg.seqnum)

    def nack_message(self, channel, msg, recipient):
        print("deferred %d" % msg.seqnum)
60 |
g = TestGenerator()
g.dispatcher.add_handler(TestHandler())
g.dispatcher.add_dead_letter_handler(TestDeadLetterHandler())

# Feed messages in a deliberately scrambled order to exercise the
# dispatcher's deferral and replay logic.
test_messages = [
    (0, "test zero"), (1, "test one"), (5, "test five"), (2, "test two"),
    (3, "test three"), (4, "test four"), (-1000, "way out of order"),
    (6, "test six"), (7, "test seven"), (15, "test fifteen"),
    (14, "test fourteen"), (13, "test thirteen"), (12, "test twelve"),
    (11, "test eleven"), (10, "test ten"), (9, "test nine"),
    (8, "test eight"), (17, "test seventeen"),
]
for seqnum, text in test_messages:
    g.gen_message('x', seqnum, text)
print("--- flushing ---")
g.dispatcher.flush()
print("--- terminating ---")
g.dispatcher.terminate()
89 |
--------------------------------------------------------------------------------
/downcast/timeconv.py:
--------------------------------------------------------------------------------
1 | #
2 | # dwctimeconv - convert between time formats in a converted record
3 | #
4 | # Copyright (c) 2020 Laboratory for Computational Physiology
5 | #
6 | # This program is free software: you can redistribute it and/or modify
7 | # it under the terms of the GNU General Public License as published by
8 | # the Free Software Foundation, either version 3 of the License, or
9 | # (at your option) any later version.
10 | #
11 | # This program is distributed in the hope that it will be useful,
12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | # GNU General Public License for more details.
15 | #
16 | # You should have received a copy of the GNU General Public License
17 | # along with this program. If not, see .
18 |
19 | import argparse
20 | import json
21 | import os
22 | import re
23 | import sys
24 |
25 | from .timestamp import T
26 | from .output.archive import ArchiveRecord
27 |
def main():
    """Convert between time formats in a converted record.

    Each command-line argument is a wall-clock timestamp, a DWC
    sequence number ('S' + digits), or a WFDB counter value ('c' +
    digits).  For each argument, print the equivalent time, sequence
    number, and counter value ('-' where the conversion is unknown.)
    """
    p = argparse.ArgumentParser()
    p.add_argument('--record', metavar = 'PATH', default = '.')
    p.add_argument('timestamps', metavar = 'TIMESTAMP', nargs = '+')
    opts = p.parse_args()

    rec = ArchiveRecord(path = opts.record, servername = 'unknown',
                        record_id = os.path.basename(opts.record),
                        datestamp = 'unknown')
    seqnum0 = rec.seqnum0()

    for ts in opts.timestamps:
        # Start with all three values unknown; previously, whichever
        # values a branch didn't assign were left unbound, causing a
        # NameError below (e.g. 'S...' input when seqnum0 is None.)
        time = seqnum = counter = None

        if re.fullmatch(r'S\d+', ts):
            # sequence number
            seqnum = int(ts[1:])
            time = rec.time_map.get_time(seqnum)
            if seqnum0 is not None:
                counter = seqnum - seqnum0
        elif re.fullmatch(r'c\d+', ts):
            # counter value
            counter = int(ts[1:])
            if seqnum0 is not None:
                seqnum = seqnum0 + counter
                time = rec.time_map.get_time(seqnum)
        else:
            # wall clock timestamp
            try:
                time = T(ts)
            except ValueError:
                sys.stderr.write('%s: invalid argument: %s\n' % (sys.argv[0], ts))
                sys.stderr.write('valid timestamp formats:\n')
                sys.stderr.write(' YYYY-MM-DD HH:MM:SS.SSS +ZZ:ZZ\n')
                sys.stderr.write(' S######### (DWC sequence number)\n')
                sys.stderr.write(' c######### (WFDB counter value)\n')
                sys.exit(1)

            seqnum = rec.time_map.get_seqnum(time)
            # get_seqnum may return None; guard the subtraction.
            if seqnum is not None and seqnum0 is not None:
                counter = seqnum - seqnum0

        time_str = '-' if time is None else str(time)
        seqnum_str = '-' if seqnum is None else 'S%s' % seqnum
        counter_str = '-' if counter is None else 'c%s' % counter

        print('%-24s\t%-8s\t%-8s' % (time_str, seqnum_str, counter_str))
84 |
--------------------------------------------------------------------------------
/downcast/output/patients.py:
--------------------------------------------------------------------------------
1 | #
2 | # downcast - tools for unpacking patient data from DWC
3 | #
4 | # Copyright (c) 2018 Laboratory for Computational Physiology
5 | #
6 | # This program is free software: you can redistribute it and/or modify
7 | # it under the terms of the GNU General Public License as published by
8 | # the Free Software Foundation, either version 3 of the License, or
9 | # (at your option) any later version.
10 | #
11 | # This program is distributed in the hope that it will be useful,
12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | # GNU General Public License for more details.
15 | #
16 | # You should have received a copy of the GNU General Public License
17 | # along with this program. If not, see .
18 |
19 | from ..messages import (PatientBasicInfoMessage,
20 | PatientDateAttributeMessage,
21 | PatientStringAttributeMessage)
22 |
class PatientHandler:
    """Output handler that records patient information messages.

    Each supported message is deferred (nacked) while its record is
    looked up, logged to the record's '_phi_patient_info' file, and
    then acknowledged.  Messages whose record is not yet known are
    left deferred.
    """

    def __init__(self, archive):
        self.archive = archive

    def send_message(self, chn, msg, source, ttl):
        if isinstance(msg, PatientBasicInfoMessage):
            source.nack_message(chn, msg, self)
            record = self.archive.get_record(msg)
            if record is None:
                return
            for (key, value) in (('BedLabel', msg.bed_label),
                                 ('Alias', msg.alias),
                                 ('Category', msg.category),
                                 ('Height', msg.height),
                                 ('HeightUnit', msg.height_unit),
                                 ('Weight', msg.weight),
                                 ('WeightUnit', msg.weight_unit),
                                 ('PressureUnit', msg.pressure_unit),
                                 ('PacedMode', msg.paced_mode),
                                 ('ResuscitationStatus',
                                  msg.resuscitation_status),
                                 ('AdmitState', msg.admit_state),
                                 ('ClinicalUnit', msg.clinical_unit),
                                 ('Gender', msg.gender)):
                self._log_info(record, msg, key, value)
            source.ack_message(chn, msg, self)

        elif isinstance(msg, PatientDateAttributeMessage):
            self._log_attribute(chn, msg, source, 'd')
        elif isinstance(msg, PatientStringAttributeMessage):
            self._log_attribute(chn, msg, source, 's')

    def _log_attribute(self, chn, msg, source, prefix):
        # Shared path for date ('d') and string ('s') attributes.
        source.nack_message(chn, msg, self)
        record = self.archive.get_record(msg)
        if record is None:
            return
        self._log_info(record, msg, '%s:%s' % (prefix, msg.name), msg.value)
        source.ack_message(chn, msg, self)

    def _log_info(self, record, msg, key, value):
        logfile = record.open_log_file('_phi_patient_info')
        logfile.append('%s,%s,%s' % (msg.timestamp, _escape(key),
                                     _escape(str(value))))

    def flush(self):
        self.archive.flush()
71 |
# Characters escaped in the patient-info log: ASCII control characters,
# DEL, and the delimiter/quote characters.
_escape_chars = [*range(32), 127, *(ord(c) for c in ',"\'\\')]
_escape_table = str.maketrans({c: '\\%03o' % c for c in _escape_chars})

def _escape(s):
    """Replace each special character in s with a \\ooo octal escape."""
    return s.translate(_escape_table)
76 |
--------------------------------------------------------------------------------
/downcast/db/bcp/types.py:
--------------------------------------------------------------------------------
1 | #
2 | # downcast - tools for unpacking patient data from DWC
3 | #
4 | # Copyright (c) 2018 Laboratory for Computational Physiology
5 | #
6 | # This program is free software: you can redistribute it and/or modify
7 | # it under the terms of the GNU General Public License as published by
8 | # the Free Software Foundation, either version 3 of the License, or
9 | # (at your option) any later version.
10 | #
11 | # This program is distributed in the hope that it will be useful,
12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | # GNU General Public License for more details.
15 | #
16 | # You should have received a copy of the GNU General Public License
17 | # along with this program. If not, see .
18 |
import decimal
import uuid
from datetime import date, datetime, time, timezone

from ... import timestamp
24 |
class BCPType:
    """Base class for BCP database column types.

    Subclasses supply from_bytes(), which turns the raw byte string
    read from the BCP file into the appropriate Python value, and may
    also override from_param(), which maps a value passed to execute()
    into the type used for comparison (the identity function by
    default.)  These are deliberately written without a 'self'
    parameter: they are called directly on the class object.
    """

    # def from_bytes(b):
    #     return b

    def from_param(p):
        # Identity conversion: compare parameters as given.
        return p
44 |
45 | # DB-API types
46 |
class BINARY(BCPType):
    """BCP type for a binary column."""
    # Binary columns pass through as the raw byte string.
    def from_bytes(b):
        return b
51 |
class STRING(BCPType):
    """BCP type for a string column.

    A single NUL byte denotes the empty string; any other value is
    decoded as UTF-8.
    """
    def from_bytes(b):
        return '' if b == b'\0' else b.decode('UTF-8')
59 |
class NUMBER(BCPType):
    """BCP type for a real number column."""
    # Parse the decimal text exactly, avoiding float rounding.
    def from_bytes(b):
        return decimal.Decimal(b.decode())
64 |
class DATETIME(BCPType):
    """BCP type for a timestamp column."""
    # Values are parsed with the project's T class, which accepts the
    # MS SQL time string format.
    def from_bytes(b):
        return timestamp.T(b.decode())
    def from_param(p):
        return timestamp.T(p)
71 |
class ROWID(BCPType):
    """BCP type for a row-ID column."""
    # Row IDs are plain integer text.
    def from_bytes(b):
        return int(b)
76 |
77 | # Additional types
78 |
class INTEGER(BCPType):
    """BCP type for an integer column."""
    # Plain integer text.
    def from_bytes(b):
        return int(b)
83 |
class BOOLEAN(BCPType):
    """BCP type for a boolean column (stored as integer text;
    nonzero is true.)"""
    def from_bytes(b):
        return int(b) != 0
    def from_param(p):
        return bool(p)
90 |
class UUID(BCPType):
    """BCP type for a UUID column, parsed from its text form."""
    def from_bytes(b):
        text = b.decode()
        return uuid.UUID(text)
    def from_param(p):
        return uuid.UUID(p)
97 |
# DB-API conversion functions (PEP 249 module constructors)

Binary = bytes

# Bug fix: these previously aliased datetime.date / datetime.time,
# which are *methods* of the datetime class (not the date/time
# classes), so Date(year, month, day) raised TypeError.  Alias the
# real constructor classes instead.
Date = date
Time = time

def Timestamp(year, month, day, hour, minute, second):
    """Construct a timezone-aware (UTC) timestamp from components."""
    return datetime(year, month, day, hour, minute, second,
                    tzinfo = timezone.utc)

def TimestampFromTicks(ticks):
    """Construct a UTC timestamp from a POSIX time in seconds."""
    return datetime.fromtimestamp(ticks, tz = timezone.utc)

def DateFromTicks(ticks):
    """Construct a date (UTC calendar day) from a POSIX time."""
    return TimestampFromTicks(ticks).date()

def TimeFromTicks(ticks):
    """Construct a UTC time of day from a POSIX time."""
    return TimestampFromTicks(ticks).time()
116 |
--------------------------------------------------------------------------------
/downcast/bcpmerge.py:
--------------------------------------------------------------------------------
1 | #
2 | # downcast - tools for unpacking patient data from DWC
3 | #
4 | # Copyright (c) 2021 Laboratory for Computational Physiology
5 | #
6 | # This program is free software: you can redistribute it and/or modify
7 | # it under the terms of the GNU General Public License as published by
8 | # the Free Software Foundation, either version 3 of the License, or
9 | # (at your option) any later version.
10 | #
11 | # This program is distributed in the hope that it will be useful,
12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | # GNU General Public License for more details.
15 | #
16 | # You should have received a copy of the GNU General Public License
17 | # along with this program. If not, see .
18 |
19 | import argparse
20 | import heapq
21 | import os
22 |
23 | from .db import dwcbcp
24 | from .messages import (AlertMessage, BedTagMessage, EnumerationValueMessage,
25 | NumericValueMessage, PatientBasicInfoMessage,
26 | PatientDateAttributeMessage, PatientMappingMessage,
27 | PatientStringAttributeMessage, WaveSampleMessage,
28 | bcp_format_description, bcp_format_message)
29 | from .parser import (AlertParser, BedTagParser, EnumerationValueParser,
30 | NumericValueParser, PatientBasicInfoParser,
31 | PatientDateAttributeParser, PatientMappingParser,
32 | PatientStringAttributeParser, WaveSampleParser)
33 | from .timestamp import T
34 |
def merge_files(table_abbr, input_files, output_data_file,
                output_format_file = None, start = None, end = None):
    """Merge several BCP data files for one table into a single file.

    table_abbr:         abbreviated table name (e.g. 'WaveSample')
    input_files:        iterable of (data_file, format_file) pairs
    output_data_file:   path of the merged BCP data file to write
    output_format_file: if not None, also write a BCP format file
                        describing the merged table
    start, end:         optional time bounds, passed to the parser as
                        time_ge / time_lt

    Rows are parsed into message objects, merged in timestamp order,
    and re-serialized in BCP format.
    """
    parser_message_types = {
        'Alert': (AlertParser,
                  AlertMessage),
        'BedTag': (BedTagParser,
                   BedTagMessage),
        'EnumerationValue': (EnumerationValueParser,
                             EnumerationValueMessage),
        'NumericValue': (NumericValueParser,
                         NumericValueMessage),
        'Patient': (PatientBasicInfoParser,
                    PatientBasicInfoMessage),
        'PatientDateAttribute': (PatientDateAttributeParser,
                                 PatientDateAttributeMessage),
        'PatientMapping': (PatientMappingParser,
                           PatientMappingMessage),
        'PatientStringAttribute': (PatientStringAttributeParser,
                                   PatientStringAttributeMessage),
        'WaveSample': (WaveSampleParser,
                       WaveSampleMessage),
    }
    (parser_type, message_type) = parser_message_types[table_abbr]
    table = '_Export.%s_' % table_abbr

    input_files = list(input_files)
    # Keep every connection and cursor alive in these lists: the
    # message iterators below read from them lazily, so they must not
    # be garbage-collected until the merge loop has finished.
    dbs = []
    cursors = []
    message_iters = []
    for (data_file, format_file) in input_files:
        db = dwcbcp.DWCBCPConnection([])
        db.add_data_file(table, data_file, format_file)
        dbs.append(db)
        cursor = db.cursor()
        cursors.append(cursor)
        parser = parser_type(limit = None, dialect = 'sqlite',
                             paramstyle = dwcbcp.paramstyle,
                             time_ge = start, time_lt = end)
        message_iter = parser.parse(origin = None, cursor = cursor)
        message_iters.append(message_iter)

    # Lazy k-way merge; each input iterator is assumed to already be
    # sorted by timestamp.
    with open(output_data_file, 'wb') as outf:
        for message in heapq.merge(*message_iters,
                                   key = lambda x: x.timestamp):
            outf.write(bcp_format_message(message))

    if output_format_file is not None:
        with open(output_format_file, 'w') as fmtf:
            fmtf.write(bcp_format_description(message_type))
84 |
def _parse_timestamp(arg):
    """argparse type callback: parse a timestamp argument into a T.

    Raises argparse.ArgumentTypeError on malformed input so argparse
    reports a clean usage error.  (The exception must be qualified
    with the module name - only 'import argparse' is in scope.)
    """
    try:
        return T(arg)
    except Exception:
        raise argparse.ArgumentTypeError(
            "%r is not in the format 'YYYY-MM-DD HH:MM:SS.SSS +ZZ:ZZ'" % arg)
91 |
def main():
    """Command-line entry point for merging BCP data files."""
    p = argparse.ArgumentParser()
    p.add_argument('-t', '--table', metavar = 'TABLE')
    p.add_argument('-f', '--format-file',
                   metavar = 'TABLE.fmt', required = True)
    p.add_argument('-o', '--output-file',
                   metavar = 'OUTPUT.dat', required = True)
    p.add_argument('--start', metavar = 'TIME', type = _parse_timestamp)
    p.add_argument('--end', metavar = 'TIME', type = _parse_timestamp)
    p.add_argument('input_files', metavar = 'INPUT.dat', nargs = '+')
    opts = p.parse_args()

    # Infer the table name from the format file's basename if -t was
    # not given.
    if opts.table is not None:
        table_abbr = opts.table
    else:
        table_abbr, _ = os.path.splitext(os.path.basename(opts.format_file))

    input_files = [(path, opts.format_file) for path in opts.input_files]

    # Write a format file alongside the output data file, named after
    # the output file's basename.
    out_dir = os.path.dirname(opts.output_file)
    out_abbr, _ = os.path.splitext(os.path.basename(opts.output_file))
    output_format_file = os.path.join(out_dir, out_abbr + '.fmt')

    merge_files(table_abbr, input_files, opts.output_file,
                output_format_file = output_format_file,
                start = opts.start, end = opts.end)
117 |
--------------------------------------------------------------------------------
/downcast/output/log.py:
--------------------------------------------------------------------------------
1 | #
2 | # downcast - tools for unpacking patient data from DWC
3 | #
4 | # Copyright (c) 2018 Laboratory for Computational Physiology
5 | #
6 | # This program is free software: you can redistribute it and/or modify
7 | # it under the terms of the GNU General Public License as published by
8 | # the Free Software Foundation, either version 3 of the License, or
9 | # (at your option) any later version.
10 | #
11 | # This program is distributed in the hope that it will be useful,
12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | # GNU General Public License for more details.
15 | #
16 | # You should have received a copy of the GNU General Public License
17 | # along with this program. If not, see .
18 |
19 | import heapq
20 | import re
21 |
22 | _timestamp_pattern = re.compile(b'S?[0-9]+\n')
23 |
class ArchiveLogReader:
    """Class for reading log entries from a mostly-sorted input file.

    Each line in the input file is either a data record, a timestamp,
    or a sequence number.

    Timestamps are written as a decimal integer (interpreted as a
    string of digits, giving the UTC year, month, day, hour, minute,
    second, and microsecond).

    Sequence numbers are written as the letter 'S' followed by a
    decimal integer (interpreted as the number of milliseconds since
    the epoch.)

    All other lines in the file are treated as data records, and are
    associated with the preceding timestamp and sequence number (thus
    allowing basic compression, while keeping the file format
    extremely simple.)

    When reading the file, data records are returned in order (sorting
    first by sequence number, then by timestamp, then by order in the
    input file.)  This will be done efficiently if the input file is
    mostly sorted to begin with (and less efficiently otherwise.)

    No attempt is made to remove duplicate or invalid records - this
    must be done by the caller.

    If the file is modified after being opened, then garbage in,
    garbage out.
    """

    def __init__(self, filename, allow_missing = False):
        # Open the file
        try:
            fp = open(filename, 'rb')
        except FileNotFoundError:
            if allow_missing:
                # Missing file behaves as an empty log; missing()
                # reports True.
                self._fp = None
                self._subsequences = None
                return
            else:
                raise
        self._fp = fp
        # Heap of sorted-subsequence start positions, populated as a
        # side effect of unsorted_items() and consumed by
        # sorted_items().
        self._subsequences = None

    def close(self):
        """Close the underlying file and discard cached state."""
        if self._fp:
            self._fp.close()
        self._subsequences = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def missing(self):
        # True if the file did not exist (only with allow_missing.)
        return (self._fp is None)

    def unsorted_items(self):
        """Yield (seqnum, timestamp, line) tuples in file order.

        As a side effect, records the file offset at which each sorted
        subsequence of records begins, for later use by
        sorted_items().
        """
        fp = self._fp
        if not fp:
            return
        fp.seek(0)
        subseq = []
        prev_t = None
        # Records before any timestamp/seqnum line get (0, 0).
        sn = ts = 0
        t = (sn, ts)
        for line in fp:
            if _timestamp_pattern.fullmatch(line):
                if line[0] == 83: # ASCII 'S'
                    sn = int(line[1:])
                    t = (sn, ts)
                else:
                    ts = int(line)
                    t = (sn, ts)
            else:
                yield (sn, ts, line)
                if not subseq or t < prev_t:
                    # The sort key decreased (or this is the first
                    # data record): a new sorted subsequence starts at
                    # this line's offset.
                    fpos = fp.tell() - len(line)
                    subseq.append((sn, ts, fpos))
                prev_t = t
        heapq.heapify(subseq)
        self._subsequences = subseq

    def sorted_items(self):
        """Yield (seqnum, timestamp, line) tuples in sorted order.

        Performs a k-way merge of the sorted subsequences located by
        unsorted_items(), re-reading them from the file.
        """
        if self._subsequences is None:
            # No cached subsequence positions; scan the file once to
            # find them.
            for _ in self.unsorted_items():
                pass

        fp = self._fp
        subseq = self._subsequences
        self._subsequences = None
        while subseq:
            # Begin reading the earliest subsequence
            p = heapq.heappop(subseq)
            (sn, ts, fpos) = prev_p = p
            fp.seek(fpos)

            for line in fp:
                if _timestamp_pattern.fullmatch(line):
                    if line[0] == 83: # ASCII 'S'
                        sn = int(line[1:])
                        p = (sn, ts, fpos)
                    else:
                        ts = int(line)
                        p = (sn, ts, fpos)
                else:
                    if p < prev_p:
                        # reached end of subsequence
                        break
                    # Note that because the subsequences are disjoint,
                    # this comparison is valid even though fpos is not
                    # continuously updated.
                    elif subseq and p > subseq[0]:
                        # switch to other subsequence
                        fpos = fp.tell() - len(line)
                        p = heapq.heapreplace(subseq, (sn, ts, fpos))
                        (sn, ts, fpos) = prev_p = p
                        fp.seek(fpos)
                        # NOTE(review): next_p is never read - looks
                        # like leftover from an earlier version;
                        # verify before removing.
                        next_p = subseq[0]
                    else:
                        # continue with current subsequence
                        yield (sn, ts, line)
                        prev_p = p
149 |
--------------------------------------------------------------------------------
/INTERNALS:
--------------------------------------------------------------------------------
1 | Block diagram
2 | =============
3 |
4 | +-----------+
5 | | |
6 | | Extractor |
7 | | |
8 | +-----------+
9 | |
10 | | schedules
11 | | queries
12 | v
13 | +----------------+
14 | | | sets parameters +---------------+
15 | | | ------------------> | |
16 | | | generates SQL | MessageParser |
17 | | | <------------------ | |
18 | | ExtractorQueue | +---------------+
19 | | |
20 | | | +-------------+
21 | | | ----> | state files |
22 | | | +-------------+
23 | +----------------+
24 | sends | ^
25 | messages | | sends
26 | v | acknowledgements
27 | +------------+
28 | | |
29 | | Dispatcher |
30 | | |
31 | +------------+
32 | sends | ^
33 | messages | | sends
34 | v | acknowledgements
35 | +--------------+
36 | | | +--------------+
| OutputHandler| ----> | output files |
38 | | | +--------------+
39 | +--------------+
40 |
41 |
42 | Extractor
43 | =========
44 |
45 | The Extractor manages the overall flow of the conversion process:
46 | deciding the order that queries are issued.
47 |
48 | Messages will need to be retrieved from many different sources,
49 | represented by "queues", and then passed on to the dispatcher and
50 | output handlers. The task of the extractor is to decide which
51 | queue(s) to read.
52 |
53 | (A "message", by the way, is an object that will generally correspond
54 | to a single row in a single table of the DWC database, but this may
55 | not always be the case.)
56 |
57 |
58 | ExtractorQueue
59 | ==============
60 |
61 | An ExtractorQueue is created for each input data source (~ each
62 | database table that is to be polled.)
63 |
64 | The ExtractorQueue, with the help of a MessageParser, determines what
65 | SQL queries are to be issued, making sure that no messages are missed
66 | while avoiding (as much as possible) querying the same message more
67 | than once. This is closely related to the task of tracking which
68 | messages have been seen and acknowledged so far.
69 |
70 | In general, there will be some "earliest unacked" timestamp (before
71 | which, all messages have been acknowledged), and some "latest seen"
72 | timestamp (after which, no messages have yet been seen.) In between
73 | are the messages that have been seen but not yet acknowledged.
74 |
75 | When conversion is halted, the queue will need to save a state file
76 | containing:
77 |
78 | - the "earliest unacked" timestamp
79 |
80 | - a list of all the messages after that point that have already been
81 | acknowledged
82 |
83 | so that when conversion is resumed, it can resume from the same
84 | point, and ideally ignore all messages that have already been
85 | processed. The format of this state file needs to be determined; it
86 | must be fairly simple and robust, so that newer versions of the
87 | program can read state files created by older versions, and vice
88 | versa.
89 |
90 |
91 | MessageParser
92 | =============
93 |
94 | A MessageParser handles the actual details of the database structure:
95 | how to translate a particular request for messages into an SQL
96 | statement, and how to translate the results into the appropriate
97 | message type. If details of the database are changed from one DWC
98 | version to another, this is where they will need to be addressed.
99 |
100 |
101 | Dispatcher
102 | ==========
103 |
104 | The Dispatcher keeps track of all messages that have been received by
105 | the various queues, and passes them on to the various output
106 | handlers.
107 |
108 | The chief purpose of having an intermediate dispatcher object is to
109 | ensure that all messages, whatever their origin, will eventually
110 | expire, and will not be kept in "seen but not yet acknowledged" state
111 | indefinitely.
112 |
113 |
114 | OutputHandler
115 | =============
116 |
117 | The various OutputHandlers are responsible for interpreting the
118 | incoming messages and storing them in the appropriate output files.
119 |
120 | When a handler receives a message, it can do several things with it:
121 |
122 | - ignore it, implicitly indicating that the handler doesn't know
123 | what to do with it;
124 |
125 | - acknowledge it, indicating that its contents have now been saved
126 | to the appropriate output file, and the message may now be
127 | discarded;
128 |
129 | - or defer it, indicating that the handler is interested in this
130 | message but is unable to process it immediately.
131 |
132 | Deferring a message can occur for several reasons:
133 |
134 | - because the handler requires additional metadata that is not yet
135 | available;
136 |
137 | - because the handler wants to aggregate all simultaneous events
138 | (e.g., numerics or wave samples) in a single file, and it hasn't
139 | yet received all of the messages for this time period;
140 |
141 | - or because the messages it has received are not in chronological
142 | order, and it's waiting to see if a later message will fill in the
143 | gap.
144 |
145 | (The precise details will need to be established once we have an
146 | actual database to examine; for example, we don't currently know
147 | whether it's even possible for messages to appear out of order.)
148 |
149 | In addition to incomplete output files, output handlers must be able
150 | to save their current state to appropriate state files, so that (just
151 | as with queue state files) the program can be stopped and restarted
152 | without creating any discontinuity in the output.
153 |
--------------------------------------------------------------------------------
/downcast/timestamp.py:
--------------------------------------------------------------------------------
1 | #
2 | # downcast - tools for unpacking patient data from DWC
3 | #
4 | # Copyright (c) 2018 Laboratory for Computational Physiology
5 | #
6 | # This program is free software: you can redistribute it and/or modify
7 | # it under the terms of the GNU General Public License as published by
8 | # the Free Software Foundation, either version 3 of the License, or
9 | # (at your option) any later version.
10 | #
11 | # This program is distributed in the hope that it will be useful,
12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | # GNU General Public License for more details.
15 | #
16 | # You should have received a copy of the GNU General Public License
17 | # along with this program. If not, see .
18 |
19 | import re
20 | from datetime import datetime, timedelta, timezone
21 |
class T(datetime):
    """Date/time class using MS SQL time string format.

    This class is a wrapper around the standard datetime class, but
    its constructor accepts either a datetime object, or a string in
    the ISO 8601 format used by MS SQL.

    Addition, subtraction, and comparison work as for normal datetime
    objects.  repr and str produce something sensible.
    """

    # Note that the following pattern recognizes several formats:
    #
    #   YYYY-MM-DD HH:MM:SS.SSS +ZZ:ZZ    (MS SQL)
    #   YYYY-MM-DD HH:MM:SS.SSSSSS+ZZ:ZZ  (datetime.__str__)
    #   YYYY-MM-DD HH:MM:SS+ZZ:ZZ   (datetime.__str__ if microseconds = 0)
    #
    # The first format is what should normally be used, but for some
    # reason the timestamps in _phi_time_map files are sometimes
    # written in the latter two formats - this is a bug somewhere in
    # downcast.output.timemap, but for now we need to support the
    # existing _phi_time_map files.

    # Raw strings are required here: '\A', '\d', '\s' in a plain
    # string literal are invalid escape sequences (DeprecationWarning,
    # and SyntaxWarning on newer Python versions.)
    _pattern = re.compile(r'\A(\d+)-(\d+)-(\d+)\s+' +
                          r'(\d+):(\d+):(\d+)(\.\d+)?\s*' +
                          r'([-+])(\d+):(\d+)\Z', re.ASCII)

    def __new__(cls, val, *args):
        # The constructor may be called in various ways:
        #  - T(str), to explicitly convert from a time string
        #  - T(datetime), to explicitly convert from a datetime
        #  - T(int, int, int, int, int, int, int, tzinfo),
        #    used by __add__ and __sub__ in Python 3.8
        #  - T(bytes, tzinfo), used by pickle.loads
        # Only the first two (single argument) forms should be used by
        # applications.

        if args:
            return datetime.__new__(cls, val, *args)

        if isinstance(val, datetime):
            tz = val.tzinfo
            if tz is None:
                raise TypeError('missing timezone')
            return datetime.__new__(
                cls,
                year = val.year,
                month = val.month,
                day = val.day,
                hour = val.hour,
                minute = val.minute,
                second = val.second,
                microsecond = val.microsecond,
                tzinfo = tz)

        m = T._pattern.match(val)
        if m is None:
            raise ValueError('malformed timestamp string %r' % (val,))

        second = int(m.group(6))
        microsecond = round(float(m.group(7) or 0) * 1000000)
        # datetime doesn't support leap seconds, and DWC probably
        # doesn't support them either, but allow for the possibility
        # here just in case.  If there is a leap second, it is
        # silently compressed into the final millisecond of the
        # preceding second; this will result in one or more
        # discontinuities in the record time map.
        if second == 60:
            second = 59
            microsecond = 999000 + microsecond // 1000

        tzs = 1 if m.group(8) == '+' else -1
        tz = timezone(timedelta(hours = tzs * int(m.group(9)),
                                minutes = tzs * int(m.group(10))))

        return datetime.__new__(
            cls,
            year = int(m.group(1)),
            month = int(m.group(2)),
            day = int(m.group(3)),
            hour = int(m.group(4)),
            minute = int(m.group(5)),
            second = second,
            microsecond = microsecond,
            tzinfo = tz)

    def __str__(self):
        # Render in MS SQL format, with millisecond precision when
        # possible (microsecond precision otherwise.)
        tzoffs = round(self.tzinfo.utcoffset(None).total_seconds() / 60)
        (tzh, tzm) = divmod(abs(tzoffs), 60)
        if self.microsecond % 1000 == 0:
            f = '%03d' % (self.microsecond // 1000)
        else:
            f = '%06d' % self.microsecond
        return ('%04d-%02d-%02d %02d:%02d:%02d.%s %s%02d:%02d'
                % (self.year, self.month, self.day,
                   self.hour, self.minute, self.second, f,
                   ('-' if tzoffs < 0 else '+'), tzh, tzm))

    def __repr__(self):
        return ('%s(%r)' % (self.__class__.__name__, T.__str__(self)))

    def strftime_local(self, fmt):
        """Format time as a string, using its original timezone."""
        return datetime.strftime(self, fmt)

    def strftime_utc(self, fmt):
        """Convert time to UTC and format as a string."""
        return datetime.strftime(self.astimezone(timezone.utc), fmt)
130 |
131 |
# Before Python 3.8, arithmetic on a datetime subclass returned a
# plain datetime rather than the subclass; if this interpreter behaves
# that way, patch T's operators to convert results back to T.
if not isinstance(T('1800-01-01 00:00:00.000 +00:00') + timedelta(0), T):
    # the following are redundant in Python 3.8
    # also, the above line is a nice sanity check in case Python
    # decides to break this stuff *again*

    def _add_and_convert(a, b):
        # T + timedelta (or timedelta + T) yields a T.
        return T(datetime.__add__(a, b))
    T.__add__ = _add_and_convert
    T.__radd__ = _add_and_convert

    def _sub_and_convert(a, b):
        # T - timedelta yields a T; T - datetime yields a timedelta,
        # which is passed through unchanged.
        d = datetime.__sub__(a, b)
        if isinstance(d, datetime):
            return T(d)
        else:
            return d
    T.__sub__ = _sub_and_convert
149 |
150 |
def delta_ms(time_a, time_b):
    """Compute the difference between two timestamps in milliseconds."""
    # Floor-divide the timedelta by one millisecond; this equals the
    # hand-expanded days/seconds/microseconds arithmetic exactly,
    # including for negative differences.
    return (time_a - time_b) // timedelta(milliseconds = 1)
156 |
# Placeholder timestamp intended to predate any real data.
very_old_timestamp = T('1800-01-01 00:00:00.000 +00:00')
# DWC epoch; sequence numbers are presumably milliseconds since this
# point (see downcast/output/log.py) - TODO confirm.
dwc_epoch = T('2000-01-01 12:00:00.000 +00:00')
159 |
--------------------------------------------------------------------------------
/downcast/output/files.py:
--------------------------------------------------------------------------------
1 | #
2 | # downcast - tools for unpacking patient data from DWC
3 | #
4 | # Copyright (c) 2018 Laboratory for Computational Physiology
5 | #
6 | # This program is free software: you can redistribute it and/or modify
7 | # it under the terms of the GNU General Public License as published by
8 | # the Free Software Foundation, either version 3 of the License, or
9 | # (at your option) any later version.
10 | #
11 | # This program is distributed in the hope that it will be useful,
12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | # GNU General Public License for more details.
15 | #
16 | # You should have received a copy of the GNU General Public License
17 | # along with this program. If not, see .
18 |
19 | import os
20 | import errno
21 | import mmap
22 |
23 | from ..util import fdatasync
24 |
class ArchiveLogFile:
    """Append-only text log output file.

    Messages can only be appended to the end of the log file.
    Messages must be strings and are always encoded as UTF-8.

    When the file is opened, if it ends with an incomplete message
    (i.e., the program writing the file crashed or ran out of space),
    a special marker is appended to indicate that the line is invalid.
    """

    def __init__(self, filename):
        # Open for appending; create the file if it does not exist.
        self.fp = open(filename, 'a+b')
        self.fsync_before_close = False

        # Inspect the final byte of the file.  If it isn't a newline,
        # the last line was never completed; append a marker so that
        # readers know to discard that line.
        try:
            self.fp.seek(-1, os.SEEK_END)
        except OSError as e:
            # EINVAL: the file is empty, nothing to repair.
            if e.errno != errno.EINVAL:
                raise
            return
        last = self.fp.read(1)
        if last not in (b'\n', b''):
            self.fp.write(b'\030\r####\030\n')

    def append(self, msg):
        """Write a message to the end of the file.

        A line feed is appended automatically.
        """
        data = msg.encode('UTF-8') + b'\n'
        self.fp.write(data)

    def append_raw(self, msg):
        """Write a raw binary message to the end of the file."""
        self.fp.write(msg)

    def flush(self, fsync = True):
        """Ensure that previous messages are saved to disk."""
        self.fp.flush()
        if not fsync:
            return
        fdatasync(self.fp.fileno())

    def close(self, fsync = True):
        """Flush and close the file.

        Closing is idempotent, except that closing with fsync after a
        previous close without fsync raises an exception.
        """
        if self.fp.closed and (not fsync or self.fsync_before_close):
            return
        self.flush(fsync = fsync)
        self.fp.close()
        self.fsync_before_close = fsync
84 |
class ArchiveBinaryFile:
    """Random-access binary output file.

    Binary data may be written to any location in the file. This uses
    mmap internally, so the output file must support mmap.

    For efficiency, the file on disk will be resized in units of
    mmap.PAGESIZE (or more) at a time; the file will be truncated to
    its "real" size when flush or close is called.
    """

    def __init__(self, filename, window_size = None):
        # Open the file R/W and create if missing, never truncate
        self.fd = os.open(filename, os.O_RDWR|os.O_CREAT, 0o666)
        self.fsync_before_close = False

        # current_size is the size on disk (padded out to the mapping
        # window); real_size is the logical size seen by callers.
        self.current_size = os.lseek(self.fd, 0, os.SEEK_END)
        self.real_size = self.current_size

        # Round the requested window size up to a power-of-two
        # multiple of the page size (minimum two pages).
        self.window_size = mmap.PAGESIZE * 2
        if window_size is not None:
            while self.window_size < window_size:
                self.window_size *= 2

        self.map_start = self.map_end = 0
        self.map_buffer = None

    def _map_range(self, start, end):
        """Ensure that file offsets [start, end) are mapped."""
        # BUG FIX: the previous test ("end < map_start or start >=
        # map_end") only remapped when the requested range was
        # completely disjoint from the current window, so a write
        # straddling a window edge indexed outside the mapped buffer.
        # Remap whenever the range is not fully contained.
        if (self.map_buffer is None
                or start < self.map_start or end > self.map_end):
            # Align the window start to a page boundary.  (mmap
            # offsets must be multiples of ALLOCATIONGRANULARITY,
            # which equals PAGESIZE on POSIX systems.)
            start -= start % mmap.PAGESIZE
            if end < start + self.window_size:
                end = start + self.window_size
            else:
                end += mmap.PAGESIZE - (end % mmap.PAGESIZE)
            # Grow the file on disk so the mapping is valid.
            if end > self.current_size:
                os.ftruncate(self.fd, end)
                self.current_size = end
            # Release the superseded mapping before creating a new one.
            if self.map_buffer is not None:
                self.map_buffer.close()
            self.map_buffer = mmap.mmap(self.fd, end - start, offset = start)
            self.map_start = start
            self.map_end = end

    def size(self):
        """Get the size of the file."""
        return self.real_size

    def truncate(self, size):
        """Truncate or extend the file to the given size."""
        # The on-disk size is adjusted lazily, at flush/close time.
        self.real_size = size

    def write(self, pos, data, mask = None):
        """Write data to the file, extending it if necessary.

        If mask is specified, it must be the same length as data; only
        the bits set in the mask are modified.
        """
        end = pos + len(data)
        if end > self.real_size:
            self.real_size = end
        self._map_range(pos, end)
        i = pos - self.map_start
        if mask is None:
            self.map_buffer[i : i + len(data)] = data
        else:
            # Read-modify-write each byte, changing only masked bits.
            for j in range(len(data)):
                self.map_buffer[i + j] = ((self.map_buffer[i + j] & ~mask[j])
                                          | (data[j] & mask[j]))

    def flush(self, fsync = True):
        """Ensure that the file contents are saved to disk."""
        # Drop the mapping; it will be recreated by the next write.
        self.map_start = self.map_end = 0
        if self.map_buffer is not None:
            self.map_buffer.close()
            self.map_buffer = None
        # Shrink (or grow) the on-disk file to the logical size.
        if self.real_size != self.current_size:
            os.ftruncate(self.fd, self.real_size)
            self.current_size = self.real_size
        if fsync:
            fdatasync(self.fd)

    def close(self, fsync = True):
        """Flush and close the file."""

        # closing should be idempotent - but raise an exception if
        # fsync = True and file was previously closed without fsync
        if self.fd is None:
            if not fsync or self.fsync_before_close:
                return

        self.flush(fsync = fsync)
        os.close(self.fd)
        self.fd = None
        self.fsync_before_close = fsync

    def __del__(self):
        self.close(fsync = False)
180 |
--------------------------------------------------------------------------------
/downcast/output/enums.py:
--------------------------------------------------------------------------------
1 | #
2 | # downcast - tools for unpacking patient data from DWC
3 | #
4 | # Copyright (c) 2018 Laboratory for Computational Physiology
5 | #
6 | # This program is free software: you can redistribute it and/or modify
7 | # it under the terms of the GNU General Public License as published by
8 | # the Free Software Foundation, either version 3 of the License, or
9 | # (at your option) any later version.
10 | #
11 | # This program is distributed in the hope that it will be useful,
12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | # GNU General Public License for more details.
15 | #
16 | # You should have received a copy of the GNU General Public License
17 | # along with this program. If not, see .
18 |
19 | from datetime import datetime, timezone
20 | import os
21 |
22 | from ..messages import EnumerationValueMessage
23 | from .wfdb import (Annotator, AnnotationType)
24 |
25 | _del_control = str.maketrans({x: ' ' for x in list(range(32)) + [127]})
26 |
class EnumerationValueHandler:
    """Writes EnumerationValueMessages to per-record '_phi_enums' logs."""

    def __init__(self, archive):
        self.archive = archive
        # Per-record (sequence_number, timestamp) of the last event
        # written, so unchanged values are not repeated in the log.
        self.last_event = {}

    def send_message(self, chn, msg, source, ttl):
        if not isinstance(msg, EnumerationValueMessage):
            return

        source.nack_message(chn, msg, self)

        # Load metadata for this numeric
        attr = msg.origin.get_enumeration_attr(msg.enumeration_id, (ttl <= 0))
        if attr is None:
            # Metadata not yet available - hold message in pending and
            # continue processing
            return

        # Look up the corresponding record
        record = self.archive.get_record(msg)
        if record is None:
            # Record not yet available - hold message in pending and
            # continue processing
            return

        # Dump original message to BCP file if desired
        if record.dump(msg):
            source.ack_message(chn, msg, self)
            return

        # Open or create a log file
        logfile = record.open_log_file('_phi_enums')

        # Write the sequence number and timestamp to the log file
        # (only if they differ from the previous event)
        sn = msg.sequence_number
        ts = msg.timestamp
        (old_sn, old_ts) = self.last_event.get(record, (None, None))
        if sn != old_sn:
            logfile.append('S%s' % sn)
        if ts != old_ts:
            logfile.append(ts.strftime_utc('%Y%m%d%H%M%S%f'))
        self.last_event[record] = (sn, ts)

        # Write value to the log file
        lbl = attr.label.translate(_del_control)
        val = msg.value
        if val is None:
            val = ''
        else:
            val = val.translate(_del_control)
        # BUG FIX: write the sanitized label (lbl) rather than the raw
        # attr.label, so control characters cannot corrupt the
        # tab/newline-delimited log format.
        logfile.append('%s\t%d\t%s' % (lbl, attr.value_physio_id, val))
        source.ack_message(chn, msg, self)

    def flush(self):
        self.archive.flush()
83 |
# Known DWC annotation codes, and corresponding WFDB anntyp / subtyp / aux.
# Keys are value_physio_id tokens as they appear in the '_phi_enums'
# log (ASCII byte strings).
_ann_code = {
    b'148631': (AnnotationType.NORMAL, 0, None),  # N - normal
    b'148767': (AnnotationType.PVC, 0, None),     # V - ventricular
    b'147983': (AnnotationType.SVPB, 0, None),    # S - supraventricular
    b'148063': (AnnotationType.PACE, 0, None),    # P - paced (most common?)
    b'147543': (AnnotationType.PACE, 1, None),    # P - paced
    b'147591': (AnnotationType.PACE, 2, None),    # P - paced (least common?)
    b'147631': (AnnotationType.PACESP, 0, None),  # ' - single pacer spike
    b'148751': (AnnotationType.PACESP, 1, None),  # " - bivent. pacer spike
    b'148783': (AnnotationType.LEARN, 0, None),   # L - learning
    b'147551': (AnnotationType.NOTE, 0, b'M'),    # M - missed beat
    b'195396': (AnnotationType.UNKNOWN, 0, None), # B - QRS, unspecified type
    b'148759': (AnnotationType.UNKNOWN, 1, None), # ? - QRS, unclassifiable
    b'147527': (AnnotationType.ARFCT, 0, None),   # A - artifact
    b'148743': (AnnotationType.NOTE, 0, b'_'),    # I - signals inoperable
}

# Unknown annotations are mapped to an anntyp based on the first
# letter of the label
_ann_letter = {
    b'N': AnnotationType.NORMAL,
    b'V': AnnotationType.PVC,
    b'S': AnnotationType.SVPB,
    b'P': AnnotationType.PACE,
    b"'": AnnotationType.PACESP,
    b'"': AnnotationType.PACESP,
    b'L': AnnotationType.LEARN,
    b'M': AnnotationType.NOTE,
    b'B': AnnotationType.UNKNOWN,
    b'?': AnnotationType.UNKNOWN,
    b'A': AnnotationType.ARFCT,
}
117 |
class EnumerationValueFinalizer:
    """Finalization pass over a record's '_phi_enums' log.

    On construction, every logged timestamp is fed into the record's
    time map; finalize_record() then re-reads the log in order and
    emits WFDB beat annotations to 'waves.beat'.
    """

    def __init__(self, record):
        self.record = record
        # The log may legitimately be absent (no enums were recorded).
        self.log = record.open_log_reader('_phi_enums', allow_missing = True)

        # Scan the enums log file, and add timestamps to the time map.
        for (sn, ts, line) in self.log.unsorted_items():
            # NOTE(review): str(ts) assumes the reader yields ts as a
            # str-convertible timestamp token; if ts were bytes this
            # would produce "b'...'" and fail to parse - confirm
            # against the log reader implementation.
            ts = datetime.strptime(str(ts), '%Y%m%d%H%M%S%f')
            ts = ts.replace(tzinfo = timezone.utc)
            record.time_map.add_time(ts)

    def finalize_record(self):
        """Write beat annotations for this record."""
        sn0 = self.record.seqnum0()
        if sn0 is None:
            # if we don't have a seqnum0 then time is meaningless
            return

        annfname = os.path.join(self.record.path, 'waves.beat')
        with Annotator(annfname, afreq = 1000) as anns:
            # Reread the enums log file in order, and write beat annotations.
            for (sn, ts, line) in self.log.sorted_items():
                # Skip lines containing the 0x18 marker (written by
                # ArchiveLogFile to flag an incomplete/invalid line).
                if b'\030' in line:
                    continue
                ts = datetime.strptime(str(ts), '%Y%m%d%H%M%S%f')
                ts = ts.replace(tzinfo = timezone.utc)
                # Prefer a seqnum derived from the time map; fall back
                # to the logged seqnum.  (5120 appears to bound the
                # search - magic constant, meaning unconfirmed.)
                sn = self.record.time_map.get_seqnum(ts, sn + 5120) or sn

                f = line.split(b'\t')
                if len(f) == 3 and f[0] == b'Annot':
                    (label, value_physio_id, value) = f
                    t = _ann_code.get(value_physio_id)
                    if t:
                        (anntyp, subtyp, aux) = t
                    else:
                        # Unknown code: classify by the first letter of
                        # the value, preserving the raw code and value
                        # in the aux string.
                        anntyp = _ann_letter.get(value[:1],
                                                 AnnotationType.UNKNOWN)
                        subtyp = 0
                        aux = b'[' + value_physio_id + b'] ' + value
                    anns.put(time = (sn - sn0), chan = 255,
                             anntyp = anntyp, subtyp = subtyp, aux = aux)
158 |
--------------------------------------------------------------------------------
/downcast/db/query.py:
--------------------------------------------------------------------------------
1 | #
2 | # downcast - tools for unpacking patient data from DWC
3 | #
4 | # Copyright (c) 2018 Laboratory for Computational Physiology
5 | #
6 | # This program is free software: you can redistribute it and/or modify
7 | # it under the terms of the GNU General Public License as published by
8 | # the Free Software Foundation, either version 3 of the License, or
9 | # (at your option) any later version.
10 | #
11 | # This program is distributed in the hope that it will be useful,
12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | # GNU General Public License for more details.
15 | #
16 | # You should have received a copy of the GNU General Public License
17 | # along with this program. If not, see .
18 |
19 | import re
20 | from collections import namedtuple
21 | import ply.lex
22 | import ply.yacc
23 |
24 | from .exceptions import (ProgrammingError, ParameterCountError)
25 |
26 | # This implements a parser for an extremely limited subset of SQL,
27 | # just enough to handle the queries generated by downcast.
28 |
# Parsed form of a SELECT statement: output column names, source
# table, WHERE constraints, ORDER BY column (or None) and LIMIT count
# (or None).
SelectStatement = namedtuple('SelectStatement', (
    'columns', 'table', 'constraints', 'order', 'limit'))

# A single WHERE term: <column> <relation> <value>.
Constraint = namedtuple('Constraint', (
    'column', 'relation', 'value'))
34 |
class SimpleQueryParser:
    """Parser for the tiny subset of SQL generated by downcast.

    Built with ply: the lexer and grammar are derived from the names,
    declaration order, and DOCSTRINGS of the t_*/p_* methods below.
    Those docstrings are token regexps and BNF rules - executable, not
    documentation - and must not be edited casually.
    """

    # SQL keywords, matched case-insensitively (see t_identifier).
    _keywords = {
        'SELECT', 'FROM', 'WHERE', 'AND', 'ORDER', 'BY', 'LIMIT'
    }

    tokens = list(_keywords) + [
        'PARAM', 'LE', 'GE', 'identifier', 'bracketed_identifier',
        'integer', 'string_constant'
    ]

    # Single-character literal tokens.
    literals = ['=', '<', '>', ',', '*', ';']

    # Whitespace (including newlines) is skipped.
    t_ignore = ' \t\r\f\n'

    t_LE = r'<='
    t_GE = r'>='

    # A '?' placeholder consumes the next supplied query parameter;
    # the token's value becomes the parameter itself.
    def t_PARAM(self, t):
        r'\?'
        try:
            t.value = next(self._param_iter)
        except StopIteration:
            # Include the text of the constraint being parsed (if
            # known) to make the error easier to trace.
            c = None
            if self._constraint_pos is not None:
                c = self._input[self._constraint_pos:self._lexer.lexpos]
            raise ParameterCountError('not enough parameters for query', c)
        return t

    # Identifiers; those matching a keyword are reclassified.
    def t_identifier(self, t):
        r'[A-Za-z_][A-Za-z0-9_.]*'
        u = t.value.upper()
        if u in SimpleQueryParser._keywords:
            t.type = u
        return t

    # [Bracketed] identifiers; the brackets are stripped.
    def t_bracketed_identifier(self, t):
        r'\[[A-Za-z0-9_.]+\]'
        t.value = t.value[1:-1]
        return t

    def t_integer(self, t):
        r'[0-9]+'
        t.value = int(t.value)
        return t

    # Single-quoted strings; '' is an escaped quote.
    def t_string_constant(self, t):
        r"'(?:[^']+|'')*'"
        t.value = t.value[1:-1].replace("''", "'")
        return t

    def t_error(self, t):
        # Echo the statement with '<>' marking the offending position.
        text = (self._input[:t.lexpos] + '<>' + self._input[t.lexpos:])
        raise ProgrammingError('syntax error in %r' % text)

    ################################################################

    def p_statement(self, p):
        """
        statement : SELECT columns FROM table constraints order limit ';'
                  | SELECT columns FROM table constraints order limit
        """
        p[0] = SelectStatement(columns = p[2], table = p[4],
                               constraints = p[5], order = p[6],
                               limit = p[7])

    def p_columns(self, p):
        """columns : columns ',' column"""
        p[0] = p[1] + [p[3]]

    def p_columns_1(self, p):
        """columns : column"""
        p[0] = [p[1]]

    def p_columns_star(self, p):
        """columns : '*'"""
        p[0] = ['*']

    def p_column(self, p):
        """column : identifier"""
        p[0] = p[1]
        # Remember where this column name started, for error messages
        # (see p_constraint_column / t_PARAM).
        self._column_pos = p.lexpos(1)

    def p_table(self, p):
        """
        table : identifier
              | bracketed_identifier
        """
        p[0] = p[1]

    def p_constraints(self, p):
        """constraints : WHERE constraint_list"""
        p[0] = p[2]

    def p_constraints_0(self, p):
        """constraints : """
        p[0] = []

    def p_constraint_list(self, p):
        """constraint_list : constraint_list AND constraint"""
        p[0] = p[1] + [p[3]]

    def p_constraint_list_1(self, p):
        """constraint_list : constraint"""
        p[0] = [p[1]]

    def p_constraint(self, p):
        """
        constraint : constraint_column '=' constraint_value
                   | constraint_column '<' constraint_value
                   | constraint_column '>' constraint_value
                   | constraint_column LE constraint_value
                   | constraint_column GE constraint_value
        """
        p[0] = Constraint(column = p[1], relation = p[2], value = p[3])
        # The constraint is complete; clear the position marker.
        self._constraint_pos = None

    def p_constraint_column(self, p):
        """constraint_column : column"""
        p[0] = p[1]
        # Mark where the constraint currently being parsed started.
        self._constraint_pos = self._column_pos

    def p_constraint_value(self, p):
        """
        constraint_value : PARAM
                         | integer
                         | string_constant
        """
        p[0] = p[1]

    def p_order(self, p):
        """order : ORDER BY column"""
        p[0] = p[3]

    def p_order_0(self, p):
        """order : """
        p[0] = None

    def p_limit(self, p):
        """limit : LIMIT integer"""
        p[0] = p[2]

    def p_limit_0(self, p):
        """limit : """
        p[0] = None

    def p_error(self, t):
        # Report the unexpected token (or EOF), echoing the statement
        # with '<>' marking the position.
        if t:
            desc = t.type
            text = (self._input[:t.lexpos] + '<>' + self._input[t.lexpos:])
        else:
            desc = 'EOF'
            text = (self._input + '<>')
        raise ProgrammingError('syntax error (unexpected %s) in %r'
                               % (desc, text))

    def __init__(self):
        # Build lexer and parser from this instance's members; parse
        # tables are kept in memory only (no files written).
        self._lexer = ply.lex.lex(module = self)
        self._parser = ply.yacc.yacc(module = self,
                                     write_tables = False,
                                     debug = False)

    def parse(self, statement, params):
        """Parse *statement*, substituting *params* for '?' markers.

        Raises ParameterCountError when the parameter count does not
        exactly match the number of placeholders.
        """
        self._input = statement
        self._param_iter = iter(params)
        self._constraint_pos = None
        q = self._parser.parse(statement, lexer = self._lexer)
        # Every parameter must have been consumed by t_PARAM.
        try:
            next(self._param_iter)
            raise ParameterCountError('too many parameters for query')
        except StopIteration:
            return q
206 |
--------------------------------------------------------------------------------
/downcast/db/bcp/cursor.py:
--------------------------------------------------------------------------------
1 | #
2 | # downcast - tools for unpacking patient data from DWC
3 | #
4 | # Copyright (c) 2018 Laboratory for Computational Physiology
5 | #
6 | # This program is free software: you can redistribute it and/or modify
7 | # it under the terms of the GNU General Public License as published by
8 | # the Free Software Foundation, either version 3 of the License, or
9 | # (at your option) any later version.
10 | #
11 | # This program is distributed in the hope that it will be useful,
12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | # GNU General Public License for more details.
15 | #
16 | # You should have received a copy of the GNU General Public License
17 | # along with this program. If not, see .
18 |
19 | from ..exceptions import (Error, DataError, ProgrammingError)
20 |
class BCPCursor:
    """DB-API 2.0 style cursor over BCP table files.

    Queries (already restricted to the subset understood by the
    connection's parser) are evaluated by scanning a table iterator,
    seeking on the ordered/indexed column where possible, and
    filtering the remaining rows with predicate functions.
    """

    def __init__(self, connection):
        self._conn = connection
        # Iterators are cached per table so consecutive queries over
        # the same table can resume from the current position.
        self._table_iters = {}
        # State of the current query, set by execute():
        self._query_fetch = None   # callable returning the next raw row
        self._query_skip = None    # predicates; truthy result = skip row
        self._query_cols = None    # indices of the output columns
        self.description = None
        self.rowcount = -1
        self.arraysize = 1

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def __iter__(self):
        return self

    def __next__(self):
        row = self.fetchone()
        if row:
            return row
        else:
            raise StopIteration()

    #### DB-API ####

    def close(self):
        # Close every cached iterator; drop all query state even if
        # one of the close calls fails.
        try:
            for it in self._table_iters.values():
                it.close()
        finally:
            self._table_iters = {}
            self._conn = None
            self._query_fetch = None
            self._query_skip = None
            self._query_cols = None

    def execute(self, statement, params = ()):
        # Parse the statement, normalizing parser failures into the
        # DB-API exception hierarchy.
        try:
            q = self._conn.parse(statement, params)
        except Error:
            raise
        except Exception as e:
            raise ProgrammingError(e)

        table = self._conn.get_table(q.table)

        if table not in self._table_iters:
            self._table_iters[table] = table.iterator()
        it = self._table_iters[table]

        # Rows can only be produced in the table's inherent order.
        if q.order is not None:
            i = table.column_number(q.order)
            if i != table.order_column():
                raise ProgrammingError('cannot sort %s by %s'
                                       % (q.table, q.order))

        # Expand '*' and translate column names to indices.
        cols = []
        for c in q.columns:
            if c == '*':
                cols += range(table.n_columns())
            else:
                cols.append(table.column_number(c))

        # Translate each constraint into a seek target and/or a row
        # predicate.  Constraints on the ordered column can terminate
        # the scan early (_halt_unless); others merely skip rows.
        # Note: only the first seekable constraint wins the seek, so
        # this loop is order-dependent.
        seek = None
        skip = []
        for c in q.constraints:
            i = table.column_number(c.column)
            t = table.column_type(i)

            try:
                v = t.from_param(c.value)
            except Exception:
                raise ProgrammingError('in %s, cannot compare %s to %r'
                                       % (table.name, c.column, c.value))

            oc = table.order_column()
            rel = c.relation
            if i == oc and rel == '<':
                # Ordered column: stop as soon as the bound is passed.
                skip += [_halt_unless(i, rel, v)]
            elif i == oc and rel == '<=':
                skip += [_halt_unless(i, rel, v)]
            elif i == oc and rel == '=' and seek is None:
                # Seek to the value, then stop once it is passed.
                seek = (i, v)
                skip += [_halt_unless(i, rel, v)]
            elif i == oc and rel == '>=' and seek is None:
                seek = (i, v)
            elif i == oc and rel == '>' and seek is None:
                # Seek lands at the first row >= v; skip the == rows.
                seek = (i, v)
                skip += [_skip_unless(i, '<>', v)]
            elif table.column_indexed(i) and rel == '=' and seek is None:
                # Indexed equality: seek to the group of equal values
                # and halt when it ends.
                seek = (i, v)
                skip += [_halt_unless(i, '=', v)]
            else:
                skip += [_skip_unless(i, rel, v)]

        self.description = []
        for i in cols:
            self.description.append((table.column_name(i),
                                     table.column_type(i),
                                     None, None, None, None, None))
        self.rowcount = 0

        # Stop once the requested number of rows has been returned.
        if q.limit is not None:
            skip += [lambda r: self.rowcount >= q.limit and _halt()]

        if seek is None:
            it.seek(None, None)
        else:
            it.seek(*seek)
        self._query_fetch = it.fetch
        self._query_skip = skip
        self._query_cols = cols

    def executemany(self, statement, params):
        for p in params:
            self.execute(statement, p)

    def fetchone(self):
        fetch = self._query_fetch
        skip = self._query_skip
        try:
            r = fetch()
            while r:
                if any(f(r) for f in skip):
                    # Row rejected by a constraint; try the next one.
                    r = fetch()
                else:
                    self.rowcount += 1
                    return [r[i] for i in self._query_cols]
            # End of table: fall through, returning None.
        except HaltQuery:
            # A halt predicate determined no further rows can match.
            self._query_fetch = lambda: None
            return
        except Error:
            self._query_fetch = lambda: None
            raise
        except Exception as e:
            # Wrap unexpected errors in the DB-API hierarchy.
            self._query_fetch = lambda: None
            raise DataError(e)

    def fetchmany(self, size = None):
        if size is None:
            size = self.arraysize
        rows = []
        while size > 0:
            size -= 1
            row = self.fetchone()
            if not row:
                break
            rows.append(row)
        return rows

    def fetchall(self):
        rows = []
        row = self.fetchone()
        while row:
            rows.append(row)
            row = self.fetchone()
        return rows

    def setinputsizes(self, sizes):
        pass

    def setoutputsize(self, size, column):
        pass

    def nextset(self):
        # Multiple result sets are not supported.
        return None
191 |
class HaltQuery(Exception):
    """Internal exception used to stop scanning a table early."""
    pass
194 |
def _halt():
    """Abort the current scan (caught by BCPCursor.fetchone)."""
    raise HaltQuery()
197 |
198 | def _skip_unless(col, rel, value):
199 | if rel == '<':
200 | return lambda row: row[col] >= value
201 | elif rel == '<=':
202 | return lambda row: row[col] > value
203 | elif rel == '>':
204 | return lambda row: row[col] <= value
205 | elif rel == '>=':
206 | return lambda row: row[col] < value
207 | elif rel == '=':
208 | return lambda row: row[col] != value
209 | elif rel == '<>':
210 | return lambda row: row[col] == value
211 | else:
212 | raise ProgrammingError('unknown relation %r' % rel)
213 |
214 | def _halt_unless(col, rel, value):
215 | if rel == '<':
216 | return lambda row: row[col] >= value and _halt()
217 | elif rel == '<=':
218 | return lambda row: row[col] > value and _halt()
219 | elif rel == '>':
220 | return lambda row: row[col] <= value and _halt()
221 | elif rel == '>=':
222 | return lambda row: row[col] < value and _halt()
223 | elif rel == '=':
224 | return lambda row: row[col] != value and _halt()
225 | elif rel == '<>':
226 | return lambda row: row[col] == value and _halt()
227 | else:
228 | raise ProgrammingError('unknown relation %r' % rel)
229 |
--------------------------------------------------------------------------------
/downcast/attributes.py:
--------------------------------------------------------------------------------
1 | #
2 | # downcast - tools for unpacking patient data from DWC
3 | #
4 | # Copyright (c) 2017 Laboratory for Computational Physiology
5 | #
6 | # This program is free software: you can redistribute it and/or modify
7 | # it under the terms of the GNU General Public License as published by
8 | # the Free Software Foundation, either version 3 of the License, or
9 | # (at your option) any later version.
10 | #
11 | # This program is distributed in the hope that it will be useful,
12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | # GNU General Public License for more details.
15 | #
16 | # You should have received a copy of the GNU General Public License
17 | # along with this program. If not, see .
18 |
19 | from collections import namedtuple
20 |
21 | # Note that 'enumeration_id', 'numeric_id', and 'wave_id' are
22 | # deliberately omitted. Contents of these attribute structures should
23 | # be fully anonymized.
24 |
# _Export.Enumeration_
EnumerationAttr = namedtuple('EnumerationAttr', (
    # Magic number for... something. See
    # System_Parameter-Alerts_Table_Ed_2_-_PIIC_iX_Rel_B.00.xlsx
    # (Parameters). Underlying type is 'bigint'.
    'base_physio_id',

    # Magic number for the enumeration. See
    # System_Parameter-Alerts_Table_Ed_2_-_PIIC_iX_Rel_B.00.xlsx
    # (Parameters). Underlying type is 'bigint'.
    'physio_id',

    # Description of the enumeration, such as 'Annot' or 'RhySta'.
    'label',

    # Undocumented magic number. Underlying type is 'bigint'.
    'value_physio_id',

    # Supposedly indicates if observation is aperiodic.
    # Seems to be 0 even for 'Annot'.
    'is_aperiodic',

    # Indicates if observation is manually entered, I guess???
    'is_manual',

    # Magic number indicating whether observation is valid????
    'validity',

    # Magic number for the units of measurement. See
    # System_Parameter-Alerts_Table_Ed_2_-_PIIC_iX_Rel_B.00.xlsx
    # (Units-Of-Measure). Underlying type is 'bigint'.
    'unit_code',

    # Units of measurement, if that makes any sense (current enums say
    # 'Unknwn'.) (What IS an "enumeration", if not something that
    # lacks units of measurement?)
    'unit_label',

    # Color to use for displaying enumeration values, represented as
    # 0xAARRGGBB, reinterpreted as a signed 32-bit integer.
    'color'))

# All-None placeholder.  Arity is derived from the field list so it
# cannot fall out of sync if fields are added or removed.
undefined_enumeration = EnumerationAttr(*[None] * len(EnumerationAttr._fields))
68 |
# _Export.Numeric_
NumericAttr = namedtuple('NumericAttr', (
    # Magic number for... something. Underlying type is 'bigint'.
    'base_physio_id',

    # Magic number for the "category" of numeric. See
    # System_Parameter-Alerts_Table_Ed_2_-_PIIC_iX_Rel_B.00.xlsx
    # (Parameters? or Calculations?) Underlying type is 'bigint'.
    'physio_id',

    # Description of the "category" of numeric (such as 'NBP'.)
    'label',

    # Indicates that the measurement is aperiodic (like NBP), rather
    # than periodic (like HR).
    'is_aperiodic',

    # Units of measurement.
    'unit_label',

    # Magic number indicating whether measurement is valid????
    'validity',

    # Lower alarm threshold (?!)
    'lower_limit',

    # Upper alarm threshold (?!)
    'upper_limit',

    # Indicates that threshold(?) alarms are disabled (?!)
    'is_alarming_off',

    # Magic number for the specific numeric. See
    # System_Parameter-Alerts_Table_Ed_2_-_PIIC_iX_Rel_B.00.xlsx
    # (Parameters? or Calculations?) Underlying type is 'bigint'.
    'sub_physio_id',

    # Description of the specific numeric (such as 'NBPs'.)
    'sub_label',

    # Color to use for displaying numeric values, represented as
    # 0xAARRGGBB, reinterpreted as a signed 32-bit integer.
    'color',

    # Indicates if value is manually entered, I guess???
    'is_manual',

    # Number of values belonging to the compound value???
    'max_values',

    # Number of decimal places to be displayed (?)
    'scale'))

# All-None placeholder; arity derived from the field list (see
# undefined_enumeration).
undefined_numeric = NumericAttr(*[None] * len(NumericAttr._fields))
123 |
# _Export.Wave_
WaveAttr = namedtuple('WaveAttr', (
    # Magic number for the "category" of waveform. See
    # System_Parameter-Alerts_Table_Ed_2_-_PIIC_iX_Rel_B.00.xlsx
    # (Parameters). Underlying type is 'bigint'.
    'base_physio_id',

    # Magic number for the specific waveform. See
    # System_Parameter-Alerts_Table_Ed_2_-_PIIC_iX_Rel_B.00.xlsx
    # (Parameters). Underlying type is 'bigint'.
    'physio_id',

    # Description of the waveform.
    'label',

    # 0 = Primary, 1 = Secondary ???
    'channel',

    # Presumably, number of seqnum ticks per sample.
    'sample_period',

    # Indicates the waveform should be displayed with lower time
    # resolution than usual.
    'is_slow_wave',

    # Indicates that the waveform is "derived". ???
    'is_derived',

    # Color to use for displaying the waveform, represented as
    # 0xAARRGGBB, reinterpreted as a signed 32-bit integer.
    'color',

    # Low/high cutoff frequency of the input bandpass filter.
    'low_edge_frequency',
    'high_edge_frequency',

    # Range of sample values.
    'scale_lower',
    'scale_upper',

    # Two reference sample values.
    'calibration_scaled_lower',
    'calibration_scaled_upper',

    # Physical values corresponding to the two reference sample
    # values.
    'calibration_abs_lower',
    'calibration_abs_upper',

    # Magic number indicating how signal is calibrated (???)
    'calibration_type',

    # Units of measurement.
    'unit_label',

    # Magic number for the units of measurement. See
    # System_Parameter-Alerts_Table_Ed_2_-_PIIC_iX_Rel_B.00.xlsx
    # (Units-Of-Measure). Underlying type is 'bigint'.
    'unit_code',

    # Magic number indicating electrode placement (???)
    'ecg_lead_placement'))

# All-None placeholder; arity derived from the field list (see
# undefined_enumeration).
undefined_wave = WaveAttr(*[None] * len(WaveAttr._fields))
188 |
# "Parameters" table
# Metadata describing a single physio ID (signal/parameter).
PhysioIDAttr = namedtuple('PhysioIDAttr', (
    # I guess this is a standard code of some sort (comment says HL7)?
    'mdil_code',

    # The Philips internal identifier for the signal/parameter, used
    # in various structures. (These don't appear related to the
    # "StardateNom" numbering system used in DataExport and RDE.)
    'physio_id',

    # Short description of the signal/parameter.
    'label',

    # Verbose description of the signal/parameter.
    'description',

    # I guess this is another standard code of some sort? Often this
    # equals the PhysioId.
    'mdc_code',

    # I guess this is another standard code, in this case a symbolic
    # name.
    'mdc_label',

    # Defines how the physioid is used, I guess: "wave", "numeric",
    # "numeric/wave", "setting/numeric", or "string/enumeration".
    # Maybe other possibilities, who knows?
    'type',

    # ???
    'hl7_outbound',

    # ???
    'data_warehouse_connect'))
223 |
# "Units-Of-Measure" table
# Metadata describing a unit of measurement.
UnitAttr = namedtuple('UnitAttr', (
    # I guess this is a standard code of some sort (comment says HL7)?
    'mdil_code',

    # The Philips internal identifier for the unit, used in various
    # structures.
    'unit_code',

    # Abbreviation for the unit. Not typographically consistent
    # ("°F", "/mm³", "cmH2O/l/s", "1/nl", ...)
    'label',

    # I guess this is another standard code of some sort? Often this
    # equals the unit_code.
    'mdc_code',

    # I guess this is another standard code, in this case a symbolic
    # name.
    'mdc_label',

    # Verbose description, even more typographically inconsistent than
    # the label.
    'description'))
248 |
--------------------------------------------------------------------------------
/test-parsers:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python3
2 |
3 | from datetime import datetime
4 |
5 | from downcast.server import DWCDB
6 | from downcast.parser import (WaveSampleParser, DummyWaveSampleParser,
7 | AlertParser, NumericValueParser,
8 | EnumerationValueParser, WaveAttrParser,
9 | NumericAttrParser, EnumerationAttrParser,
10 | BedTagParser, PatientDateAttributeParser,
11 | PatientStringAttributeParser,
12 | PatientBasicInfoParser, PatientMappingParser)
13 |
# Read database credentials from server.conf and open a connection to
# the server named 'demo' defined there.
DWCDB.load_config('server.conf')
db = DWCDB('demo')
conn = db.connect()
17 |
def test(parser):
    # Run every query generated by the given parser, feed each result
    # row through the corresponding handler, and print the resulting
    # messages.  A query that yields no messages at all is an error.
    for (query, handler) in parser.queries():
        print()
        print(query)
        cursor = conn.cursor()
        cursor.execute(*query)
        nresults = 0
        for row in iter(cursor.fetchone, None):
            msg = handler(db, row)
            if msg is not None:
                nresults += 1
                print(msg)
        cursor.close()
        if nresults == 0:
            raise Exception("no results!")
35 |
def pt(s):
    # Parse a human-readable timestamp such as
    # 'Jan 28 2016 17:00:00.0 -0400' into an aware datetime.
    fmt = '%b %d %Y %H:%M:%S.%f %z'
    return datetime.strptime(s, fmt)
38 |
## note the following queries are not necessarily efficient, but we
## want to individually test each possible constraint

## note that some of these timestamps are in the "wrong" timezone; we
## want to be sure that the server DTRT.

# WaveSampleParser: exercise each supported constraint individually
# (mapping ID, each timestamp comparison, each sequence-number
# comparison), plus the dummy variant.
test(WaveSampleParser(limit = 2))
test(WaveSampleParser(limit = 1, mapping_id = '85965f09-e8c2-4e79-8c1c-cb1775bd2550'))
test(WaveSampleParser(limit = 1, time_gt = pt('Jan 28 2016 17:00:00.0 -0400')))
test(WaveSampleParser(limit = 1, time_ge = pt('Jan 28 2016 17:00:00.0 -0400')))
test(WaveSampleParser(limit = 1, time_lt = pt('Jan 28 2016 16:00:00.0 -0500')))
test(WaveSampleParser(limit = 1, time_le = pt('Jan 28 2016 16:00:00.0 -0500')))
test(WaveSampleParser(limit = 1, time = pt('Jan 28 2016 14:40:29.321 -0400')))
test(WaveSampleParser(limit = 1, seqnum_ge = 507279000000))
test(WaveSampleParser(limit = 1, seqnum_gt = 507279000000))
test(WaveSampleParser(limit = 1, seqnum_le = 507279000000))
test(WaveSampleParser(limit = 1, seqnum_lt = 507279000000))
test(WaveSampleParser(limit = 1, seqnum = 507278429440))
test(DummyWaveSampleParser(limit = 2))

# AlertParser: same set of constraints as above.
test(AlertParser(limit = 2))
test(AlertParser(limit = 1, mapping_id = '7cc594d9-d8dc-4bc7-9522-59cbc8091d23'))
test(AlertParser(limit = 1, time_gt = pt('Jan 28 2016 16:00:00.0 -0500')))
test(AlertParser(limit = 1, time_ge = pt('Jan 28 2016 16:00:00.0 -0500')))
test(AlertParser(limit = 1, time_lt = pt('Jan 28 2016 16:00:00.0 -0500')))
test(AlertParser(limit = 1, time_le = pt('Jan 28 2016 16:00:00.0 -0500')))
test(AlertParser(limit = 1, time = pt('Jan 28 2016 13:30:05.755 -0500')))
test(AlertParser(limit = 1, seqnum_gt = 507279000000))
test(AlertParser(limit = 1, seqnum_ge = 507279000000))
test(AlertParser(limit = 1, seqnum_lt = 507279000000))
test(AlertParser(limit = 1, seqnum_le = 507279000000))
test(AlertParser(limit = 1, seqnum = 507277805824))

# NumericValueParser: same set of constraints as above.
test(NumericValueParser(limit = 2))
test(NumericValueParser(limit = 1, mapping_id = '655d8b35-cdb7-46aa-84d8-bed0dece0cb2'))
test(NumericValueParser(limit = 1, time_gt = pt('Jan 28 2016 16:00:00.0 -0500')))
test(NumericValueParser(limit = 1, time_ge = pt('Jan 28 2016 16:00:00.0 -0500')))
test(NumericValueParser(limit = 1, time_lt = pt('Jan 28 2016 16:00:00.0 -0500')))
test(NumericValueParser(limit = 1, time_le = pt('Jan 28 2016 16:00:00.0 -0500')))
test(NumericValueParser(limit = 1, time = pt('Jan 28 2016 16:33:27.0 -0500')))
test(NumericValueParser(limit = 1, seqnum_gt = 507279000000))
test(NumericValueParser(limit = 1, seqnum_ge = 507279000000))
test(NumericValueParser(limit = 1, seqnum_lt = 507279000000))
test(NumericValueParser(limit = 1, seqnum_le = 507279000000))
test(NumericValueParser(limit = 1, seqnum = 507278429440))

# EnumerationValueParser: same set of constraints as above.
test(EnumerationValueParser(limit = 2))
test(EnumerationValueParser(limit = 1, mapping_id = '466fcc4c-7d8c-4c59-b00c-80aba6e7605d'))
test(EnumerationValueParser(limit = 1, time_gt = pt('Jan 28 2016 16:00:00.0 -0500')))
test(EnumerationValueParser(limit = 1, time_ge = pt('Jan 28 2016 16:00:00.0 -0500')))
test(EnumerationValueParser(limit = 1, time_lt = pt('Jan 28 2016 16:00:00.0 -0500')))
test(EnumerationValueParser(limit = 1, time_le = pt('Jan 28 2016 16:00:00.0 -0500')))
test(EnumerationValueParser(limit = 1, time = pt('Jan 28 2016 13:40:29.577 -0500')))
test(EnumerationValueParser(limit = 1, seqnum_gt = 507279000000))
test(EnumerationValueParser(limit = 1, seqnum_ge = 507279000000))
test(EnumerationValueParser(limit = 1, seqnum_lt = 507279000000))
test(EnumerationValueParser(limit = 1, seqnum_le = 507279000000))
test(EnumerationValueParser(limit = 1, seqnum = 507278429440))
97 |
# Attribute parsers: look up metadata by ID.
test(WaveAttrParser(limit = 2))
test(WaveAttrParser(limit = 1, wave_id = 1)) # (ART)

test(NumericAttrParser(limit = 2))
test(NumericAttrParser(limit = 1, numeric_id = 1)) # (HR)

test(EnumerationAttrParser(limit = 2))
test(EnumerationAttrParser(limit = 1, enumeration_id = 1)) # (RhySta)

# BedTagParser: constrained by bed label and timestamp.
test(BedTagParser(limit = 2))
test(BedTagParser(limit = 1, bed_label = 'CDBed1'))
test(BedTagParser(limit = 1, time_gt = pt('Jan 28 2016 12:00:00.0 -0500')))
test(BedTagParser(limit = 1, time_ge = pt('Jan 28 2016 12:00:00.0 -0500')))
test(BedTagParser(limit = 1, time_lt = pt('Jan 28 2016 16:00:00.0 -0500')))
test(BedTagParser(limit = 1, time_le = pt('Jan 28 2016 16:00:00.0 -0500')))
test(BedTagParser(limit = 1, time = pt('Jan 28 2016 13:30:23.202 -0500')))

# Patient attribute parsers: constrained by patient ID, attribute
# name, and timestamp.
test(PatientDateAttributeParser(limit = 2))
test(PatientDateAttributeParser(limit = 1, patient_id = '31c1da32-2ea1-4166-a7eb-2d9738967412'))
test(PatientDateAttributeParser(limit = 1, attr = 'DOB'))
test(PatientDateAttributeParser(limit = 1, time_gt = pt('Jan 28 2016 12:00:00.0 -0500')))
test(PatientDateAttributeParser(limit = 1, time_ge = pt('Jan 28 2016 12:00:00.0 -0500')))
test(PatientDateAttributeParser(limit = 1, time_le = pt('Jan 28 2016 16:00:00.0 -0500')))
test(PatientDateAttributeParser(limit = 1, time_lt = pt('Jan 28 2016 16:00:00.0 -0500')))
test(PatientDateAttributeParser(limit = 1, time = pt('Jan 28 2016 13:30:23.202 -0500')))

test(PatientStringAttributeParser(limit = 2))
test(PatientStringAttributeParser(limit = 1, patient_id = '31c1da32-2ea1-4166-a7eb-2d9738967412'))
test(PatientStringAttributeParser(limit = 1, attr = 'FirstName'))
test(PatientStringAttributeParser(limit = 1, time_gt = pt('Jan 28 2016 12:00:00.0 -0500')))
test(PatientStringAttributeParser(limit = 1, time_ge = pt('Jan 28 2016 12:00:00.0 -0500')))
test(PatientStringAttributeParser(limit = 1, time_lt = pt('Jan 28 2016 16:00:00.0 -0500')))
test(PatientStringAttributeParser(limit = 1, time_le = pt('Jan 28 2016 16:00:00.0 -0500')))
test(PatientStringAttributeParser(limit = 1, time = pt('Jan 28 2016 13:30:23.202 -0500')))

test(PatientBasicInfoParser(limit = 2))
test(PatientBasicInfoParser(limit = 1, patient_id = '31c1da32-2ea1-4166-a7eb-2d9738967412'))
test(PatientBasicInfoParser(limit = 1, time_gt = pt('Jan 28 2016 12:00:00.0 -0500')))
test(PatientBasicInfoParser(limit = 1, time_ge = pt('Jan 28 2016 12:00:00.0 -0500')))
test(PatientBasicInfoParser(limit = 1, time_lt = pt('Jan 28 2016 16:00:00.0 -0500')))
test(PatientBasicInfoParser(limit = 1, time_le = pt('Jan 28 2016 16:00:00.0 -0500')))
test(PatientBasicInfoParser(limit = 1, time = pt('Jan 28 2016 13:30:23.202 -0500')))

# PatientMappingParser: additionally constrained by hostname and
# mapping state.
test(PatientMappingParser(limit = 2))
test(PatientMappingParser(limit = 1, patient_id = '31c1da32-2ea1-4166-a7eb-2d9738967412'))
test(PatientMappingParser(limit = 1, mapping_id = '466fcc4c-7d8c-4c59-b00c-80aba6e7605d'))
test(PatientMappingParser(limit = 1, time_gt = pt('Jan 28 2016 12:00:00.0 -0500')))
test(PatientMappingParser(limit = 1, time_ge = pt('Jan 28 2016 12:00:00.0 -0500')))
test(PatientMappingParser(limit = 1, time_lt = pt('Jan 28 2016 16:00:00.0 -0500')))
test(PatientMappingParser(limit = 1, time_le = pt('Jan 28 2016 16:00:00.0 -0500')))
test(PatientMappingParser(limit = 1, time = pt('Jan 28 2016 13:26:53.456 -0500')))
test(PatientMappingParser(limit = 1, hostname = 'RDEGEN8-1'))
test(PatientMappingParser(limit = 1, is_mapped = False))
151 |
--------------------------------------------------------------------------------
/downcast/output/numerics.py:
--------------------------------------------------------------------------------
1 | #
2 | # downcast - tools for unpacking patient data from DWC
3 | #
4 | # Copyright (c) 2017 Laboratory for Computational Physiology
5 | #
6 | # This program is free software: you can redistribute it and/or modify
7 | # it under the terms of the GNU General Public License as published by
8 | # the Free Software Foundation, either version 3 of the License, or
9 | # (at your option) any later version.
10 | #
11 | # This program is distributed in the hope that it will be useful,
12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | # GNU General Public License for more details.
15 | #
16 | # You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
18 |
19 | from datetime import datetime, timezone
20 | import heapq
21 |
22 | from ..messages import NumericValueMessage
23 | from ..util import string_to_ascii
24 |
class NumericValueHandler:
    """Message handler that writes numeric observations to per-record
    log files ('_phi_numerics' for periodic values, '_phi_aperiodics'
    for aperiodic ones)."""

    def __init__(self, archive):
        self.archive = archive
        # Last sequence number / (sequence number, timestamp) written
        # to each record's log file, so that unchanged headers are not
        # repeated on every line.
        self.last_periodic = {}
        self.last_aperiodic = {}

    def send_message(self, chn, msg, source, ttl):
        """Handle one message from the extractor.

        Messages other than NumericValueMessage are ignored.  The
        message is nacked up front and only acked once it has been
        fully recorded; if metadata or the destination record is not
        yet available, the message is left pending.
        """
        if not isinstance(msg, NumericValueMessage):
            return

        source.nack_message(chn, msg, self)

        # Load metadata for this numeric
        attr = msg.origin.get_numeric_attr(msg.numeric_id, (ttl <= 0))
        if attr is None:
            # Metadata not yet available - hold message in pending and
            # continue processing
            return

        # Look up the corresponding record
        record = self.archive.get_record(msg)
        if record is None:
            # Record not yet available - hold message in pending and
            # continue processing
            return

        # Dump original message to BCP file if desired
        if record.dump(msg):
            source.ack_message(chn, msg, self)
            return

        if attr.is_aperiodic:
            # Open or create a log file
            logfile = record.open_log_file('_phi_aperiodics')

            # Write the sequence number to the log file
            # (only if it differs from the previous event)
            sn = msg.sequence_number
            if sn != self.last_aperiodic.get(record, None):
                logfile.append('S%s' % sn)
                self.last_aperiodic[record] = sn
        else:
            # Open or create a log file
            logfile = record.open_log_file('_phi_numerics')

            # Write the sequence number and timestamp to the log file
            # (only if they differ from the previous event)
            sn = msg.sequence_number
            ts = msg.timestamp
            (old_sn, old_ts) = self.last_periodic.get(record, (None, None))
            if sn != old_sn:
                logfile.append('S%s' % sn)
            if ts != old_ts:
                logfile.append(ts.strftime_utc('%Y%m%d%H%M%S%f'))
            self.last_periodic[record] = (sn, ts)

        self._append_value(logfile, attr, msg)
        source.ack_message(chn, msg, self)

    def _append_value(self, logfile, attr, msg):
        """Append one value line (label, value and units separated by
        tabs) to the given log file.  A None value is written as an
        empty field."""
        lbl = string_to_ascii(attr.sub_label)
        ulbl = string_to_ascii(attr.unit_label)
        val = msg.value
        if val is None:
            val = ''
        logfile.append('%s\t%s\t%s' % (lbl, val, ulbl))

    def flush(self):
        """Flush all pending output to disk."""
        self.archive.flush()
103 |
104 | def _strip_csv_meta(string):
105 | return string.replace(b',', b'_').replace(b'"', b'_')
106 |
class NumericValueFinalizer:
    """Converts a finished record's numeric log files into a
    'numerics.csv' output file.

    The constructor scans both log files to discover the set of
    columns and to populate the record's time map; finalize_record()
    then merges the two logs in order and writes the CSV.
    """

    def __init__(self, record):
        self.record = record

        # Scan the log files; make a list of all non-null
        # numerics, and add timestamps to the time map
        raw_numerics = set()

        self.periodic_log = record.open_log_reader('_phi_numerics',
                                                   allow_missing = True)
        for (sn, ts, line) in self.periodic_log.unsorted_items():
            # NOTE(review): assumes str(ts) yields exactly the
            # '%Y%m%d%H%M%S%f' text written by NumericValueHandler --
            # confirm against the log reader's item types.
            ts = datetime.strptime(str(ts), '%Y%m%d%H%M%S%f')
            ts = ts.replace(tzinfo = timezone.utc)
            record.time_map.add_time(ts)
            # Lines containing a CAN byte (\030) are skipped --
            # presumably this marks cancelled/invalid entries; confirm
            # with the log writer.
            if b'\030' not in line:
                parts = line.rstrip(b'\n').split(b'\t')
                # ignore nulls
                if len(parts) >= 3 and parts[1]:
                    raw_numerics.add((parts[0], parts[2]))

        self.aperiodic_log = record.open_log_reader('_phi_aperiodics',
                                                    allow_missing = True)
        for (sn, _, line) in self.aperiodic_log.unsorted_items():
            if b'\030' not in line:
                parts = line.rstrip(b'\n').split(b'\t')
                # ignore nulls
                if len(parts) >= 3 and parts[1]:
                    raw_numerics.add((parts[0], parts[2]))

        # Map each raw (label, units) pair to a CSV-safe normalized
        # form; empty units become b'NU'.
        self.norm_numerics = {}
        for (raw_name, raw_units) in raw_numerics:
            norm_name = _strip_csv_meta(raw_name.strip())
            norm_units = _strip_csv_meta(raw_units.strip()) or b'NU'
            self.norm_numerics[(raw_name, raw_units)] = (norm_name, norm_units)

    def finalize_record(self):
        """Merge the periodic and aperiodic logs in sorted order and
        write numerics.csv (one column per normalized numeric, one
        row per distinct observation time)."""
        sn0 = self.record.seqnum0()

        if self.norm_numerics:
            # Assign each raw column key its (1-based) position among
            # the sorted normalized columns; column 0 is the time.
            num_columns = sorted(set(self.norm_numerics.values()))
            num_index = {}
            for (raw_key, norm_key) in self.norm_numerics.items():
                num_index[raw_key] = num_columns.index(norm_key) + 1

            nf = self.record.open_log_file('numerics.csv', truncate = True)
            # 'row' initially holds the header; the first data row
            # forces it to be flushed below (time != row[0]).
            row = [b'"time"']
            for (name, units) in num_columns:
                desc = name + b' [' + units + b']'
                row.append(b'"' + desc.replace(b'"', b'""') + b'"')
            cur_ts = None
            cur_sn = None
            cur_time = None
            for (sn, ts, line) in heapq.merge(
                    self.periodic_log.sorted_items(),
                    self.aperiodic_log.sorted_items()):
                if b'\030' in line:
                    continue
                parts = line.rstrip(b'\n').split(b'\t')
                # ignore nulls
                if len(parts) < 3 or not parts[1]:
                    continue
                col_id = (parts[0], parts[2])

                # determine new time value
                # NOTE(review): cur_ts is stored below as a parsed
                # datetime while ts here is the raw log value, so this
                # comparison can only match for aperiodic entries --
                # confirm whether that is intended.
                if ts == cur_ts and sn == cur_sn:
                    time = cur_time
                else:
                    if ts == 0:
                        # for aperiodics (such as NBP), use sequence number as
                        # observation time
                        obs_sn = sn
                    else:
                        # for periodics, translate timestamp to
                        # sequence number and use that as observation
                        # time
                        ts = datetime.strptime(str(ts), '%Y%m%d%H%M%S%f')
                        ts = ts.replace(tzinfo = timezone.utc)
                        obs_sn = self.record.time_map.get_seqnum(ts, sn + 5120)
                        if obs_sn is None:
                            obs_sn = sn

                    if sn0 is None:
                        sn0 = obs_sn
                    # Time measured in counter ticks, ick.
                    # Better would probably be to use (real) seconds
                    time = str(obs_sn - sn0).encode()
                    cur_ts = ts
                    cur_sn = sn
                    cur_time = time

                # write out a complete row if the time value has changed
                if time != row[0]:
                    nf.fp.write(b','.join(row))
                    nf.fp.write(b'\n')
                    row = [time] + [b''] * len(num_columns)
                row[num_index[col_id]] = parts[1].rstrip(b'0').rstrip(b'.')
            # write the final row
            nf.fp.write(b','.join(row))
            nf.fp.write(b'\n')
206 |
--------------------------------------------------------------------------------
/bcp-scripts/bulk-verify:
--------------------------------------------------------------------------------
1 | #!/usr/bin/perl -w
2 | #
3 | # bulk-verify - check syntax of DWC-BCP data files
4 | #
5 | # Copyright (c) 2018 Laboratory for Computational Physiology
6 | #
7 | # This program is free software: you can redistribute it and/or modify
8 | # it under the terms of the GNU General Public License as published by
9 | # the Free Software Foundation, either version 3 of the License, or
10 | # (at your option) any later version.
11 | #
12 | # This program is distributed in the hope that it will be useful,
13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 | # GNU General Public License for more details.
16 | #
17 | # You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
19 |
use strict;
use Getopt::Long qw(:config gnu_getopt);

# Regular expression fragments used to validate column values.  Note
# that ${NAME} (rather than $NAME) must be used immediately before a
# '{n}' quantifier: under 'use strict', $NAME{8} would be parsed as an
# element of the undeclared hash %NAME and fail to compile.
my $HEX = qr/[0-9A-F]/;
my $UUID = qr/${HEX}{8}(?:-${HEX}{4}){3}-${HEX}{12}/;
my $HEX_I = qr/[0-9A-F]/i;
my $UUID_STRING = qr/${HEX_I}{8}(?:-${HEX_I}{4}){3}-${HEX_I}{12}/;
my $DATE = qr/\d{4,}-\d{2}-\d{2}/;
my $TIMESTAMP = qr/$DATE \d{2}:\d{2}:\d{2}\.\d+ [-+]\d{2}:\d{2}/;
my $INTEGER = qr/-?\d+/;
my $DECIMAL = qr/-?\d*\.\d+/;

my $SAMPLE_INDEX_LIST = qr/\d+(?: \d+)*/;
my $SAMPLE_RANGE_LIST = qr/\d+ \d+(?: \d+ \d+)*/;
34 |
# Validation pattern for each known column name.  A hash reference
# instead of a pattern means the validation depends on which table
# the column appears in.
my %PATTERNS = (
    AdmitState => qr{\A$INTEGER?\z},
    AlertId => qr{\A$UUID\z},
    Alias => qr{.?}s,
    AnnounceTime => qr{\A$TIMESTAMP\z},
    BasePhysioId => qr{\A$INTEGER\z},
    BedLabel => qr{.?}s,
    CalibrationAbsLower => qr{\A$DECIMAL?\z},
    CalibrationAbsUpper => qr{\A$DECIMAL?\z},
    CalibrationScaledLower => qr{\A$INTEGER\z},
    CalibrationScaledUpper => qr{\A$INTEGER\z},
    CalibrationType => qr{\A$INTEGER\z},
    Category => qr{\A$INTEGER?\z},
    Channel => qr{\A$INTEGER\z},
    ClinicalUnit => qr{.?}s,
    Code => qr{\A$INTEGER\z},
    Color => qr{\A$INTEGER\z},
    CompoundValueId => qr{\A$UUID\z},
    EcgLeadPlacement => qr{\A$INTEGER\z},
    EndTime => qr{\A$TIMESTAMP\z},
    EnumerationId => qr{\A$INTEGER\z},
    Gender => qr{\A$INTEGER\z},
    Height => qr{\A$DECIMAL?\z},
    HeightUnit => qr{\A$INTEGER?\z},
    HighEdgeFrequency => qr{\A$DECIMAL?\z},
    Hostname => qr{.}s,
    # 'Id' columns are formatted differently depending on the table.
    Id => {
        Enumeration => qr{\A$INTEGER\z},
        Numeric => qr{\A$INTEGER\z},
        Wave => qr{\A$INTEGER\z},
        Patient => qr{\A$UUID_STRING\z},
        PatientMapping => qr{\A$UUID\z},
    },
    InvalidSamples => qr{\A$SAMPLE_RANGE_LIST?\z},
    IsAlarmingOff => qr{\A[01]\z},
    IsAperiodic => qr{\A[01]\z},
    IsDerived => qr{\A[01]\z},
    IsManual => qr{\A[01]\z},
    IsMapped => qr{\A1\z}, # we don't want pre-mapping mappings
    IsSilenced => qr{\A[01]\z},
    IsSlowWave => qr{\A[01]\z},
    IsTrendUploaded => qr{\A[01]\z},
    Kind => qr{\A$INTEGER\z},
    Label => qr{.}s,
    LowEdgeFrequency => qr{\A$DECIMAL?\z},
    LowerLimit => qr{\A$DECIMAL?\z},
    MappingId => qr{\A$UUID\z},
    MaxValues => qr{\A$INTEGER\z},
    Name => qr{\A\S+\z},
    NumericId => qr{\A$INTEGER\z},
    OnsetTime => qr{\A$TIMESTAMP\z},
    PacedMode => qr{\A$INTEGER?\z},
    PacedPulses => qr{\A$SAMPLE_INDEX_LIST?\z},
    PatientId => qr{\A$UUID_STRING\z},
    PhysioId => qr{\A$INTEGER\z},
    PressureUnit => qr{\A$INTEGER?\z},
    ResuscitationStatus => qr{\A$INTEGER?\z},
    SamplePeriod => qr{\A$INTEGER\z},
    Scale => qr{\A$INTEGER\z},
    ScaleLower => qr{\A$INTEGER\z},
    ScaleUpper => qr{\A$INTEGER\z},
    SequenceNumber => qr{\A$INTEGER\z},
    Severity => qr{\A$INTEGER\z},
    Source => qr{\A$INTEGER\z},
    SubLabel => qr{.}s,
    SubPhysioId => qr{\A$INTEGER\z},
    SubtypeId => qr{\A$INTEGER\z},
    Tag => qr{.}s,
    TimeStamp => qr{\A$TIMESTAMP\z},
    Timestamp => qr{\A$TIMESTAMP\z},
    UnavailableSamples => qr{\A$SAMPLE_RANGE_LIST?\z},
    UnitCode => qr{\A$INTEGER\z},
    UnitLabel => qr{.}s,
    UpperLimit => qr{\A$DECIMAL?\z},
    Validity => qr{\A$INTEGER\z},
    # 'Value' columns likewise vary per table.
    Value => {
        EnumerationValue => qr{.}s,
        NumericValue => qr{\A$DECIMAL?\z},
        PatientDateAttribute => qr{\A$DATE \d{2}:\d{2}:\d{2}\z},
        PatientStringAttribute => qr{.}s,
    },
    ValuePhysioId => qr{\A$INTEGER\z},
    WaveId => qr{\A$INTEGER\z},
    WaveSamples => qr{\A(?:..)+\z}s,
    Weight => qr{\A$DECIMAL?\z},
    WeightUnit => qr{\A$INTEGER?\z},
);
122 |
# Longest sample list seen so far; checked at exit to warn when lists
# may have hit a length limit (see bottom of this script).
my $slistmax = 0;

# Check that the space-separated list of numbers in $_ is strictly
# increasing; returns 1 on success, 0 on failure.  Also records the
# list length in $slistmax as a side effect.
sub check_sample_list {
    $slistmax = length($_) if length($_) > $slistmax;
    my ($x, @n) = split / /;
    while (@n) {
        my $y = shift @n;
        return 0 if $x >= $y;
        $x = $y;
    }
    return 1;
}
135 |
# Columns whose values need a programmatic check in addition to the
# regex match above.
my %CHECKFUNC = (
    PacedPulses => \&check_sample_list,
#    UnavailableSamples => \&check_sample_list,
#    InvalidSamples => \&check_sample_list,
);

my $exit_status = 0;
my @ignored_bad_columns;

# --force-invalid COLUMN may be given repeatedly to tolerate
# validation failures in the named columns.
GetOptions('force-invalid=s' => \@ignored_bad_columns) or die;
146 |
# Verify each data file named on the command line against its format
# file.  (Extraction of this script had stripped the <FMT> and <DATA>
# readline operators; they are restored here.)
foreach my $datafile (@ARGV) {
    my ($table) = split /\./, $datafile;
    my $fmtfile = "$table.fmt";
    my @cols;

    # Parse the BCP format file to learn the column layout.
    open FMT, $fmtfile or die "can't read $fmtfile: $!";
    $/ = "\n";
    my $ver = <FMT>;
    my $ncols = <FMT>;
    while (<FMT>) {
        s/^\s+//;
        my ($hcol, $type, $plen, $clen, $term, $tcol, $name) = split /\s+/;
        die "$fmtfile: wrong column number" if $hcol ne (@cols + 1);
        die "$fmtfile: invalid data type" if $type !~ /^SYB(?:CHAR|BINARY)$/;
        die "$fmtfile: invalid prefix size" if $plen !~ /^\d+$/;
        die "$fmtfile: invalid column size" if $clen ne -1;
        die "$fmtfile: invalid column separator" if $term !~ /^".*"$/;
        die "$fmtfile: invalid source column" if $tcol !~ /^\d+$/;
        my $pat = $PATTERNS{$name};
        my $func = $CHECKFUNC{$name};
        if (ref $pat eq 'HASH') {
            # Some column names are validated differently per table.
            $pat = $pat->{$table};
        }
        die "$fmtfile: unknown column name" if !defined $pat;

        # A column is either terminated by a tab or newline (no
        # prefix), or preceded by a 4-byte little-endian length
        # prefix ('V') with no terminator.
        if ($term eq '"\t"' && $plen == 0) {
            push @cols, [ undef, "\t", $pat, $func, $name ];
        }
        elsif ($term eq '"\n"' && $plen == 0) {
            push @cols, [ undef, "\n", $pat, $func, $name ];
        }
        elsif ($term eq '""' && $plen == 4) {
            push @cols, [ 4, 'V', $pat, $func, $name ];
        }
        else {
            die "$fmtfile: unknown column specification";
        }
    }
    close FMT;
    if (@cols != $ncols) {
        die "$fmtfile: incorrect number of columns";
    }

    # Read the data file one column at a time, validating each value.
    open DATA, $datafile or die "can't read $datafile: $!";
    my $nrows = 0;
    my $invalid = 0;
    my %invalid_cols;
    while (!eof DATA) {
        $nrows++;
        foreach my $c (@cols) {
            if ($c->[0]) {
                # Length-prefixed column: read the prefix, then the
                # value.
                if ($c->[0] != read DATA, $_, $c->[0]) {
                    print STDERR "$datafile:R$nrows: unexpected EOF (in $c->[3])\n";
                    $invalid = 1;
                    last;
                }
                my $n = unpack $c->[1], $_;
                if ($n != read DATA, $_, $n) {
                    print STDERR "$datafile:R$nrows: unexpected EOF (in $c->[3])\n";
                    $invalid = 1;
                    last;
                }
            }
            else {
                # Terminator-delimited column: read up to the
                # terminator and strip it.
                $/ = $c->[1];
                $_ = <DATA>;
                if (!chomp) {
                    print STDERR "$datafile:R$nrows: unexpected EOF (in $c->[3])\n";
                    $invalid = 1;
                    last;
                }
            }
            if ($_ !~ $c->[2] or ($c->[3] and !&{$c->[3]})) {
                $invalid_cols{$c->[4]}++;
                # Report at most five bad values per column.
                if ($invalid_cols{$c->[4]} <= 5) {
                    s/([\\"])/\\$1/g;
                    s{([\000-\037])}{sprintf '\\%03o', ord $1}eg;
                    my $pos = tell DATA;
                    print STDERR "$datafile:R$nrows:\@$pos: invalid $c->[4]\n";
                    print STDERR "  value: \"$_\"\n";
                    print STDERR "  expected: $c->[2]\n";
                }
            }
        }
    }
    close DATA;

    # Bad columns count as failures unless explicitly ignored via
    # --force-invalid.
    foreach my $c (sort keys %invalid_cols) {
        if (!grep { $_ eq $c } @ignored_bad_columns) {
            $invalid = 1;
            $exit_status = 1;
        }
    }

    # Summary line: md5 + row count for good files, a dashed marker
    # for bad ones, followed by per-column failure counts.
    if ($invalid) {
        print '-' x 32, " $datafile ($nrows)";
    }
    else {
        my ($md5) = (`md5sum $datafile` =~ /^([0-9a-f]{32})/);
        print "$md5 $datafile $nrows";
    }
    foreach my $c (sort keys %invalid_cols) {
        print "\t(", $invalid_cols{$c}, " $c)";
    }
    print "\n";
}

if ($slistmax == 2048) {
    print "*** Warning: sample lists may have been truncated\n";
}

exit ($exit_status);
259 |
--------------------------------------------------------------------------------
/downcast/main.py:
--------------------------------------------------------------------------------
1 | #
2 | # downcast - tools for unpacking patient data from DWC
3 | #
4 | # Copyright (c) 2018 Laboratory for Computational Physiology
5 | #
6 | # This program is free software: you can redistribute it and/or modify
7 | # it under the terms of the GNU General Public License as published by
8 | # the Free Software Foundation, either version 3 of the License, or
9 | # (at your option) any later version.
10 | #
11 | # This program is distributed in the hope that it will be useful,
12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | # GNU General Public License for more details.
15 | #
16 | # You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
18 |
19 | import sys
20 | import os
21 | import resource
22 | from argparse import ArgumentParser, ArgumentTypeError
23 | from datetime import timedelta
24 |
25 | from .server import DWCDB
26 | from .timestamp import T
27 | from .extractor import (Extractor, WaveSampleQueue, NumericValueQueue,
28 | EnumerationValueQueue, AlertQueue,
29 | PatientMappingQueue, PatientBasicInfoQueue,
30 | PatientDateAttributeQueue,
31 | PatientStringAttributeQueue, BedTagQueue)
32 |
33 | from .output.archive import Archive
34 | from .output.numerics import NumericValueHandler
35 | from .output.waveforms import WaveSampleHandler
36 | from .output.enums import EnumerationValueHandler
37 | from .output.alerts import AlertHandler
38 | from .output.mapping import PatientMappingHandler
39 | from .output.patients import PatientHandler
40 |
def main(args = None):
    """Command-line entry point: raise the file descriptor limit,
    parse arguments, and run the conversion loop."""
    (_, hard_limit) = resource.getrlimit(resource.RLIMIT_NOFILE)
    if hard_limit < 4096 and hard_limit != resource.RLIM_INFINITY:
        sys.exit('RLIMIT_NOFILE too low (%d)' % (hard_limit,))
    resource.setrlimit(resource.RLIMIT_NOFILE, (hard_limit, hard_limit))

    _main_loop(_parse_cmdline(args))
49 |
def _parse_timestamp(arg):
    """argparse type function: parse a timestamp string into a T,
    raising ArgumentTypeError on malformed input."""
    try:
        return T(arg)
    except Exception:
        message = ("%r is not in the format "
                   "'YYYY-MM-DD HH:MM:SS.SSS +ZZ:ZZ'" % arg)
        raise ArgumentTypeError(message)
56 |
def _parse_cmdline(args):
    """Parse and validate command-line options.

    Exits with a usage/error message on any invalid combination.
    Returns the parsed namespace; opts.state_dir defaults to
    opts.output_dir when not given.
    """
    p = ArgumentParser(
        description = 'Extract and convert DWC patient data.',
        fromfile_prefix_chars = '@')

    g = p.add_argument_group('input selection')
    g.add_argument('--server', metavar = 'NAME',
                   help = 'name of DWC database server')
    g.add_argument('--password-file', metavar = 'FILE',
                   default = 'server.conf',
                   help = 'file containing login credentials')

    g = p.add_argument_group('output database location')
    g.add_argument('--output-dir', metavar = 'DIR',
                   help = 'directory to store output database')
    g.add_argument('--state-dir', metavar = 'DIR',
                   help = 'directory to store state files')

    g = p.add_argument_group('conversion modes')
    g.add_argument('--init', action = 'store_true',
                   help = 'initialize a new output database')
    g.add_argument('--batch', action = 'store_true',
                   help = 'process available data and exit')
    g.add_argument('--live', action = 'store_true',
                   help = 'collect data continuously')
    g.add_argument('--start', metavar = 'TIME', type = _parse_timestamp,
                   help = 'begin collecting data at the given time')
    g.add_argument('--end', metavar = 'TIME', type = _parse_timestamp,
                   help = 'collect data up to the given time')
    g.add_argument('--partial', action = 'store_true',
                   help = 'include partial records at start time')
    g.add_argument('--terminate', action = 'store_true',
                   help = 'handle final data after permanent shutdown')

    opts = p.parse_args(args)
    progname = sys.argv[0]

    if opts.output_dir is None:
        sys.exit(('%s: no --output-dir specified' % progname)
                 + '\n' + p.format_usage())
    if opts.server is None:
        sys.exit(('%s: no --server specified' % progname)
                 + '\n' + p.format_usage())

    # Exactly one conversion mode must be selected.
    if (opts.init + opts.batch + opts.live) != 1:
        sys.exit(('%s: must specify exactly one of --init, --batch, or --live'
                  % progname) + '\n' + p.format_usage())

    if opts.start is not None and not opts.init:
        sys.exit(('%s: --start can only be used with --init' % progname)
                 + '\n' + p.format_usage())
    if opts.end is not None and not opts.batch:
        sys.exit(('%s: --end can only be used with --batch' % progname)
                 + '\n' + p.format_usage())

    if opts.state_dir is None:
        opts.state_dir = opts.output_dir

    if opts.init:
        # --init must start from scratch: refuse to reuse either
        # directory.
        if os.path.exists(opts.state_dir):
            sys.exit("%s: directory %s already exists"
                     % (progname, opts.state_dir))
        if os.path.exists(opts.output_dir):
            # (Fixed: this message previously reported state_dir.)
            sys.exit("%s: directory %s already exists"
                     % (progname, opts.output_dir))
    else:
        if not os.path.isdir(opts.state_dir):
            sys.exit("%s: directory %s does not exist"
                     % (progname, opts.state_dir))
        if not os.path.isdir(opts.output_dir):
            # (Fixed: this message previously reported state_dir.)
            sys.exit("%s: directory %s does not exist"
                     % (progname, opts.output_dir))
    return opts
130 |
def _init_extractor(opts):
    """Create an Extractor connected to the configured DWC server,
    with the patient-mapping queue plus one queue per message type."""
    DWCDB.load_config(opts.password_file)

    database = DWCDB(opts.server)
    extractor = Extractor(database, opts.state_dir,
                          fatal_exceptions = True,
                          deterministic_output = True, debug = True)

    # The mapping queue must be added first; the remaining queues are
    # all constructed the same way.
    extractor.add_queue(PatientMappingQueue(
        'mapping', start_time = opts.start, end_time = opts.end))
    for (name, queue_type) in (('waves', WaveSampleQueue),
                               ('numerics', NumericValueQueue),
                               ('enums', EnumerationValueQueue),
                               ('alerts', AlertQueue)):
        extractor.add_queue(queue_type(
            name, start_time = opts.start, end_time = opts.end))
    return extractor
156 |
def _init_archive(opts, extractor):
    """Create the output Archive and attach the message handlers to
    the given extractor.  Returns the Archive."""
    archive = Archive(opts.output_dir, deterministic_output = True)

    # Scan the output directory to find patients for whom we have not
    # seen any data for a long time, and finalize those records.  We
    # need to do this periodically since otherwise nothing would
    # finalize records at the end of a patient stay.
    archive.finalize_before(extractor.fully_processed_timestamp())
    archive.flush()

    for handler_type in (NumericValueHandler, WaveSampleHandler,
                         EnumerationValueHandler, AlertHandler,
                         PatientMappingHandler):
        extractor.add_handler(handler_type(archive))

    # FIXME: Handling patient messages is disabled for now - it causes
    # archive to split records unnecessarily.
    #extractor.add_handler(PatientHandler(archive))

    # Create or refresh state files, and fail if they're not writable
    extractor.flush()
    return archive
181 |
def _main_loop(opts):
    """Main extraction driver.

    In --init mode, just creates the extractor and writes initial queue
    state files.  Otherwise, repeatedly re-creates the extractor and
    archive, pumping data until caught up (or forever in --live mode).
    """
    if opts.init:
        # In --init mode, simply create the extractor and write the
        # initial queue state files.
        if opts.start and not opts.partial:
            os.makedirs(opts.output_dir, exist_ok = True)
            # Record the requested start time so downstream tools know
            # that earlier data is deliberately absent.
            horizon_file = os.path.join(opts.output_dir, '%horizon')
            with open(horizon_file, 'w') as hf:
                hf.write(str(opts.start) + '\n')

        extractor = _init_extractor(opts)
        extractor.flush()
        return

    # Otherwise, feed data from the extractor into the archive until
    # we reach the desired end point.
    while True:
        # We periodically stop and re-create the extractor and
        # archive, so that records can be finalized at the end of a
        # stay.  (We can't simply invoke finalize_before on a live
        # Archive object because different patients are handled by
        # different subprocesses - each process only knows about the
        # patients that have been delegated to it.)
        extractor = _init_extractor(opts)
        _init_archive(opts, extractor)
        # Each extractor instance processes roughly three hours of data
        # before being torn down and re-created.
        next_sync = (extractor.fully_processed_timestamp()
                     + timedelta(hours = 3))
        try:
            # Save state to disk after every 500 queries.
            n = 500
            while extractor.fully_processed_timestamp() < next_sync:
                if extractor.idle() and not opts.live:
                    # Input exhausted and not in live mode: optionally
                    # terminate the dispatcher, save state, and stop.
                    if opts.terminate:
                        extractor.dispatcher.terminate()
                    extractor.flush()
                    # NOTE(review): unlike _init_archive, this Archive
                    # is created without deterministic_output - confirm
                    # whether that is intentional.
                    a = Archive(opts.output_dir)
                    a.terminate()
                    return

                extractor.run()
                n -= 1
                if n <= 0:
                    extractor.flush()
                    n = 500
        finally:
            # Always persist queue state and stop worker processes,
            # even if extraction raised.
            extractor.flush()
            extractor.dispatcher.shutdown()
229 |
--------------------------------------------------------------------------------
/downcast/server.py:
--------------------------------------------------------------------------------
1 | #
2 | # downcast - tools for unpacking patient data from DWC
3 | #
4 | # Copyright (c) 2018 Laboratory for Computational Physiology
5 | #
6 | # This program is free software: you can redistribute it and/or modify
7 | # it under the terms of the GNU General Public License as published by
8 | # the Free Software Foundation, either version 3 of the License, or
9 | # (at your option) any later version.
10 | #
11 | # This program is distributed in the hope that it will be useful,
12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | # GNU General Public License for more details.
15 | #
16 | # You should have received a copy of the GNU General Public License
17 | # along with this program. If not, see .
18 |
19 | from configparser import ConfigParser
20 | import logging
21 | import warnings
22 | import os
23 |
24 | from .parser import (WaveAttrParser, NumericAttrParser,
25 | EnumerationAttrParser, PatientMappingParser,
26 | DBSyntaxError)
27 | from .attributes import (undefined_wave, undefined_numeric,
28 | undefined_enumeration)
29 |
class DWCDB:
    """Picklable handle to a named DWC database server.

    Connection parameters come from the configuration file loaded via
    load_config().  Heavyweight state (connections, attribute caches)
    lives in a shared per-process DWCDBServer object, so DWCDB
    instances can be cheaply pickled and sent to subprocesses.
    """

    _config = None        # ConfigParser with one section per server
    _config_path = ''     # directory containing the config file

    @staticmethod
    def load_config(filename):
        """Load server definitions from the given config file.

        Declared @staticmethod so it also binds correctly when invoked
        on an instance (previously an instance call would have passed
        self as the filename).
        """
        DWCDB._config = ConfigParser()
        DWCDB._config.read(filename)
        DWCDB._config_path = os.path.dirname(filename)

    def __init__(self, servername):
        self._server = DWCDBServer.get(servername)
        self.servername = servername
        self.dialect = self._server.dialect
        self.paramstyle = self._server.paramstyle

    def __repr__(self):
        return ('%s(%r)' % (self.__class__.__name__, self.servername))

    def __getstate__(self):
        # Pickle only the server name; __setstate__ re-attaches to the
        # shared DWCDBServer in the receiving process.
        return self.servername

    def __setstate__(self, servername):
        DWCDB.__init__(self, servername)

    def connect(self):
        """Open and return a new database connection."""
        return self._server.connect()

    def get_messages(self, parser, connection = None, cursor = None):
        """Execute a parser's query, yielding parsed message objects.

        If neither a cursor nor a connection is supplied, a temporary
        connection is opened; any temporary cursor/connection is closed
        when the generator finishes.
        """
        tmpconn = None
        tmpcur = None
        try:
            if cursor is not None:
                cur = cursor
            elif connection is not None:
                cur = tmpcur = connection.cursor()
            else:
                tmpconn = self._server.connect()
                cur = tmpcur = tmpconn.cursor()
            yield from parser.parse(self, cur)
        finally:
            if tmpcur is not None:
                tmpcur.close()
            if tmpconn is not None:
                tmpconn.close()

    def get_wave_attr(self, wave_id, sync):
        """Look up attributes for wave_id (cached); None if pending."""
        v = self._server.wave_attr.get(wave_id, None)
        if v is not None:
            return v

        p = WaveAttrParser(dialect = self.dialect,
                           paramstyle = self.paramstyle,
                           limit = 2, wave_id = wave_id)
        try:
            v = self._parse_attr(p, sync)
        except UnknownAttrError:
            # warn only once per server to avoid log spam
            if not self._server._warned_wave:
                logging.warning('unknown wave ID: %s' % wave_id)
                self._server._warned_wave = True
            v = undefined_wave
        except DBSyntaxError as e:
            warnings.warn(e.warning(), stacklevel = 2)
            v = undefined_wave
        except UnavailableAttrError:
            # result not yet available; do not cache
            return None
        self._server.wave_attr[wave_id] = v
        return v

    def get_numeric_attr(self, numeric_id, sync):
        """Look up attributes for numeric_id (cached); None if pending."""
        v = self._server.numeric_attr.get(numeric_id, None)
        if v is not None:
            return v

        p = NumericAttrParser(dialect = self.dialect,
                              paramstyle = self.paramstyle,
                              limit = 2, numeric_id = numeric_id)
        try:
            v = self._parse_attr(p, sync)
        except UnknownAttrError:
            if not self._server._warned_numeric:
                logging.warning('unknown numeric ID: %s' % numeric_id)
                self._server._warned_numeric = True
            v = undefined_numeric
        except DBSyntaxError as e:
            warnings.warn(e.warning(), stacklevel = 2)
            v = undefined_numeric
        except UnavailableAttrError:
            return None
        self._server.numeric_attr[numeric_id] = v
        return v

    def get_enumeration_attr(self, enumeration_id, sync):
        """Look up attributes for enumeration_id (cached); None if pending."""
        v = self._server.enumeration_attr.get(enumeration_id, None)
        if v is not None:
            return v

        p = EnumerationAttrParser(dialect = self.dialect,
                                  paramstyle = self.paramstyle,
                                  limit = 2, enumeration_id = enumeration_id)
        try:
            v = self._parse_attr(p, sync)
        except UnknownAttrError:
            if not self._server._warned_enum:
                logging.warning('unknown enumeration ID: %s' % enumeration_id)
                self._server._warned_enum = True
            v = undefined_enumeration
        except DBSyntaxError as e:
            warnings.warn(e.warning(), stacklevel = 2)
            v = undefined_enumeration
        except UnavailableAttrError:
            return None
        self._server.enumeration_attr[enumeration_id] = v
        return v

    def get_patient_id(self, mapping_id, sync):
        """Map a mapping ID to a patient ID (cached); None if unknown."""
        v = self._server.patient_map.get(mapping_id, None)
        if v is not None:
            return v
        # if not sync:
        #     return None

        p = PatientMappingParser(dialect = self.dialect,
                                 paramstyle = self.paramstyle,
                                 limit = 2, mapping_id = mapping_id)
        try:
            v = self._parse_attr(p, True)
        except UnknownAttrError:
            if not self._server._warned_mapping:
                logging.warning('unknown mapping ID: %s' % mapping_id)
                self._server._warned_mapping = True
            return None
        except DBSyntaxError as e:
            warnings.warn(e.warning(), stacklevel = 2)
            # NOTE(review): storing None here does not suppress future
            # lookups, since the cache check above treats None as a
            # missing entry - confirm the intent.
            self._server.patient_map[mapping_id] = None
            return None
        self.set_patient_id(mapping_id, v.patient_id)
        return v.patient_id

    def set_patient_id(self, mapping_id, patient_id):
        """Record a known mapping ID -> patient ID association."""
        self._server.patient_map[mapping_id] = patient_id

    def _parse_attr(self, parser, sync):
        """Run an attribute query and return the single result message.

        Raises UnknownAttrError if the query returned no rows; logs a
        warning (and returns the first row) if it returned several.
        """
        # ensure that attr_db connections are not shared between
        # processes
        pid = os.getpid()
        if self._server.attr_db_pid == pid:
            conn = self._server.attr_db
        else:
            self._server.attr_db = conn = self._server.connect()
            self._server.attr_db_pid = pid

        # FIXME: add asynchronous processing
        results = list(self.get_messages(parser, connection = conn))
        if len(results) > 1:
            logging.warning('multiple results found for %r' % parser)
        elif len(results) == 0:
            raise UnknownAttrError()
        return results[0]
190 |
class DWCDBServer:
    """Shared per-process state for a named server.

    Holds connection parameters plus the caches of attribute and
    patient-mapping lookups.  Use DWCDBServer.get() rather than the
    constructor, so all DWCDB handles for a server share one instance.
    """

    _named_servers = {}   # servername -> shared DWCDBServer instance

    def __init__(self, servername):
        # 'type' defaults to mssql for backward compatibility
        self.dbtype = DWCDB._config.get(servername, 'type', fallback = 'mssql')

        if self.dbtype == 'mssql':
            import pymssql
            self.hostname = DWCDB._config[servername]['hostname']
            self.username = DWCDB._config[servername]['username']
            self.password = DWCDB._config[servername]['password']
            self.database = DWCDB._config[servername]['database']
            self.dialect = 'ms'
            self.paramstyle = pymssql.paramstyle
        elif self.dbtype == 'sqlite':
            import sqlite3
            self.filename = DWCDB._config[servername]['file']
            self.dialect = 'sqlite'
            self.paramstyle = sqlite3.paramstyle
        elif self.dbtype == 'bcp':
            from .db import dwcbcp
            self.bcpdirs = []
            # bcp-path entries are resolved relative to the config file
            for d in DWCDB._config[servername]['bcp-path'].split(':'):
                self.bcpdirs.append(os.path.join(DWCDB._config_path, d))
            self.dialect = 'sqlite'
            self.paramstyle = dwcbcp.paramstyle
        else:
            # include the offending value to make misconfiguration
            # easier to diagnose
            raise ValueError('unknown database type: %r' % self.dbtype)

        # lookup caches shared by every DWCDB handle for this server
        self.wave_attr = {}
        self.numeric_attr = {}
        self.enumeration_attr = {}
        self.patient_map = {}
        self.attr_db = None       # cached connection for attribute queries
        self.attr_db_pid = None   # PID that owns attr_db
        # one-shot warning flags (see DWCDB.get_*_attr)
        self._warned_mapping = False
        self._warned_wave = False
        self._warned_numeric = False
        self._warned_enum = False

    @staticmethod
    def get(servername):
        """Return the shared server object, creating it on first use.

        Declared @staticmethod so it binds correctly however it is
        called (previously an instance call would have misbound self).
        """
        s = DWCDBServer._named_servers.get(servername, None)
        if s is None:
            s = DWCDBServer(servername)
            DWCDBServer._named_servers[servername] = s
        return s

    def connect(self):
        """Open a new DB-API connection for this server's backend."""
        if self.dbtype == 'mssql':
            import pymssql
            return pymssql.connect(self.hostname, self.username,
                                   self.password, self.database,
                                   tds_version='7.1')
        elif self.dbtype == 'sqlite':
            import sqlite3
            return sqlite3.connect(self.filename)
        elif self.dbtype == 'bcp':
            from .db import dwcbcp
            return dwcbcp.connect(self.bcpdirs)
250 |
class UnknownAttrError(Exception):
    """Raised internally when the requested object does not exist."""
254 |
class UnavailableAttrError(Exception):
    """Raised internally while the request is still pending."""
258 |
--------------------------------------------------------------------------------
/downcast/output/alerts.py:
--------------------------------------------------------------------------------
1 | #
2 | # downcast - tools for unpacking patient data from DWC
3 | #
4 | # Copyright (c) 2018 Laboratory for Computational Physiology
5 | #
6 | # This program is free software: you can redistribute it and/or modify
7 | # it under the terms of the GNU General Public License as published by
8 | # the Free Software Foundation, either version 3 of the License, or
9 | # (at your option) any later version.
10 | #
11 | # This program is distributed in the hope that it will be useful,
12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | # GNU General Public License for more details.
15 | #
16 | # You should have received a copy of the GNU General Public License
17 | # along with this program. If not, see .
18 |
19 | from datetime import datetime, timezone
20 | import os
21 | import re
22 |
23 | from ..messages import AlertMessage
24 | from ..timestamp import (T, delta_ms)
25 | from ..util import string_to_ascii
26 | from .wfdb import (Annotator, AnnotationType)
27 |
# Plausibility cutoff: announce/onset/end times are only logged when
# strictly after the Unix epoch (values at or before it are treated as
# unset - presumably placeholder values; confirm against DWC docs).
_sane_time = T('1970-01-01 00:00:00.000 +00:00')
29 |
class AlertHandler:
    """Writes incoming AlertMessages to per-record '_phi_alerts' logs."""

    def __init__(self, archive):
        self.archive = archive

    def send_message(self, chn, msg, source, ttl):
        """Handle one message, acking it once it is safely recorded."""
        if not isinstance(msg, AlertMessage):
            return

        source.nack_message(chn, msg, self)

        # Look up the corresponding record
        record = self.archive.get_record(msg)
        if record is None:
            # Record not yet available - hold message in pending and
            # continue processing
            return

        # Dump original message to BCP file if desired
        if record.dump(msg):
            source.ack_message(chn, msg, self)
            return

        # Open or create a log file
        logfile = record.open_log_file('_phi_alerts')

        # Format the fields that go into the log entry
        seqnum = msg.sequence_number
        stamp = msg.timestamp.strftime_utc('%Y%m%d%H%M%S%f')
        idstr = str(msg.alert_id)
        label = string_to_ascii(msg.label)
        statestr = '~' if msg.is_silenced else '='

        logfile.append('S%s' % seqnum)
        # Each known event time (announce/onset/end), if plausible, is
        # written as a timestamp line followed by an ID+marker line.
        for (when, marker) in ((msg.announce_time, '+'),
                               (msg.onset_time, '!'),
                               (msg.end_time, '-')):
            if when and when > _sane_time:
                logfile.append(when.strftime_utc('%Y%m%d%H%M%S%f'))
                logfile.append('(%s)%s' % (idstr, marker))
        logfile.append(stamp)
        logfile.append('(%s)%s%s%s' % (idstr, msg.severity, statestr, label))

        source.ack_message(chn, msg, self)

    def flush(self):
        """Flush the underlying archive."""
        self.archive.flush()
85 |
class AlertFinalizer:
    """Converts a record's '_phi_alerts' log into WFDB annotations.

    Construction scans the log once (unsorted) to collect per-alert
    onset/announce/end times and feed the time map; finalize_record()
    then re-reads the log in order and writes 'waves.alarm'.
    """

    def __init__(self, record):
        self.record = record
        self.log = record.open_log_reader('_phi_alerts', allow_missing = True)

        # alert_id -> (sequence_number, timestamp) for each event kind
        self.alert_onset = {}
        self.alert_announce = {}
        self.alert_end = {}

        # Scan the alerts log file, add timestamps to the time map,
        # and record onset/announce/end time for each alert ID.
        for (sn, ts, line) in self.log.unsorted_items():
            # log timestamps are compact UTC strings (see AlertHandler)
            ts = datetime.strptime(str(ts), '%Y%m%d%H%M%S%f')
            ts = ts.replace(tzinfo = timezone.utc)
            record.time_map.add_time(ts)

            (alert_id, event, severity, state, label) = _parse_info(line)
            # If there are multiple recorded onset times, save the one
            # that was recorded first (smallest sequence number.)
            # Save the earliest onset timestamp that was recorded at
            # that sequence number.
            if event == b'!':
                if (sn, ts) < self.alert_onset.setdefault(alert_id, (sn, ts)):
                    self.alert_onset[alert_id] = (sn, ts)
            # If there are multiple recorded announce times, save the
            # one that was recorded first (smallest sequence number.)
            # Save the earliest announce timestamp that was recorded
            # at that sequence number.
            elif event == b'+':
                if (sn, ts) < self.alert_announce.setdefault(alert_id,
                                                             (sn, ts)):
                    self.alert_announce[alert_id] = (sn, ts)
            # If there are multiple recorded end times, save the one
            # that was recorded last (largest sequence number.)  Save
            # the latest end timestamp that was recorded at that
            # sequence number.
            elif event == b'-':
                if (sn, ts) > self.alert_end.setdefault(alert_id, (sn, ts)):
                    self.alert_end[alert_id] = (sn, ts)

    def finalize_record(self):
        """Write the 'waves.alarm' annotation file for this record."""
        sn0 = self.record.seqnum0()
        if sn0 is None:
            # if we don't have a seqnum0 then time is meaningless
            return

        # per-alert bookkeeping while walking the log in order
        alert_first = {}
        alert_pre_announce = {}
        alert_pre_end = {}
        alert_last = {}
        alert_num = {}

        # Translate announce timestamps to record-relative sample
        # numbers.  The (sn + 5120) bound presumably reflects the
        # maximum event lead relative to the message's own sequence
        # number - confirm against get_seqnum's docstring.
        announce_t = {}
        for (alert_id, (sn, ts)) in self.alert_announce.items():
            sn = self.record.time_map.get_seqnum(ts, sn + 5120)
            if sn is None:
                continue
            announce_t[alert_id] = sn - sn0

        end_t = {}
        for (alert_id, (sn, ts)) in self.alert_end.items():
            # alert end time may actually be slightly later than
            # time of the message.  why?  no idea.  how do these
            # timestamps work in regard to system clock
            # adjustments?  no idea.
            sn = self.record.time_map.get_seqnum(ts, sn + 15120)
            if sn is None:
                continue
            end_t[alert_id] = sn - sn0

        annfname = os.path.join(self.record.path, 'waves.alarm')
        with Annotator(annfname, afreq = 1000) as anns:
            # Reread the alerts log file in order.  Assign an integer
            # ID to each alert in order of appearance, and record the
            # severity, state (silenced or not) and label.
            #
            # Severity/state/label can change from one message to the
            # next.  For the onset annotation, we use the earliest
            # message.  For the announcement annotation, we use the
            # latest message that precedes the announcement time, or
            # the earliest annotation if there isn't one.  For the end
            # annotation, we use the latest message that precedes the
            # end time, or the latest message if there isn't one.  If
            # there are any state changes between announcement and
            # end, we add those as additional annotations.
            for (sn, ts, line) in self.log.sorted_items():
                # skip lines containing a CAN (0x18) byte - presumably
                # marking cancelled/corrupt entries; confirm upstream
                if b'\030' in line:
                    continue

                (alert_id, event, severity, state, label) = _parse_info(line)
                if not label:
                    continue

                ts = datetime.strptime(str(ts), '%Y%m%d%H%M%S%f')
                ts = ts.replace(tzinfo = timezone.utc)
                # fall back to the raw sequence number if unmappable
                sn = self.record.time_map.get_seqnum(ts, sn + 5120) or sn
                t = sn - sn0

                num = alert_num.setdefault(alert_id, len(alert_num) + 1)

                oldstate = alert_last.get(alert_id, None)
                newstate = (severity, state, label)
                alert_first.setdefault(alert_id, newstate)
                alert_last[alert_id] = newstate

                announce = announce_t.get(alert_id, t)
                end = end_t.get(alert_id, t)
                if t <= announce:
                    alert_pre_announce[alert_id] = newstate
                if t <= end:
                    alert_pre_end[alert_id] = newstate

                # state changed strictly between announce and end:
                # emit an intermediate (b';') annotation
                if oldstate and oldstate != newstate and announce < t < end:
                    _put_annot(anns, t, num, b';', severity, state, label)

            # onset annotations use the earliest known state
            for (alert_id, (sn, ts)) in self.alert_onset.items():
                num = alert_num.get(alert_id)
                sn = self.record.time_map.get_seqnum(ts, sn + 5120)
                if num is None or sn is None:
                    continue
                t = sn - sn0
                (severity, state, label) = alert_first[alert_id]
                _put_annot(anns, t, num, b'+', severity, state, label)

            # announce annotations use the last pre-announce state
            for (alert_id, t) in announce_t.items():
                num = alert_num.get(alert_id)
                if num is None:
                    continue
                (severity, state, label) = (alert_pre_announce.get(alert_id)
                                            or alert_first[alert_id])
                _put_annot(anns, t, num, b'<', severity, state, label)

            # end annotations use the last pre-end state
            for (alert_id, t) in end_t.items():
                num = alert_num.get(alert_id)
                if num is None:
                    continue
                (severity, state, label) = (alert_pre_end.get(alert_id)
                                            or alert_last[alert_id])
                _put_annot(anns, t, num, b'>', severity, state, label)
225 |
226 | _info_pattern = re.compile(rb'\(([\w-]+)\)(?:([-+!])|(\d+)([=~])(.*))')
227 |
228 | def _parse_info(line):
229 | m = _info_pattern.fullmatch(line.rstrip(b'\n'))
230 | if m:
231 | return m.groups()
232 | else:
233 | return (None, None, None, None, None)
234 |
def _put_annot(anns, time, alert_num, event_code, severity, state, label):
    """Emit one WFDB NOTE annotation describing an alert event."""
    severity = int(severity)
    # severity subtypes: 0 = RED -> 3, 1 = YELLOW -> 2,
    # 2 = SHORT YELLOW -> 1, anything else -> 0
    subtyp = {0: 3, 1: 2, 2: 1}.get(severity, 0)

    # event base: b'+' onset, b'<' announce, b'>' end; any other code
    # (an intermediate state change) gets the default base
    subtyp += {b'+': 90, b'<': 80, b'>': 60}.get(event_code, 70)

    # aux text: event code, alert number in braces, silenced flag, label
    aux = event_code + b'{' + str(alert_num).encode() + b'}'
    aux += b'~' if state == b'~' else b' '
    aux += label

    anns.put(time = time, anntyp = AnnotationType.NOTE,
             subtyp = subtyp, chan = 255, aux = aux)
264 |
--------------------------------------------------------------------------------
/downcast/output/timemap.py:
--------------------------------------------------------------------------------
1 | #
2 | # downcast - tools for unpacking patient data from DWC
3 | #
4 | # Copyright (c) 2018 Laboratory for Computational Physiology
5 | #
6 | # This program is free software: you can redistribute it and/or modify
7 | # it under the terms of the GNU General Public License as published by
8 | # the Free Software Foundation, either version 3 of the License, or
9 | # (at your option) any later version.
10 | #
11 | # This program is distributed in the hope that it will be useful,
12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | # GNU General Public License for more details.
15 | #
16 | # You should have received a copy of the GNU General Public License
17 | # along with this program. If not, see .
18 |
19 | import os
20 | import csv
21 | import bisect
22 | import logging
23 | from datetime import timedelta
24 |
25 | from ..timestamp import T, delta_ms
26 | from ..util import fdatasync
27 |
class TimeMap:
    """
    Object that tracks the mapping between time and sequence number.

    In general, sequence numbers provide a reliable measurement of
    time; wall-clock timestamps do not.

    (For example, two events whose sequence numbers differ by
    1,000,000 are exactly twice as far apart as two events whose
    sequence numbers differ by 500,000. However, two events whose
    wall-clock timestamps differ by 1,000 seconds might be anywhere
    from 970 to 1,030 seconds apart.)

    This object aggregates the available information concerning the
    mapping (which is not necessarily injective in either direction)
    between sequence number and timestamp, so that given an arbitrary
    timestamp, it is possible to determine the most likely sequence
    number at which that timestamp would have been generated.
    """

    def __init__(self, record_id):
        # Each entry is a list [start, end, baset, pending]: for
        # sequence numbers in [start, end], wall-clock time equals
        # baset + seqnum milliseconds.  pending is a set of observed,
        # unanchored timestamps used by resolve_gaps(); it is never
        # written to disk.
        self.entries = []
        self.record_id = record_id

    def read(self, path, name):
        """Read a time map file."""
        fname = os.path.join(path, name)
        try:
            with open(fname, 'rt', encoding = 'UTF-8') as f:
                for row in csv.reader(f):
                    start = int(row[0])
                    end = int(row[1])
                    baset = T(row[2])
                    self.entries.append([start, end, baset, set()])
        except FileNotFoundError:
            # no existing map; start empty
            pass
        self.entries.sort()

    def write(self, path, name):
        """Write a time map file."""
        fname = os.path.join(path, name)
        # write to a temp file and rename, so a crash cannot leave a
        # truncated map behind
        tmpfname = os.path.join(path, '_' + name + '.tmp')
        with open(tmpfname, 'wt', encoding = 'UTF-8') as f:
            w = csv.writer(f)
            for e in self.entries:
                # only [start, end, baset] are persisted
                w.writerow(e[0:3])
            f.flush()
            fdatasync(f.fileno())
        os.rename(tmpfname, fname)

    def set_time(self, seqnum, time):
        """
        Add a reference timestamp to the map.

        This indicates that we know (from a reliable source, such as a
        wave sample message) the exact wall-clock time at a given
        sequence number.

        Given this information, we can infer what the wall-clock time
        must have been at other moments in time, so long as the wall
        clock is not adjusted.

        This information is treated as trustworthy and will be saved
        to the time map file when write() is called.
        """
        baset = time - timedelta(milliseconds = seqnum)

        # i = index of the first span that begins at or after seqnum
        i = bisect.bisect_right(self.entries, [seqnum])
        # p and n are zero-or-one-element slices holding the spans
        # immediately before and after that position
        p = self.entries[i-1:i]
        n = self.entries[i:i+1]

        # If this sequence number falls within an existing span,
        # verify that baset is what we expect
        if p and seqnum <= p[0][1]:
            if baset != p[0][2]:
                logging.warning('conflicting timestamps at %d in %s'
                                % (seqnum, self.record_id))
        elif n and seqnum >= n[0][0]:
            if baset != n[0][2]:
                logging.warning('conflicting timestamps at %d in %s'
                                % (seqnum, self.record_id))

        # If this sequence number falls close to the start or end of
        # an existing span that has the same baset value (close enough
        # that we assume there could not have been more than one clock
        # adjustment), then extend the existing span(s)
        elif p and p[0][2] == baset and seqnum - p[0][1] < 30000:
            p[0][1] = seqnum
            if n and n[0][2] == baset and n[0][0] - seqnum < 30000:
                # spans now touch: merge p into n and drop p.
                # NOTE(review): p's pending set is discarded here;
                # harmless if add_time() is only called after all
                # set_time() calls, as documented below - confirm for
                # the resolve_gaps() path.
                n[0][0] = p[0][0]
                del self.entries[i-1]
        elif n and n[0][2] == baset and n[0][0] - seqnum < 30000:
            n[0][0] = seqnum

        # Otherwise, define a new span
        else:
            self.entries.insert(i, [seqnum, seqnum, baset, set()])

    def add_time(self, time):
        """
        Add a non-reference timestamp to the map.

        This indicates that we have observed the given wall-clock time
        (for example, it is used as the timestamp of a numeric or
        alert message), but we do not yet know precisely when that
        timestamp occurred.

        This information is not saved in the time map file, but is
        used by resolve_gaps() to refine the time map.

        This function should be called after all reference timestamps
        have been recorded using set_time().
        """
        for e in self.entries:
            start = e[2] + timedelta(milliseconds = e[0])
            if time < start:
                # falls in the gap before this span; remember it so
                # resolve_gaps() can pin down the clock adjustment
                e[3].add(time)
                return
            end = e[2] + timedelta(milliseconds = e[1])
            if time <= end:
                # already covered by a known span; nothing to record
                return

    def get_seqnum(self, time, limit = None):
        """
        Guess the sequence number corresponding to a wall-clock time.

        limit should be the latest possible value (inclusive) for this
        sequence number.  Typically, if the message sequence number is
        N, then it should be impossible for any event to have occurred
        at time greater than (N + 5120).

        If no information is available, this will return None.
        """

        if not self.entries:
            return None

        if limit is None:
            limit = self.entries[-1][1]

        # If this timestamp falls within a known interval - there is
        # an instant at which we know the system clock would have
        # displayed that value - then choose the latest such instant
        # that is before or equal to 'limit'.
        possible_sn = []
        best_known = None
        for (start, end, base, _) in self.entries:
            sn = delta_ms(time, base)
            possible_sn.append((sn, end))
            if start <= sn <= end and sn <= limit:
                best_known = sn
        if best_known is not None:
            return best_known

        # Otherwise, take the earliest interval for which this
        # timestamp would appear to be in the past.  (So, if the
        # system clock never displayed this timestamp, then translate
        # according to the next reference timestamp *after* this
        # point.  If the system clock displayed this timestamp
        # multiple times, but all of those occurred after 'limit',
        # then choose the earliest.)
        for (sn, interval_end) in possible_sn:
            if sn <= interval_end:
                return sn

        # Otherwise, the timestamp occurs in the future; extrapolate
        # from the *last* reference timestamp.
        return possible_sn[-1][0]

    def get_time(self, seqnum):
        """
        Guess the wall-clock time corresponding to a sequence number.

        If no information is available, this will return None.
        """
        best_time = None
        best_delta = None
        # pick the span closest to seqnum (delta <= 0 means seqnum is
        # inside that span) and translate using its base time
        for (start, end, base, _) in self.entries:
            delta = max(start - seqnum, seqnum - end)
            if best_delta is None or delta < best_delta:
                best_time = base + timedelta(milliseconds = seqnum)
                best_delta = delta
        return best_time

    def resolve_gaps(self):
        """
        Refine the time map based on all available information.

        The wall clock may be adjusted at any time during the record;
        in general, there is no way to know exactly when this happens.
        When it does, two consecutive reference timestamps will
        disagree; for example, we might have

          sequence number   timestamp
          500000000000      2015-11-05 12:53:20.000 +00:00
          500000005120      2015-11-05 12:53:27.120 +00:00

        This tells us that, at some time between those two events, the
        wall clock was adjusted forward by two seconds.  If we then
        see:

          (unknown)         2015-11-05 12:53:23.800 +00:00

        we can't tell whether that occurs 3.8 seconds after event #1,
        or 3.32 seconds before event #2.  However, if we also see:

          (unknown)         2015-11-05 12:53:21.900 +00:00

        we can deduce that the two-second adjustment could not
        possibly have occurred between events #1 and #4, nor between
        events #4 and #3, and thus it must have been between events #3
        and #2; so event #4 must have occurred at sequence number
        500000001900, and event #3 at 500000003800.

        In ambiguous cases, our best guess is that the adjustment
        occurred between the most distant pair of timestamps - if we
        only have events #1-#3 above, then all we can say is that it's
        more likely to have a 3.32-second interval with no events,
        than to have a 3.8-second interval with no events, and thus
        the clock adjustment is more likely to have occurred between
        events #1 and #3.
        """
        p = None
        new_refs = []
        for n in self.entries:
            if p and n[3]:
                # wall-clock bounds of the gap between spans p and n
                gapstart = p[2] + timedelta(milliseconds = p[1])
                gapend = n[2] + timedelta(milliseconds = n[0])
                n[3].add(gapstart)
                n[3].add(gapend)
                # find the widest interval between consecutive observed
                # timestamps; assume the clock adjustment happened there
                best = (timedelta(0), gapstart)
                for d in _differences(sorted(n[3])):
                    best = max(best, d)
                tbefore = best[1]
                tafter = best[1] + best[0]
                # anchor each side of the assumed adjustment to the
                # base time of the adjacent span
                snp = delta_ms(tbefore, p[2])
                snn = delta_ms(tafter, n[2])
                new_refs.append((snp, tbefore))
                new_refs.append((snn, tafter))
            p = n
        # apply after the scan so entries isn't mutated mid-iteration
        for (seqnum, time) in new_refs:
            self.set_time(seqnum, time)
271 |
272 | def _differences(k):
273 | i = iter(k)
274 | try:
275 | prev = next(i)
276 | except StopIteration:
277 | return
278 | for cur in i:
279 | yield (cur - prev, prev)
280 | prev = cur
281 |
--------------------------------------------------------------------------------
/downcast/shell.py:
--------------------------------------------------------------------------------
1 | #
2 | # dwcsql - simple interactive frontend for the DWC SQL database
3 | #
4 | # Copyright (c) 2018 Laboratory for Computational Physiology
5 | #
6 | # This program is free software: you can redistribute it and/or modify
7 | # it under the terms of the GNU General Public License as published by
8 | # the Free Software Foundation, either version 3 of the License, or
9 | # (at your option) any later version.
10 | #
11 | # This program is distributed in the hope that it will be useful,
12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | # GNU General Public License for more details.
15 | #
16 | # You should have received a copy of the GNU General Public License
17 | # along with this program. If not, see .
18 |
19 | import sys
20 | import readline
21 | import time
22 | import os
23 | import re
24 | import locale
25 | import ast
26 | from argparse import ArgumentParser
27 | from uuid import UUID
28 | from decimal import Decimal
29 |
30 | from .server import DWCDB
31 | from .db.exceptions import ParameterCountError
32 |
33 | ################################################################
34 |
# Table names offered for tab-completion: the External_* views plus
# the _Export.* tables of the DWC schema.
_known_tables = [
    'External_Alert',
    'External_BedTag',
    'External_Enumeration',
    'External_EnumerationValue',
    'External_Numeric',
    'External_NumericValue',
    'External_Patient',
    'External_PatientDateAttribute',
    'External_PatientStringAttribute',
    'External_Wave',
    'External_WaveSample',
    'Pdx_PartitionDetailView',
    '_Export.AlertArchive_',
    '_Export.Alert_',
    '_Export.BedTag_',
    '_Export.Configuration_',
    '_Export.DbMaintenanceLock_',
    '_Export.EnumerationValueArchive_',
    '_Export.EnumerationValue_',
    '_Export.Enumeration_',
    '_Export.NumericValueArchive_',
    '_Export.NumericValue_',
    '_Export.Numeric_',
    '_Export.PartitionSetting_',
    '_Export.PatientDateAttribute_',
    '_Export.PatientMappingArchive_',
    '_Export.PatientMapping_',
    '_Export.PatientStringAttribute_',
    '_Export.Patient_',
    '_Export.StorageLocation_',
    '_Export.WaveSampleArchive_',
    '_Export.WaveSample_',
    '_Export.Wave_'
]

# Column names offered for tab-completion (union of columns across the
# tables above).
_known_columns = [
    'AdmitState', 'AlertId', 'Alias', 'AnnounceTime', 'BasePhysioId',
    'BedLabel', 'CalibrationAbsLower', 'CalibrationAbsUpper',
    'CalibrationScaledLower', 'CalibrationScaledUpper',
    'CalibrationType', 'Category', 'Channel', 'ClinicalUnit', 'Code',
    'Color', 'CompoundValueId', 'EcgLeadPlacement', 'EndTime',
    'EnumerationId', 'Gender', 'Height', 'HeightUnit',
    'HighEdgeFrequency', 'Hostname', 'Id', 'InvalidSamples',
    'IsAlarmingOff', 'IsAperiodic', 'IsDerived', 'IsManual',
    'IsMapped', 'IsSilenced', 'IsSlowWave', 'IsTrendUploaded', 'Kind',
    'Label', 'LowEdgeFrequency', 'LowerLimit', 'MappingId',
    'MaxValues', 'Name', 'NumericId', 'OnsetTime', 'PacedMode',
    'PacedPulses', 'PatientId', 'PhysioId', 'PressureUnit',
    'ResuscitationStatus', 'SamplePeriod', 'Scale', 'ScaleLower',
    'ScaleUpper', 'SequenceNumber', 'Severity', 'Source', 'SubLabel',
    'SubPhysioId', 'SubtypeId', 'Tag', 'TimeStamp', 'Timestamp',
    'UnavailableSamples', 'UnitCode', 'UnitLabel', 'UpperLimit',
    'Validity', 'Value', 'ValuePhysioId', 'WaveId', 'WaveSamples',
    'Weight', 'WeightUnit'
]

# Identifier completions discovered at runtime.
# NOTE(review): populated elsewhere in this module; presumably maps
# identifier strings seen in results - confirm against the completer
# code below.
_known_ids = {}
93 |
94 | def _get_completions(text):
95 | for t in _known_tables:
96 | if t.startswith(text):
97 | yield t
98 | for c in _known_columns:
99 | if c.startswith(text):
100 | yield c
101 | if text.startswith("'"):
102 | prefix = text[1:3]
103 | if prefix in _known_ids:
104 | for z in _known_ids[prefix]:
105 | if z.startswith(text):
106 | yield z
107 |
108 | def _add_known_uuid(val):
109 | s = repr(str(val))
110 | prefix = s[1:3]
111 | if prefix not in _known_ids:
112 | _known_ids[prefix] = set()
113 | _known_ids[prefix].add(s)
114 |
115 | _ctext = ''
116 | _ccompl = []
117 |
118 | def _completer(text, state):
119 | global _ctext, _ccompl
120 | if text != _ctext:
121 | _ctext = text
122 | _ccompl = sorted(_get_completions(text))
123 | if state < len(_ccompl):
124 | return _ccompl[state]
125 | else:
126 | return None
127 |
128 | ################################################################
129 |
130 | _uuid_pattern = re.compile('\A[0-9a-f]{8}(?:-[0-9a-f]{4}){3}-[0-9a-f]{12}\Z',
131 | re.ASCII | re.IGNORECASE)
132 |
133 | if sys.stdout.isatty() and os.environ.get('TERM', 'dumb') != 'dumb':
134 | _vcolor = ['\033[0m'] + ['\033[%dm' % i for i in range(31, 37)]
135 | _hcolor = ['\033[0;1m'] + ['\033[%d;1m' % i for i in range(31, 37)]
136 | _color0 = '\033[0m'
137 | else:
138 | _vcolor = _hcolor = ['']
139 | _color0 = ''
140 |
141 | _max_align_width = 64
142 | _align_group_size = 20
143 |
144 | def _format_value(val, desc=None):
145 | if isinstance(val, bool):
146 | return repr(val)
147 | elif isinstance(val, Decimal) or isinstance(val, int):
148 | if desc == 'Color' and val < 0 and val >= -16777216:
149 | return '#%06x' % (val + 16777216)
150 | return '{:n}'.format(val)
151 | elif isinstance(val, UUID):
152 | _add_known_uuid(val)
153 | return repr(str(val))
154 | elif isinstance(val, str):
155 | if _uuid_pattern.match(val):
156 | _add_known_uuid(UUID(val))
157 | return repr(val)
158 | else:
159 | return repr(val)
160 |
161 | def _value_alignment(val):
162 | return (isinstance(val, str)
163 | or isinstance(val, bytes)
164 | or isinstance(val, UUID))
165 |
166 | def _pad(text, width, leftalign):
167 | if leftalign:
168 | return text.ljust(width)
169 | else:
170 | return text.rjust(width)
171 |
def _show_results(cur, colinfo, results, setindex, transpose=False):
    """Format and print one group of result rows.

    colinfo is a mutable list of [width, leftalign, label] entries,
    shared across successive calls for the same result set and
    updated in place, so later groups stay aligned with earlier ones.
    An empty colinfo marks the first group: column labels are taken
    from cur.description and headers are printed.  setindex selects
    the color rotation for this result set; transpose prints one line
    per column instead of one line per row.
    """
    # A fresh (empty) colinfo means this is the first group of rows.
    headers = (len(colinfo) == 0)
    headerwidth = 0
    if headers:
        for desc in cur.description:
            if transpose:
                colinfo.append([0, None, desc[0]])
                headerwidth = max(headerwidth, len(desc[0]))
            else:
                # Start each column at least as wide as its label.
                colinfo.append([len(desc[0]), None, desc[0]])
    table = []
    for row in results:
        # Rows may carry more columns than cur.description advertised.
        while len(colinfo) < len(row):
            colinfo.append([0, None, ''])
        tabrow = []
        for (i, value) in enumerate(row):
            text = _format_value(value, colinfo[i][2])
            width = len(text)
            # Exceptionally wide values do not stretch the column.
            if width < _max_align_width:
                colinfo[i][0] = max(colinfo[i][0], width)
            # Alignment is fixed by the first non-null value seen.
            if value is not None and colinfo[i][1] is None:
                colinfo[i][1] = _value_alignment(value)
            tabrow.append(text)
        table.append(tabrow)

    if transpose:
        # One output line per column: the header, then that column's
        # value from each buffered row.
        cellwidth = max(ci[0] for ci in colinfo)
        for (i, (_, leftalign, label)) in enumerate(colinfo):
            sys.stdout.write(_hcolor[(i + setindex) % len(_hcolor)])
            sys.stdout.write(_pad(label, headerwidth, True))
            sys.stdout.write(_color0)
            sys.stdout.write(_vcolor[(i + setindex) % len(_vcolor)])
            for tabrow in table:
                try:
                    text = tabrow[i]
                except IndexError:
                    # Short row: this column has no value here.
                    text = ''
                sys.stdout.write(' ')
                sys.stdout.write(_pad(text, cellwidth, leftalign))
            sys.stdout.write(_color0 + '\n')
    else:
        if headers:
            for (i, (width, leftalign, label)) in enumerate(colinfo):
                if i > 0:
                    sys.stdout.write(' ')
                sys.stdout.write(_hcolor[(i + setindex) % len(_hcolor)])
                sys.stdout.write(_pad(label, width, leftalign))
            sys.stdout.write(_color0 + '\n')
        for tabrow in table:
            for (i, text) in enumerate(tabrow):
                if i > 0:
                    sys.stdout.write(' ')
                sys.stdout.write(_vcolor[(i + setindex) % len(_vcolor)])
                (width, leftalign, _) = colinfo[i]
                sys.stdout.write(_pad(text, width, leftalign))
            sys.stdout.write(_color0 + '\n')
228 |
def _run_query(conn, query, params):
    """Execute query with params and pretty-print all result sets.

    A query beginning with '@transpose' is displayed with rows and
    columns swapped.  Timing and row-count information is printed
    after the last result set.
    """
    if query == '':
        return
    if re.match(r'@transpose\s', query):
        transpose = True
        query = query[len('@transpose'):]
    else:
        transpose = False
    with conn.cursor() as cur:
        begin = time.monotonic()
        cur.execute(query, params)

        more_results = True
        setindex = 0
        while more_results:
            # Per-result-set column metadata, updated in place by
            # _show_results as rows arrive.
            colinfo = []
            results = []
            row = cur.fetchone()
            while row is not None:
                results.append(row)
                if len(results) >= _align_group_size:
                    # Print in groups so alignment can adapt without
                    # buffering the whole result set.
                    _show_results(cur, colinfo, results, setindex, transpose)
                    results = []
                row = cur.fetchone()
            _show_results(cur, colinfo, results, setindex, transpose)
            more_results = cur.nextset()
            setindex += 1
            if more_results:
                print()

        end = time.monotonic()
        print('(%d rows; %.3f seconds)' % (cur.rowcount, end - begin))
        print()
263 |
264 | ################################################################
265 |
def main():
    """Interactive SQL shell: read queries, run them, print results.

    Reads queries from standard input (with readline editing,
    tab-completion, and optional persistent history via the
    DWCSQL_HISTFILE environment variable) and executes them against
    the server named on the command line.
    """
    # Use the user's locale so '{:n}' number formatting groups digits.
    locale.setlocale(locale.LC_ALL, '')

    p = ArgumentParser()
    p.add_argument('--server', metavar = 'NAME', default = 'demo')
    p.add_argument('--password-file', metavar = 'FILE',
                   default = 'server.conf')
    opts = p.parse_args()

    DWCDB.load_config(opts.password_file)

    db = DWCDB(opts.server)
    # Connection is opened lazily and dropped after errors.
    conn = None

    readline.set_completer_delims(' \t\n()[]=<>-+*?,')
    readline.parse_and_bind('tab: complete')
    readline.set_completer(_completer)

    histfile = os.environ.get('DWCSQL_HISTFILE', None)
    if histfile is not None:
        try:
            readline.read_history_file(histfile)
        except Exception:
            # Missing or unreadable history is not an error.
            pass
    readline.set_history_length(1000)

    try:
        while True:
            try:
                # Read one query; continuation lines are gathered
                # until the query ends with ';' (or a blank line).
                line = input(opts.server + '> ')
                query = line
                while line != '' and not query.endswith(';'):
                    line = input(' ' * len(opts.server) + '> ')
                    query += '\n' + line
                params = []
                if conn is None:
                    conn = db.connect()
                # Re-run the query, prompting for one additional
                # parameter each time the backend reports too few.
                while True:
                    try:
                        _run_query(conn, query, params)
                        break
                    except ParameterCountError as e:
                        # pprompt is always set before the input()
                        # below, since the bare except re-raises.
                        pprompt = (e.context or '?') + ' '
                        pass
                    except:
                        # Any other failure (including interrupts):
                        # drop the connection so the next query gets
                        # a fresh one, then report below.
                        conn.close()
                        conn = None
                        raise
                    line = input(pprompt)
                    params.append(ast.literal_eval(line.strip()))
            except KeyboardInterrupt:
                # Abandon the current query; reset terminal colors.
                print(_color0)
            except EOFError:
                print()
                return
            except Exception as e:
                # nasty hack to extract the human-readable message from a
                # pymssql exception... is there a proper way to do this?
                if (hasattr(e, 'args') and isinstance(e.args, tuple)
                    and len(e.args) == 2 and isinstance(e.args[1], bytes)):
                    msg = e.args[1].decode('UTF-8', errors = 'replace')
                else:
                    msg = str(e)
                print('%s%s:\n%s\n' % (_color0, type(e).__name__, msg))
    finally:
        # Persist command history even when exiting via an exception.
        if histfile is not None:
            try:
                readline.write_history_file(histfile)
            except Exception:
                pass
336 |
--------------------------------------------------------------------------------
/downcast/db/dwcbcp.py:
--------------------------------------------------------------------------------
1 | #
2 | # downcast - tools for unpacking patient data from DWC
3 | #
4 | # Copyright (c) 2018 Laboratory for Computational Physiology
5 | #
6 | # This program is free software: you can redistribute it and/or modify
7 | # it under the terms of the GNU General Public License as published by
8 | # the Free Software Foundation, either version 3 of the License, or
9 | # (at your option) any later version.
10 | #
11 | # This program is distributed in the hope that it will be useful,
12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | # GNU General Public License for more details.
15 | #
16 | # You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.
18 |
19 | import os
20 | import re
21 |
22 | from .bcp import *
23 |
24 | # Sorting order for each table
25 |
# Column by which the data files for each table are sorted.
_table_order_column = {
    '_Export.Alert_': 'TimeStamp',
    '_Export.BedTag_': 'Timestamp',
    '_Export.Enumeration_': 'Id',
    '_Export.EnumerationValue_': 'TimeStamp',
    '_Export.Numeric_': 'Id',
    '_Export.NumericValue_': 'TimeStamp',
    '_Export.Patient_': 'Timestamp',
    '_Export.PatientDateAttribute_': 'Timestamp',
    '_Export.PatientStringAttribute_': 'Timestamp',
    '_Export.PatientMapping_': 'Timestamp',
    '_Export.Wave_': 'Id',
    '_Export.WaveSample_': 'TimeStamp'
}

# Index keys for each table (columns for which a unique-ID index is
# built; tables not listed here get no index).

_table_id_columns = {
    '_Export.PatientMapping_': ['Id']
}

# Regular expression to identify start of a row.
#
# NOTE: these are bytes regex patterns, so the '#' lines inside the
# WaveSample_ triple-quoted literal below are verbose-mode regex
# comments (part of the pattern bytes), not Python comments.  The
# '()' empty group presumably marks the row-start position for the
# consumer of these patterns — TODO confirm against the bcp module.

_table_sync_pattern = {
    '_Export.Alert_': b'\n().',
    '_Export.BedTag_': b'\n().',
    '_Export.Enumeration_': b'\n().',
    '_Export.EnumerationValue_': b'\n().',
    '_Export.Numeric_': b'\n().',
    '_Export.NumericValue_': b'\n().',
    '_Export.Patient_': b'\n().',
    '_Export.PatientDateAttribute_': b'\n().',
    '_Export.PatientStringAttribute_': b'\n().',
    '_Export.PatientMapping_': b'\n().',
    '_Export.Wave_': b'\n().',
    '_Export.WaveSample_': b'''(?x)
        # UnavailableSamples
        [ 0-9\0]* [\t]
        # InvalidSamples
        [ 0-9\0]* [\t]
        # PacedPulses
        [ 0-9\0]* [\t]
        # MappingId
        [0-9A-F]{8}-[0-9A-F]{4}-[0-9A-F]{4}-[0-9A-F]{4}-[0-9A-F]{12} [\n]
        ()
        # WaveId
        \d+ [\t]
        # TimeStamp
        \d{4}-\d{2}-\d{2} [ ] \d{2}:\d{2}:\d{2}\.\d+ [ ] [-+]\d{2}:\d{2} [\t]
        # SequenceNumber
        \d+ [\t]
        '''
}
79 |
80 | # List of columns and types
81 |
# Column name -> column type for every supported table.  The type
# constants (DATETIME, INTEGER, ...) come from the .bcp package.
_table_columns = {
    '_Export.Alert_': {
        'TimeStamp': DATETIME,
        'SequenceNumber': INTEGER,
        'AlertId': UUID,
        'Source': INTEGER,
        'Code': INTEGER,
        'Label': STRING,
        'Severity': INTEGER,
        'Kind': INTEGER,
        'IsSilenced': BOOLEAN,
        'SubtypeId': INTEGER,
        'AnnounceTime': DATETIME,
        'OnsetTime': DATETIME,
        'EndTime': DATETIME,
        'MappingId': UUID,
    },
    '_Export.BedTag_': {
        'BedLabel': STRING,
        'Timestamp': DATETIME,
        'Tag': STRING,
    },
    '_Export.Enumeration_': {
        'Id': INTEGER,
        'BasePhysioId': INTEGER,
        'PhysioId': INTEGER,
        'Label': STRING,
        'ValuePhysioId': INTEGER,
        'IsAperiodic': BOOLEAN,
        'IsManual': BOOLEAN,
        'Validity': INTEGER,
        'UnitCode': INTEGER,
        'UnitLabel': STRING,
        'Color': INTEGER,
    },
    '_Export.EnumerationValue_': {
        'EnumerationId': INTEGER,
        'TimeStamp': DATETIME,
        'SequenceNumber': INTEGER,
        'CompoundValueId': UUID,
        'Value': STRING,
        'MappingId': UUID,
    },
    '_Export.Numeric_': {
        'Id': INTEGER,
        'BasePhysioId': INTEGER,
        'PhysioId': INTEGER,
        'Label': STRING,
        'IsAperiodic': BOOLEAN,
        'UnitLabel': STRING,
        'Validity': INTEGER,
        'LowerLimit': NUMBER,
        'UpperLimit': NUMBER,
        'IsAlarmingOff': BOOLEAN,
        'SubPhysioId': INTEGER,
        'SubLabel': STRING,
        'Color': INTEGER,
        'IsManual': BOOLEAN,
        'MaxValues': INTEGER,
        'Scale': INTEGER,
    },
    '_Export.NumericValue_': {
        'NumericId': INTEGER,
        'TimeStamp': DATETIME,
        'SequenceNumber': INTEGER,
        'IsTrendUploaded': BOOLEAN,
        'CompoundValueId': UUID,
        'Value': NUMBER,
        'MappingId': UUID,
    },
    '_Export.Patient_': {
        'Id': UUID,
        'Timestamp': DATETIME,
        'BedLabel': STRING,
        'Alias': STRING,
        'Category': INTEGER,
        'Height': NUMBER,
        'HeightUnit': INTEGER,
        'Weight': NUMBER,
        'WeightUnit': INTEGER,
        'PressureUnit': INTEGER,
        'PacedMode': INTEGER,
        'ResuscitationStatus': INTEGER,
        'AdmitState': INTEGER,
        'ClinicalUnit': STRING,
        'Gender': INTEGER,
    },
    '_Export.PatientDateAttribute_': {
        'PatientId': UUID,
        'Timestamp': DATETIME,
        'Name': STRING,
        'Value': STRING, # actually a date but who cares
    },
    '_Export.PatientStringAttribute_': {
        'PatientId': UUID,
        'Timestamp': DATETIME,
        'Name': STRING,
        'Value': STRING,
    },
    '_Export.PatientMapping_': {
        'Id': UUID,
        'PatientId': UUID,
        'Timestamp': DATETIME,
        'IsMapped': BOOLEAN,
        'Hostname': STRING,
    },
    '_Export.Wave_': {
        'Id': INTEGER,
        'BasePhysioId': INTEGER,
        'PhysioId': INTEGER,
        'Label': STRING,
        'Channel': INTEGER,
        'SamplePeriod': INTEGER,
        'IsSlowWave': BOOLEAN,
        'IsDerived': BOOLEAN,
        'Color': INTEGER,
        'LowEdgeFrequency': NUMBER,
        'HighEdgeFrequency': NUMBER,
        'ScaleLower': INTEGER,
        'ScaleUpper': INTEGER,
        'CalibrationScaledLower': INTEGER,
        'CalibrationScaledUpper': INTEGER,
        'CalibrationAbsLower': NUMBER,
        'CalibrationAbsUpper': NUMBER,
        'CalibrationType': INTEGER,
        'UnitLabel': STRING,
        'UnitCode': INTEGER,
        'EcgLeadPlacement': INTEGER,
    },
    '_Export.WaveSample_': {
        'WaveId': INTEGER,
        'TimeStamp': DATETIME,
        'SequenceNumber': INTEGER,
        'WaveSamples': BINARY,
        'UnavailableSamples': STRING,
        'InvalidSamples': STRING,
        'PacedPulses': STRING,
        'MappingId': UUID,
    }
}
222 |
class DWCBCPConnection(BCPConnection):
    """BCP-file-backed read-only view of a set of exported DWC tables."""

    def __init__(self, datadirs):
        BCPConnection.__init__(self)
        for d in datadirs:
            self.add_data_dir(d)

    def add_data_dir(self, dirname):
        """
        Import a directory of data files into the database.

        An example data directory might contain the following:

            Alert.20010101_20010102
            Alert.fmt
            BedTag.20010101_20010102
            BedTag.fmt
            Enumeration
            Enumeration.fmt
            EnumerationValue.20010101_20010102
            EnumerationValue.fmt
            Numeric
            Numeric.fmt
            NumericValue.20010101_20010102
            NumericValue.fmt
            Patient.20010101_20010102
            Patient.fmt
            PatientDateAttribute.20010101_20010102
            PatientDateAttribute.fmt
            PatientMapping.20010101_20010102
            PatientMapping.fmt
            PatientStringAttribute.20010101_20010102
            PatientStringAttribute.fmt
            Wave
            Wave.fmt
            WaveSample.20010101_20010102
            WaveSample.fmt

        For example, 'Alert.20010101_20010102' contains Alert data
        between those two dates, and 'Alert.fmt' is a freebcp format
        file describing the format of 'Alert.20010101_20010102'.

        The 'Enumeration', 'Numeric', and 'Wave' tables are not
        specific to the time period.  For those tables, the most
        recently imported file replaces any previous files.

        For the other tables, all data files are concatenated in the
        order that they are imported.  All of these files must be
        sorted by timestamp, and must not overlap.
        """

        # Raw strings: \A, \., and \Z are regex syntax, and without
        # the r prefix they are (invalid) string escape sequences
        # that trigger warnings on modern Python.
        meta_pat = re.compile(r'\A(?:Enumeration|Numeric|Wave)(?:\.dat)?\Z')
        data_pat = re.compile(r'\.(?:dat|[0-9]+_[0-9]+)\Z')
        for f in sorted(os.listdir(dirname)):
            path = os.path.join(dirname, f)
            base = f.split('.')[0]
            table = '_Export.%s_' % base
            fmtpath = os.path.join(dirname, base + '.fmt')
            if meta_pat.search(f):
                # Time-independent metadata table: replace prior data.
                self.add_data_file(table, path, fmtpath, True)
            elif data_pat.search(f):
                # Time-ranged data file: append to prior data.
                self.add_data_file(table, path, fmtpath, False)

    def add_data_file(self, table, data_file, format_file, replace = False):
        """
        Import a file into the database.

        table is the name of the table, such as '_Export.Alert_'.

        data_file is the name of the raw data file; format_file is the
        name of the corresponding freebcp format file.  (Note that
        only a very small subset of the possible freebcp formats are
        supported.)

        If replace is true, the new data file replaces all previously
        imported data; otherwise, it is concatenated onto the end of
        the preceding files.
        """

        tbl = self.add_table(table)
        tbl.set_sync_pattern(_table_sync_pattern[table])
        tbl.set_order(_table_order_column[table])
        for (col, dtype) in _table_columns[table].items():
            tbl.add_column(col, dtype)
        for col in _table_id_columns.get(table, []):
            tbl.add_unique_id(col)
        if replace:
            tbl.clear()
        tbl.add_data_file(data_file, format_file)
311 |
312 | #### DB-API ####
313 |
def connect(datadirs):
    """DB-API-style module entry point: open a connection backed by
    the given sequence of BCP data directories."""
    return DWCBCPConnection(datadirs)
316 |
--------------------------------------------------------------------------------
/downcast/subprocess.py:
--------------------------------------------------------------------------------
1 | #
2 | # downcast - tools for unpacking patient data from DWC
3 | #
4 | # Copyright (c) 2017 Laboratory for Computational Physiology
5 | #
6 | # This program is free software: you can redistribute it and/or modify
7 | # it under the terms of the GNU General Public License as published by
8 | # the Free Software Foundation, either version 3 of the License, or
9 | # (at your option) any later version.
10 | #
11 | # This program is distributed in the hope that it will be useful,
12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | # GNU General Public License for more details.
15 | #
16 | # You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.
18 |
19 | from enum import Enum
20 | from multiprocessing import Process, Pipe, current_process
21 | import atexit
22 | import traceback
23 | import logging
24 | import cProfile
25 | import os
26 | import sys
27 |
28 | from .dispatcher import Dispatcher
29 | from .util import setproctitle
30 |
class ParallelDispatcher:
    """Dispatcher that fans messages out to a pool of worker processes.

    Each incoming message is forwarded to one of N child processes,
    chosen from the message's channel: all messages on a given
    channel reach the same worker, but no other routing guarantee is
    made, so related messages must share a channel.

    Apart from distributing the workload and operating
    asynchronously, this class's API is largely compatible with the
    API of the Dispatcher class.
    """

    def __init__(self, n_children, pending_limit = 200, **kwargs):
        self.n_children = n_children
        self.pending_limit = pending_limit
        self.children = None
        self.dispatcher = Dispatcher(**kwargs)
        # Report uncaught exceptions the same way the workers do.
        sys.excepthook = _handle_fatal_exception

    def add_handler(self, handler):
        """Add a message handler.

        All handlers must be attached before the child processes are
        launched; i.e., before sending any messages.
        """
        if self.children is not None:
            raise Exception('cannot add handlers after sending messages')
        self.dispatcher.add_handler(handler)

    def add_dead_letter_handler(self, handler):
        """Add a dead-letter handler.

        All handlers must be attached before the child processes are
        launched; i.e., before sending any messages.
        """
        if self.children is not None:
            raise Exception('cannot add handlers after sending messages')
        self.dispatcher.add_dead_letter_handler(handler)

    def _start(self):
        # Lazily launch the worker pool on first use.
        if self.children is None:
            self.children = [
                ChildConnector(self.dispatcher,
                               pending_limit = self.pending_limit,
                               name = ('handler%d' % index))
                for index in range(self.n_children)]
            atexit.register(self.shutdown)

    def shutdown(self):
        """Stop all worker processes and wait for them to exit.

        Typically flush should be called first.
        """
        if self.children is None:
            return
        atexit.unregister(self.shutdown)
        for worker in self.children:
            worker.close()
        self.children = None

    def send_message(self, channel, message, source, ttl):
        """Submit a new message.

        Note that message acknowledgements, as well as exceptions,
        will be reported asynchronously.  In particular, if this
        function raises an exception, it may actually be the result
        of some earlier message.
        """
        self._start()
        worker = self.children[hash(channel) % self.n_children]
        worker.send_message(channel, message, source, ttl)

    def flush(self):
        """Flush pending output to disk.

        Any pending acknowledgements or exceptions will be processed
        before flushing.  If this function raises an exception, it
        may actually be the result of some earlier message.
        """
        self._start()
        for worker in self.children:
            worker.flush_begin()
        for worker in self.children:
            worker.flush_end()

    def terminate(self):
        """Force expiration of all pending messages."""
        self._start()
        for worker in self.children:
            worker.terminate()
124 |
class ChildConnector:
    """Object that routes messages to a child process.

    Requests are streamed to the child over a multiprocessing Pipe.
    Up to pending_limit requests may be in flight before the parent
    synchronizes (see _sync_response) to collect acknowledgements and
    any exception raised in the child.
    """

    # Parent-side pipe ends of every live connector.  Freshly forked
    # children close these inherited descriptors (see
    # ChildContext._main) so that pipe EOF is delivered correctly.
    _all_pipes = set()

    def __init__(self, handler, pending_limit = 50, name = None):
        self.pending_limit = pending_limit
        # Requests we may still send before we must synchronize.
        self.pending_count = pending_limit
        # Unacknowledged messages, keyed by message ID.
        self.messages = {}
        self.message_id = 0

        (parent_pipe, child_pipe) = Pipe()
        ChildConnector._all_pipes.add(parent_pipe)
        self.child = ChildContext(handler)
        self.process = Process(target = self.child._main,
                               args = (name, child_pipe),
                               name = name)
        self.process.start()
        self.parent_pipe = parent_pipe
        # The child inherited its own copy of child_pipe; close ours
        # so the child holds the only remaining end.
        child_pipe.close()

    def close(self):
        """Shut down the child process."""
        try:
            # Collect outstanding acks/exceptions before asking the
            # child to exit; log (rather than raise) at this point.
            if self.pending_count != self.pending_limit:
                try:
                    self._sync_response()
                except Exception:
                    logging.exception('Unhandled exception in child process')
            self.parent_pipe.send(ChildRequest.EXIT)
        finally:
            self.parent_pipe.close()
            ChildConnector._all_pipes.discard(self.parent_pipe)
            self.process.join()

    def send_message(self, channel, message, source, ttl):
        """Send a message to the child process."""
        if ttl <= 0:
            # Expiring message: deliver it and synchronize so the
            # expiration is processed before we return.
            self._async_message(channel, message, source, ttl)
            self._sync_response()
        else:
            # Nack up front; the ack arrives asynchronously later.
            source.nack_message(channel, message, self)
            self._async_message(channel, message, source, ttl)

    def flush_begin(self):
        """Instruct the child process to flush output to disk."""
        self._async_request(ChildRequest.FLUSH)

    def flush_end(self):
        """Wait for the child process to finish flushing output."""
        self._sync_response()

    def terminate(self):
        """Force expiration of all pending messages."""
        self._async_request(ChildRequest.TERMINATE)

    def _async_message(self, channel, message, source, ttl):
        # Assign a fresh ID and remember the message until it is
        # acknowledged by the child.
        self.message_id += 1
        msgid = self.message_id
        self.messages[msgid] = (channel, message, source)
        self._async_request((msgid, channel, message, ttl))

    def _async_request(self, request):
        # If the in-flight window is exhausted, synchronize first.
        if self.pending_count <= 0:
            self._sync_response()
        self.parent_pipe.send(request)
        self.pending_count -= 1

    def _sync_response(self):
        # Ask the child for its accumulated acknowledgements (and any
        # stored exception), then apply them here.
        self.parent_pipe.send(ChildRequest.SYNC_RESPONSE)
        (acks, exc, exc_msg) = self.parent_pipe.recv()
        for ackid in acks:
            m = self.messages.pop(ackid, None)
            if m is None:
                logging.warning('ack for an unknown message')
            else:
                (channel, message, source) = m
                source.ack_message(channel, message, self)
        if exc is not None:
            if isinstance(exc, BorkedPickleException):
                # The failing message is probably the one after the
                # last message the child managed to decode.
                m = self.messages.get(exc.last_seen_msgid + 1, (None, None))
                desc = ('Failed to send/receive a message;' +
                        ' pending channel=%r, message=%r') % (m[0], m[1])
                exc = TypeError(desc)
            raise exc from Exception(exc_msg)
        self.pending_count = self.pending_limit
211 |
class ChildContext:
    """Child-process side of a ChildConnector.

    Runs inside the forked worker: receives requests over the pipe,
    forwards messages to the wrapped handler, and accumulates
    acknowledgements to return on the next SYNC_RESPONSE.
    """

    def __init__(self, handler):
        self.handler = handler
        # Maps (channel, message) back to the parent's message ID.
        self.message_ids = {}
        # Message IDs acknowledged since the last SYNC_RESPONSE.
        self.acks = []
        self.pipe = None

    def _main(self, name, child_pipe):
        """Process entry point for the worker."""
        try:
            # Close all of the parent-side pipes that were created
            # previously (and inherited by the child process.)
            # Unfortunately we can't simply close all file
            # descriptors, or even all 'non-inheritable' file
            # descriptors, as that breaks pymssql.
            for p in ChildConnector._all_pipes:
                p.close()
            ChildConnector._all_pipes = set()

            if name is not None:
                setproctitle('downcast:%s' % (name,))

            self.pipe = child_pipe
            # Optionally profile this worker, writing results to
            # $DOWNCAST_PROFILE_OUT.<name>.
            pf = os.environ.get('DOWNCAST_PROFILE_OUT', None)
            if pf is not None and name is not None:
                pf = '%s.%s' % (pf, name)
                cProfile.runctx('self._main1()', globals(), locals(), pf)
            else:
                self._main1()
        except:
            _handle_fatal_exception(*sys.exc_info())
            sys.exit(1)

    def _main1(self):
        """Main request loop: dispatch requests until EOF or EXIT."""
        try:
            msgid = 0
            while True:
                try:
                    req = self.pipe.recv()
                except EOFError:
                    return
                except (OSError, MemoryError):
                    raise
                except Exception as e:
                    # We assume that all other exceptions that occur
                    # here result from an error in the process of
                    # unpickling the message (or, potentially, the
                    # channel.)  Such exceptions can occur even
                    # without raising an exception on the sender side,
                    # and the resulting error message is generally
                    # unhelpful in the extreme.  Thus, we send back an
                    # exception that indicates the *last message ID
                    # that we were able to decode*; the sender, upon
                    # receiving such an exception, can identify the
                    # message that was (probably) the cause of the
                    # exception.
                    #
                    # (We assume that there are never problems with
                    # pickling/unpickling ChildRequests, nor 'msgid'
                    # or 'ttl' values.)
                    raise BorkedPickleException(msgid) from e

                if isinstance(req, tuple):
                    (msgid, channel, message, ttl) = req
                    self.message_ids[channel, message] = msgid
                    self.handler.send_message(channel, message, self, ttl)
                elif req is ChildRequest.SYNC_RESPONSE:
                    resp = (self.acks, None, None)
                    self.acks = []
                    self.pipe.send(resp)
                elif req is ChildRequest.FLUSH:
                    self.handler.flush()
                elif req is ChildRequest.TERMINATE:
                    self.handler.terminate()
                elif req is ChildRequest.EXIT:
                    return
        except Exception as exc:
            # A handler failed: remember the exception and keep
            # draining the pipe so the parent can retrieve it at its
            # next synchronization point.
            exc_msg = traceback.format_exc()
            while True:
                try:
                    req = self.pipe.recv()
                except EOFError:
                    return
                except (OSError, MemoryError):
                    raise
                except Exception:
                    req = None
                if req is ChildRequest.SYNC_RESPONSE:
                    resp = (self.acks, exc, exc_msg)
                    self.acks = []
                    self.pipe.send(resp)
                elif req is ChildRequest.EXIT:
                    return

    def nack_message(self, channel, message, handler):
        """Defer processing of a message."""
        pass

    def ack_message(self, channel, message, handler):
        """Acknowledge a message."""
        msgid = self.message_ids.pop((channel, message), None)
        if msgid is None:
            logging.warning('ack for an unknown message')
        else:
            self.acks.append(msgid)
317 |
318 |
319 | def _handle_fatal_exception(exc_type, exc_val, exc_tb):
320 | if exc_type is not SystemExit:
321 | hdr = '-------- %s --------\n' % current_process().name
322 | msg = traceback.format_exception(exc_type, exc_val, exc_tb)
323 | m = (hdr + ''.join(msg) + '\n').encode(sys.stderr.encoding,
324 | errors = 'replace')
325 | sys.stderr.flush()
326 | os.write(sys.stderr.fileno(), m)
327 |
class ChildRequest(Enum):
    # Control requests sent from the parent (ChildConnector) to the
    # child (ChildContext) over the pipe, interleaved with ordinary
    # (msgid, channel, message, ttl) message tuples.
    SYNC_RESPONSE = 0   # reply with accumulated acks and any exception
    FLUSH = 1           # flush handler output to disk
    TERMINATE = 2       # force expiration of all pending messages
    EXIT = 3            # shut down the child process
333 |
class BorkedPickleException(Exception):
    """Raised in the child when a received message fails to unpickle.

    last_seen_msgid is the ID of the last message that was decoded
    successfully; the offending message is therefore probably the
    next one the sender submitted (see ChildConnector._sync_response).
    """
    def __init__(self, last_seen_msgid):
        # Pass the ID to Exception.__init__ so self.args is set
        # explicitly; the args tuple is what lets this exception be
        # pickled back across the process boundary.
        super().__init__(last_seen_msgid)
        self.last_seen_msgid = last_seen_msgid
337 |
--------------------------------------------------------------------------------
/downcast/messages.py:
--------------------------------------------------------------------------------
1 | #
2 | # downcast - tools for unpacking patient data from DWC
3 | #
4 | # Copyright (c) 2017 Laboratory for Computational Physiology
5 | #
6 | # This program is free software: you can redistribute it and/or modify
7 | # it under the terms of the GNU General Public License as published by
8 | # the Free Software Foundation, either version 3 of the License, or
9 | # (at your option) any later version.
10 | #
11 | # This program is distributed in the hope that it will be useful,
12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | # GNU General Public License for more details.
15 | #
16 | # You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.
18 |
19 | from collections import namedtuple
20 | import struct
21 | import uuid
22 |
23 | ################################################################
24 |
# _Export.WaveSample_
# One row of waveform sample data.  Immutable namedtuple: instances
# are hashable, which the dispatcher relies on (messages are used as
# dictionary keys when tracking acknowledgements).
WaveSampleMessage = namedtuple('WaveSampleMessage', (
    # The original data source (required for looking up wave_ids.)
    'origin',

    # An opaque identifier (probably a small integer) for the waveform
    # attributes.  I am hoping that those attributes are immutable
    # (e.g. same signal with different gain/baseline will use a
    # different ID.)  Underlying type is 'bigint'.
    'wave_id',

    # A timestamp (probably from DWC or SQL.)
    'timestamp',

    # Apparently a uniform counter (i.e., runs continuously, never
    # adjusted forward or backward) of Philips milliseconds.
    'sequence_number',

    # Byte array encoding wave samples as 16-bit little-endian
    # unsigned integers.  Note that users should probably assume that
    # indices corresponding to 'unavailable_samples' or
    # 'invalid_samples' contain garbage and should be ignored.
    'wave_samples',

    # String describing the intervals within 'wave_samples' that are
    # considered "unavailable".  Should be a list of ASCII decimal
    # numbers separated by spaces; each pair of numbers indicates the
    # start and end of an "unavailable" interval.
    'unavailable_samples',

    # String describing the intervals within 'wave_samples' that are
    # considered "invalid".  Should be a list of ASCII decimal numbers
    # separated by spaces; each pair of numbers indicates the start
    # and end of an "invalid" interval.  Indices start at zero and the
    # range is inclusive (e.g. "0 9" would indicate the first ten
    # samples.)
    'invalid_samples',

    # String (list of ASCII decimal numbers separated by spaces)
    # giving the relative sample numbers at which pacemaker pulses
    # occurred.
    'paced_pulses',

    # Should correspond to 'mapping_id' in PatientMappingMessage.
    'mapping_id'))
70 |
71 | ################################################################
72 |
# _Export.Alert_
AlertMessage = namedtuple('AlertMessage', [
    # Data source that produced this message.
    'origin',

    # Timestamp (likely assigned by DWC or SQL.)
    'timestamp',

    # Sequence number; what it corresponds to is unknown.
    'sequence_number',

    # Opaque identifier (likely a GUID) for this particular alarm.
    'alert_id',

    # Magic number identifying the "source" of the alarm; see
    # System_Parameter-Alerts_Table_Ed_2_-_PIIC_iX_Rel_B.00.xlsx
    # (Parameters? or Calculations?).  Underlying SQL type is
    # 'bigint'.
    'source',

    # Magic number identifying the "code" of the alarm; see
    # System_Parameter-Alerts_Table_Ed_2_-_PIIC_iX_Rel_B.00.xlsx
    # (Alarm-Code-Ids).  Underlying SQL type is 'integer'.
    'code',

    # Alarm message text.
    'label',

    # Magic number giving the alarm "severity".
    'severity',

    # Magic number giving the alarm category.
    'kind',

    # Whether the alarm has been silenced (?)
    'is_silenced',

    # Undocumented magic number.  Underlying SQL type is 'bigint'.
    'subtype_id',

    # Time the alarm was reported? (probably monitor-derived)
    'announce_time',

    # Time the triggering condition began? (probably monitor-derived)
    # An absurd value such as 0001-01-01 means the time is unknown.
    'onset_time',

    # Time that ??? ended (probably monitor-derived)
    # An absurd value such as 0001-01-01 means the alarm has not yet
    # ended.
    'end_time',

    # Matches 'mapping_id' in PatientMappingMessage.
    'mapping_id'])
126 |
127 | ################################################################
128 |
# _Export.EnumerationValue_
EnumerationValueMessage = namedtuple('EnumerationValueMessage', [
    # Data source that produced this message (needed to resolve
    # enumeration_ids.)
    'origin',

    # Opaque identifier (likely a small integer) naming the
    # observation attributes; assumed immutable.  Underlying SQL type
    # is 'bigint'.
    'enumeration_id',

    # Timestamp (likely assigned by DWC or SQL.)
    'timestamp',

    # Sequence number at which the observation was made.
    'sequence_number',

    # Opaque identifier (likely a GUID) tying together a set of
    # simultaneous, related observations (???)
    'compound_value_id',

    # Observed value, e.g. a beat label or a rhythm description.
    'value',

    # Matches 'mapping_id' in PatientMappingMessage.
    'mapping_id'])
155 |
156 | ################################################################
157 |
# _Export.NumericValue_
NumericValueMessage = namedtuple('NumericValueMessage', [
    # Data source that produced this message (needed to resolve
    # numeric_ids.)
    'origin',

    # Opaque identifier (likely a small integer) naming the
    # measurement attributes; assumed immutable.  Underlying SQL type
    # is 'bigint'.
    'numeric_id',

    # Timestamp (likely assigned by DWC or SQL.)
    'timestamp',

    # Sequence number at which the measurement was made.
    'sequence_number',

    # Supposedly flags values derived from "historic data loaded upon
    # bed association to PIIC iX".
    'is_trend_uploaded',

    # Opaque identifier (likely a GUID) tying together a set of
    # simultaneous, related measurements.
    'compound_value_id',

    # The measured value.
    'value',

    # Matches 'mapping_id' in PatientMappingMessage.
    'mapping_id'])
188 |
189 | ################################################################
190 |
# _Export.PatientMapping_
PatientMappingMessage = namedtuple('PatientMappingMessage', [
    # Data source that produced this message.
    'origin',

    # Opaque identifier (likely a GUID) for this record (the 'Id'
    # column of _Export.PatientMapping_.)
    'mapping_id',

    # Opaque identifier (likely a GUID) for the patient.
    'patient_id',

    # Timestamp of unknown origin; presumably when the information in
    # this message was last updated.
    'timestamp',

    # ???
    'is_mapped',

    # Presumably the original host from which the DWC system received
    # the message.
    'hostname'])
213 |
# _Export.Patient_
PatientBasicInfoMessage = namedtuple('PatientBasicInfoMessage', [
    # Data source that produced this message.
    'origin',

    # Opaque identifier (likely a GUID) for the patient.
    'patient_id',

    # Timestamp of unknown origin; presumably when the information in
    # this message was last updated.
    'timestamp',

    # Presumably the name of the bed the patient is assigned to.
    'bed_label',

    # ???
    'alias',

    # Magic number for the patient's age category.
    'category',

    # The patient's height.
    'height',

    # Magic number for the units of height.
    'height_unit',

    # The patient's weight.
    'weight',

    # Magic number for the units of weight.
    'weight_unit',

    # Magic number for the units of pressure.  (Why is this here?)
    'pressure_unit',

    # Magic number for whether the patient has a pacemaker.
    'paced_mode',

    # ???
    'resuscitation_status',

    # ???
    'admit_state',

    # Presumably the name of the care unit.
    'clinical_unit',

    # Magic number for sex.
    'gender'])
264 |
# _Export.BedTag_
BedTagMessage = namedtuple('BedTagMessage', [
    # Data source that produced this message.
    'origin',

    # Name of the bed.
    'bed_label',

    # Timestamp of unknown origin; presumably when the information in
    # this message was last updated.
    'timestamp',

    # Tag.  What is this?
    'tag'])
279 |
# _Export.PatientDateAttribute_
PatientDateAttributeMessage = namedtuple('PatientDateAttributeMessage', [
    # Data source that produced this message.
    'origin',

    # Opaque identifier (likely a GUID) for the patient.
    'patient_id',

    # Timestamp of unknown origin; presumably when the information in
    # this message was last updated.
    'timestamp',

    # Attribute name, such as "DOB".
    'name',

    # Attribute value.
    'value'])
297 |
# _Export.PatientStringAttribute_
PatientStringAttributeMessage = namedtuple('PatientStringAttributeMessage', [
    # Data source that produced this message.
    'origin',

    # Opaque identifier (likely a GUID) for the patient.
    'patient_id',

    # Timestamp of unknown origin; presumably when the information in
    # this message was last updated.
    'timestamp',

    # Attribute name.
    'name',

    # Attribute value.
    'value'])
315 |
316 | ################################################################
317 |
318 | def bcp_format_message(message):
319 | """Convert a message to BCP format.
320 |
321 | The argument must be an AlertMessage, BedTagMessage,
322 | EnumerationValueMessage, NumericValueMessage,
323 | PatientBasicInfoMessage, PatientDateAttributeMessage,
324 | PatientMappingMessage, PatientStringAttributeMessage, or
325 | WaveSampleMessage.
326 |
327 | The result is a byte string which can be written to a file and
328 | later parsed by freebcp or by the downcast.db.bcp module.
329 |
330 | Note that the result is not always identical to what freebcp
331 | itself would have produced, since UUIDs are sometimes "natively"
332 | written as lowercase and sometimes uppercase.
333 | """
334 | text = []
335 | for (field, value) in zip(message._fields, message):
336 | # ignore the internal "origin" field
337 | if field == 'origin':
338 | continue
339 | # special case for WaveSamples
340 | if field == 'wave_samples':
341 | ftext = struct.pack(' id
393 | text += str(i + 1) + ' ' + name + ' ""\n'
394 | return text
395 |
--------------------------------------------------------------------------------