├── entab-js ├── .cargo-ok ├── example │ ├── icons │ │ ├── favicon.ico │ │ ├── favicon-16x16.png │ │ ├── favicon-32x32.png │ │ ├── maskable_icon.png │ │ ├── apple-touch-icon.png │ │ ├── android-chrome-192x192.png │ │ └── android-chrome-512x512.png │ ├── service-worker.js │ └── app.webmanifest ├── .appveyor.yml ├── src │ ├── utils.rs │ └── lib.rs ├── README.md ├── Cargo.toml ├── LICENSE.md ├── tests │ └── web.rs └── .travis.yml ├── entab-py ├── rust-toolchain ├── entab │ ├── __init__.py │ └── spectra.py ├── pyproject.toml ├── README.md ├── Cargo.toml └── src │ └── raw_io_wrapper.rs ├── entab ├── fuzz │ ├── rust-toolchain │ ├── .gitignore │ ├── fuzz_targets │ │ ├── tsv.rs │ │ └── reader.rs │ └── Cargo.toml ├── tests │ ├── data │ │ ├── chemstation_mwd.d │ │ │ ├── RUN.M │ │ │ │ ├── RECALIB.MTH │ │ │ │ ├── FIA.REG │ │ │ │ ├── LAFC1.REG │ │ │ │ ├── LALS1.REG │ │ │ │ ├── LMWD1.REG │ │ │ │ ├── LPMP1.REG │ │ │ │ ├── LTHM1.REG │ │ │ │ ├── DAMETHOD.REG │ │ │ │ ├── INFO.MTH │ │ │ │ └── ACQ.MS │ │ │ ├── RUN.LOG │ │ │ ├── mwd1A.ch │ │ │ ├── mwd1B.ch │ │ │ ├── mwd1C.ch │ │ │ ├── mwd1D.ch │ │ │ ├── mwd1E.ch │ │ │ ├── ACQRES.REG │ │ │ ├── LCDIAG.REG │ │ │ ├── LAFC1FD.REG │ │ │ ├── MSACQINF.REG │ │ │ ├── ATUNES.TUN │ │ │ │ ├── NEGATIVE.U │ │ │ │ └── POSITIVE.U │ │ │ ├── SAMPLE.MAC │ │ │ └── result.ini │ │ ├── test.bam │ │ ├── bmp_24.png │ │ ├── small.RAW │ │ ├── test.csv.xz │ │ ├── test_fid.ch │ │ ├── test-0000.cf │ │ ├── test.csv.bz2 │ │ ├── test.csv.zst │ │ ├── b3_alkanes.dxf │ │ ├── bmp_indexed.png │ │ ├── test_179_fid.ch │ │ ├── carotenoid_extract.d │ │ │ ├── MSD1.MS │ │ │ ├── RUN.LOG │ │ │ ├── dad1.uv │ │ │ ├── ACQRES.REG │ │ │ ├── LCDIAG.REG │ │ │ ├── MSDIAG.REG │ │ │ ├── SAMPLE.XML │ │ │ ├── result.ini │ │ │ ├── LAFC1FD.REG │ │ │ ├── MSACQINF.REG │ │ │ ├── MSPARMS.txt │ │ │ ├── RUN.M │ │ │ │ ├── FIA.REG │ │ │ │ ├── INFO.MTH │ │ │ │ ├── LAFC1.REG │ │ │ │ ├── LALS1.REG │ │ │ │ ├── LDAD1.REG │ │ │ │ ├── LPMP1.REG │ │ │ │ ├── LTHM1.REG │ │ │ │ ├── RECALIB.MTH │ │ │ │ ├── 
DAMETHOD.REG │ │ │ │ └── ACQ.MS │ │ │ ├── SAMPLE.MAC.bak │ │ │ └── atunes.tun │ │ │ │ ├── NEGATIVE.U │ │ │ │ └── POSITIVE.U │ │ ├── masshunter_example │ │ │ ├── desktop.ini │ │ │ └── AcqData │ │ │ │ ├── DAD1.cd │ │ │ │ ├── DAD1.cg │ │ │ │ ├── DAD1.sd │ │ │ │ ├── DAD1.sp │ │ │ │ ├── TCC1.cd │ │ │ │ ├── TCC1.cg │ │ │ │ ├── MSScan.bin │ │ │ │ ├── MSProfile.bin │ │ │ │ ├── QuatPump1.cd │ │ │ │ ├── QuatPump1.cg │ │ │ │ ├── RJB_Airs2001FIA.m │ │ │ │ ├── 192_1.stg │ │ │ │ ├── DAD_1_MethodMetaData.XML │ │ │ │ ├── TCC_1_MethodMetaData.XML │ │ │ │ ├── DaMethod │ │ │ │ │ ├── Qual │ │ │ │ │ │ ├── info.xml │ │ │ │ │ │ └── method.xml │ │ │ │ │ └── Quant │ │ │ │ │ │ ├── info.xml │ │ │ │ │ │ └── method.xml │ │ │ │ ├── HiP-ALS_1_MethodMetaData.XML │ │ │ │ ├── QuatPump_1_MethodMetaData.XML │ │ │ │ ├── qqqacqmethod.xsd │ │ │ │ ├── info.xml │ │ │ │ ├── HiP-ALS_1_Pretreatment.XML │ │ │ │ ├── TCC_1.XML │ │ │ │ ├── HiP-ALS_1.XML │ │ │ │ ├── msquad.xsd │ │ │ │ ├── DAD_1.XML │ │ │ │ └── 192_1.xml │ │ │ │ ├── MSTS.xml │ │ │ │ ├── Contents.xml │ │ │ │ ├── Devices.xml │ │ │ │ ├── MSScan.xsd │ │ │ │ └── sample_info.xml │ │ ├── HTS_BD_LSR_II_Mixed_Specimen_001_D6_D06.fcs │ │ ├── test.sam │ │ └── sequence.fasta │ └── DATA_SOURCES.txt ├── src │ ├── parsers │ │ ├── thermo │ │ │ └── mod.rs │ │ ├── agilent │ │ │ ├── mod.rs │ │ │ └── chemstation_reg.rs │ │ ├── microsoft_common.rs │ │ ├── mod.rs │ │ ├── fasta.rs │ │ ├── fastq.rs │ │ ├── common.rs │ │ └── xml.rs │ ├── lib.rs │ ├── compression.rs │ └── record.rs ├── Cargo.toml ├── README.md └── benches │ └── benchmarks.rs ├── entab-r ├── .Rbuildignore ├── .gitignore ├── NAMESPACE ├── src │ ├── Makevars.ucrt │ ├── Makevars │ ├── Makevars.win │ └── lib.rs ├── man │ ├── cash-Reader-method.Rd │ ├── Reader-class.Rd │ ├── show-Reader-method.Rd │ ├── as.data.frame-Reader-method.Rd │ └── initialize-Reader-method.Rd ├── Cargo.toml ├── DESCRIPTION ├── LICENSE ├── R │ └── lib.R └── README.md ├── entab-cli ├── README.md ├── src │ ├── main.rs │ ├── lib.rs │ └── 
tsv_params.rs └── Cargo.toml ├── .gitignore ├── Cargo.toml ├── entab-benchmarks ├── README.md ├── Cargo.toml └── benches │ └── fasta.rs ├── LICENSE.md ├── .github └── workflows │ ├── website.yml │ ├── tests.yml │ └── publish.yml └── README.md /entab-js/.cargo-ok: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /entab-py/rust-toolchain: -------------------------------------------------------------------------------- 1 | nightly 2 | -------------------------------------------------------------------------------- /entab/fuzz/rust-toolchain: -------------------------------------------------------------------------------- 1 | nightly 2 | -------------------------------------------------------------------------------- /entab-r/.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^\.gitignore$ 2 | ^\.\.Rcheck$ 3 | -------------------------------------------------------------------------------- /entab-py/entab/__init__.py: -------------------------------------------------------------------------------- 1 | from ._entab import Reader 2 | 3 | -------------------------------------------------------------------------------- /entab/fuzz/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | target 3 | corpus 4 | artifacts 5 | -------------------------------------------------------------------------------- /entab-r/.gitignore: -------------------------------------------------------------------------------- 1 | src/libentab.so 2 | ..Rcheck 3 | entab.Rcheck 4 | -------------------------------------------------------------------------------- /entab/tests/data/chemstation_mwd.d/RUN.M/RECALIB.MTH: -------------------------------------------------------------------------------- 1 | CLEARMIX 2 | -------------------------------------------------------------------------------- 
/entab-cli/README.md: -------------------------------------------------------------------------------- 1 | This is the CLI using the entab parsing library. 2 | -------------------------------------------------------------------------------- /entab/tests/data/test.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bovee/entab/HEAD/entab/tests/data/test.bam -------------------------------------------------------------------------------- /entab/tests/data/bmp_24.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bovee/entab/HEAD/entab/tests/data/bmp_24.png -------------------------------------------------------------------------------- /entab/tests/data/small.RAW: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bovee/entab/HEAD/entab/tests/data/small.RAW -------------------------------------------------------------------------------- /entab/tests/data/test.csv.xz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bovee/entab/HEAD/entab/tests/data/test.csv.xz -------------------------------------------------------------------------------- /entab/tests/data/test_fid.ch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bovee/entab/HEAD/entab/tests/data/test_fid.ch -------------------------------------------------------------------------------- /entab/tests/data/test-0000.cf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bovee/entab/HEAD/entab/tests/data/test-0000.cf -------------------------------------------------------------------------------- /entab/tests/data/test.csv.bz2: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/bovee/entab/HEAD/entab/tests/data/test.csv.bz2 -------------------------------------------------------------------------------- /entab/tests/data/test.csv.zst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bovee/entab/HEAD/entab/tests/data/test.csv.zst -------------------------------------------------------------------------------- /entab/tests/data/b3_alkanes.dxf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bovee/entab/HEAD/entab/tests/data/b3_alkanes.dxf -------------------------------------------------------------------------------- /entab/tests/data/bmp_indexed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bovee/entab/HEAD/entab/tests/data/bmp_indexed.png -------------------------------------------------------------------------------- /entab/tests/data/test_179_fid.ch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bovee/entab/HEAD/entab/tests/data/test_179_fid.ch -------------------------------------------------------------------------------- /entab-js/example/icons/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bovee/entab/HEAD/entab-js/example/icons/favicon.ico -------------------------------------------------------------------------------- /entab-js/example/icons/favicon-16x16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bovee/entab/HEAD/entab-js/example/icons/favicon-16x16.png -------------------------------------------------------------------------------- /entab-js/example/icons/favicon-32x32.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/bovee/entab/HEAD/entab-js/example/icons/favicon-32x32.png -------------------------------------------------------------------------------- /entab-js/example/icons/maskable_icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bovee/entab/HEAD/entab-js/example/icons/maskable_icon.png -------------------------------------------------------------------------------- /entab-js/example/icons/apple-touch-icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bovee/entab/HEAD/entab-js/example/icons/apple-touch-icon.png -------------------------------------------------------------------------------- /entab/tests/data/chemstation_mwd.d/RUN.LOG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bovee/entab/HEAD/entab/tests/data/chemstation_mwd.d/RUN.LOG -------------------------------------------------------------------------------- /entab/tests/data/chemstation_mwd.d/mwd1A.ch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bovee/entab/HEAD/entab/tests/data/chemstation_mwd.d/mwd1A.ch -------------------------------------------------------------------------------- /entab/tests/data/chemstation_mwd.d/mwd1B.ch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bovee/entab/HEAD/entab/tests/data/chemstation_mwd.d/mwd1B.ch -------------------------------------------------------------------------------- /entab/tests/data/chemstation_mwd.d/mwd1C.ch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bovee/entab/HEAD/entab/tests/data/chemstation_mwd.d/mwd1C.ch -------------------------------------------------------------------------------- 
/entab/tests/data/chemstation_mwd.d/mwd1D.ch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bovee/entab/HEAD/entab/tests/data/chemstation_mwd.d/mwd1D.ch -------------------------------------------------------------------------------- /entab/tests/data/chemstation_mwd.d/mwd1E.ch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bovee/entab/HEAD/entab/tests/data/chemstation_mwd.d/mwd1E.ch -------------------------------------------------------------------------------- /entab/tests/data/carotenoid_extract.d/MSD1.MS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bovee/entab/HEAD/entab/tests/data/carotenoid_extract.d/MSD1.MS -------------------------------------------------------------------------------- /entab/tests/data/carotenoid_extract.d/RUN.LOG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bovee/entab/HEAD/entab/tests/data/carotenoid_extract.d/RUN.LOG -------------------------------------------------------------------------------- /entab/tests/data/carotenoid_extract.d/dad1.uv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bovee/entab/HEAD/entab/tests/data/carotenoid_extract.d/dad1.uv -------------------------------------------------------------------------------- /entab/tests/data/chemstation_mwd.d/ACQRES.REG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bovee/entab/HEAD/entab/tests/data/chemstation_mwd.d/ACQRES.REG -------------------------------------------------------------------------------- /entab/tests/data/chemstation_mwd.d/LCDIAG.REG: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/bovee/entab/HEAD/entab/tests/data/chemstation_mwd.d/LCDIAG.REG -------------------------------------------------------------------------------- /entab/tests/data/carotenoid_extract.d/ACQRES.REG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bovee/entab/HEAD/entab/tests/data/carotenoid_extract.d/ACQRES.REG -------------------------------------------------------------------------------- /entab/tests/data/carotenoid_extract.d/LCDIAG.REG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bovee/entab/HEAD/entab/tests/data/carotenoid_extract.d/LCDIAG.REG -------------------------------------------------------------------------------- /entab/tests/data/carotenoid_extract.d/MSDIAG.REG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bovee/entab/HEAD/entab/tests/data/carotenoid_extract.d/MSDIAG.REG -------------------------------------------------------------------------------- /entab/tests/data/carotenoid_extract.d/SAMPLE.XML: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bovee/entab/HEAD/entab/tests/data/carotenoid_extract.d/SAMPLE.XML -------------------------------------------------------------------------------- /entab/tests/data/carotenoid_extract.d/result.ini: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bovee/entab/HEAD/entab/tests/data/carotenoid_extract.d/result.ini -------------------------------------------------------------------------------- /entab/tests/data/chemstation_mwd.d/LAFC1FD.REG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bovee/entab/HEAD/entab/tests/data/chemstation_mwd.d/LAFC1FD.REG 
-------------------------------------------------------------------------------- /entab/tests/data/chemstation_mwd.d/MSACQINF.REG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bovee/entab/HEAD/entab/tests/data/chemstation_mwd.d/MSACQINF.REG -------------------------------------------------------------------------------- /entab/tests/data/chemstation_mwd.d/RUN.M/FIA.REG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bovee/entab/HEAD/entab/tests/data/chemstation_mwd.d/RUN.M/FIA.REG -------------------------------------------------------------------------------- /entab/tests/data/masshunter_example/desktop.ini: -------------------------------------------------------------------------------- 1 | [.ShellClassInfo] 2 | IconFile=C:\Windows\system32\agtMassHunter.ico 3 | IconIndex=0 4 | -------------------------------------------------------------------------------- /entab-js/example/icons/android-chrome-192x192.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bovee/entab/HEAD/entab-js/example/icons/android-chrome-192x192.png -------------------------------------------------------------------------------- /entab-js/example/icons/android-chrome-512x512.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bovee/entab/HEAD/entab-js/example/icons/android-chrome-512x512.png -------------------------------------------------------------------------------- /entab/tests/data/carotenoid_extract.d/LAFC1FD.REG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bovee/entab/HEAD/entab/tests/data/carotenoid_extract.d/LAFC1FD.REG -------------------------------------------------------------------------------- 
/entab/tests/data/carotenoid_extract.d/MSACQINF.REG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bovee/entab/HEAD/entab/tests/data/carotenoid_extract.d/MSACQINF.REG -------------------------------------------------------------------------------- /entab/tests/data/carotenoid_extract.d/MSPARMS.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bovee/entab/HEAD/entab/tests/data/carotenoid_extract.d/MSPARMS.txt -------------------------------------------------------------------------------- /entab/tests/data/chemstation_mwd.d/RUN.M/LAFC1.REG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bovee/entab/HEAD/entab/tests/data/chemstation_mwd.d/RUN.M/LAFC1.REG -------------------------------------------------------------------------------- /entab/tests/data/chemstation_mwd.d/RUN.M/LALS1.REG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bovee/entab/HEAD/entab/tests/data/chemstation_mwd.d/RUN.M/LALS1.REG -------------------------------------------------------------------------------- /entab/tests/data/chemstation_mwd.d/RUN.M/LMWD1.REG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bovee/entab/HEAD/entab/tests/data/chemstation_mwd.d/RUN.M/LMWD1.REG -------------------------------------------------------------------------------- /entab/tests/data/chemstation_mwd.d/RUN.M/LPMP1.REG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bovee/entab/HEAD/entab/tests/data/chemstation_mwd.d/RUN.M/LPMP1.REG -------------------------------------------------------------------------------- /entab/tests/data/chemstation_mwd.d/RUN.M/LTHM1.REG: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/bovee/entab/HEAD/entab/tests/data/chemstation_mwd.d/RUN.M/LTHM1.REG -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | target/ 2 | **/*.rs.bk 3 | Cargo.lock 4 | bin/ 5 | dist/ 6 | pkg/ 7 | wasm-pack.log 8 | venv/ 9 | __pycache__/ 10 | _entab.*.so 11 | -------------------------------------------------------------------------------- /entab/tests/data/carotenoid_extract.d/RUN.M/FIA.REG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bovee/entab/HEAD/entab/tests/data/carotenoid_extract.d/RUN.M/FIA.REG -------------------------------------------------------------------------------- /entab/tests/data/carotenoid_extract.d/RUN.M/INFO.MTH: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bovee/entab/HEAD/entab/tests/data/carotenoid_extract.d/RUN.M/INFO.MTH -------------------------------------------------------------------------------- /entab/tests/data/carotenoid_extract.d/RUN.M/LAFC1.REG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bovee/entab/HEAD/entab/tests/data/carotenoid_extract.d/RUN.M/LAFC1.REG -------------------------------------------------------------------------------- /entab/tests/data/carotenoid_extract.d/RUN.M/LALS1.REG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bovee/entab/HEAD/entab/tests/data/carotenoid_extract.d/RUN.M/LALS1.REG -------------------------------------------------------------------------------- /entab/tests/data/carotenoid_extract.d/RUN.M/LDAD1.REG: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/bovee/entab/HEAD/entab/tests/data/carotenoid_extract.d/RUN.M/LDAD1.REG -------------------------------------------------------------------------------- /entab/tests/data/carotenoid_extract.d/RUN.M/LPMP1.REG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bovee/entab/HEAD/entab/tests/data/carotenoid_extract.d/RUN.M/LPMP1.REG -------------------------------------------------------------------------------- /entab/tests/data/carotenoid_extract.d/RUN.M/LTHM1.REG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bovee/entab/HEAD/entab/tests/data/carotenoid_extract.d/RUN.M/LTHM1.REG -------------------------------------------------------------------------------- /entab/tests/data/carotenoid_extract.d/SAMPLE.MAC.bak: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bovee/entab/HEAD/entab/tests/data/carotenoid_extract.d/SAMPLE.MAC.bak -------------------------------------------------------------------------------- /entab/tests/data/chemstation_mwd.d/RUN.M/DAMETHOD.REG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bovee/entab/HEAD/entab/tests/data/chemstation_mwd.d/RUN.M/DAMETHOD.REG -------------------------------------------------------------------------------- /entab/tests/data/masshunter_example/AcqData/DAD1.cd: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bovee/entab/HEAD/entab/tests/data/masshunter_example/AcqData/DAD1.cd -------------------------------------------------------------------------------- /entab/tests/data/masshunter_example/AcqData/DAD1.cg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/bovee/entab/HEAD/entab/tests/data/masshunter_example/AcqData/DAD1.cg -------------------------------------------------------------------------------- /entab/tests/data/masshunter_example/AcqData/DAD1.sd: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bovee/entab/HEAD/entab/tests/data/masshunter_example/AcqData/DAD1.sd -------------------------------------------------------------------------------- /entab/tests/data/masshunter_example/AcqData/DAD1.sp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bovee/entab/HEAD/entab/tests/data/masshunter_example/AcqData/DAD1.sp -------------------------------------------------------------------------------- /entab/tests/data/masshunter_example/AcqData/TCC1.cd: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bovee/entab/HEAD/entab/tests/data/masshunter_example/AcqData/TCC1.cd -------------------------------------------------------------------------------- /entab/tests/data/masshunter_example/AcqData/TCC1.cg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bovee/entab/HEAD/entab/tests/data/masshunter_example/AcqData/TCC1.cg -------------------------------------------------------------------------------- /entab/tests/data/carotenoid_extract.d/RUN.M/RECALIB.MTH: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bovee/entab/HEAD/entab/tests/data/carotenoid_extract.d/RUN.M/RECALIB.MTH -------------------------------------------------------------------------------- /entab/tests/data/masshunter_example/AcqData/MSScan.bin: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/bovee/entab/HEAD/entab/tests/data/masshunter_example/AcqData/MSScan.bin -------------------------------------------------------------------------------- /entab/tests/data/carotenoid_extract.d/RUN.M/DAMETHOD.REG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bovee/entab/HEAD/entab/tests/data/carotenoid_extract.d/RUN.M/DAMETHOD.REG -------------------------------------------------------------------------------- /entab/tests/data/chemstation_mwd.d/ATUNES.TUN/NEGATIVE.U: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bovee/entab/HEAD/entab/tests/data/chemstation_mwd.d/ATUNES.TUN/NEGATIVE.U -------------------------------------------------------------------------------- /entab/tests/data/chemstation_mwd.d/ATUNES.TUN/POSITIVE.U: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bovee/entab/HEAD/entab/tests/data/chemstation_mwd.d/ATUNES.TUN/POSITIVE.U -------------------------------------------------------------------------------- /entab/tests/data/masshunter_example/AcqData/MSProfile.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bovee/entab/HEAD/entab/tests/data/masshunter_example/AcqData/MSProfile.bin -------------------------------------------------------------------------------- /entab/tests/data/masshunter_example/AcqData/QuatPump1.cd: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bovee/entab/HEAD/entab/tests/data/masshunter_example/AcqData/QuatPump1.cd -------------------------------------------------------------------------------- /entab/tests/data/masshunter_example/AcqData/QuatPump1.cg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/bovee/entab/HEAD/entab/tests/data/masshunter_example/AcqData/QuatPump1.cg -------------------------------------------------------------------------------- /entab/tests/data/HTS_BD_LSR_II_Mixed_Specimen_001_D6_D06.fcs: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bovee/entab/HEAD/entab/tests/data/HTS_BD_LSR_II_Mixed_Specimen_001_D6_D06.fcs -------------------------------------------------------------------------------- /entab/tests/data/carotenoid_extract.d/atunes.tun/NEGATIVE.U: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bovee/entab/HEAD/entab/tests/data/carotenoid_extract.d/atunes.tun/NEGATIVE.U -------------------------------------------------------------------------------- /entab/tests/data/carotenoid_extract.d/atunes.tun/POSITIVE.U: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bovee/entab/HEAD/entab/tests/data/carotenoid_extract.d/atunes.tun/POSITIVE.U -------------------------------------------------------------------------------- /entab/src/parsers/thermo/mod.rs: -------------------------------------------------------------------------------- 1 | /// Parsers for files from Thermo's isotopic mass specs 2 | pub mod thermo_iso; 3 | /// Parsers for Thermo "RAW" format 4 | pub mod thermo_raw; 5 | -------------------------------------------------------------------------------- /entab/tests/data/masshunter_example/AcqData/RJB_Airs2001FIA.m/192_1.stg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bovee/entab/HEAD/entab/tests/data/masshunter_example/AcqData/RJB_Airs2001FIA.m/192_1.stg -------------------------------------------------------------------------------- /entab-r/NAMESPACE: 
-------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(Reader) 4 | exportMethods(as.data.frame) 5 | importFrom(methods,new) 6 | useDynLib(libentab, .registration = TRUE) 7 | -------------------------------------------------------------------------------- /entab/tests/data/chemstation_mwd.d/SAMPLE.MAC: -------------------------------------------------------------------------------- 1 | Name SAMPLEINFO 2 | ON ERROR Print "" 3 | SAMPLETARGETMASS "" 4 | ON ERROR 5 | CLRRPTSMPLINFO 6 | SAMPLECALISTD 1,0 7 | SAMPLECALAMT 0,1,1 8 | REMOVE SAMPLEINFO 9 | RETURN 10 | -------------------------------------------------------------------------------- /entab-py/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["maturin>=1.0,<2.0"] 3 | build-backend = "maturin" 4 | 5 | [project] 6 | name = "entab" 7 | 8 | [tool.maturin] 9 | bindings = "pyo3" 10 | features = ["maturin"] 11 | module-name = "entab._entab" 12 | -------------------------------------------------------------------------------- /entab/tests/data/masshunter_example/AcqData/RJB_Airs2001FIA.m/DAD_1_MethodMetaData.XML: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | UseInjectorProgram 5 | 0 6 | 7 | -------------------------------------------------------------------------------- /entab/tests/data/masshunter_example/AcqData/RJB_Airs2001FIA.m/TCC_1_MethodMetaData.XML: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | UseInjectorProgram 5 | 0 6 | 7 | -------------------------------------------------------------------------------- /entab/tests/data/masshunter_example/AcqData/RJB_Airs2001FIA.m/DaMethod/Qual/info.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 2.0 4 | 5 | 0 6 | 
-------------------------------------------------------------------------------- /entab/tests/data/masshunter_example/AcqData/RJB_Airs2001FIA.m/DaMethod/Quant/info.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 2.0 4 | 5 | 0 6 | -------------------------------------------------------------------------------- /entab/tests/data/masshunter_example/AcqData/RJB_Airs2001FIA.m/HiP-ALS_1_MethodMetaData.XML: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | UseInjectorProgram 5 | 0 6 | 7 | -------------------------------------------------------------------------------- /entab/tests/data/masshunter_example/AcqData/RJB_Airs2001FIA.m/QuatPump_1_MethodMetaData.XML: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | UseInjectorProgram 5 | 0 6 | 7 | -------------------------------------------------------------------------------- /entab-r/src/Makevars.ucrt: -------------------------------------------------------------------------------- 1 | # Use GNU toolchain for R >= 4.2 2 | TOOLCHAIN = stable-gnu 3 | 4 | # Rtools42 doesn't have the linker in the location that cargo expects, so we 5 | # need to overwrite it via configuration. 6 | CARGO_LINKER = x86_64-w64-mingw32.static.posix-gcc.exe 7 | 8 | include Makevars.win 9 | -------------------------------------------------------------------------------- /entab-r/man/cash-Reader-method.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/lib.R 3 | \name{$,Reader-method} 4 | \alias{$,Reader-method} 5 | \title{Expose methods} 6 | \usage{ 7 | \S4method{$}{Reader}(x, name) 8 | } 9 | \description{ 10 | i.e. 
Reader$metadata(), Reader$headers(), and Reader$parser() 11 | } 12 | -------------------------------------------------------------------------------- /entab-r/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "entab-r" 3 | version = "0.3.3" 4 | authors = ["Roderick "] 5 | edition = "2018" 6 | 7 | [dependencies] 8 | entab_base = { package = "entab", git = "https://github.com/bovee/entab" } 9 | extendr-api = "0.6.0" 10 | libR-sys = "0.6.0" 11 | 12 | [lib] 13 | name = "entab" 14 | crate-type = ["cdylib"] 15 | -------------------------------------------------------------------------------- /entab-r/man/Reader-class.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/lib.R 3 | \docType{class} 4 | \name{Reader-class} 5 | \alias{Reader-class} 6 | \alias{Reader} 7 | \title{entab: a package for reading record-oriented file types} 8 | \description{ 9 | entab: a package for reading record-oriented file types 10 | } 11 | -------------------------------------------------------------------------------- /entab-r/man/show-Reader-method.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/lib.R 3 | \name{show,Reader-method} 4 | \alias{show,Reader-method} 5 | \title{Pretty-print a description of the Reader} 6 | \usage{ 7 | \S4method{show}{Reader}(object) 8 | } 9 | \description{ 10 | Pretty-print a description of the Reader 11 | } 12 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [workspace] 2 | members = [ 3 | "entab", 4 | "entab-cli", 5 | "entab-js", 6 | "entab-py", 7 | ] 8 | 9 | exclude = [ 10 | "entab-r", 11 | "entab-benchmarks", 12 | ] 13 | 14 | 
[profile.release] 15 | lto = true 16 | codegen-units = 1 17 | 18 | [profile.bench] 19 | lto = true 20 | opt-level = 3 21 | overflow-checks = false 22 | codegen-units = 1 23 | -------------------------------------------------------------------------------- /entab-cli/src/main.rs: -------------------------------------------------------------------------------- 1 | use std::env::args_os; 2 | use std::io; 3 | 4 | use entab_cli::run; 5 | 6 | pub fn main() { 7 | let stdin = io::stdin(); 8 | let stdout = io::stdout(); 9 | 10 | if let Err(e) = run(args_os(), stdin.lock(), stdout.lock()) { 11 | eprintln!("##### AN ERROR OCCURRED ####"); 12 | eprintln!("{}", e); 13 | eprintln!("#####"); 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /entab/tests/data/chemstation_mwd.d/result.ini: -------------------------------------------------------------------------------- 1 | 2 | [Header] 3 | SampleLocation=Vial 11 4 | Detector1=MWD1 5 | Detector2=UIB 6 | Detector3=MSD 7 | Detector4= 8 | Trigger=0 9 | PeakDuration=1.5 10 | Fractions=1 11 | Time=10 12 | PeakDetection=0 13 | WriteMasses=0 14 | LastDet= 15 | MsdDelay1=0.01 16 | NumFractions=0 17 | [Recovery] 18 | NumLoc=0 19 | [Fraction_0] 20 | NumWells=0 21 | -------------------------------------------------------------------------------- /entab-js/.appveyor.yml: -------------------------------------------------------------------------------- 1 | install: 2 | - appveyor-retry appveyor DownloadFile https://win.rustup.rs/ -FileName rustup-init.exe 3 | - if not defined RUSTFLAGS rustup-init.exe -y --default-host x86_64-pc-windows-msvc --default-toolchain nightly 4 | - set PATH=%PATH%;C:\Users\appveyor\.cargo\bin 5 | - rustc -V 6 | - cargo -V 7 | 8 | build: false 9 | 10 | test_script: 11 | - cargo test --locked 12 | -------------------------------------------------------------------------------- /entab-r/man/as.data.frame-Reader-method.Rd: 
-------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/lib.R 3 | \name{as.data.frame,Reader-method} 4 | \alias{as.data.frame,Reader-method} 5 | \title{Convert the Reader into a data.frame} 6 | \usage{ 7 | \S4method{as.data.frame}{Reader}(x, row.names = NULL, optional = FALSE, ...) 8 | } 9 | \description{ 10 | Convert the Reader into a data.frame 11 | } 12 | -------------------------------------------------------------------------------- /entab-r/DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: entab 2 | Type: Package 3 | Title: Entab 4 | Version: 0.3.1 5 | Author: Roderick 6 | Maintainer: Roderick 7 | Description: Entab is a record-format file reader. 8 | License: MIT + file LICENSE 9 | Encoding: UTF-8 10 | Imports: 11 | methods 12 | Suggests: 13 | devtools, 14 | roxygen2 15 | LazyData: true 16 | RoxygenNote: 7.1.2 17 | SystemRequirements: Cargo (rustc package manager) 18 | -------------------------------------------------------------------------------- /entab/tests/data/masshunter_example/AcqData/MSTS.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 1 4 | 5 | 0.0001166666666667 6 | 0.4021 7 | 49 8 | false 9 | 10 | 11 | -------------------------------------------------------------------------------- /entab/tests/data/masshunter_example/AcqData/RJB_Airs2001FIA.m/DaMethod/Qual/method.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 2.0 4 | 5 | Link 6 | 7 | 8 | Select a method 9 | 10 | 11 | -------------------------------------------------------------------------------- /entab/fuzz/fuzz_targets/tsv.rs: -------------------------------------------------------------------------------- 1 | #![no_main] 2 | use libfuzzer_sys::fuzz_target; 3 | extern crate entab; 4 | 5 | use entab::EtError; 6 | use 
entab::parsers::tsv::TsvReader; 7 | 8 | fuzz_target!(|data: &[u8]| { 9 | let _ = generate_reader(data); 10 | }); 11 | 12 | fn generate_reader(data: &[u8]) -> Result<(), EtError> { 13 | let mut reader = TsvReader::new(data, None)?; 14 | while let Some(_) = reader.next()? { 15 | } 16 | Ok(()) 17 | } 18 | -------------------------------------------------------------------------------- /entab/tests/data/chemstation_mwd.d/RUN.M/INFO.MTH: -------------------------------------------------------------------------------- 1 | CLEARMETHOD 2 | PRERUNPROGRAM 0 3 | POSTRUNPROGRAM 0 4 | ENABLEACQUISITION 1 5 | ENABLEANALYSIS 1 6 | CUSTOMANALYSIS 0 7 | COPYMETHOD 1 8 | SAVEGLPDATA 0 9 | PRERUNNAME "" 10 | POSTRUNNAME "" 11 | SIGNAL2METH "" 12 | ANALYSISMACRO "" 13 | METHODCOMMENT 14 | METHODCOMMENT "Reverse-phase separation of proteins with a H2O-MeOH gradient. Formic \" 15 | METHODCOMMENT "acid used as a ion-pairing agent.\" 16 | -------------------------------------------------------------------------------- /entab-benchmarks/README.md: -------------------------------------------------------------------------------- 1 | # Benchmarks 2 | 3 | These are benchmarks of entab against other Rust parsers available on crates.io. 4 | Note that the test files used are _not_ representative of real life data, e.g. needletail appears very slow parsing FASTA files here because of set-up overhead, but with more representative, real-life data it's much, much faster. 5 | To test on more accurate scenarios, update the filenames in benchmarks.rs. 6 | 7 | Run with `cargo criterion`. 
8 | 9 | -------------------------------------------------------------------------------- /entab/tests/data/masshunter_example/AcqData/RJB_Airs2001FIA.m/qqqacqmethod.xsd: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | -------------------------------------------------------------------------------- /entab-py/README.md: -------------------------------------------------------------------------------- 1 | # Entab 2 | 3 | Parse record-based file formats into a stream of records. 4 | 5 | ## Usage 6 | 7 | ```python 8 | from entab import Reader 9 | reader = Reader(filename='test.fa') 10 | for record in reader: 11 | print(record.id) 12 | ``` 13 | 14 | ## Development 15 | 16 | Build with `maturin build` or build a working copy with `maturin develop`. 17 | 18 | Test with `cargo test`. 19 | 20 | # Releases 21 | 22 | Releases are generated by a Github Action. 23 | -------------------------------------------------------------------------------- /entab/fuzz/fuzz_targets/reader.rs: -------------------------------------------------------------------------------- 1 | #![no_main] 2 | use libfuzzer_sys::fuzz_target; 3 | extern crate entab; 4 | 5 | use entab::EtError; 6 | use entab::readers::get_reader; 7 | 8 | fuzz_target!(|data: &[u8]| { 9 | let _ = generate_reader(data); 10 | }); 11 | 12 | fn generate_reader(data: &[u8]) -> Result<(), EtError> { 13 | let (mut rec_reader, _) = get_reader(data, None, None)?; 14 | while let Some(_) = rec_reader.next_record()? { 15 | } 16 | Ok(()) 17 | } 18 | -------------------------------------------------------------------------------- /entab-js/src/utils.rs: -------------------------------------------------------------------------------- 1 | pub fn set_panic_hook() { 2 | // When the `console_error_panic_hook` feature is enabled, we can call the 3 | // `set_panic_hook` function at least once during initialization, and then 4 | // we will get better error messages if our code ever panics. 
5 | // 6 | // For more details see 7 | // https://github.com/rustwasm/console_error_panic_hook#readme 8 | #[cfg(feature = "console_error_panic_hook")] 9 | console_error_panic_hook::set_once(); 10 | } 11 | -------------------------------------------------------------------------------- /entab/fuzz/Cargo.toml: -------------------------------------------------------------------------------- 1 | 2 | [package] 3 | name = "entab-fuzz" 4 | version = "0.0.0" 5 | authors = ["Automatically generated"] 6 | publish = false 7 | edition = "2018" 8 | 9 | [package.metadata] 10 | cargo-fuzz = true 11 | 12 | [dependencies] 13 | libfuzzer-sys = "0.4" 14 | 15 | [dependencies.entab] 16 | path = ".." 17 | 18 | # Prevent this from interfering with workspaces 19 | [workspace] 20 | members = ["."] 21 | 22 | [[bin]] 23 | name = "reader" 24 | path = "fuzz_targets/reader.rs" 25 | 26 | [[bin]] 27 | name = "tsv" 28 | path = "fuzz_targets/tsv.rs" 29 | test = false 30 | doc = false 31 | -------------------------------------------------------------------------------- /entab-benchmarks/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "entab-benchmarks" 3 | version = "0.1.0" 4 | edition = "2018" 5 | 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 7 | 8 | [dependencies] 9 | entab = { path = "../entab" } 10 | needletail = "0.5.1" 11 | noodles = { version = "0.41.0", features = ["bam", "fasta", "fastq", "sam"] } 12 | bio = "1.2.0" 13 | rust-htslib = "0.43.1" 14 | seq_io = "0.3.1" 15 | memchr = "2.5" # for optimized FASTA benchmark 16 | 17 | [dev-dependencies] 18 | criterion = "0.5" 19 | 20 | [[bench]] 21 | name = "benchmark" 22 | harness = false 23 | -------------------------------------------------------------------------------- /entab-py/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "entab-py" 3 | version = 
"0.3.3" 4 | authors = ["Roderick "] 5 | license = "MIT" 6 | description = "Record-format file reader" 7 | repository = "https://github.com/bovee/entab" 8 | edition = "2018" 9 | 10 | [dependencies] 11 | entab_base = { package = "entab", path = "../entab", default-features = false, features = ["std"] } 12 | memmap = "0.7" 13 | pyo3 = { version = "0.22.0" } 14 | 15 | [lib] 16 | name = "entab" 17 | crate-type = ["cdylib"] 18 | 19 | [features] 20 | maturin = ["pyo3/extension-module"] 21 | 22 | [package.metadata.maturin] 23 | name = "entab" 24 | -------------------------------------------------------------------------------- /entab-r/man/initialize-Reader-method.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/lib.R 3 | \name{initialize,Reader-method} 4 | \alias{initialize,Reader-method} 5 | \title{Create a new Reader} 6 | \usage{ 7 | \S4method{initialize}{Reader}(.Object, filename, parser = "") 8 | } 9 | \arguments{ 10 | \item{.Object}{base object} 11 | 12 | \item{filename}{path to the file to be parsed} 13 | 14 | \item{parser}{name of the parser to be used; if not specified, auto-detected} 15 | } 16 | \value{ 17 | Reader wrapping the opened file 18 | } 19 | \description{ 20 | Create a new Reader 21 | } 22 | -------------------------------------------------------------------------------- /entab-r/src/Makevars: -------------------------------------------------------------------------------- 1 | TARGET_DIR = ../target 2 | LIBDIR = $(TARGET_DIR)/release 3 | STATLIB = libentab.so 4 | PKG_LIBS = -L$(LIBDIR) -lentab 5 | ifeq ($(shell uname -s),Darwin) 6 | PLATFORM_STATLIB = libentab.dylib 7 | else 8 | PLATFORM_STATLIB = libentab.so 9 | endif 10 | 11 | all: C_clean 12 | 13 | $(SHLIB): $(STATLIB) 14 | 15 | $(STATLIB): 16 | export PATH="$(PATH):$(HOME)/.cargo/bin" && \ 17 | cargo build --release --manifest-path=../Cargo.toml --target-dir $(TARGET_DIR) 18 | mv 
./$(LIBDIR)/$(PLATFORM_STATLIB) ./$(STATLIB) 19 | 20 | C_clean: 21 | rm -Rf $(SHLIB) $(STATLIB) $(OBJECTS) 22 | 23 | clean: 24 | rm -Rf $(SHLIB) $(STATLIB) $(OBJECTS) $(TARGET_DIR) 25 | -------------------------------------------------------------------------------- /entab/tests/data/masshunter_example/AcqData/RJB_Airs2001FIA.m/DaMethod/Quant/method.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 2.0 4 | 5 | 0 6 | Link 7 | 8 | 9 | Select a method 10 | 11 | 12 | 0 13 | 14 | 15 | 0 16 | 0 17 | CSV 18 | -------------------------------------------------------------------------------- /entab-cli/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "entab-cli" 3 | version = "0.3.3" 4 | authors = ["Roderick "] 5 | edition = "2018" 6 | description = "Record-format file reader CLI" 7 | readme = "README.md" 8 | repository = "https://github.com/bovee/entab" 9 | license = "MIT" 10 | categories = ["command-line-utilities", "parsing", "science"] 11 | 12 | [dependencies] 13 | clap = { version = "4.5.7", features = ["cargo"] } 14 | entab = { path = "../entab", version = "0.3.1" } 15 | memchr = "2.7" 16 | memmap2 = { version = "0.9.4", optional = true } 17 | 18 | [features] 19 | default = ["mmap"] 20 | mmap = ["memmap2"] 21 | 22 | [[bin]] 23 | name = "entab" 24 | path = "src/main.rs" 25 | -------------------------------------------------------------------------------- /entab-js/README.md: -------------------------------------------------------------------------------- 1 | # Entab 2 | 3 | Parse record-based file formats into a stream of records. 4 | 5 | ## Usage 6 | 7 | ```javascript 8 | import { Reader } from 'entab'; 9 | 10 | // now parse the file 11 | const reader = new Reader(new Uint8Array(await file.arrayBuffer())); 12 | // or a string 13 | const reader = new Reader(new TextEncoder("utf-8").encode(">test\nacgt")); 14 | for (const record of reader) { 15 | ... 
16 | } 17 | ``` 18 | 19 | Note that this will require paging the entire file into memory so files that 20 | take >10 MB may be slow and files >100 MB may not work at all. 21 | 22 | ## Development 23 | 24 | Build with `wasm-pack build`. 25 | 26 | Test with `wasm-pack test`. 27 | 28 | -------------------------------------------------------------------------------- /entab-js/example/service-worker.js: -------------------------------------------------------------------------------- 1 | self.addEventListener('install', evt => { 2 | evt.waitUntil( 3 | caches.open('entab').then(cache => { 4 | return cache.add('./index.html').then(() => self.skipWaiting()); 5 | }) 6 | ) 7 | }); 8 | 9 | self.addEventListener('activate', evt => { 10 | evt.waitUntil(self.clients.claim()); 11 | }); 12 | 13 | self.addEventListener('fetch', evt => { 14 | // fix for the bug here: https://bugs.chromium.org/p/chromium/issues/detail?id=823392 15 | if (evt.request.cache === 'only-if-cached' && evt.request.mode !== 'same-origin') { 16 | return; 17 | } 18 | 19 | evt.respondWith( 20 | caches.match(evt.request).then(res => res || fetch(evt.request)), 21 | ); 22 | }); 23 | -------------------------------------------------------------------------------- /entab-js/example/app.webmanifest: -------------------------------------------------------------------------------- 1 | { 2 | "short_name": "entab", 3 | "name": "entab", 4 | "description": "entab is a parser for bioinformatic, chemoinformatic and other scientific file formats. 
This page is a demonstration of how it can be used to graph data and file formats it can read.", 5 | "icons": [ 6 | {"src": "/icons/android-chrome-192x192.png", "sizes": "192x192", "type": "image/png"}, 7 | {"src": "/icons/android-chrome-512x512.png", "sizes": "512x512", "type": "image/png"}, 8 | {"src": "/icons/maskable_icon.png", "sizes": "192x192", "type": "image/png", "purpose": "any maskable"} 9 | ], 10 | "start_url": "/index.html", 11 | "background_color": "#FFFFFF", 12 | "display": "standalone", 13 | "scope": "/", 14 | "theme_color": "#B0B0D0" 15 | } 16 | -------------------------------------------------------------------------------- /entab/tests/data/masshunter_example/AcqData/RJB_Airs2001FIA.m/info.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | B.05.00 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | Default Method 26 | 1.4 27 | -------------------------------------------------------------------------------- /entab-js/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "entab-js" 3 | version = "0.3.3" 4 | authors = ["Roderick "] 5 | license = "MIT" 6 | description = "Record-format file reader" 7 | repository = "https://github.com/bovee/entab" 8 | edition = "2018" 9 | 10 | [lib] 11 | name = "entab" 12 | crate-type = ["cdylib", "rlib"] 13 | 14 | [features] 15 | default = ["console_error_panic_hook"] 16 | 17 | [dependencies] 18 | console_error_panic_hook = { version = "0.1.7", optional = true } 19 | entab_base = { package = "entab", path = "../entab", default-features = false, features = ["std"] } 20 | js-sys = "0.3.69" 21 | serde = { version = "1.0", features = ["derive"] } 22 | serde-wasm-bindgen = "0.6.5" 23 | wasm-bindgen = { version = "0.2.92", features = ["serde-serialize"] } 24 | 25 | [dev-dependencies] 26 | wasm-bindgen-test = "0.3.36" 27 | 
-------------------------------------------------------------------------------- /entab/tests/DATA_SOURCES.txt: -------------------------------------------------------------------------------- 1 | filename, source, license 2 | bmp_24.png, https://people.sc.fsu.edu/~jburkardt/data/png/png.html, LGPL 3 | b3_alkanes.dxf, collected by Roderick, 4 | HTS_BD_LSR_II_Mixed_Specimen_001_D6_D06.fcs, https://github.com/eyurtsev/fcsparser/blob/master/fcsparser/tests/data/FlowCytometers/HTS_BD_LSR-II/HTS_BD_LSR_II_Mixed_Specimen_001_D6_D06.fcs, MIT 5 | carotenoid_extract.d, collected by Roderick, 6 | chemstation_mwd.d, collected by Roderick, 7 | masshunter_example, collected by Roderick, 8 | sequence.fasta, downloaded from NCBI, 9 | test.csv.bz2, generated by hand 10 | test.csv.xz, generated by hand 11 | test.csv.zst, generated by hand 12 | test-0000.cf, collected by Roderick, 13 | test.bam, generated from test.sam, 14 | test.fastq, downloaded from NCBI, 15 | test_fid.ch, collected by Roderick, 16 | test_179_fid.ch, from issue #32 17 | test.sam, generated from aligning sequence.fasta against test.fastq, 18 | small.RAW, https://github.com/galaxyproteomics/tools-galaxyp/blob/master/tools/msconvert/test-data/small.RAW, CC0 19 | -------------------------------------------------------------------------------- /entab/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "entab" 3 | version = "0.3.3" 4 | authors = ["Roderick "] 5 | edition = "2018" 6 | description = "Record-format file reader" 7 | readme = "README.md" 8 | repository = "https://github.com/bovee/entab" 9 | license = "MIT" 10 | categories = ["parsing", "science"] 11 | 12 | [dependencies] 13 | # parsing 14 | bytecount = "0.6.8" 15 | chrono = { version = "0.4", default-features=false, features = ["alloc", "serde"] } 16 | encoding = "0.2.33" 17 | memchr = "2.7" 18 | serde = { version = "1.0", default-features=false, features = ["derive"] } 19 | # compression 20 
| flate2 = { version = "1.0" } 21 | bzip2 = { version = "0.4", optional = true } 22 | xz2 = { version = "0.1", optional = true } 23 | zstd = { version = "0.13", optional = true } 24 | 25 | [dev-dependencies] 26 | criterion = "0.3" 27 | rayon = "1.5.1" 28 | 29 | [features] 30 | default = ["compression", "std"] 31 | compression = ["bzip2", "xz2", "zstd"] 32 | std = ["bytecount/runtime-dispatch-simd", "chrono/std", "serde/std"] 33 | 34 | [[bench]] 35 | name = "benchmarks" 36 | path = "benches/benchmarks.rs" 37 | harness = false 38 | -------------------------------------------------------------------------------- /entab/README.md: -------------------------------------------------------------------------------- 1 | # Entab 2 | This is the main file parsing library and includes support for compression/ 3 | decompression, file type inference, and parsers for different file types. 4 | 5 | ## Usage 6 | 7 | To parse the IDs out of a FASTA file: 8 | ```rust 9 | //! use std::fs::File; 10 | //! use entab::parsers::fasta::{FastaReader, FastaRecord}; 11 | //! 12 | //! let file = File::open("./tests/data/sequence.fasta")?; 13 | //! let mut reader = FastaReader::new(file, None)?; 14 | //! while let Some(FastaRecord { id, .. }) = reader.next()? { 15 | //! println!("{}", id); 16 | //! 
} 17 | ``` 18 | 19 | ## Other Parsers 20 | [Aston](https://github.com/bovee/aston) - Python - Agilent Chemstation & Masshunter/Thermo DXF/Inficon/etc 21 | [Chromatography Toolbox](https://github.com/chemplexity/chromatography) - Matlab - Agilent/Thermo/NetCDF/mzXML 22 | [Isoreader](https://github.com/isoverse/isoreader) - R - Isodat 23 | [Unfinnigan](https://github.com/prvst/unfinnigan) - Perl/Python - Thermo RAW 24 | [seqio](https://github.com/markschl/seq_io) - Rust - FASTX 25 | 26 | _Please let me know if there are others that you find useful that should be added to this list._ 27 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014- Roderick Bovee 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /entab-r/LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014- Roderick Bovee 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /entab-js/LICENSE.md: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014- Roderick Bovee 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /entab-r/src/Makevars.win: -------------------------------------------------------------------------------- 1 | # TARGET = $(subst 64,x86_64,$(subst 32,i686,$(WIN)))-pc-windows-gnu 2 | TARGET = $(subst 64,x86_64,$(subst 32,i686,$(WIN)))-pc-windows-gnu 3 | 4 | # This is provided in Makevars.ucrt for R >= 4.2 5 | TOOLCHAIN ?= stable-msvc 6 | 7 | TARGET_DIR = ../target 8 | LIBDIR = $(TARGET_DIR)/$(TARGET)/release 9 | STATLIB = libentab.dll.a 10 | PKG_LIBS = -L$(LIBDIR) -lentab -lws2_32 -ladvapi32 -luserenv -lbcrypt 11 | PLATFORM_STATLIB = libentab.dll.a 12 | 13 | all: C_clean 14 | 15 | $(SHLIB): $(STATLIB) 16 | 17 | $(STATLIB): 18 | mkdir -p $(TARGET_DIR)/libgcc_mock 19 | cd $(TARGET_DIR)/libgcc_mock && \ 20 | touch gcc_mock.c && \ 21 | gcc -c gcc_mock.c -o gcc_mock.o && \ 22 | ar -r libgcc_eh.a gcc_mock.o && \ 23 | cp libgcc_eh.a libgcc_s.a 24 | 25 | # CARGO_LINKER is provided in Makevars.ucrt for R >= 4.2 26 | export PATH="$(PATH):$(HOME)/.cargo/bin" && \ 27 | export CARGO_TARGET_X86_64_PC_WINDOWS_GNU_LINKER="$(CARGO_LINKER)" && \ 28 | export LIBRARY_PATH="$${LIBRARY_PATH};$(CURDIR)/$(TARGET_DIR)/libgcc_mock" && \ 29 | cargo build --target=$(TARGET) --lib --release --manifest-path=../Cargo.toml --target-dir $(TARGET_DIR) 30 | mv ./$(LIBDIR)/entab.dll ./libentab.dll 31 | 32 | C_clean: 33 | rm -Rf $(SHLIB) $(STATLIB) $(OBJECTS) entab/target 34 | -------------------------------------------------------------------------------- /.github/workflows/website.yml: -------------------------------------------------------------------------------- 1 | name: Build WASM and publish website 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | # push: 7 | # tags: 8 | # - v[0-9]+.* 9 | 10 | jobs: 11 | build-wasm-and-publish: 12 | concurrency: ci-${{ github.ref }} 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: actions/checkout@v3 16 | 17 | - name: Install rust-toolchain 18 | uses: 
actions-rs/toolchain@v1 19 | with: 20 | toolchain: stable 21 | 22 | - name: Install wasm-pack 23 | run: | 24 | curl https://rustwasm.github.io/wasm-pack/installer/init.sh -sSf | sh 25 | wasm-pack --version 26 | 27 | - name: Run JS tests 28 | working-directory: ./entab-js 29 | run: wasm-pack test --headless --chrome 30 | 31 | - name: Build the WASM 32 | working-directory: ./entab-js 33 | run: wasm-pack build --target web --out-dir ./example/pkg 34 | 35 | - name: Delete .gitignore so pkg will be pushed too 36 | run: rm .gitignore entab-js/example/pkg/.gitignore 37 | 38 | - name: Publish to Github pages 39 | uses: JamesIves/github-pages-deploy-action@v4.3.3 40 | with: 41 | branch: gh-pages 42 | folder: entab-js/example 43 | single-commit: true 44 | -------------------------------------------------------------------------------- /entab/tests/data/test.sam: -------------------------------------------------------------------------------- 1 | @SQ SN:gi|9626372|ref|NC_001422.1| LN:5386 2 | @PG ID:minimap2 PN:minimap2 VN:2.17-r941 CL:minimap2 -a -o out ./sequence.fasta ./test.fastq 3 | SRR062634.1 4 * 0 0 * * 0 0 GGGTTTTCCTGAAAAAGGGATTCAAGAAAGAAAACTTACATGAGGTGATTGTTTAATGTTGCTACCAAAGAAGAGAGAGTTACCTGCCCATTCACTCAGG @C'@9:BB:?DCCB5CC?5C=?5@CADC?BDB)B@?-A@=:=:@CC'C>5AA+*+2@@'-?>5-?C=@-??)'>>B?D@?*?A################# rl:i:0 4 | SRR062634.2 4 * 0 0 * * 0 0 ACCGTGAGCAATCAGCTGCCATCAACGTGGAGGTAAGACTCTCCACCTGCAAAAACATTACAACTTGCTGAAGGCTGAGATACTTGTTCGCACATTTTTA FDEFF?DFEFE?BEEEEED=DB:DCEAEEB,CC=@B=5?B?CC5C?B+A??=>:CC<9-B2=@>-?:-*0<:'0%6,>:9&-:>?:>==B?? 
rl:i:0 5 | SRR062634.3 4 * 0 0 * * 0 0 TAGATATTTTTGTTTTAACTGCTGTAGAAAATTAAGACATAAACTAAGAAATATCCCATGAAGGAATGAGTATACTGTTTCTACTTGCAGAAAAGCTGCG -?3-C22646@-@3@@3-=-====CBB@DB-A-=-AA@C-D=ABDA;?CDDDD5D?DD55:>:AB>5?-CCCC###################### rl:i:0 6 | SRR062634.4 4 * 0 0 * * 0 0 AGATGAGTTTCACAAAGTAAGCAACTTGATATCCAAATAATCAACACCCAACTCAAGAAAAAGATCATTACCAGAAACTAATAAACCAGCACATTAGGTG ??EEEDB?D-?AAA-AA?>->BC:ADB:--A55ACCA:D6C:?5@CADD6=C5:CD?D4;,::?,CC-5A@C-?D5@+-BB?BC*A-A?C:C@####### rl:i:0 7 | SRR062634.5 4 * 0 0 * * 0 0 CTGTATCTAGGTTTTGTCCTTACATGTATATAACCTACACCCACAGTTTACCATCCGTGACATTTTCTTTCCTCTCCAGTCGTACAACAATACCCTGCCA CC?-?BAAB?E:B@@A7A?5CCBBBB@B?ABB?B@BB=B-BB=?######################################################## rl:i:0 8 | -------------------------------------------------------------------------------- /entab-js/tests/web.rs: -------------------------------------------------------------------------------- 1 | //! Test suite for the Web and headless browsers. 2 | 3 | #![cfg(target_arch = "wasm32")] 4 | 5 | use entab::Reader; 6 | use js_sys::{Map, Object, Reflect}; 7 | use wasm_bindgen::{JsCast, JsValue}; 8 | use wasm_bindgen_test::*; 9 | 10 | wasm_bindgen_test_configure!(run_in_browser); 11 | 12 | #[wasm_bindgen_test] 13 | fn create_reader() { 14 | // doesn't work for obvious reasons, but it'd be nice to test against a Uint8Array 15 | // let data = Uint8Array::new(&JsValue::from_str(">test\nACGT")); 16 | let data = b">test\nACGT"; 17 | let mut reader = 18 | Reader::new(data.to_vec().into_boxed_slice(), None).expect("Error creating the reader"); 19 | assert_eq!(reader.parser(), "fasta"); 20 | let raw_rec = reader.next().expect("Error reading first record"); 21 | let rec = raw_rec 22 | .dyn_into::() 23 | .expect("next() returns an object"); 24 | 25 | let done = Reflect::get(&rec, &JsValue::from_str("done")).expect("record has done"); 26 | assert!(done.is_falsy()); 27 | 28 | let raw_value = Reflect::get(&rec, &JsValue::from_str("value")).expect("record has value"); 29 | 
let value = raw_value.dyn_into::().expect("value is a map"); 30 | assert_eq!(value.size(), 2); 31 | assert_eq!(value.get(&("id".to_string()).into()), "test"); 32 | assert!(value.has(&("sequence".to_string()).into())); 33 | } 34 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | branches: [ main ] 8 | 9 | env: 10 | CARGO_TERM_COLOR: always 11 | 12 | jobs: 13 | coverage: 14 | runs-on: ubuntu-latest 15 | steps: 16 | - uses: actions/checkout@v2 17 | 18 | - name: Install nightly 19 | uses: actions-rs/toolchain@v1 20 | with: 21 | toolchain: nightly 22 | override: true 23 | 24 | - name: Install tarpaulin 25 | uses: actions-rs/install@v0.1 26 | with: 27 | crate: cargo-tarpaulin 28 | version: 0.19.1 29 | use-tool-cache: true 30 | 31 | - name: Build 32 | run: cargo build 33 | 34 | - name: Run tests with coverage 35 | working-directory: ./entab 36 | run: cargo +nightly tarpaulin --all --timeout 600 --out Xml -- --test-threads 1 37 | 38 | - name: Upload to codecov 39 | uses: codecov/codecov-action@v2 40 | with: 41 | token: ${{ secrets.CODECOV_TOKEN }} 42 | fail_ci_if_error: true 43 | 44 | misc_tests: 45 | runs-on: ubuntu-latest 46 | steps: 47 | - uses: actions/checkout@v2 48 | 49 | - name: Install rust-toolchain 50 | uses: actions-rs/toolchain@v1 51 | with: 52 | toolchain: stable 53 | 54 | - name: Run no_std tests 55 | working-directory: ./entab 56 | run: cargo test --no-default-features 57 | -------------------------------------------------------------------------------- /entab-r/R/lib.R: -------------------------------------------------------------------------------- 1 | #' entab: a package for reading record-oriented file types 2 | #' 3 | #' @importFrom methods new 4 | #' @useDynLib libentab, .registration = TRUE 5 | #' 6 | 7 | #' @export Reader 8 | 
Reader <- setClass("Reader", slots = c( pointer = "externalptr" ) )
# The single `pointer` slot holds the external pointer returned by the native
# `wrap__Reader__new` routine (see the `initialize` method below in this block).

#' Convert the Reader into a data.frame
#'
#' Hands the Reader's external pointer to the native `wrap__as_data_frame`
#' routine and returns whatever that routine produces.
#'
#' @param x a Reader
#' @param ... accepted for compatibility with the generic; not forwarded to the
#'   native call
#'
#' @export
setMethod("as.data.frame", "Reader", function(x, ...) {
  .Call("wrap__as_data_frame", x@pointer)
} )

#' Expose methods
#'
#' i.e. Reader$metadata(), Reader$headers(), and Reader$parser()
#'
#' `x$name` does not call anything itself: it returns a function which, when
#' invoked, calls the native routine named `wrap__Reader__<name>` with the
#' Reader's pointer followed by any arguments supplied.
#'
#' @param x a Reader
#' @param name name of the method to look up; pasted onto `wrap__Reader__`
setMethod("$", "Reader", function(x, name) {
  function(...) .Call(paste0("wrap__Reader__", name), x@pointer, ...)
} )

#' Pretty-print a description of the Reader
#'
#' Prints the parser name reported by the Reader followed by the word "Reader".
#'
#' @param object a Reader
setMethod("show", "Reader", function(object) {
  cat(object$parser(), "Reader\n")
} )

#' Create a new Reader
#'
#' @param .Object base object
#' @param filename path to the file to be parsed
#' @param parser name of the parser to be used; if not specified, auto-detected
#'
#' @return Reader wrapping the opened file
setMethod("initialize", "Reader", function(.Object, filename, parser = "") {
  # the empty-string default for `parser` is passed straight through to the
  # native constructor
  d <- .Call("wrap__Reader__new", filename, parser)
  # extendr is setting class, but we need to strip it to fit in the slot
  attr(d, "class") <- NULL
  .Object@pointer <- d
  .Object
} )
[003]</FirmwareRevision><IsExternalContactsBoardInstalled>false</IsExternalContactsBoardInstalled><IsThermostatInstalled>false</IsThermostatInstalled><SyringeVolume>40</SyringeVolume><SeatCapillary>1.2</SeatCapillary><LoopCapillary>80</LoopCapillary><IsBackSeatInstalled>false</IsBackSeatInstalled><BackSeatCapillary>0</BackSeatCapillary><BackLoopCapillary>0</BackLoopCapillary><RinseValveInstalled>false</RinseValveInstalled><RinseValveEnabled>false</RinseValveEnabled></SamplerConfig> 3 | -------------------------------------------------------------------------------- /entab/tests/data/masshunter_example/AcqData/Contents.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 3 5 | 6 | 7 | 2013-11-25T03:37:13Z 8 | 9 | 2 10 | 11 | Instrument 1 12 | 13 | 0 14 | 15 | 0 16 | 17 | 0 18 | 19 | 81.38 20 | 21 | 6400 Series Triple Quadrupole B.06.00 (B6025.0) 22 | -------------------------------------------------------------------------------- /entab/src/parsers/agilent/mod.rs: -------------------------------------------------------------------------------- 1 | /// Readers for formats generated by the GC/LC control software Chemstation 2 | pub mod chemstation; 3 | /// Readers for newer formats generated by the GC/LC control software Chemstation 4 | pub mod chemstation_new; 5 | // TODO: finish and reenable this 6 | // /// Readers for instrument telemetry data generated by Chemstation 7 | // pub mod chemstation_reg; 8 | /// Readers for formats generated by the GC/LC control software Masshunter 9 | #[cfg(feature = "std")] 10 | pub mod masshunter; 11 | /// Read the common metadata format at the top of Chemstation files 12 | pub mod metadata; 13 | 14 | use crate::error::EtError; 15 | use crate::parsers::common::Skip; 16 | use crate::parsers::{extract, Endian, FromSlice}; 17 | 18 | /// Read the header chunk for an Agilent file 19 | pub(crate) fn read_agilent_header(rb: &[u8], ms_format: bool) -> Result { 20 | if rb.len() < 268 { 21 | 
return Err(EtError::from("Agilent header too short").incomplete()); 22 | } 23 | 24 | // figure out how big the header should be and then get it 25 | let raw_header_size = u32::extract(&rb[264..268], &Endian::Big)? as usize; 26 | if raw_header_size == 0 { 27 | return Err("Invalid header length of 0".into()); 28 | } 29 | let mut header_size = 2 * (raw_header_size - 1); 30 | if !ms_format { 31 | header_size *= 256; 32 | } 33 | if header_size < 512 { 34 | return Err("Header length too short".into()); 35 | } else if header_size > 20_000 { 36 | return Err("Header length too long".into()); 37 | } 38 | let con = &mut 0; 39 | let _ = extract::(rb, con, &mut header_size)?; 40 | Ok(*con) 41 | } 42 | -------------------------------------------------------------------------------- /entab/tests/data/masshunter_example/AcqData/Devices.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 1 6 | 7 | TandemQuadrupole 8 | G6410A 9 | 1 10 | US63910134 11 | 5 12 | 8 13 | 0 14 | 1 15 | 16 | 17 | DAD 18 | G1315D 19 | 1 20 | DE64262996 21 | 14 22 | 6 23 | 0 24 | 1 25 | 26 | 27 | QuatPump 28 | G4204A 29 | 1 30 | DEBAN00295 31 | 32 32 | 2 33 | 0 34 | 1 35 | 36 | 37 | TCC 38 | G1316C 39 | 1 40 | DEBAC05711 41 | 40 42 | 2 43 | 0 44 | 1 45 | 46 | -------------------------------------------------------------------------------- /entab/tests/data/masshunter_example/AcqData/RJB_Airs2001FIA.m/TCC_1.XML: -------------------------------------------------------------------------------- 1 | 2 | <ColumnCompConfig xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema" schemaVersion="1.0.2"><DisplayName>Column Comp.</DisplayName><SerialNumber>DEBAC05711</SerialNumber><ModuleType>G1316C</ModuleType><ModuleDisplayType>G1316C</ModuleDisplayType><ConnectionInfo><ConnectionType>ConnectionType_IPAddress</ConnectionType><IPAddress>192.168.254.11</IPAddress></ConnectionInfo><FirmwareRevision>A.06.53 
[002]</FirmwareRevision><IsColumnSwitchingValveInstalled>true</IsColumnSwitchingValveInstalled><ColumnValveVersion>6Port2Positions1200Bar</ColumnValveVersion></ColumnCompConfig> 3 | 4 | 5 | StopTime_NoLimit 6 | 7 | 8 | PostTime_Off 9 | 10 | ValvePos_1_6_10 11 | 12 | 0 13 | 0 14 | 15 | 16 | TemperatureSet 17 | 30 18 | 19 | true 20 | 0.8 21 | 22 | 23 | 24 | Combined 25 | 26 | true 27 | 0.8 28 | 29 | 30 | true 31 | -------------------------------------------------------------------------------- /entab-r/README.md: -------------------------------------------------------------------------------- 1 | # Development 2 | 3 | Rebuild the NAMESPACE and documentation with: 4 | ```r 5 | library(devtools) 6 | document() 7 | ``` 8 | 9 | Note that there's an issue with having the entab dependency in the R bindings as a path (and including this in the workspace in the directory above) because R will only build this directory and not include the parent directory. This will cause the build process to fail with a message about "could not find entab, only entab-r". What this means in practice is that a new version of `entab` needs to be pinned in Crates before any new features can be used in here. 10 | 11 | For future inspiration: There's an [example Windows build config](https://yutani.rbind.io/post/some-more-notes-about-using-rust-code-in-r-packages/) that might be good inspiration for building/releasing this for Windows machines. [gifski](https://cran.r-project.org/web/packages/gifski/index.html) is one of the few packages on CRAN with a Rust build pipeline. 12 | 13 | # Installation 14 | 15 | Build the R package itself with: 16 | ```bash 17 | R CMD INSTALL . 
18 | ``` 19 | 20 | You can also install off of Github with: 21 | ```r 22 | library(devtools) 23 | devtools::install_github("bovee/entab", subdir="entab-r") 24 | ``` 25 | 26 | ## Additional instructions for Mac OS X installation 27 | 28 | If you're using RStudio on a Mac, you will likely need to tell RStudio where to find cargo (the Rust package manager) by adding it to your path. You can do this by modifying your `.Rprofile` file as suggested below: 29 | 30 | 1. Find your `.Rprofile` file in your home directory (you may need to press Command + Shift + period to reveal hidden files) and open it in your text editor of choice. Alternatively, you can call `usethis::edit_r_profile()` from RStudio to automatically open your `.Rprofile`. 31 | 2. Add `Sys.setenv(PATH = paste0("/Users/<username>/.cargo/bin:", Sys.getenv("PATH")))`, replacing `<username>` with your username. This will prepend Cargo's bin directory to your path when you open RStudio. 32 | 3. Save your `.Rprofile` and restart RStudio. 33 | 4. Install Entab from GitHub. 34 | 35 | # Usage 36 | 37 | And then use: 38 | ```r 39 | library(entab) 40 | r <- Reader('../test_file.fasta') 41 | data <- as.data.frame(r) 42 | ``` 43 | -------------------------------------------------------------------------------- /entab-benchmarks/benches/fasta.rs: -------------------------------------------------------------------------------- 1 | use std::str::from_utf8; 2 | 3 | use memchr::{memchr, memchr_iter}; 4 | 5 | 6 | pub fn read_fasta( 7 | mut rb: &[u8], 8 | mut callback: F, 9 | ) -> Result<(), &str> 10 | where 11 | F: for<'a> FnMut(&'a str, &[u8]) -> (), 12 | { 13 | let mut id; 14 | let mut raw_sequence; 15 | let mut new_buf = Vec::with_capacity(1024); 16 | while !rb.is_empty() { 17 | if rb[0] != b'>' { 18 | // TODO: check for spaces at the very end? 
class SpectraReader:
    """
    A Reader that returns a sequence of spectra instead of the individual points.

    This takes the same arguments as a `Reader` (data, filename, parser) in addition to
    `time_res` which controls how points are bucketed by time and `merge_fn` which is
    used to e.g. sum together the points with the same mz/wavelength in the same time
    bucket.

    Iterating yields `(time, spectrum)` tuples where `time` is the time of the first
    point in the bucket and `spectrum` maps each mz/wavelength seen in that bucket to
    `merge_fn` applied to the list of its intensities. Attributes not defined here are
    looked up on the wrapped reader.
    """
    def __init__(self, data=None, filename=None, parser=None, time_res=0, merge_fn=sum):
        if isinstance(data, Reader):
            reader = data
        elif data is None and filename is None:
            # check up front; the constructed Reader itself is never None
            raise ValueError('A reader, data, or a filename is required')
        else:
            reader = Reader(data, filename=filename, parser=parser)

        if 'mz' in reader.headers:
            y_key = 'mz'
        elif 'wavelength' in reader.headers:
            y_key = 'wavelength'
        else:
            raise KeyError(f'{reader.parser} reader missing mz/wavelength?')

        self._reader = reader
        self._t_key = 'time'
        self._y_key = y_key
        self._z_key = 'intensity'
        self._time_res = time_res
        self._merge_fn = merge_fn

        # Prime the first bucket with the first point. An empty reader simply
        # produces an empty iterator instead of leaking StopIteration from here.
        point = next(reader, None)
        if point is None:
            self._cur_time = None
            self._cur_spectra = {}
        else:
            self._cur_time = getattr(point, self._t_key)
            self._cur_spectra = {getattr(point, self._y_key): [getattr(point, self._z_key)]}

    def __getattr__(self, name):
        # Only called when normal lookup fails. If `_reader` itself is missing
        # (e.g. __init__ raised early), delegating would recurse forever.
        if name == '_reader':
            raise AttributeError(name)
        return getattr(self._reader, name)

    def __iter__(self):
        return self

    def _merged(self):
        """Collapse the current bucket's intensity lists with `merge_fn`."""
        return {y: self._merge_fn(z) for y, z in self._cur_spectra.items()}

    def __next__(self):
        for point in self._reader:
            t = getattr(point, self._t_key)
            y = getattr(point, self._y_key)
            z = getattr(point, self._z_key)
            if t - self._cur_time > self._time_res:
                finished = (self._cur_time, self._merged())
                # The point that closes the old bucket is the first point of the
                # new one; it must be kept, not discarded.
                self._cur_time = t
                self._cur_spectra = {y: [z]}
                return finished
            self._cur_spectra.setdefault(y, []).append(z)

        # reader exhausted: flush whatever is left, exactly once
        if len(self._cur_spectra) == 0:
            raise StopIteration()
        spectra = self._merged()
        self._cur_spectra = {}
        return (self._cur_time, spectra)
``` 44 | //! # #[cfg(feature = "std")] { 45 | //! use std::fs::File; 46 | //! use entab::filetype::FileType; 47 | //! use entab::readers::get_reader; 48 | //! 49 | //! let file = File::open("./tests/data/sequence.fasta")?; 50 | //! let (mut reader, _) = get_reader(file, None, None)?; 51 | //! while let Some(record) = reader.next_record()? { 52 | //! println!("{:?}", record[0]); 53 | //! } 54 | //! # } 55 | //! # use entab::EtError; 56 | //! # Ok::<(), EtError>(()) 57 | //! ``` 58 | 59 | extern crate alloc; 60 | 61 | /// The buffer interface that underlies the file readers 62 | pub mod buffer; 63 | /// Generic file decompression 64 | pub mod compression; 65 | /// Miscellanous utility functions and error handling 66 | pub mod error; 67 | /// File format inference 68 | pub mod filetype; 69 | /// Lightweight parsers to read records out of buffers 70 | pub mod parsers; 71 | /// Parsers for specific file formats 72 | pub mod readers; 73 | /// Record and abstract record reading 74 | pub mod record; 75 | 76 | pub use error::EtError; 77 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: 'Publish new versions' 2 | 3 | on: 4 | push: 5 | tags: 6 | - v[0-9]+.* 7 | 8 | jobs: 9 | rust: 10 | name: Publish to crates.io 11 | runs-on: ubuntu-latest 12 | steps: 13 | - name: Checkout sources 14 | uses: actions/checkout@v4 15 | 16 | - name: Install stable toolchain 17 | uses: actions-rs/toolchain@v1 18 | with: 19 | profile: minimal 20 | toolchain: stable 21 | override: true 22 | 23 | - name: Publish package to Crates 24 | run: cargo publish --manifest-path entab/Cargo.toml --token ${CRATES_API_TOKEN} 25 | env: 26 | CRATES_API_TOKEN: ${{ secrets.CRATES_API_TOKEN }} 27 | 28 | python: 29 | name: Publish to PyPI 30 | strategy: 31 | fail-fast: false 32 | matrix: 33 | include: 34 | - os: ubuntu-latest 35 | target: x86_64 36 | - os: 
macos-latest 37 | target: x64 38 | - os: windows-latest 39 | target: x64 40 | runs-on: ${{ matrix.os }} 41 | steps: 42 | - uses: actions/checkout@v4 43 | 44 | - uses: actions-rs/toolchain@v1 45 | with: 46 | profile: minimal 47 | toolchain: stable 48 | default: true 49 | 50 | - uses: actions/setup-python@v5 51 | with: 52 | python-version: '3.11' 53 | architecture: x64 54 | 55 | - name: build the wheels 56 | uses: messense/maturin-action@v1 57 | with: 58 | target: ${{ matrix.target }} 59 | manylinux: auto 60 | args: --release --out dist -m entab-py/Cargo.toml --features=maturin 61 | 62 | - name: upload the wheels 63 | env: 64 | TWINE_USERNAME: __token__ 65 | TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }} 66 | run: | 67 | pip install --upgrade twine 68 | twine upload --skip-existing dist/* 69 | 70 | javascript: 71 | name: Publish to NPM 72 | runs-on: ubuntu-latest 73 | steps: 74 | - uses: actions/checkout@v4 75 | 76 | - uses: actions-rs/toolchain@v1 77 | with: 78 | profile: minimal 79 | toolchain: stable 80 | default: true 81 | 82 | - uses: actions/setup-node@v2 83 | with: 84 | registry-url: 'https://registry.npmjs.org' 85 | 86 | - name: install wasm-pack 87 | run: curl https://rustwasm.github.io/wasm-pack/installer/init.sh -sSf | sh 88 | 89 | - name: build the package 90 | run: wasm-pack build --release ./entab-js/ 91 | 92 | - name: fix the package name 93 | run: sed -i.bak "s/entab-js/entab/" entab-js/pkg/package.json 94 | - run: rm ./entab-js/pkg/package.json.bak 95 | 96 | - name: publish to npm 97 | env: 98 | NODE_AUTH_TOKEN: ${{ secrets.NPM_API_TOKEN }} 99 | run: npm publish --access public ./entab-js/pkg/ 100 | 101 | -------------------------------------------------------------------------------- /entab/tests/data/masshunter_example/AcqData/RJB_Airs2001FIA.m/HiP-ALS_1.XML: -------------------------------------------------------------------------------- 1 | 2 | <SamplerConfig xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
xmlns:xsd="http://www.w3.org/2001/XMLSchema" schemaVersion="1.0.2"><DisplayName>HiP Sampler</DisplayName><SerialNumber>DEBAP01780</SerialNumber><ModuleType>G4226A</ModuleType><ModuleDisplayType>G4226A</ModuleDisplayType><ConnectionInfo><ConnectionType>ConnectionType_IPAddress</ConnectionType><IPAddress>192.168.254.11</IPAddress></ConnectionInfo><FirmwareRevision>A.06.50 [003]</FirmwareRevision><IsExternalContactsBoardInstalled>false</IsExternalContactsBoardInstalled><IsThermostatInstalled>false</IsThermostatInstalled><SyringeVolume>40</SyringeVolume><SeatCapillary>1.2</SeatCapillary><LoopCapillary>80</LoopCapillary><IsBackSeatInstalled>false</IsBackSeatInstalled><BackSeatCapillary>0</BackSeatCapillary><BackLoopCapillary>0</BackLoopCapillary><RinseValveInstalled>false</RinseValveInstalled><RinseValveEnabled>false</RinseValveEnabled></SamplerConfig> 3 | 4 | 5 | StopTime_NoLimit 6 | 7 | 8 | PostTime_Off 9 | 10 | 11 | 200 12 | 200 13 | 0 14 | 2 15 | 5 16 | true 17 | 18 | 19 | NeedleWash 20 | 10 21 | 22 | [[1]] 23 | WashVial 24 | 3 25 | 26 | 27 | 28 | false 29 | 30 | false 31 | 32 | 33 | 34 | 35 | false 36 | 37 | 38 | false 39 | 40 | 41 | false 42 | 43 | 44 | false 45 | 46 | 0 47 | 48 | -------------------------------------------------------------------------------- /entab/tests/data/masshunter_example/AcqData/RJB_Airs2001FIA.m/msquad.xsd: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | -------------------------------------------------------------------------------- /entab-js/.travis.yml: -------------------------------------------------------------------------------- 1 | language: rust 2 | sudo: false 
3 | 4 | cache: cargo 5 | 6 | matrix: 7 | include: 8 | 9 | # Builds with wasm-pack. 10 | - rust: beta 11 | env: RUST_BACKTRACE=1 12 | addons: 13 | firefox: latest 14 | chrome: stable 15 | before_script: 16 | - (test -x $HOME/.cargo/bin/cargo-install-update || cargo install cargo-update) 17 | - (test -x $HOME/.cargo/bin/cargo-generate || cargo install --vers "^0.2" cargo-generate) 18 | - cargo install-update -a 19 | - curl https://rustwasm.github.io/wasm-pack/installer/init.sh -sSf | sh -s -- -f 20 | script: 21 | - cargo generate --git . --name testing 22 | # Having a broken Cargo.toml (in that it has curlies in fields) anywhere 23 | # in any of our parent dirs is problematic. 24 | - mv Cargo.toml Cargo.toml.tmpl 25 | - cd testing 26 | - wasm-pack build 27 | - wasm-pack test --chrome --firefox --headless 28 | 29 | # Builds on nightly. 30 | - rust: nightly 31 | env: RUST_BACKTRACE=1 32 | before_script: 33 | - (test -x $HOME/.cargo/bin/cargo-install-update || cargo install cargo-update) 34 | - (test -x $HOME/.cargo/bin/cargo-generate || cargo install --vers "^0.2" cargo-generate) 35 | - cargo install-update -a 36 | - rustup target add wasm32-unknown-unknown 37 | script: 38 | - cargo generate --git . --name testing 39 | - mv Cargo.toml Cargo.toml.tmpl 40 | - cd testing 41 | - cargo check 42 | - cargo check --target wasm32-unknown-unknown 43 | - cargo check --no-default-features 44 | - cargo check --target wasm32-unknown-unknown --no-default-features 45 | - cargo check --no-default-features --features console_error_panic_hook 46 | - cargo check --target wasm32-unknown-unknown --no-default-features --features console_error_panic_hook 47 | - cargo check --no-default-features --features "console_error_panic_hook wee_alloc" 48 | - cargo check --target wasm32-unknown-unknown --no-default-features --features "console_error_panic_hook wee_alloc" 49 | 50 | # Builds on beta. 
51 | - rust: beta 52 | env: RUST_BACKTRACE=1 53 | before_script: 54 | - (test -x $HOME/.cargo/bin/cargo-install-update || cargo install cargo-update) 55 | - (test -x $HOME/.cargo/bin/cargo-generate || cargo install --vers "^0.2" cargo-generate) 56 | - cargo install-update -a 57 | - rustup target add wasm32-unknown-unknown 58 | script: 59 | - cargo generate --git . --name testing 60 | - mv Cargo.toml Cargo.toml.tmpl 61 | - cd testing 62 | - cargo check 63 | - cargo check --target wasm32-unknown-unknown 64 | - cargo check --no-default-features 65 | - cargo check --target wasm32-unknown-unknown --no-default-features 66 | - cargo check --no-default-features --features console_error_panic_hook 67 | - cargo check --target wasm32-unknown-unknown --no-default-features --features console_error_panic_hook 68 | # Note: no enabling the `wee_alloc` feature here because it requires 69 | # nightly for now. 70 | -------------------------------------------------------------------------------- /entab/src/parsers/microsoft_common.rs: -------------------------------------------------------------------------------- 1 | use core::convert::TryInto; 2 | 3 | use chrono::{NaiveDateTime, TimeZone, Utc}; 4 | 5 | use crate::error::EtError; 6 | use crate::parsers::{extract, Endian, FromSlice}; 7 | 8 | 9 | /// Convert a "Windows" timestamp into a regular `DateTime`. 10 | /// 11 | /// Windows time is the number of "100 microsecond" chunks since January 1, 1601 so to convert to 12 | /// unix time we first need to convert into nanoseconds and then subtract the number of nanoseconds 13 | /// from then to Jan 1, 1970. 14 | pub fn from_windows_time(time: u64) -> Result { 15 | let unix_time = time.saturating_mul(100).saturating_sub(11_644_473_600_000_000_000); 16 | Ok(Utc.timestamp_nanos(unix_time.try_into()?).naive_local()) 17 | } 18 | 19 | /// A chunk from a Microsoft "Compound File Binary" file (commonly used on Windows machines to 20 | /// store different data). 
21 | /// 22 | /// See Microsoft documentation for more info: 23 | /// https://docs.microsoft.com/en-us/openspecs/windows_protocols/ms-cfb/05060311-bfce-4b12-874d-71fd4ce63aea 24 | #[derive(Debug, Default)] 25 | struct MsCfbHeader { } 26 | 27 | impl<'b: 's, 's> FromSlice<'b, 's> for MsCfbHeader { 28 | type State = (); 29 | 30 | fn parse( 31 | buffer: &[u8], 32 | _eof: bool, 33 | _consumed: &mut usize, 34 | _state: &mut Self::State, 35 | ) -> Result { 36 | const CFB_MAGIC: &[u8] = b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1"; 37 | 38 | if buffer.len() < 512 { 39 | return Err(EtError::new("MS CFB headers are always 512 bytes long").incomplete()); 40 | } 41 | if &buffer[..8] != CFB_MAGIC { 42 | return Err(EtError::new("CFB header has invalid magic")); 43 | } 44 | 45 | // minor_version = buffer[24..26] 46 | // major_version = buffer[26..28] 47 | // byte_order = buffer[28..30] 48 | let sector_size = match buffer[30..32] { 49 | [0x09, 0] => 512, 50 | [0x0C, 0] => 4096, 51 | _ => return Err("Invalid sector shift specified".into()), 52 | }; 53 | // 32..44 -> ... 54 | 55 | let n_fat_sectors = u32::extract(&buffer[44..48], Endian::Little)?; 56 | // TODO: we could maybe come up with a way to not call the `parse` side of above, but with 57 | // good ergonomics? 
(the below is a little gross) 58 | // let mut n_fat_sectors: u32 = 0; 59 | // FromSlice::get(&mut n_fat_sectors, &buffer[44..48], &Endian::Little)?; 60 | 61 | let first_dir_loc = u32::extract(&buffer[48..52], Endian::Little)?; 62 | let first_minifat_loc = u32::extract(&buffer[60..64], Endian::Little)?; 63 | let n_minifat_sectors = u32::extract(&buffer[64..68], Endian::Little)?; 64 | let first_difat_loc = u32::extract(&buffer[68..72], Endian::Little)?; 65 | let n_difat_sectors = u32::extract(&buffer[72..76], Endian::Little)?; 66 | if n_difat_sectors > 0 { 67 | return Err("DIFAT sectors aren't supported yet".into()); 68 | } 69 | // 76..512 -> DIFAT array of u32s 70 | 71 | Ok(false) 72 | } 73 | 74 | fn get(&mut self, _buffer: &'r [u8], _state: &Self::State) -> Result<(), EtError> { 75 | Ok(()) 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /entab/tests/data/masshunter_example/AcqData/RJB_Airs2001FIA.m/DAD_1.XML: -------------------------------------------------------------------------------- 1 | 2 | <DadConfig xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema" schemaVersion="1.0.2"><DisplayName>DAD</DisplayName><SerialNumber>DE64262996</SerialNumber><ModuleType>G1315D</ModuleType><ModuleDisplayType>G1315D</ModuleDisplayType><ConnectionInfo><ConnectionType>ConnectionType_IPAddress</ConnectionType><IPAddress>192.168.254.11</IPAddress></ConnectionInfo><FirmwareRevision>B.06.53 [0013]</FirmwareRevision><IsExternalContactsBoardInstalled>false</IsExternalContactsBoardInstalled></DadConfig> 3 | 4 | 5 | StopTime_NoLimit 6 | 7 | 8 | PostTime_Off 9 | 10 | 0 11 | 12 | 5 13 | 10 14 | 15 | 16 | 5 17 | 10 18 | 19 | 20 | 21 | false 22 | SigA 23 | false 24 | 25 | 26 | false 27 | SigB 28 | false 29 | 30 | 31 | false 32 | SigC 33 | false 34 | 35 | 36 | false 37 | SigD 38 | false 39 | 40 | 41 | false 42 | SigE 43 | false 44 | 45 | 46 | false 47 | SigF 48 | false 49 | 50 | 51 | false 52 | 
SigG 53 | false 54 | 55 | 56 | false 57 | SigH 58 | false 59 | 60 | 61 | 2 62 | 2 63 | true 64 | true 65 | 66 | Balance_FixedGain 67 | 100 68 | 0 69 | 70 | 71 | true 72 | false 73 | false 74 | 75 | 76 | 250 77 | 800 78 | 2 79 | SpecAcq_All 80 | 81 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Entab 2 | What if everything were (or could be turned into) a table? 3 | 4 | Entab is a parsing framework to turn a variety of record-based scientific file 5 | formats into usable tabular data across a variety of programming languages. 6 | 7 | ![Test status](https://github.com/bovee/entab/workflows/Tests/badge.svg) 8 | [![codecov](https://codecov.io/gh/bovee/entab/branch/master/graph/badge.svg?token=106EC5R6M5)](https://codecov.io/gh/bovee/entab) 9 | [![Package on Crates.io](https://img.shields.io/crates/v/entab.svg)](https://crates.io/crates/entab) 10 | [![Package on NPM](https://img.shields.io/npm/v/entab.svg)](https://www.npmjs.com/package/entab) 11 | [![Package on PyPI](https://img.shields.io/pypi/v/entab.svg)](https://pypi.org/project/entab/) 12 | [![DOI](https://zenodo.org/badge/245891414.svg)](https://zenodo.org/badge/latestdoi/245891414) 13 | 14 | 15 | ## Formats 16 | 17 | Entab supports reading a variety of bioinformatics, chemoinformatics, and 18 | other formats. 19 | 20 | - Agilent Chemstation CH, FID, MS, MWD, and UV formats 21 | - Agilent Masshunter DAD format[^1] 22 | - FASTA and FASTQ sequence formats 23 | - FCS flow cytometry format 24 | - Inficon Hapsite mass spectrometry format 25 | - PNG image format 26 | - SAM and BAM alignment formats 27 | - Thermo continuous flow isotope mass spectrometry formats 28 | - Thermo RAW files 29 | - CSV & TSV files 30 | 31 | [^1]: This format uses multiple files so it's not supported in streaming mode or in e.g. the JS bindings. 
32 | 33 | ## CLI 34 | 35 | Entab has a CLI that allows piping in arbitrary files and outputs TSVs. 36 | Install with: 37 | ```sh 38 | cargo install entab-cli 39 | ``` 40 | 41 | Example usage to see how many records are in a file: 42 | ```sh 43 | cat test.fa | entab | sed '1d' | wc -l 44 | ``` 45 | 46 | ## Bindings 47 | 48 | There are bindings for two languages, Python and JavaScript, that support 49 | reading data streams and converting them into a series of records. 50 | 51 | The JavaScript library can be installed with: 52 | ```sh 53 | npm install entab 54 | ``` 55 | The Python library can be installed with: 56 | ```sh 57 | pip install entab 58 | ``` 59 | 60 | The R bindings can be installed from inside R with (note you will need Cargo and a Rust toolchain locally): 61 | ```r 62 | library(devtools) 63 | devtools::install_github("bovee/entab", subdir="entab-r") 64 | ``` 65 | 66 | ## Priorities 67 | 68 | 1. *Handling many formats:* 69 | Support as many record-based, streamable scientific formats as possible. 70 | Formats like HDF5 with complex headers and already existing, well-supported 71 | parsers are not considered a priority though. 72 | 73 | 2. *Correctness:* 74 | Formats should be parsed with good error messages, consistent failure 75 | states, and well-tested code. 76 | 77 | 3. *Language bindings:* 78 | Support using Entab from a decent selection of the programming languages 79 | currently used for science, data science, and related fields. Currently 80 | supporting Python, JavaScript, and experimentally R with possible support 81 | for Julia in the future. 82 | 83 | 4. *Speed:* 84 | Entab should be as fast as possible while still prioritizing the above 85 | issues. Parsers are split into two forms: a fast one that produces a 86 | specialized struct and a slow one that produces a generic record and is 87 | capable of being switched to at run time. 
88 | 89 | 90 | ## Website 91 | 92 | There is a small demo of 93 | [entab running in the browser](https://bovee.github.io/entab/) that can open 94 | small files and plot the data in them. 95 | -------------------------------------------------------------------------------- /entab/tests/data/masshunter_example/AcqData/RJB_Airs2001FIA.m/192_1.xml: -------------------------------------------------------------------------------- 1 | 2 | 2012002 3 | QQQ_G6410B 4 | APCI 5 | atunes.tune.xml 6 | ByPumpTime 7 | 1 8 | true 9 | true 10 | false 11 | 0.07 12 | 13 | 1 14 | 0 15 | ToMS 16 | 17 | 18 | DGasHeater 19 | 350 20 | 350 21 | 22 | 23 | APCIHeater 24 | 450 25 | 450 26 | 27 | 28 | DGasFlow 29 | 6 30 | 6 31 | 32 | 33 | NebulizerPressure 34 | 60 35 | 60 36 | 37 | 38 | VCap 39 | 3000 40 | 3000 41 | 42 | 43 | APCINeedlePos 44 | 5 45 | 5 46 | 47 | 48 | APCINeedleNeg 49 | 5 50 | 5 51 | 52 | 53 | true 54 | 55 | 1 56 | 57 | APCI 58 | Positive 59 | ProductIon 60 | 500 61 | Profile 62 | 0 63 | Fixed 64 | 65 | 66 | 1 67 | 611.3 68 | 100 69 | 612 70 | 0.1 71 | 175 72 | 20 73 | 0 74 | 7 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 5 85 | 86 | 87 | 88 | 1 89 | TIC 90 | 91 | 92 | 0 93 | 1E+07 94 | 95 | 96 | -1 97 | -1 98 | 99 | -------------------------------------------------------------------------------- /entab/tests/data/masshunter_example/AcqData/MSScan.xsd: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | MSScan binary file index record details 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | -------------------------------------------------------------------------------- /entab-js/src/lib.rs: -------------------------------------------------------------------------------- 1 | 
#![allow(clippy::unused_unit)] 2 | mod utils; 3 | 4 | use std::collections::BTreeMap; 5 | use std::convert::AsRef; 6 | use std::io::{Cursor, Read}; 7 | 8 | use entab_base::error::EtError; 9 | use entab_base::readers::{get_reader, RecordReader}; 10 | use entab_base::record::Value; 11 | use js_sys::{Array, Object}; 12 | use serde::Serialize; 13 | use wasm_bindgen::prelude::*; 14 | 15 | #[derive(Serialize)] 16 | pub struct NextRecord<'v> { 17 | value: Option>>, 18 | done: bool, 19 | } 20 | 21 | #[wasm_bindgen] 22 | pub struct Reader { 23 | parser: String, 24 | headers: Vec, 25 | reader: Box, 26 | } 27 | 28 | fn to_js(err: EtError) -> JsValue { 29 | let res = err.to_string().into(); 30 | // technically we could just take a &EtError, but to have a nice function signature we consume 31 | // the err so we should also drop it in here to make clippy happy 32 | drop(err); 33 | res 34 | } 35 | 36 | #[wasm_bindgen] 37 | impl Reader { 38 | #[wasm_bindgen(constructor)] 39 | pub fn new(data: Box<[u8]>, parser: Option) -> Result { 40 | utils::set_panic_hook(); 41 | if data.is_empty() { 42 | return Err(JsValue::from_str("Data is empty or of the wrong type.")); 43 | } 44 | let stream: Box = Box::new(Cursor::new(data)); 45 | 46 | let (reader, parser_used) = get_reader(stream, parser.as_deref(), None).map_err(to_js)?; 47 | let headers = reader.headers(); 48 | Ok(Reader { 49 | parser: parser_used.to_string(), 50 | headers, 51 | reader, 52 | }) 53 | } 54 | 55 | #[wasm_bindgen(getter)] 56 | pub fn parser(&self) -> String { 57 | self.parser.clone() 58 | } 59 | 60 | #[wasm_bindgen(getter)] 61 | pub fn headers(&self) -> JsValue { 62 | let array = Array::new(); 63 | for item in &self.headers { 64 | array.push(&item.into()); 65 | } 66 | array.into() 67 | } 68 | 69 | #[wasm_bindgen(getter)] 70 | pub fn metadata(&self) -> Result { 71 | serde_wasm_bindgen::to_value(&self.reader.metadata()) 72 | .map_err(|_| JsValue::from_str("Error translating metadata")) 73 | } 74 | 75 | 
#[allow(clippy::should_implement_trait)] 76 | #[wasm_bindgen] 77 | pub fn next(&mut self) -> Result { 78 | if let Some(value) = self.reader.next_record().map_err(to_js)? { 79 | let obj: BTreeMap<&str, Value> = 80 | self.headers.iter().map(AsRef::as_ref).zip(value).collect(); 81 | serde_wasm_bindgen::to_value(&NextRecord { 82 | value: Some(obj), 83 | done: false, 84 | }) 85 | .map_err(|_| JsValue::from_str("Error translating record")) 86 | } else { 87 | serde_wasm_bindgen::to_value(&NextRecord { 88 | value: None, 89 | done: true, 90 | }) 91 | .map_err(|_| JsValue::from_str("Error translating record")) 92 | } 93 | } 94 | } 95 | 96 | #[wasm_bindgen(inline_js = " 97 | export function make_reader_iter(proto) { proto[Symbol.iterator] = function () { return this; }; } 98 | ")] 99 | extern "C" { 100 | fn make_reader_iter(obj: &Object); 101 | } 102 | 103 | #[wasm_bindgen(start)] 104 | pub fn start() -> Result<(), JsValue> { 105 | // this is kind of hacky, but we create a simple object and get its prototype so we can add the 106 | // iterable marker onto it to allow e.g. 
`for (row of reader) {}` 107 | let reader = Reader::new(b"\n".to_vec().into_boxed_slice(), Some("csv".to_string()))?; 108 | make_reader_iter(&Object::get_prototype_of(&reader.into())); 109 | Ok(()) 110 | } 111 | -------------------------------------------------------------------------------- /entab-py/src/raw_io_wrapper.rs: -------------------------------------------------------------------------------- 1 | use std::io::{Error, ErrorKind, Read}; 2 | use std::ptr::copy_nonoverlapping; 3 | 4 | use pyo3::prelude::*; 5 | 6 | pub struct RawIoWrapper { 7 | reader: PyObject, 8 | } 9 | 10 | impl RawIoWrapper { 11 | pub fn new(obj: &Bound) -> Self { 12 | let reader = Python::with_gil(|py| obj.to_object(py)); 13 | RawIoWrapper { reader } 14 | } 15 | } 16 | 17 | impl Read for RawIoWrapper { 18 | fn read(&mut self, buf: &mut [u8]) -> Result { 19 | // TODO: it would be pass the buf itself into `readinto` so we're not 20 | // creating so many copies in here, but I can't figure out how to wrap 21 | // that into a python object that implements PyBufferProtocol properly 22 | Python::with_gil(|py| { 23 | let py_data = self 24 | .reader 25 | .call_method1(py, "read", (buf.len(),)) 26 | .map_err(|_| { 27 | // TODO: get the error message from the python error? 
28 | Error::new(ErrorKind::Other, "`read` failed") 29 | })?; 30 | 31 | let amt_read = if let Ok(bytes) = py_data.extract::>(py) { 32 | unsafe { 33 | copy_nonoverlapping::(bytes.as_ptr(), buf.as_mut_ptr(), bytes.len()); 34 | } 35 | bytes.len() 36 | } else if let Ok(string) = py_data.extract::(py) { 37 | let bytes = string.as_bytes(); 38 | unsafe { 39 | copy_nonoverlapping::(bytes.as_ptr(), buf.as_mut_ptr(), bytes.len()); 40 | } 41 | bytes.len() 42 | } else { 43 | return Err(Error::new( 44 | ErrorKind::Other, 45 | "`read` returned an unknown object", 46 | )); 47 | }; 48 | Ok(amt_read) 49 | }) 50 | } 51 | } 52 | 53 | #[cfg(test)] 54 | mod tests { 55 | use super::*; 56 | 57 | use pyo3::types::{IntoPyDict, PyFloat}; 58 | 59 | #[test] 60 | fn test_io_wrapper_bad_type() -> Result<(), Error> { 61 | pyo3::prepare_freethreaded_python(); 62 | Python::with_gil(|py| { 63 | let mut scratch = Vec::new(); 64 | 65 | let num = PyFloat::new_bound(py, 2.); 66 | let mut wrapper = RawIoWrapper::new(num.as_ref()); 67 | assert!(wrapper.read_to_end(&mut scratch).is_err()); 68 | Ok(()) 69 | }) 70 | } 71 | 72 | #[test] 73 | fn test_io_wrapper_stringio() -> Result<(), Error> { 74 | pyo3::prepare_freethreaded_python(); 75 | Python::with_gil(|py| { 76 | let locals = [("io", py.import_bound("io")?)].into_py_dict_bound(py); 77 | let mut scratch = Vec::new(); 78 | 79 | let code = "io.StringIO('>test\\nACGT')"; 80 | let buffer: PyObject = py.eval_bound(code, None, Some(&locals))?.extract()?; 81 | let mut wrapper = RawIoWrapper::new(buffer.bind(py)); 82 | assert_eq!(wrapper.read_to_end(&mut scratch)?, 10); 83 | assert_eq!(scratch, b">test\nACGT"); 84 | Ok(()) 85 | }) 86 | } 87 | 88 | #[test] 89 | fn test_io_wrapper_bytesio() -> Result<(), Error> { 90 | pyo3::prepare_freethreaded_python(); 91 | Python::with_gil(|py| { 92 | let locals = [("io", py.import_bound("io")?)].into_py_dict_bound(py); 93 | let mut scratch = Vec::new(); 94 | 95 | let code = "io.StringIO('>seq\\nTGCAT')"; 96 | let buffer: 
PyObject = py.eval_bound(code, None, Some(&locals))?.extract()?; 97 | let mut wrapper = RawIoWrapper::new(buffer.bind(py)); 98 | assert_eq!(wrapper.read_to_end(&mut scratch)?, 10); 99 | assert_eq!(scratch, b">seq\nTGCAT"); 100 | 101 | Ok(()) 102 | }) 103 | } 104 | } 105 | -------------------------------------------------------------------------------- /entab/src/parsers/mod.rs: -------------------------------------------------------------------------------- 1 | use alloc::format; 2 | use core::any::type_name; 3 | 4 | use crate::EtError; 5 | 6 | /// Readers for formats generated by Agilent instruments 7 | pub mod agilent; 8 | /// Common low-level readers (ints, slices, etc) 9 | pub mod common; 10 | /// Reader for FASTA bioinformatics format 11 | pub mod fasta; 12 | /// Reader for FASTQ bioinformatics format 13 | pub mod fastq; 14 | /// Reader for flow data 15 | pub mod flow; 16 | /// Reader for Inficon Hapsite MS formats 17 | pub mod inficon; 18 | /// Reader for PNG image format 19 | #[cfg(feature = "std")] 20 | pub mod png; 21 | /// Reader for BAM/SAM bioinformatics formats 22 | pub mod sam; 23 | /// Readers for Thermo formats 24 | pub mod thermo; 25 | /// Readers for tab-seperated text format 26 | pub mod tsv; 27 | /// Helpers for TSV parsing 28 | pub mod tsv_inference; 29 | // /// Reader for generic XML 30 | // pub mod xml; 31 | 32 | /// The default implementation is `impl FromSlice for ()` to simplify implementations for 33 | /// e.g. state or other objects that don't read from the buffer. 34 | pub trait FromSlice<'b: 's, 's>: Sized + Default { 35 | /// State is used to track information outside of the current slice scope that's used to create 36 | /// the value returned. 37 | type State: core::fmt::Debug + Default + 's; 38 | 39 | /// Given a slice and state, determine how much of the slice needs to be parsed to return a 40 | /// value and update `consumed` with that amount. 
If no value can be parsed, return Ok(false), 41 | /// otherwise return Ok(true) if a value can be parsed. 42 | /// 43 | /// # Errors 44 | /// If the parser fails or if there's not enough data in the buffer, an `EtError` will be returned. 45 | fn parse( 46 | _buffer: &[u8], 47 | _eof: bool, 48 | _consumed: &mut usize, 49 | _state: &mut Self::State, 50 | ) -> Result { 51 | Ok(true) 52 | } 53 | 54 | /// Given a slice and state, update Self by reading the information about the current record 55 | /// out. 56 | /// 57 | /// # Errors 58 | /// If buffer can not be interpreted into `Self`, return `EtError`. 59 | fn get(&mut self, _buffer: &'b [u8], _state: &'s Self::State) -> Result<(), EtError> { 60 | Ok(()) 61 | } 62 | 63 | /// Essentially the same as `extract` below, but doesn't update the state or consume any space. 64 | /// 65 | /// Use only for simple types with defined sizes like u8, i32, &[u8], etc. Using this with more 66 | /// complex types that rely upon updating `state` in between reads will cause bad and confusing 67 | /// things to happen! 68 | /// 69 | /// # Errors 70 | /// If parsing fails, an error will be returned. 71 | fn extract(buffer: &'b [u8], state: &'s Self::State) -> Result 72 | where 73 | Self::State: 'static, 74 | Self: 's, 75 | { 76 | let mut val = Self::default(); 77 | Self::get(&mut val, buffer, state)?; 78 | Ok(val) 79 | } 80 | } 81 | 82 | /// Pull a `T` out of the slice, updating state appropriately and incrementing `consumed` to 83 | /// account for bytes used. 84 | /// 85 | /// # Errors 86 | /// If an error extracting a value occured or if slice needs to be extended, return `EtError`. 87 | #[inline] 88 | pub(crate) fn extract<'b: 's, 's, T>( 89 | buffer: &'b [u8], 90 | consumed: &mut usize, 91 | state: &'s mut >::State, 92 | ) -> Result 93 | where 94 | T: FromSlice<'b, 's> + Default, 95 | { 96 | match extract_opt(buffer, false, consumed, state)? 
{ 97 | None => Err(format!( 98 | "Tried to extract {}, but parser indicated no more.", 99 | type_name::() 100 | ) 101 | .into()), 102 | Some(value) => Ok(value), 103 | } 104 | } 105 | 106 | /// Pull a `T` out of the slice, updating state appropriately and incrementing `consumed` to 107 | /// account for bytes used. 108 | /// 109 | /// # Errors 110 | /// If an error extracting a value occured or if slice needs to be extended, return `EtError`. 111 | #[inline] 112 | pub(crate) fn extract_opt<'b: 's, 's, T>( 113 | buffer: &'b [u8], 114 | eof: bool, 115 | consumed: &mut usize, 116 | state: &'s mut >::State, 117 | ) -> Result, EtError> 118 | where 119 | T: FromSlice<'b, 's> + Default, 120 | { 121 | let start = *consumed; 122 | if !T::parse(&buffer[start..], eof, consumed, state)? { 123 | return Ok(None); 124 | } 125 | let mut record = T::default(); 126 | T::get(&mut record, &buffer[start..*consumed], state)?; 127 | Ok(Some(record)) 128 | } 129 | 130 | /// The endianness of a number used to extract such a number. 
131 | #[derive(Clone, Copy, Debug, Default)] 132 | pub enum Endian { 133 | /// A number stored in big-endian format 134 | Big, 135 | /// A number stored in little-endian format 136 | #[default] 137 | Little, 138 | } 139 | -------------------------------------------------------------------------------- /entab/benches/benchmarks.rs: -------------------------------------------------------------------------------- 1 | use std::fs::File; 2 | 3 | use criterion::{black_box, criterion_group, criterion_main, Criterion}; 4 | 5 | use entab::compression::decompress; 6 | use entab::parsers::agilent::chemstation::ChemstationMsReader; 7 | use entab::parsers::fasta::FastaReader; 8 | use entab::parsers::fastq::{FastqReader, FastqRecord, FastqState}; 9 | use entab::parsers::png::PngReader; 10 | use entab::parsers::sam::BamReader; 11 | use entab::readers::{get_reader, init_state}; 12 | 13 | fn benchmark_raw_readers(c: &mut Criterion) { 14 | let mut raw_readers = c.benchmark_group("raw readers"); 15 | raw_readers.significance_level(0.01).sample_size(500); 16 | 17 | raw_readers.bench_function("chemstation reader", |b| { 18 | b.iter(|| { 19 | let f = File::open("tests/data/carotenoid_extract.d/MSD1.MS").unwrap(); 20 | let mut reader = ChemstationMsReader::new(f, None).unwrap(); 21 | while let Some(record) = reader.next().unwrap() { 22 | black_box(record); 23 | } 24 | }) 25 | }); 26 | 27 | raw_readers.bench_function("fasta reader", |b| { 28 | b.iter(|| { 29 | let f = File::open("tests/data/sequence.fasta").unwrap(); 30 | let mut reader = FastaReader::new(f, None).unwrap(); 31 | while let Some(record) = reader.next().unwrap() { 32 | black_box(record); 33 | } 34 | }) 35 | }); 36 | 37 | raw_readers.bench_function("fastq reader", |b| { 38 | b.iter(|| { 39 | let f = File::open("tests/data/test.fastq").unwrap(); 40 | let mut reader = FastqReader::new(f, None).unwrap(); 41 | while let Some(record) = reader.next().unwrap() { 42 | black_box(record); 43 | } 44 | }) 45 | }); 46 | 47 | 
raw_readers.bench_function("fastq [unsafe] reader", |b| { 48 | b.iter(|| { 49 | let f = File::open("tests/data/test.fastq").unwrap(); 50 | let (mut rb, mut state) = init_state::(f, None).unwrap(); 51 | let mut record = FastqRecord::default(); 52 | while unsafe { rb.next_into(&mut state, &mut record).unwrap() } { 53 | let FastqRecord { sequence, .. } = &record; 54 | black_box(sequence); 55 | } 56 | }) 57 | }); 58 | 59 | raw_readers.bench_function("png reader", |b| { 60 | b.iter(|| { 61 | let f = File::open("tests/data/bmp_24.png").unwrap(); 62 | let mut reader = PngReader::new(f, None).unwrap(); 63 | while let Some(record) = reader.next().unwrap() { 64 | black_box(record); 65 | } 66 | }) 67 | }); 68 | 69 | raw_readers.bench_function("bam reader", |b| { 70 | b.iter(|| { 71 | let f = File::open("tests/data/test.bam").unwrap(); 72 | let (rb, _) = decompress(f).unwrap(); 73 | let mut reader = BamReader::new(rb, None).unwrap(); 74 | while let Some(record) = reader.next().unwrap() { 75 | black_box(record); 76 | } 77 | }) 78 | }); 79 | } 80 | 81 | fn benchmark_generic_readers(c: &mut Criterion) { 82 | let mut generic_readers = c.benchmark_group("generic readers"); 83 | generic_readers.significance_level(0.01).sample_size(500); 84 | 85 | generic_readers.bench_function("generic chemstation reader", |b| { 86 | b.iter(|| { 87 | let f = File::open("tests/data/carotenoid_extract.d/MSD1.MS").unwrap(); 88 | let (mut reader, _) = get_reader(f, Some("chemstation_ms"), None).unwrap(); 89 | while let Some(record) = reader.next_record().unwrap() { 90 | black_box(record); 91 | } 92 | }) 93 | }); 94 | 95 | generic_readers.bench_function("generic fastq reader", |b| { 96 | b.iter(|| { 97 | let f = File::open("tests/data/test.fastq").unwrap(); 98 | let (mut reader, _) = get_reader(f, Some("fastq"), None).unwrap(); 99 | while let Some(record) = reader.next_record().unwrap() { 100 | black_box(record); 101 | } 102 | }) 103 | }); 104 | 105 | generic_readers.bench_function("flow reader", |b| { 
106 | b.iter(|| { 107 | let f = File::open("tests/data/HTS_BD_LSR_II_Mixed_Specimen_001_D6_D06.fcs").unwrap(); 108 | let (mut reader, _) = get_reader(f, Some("flow"), None).unwrap(); 109 | while let Some(record) = reader.next_record().unwrap() { 110 | black_box(record); 111 | } 112 | }) 113 | }); 114 | 115 | generic_readers.bench_function("png reader", |b| { 116 | b.iter(|| { 117 | let f = File::open("tests/data/bmp_24.png").unwrap(); 118 | let (mut reader, _) = get_reader(f, Some("png"), None).unwrap(); 119 | while let Some(record) = reader.next_record().unwrap() { 120 | black_box(record); 121 | } 122 | }) 123 | }); 124 | } 125 | 126 | criterion_group!(benches, benchmark_raw_readers, benchmark_generic_readers); 127 | criterion_main!(benches); 128 | -------------------------------------------------------------------------------- /entab/tests/data/sequence.fasta: -------------------------------------------------------------------------------- 1 | >gi|9626372|ref|NC_001422.1| Coliphage phi-X174, complete genome 2 | GAGTTTTATCGCTTCCATGACGCAGAAGTTAACACTTTCGGATATTTCTGATGAGTCGAAAAATTATCTT 3 | GATAAAGCAGGAATTACTACTGCTTGTTTACGAATTAAATCGAAGTGGACTGCTGGCGGAAAATGAGAAA 4 | ATTCGACCTATCCTTGCGCAGCTCGAGAAGCTCTTACTTTGCGACCTTTCGCCATCAACTAACGATTCTG 5 | TCAAAAACTGACGCGTTGGATGAGGAGAAGTGGCTTAATATGCTTGGCACGTTCGTCAAGGACTGGTTTA 6 | GATATGAGTCACATTTTGTTCATGGTAGAGATTCTCTTGTTGACATTTTAAAAGAGCGTGGATTACTATC 7 | TGAGTCCGATGCTGTTCAACCACTAATAGGTAAGAAATCATGAGTCAAGTTACTGAACAATCCGTACGTT 8 | TCCAGACCGCTTTGGCCTCTATTAAGCTCATTCAGGCTTCTGCCGTTTTGGATTTAACCGAAGATGATTT 9 | CGATTTTCTGACGAGTAACAAAGTTTGGATTGCTACTGACCGCTCTCGTGCTCGTCGCTGCGTTGAGGCT 10 | TGCGTTTATGGTACGCTGGACTTTGTGGGATACCCTCGCTTTCCTGCTCCTGTTGAGTTTATTGCTGCCG 11 | TCATTGCTTATTATGTTCATCCCGTCAACATTCAAACGGCCTGTCTCATCATGGAAGGCGCTGAATTTAC 12 | GGAAAACATTATTAATGGCGTCGAGCGTCCGGTTAAAGCCGCTGAATTGTTCGCGTTTACCTTGCGTGTA 13 | CGCGCAGGAAACACTGACGTTCTTACTGACGCAGAAGAAAACGTGCGTCAAAAATTACGTGCGGAAGGAG 14 | 
TGATGTAATGTCTAAAGGTAAAAAACGTTCTGGCGCTCGCCCTGGTCGTCCGCAGCCGTTGCGAGGTACT 15 | AAAGGCAAGCGTAAAGGCGCTCGTCTTTGGTATGTAGGTGGTCAACAATTTTAATTGCAGGGGCTTCGGC 16 | CCCTTACTTGAGGATAAATTATGTCTAATATTCAAACTGGCGCCGAGCGTATGCCGCATGACCTTTCCCA 17 | TCTTGGCTTCCTTGCTGGTCAGATTGGTCGTCTTATTACCATTTCAACTACTCCGGTTATCGCTGGCGAC 18 | TCCTTCGAGATGGACGCCGTTGGCGCTCTCCGTCTTTCTCCATTGCGTCGTGGCCTTGCTATTGACTCTA 19 | CTGTAGACATTTTTACTTTTTATGTCCCTCATCGTCACGTTTATGGTGAACAGTGGATTAAGTTCATGAA 20 | GGATGGTGTTAATGCCACTCCTCTCCCGACTGTTAACACTACTGGTTATATTGACCATGCCGCTTTTCTT 21 | GGCACGATTAACCCTGATACCAATAAAATCCCTAAGCATTTGTTTCAGGGTTATTTGAATATCTATAACA 22 | ACTATTTTAAAGCGCCGTGGATGCCTGACCGTACCGAGGCTAACCCTAATGAGCTTAATCAAGATGATGC 23 | TCGTTATGGTTTCCGTTGCTGCCATCTCAAAAACATTTGGACTGCTCCGCTTCCTCCTGAGACTGAGCTT 24 | TCTCGCCAAATGACGACTTCTACCACATCTATTGACATTATGGGTCTGCAAGCTGCTTATGCTAATTTGC 25 | ATACTGACCAAGAACGTGATTACTTCATGCAGCGTTACCATGATGTTATTTCTTCATTTGGAGGTAAAAC 26 | CTCTTATGACGCTGACAACCGTCCTTTACTTGTCATGCGCTCTAATCTCTGGGCATCTGGCTATGATGTT 27 | GATGGAACTGACCAAACGTCGTTAGGCCAGTTTTCTGGTCGTGTTCAACAGACCTATAAACATTCTGTGC 28 | CGCGTTTCTTTGTTCCTGAGCATGGCACTATGTTTACTCTTGCGCTTGTTCGTTTTCCGCCTACTGCGAC 29 | TAAAGAGATTCAGTACCTTAACGCTAAAGGTGCTTTGACTTATACCGATATTGCTGGCGACCCTGTTTTG 30 | TATGGCAACTTGCCGCCGCGTGAAATTTCTATGAAGGATGTTTTCCGTTCTGGTGATTCGTCTAAGAAGT 31 | TTAAGATTGCTGAGGGTCAGTGGTATCGTTATGCGCCTTCGTATGTTTCTCCTGCTTATCACCTTCTTGA 32 | AGGCTTCCCATTCATTCAGGAACCGCCTTCTGGTGATTTGCAAGAACGCGTACTTATTCGCCACCATGAT 33 | TATGACCAGTGTTTCCAGTCCGTTCAGTTGTTGCAGTGGAATAGTCAGGTTAAATTTAATGTGACCGTTT 34 | ATCGCAATCTGCCGACCACTCGCGATTCAATCATGACTTCGTGATAAAAGATTGAGTGTGAGGTTATAAC 35 | GCCGAAGCGGTAAAAATTTTAATTTTTGCCGCTGAGGGGTTGACCAAGCGAAGCGCGGTAGGTTTTCTGC 36 | TTAGGAGTTTAATCATGTTTCAGACTTTTATTTCTCGCCATAATTCAAACTTTTTTTCTGATAAGCTGGT 37 | TCTCACTTCTGTTACTCCAGCTTCTTCGGCACCTGTTTTACAGACACCTAAAGCTACATCGTCAACGTTA 38 | TATTTTGATAGTTTGACGGTTAATGCTGGTAATGGTGGTTTTCTTCATTGCATTCAGATGGATACATCTG 39 | TCAACGCCGCTAATCAGGTTGTTTCTGTTGGTGCTGATATTGCTTTTGATGCCGACCCTAAATTTTTTGC 40 | 
CTGTTTGGTTCGCTTTGAGTCTTCTTCGGTTCCGACTACCCTCCCGACTGCCTATGATGTTTATCCTTTG 41 | AATGGTCGCCATGATGGTGGTTATTATACCGTCAAGGACTGTGTGACTATTGACGTCCTTCCCCGTACGC 42 | CGGGCAATAACGTTTATGTTGGTTTCATGGTTTGGTCTAACTTTACCGCTACTAAATGCCGCGGATTGGT 43 | TTCGCTGAATCAGGTTATTAAAGAGATTATTTGTCTCCAGCCACTTAAGTGAGGTGATTTATGTTTGGTG 44 | CTATTGCTGGCGGTATTGCTTCTGCTCTTGCTGGTGGCGCCATGTCTAAATTGTTTGGAGGCGGTCAAAA 45 | AGCCGCCTCCGGTGGCATTCAAGGTGATGTGCTTGCTACCGATAACAATACTGTAGGCATGGGTGATGCT 46 | GGTATTAAATCTGCCATTCAAGGCTCTAATGTTCCTAACCCTGATGAGGCCGCCCCTAGTTTTGTTTCTG 47 | GTGCTATGGCTAAAGCTGGTAAAGGACTTCTTGAAGGTACGTTGCAGGCTGGCACTTCTGCCGTTTCTGA 48 | TAAGTTGCTTGATTTGGTTGGACTTGGTGGCAAGTCTGCCGCTGATAAAGGAAAGGATACTCGTGATTAT 49 | CTTGCTGCTGCATTTCCTGAGCTTAATGCTTGGGAGCGTGCTGGTGCTGATGCTTCCTCTGCTGGTATGG 50 | TTGACGCCGGATTTGAGAATCAAAAAGAGCTTACTAAAATGCAACTGGACAATCAGAAAGAGATTGCCGA 51 | GATGCAAAATGAGACTCAAAAAGAGATTGCTGGCATTCAGTCGGCGACTTCACGCCAGAATACGAAAGAC 52 | CAGGTATATGCACAAAATGAGATGCTTGCTTATCAACAGAAGGAGTCTACTGCTCGCGTTGCGTCTATTA 53 | TGGAAAACACCAATCTTTCCAAGCAACAGCAGGTTTCCGAGATTATGCGCCAAATGCTTACTCAAGCTCA 54 | AACGGCTGGTCAGTATTTTACCAATGACCAAATCAAAGAAATGACTCGCAAGGTTAGTGCTGAGGTTGAC 55 | TTAGTTCATCAGCAAACGCAGAATCAGCGGTATGGCTCTTCTCATATTGGCGCTACTGCAAAGGATATTT 56 | CTAATGTCGTCACTGATGCTGCTTCTGGTGTGGTTGATATTTTTCATGGTATTGATAAAGCTGTTGCCGA 57 | TACTTGGAACAATTTCTGGAAAGACGGTAAAGCTGATGGTATTGGCTCTAATTTGTCTAGGAAATAACCG 58 | TCAGGATTGACACCCTCCCAATTGTATGTTTTCATGCCTCCAAATCTTGGAGGCTTTTTTATGGTTCGTT 59 | CTTATTACCCTTCTGAATGTCACGCTGATTATTTTGACTTTGAGCGTATCGAGGCTCTTAAACCTGCTAT 60 | TGAGGCTTGTGGCATTTCTACTCTTTCTCAATCCCCAATGCTTGGCTTCCATAAGCAGATGGATAACCGC 61 | ATCAAGCTCTTGGAAGAGATTCTGTCTTTTCGTATGCAGGGCGTTGAGTTCGATAATGGTGATATGTATG 62 | TTGACGGCCATAAGGCTGCTTCTGACGTTCGTGATGAGTTTGTATCTGTTACTGAGAAGTTAATGGATGA 63 | ATTGGCACAATGCTACAATGTGCTCCCCCAACTTGATATTAATAACACTATAGACCACCGCCCCGAAGGG 64 | GACGAAAAATGGTTTTTAGAGAACGAGAAGACGGTTACGCAGTTTTGCCGCAAGCTGGCTGCTGAACGCC 65 | CTCTTAAGGATATTCGCGATGAGTATAATTACCCCAAAAAGAAAGGTATTAAGGATGAGTGTTCAAGATT 66 | 
GCTGGAGGCCTCCACTATGAAATCGCGTAGAGGCTTTGCTATTCAGCGTTTGATGAATGCAATGCGACAG 67 | GCTCATGCTGATGGTTGGTTTATCGTTTTTGACACTCTCACGTTGGCTGACGACCGATTAGAGGCGTTTT 68 | ATGATAATCCCAATGCTTTGCGTGACTATTTTCGTGATATTGGTCGTATGGTTCTTGCTGCCGAGGGTCG 69 | CAAGGCTAATGATTCACACGCCGACTGCTATCAGTATTTTTGTGTGCCTGAGTATGGTACAGCTAATGGC 70 | CGTCTTCATTTCCATGCGGTGCACTTTATGCGGACACTTCCTACAGGTAGCGTTGACCCTAATTTTGGTC 71 | GTCGGGTACGCAATCGCCGCCAGTTAAATAGCTTGCAAAATACGTGGCCTTATGGTTACAGTATGCCCAT 72 | CGCAGTTCGCTACACGCAGGACGCTTTTTCACGTTCTGGTTGGTTGTGGCCTGTTGATGCTAAAGGTGAG 73 | CCGCTTAAAGCTACCAGTTATATGGCTGTTGGTTTCTATGTGGCTAAATACGTTAACAAAAAGTCAGATA 74 | TGGACCTTGCTGCTAAAGGTCTAGGAGCTAAAGAATGGAACAACTCACTAAAAACCAAGCTGTCGCTACT 75 | TCCCAAGAAGCTGTTCAGAATCAGAATGAGCCGCAACTTCGGGATGAAAATGCTCACAATGACAAATCTG 76 | TCCACGGAGTGCTTAATCCAACTTACCAAGCTGGGTTACGACGCGACGCCGTTCAACCAGATATTGAAGC 77 | AGAACGCAAAAAGAGAGATGAGATTGAGGCTGGGAAAAGTTACTGTAGCCGACGTTTTGGCGGCGCAACC 78 | TGTGACGACAAATCTGCTCAAATTTATGCGCGCTTCGATAAAAATGATTGGCGTATCCAACCTGCA 79 | 80 | -------------------------------------------------------------------------------- /entab/tests/data/chemstation_mwd.d/RUN.M/ACQ.MS: -------------------------------------------------------------------------------- 1 | [Signature] 2 | ID605=o31Q4bBIfVVn1bfDlqrTJDYt00000000 3 | [Version] 4 | ID=G2715 Version: A.09.03 5 | [General] 6 | Type=1 7 | Run Duration=900.00 8 | NoLimit=1 9 | NoMSD=1 10 | PeakWidth=6.00 11 | FastScan=0 12 | FastScanDataRecon=0 13 | FastScanAcq=0 14 | Solvent Delay=0.00 15 | IonizationMode=6 16 | Tune File=atunes.tun 17 | Acq Mode=0 18 | Acq Mode 2=0 19 | Acq Mode 3=0 20 | Acq Mode 4=0 21 | Signal Summary=350-1400 22 | Signal Summary 2= 23 | Signal Summary 3= 24 | Signal Summary 4= 25 | Signal Active=1 26 | Signal Active 2=0 27 | Signal Active 3=0 28 | Signal Active 4=0 29 | Cycle Time=100 30 | Cycle Time 2=0 31 | Cycle Time 3=0 32 | Cycle Time 4=0 33 | Polarity=0 34 | Polarity 2=0 35 | Polarity 3=0 36 | Polarity 4=0 37 | Sim on target mass=0 38 | Sim on target mass 2=0 39 | 
Sim on target mass 3=0 40 | Sim on target mass 4=0 41 | Group Ions=1 42 | Group Ions 2=1 43 | Group Ions 3=1 44 | Group Ions 4=1 45 | Zones=4 46 | Scan Groups=1 47 | Scan Groups 2=1 48 | Scan Groups 3=1 49 | Scan Groups 4=1 50 | Data Curves=0 51 | Data Curve Rate=1000 52 | [Parameters] 53 | ID 1=1 54 | Mode 1=1 55 | Value 1=3000.000000,3000.000000 56 | ID 2=5 57 | Mode 2=1 58 | Value 2=3000.000000,3000.000000 59 | ID 3=7 60 | Mode 3=1 61 | Value 3=5.000000,4.000000 62 | ID 4=8 63 | Mode 4=1 64 | Value 4=5.000000,40.000000 65 | ID 5=33 66 | Mode 5=1 67 | Value 5=4000.000000,4000.000000 68 | [DynamicParm 1] 69 | GlobalId=11 70 | Enable=0 71 | DynamicValues= 72 | [DynamicParm 2 1] 73 | GlobalId=-1 74 | Enable=0 75 | DynamicValues= 76 | [DynamicParm 3 1] 77 | GlobalId=-1 78 | Enable=0 79 | DynamicValues= 80 | [DynamicParm 4 1] 81 | GlobalId=-1 82 | Enable=0 83 | DynamicValues= 84 | [Zone 1] 85 | ID=6 86 | On=1 87 | Soft Max=350.00 88 | Set Point=350.00 89 | Type=0 90 | [Zone 2] 91 | ID=7 92 | On=1 93 | Soft Max=500.00 94 | Set Point=425.00 95 | Type=0 96 | [Zone 3] 97 | ID=13 98 | On=1 99 | Soft Max=13.00 100 | Set Point=6.00 101 | Type=0 102 | [Zone 4] 103 | ID=14 104 | On=1 105 | Soft Max=60.00 106 | Set Point=60.00 107 | Type=0 108 | [Ions 1] 109 | Group Name=Group 1 110 | Start Time=0.00 111 | Row OnOff=1 112 | Resolution=0 113 | Gain=1.0 114 | Plot Index 1=0 115 | Plot Index 2=0 116 | Number of Ions=1 117 | Ion 1=195.0,580 118 | Frgmtr Ion 1=70 119 | [Ions 2 1] 120 | Group Name=Group 1 121 | Start Time=0.00 122 | Row OnOff=1 123 | Resolution=0 124 | Gain=1.0 125 | Plot Index 1=0 126 | Plot Index 2=0 127 | Number of Ions=1 128 | Ion 1=195.0,580 129 | Frgmtr Ion 1=70 130 | [Ions 3 1] 131 | Group Name=Group 1 132 | Start Time=0.00 133 | Row OnOff=1 134 | Resolution=0 135 | Gain=1.0 136 | Plot Index 1=0 137 | Plot Index 2=0 138 | Number of Ions=1 139 | Ion 1=195.0,580 140 | Frgmtr Ion 1=70 141 | [Ions 4 1] 142 | Group Name=Group 1 143 | Start Time=0.00 144 | Row 
OnOff=1 145 | Resolution=0 146 | Gain=1.0 147 | Plot Index 1=0 148 | Plot Index 2=0 149 | Number of Ions=1 150 | Ion 1=195.0,580 151 | Frgmtr Ion 1=70 152 | [Scan 1] 153 | Start Time=0.00 154 | Row OnOff=1 155 | Low Mass=350.0 156 | High Mass=1400.0 157 | Threshold=100 158 | Sampling=5 159 | Stepsize=0.10 160 | Gain=1.0 161 | Fragmentor=75 162 | Low Plot 1=1200.0 163 | High Plot 1=1350.0 164 | Low Plot 2=1200.0 165 | High Plot 2=1350.0 166 | [Scan 2 1] 167 | Start Time=0.00 168 | Row OnOff=0 169 | Low Mass=200.0 170 | High Mass=400.0 171 | Threshold=150 172 | Sampling=12 173 | Stepsize=0.10 174 | Gain=1.0 175 | Fragmentor=70 176 | Low Plot 1=200.0 177 | High Plot 1=400.0 178 | Low Plot 2=200.0 179 | High Plot 2=400.0 180 | [Scan 3 1] 181 | Start Time=0.00 182 | Row OnOff=1 183 | Low Mass=100.0 184 | High Mass=1000.0 185 | Threshold=150 186 | Sampling=4 187 | Stepsize=0.10 188 | Gain=1.0 189 | Fragmentor=70 190 | Low Plot 1=100.0 191 | High Plot 1=550.0 192 | Low Plot 2=100.0 193 | High Plot 2=550.0 194 | [Scan 4 1] 195 | Start Time=0.00 196 | Row OnOff=1 197 | Low Mass=100.0 198 | High Mass=1000.0 199 | Threshold=150 200 | Sampling=4 201 | Stepsize=0.10 202 | Gain=1.0 203 | Fragmentor=70 204 | Low Plot 1=100.0 205 | High Plot 1=550.0 206 | Low Plot 2=100.0 207 | High Plot 2=550.0 208 | [Time Events] 209 | Number Events=0 210 | [Filters] 211 | Mass Type=1 212 | Mass Param=0.300 213 | Mass Sigma=2.00 214 | Time Type=1 215 | Time Param=3.000 216 | Time Sigma=2.00 217 | [Filters 2] 218 | Mass Type=1 219 | Mass Param=0.300 220 | Mass Sigma=2.00 221 | Time Type=1 222 | Time Param=3.000 223 | Time Sigma=2.00 224 | [Filters 3] 225 | Mass Type=1 226 | Mass Param=0.300 227 | Mass Sigma=2.00 228 | Time Type=1 229 | Time Param=3.000 230 | Time Sigma=2.00 231 | [Filters 4] 232 | Mass Type=1 233 | Mass Param=0.300 234 | Mass Sigma=2.00 235 | Time Type=1 236 | Time Param=3.000 237 | Time Sigma=2.00 238 | [Plot] 239 | Plot Type 1=0 240 | Plot Type 2=1 241 | [FractionCollection] 
242 | Mode=0 243 | CSV File=BASEMASS.CSV 244 | Signal 1=1 245 | Signal 2=0 246 | Signal 3=0 247 | Signal 4=0 248 | Detector=0 249 | Standard Positive Adducts=0,0,0,0,0 250 | Standard Negative Adducts=0,0 251 | User-defined Positive Adducts=0, ,0, ,0, 252 | User-defined Negative Adducts=0, ,0, ,0, 253 | Number Of Base Masses=5 254 | Base Masses 1=1292.6 255 | Base Masses 2=1297.6 256 | Base Masses 3=1299.6 257 | Base Masses 4=1301.7 258 | Base Masses 5=1302.6 259 | Min Peak Width=0.00 260 | Max Peak Width=1.50 261 | Time Limit=15.00 262 | MSD Delay=0.01 263 | MSD Threshold=700 264 | MSD Peak Slope=0 265 | Other Delay=0.00 266 | Other Threshold=5.0 267 | Other Peak Slope=10 268 | MSD Window=0.2 269 | MSD Output Range=2000 270 | Other Full Scale=0.10 271 | -------------------------------------------------------------------------------- /entab/src/compression.rs: -------------------------------------------------------------------------------- 1 | #[cfg(feature = "std")] 2 | use alloc::boxed::Box; 3 | use core::convert::TryInto; 4 | 5 | #[cfg(all(feature = "compression", feature = "std"))] 6 | use bzip2::read::BzDecoder; 7 | #[cfg(feature = "std")] 8 | use flate2::read::MultiGzDecoder; 9 | #[cfg(all(feature = "compression", feature = "std"))] 10 | use xz2::read::XzDecoder; 11 | #[cfg(all(feature = "compression", feature = "std"))] 12 | use zstd::stream::read::Decoder as ZstdDecoder; 13 | 14 | use crate::buffer::ReadBuffer; 15 | use crate::filetype::FileType; 16 | use crate::EtError; 17 | 18 | /// Decompress the contents of a `ReadBuffer` into a new `ReadBuffer` and return the type of compression. 19 | /// 20 | /// # Errors 21 | /// If reading fails or if the stream can't be decompressed, return `EtError`. 
22 | #[cfg(all(feature = "compression", feature = "std"))] 23 | pub fn decompress<'r, B>(data: B) -> Result<(ReadBuffer<'r>, Option), EtError> 24 | where 25 | B: TryInto>, 26 | EtError: From<>>::Error>, 27 | { 28 | let mut reader = data.try_into()?; 29 | let file_type = reader.sniff_filetype()?; 30 | Ok(match file_type { 31 | FileType::Gzip => { 32 | let gz_reader = MultiGzDecoder::new(reader.into_box_read()); 33 | ( 34 | ReadBuffer::from_reader(Box::new(gz_reader), None)?, 35 | Some(file_type), 36 | ) 37 | } 38 | FileType::Bzip => { 39 | let bz_reader = BzDecoder::new(reader.into_box_read()); 40 | ( 41 | ReadBuffer::from_reader(Box::new(bz_reader), None)?, 42 | Some(file_type), 43 | ) 44 | } 45 | FileType::Lzma => { 46 | let xz_reader = XzDecoder::new(reader.into_box_read()); 47 | ( 48 | ReadBuffer::from_reader(Box::new(xz_reader), None)?, 49 | Some(file_type), 50 | ) 51 | } 52 | FileType::Zstd => { 53 | let zstd_reader = ZstdDecoder::new(reader.into_box_read())?; 54 | ( 55 | ReadBuffer::from_reader(Box::new(zstd_reader), None)?, 56 | Some(file_type), 57 | ) 58 | } 59 | _ => (reader, None), 60 | }) 61 | } 62 | 63 | /// Decompress a `Read` stream and returns the inferred file type. 64 | /// 65 | /// # Errors 66 | /// If reading fails or if the stream can't be decompressed, return `EtError`. 
67 | #[cfg(all(not(feature = "compression"), feature = "std"))] 68 | pub fn decompress<'r, B>(data: B) -> Result<(ReadBuffer<'r>, Option), EtError> 69 | where 70 | B: TryInto>, 71 | EtError: From<>>::Error>, 72 | { 73 | let mut reader = data.try_into()?; 74 | let file_type = reader.sniff_filetype()?; 75 | Ok(match file_type { 76 | FileType::Gzip => { 77 | let gz_reader = MultiGzDecoder::new(reader.into_box_read()); 78 | ( 79 | ReadBuffer::from_reader(Box::new(gz_reader), None)?, 80 | Some(file_type), 81 | ) 82 | } 83 | FileType::Bzip | FileType::Lzma | FileType::Zstd => { 84 | return Err("entab was not compiled with support for compressed files".into()); 85 | } 86 | _ => (reader, None), 87 | }) 88 | } 89 | 90 | /// Decompress a `Read` stream and returns the inferred file type. 91 | /// 92 | /// # Errors 93 | /// If reading fails or if the stream can't be decompressed, return `EtError`. 94 | #[cfg(not(feature = "std"))] 95 | pub fn decompress<'r, B>(data: B) -> Result<(ReadBuffer<'r>, Option), EtError> 96 | where 97 | B: TryInto>, 98 | EtError: From<>>::Error>, 99 | { 100 | let mut reader = data.try_into()?; 101 | let file_type = reader.sniff_filetype()?; 102 | Ok(match file_type { 103 | FileType::Gzip | FileType::Bzip | FileType::Lzma | FileType::Zstd => { 104 | return Err("entab was not compiled with support for any compressed files".into()); 105 | } 106 | _ => (reader, None), 107 | }) 108 | } 109 | 110 | #[cfg(all(test, feature = "compression", feature = "std"))] 111 | mod tests { 112 | use super::*; 113 | use std::fs::File; 114 | 115 | #[test] 116 | fn test_read_gzip() -> Result<(), EtError> { 117 | let f = File::open("tests/data/test.bam")?; 118 | 119 | let (mut rb, compression) = decompress(f)?; 120 | assert_eq!(compression, Some(FileType::Gzip)); 121 | let x: &[u8] = rb.next(&mut 1392)?.unwrap(); 122 | assert_eq!(x.len(), 1392); 123 | assert!(rb.next::<&[u8]>(&mut 1).is_err()); 124 | Ok(()) 125 | } 126 | 127 | #[test] 128 | fn test_read_bzip2() -> Result<(), 
EtError> { 129 | let f = File::open("tests/data/test.csv.bz2")?; 130 | 131 | let (rb, compression) = decompress(f)?; 132 | assert_eq!(compression, Some(FileType::Bzip)); 133 | assert_eq!(rb.as_ref().len(), 48); 134 | Ok(()) 135 | } 136 | 137 | #[test] 138 | fn test_read_xz() -> Result<(), EtError> { 139 | let f = File::open("tests/data/test.csv.xz")?; 140 | 141 | let (rb, compression) = decompress(f)?; 142 | assert_eq!(compression, Some(FileType::Lzma)); 143 | assert_eq!(rb.as_ref().len(), 48); 144 | Ok(()) 145 | } 146 | 147 | #[test] 148 | fn test_read_zstd() -> Result<(), EtError> { 149 | let f = File::open("tests/data/test.csv.zst")?; 150 | 151 | let (rb, compression) = decompress(f)?; 152 | assert_eq!(compression, Some(FileType::Zstd)); 153 | assert_eq!(rb.as_ref().len(), 48); 154 | Ok(()) 155 | } 156 | } 157 | -------------------------------------------------------------------------------- /entab-r/src/lib.rs: -------------------------------------------------------------------------------- 1 | use std::collections::BTreeMap; 2 | use std::fs::File; 3 | 4 | use entab_base::error::EtError; 5 | use entab_base::readers::{get_reader, RecordReader}; 6 | use entab_base::record::Value; 7 | use extendr_api::prelude::*; 8 | 9 | #[allow(clippy::needless_pass_by_value)] 10 | fn to_r(err: EtError) -> Error { 11 | err.to_string().into() 12 | } 13 | 14 | fn value_to_robj(value: Value) -> Robj { 15 | match value { 16 | Value::Null => ().into(), 17 | Value::Boolean(b) => b.into(), 18 | Value::Datetime(dt) => lang!("as.POSIXlt", dt.timestamp(), origin = "1970-01-01"), 19 | Value::Float(f) => f.into(), 20 | Value::Integer(i) => i.into(), 21 | Value::String(s) => s.as_ref().into(), 22 | Value::List(l) => { 23 | let mut values = Vec::new(); 24 | for v in l { 25 | values.push(value_to_robj(v)); 26 | } 27 | List::from_values(values).into() 28 | } 29 | Value::Record(r) => { 30 | let mut names = Vec::new(); 31 | let mut values = Vec::new(); 32 | for (key, value) in r { 33 | 
names.push(key); 34 | values.push(value_to_robj(value)); 35 | } 36 | List::from_names_and_values(names, values).into() 37 | } 38 | } 39 | } 40 | 41 | struct Reader { 42 | parser: String, 43 | header_names: Vec, 44 | reader: Box, 45 | } 46 | 47 | #[extendr] 48 | impl Reader { 49 | #[allow(clippy::new_ret_no_self)] 50 | fn new(filename: &str, parser: &str) -> Result { 51 | let file = File::open(filename).map_err(|e| Error::from(e.to_string()))?; 52 | let parser = if parser.is_empty() { 53 | None 54 | } else { 55 | Some(parser) 56 | }; 57 | let mut params = BTreeMap::new(); 58 | params.insert("filename".to_string(), Value::String(filename.into())); 59 | let (reader, parser_used) = get_reader(file, parser, Some(params)).map_err(to_r)?; 60 | let header_names = reader.headers(); 61 | Ok(Reader { 62 | parser: parser_used.to_string(), 63 | header_names, 64 | reader, 65 | } 66 | .into()) 67 | } 68 | 69 | fn parser(&self) -> &str { 70 | &self.parser 71 | } 72 | 73 | fn headers(&self) -> Vec { 74 | self.reader.headers() 75 | } 76 | 77 | fn metadata(&self) -> Robj { 78 | let metadata = self.reader.metadata(); 79 | let mut names = Vec::new(); 80 | let mut values = Vec::new(); 81 | for (key, value) in metadata { 82 | names.push(key); 83 | values.push(value_to_robj(value)); 84 | } 85 | List::from_names_and_values(names, values).into() 86 | } 87 | 88 | fn next(&mut self) -> Result { 89 | if let Some(record) = self.reader.next_record().map_err(to_r)? 
{ 90 | let mut values = Vec::new(); 91 | for v in record { 92 | values.push(value_to_robj(v)); 93 | } 94 | Ok(List::from_names_and_values(&self.header_names, values).into()) 95 | } else { 96 | Ok(().into()) 97 | } 98 | } 99 | } 100 | 101 | pub enum ValueList { 102 | Null(usize), 103 | Boolean(Vec), 104 | Float(Vec), 105 | Integer(Vec), 106 | String(Vec), 107 | Misc(Vec), 108 | } 109 | 110 | #[extendr] 111 | fn as_data_frame(reader: &mut Reader) -> Result { 112 | let mut data: Vec = Vec::new(); 113 | if let Some(first) = reader.reader.next_record().map_err(to_r)? { 114 | for v in first { 115 | data.push(match v { 116 | Value::Null => ValueList::Null(1), 117 | Value::Boolean(b) => ValueList::Boolean(vec![b]), 118 | Value::Float(f) => ValueList::Float(vec![f]), 119 | Value::Integer(i) => ValueList::Integer(vec![i]), 120 | Value::String(s) => ValueList::String(vec![s.to_string()]), 121 | x => ValueList::Misc(vec![value_to_robj(x)]), 122 | }); 123 | } 124 | while let Some(record) = reader.reader.next_record().map_err(to_r)? 
{ 125 | for (ix, v) in record.into_iter().enumerate() { 126 | match (&mut data[ix], v) { 127 | (ValueList::Null(x), Value::Null) => *x += 1, 128 | (ValueList::Boolean(v), Value::Boolean(b)) => v.push(b), 129 | (ValueList::Float(v), Value::Float(f)) => v.push(f), 130 | (ValueList::Integer(v), Value::Integer(i)) => v.push(i), 131 | (ValueList::String(v), Value::String(s)) => v.push(s.to_string()), 132 | (ValueList::Misc(v), x) => v.push(value_to_robj(x)), 133 | _ => panic!("Tried to append wrong data type"), 134 | } 135 | } 136 | } 137 | } else { 138 | for _ in &reader.header_names { 139 | data.push(ValueList::Null(0)); 140 | } 141 | } 142 | 143 | let mut vectors: Vec = vec![]; 144 | for v in data { 145 | vectors.push(match v { 146 | ValueList::Null(x) => vec![r!(NULL); x].into(), 147 | ValueList::Boolean(v) => v.iter().collect_robj(), 148 | ValueList::Float(v) => v.iter().collect_robj(), 149 | ValueList::Integer(v) => v.iter().collect_robj(), 150 | ValueList::String(v) => v.iter().collect_robj(), 151 | ValueList::Misc(v) => v.into(), 152 | }); 153 | } 154 | let obj: Robj = List::from_names_and_values(&reader.header_names, &vectors).into(); 155 | obj.set_attrib( 156 | row_names_symbol(), 157 | (1i32..=vectors[0].len() as i32).collect_robj(), 158 | )?; 159 | obj.set_class(&["data.frame"])?; 160 | Ok(obj) 161 | } 162 | 163 | extendr_module! 
{ 164 | mod entab; 165 | impl Reader; 166 | fn as_data_frame; 167 | } 168 | -------------------------------------------------------------------------------- /entab-cli/src/lib.rs: -------------------------------------------------------------------------------- 1 | mod tsv_params; 2 | 3 | use std::collections::BTreeMap; 4 | use std::ffi::OsString; 5 | use std::fs::File; 6 | use std::io; 7 | use std::str; 8 | 9 | use clap::error::ErrorKind; 10 | use clap::{crate_authors, crate_version, Arg, Command}; 11 | #[cfg(feature = "mmap")] 12 | use memmap2::Mmap; 13 | 14 | use entab::readers::get_reader; 15 | use entab::record::Value; 16 | use entab::EtError; 17 | 18 | use crate::tsv_params::TsvParams; 19 | 20 | /// Parse the provided `stdin` using `args` and write results to `stdout`. 21 | /// 22 | /// # Errors 23 | /// If there are any issues, an `EtError` will be returned. 24 | pub fn run(args: I, stdin: R, stdout: W) -> Result<(), EtError> 25 | where 26 | I: IntoIterator, 27 | T: Into + Clone, 28 | R: io::Read, 29 | W: io::Write, 30 | { 31 | let clap_result = Command::new("entab") 32 | .about("Turn anything into a TSV") 33 | .author(crate_authors!()) 34 | .version(crate_version!()) 35 | .arg( 36 | Arg::new("input") 37 | .short('i') 38 | .help("Path to read; if not provided stdin will be used") 39 | .num_args(1), 40 | ) 41 | .arg( 42 | Arg::new("output") 43 | .short('o') 44 | .help("Path to write to; if not provided stdout will be used") 45 | .num_args(1), 46 | ) 47 | .arg( 48 | Arg::new("parser") 49 | .short('p') 50 | .help("Parser to use [if not specified, it will be auto-detected]") 51 | .num_args(1), 52 | ) 53 | .arg( 54 | Arg::new("metadata") 55 | .short('m') 56 | .long("metadata") 57 | .help("Reports metadata about the file instead of the data itself") 58 | .action(clap::ArgAction::SetTrue), 59 | ) 60 | .try_get_matches_from(args); 61 | 62 | let matches = match clap_result { 63 | Ok(d) => d, 64 | Err(e) => { 65 | if e.kind() == ErrorKind::DisplayHelp || e.kind() == 
ErrorKind::DisplayVersion { 66 | e.print()?; 67 | return Ok(()); 68 | } 69 | return Err(e.to_string().into()); 70 | } 71 | }; 72 | 73 | // TODO: map/reduce/filter options? 74 | // every column should either have a reduction set or it'll be dropped from 75 | // the result? reductions can be e.g. sum,average,count or group or column 76 | // (where column is the same as a pivot); this might be more useful as 77 | // another tool? 78 | 79 | #[cfg(feature = "mmap")] 80 | let mmap: Mmap; 81 | 82 | let mut parse_params = BTreeMap::new(); 83 | let parser = matches.get_one::<&str>("parser").copied(); 84 | let (mut rec_reader, _) = if let Some(&i) = matches.get_one::<&str>("input") { 85 | parse_params.insert("filename".to_string(), Value::String(i.into())); 86 | let file = File::open(i)?; 87 | #[cfg(feature = "mmap")] 88 | { 89 | mmap = unsafe { Mmap::map(&file)? }; 90 | get_reader(mmap.as_ref(), parser, Some(parse_params))? 91 | } 92 | #[cfg(not(feature = "mmap"))] 93 | get_reader(file, parser, Some(parse_params))? 94 | } else { 95 | let buffer: Box = Box::new(stdin); 96 | get_reader(buffer, parser, Some(parse_params))? 97 | }; 98 | // TODO: allow user to set these 99 | let params = TsvParams::default(); 100 | 101 | let mut writer: Box = if let Some(&i) = matches.get_one::<&str>("output") { 102 | Box::new(File::create(i)?) 103 | } else { 104 | Box::new(stdout) 105 | }; 106 | 107 | if matches.get_flag("metadata") { 108 | writer.write_all(b"key")?; 109 | writer.write_all(&[params.main_delimiter])?; 110 | writer.write_all(b"value")?; 111 | writer.write_all(¶ms.line_delimiter)?; 112 | for (key, value) in rec_reader.metadata() { 113 | params.write_str(key.as_bytes(), &mut writer)?; 114 | writer.write_all(&[params.main_delimiter])?; 115 | params.write_value(&value, &mut writer)?; 116 | writer.write_all(¶ms.line_delimiter)?; 117 | } 118 | return Ok(()); 119 | } 120 | writer.write_all( 121 | rec_reader 122 | .headers() 123 | .join(str::from_utf8(&[params.main_delimiter])?) 
124 | .as_bytes(), 125 | )?; 126 | writer.write_all(¶ms.line_delimiter)?; 127 | 128 | while let Some(fields) = rec_reader.next_record()? { 129 | params.write_value(&fields[0], &mut writer)?; 130 | for field in fields.iter().skip(1) { 131 | writer.write_all(&[params.main_delimiter])?; 132 | params.write_value(field, &mut writer)?; 133 | } 134 | writer.write_all(¶ms.line_delimiter)?; 135 | } 136 | writer.flush()?; 137 | 138 | Ok(()) 139 | } 140 | 141 | #[cfg(test)] 142 | mod tests { 143 | use super::*; 144 | 145 | #[test] 146 | fn test_version() -> Result<(), EtError> { 147 | let mut out = Vec::new(); 148 | assert!(run(["entab", "--version"], &b""[..], io::Cursor::new(&mut out)).is_ok()); 149 | assert_eq!(&out[..], b""); 150 | Ok(()) 151 | } 152 | 153 | #[test] 154 | fn test_output() -> Result<(), EtError> { 155 | let mut out = Vec::new(); 156 | assert!(run(["entab"], &b">test\nACGT"[..], io::Cursor::new(&mut out)).is_ok()); 157 | println!("{}", std::str::from_utf8(&out).unwrap()); 158 | assert_eq!(&out[..], b"id\tsequence\ntest\tACGT\n"); 159 | Ok(()) 160 | } 161 | 162 | #[test] 163 | fn test_metadata() -> Result<(), EtError> { 164 | let mut out = Vec::new(); 165 | run( 166 | ["entab", "--metadata"], 167 | &b">test\nACGT"[..], 168 | io::Cursor::new(&mut out), 169 | )?; 170 | assert_eq!(&out[..], b"key\tvalue\n"); 171 | Ok(()) 172 | } 173 | } 174 | -------------------------------------------------------------------------------- /entab/tests/data/carotenoid_extract.d/RUN.M/ACQ.MS: -------------------------------------------------------------------------------- 1 | [Signature] 2 | ID605=3bFLO8wECj8jND7/yiLo24vQ00000000 3 | [Version] 4 | ID=G2715 Version: B.02.01 5 | [General] 6 | Type=1 7 | Run Duration=900.00 8 | NoLimit=1 9 | NoMSD=0 10 | PeakWidth=6.00 11 | FastScan=0 12 | FastScanDataRecon=0 13 | FastScanAcq=0 14 | Solvent Delay=0.00 15 | IonizationMode=6 16 | IonSwitchDelay=50 17 | PolSwitchDelay=300 18 | PolSwitchDelayUser=300 19 | PolSwitchDelayAuto=1 20 | Tune 
File=atunes.tun 21 | Acq Mode=0 22 | Acq Mode 2=0 23 | Acq Mode 3=0 24 | Acq Mode 4=0 25 | Signal Summary= 26 | Signal Summary 2= 27 | Signal Summary 3= 28 | Signal Summary 4= 29 | Signal Active=1 30 | Signal Active 2=0 31 | Signal Active 3=0 32 | Signal Active 4=0 33 | Cycle Time=100 34 | Cycle Time 2=0 35 | Cycle Time 3=0 36 | Cycle Time 4=0 37 | Polarity=0 38 | Polarity 2=0 39 | Polarity 3=0 40 | Polarity 4=0 41 | Ionization=9 42 | Ionization 2=9 43 | Ionization 3=9 44 | Ionization 4=9 45 | Sim on target mass=0 46 | Sim on target mass 2=0 47 | Sim on target mass 3=0 48 | Sim on target mass 4=0 49 | Group Ions=1 50 | Group Ions 2=1 51 | Group Ions 3=1 52 | Group Ions 4=1 53 | Zones=4 54 | Scan Groups=1 55 | Scan Groups 2=1 56 | Scan Groups 3=1 57 | Scan Groups 4=1 58 | Data Curves=0 59 | Data Curve Rate=1000 60 | [Parameters] 61 | ID 1=1 62 | Mode 1=1 63 | Value 1=3000.000000,3000.000000 64 | ID 2=5 65 | Mode 2=1 66 | Value 2=3000.000000,3000.000000 67 | ID 3=7 68 | Mode 3=1 69 | Value 3=5.000000,4.000000 70 | ID 4=8 71 | Mode 4=1 72 | Value 4=5.000000,40.000000 73 | ID 5=33 74 | Mode 5=1 75 | Value 5=4000.000000,4000.000000 76 | ID 6=-1 77 | Mode 6=0 78 | Value 6=0.000000,2000.000000 79 | ID 7=-1 80 | Mode 7=0 81 | Value 7=0.000000,1300.000000 82 | ID 8=-1 83 | Mode 8=0 84 | Value 8=0.000000,1300.000000 85 | ID 9=-1 86 | Mode 9=0 87 | Value 9=0.000000,4000.000000 88 | ID 10=-1 89 | Mode 10=0 90 | Value 10=0.000000,1300.000000 91 | ID 11=-1 92 | Mode 11=0 93 | Value 11=0.000000,2000.000000 94 | ID 12=-1 95 | Mode 12=0 96 | Value 12=0.000000,4.000000 97 | ID 13=-1 98 | Mode 13=0 99 | Value 13=0.000000,40.000000 100 | [DynamicParm 1] 101 | GlobalId=11 102 | Enable=0 103 | DynamicValues= 104 | [DynamicParm 2 1] 105 | GlobalId=-1 106 | Enable=0 107 | DynamicValues= 108 | [DynamicParm 3 1] 109 | GlobalId=-1 110 | Enable=0 111 | DynamicValues= 112 | [DynamicParm 4 1] 113 | GlobalId=-1 114 | Enable=0 115 | DynamicValues= 116 | [Zone 1] 117 | ID=6 118 | On=1 119 | Soft 
Max=350.00 120 | Set Point=350.00 121 | Type=0 122 | [Zone 2] 123 | ID=7 124 | On=1 125 | Soft Max=500.00 126 | Set Point=425.00 127 | Type=0 128 | [Zone 3] 129 | ID=13 130 | On=1 131 | Soft Max=13.00 132 | Set Point=6.00 133 | Type=0 134 | [Zone 4] 135 | ID=14 136 | On=1 137 | Soft Max=60.00 138 | Set Point=60.00 139 | Type=0 140 | [Ions 1] 141 | Group Name=Group 1 142 | Start Time=0.00 143 | Row OnOff=1 144 | Resolution=0 145 | Gain=1.0 146 | Plot Index 1=0 147 | Plot Index 2=0 148 | Number of Ions=1 149 | Ion 1=195.0,590 150 | ISTD Ion 1=0 151 | Compound Name 1= 152 | Frgmtr Ion 1=70 153 | [Ions 2 1] 154 | Group Name=Group 1 155 | Start Time=0.00 156 | Row OnOff=1 157 | Resolution=0 158 | Gain=1.0 159 | Plot Index 1=0 160 | Plot Index 2=0 161 | Number of Ions=1 162 | Ion 1=195.0,590 163 | ISTD Ion 1=0 164 | Compound Name 1= 165 | Frgmtr Ion 1=70 166 | [Ions 3 1] 167 | Group Name=Group 1 168 | Start Time=0.00 169 | Row OnOff=1 170 | Resolution=0 171 | Gain=1.0 172 | Plot Index 1=0 173 | Plot Index 2=0 174 | Number of Ions=1 175 | Ion 1=195.0,590 176 | ISTD Ion 1=0 177 | Compound Name 1= 178 | Frgmtr Ion 1=70 179 | [Ions 4 1] 180 | Group Name=Group 1 181 | Start Time=0.00 182 | Row OnOff=1 183 | Resolution=0 184 | Gain=1.0 185 | Plot Index 1=0 186 | Plot Index 2=0 187 | Number of Ions=1 188 | Ion 1=195.0,590 189 | ISTD Ion 1=0 190 | Compound Name 1= 191 | Frgmtr Ion 1=70 192 | [Scan 1] 193 | Start Time=0.00 194 | Row OnOff=1 195 | Low Mass=100.0 196 | High Mass=1000.0 197 | Threshold=100 198 | Sampling=6 199 | Stepsize=0.10 200 | Gain=1.0 201 | Fragmentor=250 202 | Low Plot 1=100.0 203 | High Plot 1=600.0 204 | Low Plot 2=100.0 205 | High Plot 2=600.0 206 | [Scan 2 1] 207 | Start Time=0.00 208 | Row OnOff=1 209 | Low Mass=1016.0 210 | High Mass=1054.0 211 | Threshold=100 212 | Sampling=54 213 | Stepsize=0.10 214 | Gain=1.0 215 | Fragmentor=75 216 | Low Plot 1=1016.0 217 | High Plot 1=1054.0 218 | Low Plot 2=1016.0 219 | High Plot 2=1054.0 220 | [Scan 3 1] 221 | 
Start Time=0.00 222 | Row OnOff=1 223 | Low Mass=100.0 224 | High Mass=1000.0 225 | Threshold=150 226 | Sampling=4 227 | Stepsize=0.10 228 | Gain=1.0 229 | Fragmentor=70 230 | Low Plot 1=100.0 231 | High Plot 1=550.0 232 | Low Plot 2=100.0 233 | High Plot 2=550.0 234 | [Scan 4 1] 235 | Start Time=0.00 236 | Row OnOff=1 237 | Low Mass=100.0 238 | High Mass=1000.0 239 | Threshold=150 240 | Sampling=4 241 | Stepsize=0.10 242 | Gain=1.0 243 | Fragmentor=70 244 | Low Plot 1=100.0 245 | High Plot 1=550.0 246 | Low Plot 2=100.0 247 | High Plot 2=550.0 248 | [Time Events] 249 | Number Events=0 250 | [Filters] 251 | Mass Type=1 252 | Mass Param=0.300 253 | Mass Sigma=2.00 254 | Time Type=1 255 | Time Param=3.000 256 | Time Sigma=2.00 257 | [Filters 2] 258 | Mass Type=1 259 | Mass Param=0.300 260 | Mass Sigma=2.00 261 | Time Type=1 262 | Time Param=3.000 263 | Time Sigma=2.00 264 | [Filters 3] 265 | Mass Type=1 266 | Mass Param=0.300 267 | Mass Sigma=2.00 268 | Time Type=1 269 | Time Param=3.000 270 | Time Sigma=2.00 271 | [Filters 4] 272 | Mass Type=1 273 | Mass Param=0.300 274 | Mass Sigma=2.00 275 | Time Type=1 276 | Time Param=3.000 277 | Time Sigma=2.00 278 | [Plot] 279 | Plot Type 1=0 280 | Plot Type 2=1 281 | [FractionCollection] 282 | Mode=0 283 | CSV File=BASEMASS.CSV 284 | Signal 1=1 285 | Signal 2=0 286 | Signal 3=0 287 | Signal 4=0 288 | Detector=0 289 | Standard Positive Adducts=0,0,0,0,0 290 | Standard Negative Adducts=0,0,0,0,0 291 | User-defined Positive Adducts=0, ,0, ,0, 292 | User-defined Negative Adducts=0, ,0, ,0, 293 | Positive Charge State=0,0 294 | Negative Charge State=0,0 295 | Number Of Base Masses=5 296 | Base Masses 1=1292.6 297 | Base Masses 2=1297.6 298 | Base Masses 3=1299.6 299 | Base Masses 4=1301.7 300 | Base Masses 5=1302.6 301 | Min Peak Width=0.00 302 | Max Peak Width=1.50 303 | Time Limit=15.00 304 | MSD Delay=0.01 305 | MSD Threshold=700 306 | MSD Peak Slope=0 307 | Other Delay=0.00 308 | Other Threshold=5.0 309 | Other Peak Slope=10 
310 | MSD Window=0.2 311 | MSD Output Range=2000 312 | Other Full Scale=0.10 313 | -------------------------------------------------------------------------------- /entab/tests/data/masshunter_example/AcqData/sample_info.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | Sample ID 4 | Sample ID 5 | 6 | 7 | 8 8 | 9 | SYSTEM 10 | 11 | 12 | Sample Name 13 | Sample Name 14 | Shoreline TLE 15 | 8 16 | 17 | SYSTEM 18 | 19 | 20 | Rack Code 21 | Rack Code 22 | 23 | 24 | 8 25 | 26 | SYSTEM 27 | 28 | 29 | Rack Position 30 | Rack Position 31 | 32 | 33 | 8 34 | 35 | SYSTEM 36 | 37 | 38 | Plate Code 39 | Plate Code 40 | PlateOrVial 41 | 8 42 | 43 | SYSTEM 44 | 45 | 46 | Plate Position 47 | Plate Position 48 | 49 | 50 | 8 51 | 52 | SYSTEM 53 | 54 | 55 | Sample Position 56 | Sample Position 57 | P1-A4 58 | 8 59 | 60 | SYSTEM 61 | 62 | 63 | Method 64 | Method 65 | D:\MassHunter\Methods\RJB_Airs2001FIA.m 66 | 8 67 | 68 | SYSTEM 69 | 70 | 71 | Override DA Method 72 | Override DA Method 73 | 74 | 75 | 8 76 | 77 | SYSTEM 78 | 79 | 80 | Data File 81 | Data File 82 | D:\MassHunter\Data\Roderick\20131124_FIA_47.d 83 | 8 84 | 85 | SYSTEM 86 | 87 | 88 | Sample Type 89 | Sample Type 90 | Sample 91 | 8 92 | 93 | SYSTEM 94 | 95 | 96 | Method Type 97 | Method Type 98 | Acquisition Only 99 | 8 100 | 101 | SYSTEM 102 | 103 | 104 | Balance Override 105 | Balance Override 106 | No Override 107 | 8 108 | 109 | SYSTEM 110 | 111 | 112 | Inj Vol (µl) 113 | Inj Vol (µl) 114 | 10 115 | 8 116 | 117 | SYSTEM 118 | 119 | 120 | Equilib Time (min) 121 | Equilib Time (min) 122 | 0 123 | 8 124 | 125 | SYSTEM 126 | 127 | 128 | Dilution 129 | Dilution 130 | 1 131 | 8 132 | 133 | SYSTEM 134 | 135 | 136 | Wt/Vol 137 | Wt/Vol 138 | 0 139 | 8 140 | 141 | SYSTEM 142 | 143 | 144 | Comment 145 | Comment 146 | 147 | 148 | 8 149 | 150 | SYSTEM 151 | 152 | 153 | Barcode 154 | Barcode 155 | 156 | 157 | 8 158 | 159 | SYSTEM 160 | 161 | 162 | Level Name 163 | Level Name 164 | 
165 | 166 | 8 167 | 168 | SYSTEM 169 | 170 | 171 | Sample Group 172 | Sample Group 173 | 174 | 175 | 8 176 | 177 | SYSTEM 178 | 179 | 180 | Info. 181 | Info. 182 | 183 | 184 | 8 185 | 186 | SYSTEM 187 | 188 | 189 | OperatorName 190 | Operator Name 191 | UHPLC-PC\UHPLC-PC 192 | 8 193 | 194 | 195 | SYSTEM 196 | 197 | 198 | InstrumentName 199 | Instrument Name 200 | Instrument 1 201 | 8 202 | 203 | 204 | SYSTEM 205 | 206 | 207 | AcqTime 208 | Acquisition Time 209 | 2013-11-25T03:37:13Z 210 | 8 211 | 212 | 213 | SYSTEM 214 | 215 | 216 | SampleLockedRunMode 217 | Locked Run 218 | 0 219 | 8 220 | 221 | 222 | SYSTEM 223 | 224 | 225 | RunCompletedFlag 226 | Run Completed 227 | -1 228 | 8 229 | 230 | 231 | SYSTEM 232 | 233 | -------------------------------------------------------------------------------- /entab-cli/src/tsv_params.rs: -------------------------------------------------------------------------------- 1 | use std::convert::Into; 2 | use std::io::Write; 3 | 4 | use entab::error::EtError; 5 | use entab::record::Value; 6 | 7 | use memchr::{memchr, memchr_iter}; 8 | 9 | #[allow(dead_code)] // TODO: add support for this into the CLI client 10 | pub enum TsvEscapeStyle { 11 | Quote(u8), 12 | Escape(u8), 13 | Replace(u8), 14 | } 15 | 16 | pub struct TsvParams { 17 | pub null_value: Vec, 18 | pub true_value: Vec, 19 | pub false_value: Vec, 20 | pub line_delimiter: Vec, 21 | pub main_delimiter: u8, 22 | pub escape_style: TsvEscapeStyle, 23 | pub list_delimiter: u8, 24 | pub list_start_end: (Vec, Vec), 25 | pub record_delimiter: u8, 26 | } 27 | 28 | impl Default for TsvParams { 29 | fn default() -> Self { 30 | TsvParams { 31 | null_value: b"null".to_vec(), 32 | true_value: b"true".to_vec(), 33 | false_value: b"false".to_vec(), 34 | line_delimiter: vec![b'\n'], 35 | main_delimiter: b'\t', 36 | escape_style: TsvEscapeStyle::Quote(b'"'), 37 | list_delimiter: b',', 38 | list_start_end: (b"".to_vec(), b"".to_vec()), 39 | record_delimiter: b':', 40 | } 41 | } 42 | } 43 | 44 | 
impl TsvParams { 45 | pub fn write_str(&self, string: &'_ [u8], mut writer: W) -> Result<(), EtError> 46 | where 47 | W: Write, 48 | { 49 | let first = match memchr(self.main_delimiter, string) { 50 | Some(break_loc) => break_loc, 51 | None => { 52 | return writer.write_all(string).map_err(Into::into); 53 | } 54 | }; 55 | if let TsvEscapeStyle::Quote(quote_char) = self.escape_style { 56 | writer.write_all(&[quote_char])?; 57 | writer.write_all(string)?; 58 | return writer.write_all(&[quote_char]).map_err(Into::into); 59 | }; 60 | writer.write_all(&string[..first])?; 61 | if let TsvEscapeStyle::Escape(escape_char) = self.escape_style { 62 | writer.write_all(&[escape_char, self.main_delimiter])?; 63 | } else if let TsvEscapeStyle::Replace(replace_char) = self.escape_style { 64 | writer.write_all(&[replace_char])?; 65 | } 66 | let mut old_pos = 1; 67 | for pos in memchr_iter(self.main_delimiter, &string[first + 1..]) { 68 | writer.write_all(&string[first + old_pos..=first + pos])?; 69 | if let TsvEscapeStyle::Escape(escape_char) = self.escape_style { 70 | writer.write_all(&[escape_char, self.main_delimiter])?; 71 | } else if let TsvEscapeStyle::Replace(replace_char) = self.escape_style { 72 | writer.write_all(&[replace_char])?; 73 | } 74 | old_pos = pos + 2; 75 | } 76 | if old_pos < string.len() { 77 | writer.write_all(&string[first + old_pos..])?; 78 | } 79 | Ok(()) 80 | } 81 | 82 | /// Write a `Value` out to a TSV stream 83 | pub fn write_value(&self, value: &Value, mut writer: &mut W) -> Result<(), EtError> 84 | where 85 | W: Write, 86 | { 87 | match value { 88 | Value::Null => writer.write_all(&self.null_value)?, 89 | Value::Boolean(true) => writer.write_all(&self.true_value)?, 90 | Value::Boolean(false) => writer.write_all(&self.false_value)?, 91 | Value::Datetime(s) => writer.write_all(format!("{:+?}", s).as_bytes())?, 92 | Value::Float(v) => writer.write_all(format!("{}", v).as_bytes())?, 93 | Value::Integer(v) => writer.write_all(format!("{}", v).as_bytes())?, 
94 | Value::List(l) => { 95 | writer.write_all(&self.list_start_end.0)?; 96 | if !l.is_empty() { 97 | self.write_value(&l[0], writer)?; 98 | for i in &l[1..] { 99 | writer.write_all(&[self.list_delimiter])?; 100 | self.write_value(i, writer)?; 101 | } 102 | } 103 | writer.write_all(&self.list_start_end.1)?; 104 | } 105 | Value::Record(_) => unimplemented!("No writer for records yet"), 106 | Value::String(s) => self.write_str(s.as_bytes(), &mut writer)?, 107 | }; 108 | Ok(()) 109 | } 110 | } 111 | 112 | #[cfg(test)] 113 | mod tests { 114 | use super::*; 115 | use std::io::Cursor; 116 | 117 | #[test] 118 | fn test_replace_chars() { 119 | let mut params = TsvParams::default(); 120 | params.escape_style = TsvEscapeStyle::Replace(b'|'); 121 | 122 | let mut buffer = Cursor::new(Vec::new()); 123 | let _ = params.write_str(b"", &mut buffer); 124 | assert_eq!(buffer.get_ref(), b""); 125 | 126 | let mut buffer = Cursor::new(Vec::new()); 127 | let _ = params.write_str(b"\t", &mut buffer); 128 | assert_eq!(buffer.get_ref(), b"|"); 129 | 130 | let mut buffer = Cursor::new(Vec::new()); 131 | let _ = params.write_str(b"test", &mut buffer); 132 | assert_eq!(buffer.get_ref(), b"test"); 133 | 134 | let mut buffer = Cursor::new(Vec::new()); 135 | let _ = params.write_str(b"\ttest", &mut buffer); 136 | assert_eq!(buffer.get_ref(), b"|test"); 137 | 138 | let mut buffer = Cursor::new(Vec::new()); 139 | let _ = params.write_str(b"\ttest\t", &mut buffer); 140 | assert_eq!(buffer.get_ref(), b"|test|"); 141 | 142 | let mut buffer = Cursor::new(Vec::new()); 143 | let _ = params.write_str(b"\ttest\tt\t", &mut buffer); 144 | assert_eq!(buffer.get_ref(), b"|test|t|"); 145 | 146 | let mut buffer = Cursor::new(Vec::new()); 147 | let _ = params.write_str(b"\t\t\t", &mut buffer); 148 | assert_eq!(buffer.get_ref(), b"|||"); 149 | } 150 | 151 | #[test] 152 | fn test_escape_params() { 153 | let mut params = TsvParams::default(); 154 | params.escape_style = TsvEscapeStyle::Escape(b'|'); 155 | 156 | let 
mut buffer = Cursor::new(Vec::new());
        let _ = params.write_str(b"\t", &mut buffer);
        assert_eq!(buffer.get_ref(), b"|\t");

        let mut buffer = Cursor::new(Vec::new());
        let _ = params.write_str(b"\ttest\t", &mut buffer);
        assert_eq!(buffer.get_ref(), b"|\ttest|\t");
    }

    // A datetime value is written without the fractional seconds or the `Z` suffix.
    #[test]
    fn test_write_value_date() -> Result<(), EtError> {
        const DATE: &str = "2001-02-03T04:05:06.000Z";
        const OUT_DATE: &[u8] = b"2001-02-03T04:05:06";

        let p = TsvParams::default();
        let mut buffer = Cursor::new(Vec::new());
        let datetime = Value::from_iso_date(DATE)?;
        let _ = p.write_value(&datetime, &mut buffer);
        assert_eq!(buffer.get_ref(), &OUT_DATE);
        Ok(())
    }
}

--------------------------------------------------------------------------------
/entab/src/parsers/agilent/chemstation_reg.rs:
--------------------------------------------------------------------------------
use alloc::collections::BTreeMap;
use core::marker::Copy;

use encoding::all::ISO_8859_1;
use encoding::{DecoderTrap, Encoding};

use crate::parsers::{extract, Endian, FromSlice};
use crate::record::StateMetadata;
use crate::EtError;
use crate::{impl_reader, impl_record};
use crate::record::Value;

/// Decodes `raw` as ISO-8859-1 text, converting a decoder error message into the
/// function's error type.
// NOTE(review): `Result` carries no type arguments in this copy of the file; they look
// like they were lost in extraction (presumably `Result<String, EtError>`) -- confirm
// against upstream before compiling.
fn decode_iso_8859(raw: &[u8]) -> Result {
    ISO_8859_1.decode(raw, DecoderTrap::Ignore).map_err(|e| e.into_owned().into())
}

/// State of the Chemstation REG parser
// NOTE(review): the struct has no fields, so nothing parsed below can be retained.
#[derive(Clone, Copy, Debug, Default)]
pub struct ChemstationRegState {



}

impl StateMetadata for ChemstationRegState {
    /// Column names reported for records produced with this state.
    fn header(&self) -> Vec<&str> {
        vec!["time", "intensity"]
    }
}

impl<'b: 's, 's> FromSlice<'b, 's> for ChemstationRegState {
    type State = ();

    /// Walks the REG file: a 45 byte header, a table of record descriptors and then the
    /// payload of each described record, dispatched on the record type.
    ///
    /// Returns an error when the version byte is not `b'A'`, when a read runs past the
    /// buffer, or when a 1538/1539/32770 record has an unexpected size.
    // NOTE(review): `consumed` is never written; all offsets are tracked in the local
    // `con`, so the caller is told that zero bytes were used -- confirm this is intended.
    // NOTE(review): `eof`, `n_sections`, `n_rows`, `names` and `metadata` are computed or
    // received but never read afterwards; the parsed tables are dropped on return.
    // NOTE(review): return type arguments appear lost in extraction (other parsers in
    // this crate return `Result<bool, EtError>` from `parse`) -- confirm.
    fn parse(
        buf: &[u8],
        eof: bool,
        consumed: &mut usize,
        _state: &mut Self::State,
    ) -> Result {
        // running offset into `buf`, advanced by every `extract` call
        let con = &mut 0;
        // fixed-size file header (45 bytes)
        let header = extract::<&[u8]>(buf, con, &mut 45)?;

        // byte 25 is checked as a format version marker
        if header[25] != b'A' {
            return Err(EtError::from("Version of REG file is too new"));
        }
        let n_sections = u16::extract(&header[38..], &Endian::Little)?;

        // TODO: parse multiple sections

        // NOTE(review): the turbofish type argument of this and the following five
        // `extract::()` calls is missing in this copy; the integer widths must be
        // restored from upstream, they cannot be inferred here.
        let n_records = extract::(buf, con, &mut Endian::Little)? as usize;

        // first pass: read the descriptor (type, length, id) of every record
        // NOTE(review): `n_records` comes straight from the file and sizes this
        // allocation before any record has been validated.
        let mut records = Vec::with_capacity(n_records);
        for _ in 0..n_records {
            let _ = extract::(buf, con, &mut Endian::Little)?;
            let record_type = extract::(buf, con, &mut Endian::Little)?;
            let record_len = extract::(buf, con, &mut Endian::Little)? as usize;
            let _ = extract::(buf, con, &mut Endian::Little)?;
            let record_id = extract::(buf, con, &mut Endian::Little)?;
            records.push((record_type, record_len, record_id))
        }

        // NOTE(review): the map type arguments are missing as well; keys are inserted
        // both from `record_id` and from `u32::extract` results below.
        let mut names: BTreeMap = BTreeMap::new();
        let mut metadata: BTreeMap = BTreeMap::new();
        // second pass: the payloads follow the descriptor table in the same order
        for (record_type, mut record_len, record_id) in records {
            let record_data = extract::<&[u8]>(buf, con, &mut record_len)?;
            match record_type {
                // x-y table
                1281 | 1283 => {
                    // u16,u16,u8,u32,u32 (n_points),i16,u32,f64
                    // H H B I I h I d

                    // (then repeated twice, first x array and then y array)
                    // u32 (units id),u32 (name id?),[12],i16,u32,f64 (multiplicative adjustment),f64,u64,u64,u8,[8]
                    // I I 12s h I d d Q Q B 8s
                    // FIXME
                },
                // key-value?
                1537 => {
                    // the matching data is in a 32770 record so we only get the name
                    // NOTE(review): no length check here; slicing `[35..]` / `[14..30]`
                    // panics if the record is shorter than that.
                    let record_id = u32::extract(&record_data[35..], &Endian::Little)?;
                    // the name is bytes 14..30, cut at the first NUL byte if there is one
                    let _ = names.insert(record_id, decode_iso_8859(record_data[14..30].split(|c| *c == 0).next().unwrap_or(&record_data[14..30]))?);
                },
                // part of a linked list
                1538 => {
                    if record_data.len() != 39 {
                        return Err(EtError::from("Data type 1538 was an unexpected size"));
                    }
                    let _ = names.insert(record_id, decode_iso_8859(&record_data[14..35])?);
                    let _ = metadata.insert(record_id, u32::extract(&record_data[35..], &Endian::Little)?.into());
                },
                // another part of a linked list with a table reference
                1539 => {
                    if record_data.len() != 39 {
                        return Err(EtError::from("Data type 1539 was an unexpected size"));
                    }
                    let id = u32::extract(&record_data[35..], &Endian::Little)?;
                    let _ = names.insert(id, decode_iso_8859(&record_data[14..35])?);
                    // no data?
                },
                // table of values
                1793 => {
                    // NOTE(review): slicing `[4..]` / `[16..]` panics on records shorter
                    // than 4 / 16 bytes.
                    let n_rows = u16::extract(&record_data[4..], &Endian::Little)?;
                    let n_columns = u16::extract(&record_data[16..], &Endian::Little)?;
                    if n_columns == 0 {
                        continue;
                    }
                    // FIXME
                },
                // names (these have data elsewhere?)
                32769 | 32771 => {
                    // the final byte of the record is dropped from the name
                    // NOTE(review): `record_len-1` underflows when `record_len` is 0.
                    let _ = names.insert(record_id, decode_iso_8859(&record_data[..record_len-1])?);
                },
                32774 => {
                    // as above, but the first two bytes are skipped too
                    // NOTE(review): panics when `record_len` is below 3.
                    let _ = names.insert(record_id, decode_iso_8859(&record_data[2..record_len-1])?);
                },
                // flattened numeric array; contains the raw data for 1281/1283 records
                32770 => {
                    if record_data.len() < 4 {
                        return Err(EtError::from("Array was undersized"));
                    }
                    // the first 4 bytes are skipped; the rest is read as little-endian u32s
                    let n_points = record_data.len() / 4 - 1;
                    // NOTE(review): element type argument missing in this copy.
                    let mut data: Vec = Vec::with_capacity(n_points);
                    for ix in 0..n_points {
                        data.push(u32::extract(&record_data[4 * ix + 4..], &Endian::Little)?.into());
                    }
                    let _ = metadata.insert(record_id, data.into());
                },
                // every other record type is ignored
                _ => { },
            }
        }

        Ok(true)
    }

    /// Currently a no-op: nothing from the buffer is copied into the state.
    fn get(
        &mut self,
        buf: &'b [u8],
        state: &'s Self::State,
    ) -> Result<(), EtError> {
        Ok(())
    }
}

/// Record
#[derive(Clone, Copy, Debug, Default)]
pub struct ChemstationRegRecord {
    point: f64
}

impl<'b: 's, 's> FromSlice<'b, 's> for ChemstationRegRecord {
    type State = ChemstationRegState;

    /// Stub: always returns `Ok(false)` without touching `consumed`.
    // NOTE(review): return type arguments missing in this copy, as for the state above.
    fn parse(
        buf: &[u8],
        eof: bool,
        consumed: &mut usize,
        _state: &mut Self::State,
    ) -> Result {
        Ok(false)
    }

    /// Stub: leaves the record untouched.
    fn get(
        &mut self,
        buf: &'b [u8],
        state: &'s Self::State,
    ) -> Result<(), EtError> {
        Ok(())
    }
}

impl_record!(ChemstationRegRecord: point);

impl_reader!(ChemstationRegReader, ChemstationRegRecord, ChemstationRegRecord, ChemstationRegState, ());

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_chemstation_reg_reader() -> Result<(), EtError> {
        let rb: &[u8] = include_bytes!("../../../tests/data/chemstation_mwd.d/LCDIAG.REG");
        let mut reader = ChemstationRegReader::new(rb, None)?;

        // NOTE(review): the counter starts at 1, not 0 -- confirm this is deliberate.
        let mut n_recs = 1;
while reader.next()?.is_some() {
            n_recs += 1;
        }
        assert_eq!(n_recs, 5);
        Ok(())
    }
}

--------------------------------------------------------------------------------
/entab/src/parsers/fasta.rs:
--------------------------------------------------------------------------------
use alloc::vec;
use alloc::vec::Vec;

use memchr::{memchr, memchr_iter};

use crate::parsers::FromSlice;
use crate::record::StateMetadata;
use crate::EtError;
use crate::{impl_reader, impl_record};

use alloc::borrow::Cow;

#[derive(Clone, Debug, Default)]
/// A single sequence from a FASTA file
pub struct FastaRecord<'r> {
    /// The ID/header line
    pub id: &'r str,
    /// The sequence itself
    pub sequence: Cow<'r, [u8]>,
}

impl_record!(FastaRecord<'r>: id, sequence);

/// The current state of FASTA parsing
///
/// Holds the offsets, relative to the start of the current record, that `parse` located
/// and `get` later slices out of the buffer.
#[derive(Clone, Copy, Debug, Default)]
pub struct FastaState {
    /// Offset of the end of the header line (line terminator excluded)
    header_end: usize,
    /// Start (inclusive) and end (exclusive) offsets of the raw sequence bytes
    seq: (usize, usize),
}

impl StateMetadata for FastaState {
    fn header(&self) -> Vec<&str> {
        vec!["id", "sequence"]
    }
}

impl<'b: 's, 's> FromSlice<'b, 's> for FastaState {
    type State = ();
}

impl<'b: 's, 's> FromSlice<'b, 's> for FastaRecord<'b> {
    type State = FastaState;

    /// Locates the next FASTA record in `rb`.
    ///
    /// On success the header and sequence offsets are stored in `parser_state` and
    /// `consumed` is advanced to the `>` of the following record (or to the end of the
    /// buffer at EOF). Returns `Ok(false)` once the buffer is empty at EOF.
    ///
    /// # Errors
    /// An "incomplete" error is returned when more data is needed; a plain error is
    /// returned when the record does not start with `>` or a `>` appears mid-line.
    fn parse(
        rb: &[u8],
        eof: bool,
        consumed: &mut usize,
        parser_state: &mut Self::State,
    ) -> Result<bool, EtError> {
        if !eof && rb.is_empty() {
            // TODO: also check if it's just some whitespace?
            return Err(EtError::new("No FASTA could be parsed").incomplete());
        } else if eof && rb.is_empty() {
            return Ok(false);
        }
        if rb[0] != b'>' {
            return Err("Valid FASTA records start with '>'".into());
        }
        let seq_start = if let Some(p) = memchr(b'\n', rb) {
            // strip out the \r too if this is a \r\n ending
            parser_state.header_end = if p > 0 && rb[p - 1] == b'\r' {
                p - 1
            } else {
                p
            };
            p + 1
        } else {
            return Err(EtError::new("Incomplete header").incomplete());
        };

        if let Some(p) = memchr(b'>', &rb[seq_start..]) {
            // the next record's `>` must be the first byte of a line
            if p == 0 || rb.get(seq_start + p - 1) != Some(&b'\n') {
                return Err("Unexpected '>' found".into());
            }
            // drop the line terminator (`\n` or `\r\n`) that precedes the next `>`
            if rb.get(seq_start + p - 2) == Some(&b'\r') {
                parser_state.seq = (seq_start, seq_start + p - 2);
            } else {
                parser_state.seq = (seq_start, seq_start + p - 1);
            }
            *consumed += seq_start + p;
        } else if eof {
            parser_state.seq = (seq_start, rb.len());
            // at eof; just return the end
            *consumed += rb.len();
        } else {
            return Err(EtError::new("Sequence needs more data").incomplete());
        }
        Ok(true)
    }

    /// Fills in `id` and `sequence` from the offsets recorded by `parse`.
    ///
    /// A single-line sequence is borrowed from `rb`; a multi-line sequence is copied
    /// into a new buffer with the `\n` / `\r\n` line breaks removed.
    fn get(&mut self, rb: &'b [u8], state: &Self::State) -> Result<(), EtError> {
        // skip the leading `>`
        self.id = alloc::str::from_utf8(&rb[1..state.header_end])?;
        let raw_sequence = &rb[state.seq.0..state.seq.1];
        let mut seq_newlines = memchr_iter(b'\n', raw_sequence).peekable();
        self.sequence = if seq_newlines.peek().is_none() {
            raw_sequence.into()
        } else {
            let mut new_buf = Vec::with_capacity(raw_sequence.len());
            let mut start = 0;
            for pos in seq_newlines {
                if pos >= 1 && raw_sequence.get(pos - 1) == Some(&b'\r') {
                    new_buf.extend_from_slice(&raw_sequence[start..pos - 1]);
                } else {
                    new_buf.extend_from_slice(&raw_sequence[start..pos]);
                }
                start = pos + 1;
            }
new_buf.extend_from_slice(&raw_sequence[start..]);
            new_buf.into()
        };
        Ok(())
    }
}

impl_reader!(FastaReader, FastaRecord, FastaRecord<'r>, FastaState, ());

#[cfg(test)]
mod tests {
    use alloc::borrow::Cow;

    use super::*;

    #[test]
    fn test_fasta_reading() -> Result<(), EtError> {
        const TEST_FASTA: &[u8] = b">id\nACGT\n>id2\nTGCA";
        let mut pt = FastaReader::new(TEST_FASTA, None)?;

        let mut ix = 0;
        while let Some(FastaRecord { id, sequence }) = pt.next()? {
            match ix {
                0 => {
                    assert_eq!(id, "id");
                    assert_eq!(sequence, Cow::Borrowed(&b"ACGT"[..]));
                }
                1 => {
                    assert_eq!(id, "id2");
                    assert_eq!(sequence, Cow::Borrowed(&b"TGCA"[..]));
                }
                _ => return Err("bad line".into()),
            }
            ix += 1;
        }
        assert_eq!(ix, 2);
        Ok(())
    }

    #[test]
    fn test_fasta_short() -> Result<(), EtError> {
        const TEST_FASTA: &[u8] = b">id";
        let mut pt = FastaReader::new(TEST_FASTA, None)?;
        assert!(pt.next().is_err());

        const TEST_FASTA_2: &[u8] = b">\n>";
        let mut pt = FastaReader::new(TEST_FASTA_2, None)?;
        assert!(pt.next().is_err());

        Ok(())
    }

    #[test]
    fn test_fasta_multiline() -> Result<(), EtError> {
        const TEST_FASTA: &[u8] = b">id\nACGT\nAAAA\n>id2\nTGCA";
        let mut pt = FastaReader::new(TEST_FASTA, None)?;

        let FastaRecord { id, sequence } = pt.next()?.expect("first record present");
        assert_eq!(id, "id");
        assert_eq!(sequence, Cow::Owned::<[u8]>(b"ACGTAAAA".to_vec()));

        let FastaRecord { id, sequence } = pt.next()?.expect("second record present");
        assert_eq!(id, "id2");
        assert_eq!(sequence, Cow::Borrowed(b"TGCA"));

        assert!(pt.next()?.is_none());
        Ok(())
    }

    #[test]
    fn test_fasta_multiline_extra_newlines() -> Result<(), EtError> {
        const TEST_FASTA: &[u8] = b">id\r\nACGT\r\nAAAA\r\n>id2\r\nTGCA\r\n";
        let mut pt = FastaReader::new(TEST_FASTA, None)?;

        let FastaRecord { id, sequence } = pt.next()?.expect("first record present");
        assert_eq!(id, "id");
        assert_eq!(sequence, Cow::Owned::<[u8]>(b"ACGTAAAA".to_vec()));

        let FastaRecord { id, sequence } = pt.next()?.expect("second record present");
        assert_eq!(id, "id2");
        assert_eq!(sequence, Cow::Borrowed(b"TGCA"));

        assert!(pt.next()?.is_none());
        Ok(())
    }

    #[test]
    fn test_fasta_empty_fields() -> Result<(), EtError> {
        const TEST_FASTA: &[u8] = b">hd\n\n>\n\n";
        let mut pt = FastaReader::new(TEST_FASTA, None)?;

        let FastaRecord { id, sequence } = pt.next()?.expect("first record present");
        assert_eq!(id, "hd");
        assert_eq!(sequence, Cow::Borrowed(b""));

        let FastaRecord { id, sequence } = pt.next()?.expect("second record present");
        assert_eq!(id, "");
        assert_eq!(sequence, Cow::Borrowed(b""));

        assert!(pt.next()?.is_none());
        Ok(())
    }
}

--------------------------------------------------------------------------------
/entab/src/parsers/fastq.rs:
--------------------------------------------------------------------------------
use alloc::vec;
use alloc::vec::Vec;
use memchr::memchr;

use crate::parsers::FromSlice;
use crate::record::StateMetadata;
use crate::EtError;
use crate::{impl_reader, impl_record};

#[derive(Clone, Debug, Default)]
/// A single sequence with quality data from a FASTQ file
pub struct FastqRecord<'r> {
    /// The ID/header line
    pub id: &'r str,
    /// The sequence itself
    pub sequence: &'r [u8],
    /// The matching quality scores for bases in the sequence
    pub quality: &'r [u8],
}

impl_record!(FastqRecord<'r>: id, sequence, quality);

/// The current state of FASTQ parsing; note that we use tuples of usize because Range doesn't
/// support copying and tuples with an inclusive and exclusive bound are actually fairly slow.
#[derive(Clone, Copy, Debug, Default)]
pub struct FastqState {
    /// Offset of the end of the header line (line terminator excluded)
    header_end: usize,
    /// Start (inclusive) and end (exclusive) offsets of the sequence line
    seq: (usize, usize),
    /// Start (inclusive) and end (exclusive) offsets of the quality line
    qual: (usize, usize),
}

impl StateMetadata for FastqState {
    fn header(&self) -> Vec<&str> {
        vec!["id", "sequence", "quality"]
    }
}

impl<'b: 's, 's> FromSlice<'b, 's> for FastqState {
    type State = ();
}

impl<'b: 's, 's> FromSlice<'b, 's> for FastqRecord<'s> {
    type State = FastqState;

    /// Locates the next four-line FASTQ record in `buf`.
    ///
    /// The header, sequence and quality offsets are stored in `state` and `consumed` is
    /// advanced past the record. The quality line is assumed to be exactly as long as
    /// the sequence line. Returns `Ok(false)` once the buffer is empty at EOF.
    ///
    /// # Errors
    /// An "incomplete" error is returned when the record runs past the end of `buf`; a
    /// plain error is returned when the record does not start with `@` or a `+` shows up
    /// anywhere but at the start of the separator line.
    fn parse(
        buf: &[u8],
        eof: bool,
        consumed: &mut usize,
        state: &mut Self::State,
    ) -> Result<bool, EtError> {
        if buf.is_empty() {
            if eof {
                return Ok(false);
            }
            return Err(EtError::new("No FASTQ could be parsed").incomplete());
        }
        if buf[0] != b'@' {
            return Err("Valid FASTQ records start with '@'".into());
        }
        // figure out where the first id/header line ends
        let seq_start = if let Some(p) = memchr(b'\n', buf) {
            if p > 0 && buf[p - 1] == b'\r' {
                // strip out the \r too if this is a \r\n ending
                state.header_end = p - 1;
            } else {
                state.header_end = p;
            }
            p + 1
        } else {
            return Err(EtError::new("Record ended prematurely in header").incomplete());
        };
        // figure out where the sequence data is
        let id2_start = if let Some(p) = memchr(b'+', &buf[seq_start..]) {
            if p == 0 || buf[seq_start + p - 1] != b'\n' {
                return Err("Unexpected + found in sequence".into());
            }
            // the + is technically part of the next header so we're
            // already one short before we even check the \r
            if seq_start + p > 2 && buf[seq_start + p - 2] == b'\r' {
                // strip out the \r too if this is a \r\n ending
                state.seq = (seq_start, seq_start + p - 2);
            } else {
                state.seq = (seq_start, seq_start + p - 1);
            }
            seq_start + p
        } else {
            return Err(EtError::new("Record ended prematurely in sequence").incomplete());
        };
        // skip over the second id/header line
        let qual_start = if let Some(p) = memchr(b'\n', &buf[id2_start..]) {
            id2_start + p + 1
        } else {
            return Err(EtError::new("Record ended prematurely in second header").incomplete());
        };
        // and get the quality scores location
        let qual_end = qual_start + (state.seq.1 - state.seq.0);
        // the quality line is expected to end with the same terminator as the sequence line
        let mut rec_end = qual_end + (id2_start - state.seq.1);
        // sometimes the terminal one or two newlines might be missing
        // so we deduct here to avoid a error overconsuming
        if rec_end > buf.len() && eof {
            rec_end -= id2_start - state.seq.1;
        }
        if rec_end > buf.len() {
            return Err(EtError::new("Record ended prematurely in quality").incomplete());
        }
        state.qual = (qual_start, qual_end);

        *consumed += rec_end;
        Ok(true)
    }

    /// Fills in the record by slicing `buf` at the offsets recorded by `parse`.
    fn get(&mut self, buf: &'b [u8], state: &'s Self::State) -> Result<(), EtError> {
        // skip the leading `@`
        self.id = alloc::str::from_utf8(&buf[1..state.header_end])?;
        self.sequence = &buf[state.seq.0..state.seq.1];
        self.quality = &buf[state.qual.0..state.qual.1];
        Ok(())
    }
}

impl_reader!(FastqReader, FastqRecord, FastqRecord<'r>, FastqState, ());

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_fastq_reading() -> Result<(), EtError> {
        const TEST_FASTQ: &[u8] = b"@id\nACGT\n+\n!!!!\n@id2\nTGCA\n+\n!!!!";
        let mut pt = FastqReader::new(TEST_FASTQ, None)?;

        if let Some(FastqRecord {
            id,
            sequence,
            quality,
        }) = pt.next()?
        {
            assert_eq!(id, "id");
            assert_eq!(sequence, &b"ACGT"[..]);
            assert_eq!(quality, &b"!!!!"[..]);
        } else {
            panic!("FASTQ reader returned non-FASTQ reader");
        }

        if let Some(FastqRecord {
            id,
            sequence,
            quality,
        }) = pt.next()?
        {
            assert_eq!(id, "id2");
            assert_eq!(sequence, &b"TGCA"[..]);
            assert_eq!(quality, &b"!!!!"[..]);
        } else {
            panic!("FASTQ reader returned non-FASTQ reader");
        }

        assert!(pt.next()?.is_none());
        Ok(())
    }

    #[test]
    fn test_fastq_extra_newlines() -> Result<(), EtError> {
        const TEST_FASTQ: &[u8] = b"@id\r\nACGT\r\n+\r\n!!!!\r\n@id2\r\nTGCA\r\n+\r\n!!!!\r\n";
        let mut pt = FastqReader::new(TEST_FASTQ, None)?;

        if let Some(FastqRecord {
            id,
            sequence,
            quality,
        }) = pt.next()?
        {
            assert_eq!(id, "id");
            assert_eq!(sequence, &b"ACGT"[..]);
            assert_eq!(quality, &b"!!!!"[..]);
        } else {
            panic!("FASTQ reader returned non-FASTQ reader");
        }

        if let Some(FastqRecord {
            id,
            sequence,
            quality,
        }) = pt.next()?
        {
            assert_eq!(id, "id2");
            assert_eq!(sequence, &b"TGCA"[..]);
            assert_eq!(quality, &b"!!!!"[..]);
        } else {
            panic!("FASTQ reader returned non-FASTQ reader");
        }

        assert!(pt.next()?.is_none());
        Ok(())
    }

    #[test]
    fn test_fastq_pathological_sequences() -> Result<(), EtError> {
        const TEST_FASTQ_1: &[u8] = b"@DF\n+\n+\n!";
        let mut pt = FastqReader::new(TEST_FASTQ_1, None)?;
        assert!(pt.next().is_err());

        const TEST_FASTQ_2: &[u8] = b"@\n";
        let mut pt = FastqReader::new(TEST_FASTQ_2, None)?;
        assert!(pt.next().is_err());

        Ok(())
    }

    #[test]
    fn test_fastq_from_file() -> Result<(), EtError> {
        let data: &[u8] = include_bytes!("../../tests/data/test.fastq");
        let mut reader = FastqReader::new(data, None)?;
        while reader.next()?.is_some() {}
        Ok(())
    }
}

--------------------------------------------------------------------------------
/entab/src/record.rs:
--------------------------------------------------------------------------------
use alloc::borrow::Cow;
use alloc::collections::BTreeMap;
use alloc::string::{String, ToString};
use alloc::vec::Vec;
use core::convert::TryFrom;

use chrono::{NaiveDate, NaiveDateTime};
use serde::{Serialize, Serializer};

use crate::error::EtError;

/// For a given state struct, the metadata associated with that struct.
///
/// Primarily used to generate the corresponding metadata in the
/// `RecordReader` trait.
16 | pub trait StateMetadata { 17 | /// Metadata about the current state of the parser 18 | fn metadata(&self) -> BTreeMap { 19 | BTreeMap::new() 20 | } 21 | 22 | /// The fields in the associated struct 23 | fn header(&self) -> Vec<&str>; 24 | } 25 | 26 | impl StateMetadata for () { 27 | fn header(&self) -> Vec<&str> { 28 | Vec::new() 29 | } 30 | } 31 | 32 | /// Autogenerates the conversion from a struct into the matching `Vec` of 33 | /// headers and the corresponding `Vec` of `Value`s to allow decomposing 34 | /// these raw structs into a common Record system that allows abstracting 35 | /// over different file formats. 36 | #[macro_export] 37 | macro_rules! impl_record { 38 | ($type:ty : $($key:ident),* ) => { 39 | impl<'r> From<$type> for ::alloc::vec::Vec<$crate::record::Value<'r>> { 40 | fn from(record: $type) -> Self { 41 | ::alloc::vec![$(record.$key.into(),)*] 42 | } 43 | } 44 | }; 45 | ($type:ty : $($key:ident)+ ) => { record!($($key),+) }; 46 | } 47 | 48 | /// An arbitrary serializable value 49 | /// 50 | /// Similar to the value types in `toml-rs` and `serde-json`, but in addition 51 | /// we need to derive other methods for e.g. converting into something 52 | /// displayable in a TSV so we couldn't use those. 
53 | #[derive(PartialEq, Clone, Debug)] 54 | pub enum Value<'a> { 55 | /// A null value; all other types are considered implicitly nullable 56 | Null, 57 | /// A true/false value 58 | Boolean(bool), 59 | /// A date with associated time 60 | Datetime(NaiveDateTime), 61 | /// A floating point number 62 | Float(f64), 63 | /// An integer 64 | Integer(i64), 65 | /// A string/textual data 66 | String(Cow<'a, str>), 67 | /// A list of `Value`s (not well supported yet) 68 | List(Vec>), 69 | /// A record mapping keys to `Value`s 70 | Record(BTreeMap>), 71 | } 72 | 73 | impl<'a> Value<'a> { 74 | /// Converts an ISO-8601 formated date into a `Value::Datetime` 75 | /// 76 | /// # Errors 77 | /// If the string can't be interpreted as a date, an error is returned. 78 | pub fn from_iso_date(string: &str) -> Result { 79 | let datetime = NaiveDateTime::parse_from_str(string, "%+") 80 | .map_err(|e| EtError::from(e.to_string()))?; 81 | Ok(Self::Datetime(datetime)) 82 | } 83 | 84 | /// If the Value is a String, return the string. 85 | /// 86 | /// # Errors 87 | /// If the value isn't a string, an error is returned. 
88 | pub fn into_string(self) -> Result { 89 | if let Value::String(s) = self { 90 | return Ok(s.into_owned()); 91 | } 92 | Err(EtError::from("Value was not a string")) 93 | } 94 | } 95 | 96 | impl<'a, T: Into>> From> for Value<'a> { 97 | fn from(x: Option) -> Self { 98 | match x { 99 | None => Value::Null, 100 | Some(s) => s.into(), 101 | } 102 | } 103 | } 104 | 105 | impl<'a> From for Value<'a> { 106 | fn from(x: bool) -> Self { 107 | Value::Boolean(x) 108 | } 109 | } 110 | 111 | impl<'a> From for Value<'a> { 112 | fn from(x: f32) -> Self { 113 | Value::Float(f64::from(x)) 114 | } 115 | } 116 | 117 | impl<'a> From for Value<'a> { 118 | fn from(x: f64) -> Self { 119 | Value::Float(x) 120 | } 121 | } 122 | 123 | impl<'a> From for Value<'a> { 124 | fn from(x: u8) -> Self { 125 | Value::Integer(i64::from(x)) 126 | } 127 | } 128 | 129 | impl<'a> From for Value<'a> { 130 | fn from(x: u16) -> Self { 131 | Value::Integer(i64::from(x)) 132 | } 133 | } 134 | 135 | impl<'a> From for Value<'a> { 136 | fn from(x: i32) -> Self { 137 | Value::Integer(i64::from(x)) 138 | } 139 | } 140 | 141 | impl<'a> From for Value<'a> { 142 | fn from(x: u32) -> Self { 143 | Value::Integer(i64::from(x)) 144 | } 145 | } 146 | 147 | impl<'a> From for Value<'a> { 148 | fn from(x: i64) -> Self { 149 | Value::Integer(x) 150 | } 151 | } 152 | 153 | impl<'a> From for Value<'a> { 154 | fn from(x: u64) -> Self { 155 | if x.leading_zeros() == 0 { 156 | // handle u64 -> i64 overflow by saturating; maybe someday this should be a try_from? 
157 | Value::Integer(i64::MAX) 158 | } else { 159 | Value::Integer(i64::try_from(x).unwrap()) 160 | } 161 | } 162 | } 163 | 164 | impl<'a> From> for Value<'a> { 165 | fn from(x: Cow<'a, [u8]>) -> Self { 166 | Value::String(match x { 167 | Cow::Borrowed(b) => String::from_utf8_lossy(b), 168 | Cow::Owned(o) => Cow::Owned(String::from_utf8_lossy(&o).into_owned()), 169 | }) 170 | } 171 | } 172 | 173 | impl<'a> From<&'a [u8]> for Value<'a> { 174 | fn from(x: &'a [u8]) -> Self { 175 | Value::String(String::from_utf8_lossy(x)) 176 | } 177 | } 178 | 179 | impl<'a> From> for Value<'a> { 180 | fn from(x: Vec) -> Self { 181 | Value::String(Cow::Owned(String::from_utf8_lossy(&x).into_owned())) 182 | } 183 | } 184 | 185 | impl<'a> From> for Value<'a> { 186 | fn from(x: Cow<'a, str>) -> Self { 187 | Value::String(x) 188 | } 189 | } 190 | 191 | impl<'a> From<&'a str> for Value<'a> { 192 | fn from(x: &'a str) -> Self { 193 | Value::String(x.into()) 194 | } 195 | } 196 | 197 | impl<'a> From for Value<'a> { 198 | fn from(x: String) -> Self { 199 | Value::String(x.into()) 200 | } 201 | } 202 | 203 | impl<'a> From for Value<'a> { 204 | fn from(d: NaiveDateTime) -> Self { 205 | Value::Datetime(d) 206 | } 207 | } 208 | 209 | impl<'a> From for Value<'a> { 210 | fn from(d: NaiveDate) -> Self { 211 | Value::Datetime(d.and_hms_opt(0, 0, 0).unwrap()) 212 | } 213 | } 214 | 215 | impl<'a> From<&'a [String]> for Value<'a> { 216 | fn from(value: &'a [String]) -> Self { 217 | let mut rec = Vec::with_capacity(value.len()); 218 | for v in value { 219 | let bv: &str = v.as_ref(); 220 | rec.push(bv.into()); 221 | } 222 | Value::List(rec) 223 | } 224 | } 225 | 226 | impl<'a> From> for Value<'a> { 227 | fn from(value: Vec) -> Self { 228 | let mut rec = Vec::with_capacity(value.len()); 229 | for v in value { 230 | rec.push(v.into()); 231 | } 232 | Value::List(rec) 233 | } 234 | } 235 | 236 | impl<'a> From>> for Value<'a> { 237 | fn from(value: Vec>) -> Self { 238 | Value::List(value) 239 | } 240 | } 241 
| 242 | impl<'a> Serialize for Value<'a> { 243 | fn serialize(&self, serializer: S) -> Result { 244 | match *self { 245 | Value::Null => serializer.serialize_none(), 246 | Value::Boolean(b) => serializer.serialize_bool(b), 247 | Value::Datetime(ref s) => s.serialize(serializer), 248 | Value::Float(f) => serializer.serialize_f64(f), 249 | Value::Integer(i) => serializer.serialize_i64(i), 250 | Value::List(ref a) => a.serialize(serializer), 251 | Value::Record(ref t) => t.serialize(serializer), 252 | Value::String(ref s) => serializer.serialize_str(s), 253 | } 254 | } 255 | } 256 | -------------------------------------------------------------------------------- /entab/src/parsers/common.rs: -------------------------------------------------------------------------------- 1 | use alloc::format; 2 | use core::convert::TryInto; 3 | use core::marker::Copy; 4 | 5 | use memchr::{memchr, memchr_iter}; 6 | 7 | use crate::error::EtError; 8 | use crate::parsers::{Endian, FromSlice}; 9 | 10 | macro_rules! 
impl_extract { 11 | ($return:ty) => { 12 | impl<'b: 's, 's> FromSlice<'b, 's> for $return { 13 | type State = Endian; 14 | 15 | #[inline] 16 | fn parse( 17 | buf: &[u8], 18 | _eof: bool, 19 | consumed: &mut usize, 20 | _state: &mut Self::State, 21 | ) -> Result { 22 | if buf.len() < core::mem::size_of::<$return>() { 23 | let err: EtError = 24 | format!("Could not read {}", ::core::any::type_name::<$return>()).into(); 25 | return Err(err.incomplete()); 26 | } 27 | *consumed += core::mem::size_of::<$return>(); 28 | Ok(true) 29 | } 30 | 31 | fn get(&mut self, buf: &'b [u8], state: &Self::State) -> Result<(), EtError> { 32 | let slice = buf[..core::mem::size_of::<$return>()].try_into().unwrap(); 33 | *self = match state { 34 | Endian::Big => <$return>::from_be_bytes(slice), 35 | Endian::Little => <$return>::from_le_bytes(slice), 36 | }; 37 | Ok(()) 38 | } 39 | } 40 | }; 41 | } 42 | 43 | impl_extract!(i8); 44 | impl_extract!(u8); 45 | impl_extract!(i16); 46 | impl_extract!(u16); 47 | impl_extract!(i32); 48 | impl_extract!(u32); 49 | impl_extract!(i64); 50 | impl_extract!(u64); 51 | impl_extract!(f32); 52 | impl_extract!(f64); 53 | 54 | impl<'b: 's, 's> FromSlice<'b, 's> for () { 55 | type State = (); 56 | } 57 | 58 | impl<'b: 's, 's> FromSlice<'b, 's> for &'b [u8] { 59 | type State = usize; 60 | 61 | #[inline] 62 | fn parse( 63 | buf: &[u8], 64 | _eof: bool, 65 | consumed: &mut usize, 66 | amt: &mut Self::State, 67 | ) -> Result { 68 | if buf.len() < *amt { 69 | let err: EtError = format!("Could not extract a slice of size {}", amt).into(); 70 | return Err(err.incomplete()); 71 | } 72 | *consumed += *amt; 73 | Ok(true) 74 | } 75 | 76 | #[inline] 77 | fn get(&mut self, buf: &'b [u8], amt: &Self::State) -> Result<(), EtError> { 78 | *self = &buf[..*amt]; 79 | Ok(()) 80 | } 81 | } 82 | 83 | impl<'b: 's, 's> FromSlice<'b, 's> for &'b str { 84 | type State = usize; 85 | 86 | #[inline] 87 | fn parse( 88 | buf: &[u8], 89 | _eof: bool, 90 | consumed: &mut usize, 91 | amt: &mut 
Self::State, 92 | ) -> Result { 93 | if buf.len() < *amt { 94 | let err: EtError = format!("Could not extract a slice of size {}", amt).into(); 95 | return Err(err.incomplete()); 96 | } 97 | *consumed += *amt; 98 | Ok(true) 99 | } 100 | 101 | #[inline] 102 | fn get(&mut self, buf: &'b [u8], amt: &Self::State) -> Result<(), EtError> { 103 | *self = core::str::from_utf8(&buf[..*amt])?; 104 | Ok(()) 105 | } 106 | } 107 | 108 | /// Used to read a single line out of the buffer. 109 | /// 110 | /// Assumes all lines are terminated with a '\n' and an optional '\r' 111 | /// before so should handle almost all current text file formats, but 112 | /// may fail on older '\r' only formats. 113 | #[derive(Clone, Copy, Debug, Default, PartialEq)] 114 | pub(crate) struct NewLine<'b>(pub(crate) &'b [u8]); 115 | 116 | impl<'b: 's, 's> FromSlice<'b, 's> for NewLine<'b> { 117 | type State = usize; 118 | 119 | #[inline] 120 | fn parse( 121 | buf: &[u8], 122 | eof: bool, 123 | consumed: &mut usize, 124 | state: &mut Self::State, 125 | ) -> Result { 126 | if buf.is_empty() { 127 | if eof { 128 | return Ok(false); 129 | } 130 | return Err(EtError::new("Could not extract a new line").incomplete()); 131 | } 132 | // find the newline 133 | let (end, to_consume) = if let Some(e) = memchr(b'\n', buf) { 134 | if buf[..e].last() == Some(&b'\r') { 135 | (e - 1, e + 1) 136 | } else { 137 | (e, e + 1) 138 | } 139 | } else if eof { 140 | // we couldn't find a new line, but we are at the end of the file 141 | // so return everything to the EOF 142 | let l = buf.len(); 143 | (l, l) 144 | } else { 145 | // couldn't find the character; load more 146 | return Err(EtError::new("Could not extract a new line").incomplete()); 147 | }; 148 | *state = end; 149 | 150 | *consumed += to_consume; 151 | Ok(true) 152 | } 153 | 154 | #[inline] 155 | fn get(&mut self, buf: &'b [u8], amt: &Self::State) -> Result<(), EtError> { 156 | self.0 = &buf[..*amt]; 157 | Ok(()) 158 | } 159 | } 160 | 161 | /// Used to read from 
a buffer until the given `state` slice is found and then discard everything before 162 | /// that `state` slice. Note that this never returns a consumed length of more than 0 because it 163 | /// silently updates the state as it consumes so it doesn't have to re-search the buffer if the 164 | /// buffer needs to be refilled. 165 | #[derive(Clone, Copy, Debug, Default, PartialEq)] 166 | pub(crate) struct SeekPattern; 167 | 168 | impl<'b: 's, 's> FromSlice<'b, 's> for SeekPattern { 169 | type State = &'s [u8]; 170 | 171 | #[inline] 172 | fn parse( 173 | buffer: &[u8], 174 | eof: bool, 175 | consumed: &mut usize, 176 | pat: &mut Self::State, 177 | ) -> Result { 178 | for pos in memchr_iter(pat[0], buffer) { 179 | if pos + pat.len() > buffer.len() { 180 | *consumed += pos; 181 | if eof { 182 | return Ok(false); 183 | } 184 | let err: EtError = format!( 185 | "{:?} may be at end of buffer, but no more could be pulled", 186 | pat 187 | ) 188 | .into(); 189 | return Err(err.incomplete()); 190 | } 191 | if &buffer[pos..pos + pat.len()] == *pat { 192 | *consumed += pos; 193 | return Ok(true); 194 | } 195 | } 196 | 197 | *consumed = buffer.len(); 198 | if eof { 199 | return Ok(false); 200 | } 201 | let err: EtError = format!("Could not find {:?}", pat).into(); 202 | Err(err.incomplete()) 203 | } 204 | 205 | #[inline] 206 | fn get(&mut self, _buf: &'b [u8], _amt: &Self::State) -> Result<(), EtError> { 207 | Ok(()) 208 | } 209 | } 210 | 211 | /// Used to skip ahead in a buffer 212 | #[derive(Clone, Copy, Debug, Default, PartialEq)] 213 | pub(crate) struct Skip; 214 | 215 | impl<'b: 's, 's> FromSlice<'b, 's> for Skip { 216 | type State = usize; 217 | 218 | #[inline] 219 | fn parse( 220 | buffer: &[u8], 221 | _eof: bool, 222 | consumed: &mut usize, 223 | amt: &mut Self::State, 224 | ) -> Result { 225 | if buffer.len() < *amt { 226 | let err: EtError = 227 | format!("Buffer terminated before {} bytes could be skipped.", amt).into(); 228 | return Err(err.incomplete()); 229 | } 230 
*consumed += *amt;
        Ok(true)
    }

    #[inline]
    fn get(&mut self, _buf: &'b [u8], _amt: &Self::State) -> Result<(), EtError> {
        Ok(())
    }
}

/// Used to consume whatever is left in the buffer once the end of the file is reached
#[derive(Clone, Copy, Debug, Default, PartialEq)]
pub(crate) struct EndOfFile;

impl<'b: 's, 's> FromSlice<'b, 's> for EndOfFile {
    type State = ();

    #[inline]
    fn parse(
        buffer: &[u8],
        eof: bool,
        consumed: &mut usize,
        _state: &mut Self::State,
    ) -> Result<bool, EtError> {
        if !eof {
            return Err(EtError::from("No EOF yet").incomplete());
        }
        *consumed += buffer.len();
        Ok(true)
    }

    #[inline]
    fn get(&mut self, _buf: &'b [u8], _state: &Self::State) -> Result<(), EtError> {
        Ok(())
    }
}

--------------------------------------------------------------------------------
/entab/src/parsers/xml.rs:
--------------------------------------------------------------------------------
use core::marker::Copy;
// use alloc::collections::BTreeMap;
use alloc::borrow::ToOwned;
use alloc::format;
use alloc::str::from_utf8;
use alloc::string::String;
use alloc::vec;
use alloc::vec::Vec;

use memchr::{memchr, memchr3_iter};

use crate::parsers::{extract, FromSlice};
use crate::record::StateMetadata;
use crate::EtError;
use crate::{impl_reader, impl_record};

/// What kind of XML tag this is
#[derive(Clone, Copy, Debug)]
pub enum XmlTagType {
    /// An opening tag, e.g. `<a>`
    Open,
    /// A closing tag, e.g. `</a>`
    Close,
    /// A self-closing tag, e.g. `<a />`
    SelfClose,
}
// TODO: maybe CDATA, DOCTYPE, comments too?

impl Default for XmlTagType {
    fn default() -> Self {
        XmlTagType::Open
    }
}

/// Convenience struct for tokenizing tags out of XML streams
#[derive(Clone, Copy, Debug, Default)]
pub struct XmlTag<'r> {
    tag_type: XmlTagType,
    id: &'r str,
}

impl<'b: 's, 's> FromSlice<'b, 's> for XmlTag<'b> {
    type State = ();

    /// Finds the `>` that closes the tag at the start of `rb`, ignoring any `>` inside a
    /// single- or double-quoted attribute value, and consumes up to and including it.
    ///
    /// # Errors
    /// Fails if no closing `>` is found and the buffer already exceeds 1024 bytes or the
    /// end of the file was reached; otherwise an "incomplete" error asks for more data.
    fn parse(
        rb: &[u8],
        eof: bool,
        consumed: &mut usize,
        _state: &mut Self::State,
    ) -> Result<bool, EtError> {
        // b' ' means "not inside quotes"; otherwise it holds the open quote character
        let mut cur_quote = b' ';
        for i in memchr3_iter(b'>', b'"', b'\'', rb) {
            match (rb[i], cur_quote) {
                // if we're not in quotes and see a >, the tag is complete
                (b'>', b' ') => {
                    *consumed += i + 1;
                    return Ok(true);
                }
                // if we're not in quotes and see a quote, start "quoting"
                (b'\'', b' ') => cur_quote = b'\'',
                (b'"', b' ') => cur_quote = b'"',
                // if we're in quotes and see a quote, stop "quoting"
                (b'\'', b'\'') => cur_quote = b' ',
                (b'"', b'"') => cur_quote = b' ',
                _ => {}
            }
        }
        if rb.len() > 1024 {
            return Err(format!("Tags larger than {} not supported", 1024).into());
        }
        if eof {
            return Err("Tag was never closed".into());
        }
        // the buffer cannot grow inside this call, so hand control back to the caller to
        // refill it instead of rescanning the same bytes forever
        Err(EtError::from("Tag was not closed within the buffer").incomplete())
    }

    /// Classifies the tag in `buf` (the bytes from `<` through `>`) and extracts its name,
    /// which runs up to the first space or the end of the tag.
    fn get(
        &mut self,
        buf: &'b [u8],
        _state: &Self::State,
    ) -> Result<(), EtError> {
        // the slicing below relies on both delimiters being present
        if buf.len() < 2 || buf.first() != Some(&b'<') || buf.last() != Some(&b'>') {
            return Err(EtError::from("Tag must start with '<' and end with '>'"));
        }
        let is_closing = buf.get(1) == Some(&b'/');
        // the last byte is always '>', so the self-closing '/' is the one before it
        let is_self_closing = buf[buf.len() - 2] == b'/';
        let (tag_type, data) = match (is_closing, is_self_closing) {
            // TODO: we should be able to use EtError::new here
            (true, true) => return Err(EtError::from("Tag can not start and end with '/'")),
            (true, false) => (XmlTagType::Close, &buf[2..buf.len() - 1]),
            (false, true) => (XmlTagType::SelfClose, &buf[1..buf.len() - 2]),
            (false, false) => (XmlTagType::Open, &buf[1..buf.len() - 1]),
        };
        let id_end = memchr(b' ', data).unwrap_or(data.len());
        self.tag_type = tag_type;
        self.id = from_utf8(&data[..id_end])?;
        // TODO: parse attributes
        Ok(())
    }
}

/// Convenience struct for tokenizing text out of XML streams
#[derive(Clone, Copy, Debug, Default)]
pub struct XmlText<'r>(&'r str);

impl<'b: 's, 's> FromSlice<'b, 's> for XmlText<'b> {
    type State = ();

    /// Consumes the text up to (not including) the next `<`, or everything that is left
    /// when the end of the file is reached.
    ///
    /// # Errors
    /// Fails if no `<` is found within 65536 bytes; an "incomplete" error is returned when
    /// no `<` has been seen yet and more data may follow.
    fn parse(
        rb: &[u8],
        eof: bool,
        consumed: &mut usize,
        _state: &mut Self::State,
    ) -> Result<bool, EtError> {
        // we're parsing a text element
        if let Some(e) = memchr(b'<', rb) {
            *consumed += e;
            return Ok(true);
        }
        if rb.len() > 65536 {
            return Err(
                format!("XML text larger than {} not supported", 65536).into()
            );
        }
        if eof {
            // TODO: add test for this case
            *consumed += rb.len();
            return Ok(true);
        }
        // the end of the text has not been seen yet; ask for more data rather than
        // reporting that nothing is left
        Err(EtError::from("XML text was not terminated within the buffer").incomplete())
    }

    /// Stores the consumed bytes as UTF-8 text.
    fn get(
        &mut self,
        buf: &'b [u8],
        _state: &Self::State,
    ) -> Result<(), EtError> {
        self.0 = from_utf8(buf)?;
        Ok(())
    }
}

/// Current state of the XML parser
#[derive(Clone, Debug, Default)]
pub struct XmlState {
    // token_counts: Vec>,
    // NOTE(review): the element type was missing in this copy of the file; `String` is
    // assumed from the `String`/`ToOwned` imports above -- confirm against upstream.
    stack: Vec<String>,
    is_text: bool,
}

impl StateMetadata for XmlState {
    fn header(&self) -> Vec<&str> {
        vec!["tags", "text"]
    }
}

impl<'b: 's, 's> FromSlice<'b, 's> for XmlState {
    type State = ();
}

/// A single record from an XML stream
#[derive(Clone, Debug, Default)]
pub struct XmlRecord<'r> {
    // NOTE(review): element type assumed to be `String`, as for `XmlState::stack` -- confirm.
    tags: Vec<String>,
    text: &'r str,
    // TODO
    // attributes: BTreeMap
}

impl<'b: 's, 's> FromSlice<'b, 's> for XmlRecord<'r> {
    type State = &'r mut XmlState;

    fn
parse(rb: &[u8], eof: bool, consumed: &mut usize, state: &mut Self::State) -> Result { 174 | if rb.is_empty() { 175 | if !state.stack.is_empty() { 176 | return Err(format!("Closing tag for {} not present?", state.stack.pop().unwrap()).into()); 177 | } else { 178 | return Ok(false); 179 | } 180 | } 181 | let con = &mut 0; 182 | if rb[0] == b'<' { 183 | // it's a tag 184 | let tag = extract::(rb, con, ())?; 185 | match tag.tag_type { 186 | XmlTagType::Open => { 187 | state.stack.push(tag.id.to_owned()); 188 | } 189 | XmlTagType::Close => { 190 | if let Some(open_tag) = state.stack.pop() { 191 | if open_tag != tag.id { 192 | return Err( 193 | format!("Closing tag {} found, but {} was open.", tag.id, open_tag).into() 194 | ); 195 | } 196 | } else { 197 | return Err( 198 | format!( 199 | "Closing tag {} found, but no tags opened before it.", 200 | tag.id 201 | ).into() 202 | ); 203 | } 204 | } 205 | // TODO: we need to return the tag stack with this tag on it 206 | XmlTagType::SelfClose => {} 207 | } 208 | state.is_text = false; 209 | } else { 210 | // it's text; parse the length out 211 | if XmlText::parse(rb, eof, con, &mut ())? { 212 | state.is_text = true; 213 | } else { 214 | return Ok(false); 215 | } 216 | } 217 | *consumed += *con; 218 | 219 | Ok(true) 220 | } 221 | 222 | fn get( 223 | &mut self, 224 | rb: &'r [u8], 225 | state: &Self::State, 226 | ) -> Result<(), EtError> { 227 | self.text = if state.is_text { 228 | from_utf8(rb)? 229 | } else { 230 | "" 231 | }; 232 | self.tags = state.stack.clone(); 233 | Ok(()) 234 | } 235 | } 236 | 237 | impl_record!(XmlRecord<'r>: tags, text); 238 | 239 | impl_reader!(XmlReader, XmlRecord, XmlRecord<'r>, XmlState, ()); 240 | 241 | #[cfg(test)] 242 | mod tests { 243 | use super::*; 244 | 245 | #[test] 246 | fn test_xml_reader() -> Result<(), EtError> { 247 | let data: &[u8] = b"test"; 248 | let mut reader = XmlReader::new(data, ())?; 249 | 250 | // TODO: don't emit on tag close? also emit the current tag? 
251 | let rec = reader.next()?.unwrap(); 252 | assert_eq!(rec.tags, &["a"]); 253 | let rec = reader.next()?.unwrap(); 254 | assert_eq!(rec.tags, &["a"]); 255 | let rec = reader.next()?.unwrap(); 256 | assert!(rec.tags.is_empty()); 257 | assert!(reader.next()?.is_none()); 258 | Ok(()) 259 | } 260 | } 261 | --------------------------------------------------------------------------------