├── .github └── workflows │ └── test.yml ├── .gitignore ├── CHANGELOG.md ├── CITATION.cff ├── CONTRIBUTING.md ├── Dockerfile ├── LICENSE ├── Makefile ├── README.md ├── binding └── main.go ├── build ├── bin │ └── .gitkeep ├── clib │ └── .gitkeep └── release │ └── .gitkeep ├── cli_test.go ├── config.go ├── config_test.go ├── default.nix ├── docker-compose.yml ├── ent ├── internal │ ├── preparser │ │ ├── grammar.peg │ │ ├── grammar.peg.go │ │ ├── preparser.go │ │ └── preparser_test.go │ └── preprocess │ │ ├── annot.rl │ │ ├── cleanup.go │ │ ├── noparse.go │ │ ├── noparse.rl │ │ ├── preprocess.go │ │ ├── preprocess_test.go │ │ ├── virus.go │ │ └── virus.rl ├── nameidx │ └── nameidx.go ├── parsed │ ├── annotation.go │ ├── annotation_test.go │ ├── details.go │ ├── flatten.go │ ├── flatten_test.go │ ├── interface.go │ ├── output.go │ ├── parsed.go │ ├── parsed_result.go │ ├── restore_ambiguous.go │ ├── warning.go │ ├── warning_test.go │ └── words.go ├── parser │ ├── ast.go │ ├── engine.go │ ├── grammar.peg │ ├── grammar.peg.go │ ├── interfaces.go │ ├── name.go │ ├── output.go │ ├── parser.go │ └── parser_test.go ├── stemmer │ ├── stemmer.go │ └── stemmer_test.go └── str │ ├── str.go │ └── str_test.go ├── flake.lock ├── flake.nix ├── gnparser.go ├── gnparser ├── LICENSE ├── cmd │ ├── flags.go │ ├── parse_batch.go │ ├── parse_stream.go │ └── root.go ├── main.go └── tools.go ├── gnparser_stream.go ├── gnparser_test.go ├── go.mod ├── go.sum ├── interface.go ├── io ├── dict │ ├── data │ │ ├── README.md │ │ ├── bacteria_genera.txt │ │ ├── bacteria_genera_homonyms.txt │ │ └── genera_auth_icn.txt │ ├── dict.go │ └── dict_test.go └── web │ ├── gnparser_service.go │ ├── interface.go │ ├── server.go │ ├── static │ ├── images │ │ ├── favicon.ico │ │ ├── github-mark.svg │ │ └── gna.svg │ └── styles │ │ ├── parser.css │ │ └── screen.css │ ├── templates.go │ ├── templates │ ├── doc_api.html │ ├── home.html │ └── layout.html │ ├── web.go │ └── web_internal_test.go ├── man ├── gnparser.1 
├── gnparser.1.html └── gnparser.1.ronn ├── nsqd.dat ├── quality.md ├── shell.nix ├── testdata ├── exceptions.txt ├── stems.txt ├── test_data.md └── test_data_cultivars.md ├── tools ├── gentest.go └── quality.go └── version.go /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: 4 | push: 5 | branches: [master] 6 | pull_request: 7 | branches: [master] 8 | 9 | jobs: 10 | build: 11 | name: Build 12 | runs-on: ${{ matrix.os }} 13 | 14 | strategy: 15 | matrix: 16 | os: [ubuntu-latest, macos-latest, windows-latest] 17 | 18 | steps: 19 | - name: Set up Go 20 | uses: actions/setup-go@v3 21 | with: 22 | go-version: 1.23 23 | 24 | - name: Check out code into the Go module directory 25 | uses: actions/checkout@v3 26 | - run: git fetch --prune --unshallow 27 | 28 | - name: install tools and dependencies 29 | run: make tools 30 | 31 | - name: Test 32 | run: make test 33 | 34 | - name: Build 35 | run: make build 36 | 37 | - name: Build C lib 38 | run: make clib 39 | 40 | - name: Store C library 41 | uses: actions/upload-artifact@v4 42 | with: 43 | name: ${{ matrix.os }}-clib 44 | path: binding/lib* 45 | retention-days: 1 46 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | t 3 | result 4 | prof 5 | .vscode 6 | 200k-lines.txt 7 | test_data.new.txt 8 | .idea 9 | gnparser/gnparser 10 | bench*.txt 11 | binding/libgnparser.h 12 | binding/*.so 13 | build/** 14 | .DS_Store 15 | *.pprof 16 | *.test 17 | *.gif 18 | __debug_bin 19 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | message: "If you use this software, please cite it as below." 3 | title: "GNparser -- a powerful scientific names parser." 
4 | version: v1.11.1 5 | authors: 6 | - family-names: "Mozzherin" 7 | given-names: "Dmitry" 8 | orcid: "https://orcid.org/0000-0003-1593-1417" 9 | - family-names: "Marsden" 10 | given-names: "Toby" 11 | - family-names: "Pereira" 12 | given-names: "Hernán Lucas" 13 | orcid: "https://orcid.org/0000-0001-6681-7038" 14 | repository-code: "https://github.com/gnames/gnparser" 15 | doi: 10.5281/zenodo.14096467 16 | date-released: 2024-11-07 17 | license: MIT 18 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to contribute to ``gnparser`` project 2 | 3 | ## **Did you find a bug?** 4 | 5 | * **Ensure the bug was not already reported** by searching on GitHub under 6 | [Issues](https://github.com/gnames/gnparser/issues). 7 | 8 | * If you're unable to find an open issue addressing the problem, [open a new 9 | one](https://github.com/gnames/gnparser/issues/new). Be sure to include a 10 | **title and clear description**, as much relevant information as possible, 11 | and a **code sample** or an **executable test case** via 12 | [https://parser.globalnames.org](https://parser.globalnames.org) demonstrating 13 | the expected behavior that is not occurring. 14 | * Make sure you **do not put more than one bug report** in the new issue. 15 | 16 | ## **Do you intend to add a new feature or change an existing one?** 17 | 18 | * Suggest your change in the [GlobalNames gitter 19 | group](https://gitter.im/GlobalNamesArchitecture/GlobalNames), or [create an 20 | issue](https://github.com/gnames/gnparser/issues/new) that describes your 21 | suggestion in detail. 22 | * Make sure you **do not put more than one feature or change** in the new issue. 23 | 24 | ## **Did you write a patch that fixes a bug?** 25 | 26 | * Open a new GitHub pull request with the patch. 27 | 28 | * Ensure the PR description clearly describes the problem and solution.
Include 29 | the relevant issue number if applicable. 30 | 31 | * Clearly state if your PR is a proof of concept and what needs to be done to 32 | finish it, or, if it is ready to merge patch with tests and documentation 33 | added. 34 | 35 | ## **Did you write a client for your favorite language to access ``gnparser`` functionality via REST api?** 36 | 37 | Let us know about your client on [GlobalNames gitter 38 | group](https://gitter.im/GlobalNamesArchitecture/GlobalNames). 39 | 40 | ## **Do you have questions about the source code?** 41 | 42 | * Ask any question on the [GlobalNames gitter 43 | group](https://gitter.im/GlobalNamesArchitecture/GlobalNames) 44 | 45 | ## **Would you like to contribute, but do not know how?** 46 | 47 | * Read the next section about configuring environment for the project. 48 | 49 | ## **Setting up ``gnparser`` programming environment** 50 | 51 | ### Install Go 52 | 53 | [Download and install Go](https://golang.org/doc/install) for your operating 54 | system. Make sure you [configured GOPATH environment 55 | library](https://github.com/golang/go/wiki/SettingGOPATH). 56 | 57 | You need Go v1.16.x or higher. 58 | 59 | ### Install ``gnparser`` code 60 | 61 | Before Go v1.11 all Go code had to be organized inside of the ``GOPATH`` 62 | directory. Now, for projects like ``gnparser`` that use Go modules it is not 63 | necessary, however many tools still behave assuming old ways, so we recommend 64 | to setup ``gnparser`` code traditional way. 65 | 66 | ```bash 67 | mkdir -p $GOPATH/src/github.com/gnames 68 | cd $GOPATH/src/github.com/gnames 69 | git clone https://github.com/gnames/gnparser.git 70 | # or use URL of your fork on GitHub or GitLab 71 | 72 | cd gnparser 73 | ``` 74 | 75 | ``gnparser`` uses several external tools and technologies: 76 | 77 | 1. [Parsing Expression Grammar tool](https://github.com/pointlander/peg) to 78 | generate parsing code. 79 | 80 | 2. 
[Cobra CLI framework](https://github.com/spf13/cobra) for creating command 81 | line application. 82 | 83 | 3. [goimport tool](https://golang.org/x/tools/cmd/goimports) for fixing 84 | imports in PEG autogenerated go code. 85 | 86 | To install them run 87 | 88 | ```bash 89 | make tools 90 | ``` 91 | 92 | To create a ``gnparser`` executable and place it to $GOPATH/bin 93 | 94 | ```bash 95 | make 96 | ``` 97 | 98 | Now you should be able to use gnparser compiled from the code: 99 | 100 | ```bash 101 | gnparser -f pretty "Pica pica (Linnaeus, 1758)" 102 | ``` 103 | 104 | ### To run tests 105 | 106 | ```bash 107 | make test 108 | ``` 109 | 110 | or 111 | 112 | ```bash 113 | go test ./... 114 | ``` 115 | 116 | ### To generate tests automatically 117 | 118 | If your change generates a lot of changes in `testdata/test_data.md` 119 | and/or `testdata/test_data_cultivars.md` you can generate 120 | `testdata/test_data_new.md` and `testdata/test_data_cultivars_new.md` 121 | files using `gentest.go` tool. 122 | 123 | ```bash 124 | cd tools 125 | go run gentest.go 126 | cd ../testdata 127 | ls 128 | ``` 129 | 130 | You will have two new files in testdata. It is VERY important now to check 131 | difference between old and new test files before making the next step: 132 | 133 | ```bash 134 | mv test_data_new.md test_data.md 135 | mv test_data_cultivars_new.md test_data_cultivars.md 136 | ``` 137 | 138 | ## Benchmarks 139 | 140 | Benchmarks are located in `gnparser_test.go` 141 | 142 | To run benchmarks from the project's root: 143 | 144 | ```bash 145 | # this command will install benchstat 146 | make tools 147 | 148 | go test -bench=. 
-benchmem -count=10 -run=XXX > bench.txt && benchstat bench.txt 149 | ``` 150 | 151 | After running you should get results similar to: 152 | 153 | ```bash 154 | name time/op 155 | Parse/Parse_to_object_once-16 73.0µs ± 1% 156 | Parse/Parse_to_object_once_with_Init-16 83.2µs ± 1% 157 | Parse/Parse_to_object-16 67.5ms ± 1% 158 | Parse/Parse_to_JSON-16 71.5ms ± 1% 159 | Parse/Parse_to_JSON_(Details)-16 71.8ms ± 1% 160 | Parse/Parse_to_CSV-16 69.1ms ± 1% 161 | 162 | name alloc/op 163 | Parse/Parse_to_object_once-16 10.9kB ± 0% 164 | Parse/Parse_to_object_once_with_Init-16 23.8kB ± 0% 165 | Parse/Parse_to_object-16 15.5MB ± 0% 166 | Parse/Parse_to_JSON-16 17.2MB ± 0% 167 | Parse/Parse_to_JSON_(Details)-16 17.2MB ± 0% 168 | Parse/Parse_to_CSV-16 16.2MB ± 0% 169 | 170 | name allocs/op 171 | Parse/Parse_to_object_once-16 250 ± 0% 172 | Parse/Parse_to_object_once_with_Init-16 409 ± 0% 173 | Parse/Parse_to_object-16 235k ± 0% 174 | Parse/Parse_to_JSON-16 242k ± 0% 175 | Parse/Parse_to_JSON_(Details)-16 242k ± 0% 176 | Parse/Parse_to_CSV-16 240k ± 0% 177 | ``` 178 | 179 | ### Accessing a raw parsed AST tree 180 | 181 | PEG parser generates its own abstract syntax tree (AST), that later gets 182 | converted into a ``gnparser`` specific AST. Sometimes it is useful to see the 183 | raw tree of nodes. To do that, open gnparser/gnparser/cmd/root.go, 184 | change ``const debug`` to ``true`` and run ``make``.
After that you will be 185 | able to examine the raw tree of a string, for example: 186 | 187 | ```bash 188 | gnparser "Bubo bubo" 189 | ``` 190 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM alpine:3.14 2 | 3 | LABEL maintainer="Dmitry Mozzherin" 4 | 5 | ENV LAST_FULL_REBUILD=2024-10-11 6 | 7 | WORKDIR /bin 8 | 9 | COPY ./gnparser/gnparser /bin 10 | 11 | ENTRYPOINT [ "gnparser" ] 12 | 13 | CMD ["-p", "8778"] 14 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 gnames 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE.
22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | PROJ_NAME = gnparser 2 | 3 | VERSION = $(shell git describe --tags) 4 | VER = $(shell git describe --tags --abbrev=0) 5 | DATE = $(shell date -u '+%Y-%m-%d_%H:%M:%S%Z') 6 | 7 | NO_C = CGO_ENABLED=0 8 | FLAGS_SHARED = GOARCH=amd64 9 | FLAGS_LINUX = GOARCH=amd64 GOOS=linux 10 | FLAGS_LINUX_ARM = GOARCH=arm64 GOOS=linux 11 | FLAGS_MAC = GOARCH=amd64 GOOS=darwin 12 | FLAGS_MAC_ARM = GOARCH=arm64 GOOS=darwin 13 | FLAGS_WIN = GOARCH=amd64 GOOS=windows 14 | FLAGS_WIN_ARM = GOARCH=arm64 GOOS=windows 15 | FLAGS_LD=-ldflags "-s -w -X github.com/gnames/$(PROJ_NAME).Build=$(DATE) \ 16 | -X github.com/gnames/$(PROJ_NAME).Version=$(VERSION)" 17 | FLAGS_REL = -trimpath -ldflags "-s -w \ 18 | -X github.com/gnames/$(PROJ_NAME).Build=$(DATE)" 19 | 20 | GOCMD = go 21 | GOBUILD = $(GOCMD) build $(FLAGS_LD) 22 | GOINSTALL = $(GOCMD) install $(FLAGS_LD) 23 | GORELEASE = $(GOCMD) build $(FLAGS_REL) 24 | GOCLEAN = $(GOCMD) clean 25 | GOGET = $(GOCMD) get 26 | 27 | RELEASE_DIR ?= "/tmp" 28 | BUILD_DIR ?= "." 29 | CLIB_DIR ?= "." 30 | 31 | all: install 32 | 33 | test: deps install 34 | $(FLAG_MODULE) go test -shuffle=on -race -count=1 ./... 
35 | 36 | test-build: deps build 37 | 38 | deps: 39 | $(GOCMD) mod download; 40 | 41 | tools: deps 42 | @echo Installing tools from tools.go 43 | @cat $(PROJ_NAME)/tools.go | grep _ | awk -F'"' '{print $$2}' | xargs -tI % go install % 44 | 45 | peg: 46 | cd ent/parser; \ 47 | peg grammar.peg; \ 48 | goimports -w grammar.peg.go; \ 49 | cd ../internal/preparser; \ 50 | peg grammar.peg; \ 51 | goimports -w grammar.peg.go; 52 | 53 | ragel: 54 | cd ent/internal/preprocess; \ 55 | ragel -Z -G2 virus.rl; \ 56 | ragel -Z -G2 noparse.rl 57 | 58 | asset: 59 | cd io/fs; \ 60 | $(FLAGS_SHARED) go run -tags=dev assets_gen.go 61 | 62 | build: peg 63 | cd $(PROJ_NAME); \ 64 | $(GOCLEAN); \ 65 | $(NO_C) $(GOBUILD) -o $(BUILD_DIR) 66 | 67 | buildrel: peg 68 | cd $(PROJ_NAME); \ 69 | $(GOCLEAN); \ 70 | $(NO_C) $(GORELEASE) -o $(BUILD_DIR) 71 | 72 | install: peg 73 | cd $(PROJ_NAME); \ 74 | $(GOCLEAN); \ 75 | $(NO_C) $(GOINSTALL) 76 | 77 | release: peg dockerhub 78 | cd $(PROJ_NAME); \ 79 | $(GOCLEAN); \ 80 | $(FLAGS_LINUX) $(NO_C) $(GOBUILD); \ 81 | tar zcf $(RELEASE_DIR)/$(PROJ_NAME)-$(VER)-linux-x86.tar.gz $(PROJ_NAME); \ 82 | $(GOCLEAN); \ 83 | $(FLAGS_LINUX_ARM) $(NO_C) $(GOBUILD); \ 84 | tar zcf $(RELEASE_DIR)/$(PROJ_NAME)-$(VER)-linux-arm.tar.gz $(PROJ_NAME); \ 85 | $(GOCLEAN); \ 86 | $(FLAGS_MAC) $(NO_C) $(GOBUILD); \ 87 | tar zcf $(RELEASE_DIR)/$(PROJ_NAME)-$(VER)-mac-x86.tar.gz $(PROJ_NAME); \ 88 | $(GOCLEAN); \ 89 | $(FLAGS_MAC_ARM) $(NO_C) $(GOBUILD); \ 90 | tar zcf $(RELEASE_DIR)/$(PROJ_NAME)-$(VER)-mac-arm.tar.gz $(PROJ_NAME); \ 91 | $(GOCLEAN); \ 92 | $(FLAGS_WIN) $(NO_C) $(GOBUILD); \ 93 | zip -9 $(RELEASE_DIR)/$(PROJ_NAME)-$(VER)-win-x86.zip $(PROJ_NAME).exe; \ 94 | $(GOCLEAN); \ 95 | $(FLAGS_WIN_ARM) $(NO_C) $(GOBUILD); \ 96 | zip -9 $(RELEASE_DIR)/$(PROJ_NAME)-$(VER)-win-arm.zip $(PROJ_NAME).exe; \ 97 | $(GOCLEAN); 98 | 99 | dc: asset build 100 | docker-compose build; 101 | 102 | docker: build 103 | docker build -t gnames/go$(PROJ_NAME):latest -t 
gnames/go$(PROJ_NAME):$(VERSION) .; \ 104 | cd $(PROJ_NAME); \ 105 | $(GOCLEAN); 106 | 107 | dockerhub: docker 108 | docker push gnames/go$(PROJ_NAME); \ 109 | docker push gnames/go$(PROJ_NAME):$(VERSION) 110 | 111 | clib_darwin: peg 112 | cd binding; \ 113 | $(GOCLEAN); \ 114 | CGO_ENABLED=1 GOOS=darwin GOARCH=arm64 $(GOBUILD) -buildmode=c-shared -o $(CLIB_DIR)/lib$(PROJ_NAME)_arm64.so; \ 115 | CGO_ENABLED=1 GOOS=darwin GOARCH=amd64 $(GOBUILD) -buildmode=c-shared -o $(CLIB_DIR)/lib$(PROJ_NAME)_amd64.so; \ 116 | rm lib$(PROJ_NAME)_amd64.h; \ 117 | mv lib$(PROJ_NAME)_arm64.h lib$(PROJ_NAME).h; \ 118 | lipo -create -output $(CLIB_DIR)/lib$(PROJ_NAME).so $(CLIB_DIR)/lib$(PROJ_NAME)_arm64.so $(CLIB_DIR)/lib$(PROJ_NAME)_amd64.so; 119 | 120 | clib: peg 121 | cd binding; \ 122 | $(GOBUILD) -buildmode=c-shared -o $(CLIB_DIR)/lib$(PROJ_NAME).so; 123 | 124 | quality: 125 | cd tools;\ 126 | $(GOCMD) run quality.go > ../quality.md 127 | 128 | 129 | .PHONY: man 130 | man: ronn 131 | @ronn ./man/$(PROJ_NAME).1.ronn --style=dark 132 | 133 | .PHONY: ronn 134 | ronn: 135 | @which ronn > /dev/null || gem install ronn 136 | -------------------------------------------------------------------------------- /binding/main.go: -------------------------------------------------------------------------------- 1 | // Package main provides C-binding functionality to use parser in 2 | // other languages. 3 | package main 4 | 5 | /* 6 | #include "stdlib.h" 7 | */ 8 | import "C" 9 | 10 | import ( 11 | "strings" 12 | "unsafe" 13 | 14 | "github.com/gnames/gnfmt" 15 | "github.com/gnames/gnlib/ent/nomcode" 16 | "github.com/gnames/gnparser" 17 | ) 18 | 19 | // ParseToString function takes a name-string, desired format, a withDetails 20 | // flag as 0|1 integer. It parses the name-string to either JSON, or a CSV 21 | // string, depending on the desired format. Format argument can take values of 22 | // 'csv', 'compact', 'pretty'. 
If withDetails argument is 0, additional 23 | // parsed details are omitted, if it is 1 -- they are included. 24 | // If diaereses argument is greater than 0, diaereses are preserved. 25 | // 26 | //export ParseToString 27 | func ParseToString( 28 | name *C.char, 29 | fmtStr *C.char, 30 | codeStr *C.char, 31 | details C.int, 32 | diaereses C.int, 33 | ) *C.char { 34 | goname := C.GoString(name) 35 | code := nomcode.New(C.GoString(codeStr)) 36 | frmt, err := gnfmt.NewFormat(C.GoString(fmtStr)) 37 | if err != nil { 38 | frmt = gnfmt.CSV 39 | } 40 | opts := []gnparser.Option{ 41 | gnparser.OptFormat(frmt), 42 | gnparser.OptWithDetails(int(details) > 0), 43 | gnparser.OptCode(code), 44 | gnparser.OptWithPreserveDiaereses(int(diaereses) > 0), 45 | } 46 | cfg := gnparser.NewConfig(opts...) 47 | gnp := gnparser.New(cfg) 48 | parsed := gnp.ParseName(goname).Output(gnp.Format()) 49 | 50 | return C.CString(parsed) 51 | } 52 | 53 | // FreeMemory takes a string pointer and frees its memory. 54 | // 55 | //export FreeMemory 56 | func FreeMemory(p *C.char) { 57 | C.free(unsafe.Pointer(p)) 58 | } 59 | 60 | // ParseAryToString function takes an array of names, parsing format, and a 61 | // withDetails flag as 0|1 integer. Parsed outputs are sent as a string in 62 | // either CSV or JSON format. Format argument can take values of 'csv', 63 | // 'compact', or 'pretty'. For withDetails argument 0 means false, 1 means 64 | // true.
65 | // 66 | //export ParseAryToString 67 | func ParseAryToString( 68 | in **C.char, 69 | length C.int, 70 | fmtStr *C.char, 71 | codeStr *C.char, 72 | details C.int, 73 | diaereses C.int, 74 | ) *C.char { 75 | names := make([]string, int(length)) 76 | code := nomcode.New(C.GoString(codeStr)) 77 | frmt, err := gnfmt.NewFormat(C.GoString(fmtStr)) 78 | if err != nil { 79 | frmt = gnfmt.CSV 80 | } 81 | 82 | opts := []gnparser.Option{ 83 | gnparser.OptFormat(frmt), 84 | gnparser.OptWithDetails(int(details) > 0), 85 | gnparser.OptCode(code), 86 | gnparser.OptWithPreserveDiaereses(int(diaereses) > 0), 87 | } 88 | start := unsafe.Pointer(in) 89 | pointerSize := unsafe.Sizeof(in) 90 | 91 | for i := 0; i < int(length); i++ { 92 | // Copy each input string into a Go string and add it to the slice. 93 | pointer := (**C.char)(unsafe.Pointer(uintptr(start) + uintptr(i)*pointerSize)) 94 | name := C.GoString(*pointer) 95 | names[i] = name 96 | } 97 | 98 | cfg := gnparser.NewConfig(opts...) 99 | gnp := gnparser.New(cfg) 100 | 101 | var res string 102 | parsed := gnp.ParseNames(names) 103 | if gnp.Format() == gnfmt.CSV { 104 | csv := make([]string, length) 105 | for i := range parsed { 106 | csv[i] = parsed[i].Output(gnfmt.CSV) 107 | } 108 | res = strings.Join(csv, "\n") 109 | } else { 110 | json, _ := gnfmt.GNjson{}.Encode(parsed) 111 | res = string(json) 112 | } 113 | return C.CString(res) 114 | } 115 | 116 | func main() {} 117 | -------------------------------------------------------------------------------- /build/bin/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gnames/gnparser/a2bee074a6d9f6b23be70c0d760540863645fa74/build/bin/.gitkeep -------------------------------------------------------------------------------- /build/clib/.gitkeep: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/gnames/gnparser/a2bee074a6d9f6b23be70c0d760540863645fa74/build/clib/.gitkeep -------------------------------------------------------------------------------- /build/release/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gnames/gnparser/a2bee074a6d9f6b23be70c0d760540863645fa74/build/release/.gitkeep -------------------------------------------------------------------------------- /cli_test.go: -------------------------------------------------------------------------------- 1 | package gnparser_test 2 | 3 | import ( 4 | "strings" 5 | "testing" 6 | 7 | "github.com/rendon/testcli" 8 | "github.com/stretchr/testify/assert" 9 | ) 10 | 11 | func TestTMP(t *testing.T) { 12 | assert.True(t, true) 13 | } 14 | 15 | // Run make install before these tests to get meaningful 16 | // results. 17 | 18 | func TestVersion(t *testing.T) { 19 | c := testcli.Command("gnparser", "-V") 20 | c.Run() 21 | assert.True(t, c.Success()) 22 | assert.Contains(t, c.Stdout(), "version:") 23 | 24 | c = testcli.Command("gnparser", "-V", "-f", "simple", 25 | "-j", "200", "-p", "8000") 26 | c.Run() 27 | assert.True(t, c.Success()) 28 | assert.Contains(t, c.Stdout(), "version:") 29 | } 30 | 31 | func TestFormat(t *testing.T) { 32 | t.Run("runs csv format", func(t *testing.T) { 33 | c := testcli.Command("gnparser", "Homo sapiens", "-f", "csv") 34 | c.Run() 35 | assert.True(t, c.Success()) 36 | assert.Contains(t, c.Stdout(), ",Homo sapiens,2") 37 | }) 38 | 39 | t.Run("ignores parsing with --version", func(t *testing.T) { 40 | c := testcli.Command("gnparser", "Homo sapiens", "-f", "simple", "--version") 41 | c.Run() 42 | assert.True(t, c.Success()) 43 | assert.NotContains(t, c.Stdout(), ",Homo sapiens,") 44 | assert.Contains(t, c.Stdout(), "version:") 45 | }) 46 | 47 | t.Run("sets format to default if -f value is unknown", func(t *testing.T) { 48 | c := testcli.Command("gnparser", "Homo sapiens", 
"-f", ":)") 49 | c.Run() 50 | assert.True(t, c.Success()) 51 | assert.Contains(t, c.Stdout(), `Id,Verbatim,Cardinality,`) 52 | }) 53 | } 54 | 55 | func TestStdin(t *testing.T) { 56 | t.Run("takes data from Stdin", func(t *testing.T) { 57 | c := testcli.Command("gnparser", "-f", "simple") 58 | c.SetStdin(strings.NewReader("Homo sapiens")) 59 | c.Run() 60 | assert.True(t, c.Success()) 61 | assert.Contains(t, c.Stdout(), ",Homo sapiens,") 62 | }) 63 | 64 | t.Run("takes multiple names from Stdin", func(t *testing.T) { 65 | c := testcli.Command("gnparser", "-f", "simple") 66 | c.SetStdin(strings.NewReader("Plantago\nBubo L.\n")) 67 | c.Run() 68 | assert.True(t, c.Success()) 69 | assert.Contains(t, c.Stdout(), ",Plantago,") 70 | assert.Contains(t, c.Stdout(), ",Bubo,") 71 | }) 72 | } 73 | -------------------------------------------------------------------------------- /config.go: -------------------------------------------------------------------------------- 1 | package gnparser 2 | 3 | import ( 4 | "log/slog" 5 | "runtime" 6 | 7 | "github.com/gnames/gnfmt" 8 | "github.com/gnames/gnlib/ent/nomcode" 9 | ) 10 | 11 | // Config keeps settings that might affect how parsing is done, 12 | // of change the parsing output. 13 | type Config struct { 14 | // BatchSize sets the maximum number of elements in names-strings slice. 15 | BatchSize int 16 | 17 | // Code contains optional nomenclatural code value. This option is 18 | // useful to solve ambiguous parsing cases and to add cultivar botanical 19 | // rules. 20 | nomcode.Code 21 | 22 | // Debug sets a "debug" state for parsing. The debug state forces output 23 | // format to showing parsed ast tree. 24 | Debug bool 25 | 26 | // Format sets the output format for CLI and Web interfaces. 27 | // There are 3 formats available: 'CSV', 'CompactJSON' and 28 | // 'PrettyJSON'. 
29 | Format gnfmt.Format 30 | 31 | // IgnoreHTMLTags can be set to true when it is desirable to clean up names 32 | // from a few HTML tags often present in names-strings that were planned to 33 | // be presented via an HTML page. 34 | IgnoreHTMLTags bool 35 | 36 | // IsTest can be set to true when parsing functionality is used for tests. 37 | // In such cases the `ParserVersion` field is presented as `test_version` 38 | // instead of displaying the actual version of `gnparser`. 39 | IsTest bool 40 | 41 | // JobsNum sets a level of parallelism used during parsing of 42 | // a stream of name-strings. 43 | JobsNum int 44 | 45 | // Port to run web-service. 46 | Port int 47 | 48 | // WithCapitalization flag, when true, the first letter of a name-string 49 | // is capitalized, if appropriate. 50 | WithCapitalization bool 51 | 52 | // WithDetails can be set to true when a simplified output is not sufficient 53 | // for obtaining the required information. 54 | WithDetails bool 55 | 56 | // WithNoOrder flag, when true, output and input are in different order. 57 | WithNoOrder bool 58 | 59 | // WithPreserveDiaereses flag, when true, diaereses will not be transliterated. 60 | WithPreserveDiaereses bool 61 | 62 | // WithStream changes from parsing a batch by batch, to parsing one name 63 | // at a time. When WithStream is true, BatchSize setting is ignored. 64 | WithStream bool 65 | 66 | // WithWebLogs flag enables logs when running web-service. This flag is 67 | // ignored if `Port` value is not set. 68 | WithWebLogs bool 69 | 70 | // WithSpeciesGroupCut flag means that stemmed version of autonyms (ICN) and 71 | // species group names (ICZN) will be truncated to species. It helps to 72 | // simplify matching names like `Aus bus` and `Aus bus bus`. 73 | WithSpeciesGroupCut bool 74 | } 75 | 76 | // Option is a type that has to be returned by all Option functions. Such 77 | // functions are able to modify the settings of a Config object.
78 | type Option func(*Config) 79 | 80 | // OptBatchSize sets the max number of names in a batch. 81 | func OptBatchSize(i int) Option { 82 | return func(cfg *Config) { 83 | if i <= 0 { 84 | slog.Warn("Batch size should be a positive number") 85 | return 86 | } 87 | cfg.BatchSize = i 88 | } 89 | } 90 | 91 | // OptDebug sets the Debug field. 92 | func OptDebug(b bool) Option { 93 | return func(cfg *Config) { 94 | cfg.Debug = b 95 | } 96 | } 97 | 98 | // OptFormat sets the formatting option for CLI or Web presentation. 99 | // It accepts a gnfmt.Format value to control the output format. 100 | func OptFormat(f gnfmt.Format) Option { 101 | return func(cfg *Config) { 102 | cfg.Format = f 103 | } 104 | } 105 | 106 | // OptIgnoreHTMLTags sets the IgnoreHTMLTags field. This option is useful if 107 | // names-strings contain HTML tags that should be cleaned up during 108 | // parsing. 109 | func OptIgnoreHTMLTags(b bool) Option { 110 | return func(cfg *Config) { 111 | cfg.IgnoreHTMLTags = b 112 | } 113 | } 114 | 115 | // OptIsTest sets a test flag. 116 | func OptIsTest(b bool) Option { 117 | return func(cfg *Config) { 118 | cfg.IsTest = b 119 | } 120 | } 121 | 122 | // OptJobsNum sets the JobsNum field. 123 | func OptJobsNum(i int) Option { 124 | return func(cfg *Config) { 125 | cfg.JobsNum = i 126 | } 127 | } 128 | 129 | // OptPort sets a port for web-service. 130 | func OptPort(i int) Option { 131 | return func(cfg *Config) { 132 | cfg.Port = i 133 | } 134 | } 135 | 136 | // OptWithCapitaliation sets the WithCapitalization field. 137 | func OptWithCapitaliation(b bool) Option { 138 | return func(cfg *Config) { 139 | cfg.WithCapitalization = b 140 | } 141 | } 142 | 143 | // OptCode sets the Code field. 144 | func OptCode(c nomcode.Code) Option { 145 | return func(cfg *Config) { 146 | cfg.Code = c 147 | } 148 | } 149 | 150 | // OptWithDetails sets the WithDetails field.
151 | func OptWithDetails(b bool) Option { 152 | return func(cfg *Config) { 153 | cfg.WithDetails = b 154 | } 155 | } 156 | 157 | // OptWithNoOrder sets the WithNoOrder field. 158 | func OptWithNoOrder(b bool) Option { 159 | return func(cfg *Config) { 160 | cfg.WithNoOrder = b 161 | } 162 | } 163 | 164 | // OptWithPreserveDiaereses sets the WithPreserveDiaereses field. 165 | func OptWithPreserveDiaereses(b bool) Option { 166 | return func(cfg *Config) { 167 | cfg.WithPreserveDiaereses = b 168 | } 169 | } 170 | 171 | // OptWithStream sets the WithStream field. 172 | func OptWithStream(b bool) Option { 173 | return func(cfg *Config) { 174 | cfg.WithStream = b 175 | } 176 | } 177 | 178 | // OptWithWebLogs sets the WithWebLogs field. 179 | func OptWithWebLogs(b bool) Option { 180 | return func(cfg *Config) { 181 | cfg.WithWebLogs = b 182 | } 183 | } 184 | 185 | // OptWithSpeciesGroupCut sets the WithSpeciesGroupCut field. 186 | func OptWithSpeciesGroupCut(b bool) Option { 187 | return func(cfg *Config) { 188 | cfg.WithSpeciesGroupCut = b 189 | } 190 | } 191 | 192 | // NewConfig generates a new Config object. It can take an arbitrary number 193 | // of `Option` functions to modify default configuration settings.
194 | func NewConfig(opts ...Option) Config { 195 | cfg := Config{ 196 | Format: gnfmt.CSV, 197 | JobsNum: runtime.NumCPU(), 198 | BatchSize: 50_000, 199 | IgnoreHTMLTags: false, 200 | Port: 8080, 201 | Code: nomcode.Unknown, 202 | } 203 | for i := range opts { 204 | opts[i](&cfg) 205 | } 206 | return cfg 207 | } 208 | -------------------------------------------------------------------------------- /config_test.go: -------------------------------------------------------------------------------- 1 | package gnparser_test 2 | 3 | import ( 4 | "runtime" 5 | "testing" 6 | 7 | "github.com/gnames/gnfmt" 8 | "github.com/gnames/gnparser" 9 | "github.com/stretchr/testify/assert" 10 | ) 11 | 12 | func TestNew(t *testing.T) { 13 | cfg := gnparser.NewConfig() 14 | deflt := gnparser.Config{ 15 | Format: gnfmt.CSV, 16 | JobsNum: runtime.NumCPU(), 17 | BatchSize: 50_000, 18 | IgnoreHTMLTags: false, 19 | WithDetails: false, 20 | Port: 8080, 21 | IsTest: false, 22 | } 23 | assert.Equal(t, deflt, cfg) 24 | } 25 | 26 | func TestNewOpts(t *testing.T) { 27 | opts := opts() 28 | cnf := gnparser.NewConfig(opts...) 
29 | updt := gnparser.Config{ 30 | Format: gnfmt.CompactJSON, 31 | JobsNum: 161, 32 | BatchSize: 1, 33 | IgnoreHTMLTags: true, 34 | WithDetails: true, 35 | Port: 8989, 36 | } 37 | assert.Equal(t, updt, cnf) 38 | } 39 | 40 | func opts() []gnparser.Option { 41 | return []gnparser.Option{ 42 | gnparser.OptFormat(gnfmt.CompactJSON), 43 | gnparser.OptJobsNum(161), 44 | gnparser.OptBatchSize(1), 45 | gnparser.OptIgnoreHTMLTags(true), 46 | gnparser.OptWithDetails(true), 47 | gnparser.OptPort(8989), 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /default.nix: -------------------------------------------------------------------------------- 1 | { lib, buildGoModule, fetchFromGitHub, stdenv, glibc }: 2 | 3 | buildGoModule rec { 4 | pname = "gnparser"; 5 | version = "v1.6.6"; 6 | date = "2022-05-17"; 7 | 8 | src = ./.; 9 | 10 | vendorSha256 = "sha256-TY/vIgtu/GeVKJ1AonMMxCvIbK3ATc2jp9Zqq1YQ9Mg="; 11 | 12 | buildInputs = [ 13 | stdenv 14 | glibc.static 15 | ]; 16 | 17 | doChecks = false; 18 | 19 | subPackages = "gnparser"; 20 | 21 | ldflags = [ 22 | "-s" 23 | "-w" 24 | "-linkmode external" 25 | "-extldflags" 26 | "-static" 27 | "-X github.com/gnames/gnparser.Version=${version}" 28 | "-X github.com/gnames/gnparser.Build=${date}" 29 | ]; 30 | 31 | meta = with lib; { 32 | description = "Parser for bio scientific names"; 33 | homepage = "https://github.com/gnames/gnparser"; 34 | license = licenses.mit; 35 | maintainers = with maintainers; [ "dimus" ]; 36 | }; 37 | } 38 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | 3 | services: 4 | app: 5 | build: 6 | context: . 
7 | ports: 8 | - 0.0.0.0:8778:8778 9 | command: 10 | - -p 11 | - "8778" 12 | 13 | -------------------------------------------------------------------------------- /ent/internal/preparser/grammar.peg: -------------------------------------------------------------------------------- 1 | package preparser 2 | 3 | type PreParser Peg { PreString } 4 | 5 | String <- _? (Head Tail? / TailPhrase .*) SpaceOrEnd 6 | 7 | Head <- Word (CommaSpace Word)* &(Tail / SpaceOrEnd) 8 | 9 | Tail <- { p.tailIndex = int(token.begin) } CommaSpace TailPhrase .* 10 | 11 | Word <- !TailPhrase [^, ]+ / ',' 12 | 13 | TailPhrase <- TailLastWordJunk / TailPhrase4 / TailPhrase3 / 14 | TailStopWords / TailPhrase2 / TailPhrase1 15 | 16 | TailLastWordJunk <- (("var" / "ined" / "ssp" / "subsp" / "subgen" ) '.'? / 17 | "sensu" / "new" / "non" / "nec" / "hybrid" / "von" / 'P.' _? 'P.' / 18 | "ms" / 'CF') '?'? &SpaceOrEnd 19 | 20 | TailPhrase4 <- ("pro" _ "parte" / "nomen") &NotLetterOrEnd / 'p.' _? 'p.' / 21 | "nom." / "comb." 22 | 23 | TailPhrase3 <- '('? 's' ('.' _? / _ ) ('s' '.'? &NotLetterOrEnd / 'l.' / 'str.' / 24 | 'lat.') 25 | 26 | TailStopWords <- ("environmental" / "enrichment" / "samples" / 27 | "species" / "group" / "complex" / "clade" / 28 | "author" / "nec" / "vide" / "species" / "fide" / "non" / "not" ) &NotLetterOrEnd 29 | 30 | TailPhrase2 <- ("sero" ("var" / "type") / "sensu" / "auct" / "sec" / "near" / 31 | "str") '.'? &NotLetterOrEnd 32 | 33 | TailPhrase1 <- (('('? ('ht' / 'hort')) / "S" 'pec' / 34 | 'nov' '.'? _ 'spec') '.'? &NotLetterOrEnd 35 | 36 | SpaceOrEnd <- CommaSpace? END 37 | 38 | CommaSpace <- (_? ',' _?)+ / _ 39 | 40 | _ <- MultipleSpace / SingleSpace 41 | 42 | NotLetterOrEnd <- NotLetter / END 43 | 44 | NotLetter <- [[^A-Z0-9_.\-]] 45 | 46 | MultipleSpace <- SingleSpace SingleSpace+ 47 | 48 | SingleSpace <- ' ' / OtherSpace 49 | 50 | OtherSpace <- [  \t\r\n\f\v] 51 | 52 | END <- !. 
53 | -------------------------------------------------------------------------------- /ent/internal/preparser/preparser.go: -------------------------------------------------------------------------------- 1 | package preparser 2 | 3 | import "log/slog" 4 | 5 | func New() *PreParser { 6 | res := &PreParser{} 7 | res.Init() 8 | return res 9 | } 10 | 11 | type PreString struct { 12 | tailIndex int 13 | } 14 | 15 | // ParseString returns index of the Tail 16 | func (ppr *PreParser) NewString(s string) { 17 | ppr.tailIndex = -1 18 | ppr.Buffer = s 19 | ppr.Reset() 20 | } 21 | 22 | func (ppr *PreParser) TailIndex(s string) int { 23 | ppr.NewString(s) 24 | if err := ppr.Parse(); err != nil { 25 | slog.Error("Preparsing failed", "error", err, "string", s) 26 | return -1 27 | } 28 | ppr.Execute() 29 | if ppr.tailIndex >= 0 { 30 | rs := []rune(s) 31 | head := rs[0:ppr.tailIndex] 32 | return len([]byte(string(head))) 33 | } 34 | return ppr.tailIndex 35 | } 36 | 37 | // Debug takes a string, parses it, and prints its AST. 
38 | func (ppr *PreParser) Debug(q string) error { 39 | ppr.NewString(q) 40 | err := ppr.Parse() 41 | if err != nil { 42 | return err 43 | } 44 | ppr.PrettyPrintSyntaxTree(q) 45 | return nil 46 | } 47 | -------------------------------------------------------------------------------- /ent/internal/preparser/preparser_test.go: -------------------------------------------------------------------------------- 1 | package preparser_test 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/gnames/gnparser/ent/internal/preparser" 7 | "github.com/stretchr/testify/assert" 8 | ) 9 | 10 | func TestDebug(t *testing.T) { 11 | debug := true 12 | q := "The annulignatha group" 13 | assert := assert.New(t) 14 | ppr := preparser.New() 15 | if debug { 16 | err := ppr.Debug(q) 17 | assert.Nil(err) 18 | } 19 | } 20 | 21 | func TestPreParsed(t *testing.T) { 22 | tests := []struct { 23 | msg, str, tail string 24 | }{ 25 | // Last 'junk' words/ annotations 26 | {"var", "Musca domeſtica Linnaeus 1758 var? ", " var? "}, 27 | {"ined", " Oxalis_barrelieri ined.?", " ined.?"}, 28 | {"ssp.", "Peperomia non-alata Trel. ssp.", " ssp."}, 29 | {"subsp.", "Sanogasta x-signata (Keyserling,1891) subsp.", 30 | " subsp."}, 31 | {"subgen", "Sanogasta x-signata (Keyserling,1891) subgen? ", 32 | " subgen? "}, 33 | {"sensu", "Pseudomonas methanica (Söhngen 1906) sensu. Dworkin and Foster 1956", 34 | " sensu. Dworkin and Foster 1956"}, 35 | {"new", "Hegeter (Hegeter) intercedens Lindberg H 1950 new", " new"}, 36 | {"non", "Anthoscopus Cabanis [1851?] non", " non"}, 37 | {"nec", "Hegeter (Hegeter) intercedens Lindberg H 1950 nec", " nec"}, 38 | {"hybrid", " Arthopyrenia hyalospora x hybrid?", " hybrid?"}, 39 | {"von$", "Nautilus asterizans von", " von"}, 40 | 41 | // Pro Parte 42 | {"Pro Parte", "Abarema clypearia (Jack) Kosterm., Pro Parte", 43 | ", Pro Parte"}, 44 | {"nomen", "Akeratidae Nomen Nudum", " Nomen Nudum"}, 45 | {"nom.", "Akeratidae nom. nudum", " nom. 
nudum"}, 46 | {"nom illeg", "Abutilon avicennae Gaertn., nom. illeg.", ", nom. illeg."}, 47 | {"comb", "Arthopyrenia hyalospora (Nyl.) R.C. Harris comb. nov.", 48 | " comb. nov."}, 49 | {"p. p.", "Abarema clypearia (Jack) Kosterm., p. p.", ", p. p."}, 50 | {"P. P.", "Abarema clypearia (Jack) Kosterm., P. P.", ", P. P."}, 51 | 52 | // s.s. 53 | {", s. s.", "Bubo bubo, s. s. nov spec something", 54 | ", s. s. nov spec something"}, 55 | {"s.s.", "Bubo bubo s.s. nov spec something", 56 | " s.s. nov spec something"}, 57 | {"s.l.", "Bubo bubo s.l. something", 58 | " s.l. something"}, 59 | {"s. lat.", "Bubo bubo s. lat. something", 60 | " s. lat. something"}, 61 | {"s. str.", "Bubo bubo s. str. something", 62 | " s. str. something"}, 63 | {"no break space", " Canadensis Erxleben, 1777 s.str.", " s.str."}, 64 | 65 | // Stop words 66 | {"env", "Ge Nicéville 1895 Environmental sample", 67 | " Environmental sample"}, 68 | {"env samples", "Candidatus Anammoxoglobus environmental samples", 69 | " environmental samples"}, 70 | {"enrichment", "Crenarchaeote enrichment culture clone OREC-B1022", 71 | " enrichment culture clone OREC-B1022"}, 72 | {"samples", "Candidatus Anammoxoglobus samples", 73 | " samples"}, 74 | 75 | {"sec", "Ataladoris Iredale & O'Donoghue 1923 sec Eschmeyer", 76 | " sec Eschmeyer"}, 77 | {"sec.", "Ataladoris Iredale & O'Donoghue 1923 sec. Eschmeyer", 78 | " sec. Eschmeyer"}, 79 | {"sp compl", "Acarospora cratericola cratericola Shenk 1974 species complex", 80 | " species complex"}, 81 | {"utf8", "× Dialaeliopsis hort.", " hort."}, 82 | } 83 | 84 | assert := assert.New(t) 85 | ppr := preparser.New() 86 | 87 | for _, v := range tests { 88 | idx := ppr.TailIndex(v.str) 89 | assert.True(idx >= 0, v.msg) 90 | assert.Equal(v.tail, string([]byte(v.str)[idx:]), v.msg) 91 | } 92 | } 93 | 94 | func TestNotPreParsed(t *testing.T) { 95 | tests := []struct { 96 | msg, str string 97 | }{ 98 | {"no tail1", "Lachenalia tricolor var. nelsonii (anon.) Baker"}, 99 | {"S. 
S.", "Bubo bubo, S. S. something"}, 100 | {"dagger", "Heteralocha acutirostris (Gould, 1837) Huia N E†"}, 101 | {"spaces", "Heteralocha acutirostris (Gould, 1837) Huia N E "}, 102 | {"comma", "Abantiadinus pusillus Broun, T. , 1914"}, 103 | {"last comma", "Acalles foveopunctatus Fiedler,"}, 104 | {"space comma", "Calamagrostis neglecta G.Gaertn. ,B.Mey. & Scherb."}, 105 | {"all tail", "Non splenectomized mulatta"}, 106 | {"several commas", "Naupliicola cystifingens Michajlow, ,1968"}, 107 | {"spp", "Crataegus curvisepala nvar. naviculiformis T. Petauer Alaria spp."}, 108 | } 109 | 110 | assert := assert.New(t) 111 | ppr := preparser.New() 112 | 113 | for _, v := range tests { 114 | idx := ppr.TailIndex(v.str) 115 | assert.Equal(-1, idx) 116 | } 117 | } 118 | -------------------------------------------------------------------------------- /ent/internal/preprocess/annot.rl: -------------------------------------------------------------------------------- 1 | package preprocess 2 | 3 | import ( 4 | ) 5 | 6 | func AnnotationRL(data []byte) bool { 7 | %%{ 8 | machine annot; 9 | write data; 10 | }%% 11 | 12 | cs, p, pe, eof := 0, 0, len(data), len(data) 13 | _ = eof 14 | _ = annot_en_main 15 | _ = annot_error 16 | _ = annot_first_final 17 | 18 | var match bool 19 | 20 | %%{ 21 | action setMatch {match = true} 22 | action setPos {pos = append(pos,p)} 23 | 24 | notes = ("species"i | "group"i | "clade"i | "authors"i | "non" | "nec" | 25 | "fide" | "vide" ); 26 | tc1 = ("sensu"i | "auct"i | "sec"i | "near" | "str") "."?; 27 | tc2 = "("? "s." space? ([sl] | "str" | "lat") "."; 28 | tc3 = "pro parte"i | "p." space? "p."; 29 | tc4 = "("? ("nomen"i | "nom."i | "comb."); 30 | 31 | main := any* ((space+ | "," space?) 
32 | (notes | tc1 |tc2 | tc3 | tc4)) %/setMatch 33 | ((space | punct) >setMatch); 34 | 35 | write init; 36 | write exec; 37 | }%% 38 | 39 | return match 40 | } 41 | -------------------------------------------------------------------------------- /ent/internal/preprocess/cleanup.go: -------------------------------------------------------------------------------- 1 | package preprocess 2 | 3 | import ( 4 | "bytes" 5 | "io" 6 | "sync" 7 | 8 | "golang.org/x/net/html" 9 | ) 10 | 11 | var tags = map[string]struct{}{ 12 | "i": {}, 13 | "small": {}, 14 | "br": {}, 15 | "em": {}, 16 | "b": {}, 17 | } 18 | 19 | // CleanupResult keeps results of removal of some HTML tags. 20 | type CleanupResult struct { 21 | // Input is the original name. 22 | Input string 23 | // Output is the name after the tag removal. 24 | Output string 25 | } 26 | 27 | // CleanupStream takes input and output string channels, and feeds output with 28 | // pipe delimited strings with original name on the left and cleaned up name 29 | // on the right from the pipe. 30 | func CleanupStream(in <-chan string, out chan<- *CleanupResult, wn int) { 31 | var wg sync.WaitGroup 32 | wg.Add(wn) 33 | for i := 0; i < wn; i++ { 34 | go cleanupWorker(in, out, &wg) 35 | } 36 | wg.Wait() 37 | close(out) 38 | } 39 | 40 | func cleanupWorker(in <-chan string, out chan<- *CleanupResult, 41 | wg *sync.WaitGroup) { 42 | defer wg.Done() 43 | for s := range in { 44 | res := StripTags(s) 45 | out <- &CleanupResult{Input: s, Output: res} 46 | } 47 | } 48 | 49 | // StripTags takes a slice of bytes and returns a string with common 50 | // tags removed and html entities escaped. It does keep all uncommon tags 51 | // intact to let parser deal with them. 
52 | func StripTags(s string) string { 53 | var buff bytes.Buffer 54 | r := bytes.NewReader([]byte(s)) 55 | 56 | tokenizer := html.NewTokenizer(r) 57 | for { 58 | if tokenizer.Next() == html.ErrorToken { 59 | err := tokenizer.Err() 60 | if err == io.EOF { 61 | return html.UnescapeString(buff.String()) 62 | } 63 | return "" 64 | } 65 | tokenVal := string(tokenizer.Raw()) 66 | 67 | token := tokenizer.Token() 68 | switch token.Type { 69 | case html.DoctypeToken: 70 | case html.CommentToken: 71 | case html.StartTagToken: 72 | if _, ok := tags[token.Data]; ok { 73 | break 74 | } 75 | buff.WriteString(tokenVal) 76 | 77 | case html.EndTagToken: 78 | if _, ok := tags[token.Data]; ok { 79 | break 80 | } 81 | buff.WriteString(tokenVal) 82 | 83 | case html.TextToken: 84 | buff.WriteString(tokenVal) 85 | 86 | default: 87 | return "" 88 | } 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /ent/internal/preprocess/noparse.rl: -------------------------------------------------------------------------------- 1 | package preprocess 2 | 3 | func NoParse(data []byte) bool { 4 | 5 | %%{ 6 | machine noparse; 7 | write data; 8 | }%% 9 | 10 | cs, p, pe, eof := 0, 0, len(data), len(data) 11 | _ = eof 12 | _ = noparse_first_final 13 | _ = noparse_error 14 | _ = noparse_en_main 15 | 16 | var match bool 17 | 18 | 19 | %%{ 20 | action setMatch {match = true} 21 | 22 | noparse1 = ("Not" | "None" | "Un" ("n"? "amed" | "identified")); 23 | noparse2 = any* [Ii] "nc" ("." | "ertae") space* [Ss] "ed" ("." | "is"); 24 | noparse3 = any* (("endo" | "ecto")? "symbiont" | "phytoplasma" | space "cyano"? "bacterium"| "plasmid" "s"? 
| [^A-Z] "RNA" [^A-Z]*); 25 | 26 | 27 | main := (noparse1 | noparse2 | noparse3) %/setMatch 28 | ((space | punct) >setMatch); 29 | 30 | write init; 31 | write exec; 32 | 33 | }%% 34 | 35 | return match 36 | } 37 | -------------------------------------------------------------------------------- /ent/internal/preprocess/preprocess.go: -------------------------------------------------------------------------------- 1 | // Package preprocess performs preparsing filtering and modification of a 2 | // scientific-name. 3 | package preprocess 4 | 5 | import ( 6 | "bytes" 7 | "io" 8 | "regexp" 9 | "strings" 10 | "unicode" 11 | 12 | "github.com/gnames/gnparser/ent/internal/preparser" 13 | "golang.org/x/text/unicode/norm" 14 | ) 15 | 16 | var VirusException = map[string]string{ 17 | "Aspilota": "vector", 18 | "Bembidion": "satellites", 19 | "Bolivina": "prion", 20 | "Ceylonesmus": "vector", 21 | "Cryptops": "vector", 22 | "Culex": "vector", 23 | "Dasyproctus": "cevirus", 24 | "Desmoxytes": "vector", 25 | "Dicathais": "vector", 26 | "Erateina": "satellites", 27 | "Euragallia": "prion", 28 | "Exochus": "virus", 29 | "Hilara": "vector", 30 | "Ithomeis": "satellites", 31 | "Microgoneplax": "prion", 32 | "Neoaemula": "vector", 33 | "Nephodia": "satellites", 34 | "Ophion": "virus", 35 | "Phalium": "vector", 36 | "Psenulus": "trevirus", 37 | "Tidabius": "vector", 38 | "Turkozelotes": "attavirus", 39 | } 40 | 41 | var AmbiguousException = map[string][]string{ 42 | "Aeolesthes": {"mihi"}, 43 | "Agnetina": {"den"}, 44 | "Agra": {"not"}, 45 | "Aleuroclava": {"complex"}, 46 | "Allawrencius": {"complex"}, 47 | "Anisochaeta": {"mihi"}, 48 | "Antaplaga": {"dela"}, 49 | "Baeolidia": {"dela"}, 50 | "Bolbodeomyia": {"complex"}, 51 | "Bolitoglossa": {"la"}, 52 | "Campylosphaera": {"dela"}, 53 | "Castelnaudia": {"spec"}, 54 | "Cicada": {"complex"}, 55 | "Concinnum": {"ten"}, 56 | "Desmoxytes": {"des"}, 57 | "Dicentria": {"dela"}, 58 | "Dichostasia": {"complex"}, 59 | "Dimorphoceras": 
{"complex"}, 60 | "Dischidia": {"complex"}, 61 | "Ecnomus": {"complex"}, 62 | "Eresus": {"da"}, 63 | "Eucyclops": {"mihi"}, 64 | "Eulaira": {"dela"}, 65 | "Fusinus": {"complex"}, 66 | "Gnathopleustes": {"den"}, 67 | "Gobiosoma": {"spec"}, 68 | "Gonatobotrys": {"complex"}, 69 | "Heizmannia": {"complex"}, 70 | "Helophorus": {"ser"}, 71 | "Hemicloeina": {"spec"}, 72 | "Lampona": {"spec"}, 73 | "Leptonetela": {"la"}, 74 | "Libystica": {"complex"}, 75 | "Malamatidia": {"zu"}, 76 | "Meteorus": {"dos"}, 77 | "Nocaracris": {"van"}, 78 | "Notozomus": {"spec"}, 79 | "Ochodaeus": {"complex"}, 80 | "Odontella": {"do"}, 81 | "Oecetis": {"complex"}, 82 | "Oedipina": {"complex"}, 83 | "Oedipus": {"complex"}, 84 | "Oedopinola": {"complex"}, 85 | "Orcevia": {"zu"}, 86 | "Paradimorphoceras": {"complex"}, 87 | "Paralvinella": {"dela"}, 88 | "Parentia": {"do"}, 89 | "Phyllospongia": {"complex"}, 90 | "Plagiozopelma": {"du"}, 91 | "Plectrocnemia": {"complex"}, 92 | "Rubus": {"complex"}, 93 | "Ruteloryctes": {"bis"}, 94 | "Sceliphron": {"complex"}, 95 | "Scopaeus": {"complex"}, 96 | "Scoparia": {"dela"}, 97 | "Selenops": {"ab"}, 98 | "Semiothisa": {"da"}, 99 | "Serina": {"ser", "subser"}, 100 | "Schizura": {"dela"}, 101 | "Sigipinius": {"complex"}, 102 | "Stegosoladidus": {"complex"}, 103 | "Stenoecia": {"dos"}, 104 | "Sympycnus": {"du"}, 105 | "Tetracis": {"complex"}, 106 | "Tetramorium": {"do"}, 107 | "Tortolena": {"dela"}, 108 | "Trichosternus": {"spec"}, 109 | "Trisephena": {"complex"}, 110 | "Zodarion": {"van"}, 111 | } 112 | 113 | var NoParseException = map[string]string{ 114 | "Navicula": "bacterium", 115 | "Spirophora": "bacterium", 116 | } 117 | 118 | var cultivarRankRe = regexp.MustCompile( 119 | `\s+(cultivar\.?[\W_]|cv\.?[\W_]|['"‘’“”]).*$`, 120 | ) 121 | 122 | var ofWordRe = regexp.MustCompile( 123 | `\s+(of[\W_]).*$`, 124 | ) 125 | 126 | var dagger = []byte("†") 127 | 128 | // Preprocessor structure keeps state of the preprocessor results. 
129 | type Preprocessor struct { 130 | Virus bool 131 | Underscore bool 132 | NoParse bool 133 | DaggerChar bool 134 | Approximate bool 135 | Annotation bool 136 | Body []byte 137 | Tail []byte 138 | Ambiguous ambiguous 139 | } 140 | 141 | type ambiguous struct { 142 | Orig string 143 | Subst string 144 | } 145 | 146 | var normalizer = norm.NFC 147 | 148 | // Preprocess runs a series of regular expressions over the input to determine 149 | // features of the input before parsing. 150 | func Preprocess(ppr *preparser.PreParser, bs []byte) *Preprocessor { 151 | bs = normalizer.Bytes(bs) 152 | 153 | pr := &Preprocessor{} 154 | 155 | // check for empty string 156 | if len(bs) == 0 || strings.TrimSpace(string(bs)) == "" { 157 | pr.NoParse = true 158 | return pr 159 | } 160 | i := len(bs) 161 | words := strings.Fields(string(bs)) 162 | 163 | // check for viruses, plasmids, RNA, DNA etc. 164 | if !isException(words, VirusException) { 165 | pr.Virus = IsVirus(bs[0:i]) 166 | } 167 | if pr.Virus { 168 | pr.NoParse = true 169 | return pr 170 | } 171 | 172 | // check for unparseable names 173 | pr.NoParse = NoParse(bs[0:i]) 174 | if isException(words, NoParseException) { 175 | pr.NoParse = false 176 | } 177 | if pr.NoParse { 178 | return pr 179 | } 180 | 181 | pr.DaggerChar = hasDagger(bs[0:i]) 182 | 183 | if len(words) > 1 { 184 | pr.ambiguous(words[0], bs) 185 | } 186 | 187 | j := procAnnot(ppr, bs[0:i]) 188 | if j < i { 189 | pr.Annotation = true 190 | i = j 191 | } 192 | 193 | // ignoring error, as it should never happen 194 | changed, _ := UnderscoreToSpace(bs[0:i]) 195 | if changed { 196 | pr.Underscore = true 197 | } 198 | 199 | pr.Body = bs[0:i] 200 | pr.Tail = bs[i:] 201 | return pr 202 | } 203 | 204 | func hasDagger(bs []byte) bool { 205 | idx := bytes.Index(bs, dagger) 206 | if idx == -1 { 207 | return false 208 | } 209 | 210 | sp := byte(' ') 211 | bs[idx], bs[idx+1], bs[idx+2] = sp, sp, sp 212 | return true 213 | } 214 | 215 | func isException(words []string, 
names map[string]string) bool { 216 | if len(words) < 2 { 217 | return false 218 | } 219 | if epithet, ok := names[words[0]]; ok { 220 | for _, w := range words[1:] { 221 | if w == epithet { 222 | return true 223 | } 224 | } 225 | } 226 | return false 227 | } 228 | 229 | func (p *Preprocessor) ambiguous(firstWord string, bs []byte) { 230 | if epithets, ok := AmbiguousException[firstWord]; ok { 231 | var sub byte = 'k' 232 | for _, epithet := range epithets { 233 | idx := bytes.Index(bs, []byte(" "+epithet)) 234 | if idx == -1 { 235 | continue 236 | } 237 | p.Ambiguous.Orig = epithet 238 | p.Ambiguous.Subst = string(sub) + epithet[1:] 239 | bs[idx+1] = sub 240 | } 241 | } 242 | } 243 | 244 | // procAnnot returns index where unparsed part starts. In case if 245 | // the full string can be parsed, returns returns the index of the end of the 246 | // input. 247 | func procAnnot(ppr *preparser.PreParser, bs []byte) int { 248 | i := len(bs) 249 | if idx := ppr.TailIndex(string(bs)); idx >= 0 { 250 | i = idx 251 | } 252 | 253 | // If ` of ` is in the string, before the start of the already-calculated 254 | // unparsed part, but there is no cultivar rank marker before it, consider it 255 | // unparseable. `Anthurium 'Ace of Spades'` should parse fully; 256 | // `Anthurium Trustees of the British Museum` should not. 257 | cultivarRankLoc := cultivarRankRe.FindIndex(bs[0:i]) 258 | ofLoc := ofWordRe.FindIndex(bs[0:i]) 259 | if len(ofLoc) > 0 && ofLoc[0] < i && 260 | (len(cultivarRankLoc) == 0 || cultivarRankLoc[0] > ofLoc[0]) { 261 | i = ofLoc[0] 262 | } 263 | 264 | return i 265 | } 266 | 267 | // UnderscoreToSpace takes a slice of bytes. If it finds that the string 268 | // contains underscores, but not spaces, it substitutes underscores to spaces 269 | // in the slice. In case if any spaces are present, the slice is returned 270 | // unmodified. 
func UnderscoreToSpace(bs []byte) (bool, error) {
	// First pass: decode rune by rune. Any whitespace rune means the input
	// already has word separators, so nothing is changed.
	found := false
	rd := bytes.NewReader(bs)
scan:
	for {
		r, _, err := rd.ReadRune()
		switch {
		case err == io.EOF:
			break scan
		case err != nil:
			return false, err
		case unicode.IsSpace(r):
			return false, nil
		case r == '_':
			found = true
		}
	}
	if !found {
		return false, nil
	}

	// Second pass: rewrite the underscores in place.
	for i := range bs {
		if bs[i] == '_' {
			bs[i] = ' '
		}
	}
	return true, nil
}
"github.com/labstack/echo/v4" 13 | ) 14 | 15 | //go:embed templates 16 | var tmpls embed.FS 17 | 18 | // echoTempl implements echo.Renderer interface. 19 | type echoTempl struct { 20 | templates *template.Template 21 | } 22 | 23 | // Render implements echo.Renderer interface. 24 | func (t *echoTempl) Render( 25 | w io.Writer, 26 | name string, 27 | data interface{}, 28 | c echo.Context, 29 | ) error { 30 | return t.templates.ExecuteTemplate(w, name, data) 31 | } 32 | 33 | func NewTemplate() (*echoTempl, error) { 34 | t, err := parseFiles() 35 | if err != nil { 36 | return nil, fmt.Errorf("cannot parse file %w", err) 37 | } 38 | return &echoTempl{t}, nil 39 | } 40 | 41 | func parseFiles() (*template.Template, error) { 42 | var err error 43 | var t *template.Template 44 | 45 | var filenames []string 46 | dir := "templates" 47 | entries, _ := tmpls.ReadDir(dir) 48 | for i := range entries { 49 | if entries[i].Type().IsRegular() { 50 | filenames = append( 51 | filenames, 52 | fmt.Sprintf("%s/%s", dir, entries[i].Name()), 53 | ) 54 | } 55 | } 56 | 57 | for _, filename := range filenames { 58 | name := path.Base(filename) 59 | var tmpl *template.Template 60 | if t == nil { 61 | t = template.New(name) 62 | } 63 | if name == t.Name() { 64 | tmpl = t 65 | } else { 66 | tmpl = t.New(name) 67 | } 68 | addFuncs(tmpl) 69 | _, err = tmpl.ParseFS(tmpls, filename) 70 | if err != nil { 71 | return nil, err 72 | } 73 | } 74 | return t, nil 75 | } 76 | 77 | func addFuncs(tmpl *template.Template) { 78 | tmpl.Funcs(template.FuncMap{ 79 | "parsedJSON": func(p parsed.Parsed) string { 80 | return p.Output(gnfmt.PrettyJSON) 81 | }, 82 | }) 83 | } 84 | -------------------------------------------------------------------------------- /io/web/templates/doc_api.html: -------------------------------------------------------------------------------- 1 | {{ define "doc" }} 2 |
3 |
4 |
5 |

Application Programming Interface (API)

6 | 7 |

Web-based parser service includes a RESTful interface to parsing 8 | functionality. Both GET and POST methods are supported.

9 | 10 |

GET

11 | 12 |

13 | Append a list of names separated by vertical lines ('|') to your domain URL. 14 | Make sure that every '&' in the names is escaped as '%26', 15 | and spaces are escaped as '+'. 16 |

17 | 18 |

19 | /api/v1/Aus+bus|Aus+bus+D.+%26+M.,+1870 20 |

21 | 22 |

POST

23 | 24 |

/api/v1

25 | 26 |

27 | with request body of JSON array of strings 28 |

29 | 30 |

OpenAPI Schema

31 |

32 | Read the GNparser's 33 | 34 | OpenAPI documentation 35 | to learn about all options and the output schema. 36 |

37 |
38 |
39 |
40 | {{ end }} 41 | -------------------------------------------------------------------------------- /io/web/templates/home.html: -------------------------------------------------------------------------------- 1 | {{ define "home" }} 2 |
3 |
4 |
5 |
6 |
7 | 8 | 14 | 15 | 21 | 22 | 29 | 30 | 31 |
32 | 40 | 41 |
42 |
43 |
44 |
45 | {{ if .Parsed }} {{if eq .Format "html" }} 46 |
47 |
48 |
49 |

Results:

50 | {{ range .Parsed }} 51 |

52 | {{ parsedJSON . }} 55 |

56 |

57 | {{ end }} 58 |
59 |
60 |
61 | {{ end }} {{ end }} {{ end }} 62 | -------------------------------------------------------------------------------- /io/web/templates/layout.html: -------------------------------------------------------------------------------- 1 | {{ define "layout" }} 2 | 3 | 4 | 5 | 6 | GNparser 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 |
15 | 39 |
40 |
41 | 47 |
48 | 72 |
73 |
74 |
75 |
76 |
77 |

Global Names Parser

78 |

Scientific Names in Detail

79 |
80 |
81 |
82 | 83 | {{ if .HomePage }} {{ template "home" . }} {{ else }} {{ template "doc" .}} 84 | {{ end }} 85 | 86 | 123 | 124 | 125 | {{ end }} 126 | -------------------------------------------------------------------------------- /io/web/web.go: -------------------------------------------------------------------------------- 1 | // Package web provides RESTful API service and a website for gnparser. 2 | package web 3 | 4 | import ( 5 | "fmt" 6 | "log/slog" 7 | "net/http" 8 | "net/url" 9 | "strings" 10 | 11 | "github.com/gnames/gnfmt" 12 | "github.com/gnames/gnlib/ent/nomcode" 13 | "github.com/gnames/gnparser" 14 | "github.com/gnames/gnparser/ent/parsed" 15 | "github.com/labstack/echo/v4" 16 | ) 17 | 18 | // inputFORM is used to collect data from HTML form. 19 | type inputFORM struct { 20 | Names string `query:"names" form:"names"` 21 | Code string `query:"code" form:"code"` 22 | Format string `query:"format" form:"format"` 23 | WithDetails string `query:"with_details" form:"with_details"` 24 | 25 | // WithCultivars is deprecated and overriden by Code 26 | WithCultivars string `query:"cultivars" form:"cultivars"` 27 | PreserveDiaereses string `query:"diaereses" form:"diaereses"` 28 | } 29 | 30 | // Data contains information required to render web-pages. 31 | type Data struct { 32 | Input string 33 | Parsed []parsed.Parsed 34 | Code string 35 | Format string 36 | HomePage bool 37 | Version string 38 | WithDetails bool 39 | // WithCultivars is deprecated by Code field 40 | WithCultivars bool 41 | PreserveDiaereses bool 42 | } 43 | 44 | // NewData creates new Data for web-page templates. 
45 | func newData(isHome bool) *Data { 46 | return &Data{HomePage: isHome, Format: "html", Version: gnparser.Version} 47 | } 48 | 49 | func homePOST(gnps GNparserService) func(echo.Context) error { 50 | return func(c echo.Context) error { 51 | inp := new(inputFORM) 52 | data := newData(true) 53 | 54 | err := c.Bind(inp) 55 | if err != nil { 56 | return err 57 | } 58 | 59 | if strings.TrimSpace(inp.Names) == "" { 60 | return c.Redirect(http.StatusFound, "") 61 | } 62 | 63 | if strings.Count(inp.Names, "\n") < 20 { 64 | return redirectToHomeGET(c, inp) 65 | } 66 | 67 | return parsingResults(c, gnps, inp, data) 68 | } 69 | } 70 | 71 | func redirectToHomeGET(c echo.Context, inp *inputFORM) error { 72 | withDetails := inp.WithDetails == "on" 73 | withCultivars := inp.WithCultivars == "on" 74 | preserveDiaereses := inp.PreserveDiaereses == "on" 75 | q := make(url.Values) 76 | q.Set("names", inp.Names) 77 | q.Set("format", inp.Format) 78 | if withDetails { 79 | q.Set("with_details", inp.WithDetails) 80 | } 81 | if withCultivars { 82 | q.Set("cultivars", inp.WithCultivars) 83 | } 84 | if preserveDiaereses { 85 | q.Set("diaereses", inp.PreserveDiaereses) 86 | } 87 | q.Set("code", inp.Code) 88 | 89 | url := fmt.Sprintf("/?%s", q.Encode()) 90 | return c.Redirect(http.StatusFound, url) 91 | } 92 | 93 | func homeGET(gnps GNparserService) func(echo.Context) error { 94 | return func(c echo.Context) error { 95 | data := newData(true) 96 | 97 | inp := new(inputFORM) 98 | err := c.Bind(inp) 99 | if err != nil { 100 | return err 101 | } 102 | 103 | if strings.TrimSpace(inp.Names) == "" { 104 | return c.Render(http.StatusOK, "layout", data) 105 | } 106 | 107 | return parsingResults(c, gnps, inp, data) 108 | } 109 | } 110 | 111 | func parsingResults( 112 | c echo.Context, 113 | gnps GNparserService, 114 | inp *inputFORM, 115 | data *Data, 116 | ) error { 117 | var names []string 118 | data.WithDetails = inp.WithDetails == "on" 119 | data.WithCultivars = inp.WithCultivars == "on" 120 | 
data.PreserveDiaereses = inp.PreserveDiaereses == "on" 121 | data.Code = inp.Code 122 | 123 | format := inp.Format 124 | if format == "csv" || format == "tsv" || format == "json" { 125 | data.Format = format 126 | } 127 | 128 | data.Input = strings.TrimSpace(inp.Names) 129 | split := strings.Split(data.Input, "\n") 130 | if len(split) > 5_000 { 131 | split = split[0:5_000] 132 | } 133 | 134 | names = make([]string, len(split)) 135 | for i := range split { 136 | names[i] = strings.TrimSpace(split[i]) 137 | } 138 | if l := len(names); l > 0 { 139 | slog.Info("Parsed", 140 | "namesNum", l, 141 | "example", names[0], 142 | "parsedBy", "WEB GUI", 143 | ) 144 | } 145 | data.Input = strings.Join(names, "\n") 146 | 147 | opts := []gnparser.Option{ 148 | gnparser.OptWithDetails(data.WithDetails), 149 | gnparser.OptWithPreserveDiaereses(data.PreserveDiaereses), 150 | } 151 | 152 | if data.WithCultivars { 153 | opts = append(opts, gnparser.OptCode(nomcode.Cultivars)) 154 | } 155 | 156 | code := nomcode.New(data.Code) 157 | if code != nomcode.Unknown { 158 | // overrides data.WithCultivars 159 | opts = append(opts, gnparser.OptCode(code)) 160 | } 161 | 162 | gnp := gnps.ChangeConfig(opts...) 
163 | data.Parsed = gnp.ParseNames(names) 164 | 165 | switch data.Format { 166 | case "json": 167 | return c.JSON(http.StatusOK, data.Parsed) 168 | case "csv", "tsv": 169 | f := gnfmt.CSV 170 | if data.Format == "tsv" { 171 | f = gnfmt.TSV 172 | } 173 | 174 | res := make([]string, len(data.Parsed)+1) 175 | res[0] = parsed.HeaderCSV(f) 176 | for i := range data.Parsed { 177 | res[i+1] = data.Parsed[i].Output(f) 178 | } 179 | return c.String(http.StatusOK, strings.Join(res, "\n")) 180 | default: 181 | return c.Render(http.StatusOK, "layout", data) 182 | } 183 | } 184 | 185 | func docAPI() func(echo.Context) error { 186 | return func(c echo.Context) error { 187 | data := newData(false) 188 | return c.Render(http.StatusOK, "layout", data) 189 | } 190 | } 191 | -------------------------------------------------------------------------------- /io/web/web_internal_test.go: -------------------------------------------------------------------------------- 1 | package web 2 | 3 | import ( 4 | "bytes" 5 | "net/http" 6 | "net/http/httptest" 7 | "net/url" 8 | "strings" 9 | "testing" 10 | 11 | "github.com/gnames/gnfmt" 12 | "github.com/gnames/gnlib/ent/gnvers" 13 | "github.com/gnames/gnparser" 14 | "github.com/gnames/gnparser/ent/parsed" 15 | "github.com/labstack/echo/v4" 16 | "github.com/stretchr/testify/assert" 17 | ) 18 | 19 | func handlerGET(path string) (echo.Context, *httptest.ResponseRecorder) { 20 | req := httptest.NewRequest(http.MethodGet, path, nil) 21 | rec := httptest.NewRecorder() 22 | e := echo.New() 23 | e.Renderer, _ = NewTemplate() 24 | c := e.NewContext(req, rec) 25 | return c, rec 26 | } 27 | 28 | func TestHome(t *testing.T) { 29 | var err error 30 | cfg := gnparser.NewConfig(gnparser.OptFormat(gnfmt.CompactJSON)) 31 | gnp := gnparser.New(cfg) 32 | gnps := NewGNparserService(gnp, 0) 33 | 34 | req := httptest.NewRequest(http.MethodGet, "/", nil) 35 | rec := httptest.NewRecorder() 36 | e := echo.New() 37 | c := e.NewContext(req, rec) 38 | e.Renderer, err = 
NewTemplate() 39 | assert.Nil(t, err) 40 | 41 | assert.Nil(t, homePOST(gnps)(c)) 42 | assert.Equal(t, http.StatusFound, rec.Code) 43 | } 44 | 45 | // func TestDocAPI(t *testing.T) { 46 | // c, rec := handlerGET("/doc/api") 47 | // assert.Nil(t, docAPI()(c)) 48 | // assert.Equal(t, http.StatusOK, rec.Code) 49 | // assert.Contains(t, rec.Body.String(), "Application Programming Interface") 50 | // } 51 | 52 | func TestInfo(t *testing.T) { 53 | c, rec := handlerGET("/") 54 | 55 | assert.Nil(t, info()(c)) 56 | assert.Equal(t, http.StatusOK, rec.Code) 57 | assert.Contains(t, rec.Body.String(), "OpenAPI") 58 | } 59 | 60 | func TestPing(t *testing.T) { 61 | cfg := gnparser.NewConfig(gnparser.OptFormat(gnfmt.CompactJSON)) 62 | gnp := gnparser.New(cfg) 63 | gnps := NewGNparserService(gnp, 0) 64 | c, rec := handlerGET("/ping") 65 | 66 | assert.Nil(t, ping(gnps)(c)) 67 | assert.Equal(t, http.StatusOK, rec.Code) 68 | assert.Equal(t, "pong", rec.Body.String()) 69 | } 70 | 71 | func TestVer(t *testing.T) { 72 | cfg := gnparser.NewConfig(gnparser.OptFormat(gnfmt.CompactJSON)) 73 | gnp := gnparser.New(cfg) 74 | gnps := NewGNparserService(gnp, 0) 75 | c, rec := handlerGET("/version") 76 | 77 | assert.Nil(t, ver(gnps)(c)) 78 | enc := gnfmt.GNjson{} 79 | var response gnvers.Version 80 | err := enc.Decode(rec.Body.Bytes(), &response) 81 | assert.Nil(t, err) 82 | assert.Regexp(t, `^v\d+\.\d+\.\d+`, response.Version) 83 | } 84 | 85 | func TestParseGET(t *testing.T) { 86 | cfg := gnparser.NewConfig(gnparser.OptFormat(gnfmt.CompactJSON)) 87 | gnp := gnparser.New(cfg) 88 | gnps := NewGNparserService(gnp, 0) 89 | 90 | var response []parsed.Parsed 91 | names := []string{ 92 | "Not name", "Bubo bubo", "Pomatomus", 93 | "Pardosa moesta", "Plantago major var major", 94 | "Cytospora ribis mitovirus 2", 95 | "A-shaped rods", "Alb. 
alba", 96 | "Pisonia grandis", "Acacia vestita may", 97 | } 98 | request := strings.Join( 99 | names, 100 | "|", 101 | ) 102 | namesQuery := url.QueryEscape(request) 103 | path := "/" + namesQuery 104 | 105 | c, rec := handlerGET(path) 106 | c.SetPath("/:names") 107 | c.SetParamNames("names") 108 | c.SetParamValues(namesQuery) 109 | 110 | assert.Nil(t, parseNamesGET(gnps)(c)) 111 | 112 | enc := gnfmt.GNjson{} 113 | err := enc.Decode(rec.Body.Bytes(), &response) 114 | assert.Nil(t, err) 115 | 116 | assert.Equal(t, len(names), len(response)) 117 | for i, v := range response { 118 | switch i { 119 | case 0: 120 | assert.Equal(t, "Not name", v.Verbatim, v.Verbatim) 121 | assert.False(t, v.Parsed, v.Verbatim) 122 | case 1: 123 | assert.Equal(t, "Bubo bubo", v.Verbatim, v.Verbatim) 124 | assert.Equal(t, "Bubo bubo", v.Canonical.Simple) 125 | } 126 | } 127 | } 128 | 129 | func TestParseParamsGET(t *testing.T) { 130 | cfg := gnparser.NewConfig(gnparser.OptFormat(gnfmt.CompactJSON)) 131 | gnp := gnparser.New(cfg) 132 | gnps := NewGNparserService(gnp, 0) 133 | 134 | name := url.QueryEscape("Bubo bubo") 135 | tests := []struct { 136 | csv, det, startsWith, pattern string 137 | contains bool 138 | }{ 139 | {"true", "false", "Id", "[", false}, 140 | {"true", "true", "Id", "[", false}, 141 | {"false", "false", "[", "details", false}, 142 | {"false", "true", "[", "details", true}, 143 | } 144 | 145 | for _, v := range tests { 146 | e := echo.New() 147 | q := make(url.Values) 148 | q.Set("csv", v.csv) 149 | q.Set("with_details", v.det) 150 | req := httptest.NewRequest(http.MethodGet, "/?"+q.Encode(), nil) 151 | rec := httptest.NewRecorder() 152 | c := e.NewContext(req, rec) 153 | c.SetPath("/:names") 154 | c.SetParamNames("names") 155 | c.SetParamValues(name) 156 | 157 | assert.Nil(t, parseNamesGET(gnps)(c)) 158 | 159 | body := rec.Body.String() 160 | assert.True(t, strings.HasPrefix(body, v.startsWith)) 161 | if v.contains { 162 | assert.True(t, strings.Contains(body, v.pattern)) 
163 | } else { 164 | assert.False(t, strings.HasPrefix(body, v.pattern)) 165 | } 166 | } 167 | } 168 | 169 | func TestParsePOST(t *testing.T) { 170 | cfg := gnparser.NewConfig(gnparser.OptFormat(gnfmt.CompactJSON)) 171 | gnp := gnparser.New(cfg) 172 | gnps := NewGNparserService(gnp, 0) 173 | 174 | var response []parsed.Parsed 175 | names := []string{ 176 | "Not name", "Bubo bubo", "Leptochloöpsis virgata", 177 | "Pomatomus", "Pardosa moesta", 178 | "Plantago major var major", 179 | "Cytospora ribis mitovirus 2", 180 | "A-shaped rods", "Alb. alba", 181 | "Pisonia grandis", "Acacia vestita may", 182 | "Sarracenia flava 'Maxima'", 183 | } 184 | params := inputREST{ 185 | Names: names, 186 | CSV: false, 187 | WithDetails: false, 188 | WithCultivars: true, 189 | PreserveDiaereses: true, 190 | } 191 | reqBody, err := gnfmt.GNjson{}.Encode(params) 192 | assert.Nil(t, err) 193 | r := bytes.NewReader(reqBody) 194 | req := httptest.NewRequest(http.MethodPost, "/api/v1", r) 195 | req.Header.Set(echo.HeaderContentType, echo.MIMEApplicationJSON) 196 | rec := httptest.NewRecorder() 197 | e := echo.New() 198 | c := e.NewContext(req, rec) 199 | 200 | assert.Nil(t, parseNamesPOST(gnps)(c)) 201 | 202 | enc := gnfmt.GNjson{} 203 | err = enc.Decode(rec.Body.Bytes(), &response) 204 | assert.Nil(t, err) 205 | 206 | assert.Equal(t, len(names), len(response)) 207 | for i, v := range response { 208 | switch i { 209 | case 0: 210 | assert.Equal(t, "Not name", v.Verbatim, v.Verbatim) 211 | assert.False(t, v.Parsed, v.Verbatim) 212 | case 1: 213 | assert.Equal(t, "Bubo bubo", v.Verbatim, v.Verbatim) 214 | assert.Equal(t, "Bubo bubo", v.Canonical.Simple) 215 | case 2: 216 | assert.Equal(t, "Leptochloöpsis virgata", v.Verbatim, v.Verbatim) 217 | assert.Equal(t, "Leptochloöpsis virgata", v.Canonical.Simple) 218 | case 11: 219 | assert.Equal(t, "Sarracenia flava ‘Maxima’", v.Normalized) 220 | assert.Equal(t, 3, v.Cardinality) 221 | } 222 | 223 | } 224 | 225 | params = inputREST{ 226 | Names: 
names, 227 | CSV: true, 228 | WithDetails: false, 229 | WithCultivars: false, 230 | PreserveDiaereses: false, 231 | } 232 | reqBody, err = gnfmt.GNjson{}.Encode(params) 233 | r = bytes.NewReader(reqBody) 234 | req = httptest.NewRequest(http.MethodPost, "/api/v1", r) 235 | req.Header.Set(echo.HeaderContentType, echo.MIMEApplicationJSON) 236 | rec = httptest.NewRecorder() 237 | c = e.NewContext(req, rec) 238 | assert.Nil(t, parseNamesPOST(gnps)(c)) 239 | assert.True(t, strings.HasPrefix(rec.Body.String(), "Id")) 240 | } 241 | -------------------------------------------------------------------------------- /man/gnparser.1: -------------------------------------------------------------------------------- 1 | .\" generated with Ronn/v0.7.3 2 | .\" http://github.com/rtomayko/ronn/tree/0.7.3 3 | . 4 | .TH "GNPARSER" "1" "November 2021" "" "" 5 | . 6 | .SH "NAME" 7 | \fBgnparser\fR \- parse biodiversity scientific names 8 | . 9 | .SH "SYNOPSIS" 10 | \fBgnparser\fR [OPTION\.\.\.] [TERM/FILE] 11 | . 12 | .SH "DESCRIPTION" 13 | \fBGNparser\fR breaks biodiversity scientific names into their structural elements\. For example it finds that a genus in \fIHomo sapiens\fR is \fIHomo\fR\. 14 | . 15 | .P 16 | It can be used for one name, or for many names in a file (one name per line)\. 17 | . 18 | .SH "USAGE" 19 | . 20 | .SS "Usage for one name" 21 | . 22 | .nf 23 | 24 | gnparser "Pleurosigma vitrea var\. kjellmanii H\.Peragallo, 1891" 25 | 26 | # CSV output (default) 27 | gnparser "Parus major Linnaeus, 1788" 28 | # or 29 | gnparser \-f csv "Parus major Linnaeus, 1788" 30 | 31 | # TSV output (default) 32 | gnparser \-f tsv "Parus major Linnaeus, 1788" 33 | 34 | # JSON compact format 35 | gnparser "Parus major Linnaeus, 1788" \-f compact 36 | 37 | # pretty format 38 | gnparser \-f pretty "Parus major Linnaeus, 1788" 39 | 40 | # to parse a name from the standard input 41 | echo "Parus major Linnaeus, 1788" | gnparser 42 | . 43 | .fi 44 | . 
45 | .SS "Usage for many names in a file" 46 | There is no flag for parsing a file\. If parser finds the given file path on your computer, it will parse the content of the file, assuming that every line is a new scientific name\. If the file path is not found, gnparser will try to parse the "path" as a scientific name\. 47 | . 48 | .P 49 | Parsed results will stream to STDOUT, while progress of the parsing will be directed to STDERR\. 50 | . 51 | .IP "" 4 52 | . 53 | .nf 54 | 55 | # to parse with 200 parallel processes 56 | gnparser \-j 200 names\.txt > names_parsed\.csv 57 | 58 | # to parse file with more detailed output 59 | gnparser names\.txt \-d \-f compact > names_parsed\.txt 60 | 61 | # to parse files using pipes 62 | cat names\.txt | gnparser \-f csv \-j 200 > names_parsed\.csv 63 | 64 | # to parse using stream method instead of batch method\. 65 | cat names\.txt | gnparser \-s > names_parsed\.csv 66 | 67 | # to not remove html tags and entities during parsing\. You gain a bit of 68 | # performance with this option if your data does not contain HTML tags or 69 | # entities\. 70 | gnparser "Pomatomus saltator" 71 | gnparser \-i "Pomatomus saltator" 72 | gnparser \-i "Pomatomus saltator" 73 | . 74 | .fi 75 | . 76 | .IP "" 0 77 | . 78 | .SH "GNPARSER SETTINGS" 79 | . 80 | .SS "\-h, \-\-help" 81 | Prints help information: 82 | . 83 | .IP "" 4 84 | . 85 | .nf 86 | 87 | gnparser \-h 88 | . 89 | .fi 90 | . 91 | .IP "" 0 92 | . 93 | .SS "\-b, \-\-batch_size (values: positive integers, default 50,000)" 94 | Sets a maximum number of names collected into a batch before processing\. This flag is ignored, if parsing is applied to only one name or if parsing mode is set to streaming with \-s flag: 95 | . 96 | .P 97 | gnparser \-b 100 names\.txt 98 | . 99 | .SS "\-c, \-\-capitalize" 100 | Capitalizes the first letter of a name\-string before parsing: 101 | . 102 | .P 103 | gnparser "homo sapiens" \-c 104 | . 
105 | .SS "\-C, \-\-cultivar" 106 | Parses given name/s according to the Code of Cultivar Plants: 107 | . 108 | .P 109 | gnparser "Sarracenia flava \'Maxima\'" \-C gnparser "Cytisus purpureus + Laburnum anagyroides" \-C 110 | . 111 | .SS "\-D, \-\-diaereses" 112 | Preserves diaereses present in names: 113 | . 114 | .IP "" 4 115 | . 116 | .nf 117 | 118 | gnparser "Leptochloöpsis virgata" \-D 119 | . 120 | .fi 121 | . 122 | .IP "" 0 123 | . 124 | .P 125 | The stemmed canonical name will be generated without diaereses\. 126 | . 127 | .SS "\-d, \-\-details" 128 | Return more details for a parsed name\. This flag is ignored for CSV formatting: 129 | . 130 | .IP "" 4 131 | . 132 | .nf 133 | 134 | gnparser "Pardosa moesta Banks, 1982" \-d \-f pretty 135 | . 136 | .fi 137 | . 138 | .IP "" 0 139 | . 140 | .SS "\-f, \-\-format" 141 | Determines an output format\. Can be \fBcompact\fR, \fBpretty\fR, \fBcsv\fR\. Default is \fBcsv\fR\. 142 | . 143 | .P 144 | The default \fBcsv\fR format returns a header row and the CSV\-compatible parsed result: 145 | . 146 | .IP "" 4 147 | . 148 | .nf 149 | 150 | gnparser "Pardosa moesta" 151 | . 152 | .fi 153 | . 154 | .IP "" 0 155 | . 156 | .P 157 | The \fBtsv\fR format returns a header row and a tab\-delimited output: 158 | . 159 | .IP "" 4 160 | . 161 | .nf 162 | 163 | gnparser "Pardosa moesta" \-f tsv 164 | . 165 | .fi 166 | . 167 | .IP "" 0 168 | . 169 | .P 170 | The \fBcompact\fR format returns a JSON\-encoded result without indentations and new lines: 171 | . 172 | .IP "" 4 173 | . 174 | .nf 175 | 176 | gnparser "Pardosa moesta" \-f compact 177 | . 178 | .fi 179 | . 180 | .IP "" 0 181 | . 182 | .P 183 | The \fBpretty\fR format returns a JSON\-encoded result in a more human\-readable form: 184 | . 185 | .IP "" 4 186 | . 187 | .nf 188 | 189 | gnparser "Pardosa moesta" \-f pretty 190 | . 191 | .fi 192 | . 193 | .IP "" 0 194 | . 
195 | .SS "\-i, \-\-ignore_tags" 196 | By default \fBgnparser\fR scans names for HTML tags and removes them before parsing\. It slows the process slightly\. If there are no HTML tags in names (no names are like \fB<i>Aus bus</i> L\.\fR), this flag allows skipping the HTML removal step, increasing performance slightly: 197 | . 198 | .IP "" 4 199 | . 200 | .nf 201 | 202 | gnparser \-i plain\-text\-names\.txt 203 | . 204 | .fi 205 | . 206 | .IP "" 0 207 | . 208 | .SS "\-j, \-\-jobs (positive integer, default is a number of CPUs on a machine)" 209 | The number of jobs running concurrently\. This flag is ignored when parsing one name: 210 | . 211 | .IP "" 4 212 | . 213 | .nf 214 | 215 | gnparser \-j 200 names\.txt 216 | . 217 | .fi 218 | . 219 | .IP "" 0 220 | . 221 | .SS "\-p, \-\-port (port number)" 222 | Sets a port to run web\-interface and RESTful API and starts an HTTP service on this port: 223 | . 224 | .IP "" 4 225 | . 226 | .nf 227 | 228 | gnparser \-p 80 229 | . 230 | .fi 231 | . 232 | .IP "" 0 233 | . 234 | .SS "\-s, \-\-stream" 235 | Changes parsing method for large number of names from \fBbatch\fR to \fBstream\fR\. If this flag is set, gnparser can be used from any language application using pipe\-in/pipe\-out methods\. Such an approach requires sending 1 name at a time to gnparser instead of sending names in batches\. Streaming makes that possible, but there is a slight decrease in performance: 236 | . 237 | .IP "" 4 238 | . 239 | .nf 240 | 241 | gnparser \-s names\.json 242 | . 243 | .fi 244 | . 245 | .IP "" 0 246 | . 247 | .SS "\-u, \-\-unordered" 248 | If this flag is on, output and input order will not be synchronized\. If there is only one parsing job running (\fB\-j\fR flag), the input and output will be of the same order even if \fB\-u\fR flag is given\. 249 | . 250 | .IP "" 4 251 | . 252 | .nf 253 | 254 | gnparser \-u \-j 100 names\.txt 255 | . 256 | .fi 257 | . 258 | .IP "" 0 259 | . 260 | .SS "\-V, \-\-version" 261 | Shows the version number of gnparser\. 
262 | . 263 | .SH "COPYRIGHT" 264 | The MIT License (MIT) 265 | . 266 | .P 267 | Copyright (c) 2018\-2022 Dmitry Mozzherin 268 | . 269 | .SH "Contributors" 270 | Toby Marsden, Geoffrey Ower, Hernan Lucas Pereira 271 | -------------------------------------------------------------------------------- /man/gnparser.1.ronn: -------------------------------------------------------------------------------- 1 | # gnparser -- parse biodiversity scientific names 2 | 3 | ## SYNOPSIS 4 | 5 | **gnparser** [OPTION...] [TERM/FILE] 6 | 7 | ## DESCRIPTION 8 | 9 | **GNparser** breaks biodiversity scientific names into their structural 10 | elements. For example it finds that a genus in *Homo sapiens* is *Homo*. 11 | 12 | It can be used for one name, or for many names in a file (one name per line). 13 | 14 | ## USAGE 15 | 16 | ### Usage for one name 17 | 18 | gnparser "Pleurosigma vitrea var. kjellmanii H.Peragallo, 1891" 19 | 20 | # CSV output (default) 21 | gnparser "Parus major Linnaeus, 1788" 22 | # or 23 | gnparser -f csv "Parus major Linnaeus, 1788" 24 | 25 | # TSV output 26 | gnparser -f tsv "Parus major Linnaeus, 1788" 27 | 28 | # JSON compact format 29 | gnparser "Parus major Linnaeus, 1788" -f compact 30 | 31 | # pretty format 32 | gnparser -f pretty "Parus major Linnaeus, 1788" 33 | 34 | # to parse a name from the standard input 35 | echo "Parus major Linnaeus, 1788" | gnparser 36 | 37 | ### Usage for many names in a file 38 | 39 | There is no flag for parsing a file. If parser finds the given file path on 40 | your computer, it will parse the content of the file, assuming that every line 41 | is a new scientific name. If the file path is not found, gnparser will try to 42 | parse the "path" as a scientific name. 43 | 44 | Parsed results will stream to STDOUT, while progress of the parsing will be 45 | directed to STDERR. 
46 | 47 | # to parse with 200 parallel processes 48 | gnparser -j 200 names.txt > names_parsed.csv 49 | 50 | # to parse file with more detailed output 51 | gnparser names.txt -d -f compact > names_parsed.txt 52 | 53 | # to parse files using pipes 54 | cat names.txt | gnparser -f csv -j 200 > names_parsed.csv 55 | 56 | # to parse using stream method instead of batch method. 57 | cat names.txt | gnparser -s > names_parsed.csv 58 | 59 | # to not remove html tags and entities during parsing. You gain a bit of 60 | # performance with this option if your data does not contain HTML tags or 61 | # entities. 62 | gnparser "Pomatomus saltator" 63 | gnparser -i "Pomatomus saltator" 64 | gnparser -i "Pomatomus saltator" 65 | 66 | ## GNPARSER SETTINGS 67 | 68 | ### -h, --help 69 | 70 | Prints help information: 71 | 72 | gnparser -h 73 | 74 | ### -b, --batch_size (values: positive integers, default 50,000) 75 | 76 | Sets a maximum number of names collected into a batch before processing. 77 | This flag is ignored, if parsing is applied to only one name or 78 | if parsing mode is set to streaming with -s flag: 79 | 80 | gnparser -b 100 names.txt 81 | 82 | ### -c, --capitalize 83 | 84 | Capitalizes the first letter of a name-string before parsing: 85 | 86 | gnparser "homo sapiens" -c 87 | 88 | ### -C, --cultivar 89 | 90 | Parses given name/s according to the Code of Cultivar Plants: 91 | 92 | gnparser "Sarracenia flava 'Maxima'" -C 93 | gnparser "Cytisus purpureus + Laburnum anagyroides" -C 94 | 95 | ### -D, --diaereses 96 | 97 | Preserves diaereses present in names: 98 | 99 | gnparser "Leptochloöpsis virgata" -D 100 | 101 | The stemmed canonical name will be generated without diaereses. 102 | 103 | ### -d, --details 104 | 105 | Return more details for a parsed name. This flag is ignored for CSV formatting: 106 | 107 | gnparser "Pardosa moesta Banks, 1982" -d -f pretty 108 | 109 | ### -f, --format 110 | 111 | Determines an output format. Can be `compact`, `pretty`, `csv`. 
112 | Default is `csv`. 113 | 114 | The default `csv` format returns a header row and the CSV-compatible 115 | parsed result: 116 | 117 | gnparser "Pardosa moesta" 118 | 119 | The `tsv` format returns a header row and a tab-delimited output: 120 | 121 | gnparser "Pardosa moesta" -f tsv 122 | 123 | The `compact` format returns a JSON-encoded result without indentations and 124 | new lines: 125 | 126 | gnparser "Pardosa moesta" -f compact 127 | 128 | The `pretty` format returns a JSON-encoded result in a more human-readable 129 | form: 130 | 131 | gnparser "Pardosa moesta" -f pretty 132 | 133 | ### -i, --ignore_tags 134 | 135 | By default `gnparser` scans names for HTML tags and removes them before 136 | parsing. It slows the process slightly. If there are no HTML tags in names 137 | (no names are like `<i>Aus bus</i> L.`), this flag allows skipping the HTML removal 138 | step, increasing performance slightly: 139 | 140 | gnparser -i plain-text-names.txt 141 | 142 | ### -j, --jobs (positive integer, default is a number of CPUs on a machine) 143 | 144 | The number of jobs running concurrently. This flag is ignored when parsing 145 | one name: 146 | 147 | gnparser -j 200 names.txt 148 | 149 | ### -p, --port (port number) 150 | 151 | Sets a port to run web-interface and RESTful API and starts an HTTP service on 152 | this port: 153 | 154 | gnparser -p 80 155 | 156 | ### -s, --stream 157 | 158 | Changes parsing method for large number of names from `batch` to `stream`. 159 | If this flag is set, gnparser can be used from any language application 160 | using pipe-in/pipe-out methods. Such an approach requires sending 1 name 161 | at a time to gnparser instead of sending names in batches. Streaming makes 162 | that possible, but there is a slight decrease in performance: 163 | 164 | gnparser -s names.json 165 | 166 | ### -u, --unordered 167 | 168 | If this flag is on, output and input order will not be synchronized. 
If there 169 | is only one parsing job running (`-j` flag), the input and output will be of 170 | the same order even if `-u` flag is given. 171 | 172 | gnparser -u -j 100 names.txt 173 | 174 | ### -V, --version 175 | 176 | Shows the version number of gnparser. 177 | 178 | 179 | ## COPYRIGHT 180 | 181 | The MIT License (MIT) 182 | 183 | Copyright (c) 2018-2022 Dmitry Mozzherin 184 | 185 | ## Contributors 186 | 187 | Toby Marsden, Geoffrey Ower, Hernan Lucas Pereira 188 | -------------------------------------------------------------------------------- /nsqd.dat: -------------------------------------------------------------------------------- 1 | {"topics":[{"channels":[],"name":"test","paused":false}],"version":"1.2.1"} -------------------------------------------------------------------------------- /quality.md: -------------------------------------------------------------------------------- 1 | # Quality categories 2 | 3 | ## Quality 0 4 | 5 | Parsing failed. 6 | 7 | ## Quality 1 8 | 9 | Parsing finished without detecting any problems. 10 | 11 | ## Quality 2 12 | 13 | - Abbreviated subgenus 14 | - Ambiguity: subgenus or superspecies found 15 | - Ambiguous f. 
(filius or forma) 16 | - Apparent genus with capital character after hyphen 17 | - Author in upper case 18 | - Author is unknown 19 | - Bacterial `Candidatus` name 20 | - Combination of two uninomials 21 | - Cultivar epithet 22 | - Deprecated Greek letter enumeration in rank 23 | - Emend authors are not required 24 | - `ex` authors are not required 25 | - Hybrid formula 26 | - Misplaced basionym year 27 | - Multiple adjacent space characters 28 | - Named hybrid 29 | - Non-standard characters in canonical 30 | - Non-standard space characters 31 | - Ambiguity: ICN author or subgenus 32 | - Probably incomplete hybrid formula 33 | - Spanish 'y' is used instead of '&' 34 | - Trailing whitespace 35 | - Year with latin character 36 | - Year with page info 37 | - Year with parentheses 38 | - Year with period 39 | - Year with question mark 40 | 41 | ## Quality 3 42 | 43 | - Apostrophe is not allowed in canonical 44 | - Author is too short 45 | - HTML tags or entities in the name 46 | - Hybrid char is not separated by space 47 | - Not an ASCII apostrophe 48 | - Numeric prefix 49 | - Uncommon rank 50 | - Year with square brackets 51 | - Years range 52 | - `emend` without a period 53 | - `ex` ends with a period 54 | - `in` ends with a period 55 | 56 | ## Quality 4 57 | 58 | - Abbreviated uninomial word 59 | - Author as a question mark 60 | - Authorship in double parentheses 61 | - Authorship is missing one parenthesis 62 | - Incomplete hybrid formula 63 | - Incorrect conversion to UTF-8 64 | - Name comparison 65 | - Name is approximate 66 | - Name starts with low-case character 67 | - Uninomial word with question mark 68 | - Unparsed tail 69 | -------------------------------------------------------------------------------- /shell.nix: -------------------------------------------------------------------------------- 1 | { mkShell, go, gopls }: 2 | mkShell rec { 3 | buildInputs = [ go gopls ]; 4 | } 5 | 
-------------------------------------------------------------------------------- /testdata/exceptions.txt: -------------------------------------------------------------------------------- 1 | Aleuroclava complex Singh, 1931 2 | Allawrencius complex Lawrence, 1953 3 | Bolbodeomyia complex Theobald, 1910 4 | Castelnaudia spec (Darlington, 1962) 5 | Cicada complex Walker, 1850 6 | Dichostasia complex Yochelson, 1956 7 | Dimorphoceras complex (Moore, 1939) 8 | Dischidia complex Griff. 9 | Ecnomus complex Mosely, 1932 10 | Fusinus complex M. A. Snyder, 2000 11 | Fusinus pauciliratus complex M. A. Snyder, 2000 12 | Gonatobotrys complex Jane Walker & Minter 13 | Heizmannia (Heizmannia) complex (Theobald, 1910) 14 | Hemicloeina spec Platnick, 2002 15 | Libystica complex Holland, 1894 16 | Notozomus spec (Harvey, 1992) 17 | Ochodaeus complex LeConte, 1868 18 | Odontella do J Najt, & WM Weiner 19 | Oecetis complex Hwang, 1957 20 | Oedipina complex (Dunn, 1924) 21 | Oedipus complex Dunn, 1924 22 | Oedopinola complex (Dunn, 1924) 23 | Paradimorphoceras complex (Moore, 1939) 24 | Parentia do Bickel, 2002 25 | Phyllospongia complex de Laubenfels, 1954 26 | Plectrocnemia complex Hwang, 1958 27 | Rubus complex L. H. Bailey 28 | Sceliphron complex Kohl, 1918 29 | Sceliphron fossuliferum complex Kohl, 1918 30 | Scopaeus (Scopaeus) complex Sharp, 1874 31 | Scopaeus complex Sharp, 1874 32 | Sigipinius complex Golovatch, 2013 33 | Stegosoladidus complex Berge, 2001 34 | Tetracis complex Sharp, 1874 35 | Tetramorium do Forel, 1914 36 | Trichosternus spec Darlington, 1962 37 | Trisephena complex Medler, 1990 38 | -------------------------------------------------------------------------------- /tools/gentest.go: -------------------------------------------------------------------------------- 1 | //go:build ignore 2 | // +build ignore 3 | 4 | // Generates a new test_data_new.txt file out of test_data.txt using current 5 | // parser output. 
We need to do this in cases when parser output is modified. 6 | // Run `go run gentest.go` 7 | package main 8 | 9 | import ( 10 | "bufio" 11 | "os" 12 | "path/filepath" 13 | "strings" 14 | 15 | "github.com/gnames/gnfmt" 16 | "github.com/gnames/gnlib/ent/nomcode" 17 | "github.com/gnames/gnparser" 18 | "github.com/gnames/gnparser/ent/parsed" 19 | ) 20 | 21 | func genTestData() error { 22 | testFiles := []string{"test_data", "test_data_cultivars"} 23 | for _, v := range testFiles { 24 | err := newTestFile(v) 25 | if err != nil { 26 | return err 27 | } 28 | } 29 | return nil 30 | } 31 | 32 | func newTestFile(file string) error { 33 | enc := gnfmt.GNjson{} 34 | path := filepath.Join("..", "testdata", file+".md") 35 | outPath := filepath.Join("..", "testdata", file+"_new.md") 36 | f, err := os.OpenFile(path, os.O_RDONLY, os.ModePerm) 37 | if err != nil { 38 | return err 39 | } 40 | w, err := os.Create(outPath) 41 | if err != nil { 42 | return err 43 | } 44 | defer f.Close() 45 | defer w.Close() 46 | 47 | sc := bufio.NewScanner(f) 48 | opts := []gnparser.Option{gnparser.OptIsTest(true), gnparser.OptWithDetails(true)} 49 | if file == "test_data_cultivars" { 50 | opts = append(opts, gnparser.OptCode(nomcode.Cultivars)) 51 | } 52 | cfg := gnparser.NewConfig(opts...) 
53 | gnp := gnparser.New(cfg) 54 | var res parsed.Parsed 55 | isName := false 56 | var count int 57 | var can, au, nameString string 58 | var jsonData []byte 59 | for sc.Scan() { 60 | line := sc.Text() 61 | if !isName { 62 | w.Write([]byte(line + "\n")) 63 | if strings.HasPrefix(line, "Name: ") { 64 | isName = true 65 | nameString = line[6:] 66 | res = gnp.ParseName(nameString) 67 | jsonData, _ = enc.Encode(res) 68 | if res.Parsed { 69 | can = res.Canonical.Full 70 | if res.Authorship != nil { 71 | au = res.Authorship.Normalized 72 | } 73 | } 74 | } 75 | continue 76 | } 77 | count++ 78 | switch count { 79 | case 2: // Canonical: name_here 80 | can = strings.TrimRight("Canonical: "+can, " ") 81 | w.Write([]byte(can + "\n")) 82 | case 4: // Authorship 83 | au = strings.TrimRight("Authorship: "+au, " ") 84 | w.Write([]byte(au + "\n")) 85 | case 7: 86 | w.Write(jsonData) 87 | w.Write([]byte("\n")) 88 | count = 0 89 | isName = false 90 | can, au = "", "" 91 | jsonData = []byte("") 92 | default: 93 | w.Write([]byte(line + "\n")) 94 | } 95 | } 96 | if err := sc.Err(); err != nil { 97 | return err 98 | } 99 | 100 | return nil 101 | } 102 | 103 | func main() { 104 | genTestData() 105 | } 106 | -------------------------------------------------------------------------------- /tools/quality.go: -------------------------------------------------------------------------------- 1 | //go:build ignore 2 | // +build ignore 3 | 4 | // quality.go generates a markdown file that describes meaning of each quality 5 | // category. 6 | package main 7 | 8 | import ( 9 | "fmt" 10 | "slices" 11 | 12 | "github.com/gnames/gnparser/ent/parsed" 13 | ) 14 | 15 | var body = `# Quality categories 16 | 17 | ## Quality 0 18 | 19 | Parsing failed. 
20 | 21 | ## Quality 1 22 | 23 | Parsing finished without detecting any problems.` 24 | 25 | func main() { 26 | warnsMap := make(map[int][]string) 27 | for k, v := range parsed.WarningQualityMap { 28 | warnsMap[v] = append(warnsMap[v], k.String()) 29 | } 30 | 31 | for _, v := range []int{2, 3, 4} { 32 | warns := warnsMap[v] 33 | slices.Sort(warns) 34 | item := fmt.Sprintf("\n\n## Quality %d\n", v) 35 | for i := range warns { 36 | warn := fmt.Sprintf("\n- %s", warns[i]) 37 | item += warn 38 | } 39 | body += item 40 | } 41 | fmt.Println(body) 42 | } 43 | -------------------------------------------------------------------------------- /version.go: -------------------------------------------------------------------------------- 1 | package gnparser 2 | 3 | var ( 4 | // Version is the version of the gnparser package. When Makefile is 5 | // used, the version is calculated out of Git tags. 6 | Version = "v1.11.6" 7 | // Build is a timestamp of when Makefile was used to compile 8 | // the gnparser code. If go build was used, Build stays empty. 9 | Build string 10 | ) 11 | --------------------------------------------------------------------------------