├── .github
    └── workflows
    │   └── test.yml
├── .gitignore
├── CHANGELOG.md
├── CITATION.cff
├── CONTRIBUTING.md
├── Dockerfile
├── LICENSE
├── Makefile
├── README.md
├── binding
    └── main.go
├── build
    ├── bin
    │   └── .gitkeep
    ├── clib
    │   └── .gitkeep
    └── release
    │   └── .gitkeep
├── cli_test.go
├── config.go
├── config_test.go
├── default.nix
├── docker-compose.yml
├── ent
    ├── internal
    │   ├── preparser
    │   │   ├── grammar.peg
    │   │   ├── grammar.peg.go
    │   │   ├── preparser.go
    │   │   └── preparser_test.go
    │   └── preprocess
    │   │   ├── annot.rl
    │   │   ├── cleanup.go
    │   │   ├── noparse.go
    │   │   ├── noparse.rl
    │   │   ├── preprocess.go
    │   │   ├── preprocess_test.go
    │   │   ├── virus.go
    │   │   └── virus.rl
    ├── nameidx
    │   └── nameidx.go
    ├── parsed
    │   ├── annotation.go
    │   ├── annotation_test.go
    │   ├── details.go
    │   ├── flatten.go
    │   ├── flatten_test.go
    │   ├── interface.go
    │   ├── output.go
    │   ├── parsed.go
    │   ├── parsed_result.go
    │   ├── restore_ambiguous.go
    │   ├── warning.go
    │   ├── warning_test.go
    │   └── words.go
    ├── parser
    │   ├── ast.go
    │   ├── engine.go
    │   ├── grammar.peg
    │   ├── grammar.peg.go
    │   ├── interfaces.go
    │   ├── name.go
    │   ├── output.go
    │   ├── parser.go
    │   └── parser_test.go
    ├── stemmer
    │   ├── stemmer.go
    │   └── stemmer_test.go
    └── str
    │   ├── str.go
    │   └── str_test.go
├── flake.lock
├── flake.nix
├── gnparser.go
├── gnparser
    ├── LICENSE
    ├── cmd
    │   ├── flags.go
    │   ├── parse_batch.go
    │   ├── parse_stream.go
    │   └── root.go
    ├── main.go
    └── tools.go
├── gnparser_stream.go
├── gnparser_test.go
├── go.mod
├── go.sum
├── interface.go
├── io
    ├── dict
    │   ├── data
    │   │   ├── README.md
    │   │   ├── bacteria_genera.txt
    │   │   ├── bacteria_genera_homonyms.txt
    │   │   └── genera_auth_icn.txt
    │   ├── dict.go
    │   └── dict_test.go
    └── web
    │   ├── gnparser_service.go
    │   ├── interface.go
    │   ├── server.go
    │   ├── static
    │       ├── images
    │       │   ├── favicon.ico
    │       │   ├── github-mark.svg
    │       │   └── gna.svg
    │       └── styles
    │       │   ├── parser.css
    │       │   └── screen.css
    │   ├── templates.go
    │   ├── templates
    │       ├── doc_api.html
    │       ├── home.html
    │       └── layout.html
    │   ├── web.go
    │   └── web_internal_test.go
├── man
    ├── gnparser.1
    ├── gnparser.1.html
    └── gnparser.1.ronn
├── nsqd.dat
├── quality.md
├── shell.nix
├── testdata
    ├── exceptions.txt
    ├── stems.txt
    ├── test_data.md
    └── test_data_cultivars.md
├── tools
    ├── gentest.go
    └── quality.go
└── version.go


/.github/workflows/test.yml:
--------------------------------------------------------------------------------
 1 | name: Tests
 2 | 
 3 | on:
 4 |   push:
 5 |     branches: [master]
 6 |   pull_request:
 7 |     branches: [master]
 8 | 
 9 | jobs:
10 |   build:
11 |     name: Build
12 |     runs-on: ${{ matrix.os }}
13 | 
14 |     strategy:
15 |       matrix:
16 |         os: [ubuntu-latest, macos-latest, windows-latest]
17 | 
18 |     steps:
19 |       - name: Set up Go
20 |         uses: actions/setup-go@v3
21 |         with:
22 |           go-version: 1.23
23 | 
24 |       - name: Check out code into the Go module directory
25 |         uses: actions/checkout@v3
26 |       - run: git fetch --prune --unshallow
27 | 
28 |       - name: install tools and dependencies
29 |         run: make tools
30 | 
31 |       - name: Test
32 |         run: make test
33 | 
34 |       - name: Build
35 |         run: make build
36 | 
37 |       - name: Build C lib
38 |         run: make clib
39 | 
40 |       - name: Store C library
41 |         uses: actions/upload-artifact@v4
42 |         with:
43 |           name: ${{ matrix.os }}-clib
44 |           path: binding/lib*
45 |           retention-days: 1
46 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | .idea
 2 | t
 3 | result
 4 | prof
 5 | .vscode
 6 | 200k-lines.txt
 7 | test_data.new.txt
 8 | .idea
 9 | gnparser/gnparser
10 | bench*.txt
11 | binding/libgnparser.h
12 | binding/*.so
13 | build/**
14 | .DS_Store
15 | *.pprof
16 | *.test
17 | *.gif
18 | __debug_bin
19 | 


--------------------------------------------------------------------------------
/CITATION.cff:
--------------------------------------------------------------------------------
 1 | cff-version: 1.2.0
 2 | message: "If you use this software, please cite it as below."
 3 | title: "GNparser -- a powerful scientific names parser."
 4 | version: v1.11.1 
 5 | authors:
 6 |   - family-names: "Mozzherin"
 7 |     given-names: "Dmitry"
 8 |     orcid: "https://orcid.org/0000-0003-1593-1417"
 9 |   - family-names: "Marsden"
10 |     given-names: "Toby"
11 |   - family-names: "Pereira"
12 |     given-names: "Hernán Lucas"
13 |     orcid: "https://orcid.org/0000-0001-6681-7038"
14 | repository-code: "https://github.com/gnames/gnparser"
15 | doi: 10.5281/zenodo.14096467
16 | date-released: 2024-11-07
17 | license: MIT
18 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
  1 | # How to contribute to ``gnparser`` project
  2 | 
  3 | ## **Did you find a bug?**
  4 | 
  5 | * **Ensure the bug was not already reported** by searching on GitHub under
  6 |   [Issues](https://github.com/gnames/gnparser/issues).
  7 | 
  8 | * If you're unable to find an open issue addressing the problem, [open a new
  9 |   one](https://github.com/gnames/gnparser/issues/new). Be sure to include a
 10 |   **title and clear description**, as much relevant information as possible,
 11 |   and a **code sample** or an **executable test case** via
 12 |   [https://parser.globalnames.org](https://parser.globalnames.org) demonstrating
 13 |   the expected behavior that is not occurring.
 14 | * Make sure you **do not put more than one bug report** in the new issue.
 15 | 
 16 | ## **Do you intend to add a new feature or change an existing one?**
 17 | 
 18 | * Suggest your change in the [GlobalNames gitter
 19 |   group](https://gitter.im/GlobalNamesArchitecture/GlobalNames), or [create an
 20 |   issue](https://github.com/gnames/gnparser/issues/new) that describes your
 21 |   suggestion in detail.
 22 | * Make sure you **do not put more than one feature or change** in the new issue.
 23 | 
 24 | ## **Did you write a patch that fixes a bug?**
 25 | 
 26 | * Open a new GitHub pull request with the patch.
 27 | 
 28 | * Ensure the PR description clearly describes the problem and solution. Include
 29 |   the relevant issue number if applicable.
 30 | 
 31 | * Clearly state if your PR is a proof of concept and what needs to be done to
 32 |   finish it, or whether it is a ready-to-merge patch with tests and
 33 |   documentation added.
 34 | 
 35 | ## **Did you write a client for your favorite language to access ``gnparser`` functionality via REST api?**
 36 | 
 37 | Let us know about your client on [GlobalNames gitter
 38 | group](https://gitter.im/GlobalNamesArchitecture/GlobalNames).
 39 | 
 40 | ## **Do you have questions about the source code?**
 41 | 
 42 | * Ask any question on the [GlobalNames gitter
 43 |   group](https://gitter.im/GlobalNamesArchitecture/GlobalNames)
 44 | 
 45 | ## **Would you like to contribute, but do not know how?**
 46 | 
 47 | * Read the next section about configuring environment for the project.
 48 | 
 49 | ## **Setting up ``gnparser`` programming environment**
 50 | 
 51 | ### Install Go
 52 | 
 53 | [Download and install Go](https://golang.org/doc/install) for your operating
 54 | system. Make sure you have [configured the GOPATH environment
 55 | variable](https://github.com/golang/go/wiki/SettingGOPATH).
 56 | 
 57 | You need Go v1.16.x or higher.
 58 | 
 59 | ### Install ``gnparser`` code
 60 | 
 61 | Before Go v1.11 all Go code had to be organized inside of the ``GOPATH``
 62 | directory. Now, for projects like ``gnparser`` that use Go modules it is not
 63 | necessary; however, many tools still assume the old layout, so we recommend
 64 | setting up the ``gnparser`` code the traditional way.
 65 | 
 66 | ```bash
 67 | mkdir -p $GOPATH/src/github.com/gnames
 68 | cd $GOPATH/src/github.com/gnames
 69 | git clone https://github.com/gnames/gnparser.git
 70 | # or use URL of your fork on GitHub or GitLab
 71 | 
 72 | cd gnparser
 73 | ```
 74 | 
 75 | ``gnparser`` uses several external tools and technologies:
 76 | 
 77 | 1. [Parsing Expression Grammar tool](https://github.com/pointlander/peg) to
 78 |    generate parsing code.
 79 | 
 80 | 2. [Cobra CLI framework](https://github.com/spf13/cobra) for creating command
 81 |    line application.
 82 | 
 83 | 3. [goimports tool](https://golang.org/x/tools/cmd/goimports) for fixing
 84 |    imports in PEG autogenerated go code.
 85 | 
 86 | To install them run
 87 | 
 88 | ```bash
 89 | make tools
 90 | ```
 91 | 
 92 | To create a ``gnparser`` executable and place it in ``$GOPATH/bin``, run
 93 | 
 94 | ```bash
 95 | make
 96 | ```
 97 | 
 98 | Now you should be able to use gnparser compiled from the code:
 99 | 
100 | ```bash
101 | gnparser -f pretty "Pica pica (Linnaeus, 1758)"
102 | ```
103 | 
104 | ### To run tests
105 | 
106 | ```bash
107 | make test
108 | ```
109 | 
110 | or
111 | 
112 | ```bash
113 | go test ./...
114 | ```
115 | 
116 | ### To generate tests automatically
117 | 
118 | If your change generates a lot of changes in `testdata/test_data.md`
119 | and/or `testdata/test_data_cultivars.md` you can generate
120 | `testdata/test_data_new.md` and `testdata/test_data_cultivars_new.md`
121 | files using `gentest.go` tool.
122 | 
123 | ```bash
124 | cd tools
125 | go run gentest.go
126 | cd ../testdata
127 | ls
128 | ```
129 | 
130 | You will have two new files in testdata. It is VERY important now to check
131 | the difference between the old and new test files before making the next step:
132 | 
133 | ```bash
134 | mv test_data_new.md test_data.md
135 | mv test_data_cultivars_new.md test_data_cultivars.md
136 | ```
137 | 
138 | ## Benchmarks
139 | 
140 | Benchmarks are located in `gnparser_test.go`
141 | 
142 | To run benchmarks from the project's root:
143 | 
144 | ```bash
145 | # this command will install benchstat
146 | make tools
147 | 
148 | go test -bench=. -benchmem -count=10 -run=XXX > bench.txt && benchstat bench.txt
149 | ```
150 | 
151 | After running you should get results similar to:
152 | 
153 | ```bash
154 | name                                     time/op
155 | Parse/Parse_to_object_once-16            73.0µs ± 1%
156 | Parse/Parse_to_object_once_with_Init-16  83.2µs ± 1%
157 | Parse/Parse_to_object-16                 67.5ms ± 1%
158 | Parse/Parse_to_JSON-16                   71.5ms ± 1%
159 | Parse/Parse_to_JSON_(Details)-16         71.8ms ± 1%
160 | Parse/Parse_to_CSV-16                    69.1ms ± 1%
161 | 
162 | name                                     alloc/op
163 | Parse/Parse_to_object_once-16            10.9kB ± 0%
164 | Parse/Parse_to_object_once_with_Init-16  23.8kB ± 0%
165 | Parse/Parse_to_object-16                 15.5MB ± 0%
166 | Parse/Parse_to_JSON-16                   17.2MB ± 0%
167 | Parse/Parse_to_JSON_(Details)-16         17.2MB ± 0%
168 | Parse/Parse_to_CSV-16                    16.2MB ± 0%
169 | 
170 | name                                     allocs/op
171 | Parse/Parse_to_object_once-16               250 ± 0%
172 | Parse/Parse_to_object_once_with_Init-16     409 ± 0%
173 | Parse/Parse_to_object-16                   235k ± 0%
174 | Parse/Parse_to_JSON-16                     242k ± 0%
175 | Parse/Parse_to_JSON_(Details)-16           242k ± 0%
176 | Parse/Parse_to_CSV-16                      240k ± 0%
177 | ```
178 | 
179 | ### Accessing a raw parsed AST tree
180 | 
181 | PEG parser generates its own abstract syntax tree (AST), which later gets
182 | converted into a ``gnparser``-specific AST. Sometimes it is useful to see the
183 | raw tree of nodes. To do that, open gnparser/gnparser/cmd/root.go,
184 | change ``const debug`` to ``true`` and run ``make``. After that you will be
185 | able to examine the raw tree of a string, for example:
186 | 
187 | ```bash
188 | gnparser "Bubo bubo"
189 | ```
190 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM alpine:3.14
 2 | 
 3 | LABEL maintainer="Dmitry Mozzherin"
 4 | 
 5 | ENV LAST_FULL_REBUILD=2024-10-11
 6 | 
 7 | WORKDIR /bin
 8 | 
 9 | COPY ./gnparser/gnparser /bin
10 | 
11 | ENTRYPOINT [ "gnparser" ]
12 | 
13 | CMD ["-p", "8778"]
14 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2019 gnames
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
  1 | PROJ_NAME = gnparser
  2 | 
  3 | VERSION = $(shell git describe --tags)
  4 | VER = $(shell git describe --tags --abbrev=0)
  5 | DATE = $(shell date -u '+%Y-%m-%d_%H:%M:%S%Z')
  6 | 
  7 | NO_C = CGO_ENABLED=0
  8 | FLAGS_SHARED = GOARCH=amd64
  9 | FLAGS_LINUX = GOARCH=amd64 GOOS=linux
 10 | FLAGS_LINUX_ARM = GOARCH=arm64 GOOS=linux
 11 | FLAGS_MAC = GOARCH=amd64 GOOS=darwin
 12 | FLAGS_MAC_ARM = GOARCH=arm64 GOOS=darwin
 13 | FLAGS_WIN = GOARCH=amd64 GOOS=windows
 14 | FLAGS_WIN_ARM = GOARCH=arm64 GOOS=windows
 15 | FLAGS_LD=-ldflags "-s -w -X github.com/gnames/$(PROJ_NAME).Build=$(DATE) \
 16 |                   -X github.com/gnames/$(PROJ_NAME).Version=$(VERSION)"
 17 | FLAGS_REL = -trimpath -ldflags "-s -w \
 18 | 						-X github.com/gnames/$(PROJ_NAME).Build=$(DATE)"
 19 | 
 20 | GOCMD = go
 21 | GOBUILD = $(GOCMD) build $(FLAGS_LD)
 22 | GOINSTALL = $(GOCMD) install $(FLAGS_LD)
 23 | GORELEASE = $(GOCMD) build $(FLAGS_REL)
 24 | GOCLEAN = $(GOCMD) clean
 25 | GOGET = $(GOCMD) get
 26 | 
 27 | RELEASE_DIR ?= "/tmp"
 28 | BUILD_DIR ?= "."
 29 | CLIB_DIR ?= "."
 30 | 
 31 | all: install
 32 | 
 33 | test: deps install
 34 | 	$(FLAG_MODULE) go test -shuffle=on -race -count=1 ./...
 35 | 
 36 | test-build: deps build
 37 | 
 38 | deps:
 39 | 	$(GOCMD) mod download;
 40 | 
 41 | tools: deps
 42 | 	@echo Installing tools from tools.go
 43 | 	@cat $(PROJ_NAME)/tools.go | grep _ | awk -F'"' '{print $$2}' | xargs -tI % go install %
 44 | 
 45 | peg:
 46 | 	cd ent/parser; \
 47 | 	peg grammar.peg; \
 48 | 	goimports -w grammar.peg.go; \
 49 | 	cd ../internal/preparser; \
 50 | 	peg grammar.peg; \
 51 | 	goimports -w grammar.peg.go;
 52 | 
 53 | ragel:
 54 | 	cd ent/internal/preprocess; \
 55 | 	ragel -Z -G2 virus.rl; \
 56 | 	ragel -Z -G2 noparse.rl
 57 | 
 58 | asset:
 59 | 	cd io/fs; \
 60 | 	$(FLAGS_SHARED) go run -tags=dev assets_gen.go
 61 | 
 62 | build: peg
 63 | 	cd $(PROJ_NAME); \
 64 | 	$(GOCLEAN); \
 65 | 	$(NO_C) $(GOBUILD) -o $(BUILD_DIR)
 66 | 
 67 | buildrel: peg
 68 | 	cd $(PROJ_NAME); \
 69 | 	$(GOCLEAN); \
 70 | 	$(NO_C) $(GORELEASE) -o $(BUILD_DIR)
 71 | 
 72 | install: peg
 73 | 	cd $(PROJ_NAME); \
 74 | 	$(GOCLEAN); \
 75 | 	$(NO_C) $(GOINSTALL)
 76 | 
 77 | release: peg dockerhub
 78 | 	cd $(PROJ_NAME); \
 79 | 	$(GOCLEAN); \
 80 | 	$(FLAGS_LINUX) $(NO_C) $(GOBUILD); \
 81 | 	tar zcf $(RELEASE_DIR)/$(PROJ_NAME)-$(VER)-linux-x86.tar.gz $(PROJ_NAME); \
 82 | 	$(GOCLEAN); \
 83 | 	$(FLAGS_LINUX_ARM) $(NO_C) $(GOBUILD); \
 84 | 	tar zcf $(RELEASE_DIR)/$(PROJ_NAME)-$(VER)-linux-arm.tar.gz $(PROJ_NAME); \
 85 | 	$(GOCLEAN); \
 86 | 	$(FLAGS_MAC) $(NO_C) $(GOBUILD); \
 87 | 	tar zcf $(RELEASE_DIR)/$(PROJ_NAME)-$(VER)-mac-x86.tar.gz $(PROJ_NAME); \
 88 | 	$(GOCLEAN); \
 89 | 	$(FLAGS_MAC_ARM) $(NO_C) $(GOBUILD); \
 90 | 	tar zcf $(RELEASE_DIR)/$(PROJ_NAME)-$(VER)-mac-arm.tar.gz $(PROJ_NAME); \
 91 | 	$(GOCLEAN); \
 92 | 	$(FLAGS_WIN) $(NO_C) $(GOBUILD); \
 93 | 	zip -9 $(RELEASE_DIR)/$(PROJ_NAME)-$(VER)-win-x86.zip $(PROJ_NAME).exe; \
 94 | 	$(GOCLEAN); \
 95 | 	$(FLAGS_WIN_ARM) $(NO_C) $(GOBUILD); \
 96 | 	zip -9 $(RELEASE_DIR)/$(PROJ_NAME)-$(VER)-win-arm.zip $(PROJ_NAME).exe; \
 97 | 	$(GOCLEAN);
 98 | 
 99 | dc: asset build
100 | 	docker-compose build;
101 | 
102 | docker: build
103 | 	docker build -t gnames/go$(PROJ_NAME):latest -t gnames/go$(PROJ_NAME):$(VERSION) .; \
104 | 	cd $(PROJ_NAME); \
105 | 	$(GOCLEAN);
106 | 
107 | dockerhub: docker
108 | 	docker push gnames/go$(PROJ_NAME); \
109 | 	docker push gnames/go$(PROJ_NAME):$(VERSION)
110 | 
111 | clib_darwin: peg
112 | 	cd binding; \
113 | 	$(GOCLEAN); \
114 | 	CGO_ENABLED=1 GOOS=darwin GOARCH=arm64 $(GOBUILD) -buildmode=c-shared -o $(CLIB_DIR)/lib$(PROJ_NAME)_arm64.so; \
115 | 	CGO_ENABLED=1 GOOS=darwin GOARCH=amd64 $(GOBUILD) -buildmode=c-shared -o $(CLIB_DIR)/lib$(PROJ_NAME)_amd64.so; \
116 | 	rm lib$(PROJ_NAME)_amd64.h; \
117 | 	mv lib$(PROJ_NAME)_arm64.h lib$(PROJ_NAME).h; \
118 | 	lipo -create -output $(CLIB_DIR)/lib$(PROJ_NAME).so $(CLIB_DIR)/lib$(PROJ_NAME)_arm64.so $(CLIB_DIR)/lib$(PROJ_NAME)_amd64.so;
119 | 
120 | clib: peg
121 | 	cd binding; \
122 | 	$(GOBUILD) -buildmode=c-shared -o $(CLIB_DIR)/lib$(PROJ_NAME).so;
123 | 
124 | quality:
125 | 	cd tools;\
126 | 	$(GOCMD) run quality.go > ../quality.md
127 | 
128 | 
129 | .PHONY: man
130 | man: ronn
131 | 	@ronn ./man/$(PROJ_NAME).1.ronn --style=dark
132 | 
133 | .PHONY: ronn
134 | ronn:
135 | 	@which ronn > /dev/null || gem install ronn
136 | 


--------------------------------------------------------------------------------
/binding/main.go:
--------------------------------------------------------------------------------
  1 | // Package main provides C-binding functionality to use parser in
  2 | // other languages.
  3 | package main
  4 | 
  5 | /*
  6 |   #include "stdlib.h"
  7 | */
  8 | import "C"
  9 | 
 10 | import (
 11 | 	"strings"
 12 | 	"unsafe"
 13 | 
 14 | 	"github.com/gnames/gnfmt"
 15 | 	"github.com/gnames/gnlib/ent/nomcode"
 16 | 	"github.com/gnames/gnparser"
 17 | )
 18 | 
 19 | // ParseToString function takes a name-string, a desired format, a
 20 | // nomenclatural code, and the details and diaereses flags as 0|1 integers.
 21 | // It parses the name-string to either a JSON or a CSV string, depending on
 22 | // the format, which can be 'csv', 'compact', or 'pretty'. If the details
 23 | // argument is 0, additional parsed details are omitted; if it is 1, they
 24 | // are included. A positive diaereses argument preserves diaereses.
 25 | //
 26 | //export ParseToString
 27 | func ParseToString(
 28 | 	name *C.char,
 29 | 	fmtStr *C.char,
 30 | 	codeStr *C.char,
 31 | 	details C.int,
 32 | 	diaereses C.int,
 33 | ) *C.char {
 34 | 	goname := C.GoString(name)
 35 | 	code := nomcode.New(C.GoString(codeStr))
 36 | 	frmt, err := gnfmt.NewFormat(C.GoString(fmtStr))
 37 | 	if err != nil {
 38 | 		frmt = gnfmt.CSV // an unrecognized format falls back to CSV
 39 | 	}
 40 | 	opts := []gnparser.Option{
 41 | 		gnparser.OptFormat(frmt),
 42 | 		gnparser.OptWithDetails(int(details) > 0),
 43 | 		gnparser.OptCode(code),
 44 | 		gnparser.OptWithPreserveDiaereses(int(diaereses) > 0),
 45 | 	}
 46 | 	cfg := gnparser.NewConfig(opts...)
 47 | 	gnp := gnparser.New(cfg)
 48 | 	parsed := gnp.ParseName(goname).Output(gnp.Format())
 49 | 	// The result is copied into C memory; release it with FreeMemory.
 50 | 	return C.CString(parsed)
 51 | }
 52 | 
 53 | // FreeMemory releases the C memory that the given string pointer refers to.
 54 | //
 55 | //export FreeMemory
 56 | func FreeMemory(p *C.char) {
 57 | 	C.free(unsafe.Pointer(p))
 58 | }
 59 | 
 60 | // ParseAryToString function takes an array of names, parsing format, and a
 61 | // withDetails flag as 0|1 integer.  Parsed outputs are sent as a string in
 62 | // either CSV or JSONformat. Format argument can take values of 'csv',
 63 | // 'compact', or 'pretty'. For withDetails argument 0 means false, 1 means
 64 | // true.
 65 | //
 66 | //export ParseAryToString
 67 | func ParseAryToString(
 68 | 	in **C.char,
 69 | 	length C.int,
 70 | 	fmtStr *C.char,
 71 | 	codeStr *C.char,
 72 | 	details C.int,
 73 | 	diaereses C.int,
 74 | ) *C.char {
 75 | 	names := make([]string, int(length))
 76 | 	code := nomcode.New(C.GoString(codeStr))
 77 | 	frmt, err := gnfmt.NewFormat(C.GoString(fmtStr))
 78 | 	if err != nil {
 79 | 		frmt = gnfmt.CSV
 80 | 	}
 81 | 
 82 | 	opts := []gnparser.Option{
 83 | 		gnparser.OptFormat(frmt),
 84 | 		gnparser.OptWithDetails(int(details) > 0),
 85 | 		gnparser.OptCode(code),
 86 | 		gnparser.OptWithPreserveDiaereses(int(diaereses) > 0),
 87 | 	}
 88 | 	start := unsafe.Pointer(in)
 89 | 	pointerSize := unsafe.Sizeof(in)
 90 | 
 91 | 	for i := 0; i < int(length); i++ {
 92 | 		// Copy each input string into a Go string and add it to the slice.
 93 | 		pointer := (**C.char)(unsafe.Pointer(uintptr(start) + uintptr(i)*pointerSize))
 94 | 		name := C.GoString(*pointer)
 95 | 		names[i] = name
 96 | 	}
 97 | 
 98 | 	cfg := gnparser.NewConfig(opts...)
 99 | 	gnp := gnparser.New(cfg)
100 | 
101 | 	var res string
102 | 	parsed := gnp.ParseNames(names)
103 | 	if gnp.Format() == gnfmt.CSV {
104 | 		csv := make([]string, length)
105 | 		for i := range parsed {
106 | 			csv[i] = parsed[i].Output(gnfmt.CSV)
107 | 		}
108 | 		res = strings.Join(csv, "\n")
109 | 	} else {
110 | 		json, _ := gnfmt.GNjson{}.Encode(parsed)
111 | 		res = string(json)
112 | 	}
113 | 	return C.CString(res)
114 | }
115 | 
116 | func main() {} // empty on purpose: the package is built as a C shared library
117 | 


--------------------------------------------------------------------------------
/build/bin/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gnames/gnparser/a2bee074a6d9f6b23be70c0d760540863645fa74/build/bin/.gitkeep


--------------------------------------------------------------------------------
/build/clib/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gnames/gnparser/a2bee074a6d9f6b23be70c0d760540863645fa74/build/clib/.gitkeep


--------------------------------------------------------------------------------
/build/release/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gnames/gnparser/a2bee074a6d9f6b23be70c0d760540863645fa74/build/release/.gitkeep


--------------------------------------------------------------------------------
/cli_test.go:
--------------------------------------------------------------------------------
 1 | package gnparser_test
 2 | 
 3 | import (
 4 | 	"strings"
 5 | 	"testing"
 6 | 
 7 | 	"github.com/rendon/testcli"
 8 | 	"github.com/stretchr/testify/assert"
 9 | )
10 | 
11 | func TestTMP(t *testing.T) {
12 | 	assert.True(t, true) // placeholder assertion that always passes
13 | }
14 | 
15 | // The tests below run the `gnparser` command; run `make install` before
16 | // them to get meaningful results.
17 | 
18 | func TestVersion(t *testing.T) {
19 | 	c := testcli.Command("gnparser", "-V")
20 | 	c.Run()
21 | 	assert.True(t, c.Success())
22 | 	assert.Contains(t, c.Stdout(), "version:")
23 | 	// Additional flags must not prevent the version from being printed.
24 | 	c = testcli.Command("gnparser", "-V", "-f", "simple",
25 | 		"-j", "200", "-p", "8000")
26 | 	c.Run()
27 | 	assert.True(t, c.Success())
28 | 	assert.Contains(t, c.Stdout(), "version:")
29 | }
30 | 
31 | func TestFormat(t *testing.T) {
32 | 	t.Run("runs csv format", func(t *testing.T) {
33 | 		c := testcli.Command("gnparser", "Homo sapiens", "-f", "csv")
34 | 		c.Run()
35 | 		assert.True(t, c.Success())
36 | 		assert.Contains(t, c.Stdout(), ",Homo sapiens,2") // Verbatim, then Cardinality
37 | 	})
38 | 
39 | 	t.Run("ignores parsing with --version", func(t *testing.T) {
40 | 		c := testcli.Command("gnparser", "Homo sapiens", "-f", "simple", "--version")
41 | 		c.Run()
42 | 		assert.True(t, c.Success())
43 | 		assert.NotContains(t, c.Stdout(), ",Homo sapiens,")
44 | 		assert.Contains(t, c.Stdout(), "version:")
45 | 	})
46 | 
47 | 	t.Run("sets format to default if -f value is unknown", func(t *testing.T) {
48 | 		c := testcli.Command("gnparser", "Homo sapiens", "-f", ":)")
49 | 		c.Run()
50 | 		assert.True(t, c.Success())
51 | 		assert.Contains(t, c.Stdout(), `Id,Verbatim,Cardinality,`) // header of the CSV output
52 | 	})
53 | }
54 | 
55 | func TestStdin(t *testing.T) {
56 | 	t.Run("takes data from Stdin", func(t *testing.T) {
57 | 		c := testcli.Command("gnparser", "-f", "simple") // no name argument is given
58 | 		c.SetStdin(strings.NewReader("Homo sapiens"))
59 | 		c.Run()
60 | 		assert.True(t, c.Success())
61 | 		assert.Contains(t, c.Stdout(), ",Homo sapiens,")
62 | 	})
63 | 
64 | 	t.Run("takes multiple names from Stdin", func(t *testing.T) {
65 | 		c := testcli.Command("gnparser", "-f", "simple")
66 | 		c.SetStdin(strings.NewReader("Plantago\nBubo L.\n")) // one name per line
67 | 		c.Run()
68 | 		assert.True(t, c.Success())
69 | 		assert.Contains(t, c.Stdout(), ",Plantago,")
70 | 		assert.Contains(t, c.Stdout(), ",Bubo,")
71 | 	})
72 | }
73 | 


--------------------------------------------------------------------------------
/config.go:
--------------------------------------------------------------------------------
  1 | package gnparser
  2 | 
  3 | import (
  4 | 	"log/slog"
  5 | 	"runtime"
  6 | 
  7 | 	"github.com/gnames/gnfmt"
  8 | 	"github.com/gnames/gnlib/ent/nomcode"
  9 | )
 10 | 
 11 | // Config keeps settings that might affect how parsing is done,
 12 | // or change the parsing output.
 13 | type Config struct {
 14 | 	// BatchSize sets the maximum number of elements in names-strings slice.
 15 | 	BatchSize int
 16 | 
 17 | 	// Code contains optional nomenclatural code value. This option is
 18 | 	// useful to solve ambiguous parsing cases and to add cultivar botanical
 19 | 	// rules.
 20 | 	nomcode.Code
 21 | 
 22 | 	// Debug sets a "debug" state for parsing. The debug state forces the
 23 | 	// output format to show the parsed AST tree.
 24 | 	Debug bool
 25 | 
 26 | 	// Format sets the output format for CLI and Web interfaces.
 27 | 	// There are 3 formats available: 'CSV', 'CompactJSON' and
 28 | 	// 'PrettyJSON'.
 29 | 	Format gnfmt.Format
 30 | 
 31 | 	// IgnoreHTMLTags can be set to true when it is desirable to clean up names
 32 | 	// from a few HTML tags often present in names-strings that were planned to
 33 | 	// be presented via an HTML page.
 34 | 	IgnoreHTMLTags bool
 35 | 
 36 | 	// IsTest can be set to true when parsing functionality is used for tests.
 37 | 	// In such cases the `ParserVersion` field is presented as `test_version`
 38 | 	// instead of displaying the actual version of `gnparser`.
 39 | 	IsTest bool
 40 | 
 41 | 	// JobsNum sets a level of parallelism used during parsing of
 42 | 	// a stream of name-strings.
 43 | 	JobsNum int
 44 | 
 45 | 	// Port to run web-service.
 46 | 	Port int
 47 | 
 48 | 	// WithCapitalization flag, when true, the first letter of a name-string
 49 | 	// is capitalized, if appropriate.
 50 | 	WithCapitalization bool
 51 | 
 52 | 	// WithDetails can be set to true when a simplified output is not sufficient
 53 | 	// for obtaining the required information.
 54 | 	WithDetails bool
 55 | 
 56 | 	// WithNoOrder flag, when true, output and input are in different order.
 57 | 	WithNoOrder bool
 58 | 
 59 | 	// WithPreserveDiaereses flag, when true, diaereses will not be transliterated.
 60 | 	WithPreserveDiaereses bool
 61 | 
 62 | 	// WithStream changes from parsing a batch by batch, to parsing one name
 63 | 	// at a time. When WithStream is true, BatchSize setting is ignored.
 64 | 	WithStream bool
 65 | 
 66 | 	// WithWebLogs flag enables logs when running web-service. This flag is
 67 | 	// ignored if `Port` value is not set.
 68 | 	WithWebLogs bool
 69 | 
 70 | 	// WithSpeciesGroupCut flag means that stemmed version of autonyms (ICN) and
 71 | 	// species group names (ICZN) will be truncated to species. It helps to
 72 | 	// simplify matching names like `Aus bus` and `Aus bus bus`.
 73 | 	WithSpeciesGroupCut bool
 74 | }
 75 | 
 76 | // Option is a function that modifies the settings of a Config object. All
 77 | // Opt* functions return an Option, and NewConfig applies them to defaults.
 78 | type Option func(*Config)
 79 | 
 80 | // OptBatchSize sets the max number of names in a batch; it must be positive.
 81 | func OptBatchSize(i int) Option {
 82 | 	return func(cfg *Config) {
 83 | 		if i <= 0 {
 84 | 			slog.Warn("Batch size should be a positive number")
 85 | 			return // the current BatchSize value is left untouched
 86 | 		}
 87 | 		cfg.BatchSize = i
 88 | 	}
 89 | }
 90 | 
 91 | // OptDebug sets the Debug field, which makes output show the parsed AST tree.
 92 | func OptDebug(b bool) Option {
 93 | 	return func(cfg *Config) {
 94 | 		cfg.Debug = b
 95 | 	}
 96 | }
 97 | 
 98 | // OptFormat sets the output format for CLI or Web presentation.
 99 | // It accepts a gnfmt.Format value, for example gnfmt.CSV.
100 | func OptFormat(f gnfmt.Format) Option {
101 | 	return func(cfg *Config) {
102 | 		cfg.Format = f
103 | 	}
104 | }
105 | 
106 | // OptIgnoreHTMLTags sets the IgnoreHTMLTags field. Set it to true to clean
107 | // up names from a few HTML tags that are often present in names-strings
108 | // prepared for an HTML page.
109 | func OptIgnoreHTMLTags(b bool) Option {
110 | 	return func(cfg *Config) {
111 | 		cfg.IgnoreHTMLTags = b
112 | 	}
113 | }
114 | 
115 | // OptIsTest sets the IsTest field, used when parsing runs inside tests.
116 | func OptIsTest(b bool) Option {
117 | 	return func(cfg *Config) {
118 | 		cfg.IsTest = b
119 | 	}
120 | }
121 | 
122 | // OptJobsNum sets the JobsNum field (parallelism level) without validation.
123 | func OptJobsNum(i int) Option {
124 | 	return func(cfg *Config) {
125 | 		cfg.JobsNum = i
126 | 	}
127 | }
128 | 
129 | // OptPort sets the Port field, the port used to run the web-service.
130 | func OptPort(i int) Option {
131 | 	return func(cfg *Config) {
132 | 		cfg.Port = i
133 | 	}
134 | }
135 | 
136 | // OptWithCapitaliation (sic, exported name) sets the WithCapitalization field.
137 | func OptWithCapitaliation(b bool) Option {
138 | 	return func(cfg *Config) {
139 | 		cfg.WithCapitalization = b
140 | 	}
141 | }
142 | 
143 | // OptCode sets the Code field, an optional nomenclatural code.
144 | func OptCode(c nomcode.Code) Option {
145 | 	return func(cfg *Config) {
146 | 		cfg.Code = c
147 | 	}
148 | }
149 | 
150 | // OptWithDetails sets the WithDetails field; true asks for detailed output.
151 | func OptWithDetails(b bool) Option {
152 | 	return func(cfg *Config) {
153 | 		cfg.WithDetails = b
154 | 	}
155 | }
156 | 
157 | // OptWithNoOrder sets the WithNoOrder field (output order may differ from input).
158 | func OptWithNoOrder(b bool) Option {
159 | 	return func(cfg *Config) {
160 | 		cfg.WithNoOrder = b
161 | 	}
162 | }
163 | 
164 | // OptWithPreserveDiaereses sets the WithPreserveDiaereses field.
165 | func OptWithPreserveDiaereses(b bool) Option {
166 | 	return func(cfg *Config) {
167 | 		cfg.WithPreserveDiaereses = b
168 | 	}
169 | }
170 | 
171 | // OptWithStream sets the WithStream field (one name at a time, no batches).
172 | func OptWithStream(b bool) Option {
173 | 	return func(cfg *Config) {
174 | 		cfg.WithStream = b
175 | 	}
176 | }
177 | 
178 | // OptWithWebLogs sets the WithWebLogs field of the Config.
179 | func OptWithWebLogs(b bool) Option {
180 | 	return Option(func(c *Config) {
181 | 		c.WithWebLogs = b
182 | 	})
183 | }
184 | 
185 | // OptWithSpeciesGroupCut sets the WithSpeciesGroupCut field.
186 | func OptWithSpeciesGroupCut(b bool) Option {
187 | 	return Option(func(c *Config) {
188 | 		c.WithSpeciesGroupCut = b
189 | 	})
190 | }
191 | 
192 | // NewConfig builds a Config populated with the default settings and then
193 | // applies the given `Option` functions, in order, on top of them.
194 | func NewConfig(opts ...Option) Config {
195 | 	res := Config{
196 | 		Code:           nomcode.Unknown,
197 | 		Port:           8080,
198 | 		IgnoreHTMLTags: false,
199 | 		BatchSize:      50_000,
200 | 		JobsNum:        runtime.NumCPU(),
201 | 		Format:         gnfmt.CSV,
202 | 	}
203 | 	for _, opt := range opts {
204 | 		opt(&res)
205 | 	}
206 | 	return res
207 | }
208 | 


--------------------------------------------------------------------------------
/config_test.go:
--------------------------------------------------------------------------------
 1 | package gnparser_test
 2 | 
 3 | import (
 4 | 	"runtime"
 5 | 	"testing"
 6 | 
 7 | 	"github.com/gnames/gnfmt"
 8 | 	"github.com/gnames/gnparser"
 9 | 	"github.com/stretchr/testify/assert"
10 | )
11 | 
12 | func TestNew(t *testing.T) {
13 | 	// A Config built without options must carry only the defaults.
14 | 	want := gnparser.Config{
15 | 		Port:           8080,
16 | 		BatchSize:      50_000,
17 | 		JobsNum:        runtime.NumCPU(),
18 | 		Format:         gnfmt.CSV,
19 | 		IgnoreHTMLTags: false,
20 | 		WithDetails:    false,
21 | 		IsTest:         false,
22 | 	}
23 | 	assert.Equal(t, want, gnparser.NewConfig())
24 | }
25 | 
26 | func TestNewOpts(t *testing.T) {
27 | 	// Every field below is overridden by an option returned from opts().
28 | 	want := gnparser.Config{
29 | 		Format:         gnfmt.CompactJSON,
30 | 		JobsNum:        161,
31 | 		BatchSize:      1,
32 | 		IgnoreHTMLTags: true,
33 | 		WithDetails:    true,
34 | 		Port:           8989,
35 | 	}
36 | 	got := gnparser.NewConfig(opts()...)
37 | 	assert.Equal(t, want, got)
38 | }
39 | 
40 | func opts() []gnparser.Option { // test helper used by TestNewOpts
41 | 	return []gnparser.Option{ // each value differs from the one NewConfig would set by default
42 | 		gnparser.OptFormat(gnfmt.CompactJSON),
43 | 		gnparser.OptJobsNum(161),
44 | 		gnparser.OptBatchSize(1),
45 | 		gnparser.OptIgnoreHTMLTags(true),
46 | 		gnparser.OptWithDetails(true),
47 | 		gnparser.OptPort(8989),
48 | 	}
49 | }
50 | 


--------------------------------------------------------------------------------
/default.nix:
--------------------------------------------------------------------------------
 1 | { lib, buildGoModule, fetchFromGitHub, stdenv, glibc }:
 2 | 
 3 | buildGoModule rec { # builds the gnparser sub-package from this checkout
 4 |   pname = "gnparser";
 5 |   version = "v1.6.6";
 6 |   date = "2022-05-17";
 7 | 
 8 |   src = ./.;
 9 | 
10 |   vendorSha256 = "sha256-TY/vIgtu/GeVKJ1AonMMxCvIbK3ATc2jp9Zqq1YQ9Mg=";
11 | 
12 |   buildInputs = [
13 |     stdenv
14 |     glibc.static
15 |   ];
16 | 
17 |   doCheck = false; # do not run the test phase during the build
18 | 
19 |   subPackages = "gnparser";
20 | 
21 |   ldflags = [ # stripped, statically linked binary with version and build date stamped in
22 |     "-s"
23 |     "-w"
24 |     "-linkmode external"
25 |     "-extldflags"
26 |     "-static"
27 |     "-X github.com/gnames/gnparser.Version=${version}"
28 |     "-X github.com/gnames/gnparser.Build=${date}"
29 |   ];
30 | 
31 |   meta = with lib; {
32 |     description = "Parser for bio scientific names";
33 |     homepage = "https://github.com/gnames/gnparser";
34 |     license = licenses.mit;
35 |     maintainers = with maintainers; [ "dimus" ];
36 |   };
37 | }
38 | 


--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
 1 | version: "3"
 2 | 
 3 | services:
 4 |   app:
 5 |     build:
 6 |       context: . # build context is the directory holding this file
 7 |     ports:
 8 |       - 0.0.0.0:8778:8778 # host 0.0.0.0:8778 -> container port 8778
 9 |     command:
10 |       - -p # NOTE(review): presumably the web-service port flag; keep in sync with the mapping above
11 |       - "8778"
12 | 
13 | 


--------------------------------------------------------------------------------
/ent/internal/preparser/grammar.peg:
--------------------------------------------------------------------------------
 1 | package preparser
 2 | 
 3 | type PreParser Peg { PreString }
 4 | 
 5 | String <- _? (Head Tail? / TailPhrase .*) SpaceOrEnd # a head with an optional tail, or input that starts with a tail phrase (tailIndex is then never set)
 6 | 
 7 | Head <- Word (CommaSpace Word)* &(Tail / SpaceOrEnd) # words up to, but not including, the tail or the end of input
 8 | 
 9 | Tail <- { p.tailIndex = int(token.begin) } CommaSpace TailPhrase .* # the index is recorded before the separator, so the tail includes its leading comma/space
10 | 
11 | Word <- !TailPhrase [^, ]+ / ',' # a run without commas/spaces that does not start a tail phrase, or a lone comma
12 | 
13 | TailPhrase <- TailLastWordJunk / TailPhrase4 / TailPhrase3 /
14 |   TailStopWords / TailPhrase2 / TailPhrase1
15 | 
16 | TailLastWordJunk <- (("var" / "ined" / "ssp" / "subsp" / "subgen" ) '.'? /
17 |  "sensu" / "new" / "non" / "nec" / "hybrid" / "von" / 'P.' _? 'P.' /
18 |  "ms" / 'CF') '?'? &SpaceOrEnd
19 | 
20 | TailPhrase4 <- ("pro" _ "parte" / "nomen") &NotLetterOrEnd / 'p.' _? 'p.' /
21 |   "nom." / "comb."
22 | 
23 | TailPhrase3 <- '('? 's' ('.' _? / _ ) ('s' '.'? &NotLetterOrEnd / 'l.' / 'str.' /
24 |    'lat.') # s.s., s.l., s. str., s. lat.; lower-case only - the tests expect `S. S.` not to be a tail
25 | 
26 | TailStopWords <- ("environmental" / "enrichment" / "samples" /
27 |   "species" / "group" / "complex" / "clade" /
28 |   "author" / "nec" / "vide" / "species" / "fide" / "non" / "not" ) &NotLetterOrEnd
29 | 
30 | TailPhrase2 <- ("sero" ("var" / "type") / "sensu" / "auct" / "sec" / "near" /
31 |   "str") '.'? &NotLetterOrEnd
32 | 
33 | TailPhrase1 <- (('('? ('ht' / 'hort')) / "S" 'pec' /
34 |   'nov' '.'? _ 'spec') '.'? &NotLetterOrEnd
35 | 
36 | SpaceOrEnd <- CommaSpace? END
37 | 
38 | CommaSpace <- (_? ',' _?)+ / _
39 | 
40 | _ <- MultipleSpace / SingleSpace
41 | 
42 | NotLetterOrEnd <- NotLetter / END
43 | 
44 | NotLetter <- [[^A-Z0-9_.\-]]
45 | 
46 | MultipleSpace <- SingleSpace SingleSpace+
47 | 
48 | SingleSpace <- ' ' / OtherSpace
49 | 
50 | OtherSpace <- [　 \t\r\n\f\v]
51 | 
52 | END <- !.
53 | 


--------------------------------------------------------------------------------
/ent/internal/preparser/preparser.go:
--------------------------------------------------------------------------------
 1 | package preparser
 2 | 
 3 | import "log/slog"
 4 | 
 5 | func New() *PreParser {
 6 | 	ppr := new(PreParser)
 7 | 	ppr.Init() // initialize the parser before it is handed out
 8 | 	return ppr
 9 | }
10 | 
11 | type PreString struct { // state embedded into the PreParser declared in grammar.peg
12 | 	tailIndex int // rune offset where the tail starts; -1 until the grammar finds one
13 | }
14 | 
15 | // NewString prepares the parser for s: tailIndex becomes -1, the buffer is replaced.
16 | func (ppr *PreParser) NewString(s string) {
17 | 	ppr.tailIndex = -1
18 | 	ppr.Buffer = s
19 | 	ppr.Reset()
20 | }
21 | 
22 | // TailIndex returns the tail's byte offset in s; -1 on parse error or no tail.
23 | func (ppr *PreParser) TailIndex(s string) int {
24 | 	ppr.NewString(s)
25 | 	if err := ppr.Parse(); err != nil {
26 | 		slog.Error("Preparsing failed", "error", err, "string", s)
27 | 		return -1
28 | 	}
29 | 	ppr.Execute()
30 | 	if ppr.tailIndex < 0 {
31 | 		return ppr.tailIndex
32 | 	}
33 | 	// tailIndex counts runes; convert it to a byte offset for callers.
34 | 	return len(string([]rune(s)[:ppr.tailIndex]))
35 | }
36 | 
37 | // Debug parses q and pretty-prints the resulting syntax tree. A parse
38 | // failure is returned to the caller and nothing is printed.
39 | func (ppr *PreParser) Debug(q string) error {
40 | 	ppr.NewString(q)
41 | 	if err := ppr.Parse(); err != nil {
42 | 		return err
43 | 	}
44 | 	ppr.PrettyPrintSyntaxTree(q)
45 | 	return nil
46 | }
47 | 


--------------------------------------------------------------------------------
/ent/internal/preparser/preparser_test.go:
--------------------------------------------------------------------------------
  1 | package preparser_test
  2 | 
  3 | import (
  4 | 	"testing"
  5 | 
  6 | 	"github.com/gnames/gnparser/ent/internal/preparser"
  7 | 	"github.com/stretchr/testify/assert"
  8 | )
  9 | 
 10 | func TestDebug(t *testing.T) {
 11 | 	debug := true
 12 | 	q := "The annulignatha group"
 13 | 	assert := assert.New(t)
 14 | 	ppr := preparser.New()
 15 | 	if debug {
 16 | 		err := ppr.Debug(q)
 17 | 		assert.Nil(err)
 18 | 	}
 19 | }
 20 | 
 21 | func TestPreParsed(t *testing.T) {
 22 | 	tests := []struct {
 23 | 		msg, str, tail string
 24 | 	}{
 25 | 		// Last 'junk' words/ annotations
 26 | 		{"var", "Musca domeſtica Linnaeus 1758 var?  ", " var?  "},
 27 | 		{"ined", "  Oxalis_barrelieri ined.?", " ined.?"},
 28 | 		{"ssp.", "Peperomia non-alata Trel. ssp.", " ssp."},
 29 | 		{"subsp.", "Sanogasta x-signata (Keyserling,1891) subsp.",
 30 | 			" subsp."},
 31 | 		{"subgen", "Sanogasta x-signata (Keyserling,1891) subgen?  ",
 32 | 			" subgen?  "},
 33 | 		{"sensu", "Pseudomonas methanica (Söhngen 1906) sensu. Dworkin and Foster 1956",
 34 | 			" sensu. Dworkin and Foster 1956"},
 35 | 		{"new", "Hegeter (Hegeter) intercedens Lindberg H 1950 new", " new"},
 36 | 		{"non", "Anthoscopus Cabanis [1851?] non", " non"},
 37 | 		{"nec", "Hegeter (Hegeter) intercedens Lindberg H 1950 nec", " nec"},
 38 | 		{"hybrid", "  Arthopyrenia hyalospora x hybrid?", " hybrid?"},
 39 | 		{"von$", "Nautilus asterizans von", " von"},
 40 | 
 41 | 		// Pro Parte
 42 | 		{"Pro Parte", "Abarema clypearia (Jack) Kosterm., Pro Parte",
 43 | 			", Pro Parte"},
 44 | 		{"nomen", "Akeratidae Nomen Nudum", " Nomen Nudum"},
 45 | 		{"nom.", "Akeratidae nom. nudum", " nom. nudum"},
 46 | 		{"nom illeg", "Abutilon avicennae Gaertn., nom. illeg.", ", nom. illeg."},
 47 | 		{"comb", "Arthopyrenia hyalospora (Nyl.) R.C. Harris comb. nov.",
 48 | 			" comb. nov."},
 49 | 		{"p. p.", "Abarema clypearia (Jack) Kosterm., p. p.", ", p. p."},
 50 | 		{"P. P.", "Abarema clypearia (Jack) Kosterm., P. P.", ", P. P."},
 51 | 
 52 | 		// s.s.
 53 | 		{", s. s.", "Bubo bubo, s. s. nov spec something",
 54 | 			", s. s. nov spec something"},
 55 | 		{"s.s.", "Bubo bubo s.s. nov spec something",
 56 | 			" s.s. nov spec something"},
 57 | 		{"s.l.", "Bubo bubo s.l. something",
 58 | 			" s.l. something"},
 59 | 		{"s. lat.", "Bubo bubo s. lat. something",
 60 | 			" s. lat. something"},
 61 | 		{"s. str.", "Bubo bubo s. str. something",
 62 | 			" s. str. something"},
 63 | 		{"no break space", " Canadensis Erxleben, 1777 s.str.", " s.str."},
 64 | 
 65 | 		// Stop words
 66 | 		{"env", "Ge Nicéville 1895 Environmental sample",
 67 | 			" Environmental sample"},
 68 | 		{"env samples", "Candidatus Anammoxoglobus environmental samples",
 69 | 			" environmental samples"},
 70 | 		{"enrichment", "Crenarchaeote enrichment culture clone OREC-B1022",
 71 | 			" enrichment culture clone OREC-B1022"},
 72 | 		{"samples", "Candidatus Anammoxoglobus samples",
 73 | 			" samples"},
 74 | 
 75 | 		{"sec", "Ataladoris Iredale & O'Donoghue 1923 sec Eschmeyer",
 76 | 			" sec Eschmeyer"},
 77 | 		{"sec.", "Ataladoris Iredale & O'Donoghue 1923 sec. Eschmeyer",
 78 | 			" sec. Eschmeyer"},
 79 | 		{"sp compl", "Acarospora cratericola cratericola Shenk 1974 species complex",
 80 | 			" species complex"},
 81 | 		{"utf8", "× Dialaeliopsis hort.", " hort."},
 82 | 	}
 83 | 
 84 | 	assert := assert.New(t)
 85 | 	ppr := preparser.New()
 86 | 
 87 | 	for _, v := range tests {
 88 | 		idx := ppr.TailIndex(v.str)
 89 | 		assert.True(idx >= 0, v.msg)
 90 | 		assert.Equal(v.tail, string([]byte(v.str)[idx:]), v.msg)
 91 | 	}
 92 | }
 93 | 
 94 | func TestNotPreParsed(t *testing.T) { // inputs for which no tail must be reported
 95 | 	tests := []struct {
 96 | 		msg, str string
 97 | 	}{
 98 | 		{"no tail1", "Lachenalia tricolor var. nelsonii (anon.) Baker"},
 99 | 		{"S. S.", "Bubo bubo, S. S. something"},
100 | 		{"dagger", "Heteralocha acutirostris (Gould, 1837) Huia N E†"},
101 | 		{"spaces", "Heteralocha acutirostris (Gould, 1837) Huia N E   "},
102 | 		{"comma", "Abantiadinus pusillus Broun, T. , 1914"},
103 | 		{"last comma", "Acalles foveopunctatus Fiedler,"},
104 | 		{"space comma", "Calamagrostis neglecta G.Gaertn. ,B.Mey. & Scherb."},
105 | 		{"all tail", "Non splenectomized mulatta"},
106 | 		{"several commas", "Naupliicola cystifingens Michajlow, ,1968"},
107 | 		{"spp", "Crataegus curvisepala nvar. naviculiformis T. Petauer Alaria spp."},
108 | 	}
109 | 
110 | 	assert := assert.New(t)
111 | 	ppr := preparser.New()
112 | 
113 | 	for _, v := range tests {
114 | 		idx := ppr.TailIndex(v.str)
115 | 		assert.Equal(-1, idx, v.msg) // the label tells which case failed
116 | 	}
117 | }
118 | 


--------------------------------------------------------------------------------
/ent/internal/preprocess/annot.rl:
--------------------------------------------------------------------------------
 1 | package preprocess
 2 | 
 3 | import (
 4 | )
 5 | 
 6 | func AnnotationRL(data []byte) bool { // reports whether data matches the `main` machine defined below
 7 |   %%{
 8 |     machine annot;
 9 |     write data;
10 |   }%%
11 | 
12 |   cs, p, pe, eof := 0, 0, len(data), len(data)
13 |   _ = eof // this and the next three lines mark generated names as used
14 |   _ = annot_en_main
15 |   _ = annot_error
16 |   _ = annot_first_final
17 | 
18 |   var match bool // set to true by the setMatch action
19 | 
20 |   %%{
21 |     action setMatch {match = true}
22 |     action setPos {pos = append(pos,p)} # NOTE(review): not referenced by any rule below; `pos` is not declared in this function
23 | 
24 |     notes = ("species"i | "group"i | "clade"i | "authors"i | "non" | "nec" |
25 |       "fide" | "vide" );
26 |     tc1 = ("sensu"i | "auct"i | "sec"i | "near" | "str") "."?;
27 |     tc2 = "("? "s." space? ([sl] | "str" | "lat") ".";
28 |     tc3 = "pro parte"i | "p." space? "p.";
29 |     tc4 = "("? ("nomen"i | "nom."i | "comb.");
30 | 
31 |     main := any* ((space+ | "," space?)
32 |             (notes | tc1 |tc2 | tc3 | tc4))  %/setMatch
33 |             ((space | punct) >setMatch); # a marker after space/comma, ending the input or followed by space/punctuation
34 | 
35 |     write init;
36 |     write exec;
37 |   }%%
38 | 
39 |   return match
40 | }
41 | 


--------------------------------------------------------------------------------
/ent/internal/preprocess/cleanup.go:
--------------------------------------------------------------------------------
 1 | package preprocess
 2 | 
 3 | import (
 4 | 	"bytes"
 5 | 	"io"
 6 | 	"sync"
 7 | 
 8 | 	"golang.org/x/net/html"
 9 | )
10 | 
11 | var tags = map[string]struct{}{ // tags that StripTags removes; any other tag is kept
12 | 	"i":     {},
13 | 	"small": {},
14 | 	"br":    {},
15 | 	"em":    {},
16 | 	"b":     {},
17 | }
18 | 
19 | // CleanupResult pairs an input name with the outcome of StripTags for it.
20 | type CleanupResult struct {
21 | 	// Input is the original name, exactly as it was received.
22 | 	Input string
23 | 	// Output is the value StripTags returned for Input.
24 | 	Output string
25 | }
26 | 
27 | // CleanupStream starts wn workers that read names from in, strip HTML tags
28 | // and send a CleanupResult for every name to out. It blocks until in is
29 | // closed and drained by the workers, then closes out.
30 | func CleanupStream(in <-chan string, out chan<- *CleanupResult, wn int) {
31 | 	var workers sync.WaitGroup
32 | 	workers.Add(wn)
33 | 	for started := 0; started < wn; started++ {
34 | 		go cleanupWorker(in, out, &workers)
35 | 	}
36 | 	workers.Wait()
37 | 	close(out)
38 | }
39 | 
40 | func cleanupWorker(in <-chan string, out chan<- *CleanupResult,
41 | 	wg *sync.WaitGroup) {
42 | 	defer wg.Done()
43 | 	for name := range in {
44 | 		// Pair every input with its tag-free form.
45 | 		out <- &CleanupResult{Input: name, Output: StripTags(name)}
46 | 	}
47 | }
48 | 
49 | // StripTags takes a string and returns it with the common formatting tags
50 | // listed in tags removed and HTML entities unescaped. All other tags are
51 | // kept intact to let the parser deal with them. It returns "" on failure.
52 | func StripTags(s string) string {
53 | 	var buff bytes.Buffer
54 | 	r := bytes.NewReader([]byte(s))
55 | 
56 | 	tokenizer := html.NewTokenizer(r)
57 | 	for {
58 | 		if tokenizer.Next() == html.ErrorToken {
59 | 			// io.EOF marks the normal end of the input; any other error
60 | 			// means the input could not be tokenized.
61 | 			if tokenizer.Err() == io.EOF {
62 | 				return html.UnescapeString(buff.String())
63 | 			}
64 | 			return ""
65 | 		}
66 | 		// Keep a copy of the unmodified token text before asking for Token.
67 | 		tokenVal := string(tokenizer.Raw())
68 | 
69 | 		token := tokenizer.Token()
70 | 		switch token.Type {
71 | 		case html.DoctypeToken, html.CommentToken:
72 | 			// dropped from the output
73 | 
74 | 		case html.StartTagToken, html.EndTagToken, html.SelfClosingTagToken:
75 | 			// Self-closing forms such as <br/> are treated like any other
76 | 			// tag: dropped when listed in tags, copied verbatim otherwise.
77 | 			// Previously they reached the default branch and emptied the name.
78 | 			if _, ok := tags[token.Data]; !ok {
79 | 				buff.WriteString(tokenVal)
80 | 			}
81 | 
82 | 		case html.TextToken:
83 | 			buff.WriteString(tokenVal)
84 | 
85 | 		default:
86 | 			// defensive: no other token type is expected here
87 | 			return ""
88 | 		}
89 | 	}
90 | }
91 | 


--------------------------------------------------------------------------------
/ent/internal/preprocess/noparse.rl:
--------------------------------------------------------------------------------
 1 | package preprocess
 2 | 
 3 | func NoParse(data []byte) bool { // reports whether data matches the `main` machine defined below
 4 | 
 5 |   %%{
 6 |     machine noparse;
 7 |     write data;
 8 |   }%%
 9 | 
10 |   cs, p, pe, eof := 0, 0, len(data), len(data)
11 |   _ = eof // this and the next three lines mark generated names as used
12 | 	_ = noparse_first_final
13 | 	_ = noparse_error
14 | 	_ = noparse_en_main
15 | 
16 |   var match bool // set to true by the setMatch action
17 | 
18 | 
19 |   %%{
20 |     action setMatch {match = true}
21 | 
22 |     noparse1 = ("Not" | "None" | "Un" ("n"? "amed" | "identified")); # only at the very start of the input
23 |     noparse2 = any* [Ii] "nc" ("." | "ertae") space* [Ss] "ed" ("." | "is"); # anywhere in the input
24 |     noparse3 = any* (("endo" | "ecto")? "symbiont" | "phytoplasma" | space "cyano"? "bacterium"| "plasmid" "s"? | [^A-Z] "RNA" [^A-Z]*);
25 | 
26 | 
27 |     main := (noparse1 | noparse2 | noparse3) %/setMatch
28 |             ((space | punct) >setMatch); # the pattern must end the input or be followed by space/punctuation
29 | 
30 |     write init;
31 |     write exec;
32 | 
33 |   }%%
34 | 
35 |   return match
36 | }
37 | 


--------------------------------------------------------------------------------
/ent/internal/preprocess/preprocess.go:
--------------------------------------------------------------------------------
  1 | // Package preprocess performs preparsing filtering and modification of a
  2 | // scientific-name.
  3 | package preprocess
  4 | 
  5 | import (
  6 | 	"bytes"
  7 | 	"io"
  8 | 	"regexp"
  9 | 	"strings"
 10 | 	"unicode"
 11 | 
 12 | 	"github.com/gnames/gnparser/ent/internal/preparser"
 13 | 	"golang.org/x/text/unicode/norm"
 14 | )
 15 | 
 16 | var VirusException = map[string]string{ // first word -> epithet; such names skip the IsVirus check in Preprocess
 17 | 	"Aspilota":      "vector",
 18 | 	"Bembidion":     "satellites",
 19 | 	"Bolivina":      "prion",
 20 | 	"Ceylonesmus":   "vector",
 21 | 	"Cryptops":      "vector",
 22 | 	"Culex":         "vector",
 23 | 	"Dasyproctus":   "cevirus",
 24 | 	"Desmoxytes":    "vector",
 25 | 	"Dicathais":     "vector",
 26 | 	"Erateina":      "satellites",
 27 | 	"Euragallia":    "prion",
 28 | 	"Exochus":       "virus",
 29 | 	"Hilara":        "vector",
 30 | 	"Ithomeis":      "satellites",
 31 | 	"Microgoneplax": "prion",
 32 | 	"Neoaemula":     "vector",
 33 | 	"Nephodia":      "satellites",
 34 | 	"Ophion":        "virus",
 35 | 	"Phalium":       "vector",
 36 | 	"Psenulus":      "trevirus",
 37 | 	"Tidabius":      "vector",
 38 | 	"Turkozelotes":  "attavirus",
 39 | }
 40 | 
 41 | var AmbiguousException = map[string][]string{ // first word -> epithets whose first letter Preprocessor.ambiguous replaces with 'k'; presumably they collide with words the grammar treats specially - TODO confirm
 42 | 	"Aeolesthes":        {"mihi"},
 43 | 	"Agnetina":          {"den"},
 44 | 	"Agra":              {"not"},
 45 | 	"Aleuroclava":       {"complex"},
 46 | 	"Allawrencius":      {"complex"},
 47 | 	"Anisochaeta":       {"mihi"},
 48 | 	"Antaplaga":         {"dela"},
 49 | 	"Baeolidia":         {"dela"},
 50 | 	"Bolbodeomyia":      {"complex"},
 51 | 	"Bolitoglossa":      {"la"},
 52 | 	"Campylosphaera":    {"dela"},
 53 | 	"Castelnaudia":      {"spec"},
 54 | 	"Cicada":            {"complex"},
 55 | 	"Concinnum":         {"ten"},
 56 | 	"Desmoxytes":        {"des"},
 57 | 	"Dicentria":         {"dela"},
 58 | 	"Dichostasia":       {"complex"},
 59 | 	"Dimorphoceras":     {"complex"},
 60 | 	"Dischidia":         {"complex"},
 61 | 	"Ecnomus":           {"complex"},
 62 | 	"Eresus":            {"da"},
 63 | 	"Eucyclops":         {"mihi"},
 64 | 	"Eulaira":           {"dela"},
 65 | 	"Fusinus":           {"complex"},
 66 | 	"Gnathopleustes":    {"den"},
 67 | 	"Gobiosoma":         {"spec"},
 68 | 	"Gonatobotrys":      {"complex"},
 69 | 	"Heizmannia":        {"complex"},
 70 | 	"Helophorus":        {"ser"},
 71 | 	"Hemicloeina":       {"spec"},
 72 | 	"Lampona":           {"spec"},
 73 | 	"Leptonetela":       {"la"},
 74 | 	"Libystica":         {"complex"},
 75 | 	"Malamatidia":       {"zu"},
 76 | 	"Meteorus":          {"dos"},
 77 | 	"Nocaracris":        {"van"},
 78 | 	"Notozomus":         {"spec"},
 79 | 	"Ochodaeus":         {"complex"},
 80 | 	"Odontella":         {"do"},
 81 | 	"Oecetis":           {"complex"},
 82 | 	"Oedipina":          {"complex"},
 83 | 	"Oedipus":           {"complex"},
 84 | 	"Oedopinola":        {"complex"},
 85 | 	"Orcevia":           {"zu"},
 86 | 	"Paradimorphoceras": {"complex"},
 87 | 	"Paralvinella":      {"dela"},
 88 | 	"Parentia":          {"do"},
 89 | 	"Phyllospongia":     {"complex"},
 90 | 	"Plagiozopelma":     {"du"},
 91 | 	"Plectrocnemia":     {"complex"},
 92 | 	"Rubus":             {"complex"},
 93 | 	"Ruteloryctes":      {"bis"},
 94 | 	"Sceliphron":        {"complex"},
 95 | 	"Scopaeus":          {"complex"},
 96 | 	"Scoparia":          {"dela"},
 97 | 	"Selenops":          {"ab"},
 98 | 	"Semiothisa":        {"da"},
 99 | 	"Serina":            {"ser", "subser"},
100 | 	"Schizura":          {"dela"},
101 | 	"Sigipinius":        {"complex"},
102 | 	"Stegosoladidus":    {"complex"},
103 | 	"Stenoecia":         {"dos"},
104 | 	"Sympycnus":         {"du"},
105 | 	"Tetracis":          {"complex"},
106 | 	"Tetramorium":       {"do"},
107 | 	"Tortolena":         {"dela"},
108 | 	"Trichosternus":     {"spec"},
109 | 	"Trisephena":        {"complex"},
110 | 	"Zodarion":          {"van"},
111 | }
112 | 
113 | var NoParseException = map[string]string{ // first word -> epithet; such names are parsed even when NoParse matches
114 | 	"Navicula":   "bacterium",
115 | 	"Spirophora": "bacterium",
116 | }
117 | 
118 | var cultivarRankRe = regexp.MustCompile( // whitespace, then "cultivar"/"cv" or a quote character, through the end of input
119 | 	`\s+(cultivar\.?[\W_]|cv\.?[\W_]|['"‘’“”]).*$`,
120 | )
121 | 
122 | var ofWordRe = regexp.MustCompile( // whitespace, then the word "of" and a non-word character, through the end of input
123 | 	`\s+(of[\W_]).*$`,
124 | )
125 | 
126 | var dagger = []byte("†") // UTF-8 bytes of the dagger sign that hasDagger searches for
127 | 
128 | // Preprocessor keeps the features of the input that Preprocess detected.
129 | type Preprocessor struct {
130 | 	Virus       bool      // IsVirus matched and the name is not a VirusException
131 | 	Underscore  bool      // underscores in Body were replaced with spaces
132 | 	NoParse     bool      // input is blank, a virus, or matched NoParse
133 | 	DaggerChar  bool      // a dagger sign was found and overwritten with spaces
134 | 	Approximate bool      // NOTE(review): never set by Preprocess - confirm where it is used
135 | 	Annotation  bool      // procAnnot found an unparsed tail
136 | 	Body        []byte    // normalized input up to the tail
137 | 	Tail        []byte    // normalized input from the tail onwards; empty without a tail
138 | 	Ambiguous   ambiguous // substitution made for an AmbiguousException epithet
139 | }
140 | 
141 | type ambiguous struct { // records the substitution made by Preprocessor.ambiguous
142 | 	Orig  string // epithet as listed in AmbiguousException
143 | 	Subst string // the same epithet with its first letter replaced by 'k'
144 | }
145 | 
146 | var normalizer = norm.NFC // Preprocess converts its input to this form before any check
147 | 
148 | // Preprocess normalizes the input to NFC and determines its features before
149 | // parsing. Blank input, viruses and other unparseable names come back with
150 | // NoParse set and no Body. For everything else the input is split into Body
151 | // and Tail, and the flags describe what was found or changed on the way.
152 | func Preprocess(ppr *preparser.PreParser, bs []byte) *Preprocessor {
153 | 	bs = normalizer.Bytes(bs)
154 | 	res := &Preprocessor{}
155 | 
156 | 	// Empty or whitespace-only input has no fields and cannot be parsed.
157 | 	words := strings.Fields(string(bs))
158 | 	if len(words) == 0 {
159 | 		res.NoParse = true
160 | 		return res
161 | 	}
162 | 
163 | 	// check for viruses, plasmids, RNA, DNA etc., unless the name is a
164 | 	// known exception that only looks like one.
165 | 	if !isException(words, VirusException) && IsVirus(bs) {
166 | 		res.Virus = true
167 | 		res.NoParse = true
168 | 		return res
169 | 	}
170 | 
171 | 	// check for unparseable names; listed exceptions are parsed anyway.
172 | 	// NoParse stays false when an exception applies.
173 | 	if NoParse(bs) && !isException(words, NoParseException) {
174 | 		res.NoParse = true
175 | 		return res
176 | 	}
177 | 
178 | 	// Both helpers below may overwrite bytes of bs in place; the length of
179 | 	// bs does not change.
180 | 	res.DaggerChar = hasDagger(bs)
181 | 	if len(words) > 1 {
182 | 		res.ambiguous(words[0], bs)
183 | 	}
184 | 
185 | 	// Everything from the start of the unparsed part onwards becomes the
186 | 	// tail; without such a part the body is the whole input.
187 | 	bodyEnd := len(bs)
188 | 	if idx := procAnnot(ppr, bs); idx < bodyEnd {
189 | 		res.Annotation = true
190 | 		bodyEnd = idx
191 | 	}
192 | 
193 | 	// ignoring error, as it should never happen
194 | 	if changed, _ := UnderscoreToSpace(bs[:bodyEnd]); changed {
195 | 		res.Underscore = true
196 | 	}
197 | 
198 | 	// Body and Tail share the backing array of the normalized input.
199 | 	res.Body = bs[:bodyEnd]
200 | 	res.Tail = bs[bodyEnd:]
201 | 	return res
202 | }
203 | 
204 | // hasDagger blanks out the first dagger sign in bs and reports if one was found.
205 | func hasDagger(bs []byte) bool {
206 | 	idx := bytes.Index(bs, dagger)
207 | 	if idx < 0 {
208 | 		return false
209 | 	}
210 | 	// The sign is three bytes long; three spaces keep the length of bs.
211 | 	copy(bs[idx:idx+3], "   ")
212 | 	return true
213 | }
214 | 
215 | // isException reports whether words[0] is in names and its epithet follows.
216 | func isException(words []string, names map[string]string) bool {
217 | 	if len(words) < 2 {
218 | 		return false
219 | 	}
220 | 	epithet, known := names[words[0]]
221 | 	for i := 1; known && i < len(words); i++ {
222 | 		if words[i] == epithet {
223 | 			return true
224 | 		}
225 | 	}
226 | 	return false
227 | }
228 | 
229 | func (p *Preprocessor) ambiguous(firstWord string, bs []byte) { // masks listed epithets in bs, in place, and records the substitution
230 | 	if epithets, ok := AmbiguousException[firstWord]; ok {
231 | 		var sub byte = 'k' // replacement for the first letter of the epithet
232 | 		for _, epithet := range epithets {
233 | 			idx := bytes.Index(bs, []byte(" "+epithet)) // NOTE(review): prefix match, no word boundary - " complexus" also matches "complex"
234 | 			if idx == -1 {
235 | 				continue
236 | 			}
237 | 			p.Ambiguous.Orig = epithet // with several matches the last one wins
238 | 			p.Ambiguous.Subst = string(sub) + epithet[1:]
239 | 			bs[idx+1] = sub // idx is the space, idx+1 the first letter
240 | 		}
241 | 	}
242 | }
243 | 
244 | // procAnnot returns the index where the unparsed part starts. If the whole
245 | // string can be parsed, it returns the index of the end of the input, that
246 | // is len(bs).
247 | func procAnnot(ppr *preparser.PreParser, bs []byte) int {
248 | 	end := len(bs)
249 | 	if tail := ppr.TailIndex(string(bs)); tail >= 0 {
250 | 		end = tail
251 | 	}
252 | 	// If ` of ` is in the string, before the start of the already-calculated
253 | 	// unparsed part, but there is no cultivar rank marker before it, consider
254 | 	// it unparseable. `Anthurium 'Ace of Spades'` should parse fully;
255 | 	// `Anthurium Trustees of the British Museum` should not.
256 | 	ofLoc := ofWordRe.FindIndex(bs[:end])
257 | 	if len(ofLoc) == 0 || ofLoc[0] >= end {
258 | 		return end
259 | 	}
260 | 	cultivarLoc := cultivarRankRe.FindIndex(bs[:end])
261 | 	if len(cultivarLoc) > 0 && cultivarLoc[0] <= ofLoc[0] {
262 | 		return end
263 | 	}
264 | 	return ofLoc[0]
265 | }
266 | 
267 | // UnderscoreToSpace takes a slice of bytes. If it finds that the string
268 | // contains underscores, but not spaces, it substitutes underscores to spaces
269 | // in the slice. In case if any spaces are present, the slice is returned
270 | // unmodified.
271 | func UnderscoreToSpace(bs []byte) (bool, error) {
272 | 	reader := bytes.NewReader(bs)
273 | 	var hasUnderscore bool
274 | 	for {
275 | 		r, _, err := reader.ReadRune()
276 | 		if err == io.EOF {
277 | 			break
278 | 		} else if err != nil {
279 | 			return false, err
280 | 		}
281 | 		if unicode.IsSpace(r) {
282 | 			return false, nil
283 | 		}
284 | 		if r == '_' {
285 | 			hasUnderscore = true
286 | 		}
287 | 	}
288 | 	if !hasUnderscore {
289 | 		return false, nil
290 | 	}
291 | 
292 | 	for i, v := range bs {
293 | 		if v == '_' {
294 | 			bs[i] = ' '
295 | 		}
296 | 	}
297 | 	return true, nil
298 | }
299 | 


--------------------------------------------------------------------------------
/ent/internal/preprocess/preprocess_test.go:
--------------------------------------------------------------------------------
  1 | package preprocess
  2 | 
  3 | import (
  4 | 	"strings"
  5 | 	"testing"
  6 | 
  7 | 	"github.com/gnames/gnparser/ent/internal/preparser"
  8 | 	"github.com/stretchr/testify/assert"
  9 | )
 10 | 
 11 | func TestCleanup(t *testing.T) {
 12 | 	t.Run("StripTags", func(t *testing.T) {
 13 | 		cases := []struct {
 14 | 			msg    string
 15 | 			tags   string
 16 | 			notags string
 17 | 		}{
 18 | 			{"no html", "Hello", "Hello"},
 19 | 			{"html tags", "<i>Hello</i>", "Hello"},
 20 | 			{"html tags", "<I>Hello</I>", "Hello"},
 21 | 			{"bad tag", "<!--", ""},
 22 | 			{"bad tag with newline", "<!--\n", ""},
 23 | 			{"keep other tags",
 24 | 				"<code>Hello</code> & you",
 25 | 				"<code>Hello</code> & you"},
 26 | 			{"preserve case for other tags",
 27 | 				"<CODE>Hello & you</CODE>",
 28 | 				"<CODE>Hello & you</CODE>"},
 29 | 			{"unknown tags", "<NA>Hello</NA> & you", "<NA>Hello</NA> & you"},
 30 | 			{"entities", "Hello &amp; you", "Hello & you"},
 31 | 		}
 32 | 		for i := range cases {
 33 | 			assert.Equal(t, cases[i].notags, StripTags(cases[i].tags), cases[i].msg)
 34 | 		}
 35 | 	})
 36 | 	t.Run("does not return nil", func(t *testing.T) {
 37 | 		assert.NotNil(t, StripTags("<!--"))
 38 | 		assert.NotNil(t, StripTags("<!--\r\n"))
 39 | 	})
 40 | }
 41 | 
 42 | func TestPreprocess(t *testing.T) {
 43 | 	t.Run("NoParseLikeName", func(t *testing.T) {
 44 | 		data := []struct {
 45 | 			msg            string
 46 | 			name           string
 47 | 			likeAnnotation bool
 48 | 		}{
 49 | 			{"name", "Navicula bacterium", true},
 50 | 		}
 51 | 		for _, v := range data {
 52 | 			words := strings.Split(v.name, " ")
 53 | 			assert.Equal(t, v.likeAnnotation, isException(words, NoParseException), v.msg)
 54 | 		}
 55 | 	})
 56 | 
 57 | 	t.Run("VirusLikeName", func(t *testing.T) {
 58 | 		data := []struct {
 59 | 			msg       string
 60 | 			name      string
 61 | 			likeVirus bool
 62 | 		}{
 63 | 			{"name1", "Aspilota vector Belokobylskij, 2007", true},
 64 | 			{"name2", "Ceylonesmus vector Chamberlin, 1941", true},
 65 | 			{"name3", "Cryptops (Cryptops) vector Chamberlin, 1939", true},
 66 | 			{"name4", "Culex vector Dyar & Knab, 1906", true},
 67 | 			{"name5", "Dasyproctus cevirus Leclercq, 1963", true},
 68 | 			{"name6", "Desmoxytes vector (Chamberlin, 1941)", true},
 69 | 			{"name7", "Dicathais vector Thornley, 1952", true},
 70 | 			{"name8", "Euragallia prion Kramer, 1976", true},
 71 | 			{"name9", "Exochus virus Gauld & Sithole, 2002", true},
 72 | 			{"name10", "Hilara vector Miller, 1923", true},
 73 | 			{"name11", "Microgoneplax prion Castro, 2007", true},
 74 | 			{"name12", "Neoaemula vector Mackinnon, Hiller, Long & Marshall, 2008", true},
 75 | 			{"name13", "Ophion virus Gauld & Mitchell, 1981", true},
 76 | 			{"name14", "Psenulus trevirus Leclercq, 1961", true},
 77 | 			{"name15", "Tidabius vector Chamberlin, 1931", true},
 78 | 			{"name16", "Ceylonesmus prion", false},
 79 | 			{"name17", "Homo sapiens coronavirus", false},
 80 | 		}
 81 | 		for _, v := range data {
 82 | 			words := strings.Split(v.name, " ")
 83 | 			assert.Equal(t, v.likeVirus, isException(words, VirusException), v.msg)
 84 | 		}
 85 | 	})
 86 | 
 87 | 	t.Run("IsVirus", func(t *testing.T) {
 88 | 		data := []struct {
 89 | 			msg     string
 90 | 			name    string
 91 | 			isVirus bool
 92 | 		}{
 93 | 			{"No match", "Homo sapiens", false},
 94 | 			{"Match word", "Arv1virus ", true},
 95 | 			{"Match word", "Turtle herpesviruses", true},
 96 | 			{"Match word", "Cre expression vector", true},
 97 | 			{"Match word", "Abutilon mosaic vir. ICTV", true},
 98 | 			{"Match word", "Aeromonas phage 65", true},
 99 | 			{"Match word", "Apple scar skin viroid", true},
100 | 			{"Match word", "Agents of Spongiform Encephalopathies CWD prion Chronic wasting disease", true},
101 | 			{"Match word", "Phi h-like viruses", true},
102 | 			{"Match word", "Viroids", true},
103 | 			{"Match word", "Human rhinovirus A11", true},
104 | 			{"Match word", "Gossypium mustilinum symptomless alphasatellite", true},
105 | 			{"Match word", "Bemisia betasatellite LW-2014", true},
106 | 			{"Match word", "Intracisternal A-particles", true},
107 | 			{"Match word", "Uranotaenia sapphirina NPV", true},
108 | 			{"Match word", "Spodoptera frugiperda MNPV", true},
109 | 			{"Match word", "Mamestra configurata NPV-A", true},
110 | 			{"Match word", "Bacteriophage PH75", true},
111 | 		}
112 | 		for _, v := range data {
113 | 			res := IsVirus([]byte(v.name))
114 | 			assert.Equal(t, v.isVirus, res, v.msg)
115 | 		}
116 | 	})
117 | 
118 | 	t.Run("NoParse", func(t *testing.T) {
119 | 		data := []struct {
120 | 			msg    string
121 | 			name   string
122 | 			parsed bool
123 | 		}{
124 | 			{"No match", "Homo sapiens", false},
125 | 			{"No word at the start", "Not Homo sapiens", true},
126 | 			{"Noword at the start", "Nothomo sapiens", false},
127 | 			{"Not word at the start", "Not Homo sapiens", true},
128 | 			{"None word at the start", "None Homo sapiens", true},
129 | 			{"Unidentified at the start", "Unidentified species", true},
130 | 			{"Incertae sedis1", "incertae sedis", true},
131 | 			{"Incertae sedis2", "Incertae Sedis", true},
132 | 			{"Incertae sedis3", "Something incertae sedis", true},
133 | 			{"Incertae sedis4", "Homo sapiens inc.sed.", true},
134 | 			{"Incertae sedis5", "Incertae sedis", true},
135 | 			{"Phytoplasma in the middle", "Homo sapiensphytoplasmaoid", false},
136 | 			{"Phytoplasma in the end", "Homo sapiensphytoplasma Linn", true},
137 | 			{"Phytoplasma in the end", "Homo sapiensphytoplasma Linn", true},
138 | 			{"Plasmid1", "E. coli plasmids", true},
139 | 			{"Plasmid2", "E. coli plasmidia", false},
140 | 			{"Plasmid3", "E. coli plasmid", true},
141 | 			{"RNA1", "E. coli RNA", true},
142 | 			{"RNA2", "E. coli 32RNA", true},
143 | 			{"RNA3", "KURNAKOV", false},
144 | 			{"RNA4", "E. coli mRNA", true},
145 | 		}
146 | 		for _, v := range data {
147 | 			res := NoParse([]byte(v.name))
148 | 			assert.Equal(t, v.parsed, res, v.msg)
149 | 		}
150 | 	})
151 | 
152 | 	t.Run("Annotations", func(t *testing.T) {
153 | 		tests := []struct {
154 | 			msg  string
155 | 			in   string
156 | 			out  string
157 | 			tail string
158 | 		}{
159 | 
160 | 			{"No tail", "Homo sapiens", "Homo sapiens", ""},
161 | 			{"S. S.", "Homo sapiens S. S.", "Homo sapiens S. S.", ""},
162 | 			{"s. s.", "Homo sapiens s. s.", "Homo sapiens", " s. s."},
163 | 			{"sensu", "Homo sapiens sensu Linn.", "Homo sapiens", " sensu Linn."},
164 | 			{"nomen", "Homo sapiens nomen nudum", "Homo sapiens", " nomen nudum"},
165 | 		}
166 | 		ppr := preparser.New()
167 | 
168 | 		for _, v := range tests {
169 | 			bs := []byte(v.in)
170 | 			i := procAnnot(ppr, bs)
171 | 			assert.Equal(t, v.out, string(bs[0:i]), v.msg)
172 | 			assert.Equal(t, v.tail, string(bs[i:]), v.msg)
173 | 		}
174 | 	})
175 | 
176 | 	t.Run("UnderscoreToSpace", func(t *testing.T) {
177 | 		data := []struct {
178 | 			msg     string
179 | 			in      string
180 | 			out     string
181 | 			changed bool
182 | 		}{
183 | 			{"no nothing", "Hello", "Hello", false},
184 | 			{"has spaces", "Hello_you !", "Hello_you !", false},
185 | 			{"has spaces", "Hello_you\t!", "Hello_you\t!", false},
186 | 			{"has only underscores", "Hello_you_!_", "Hello you ! ", true},
187 | 		}
188 | 		for _, v := range data {
189 | 			bs := []byte(v.in)
190 | 			changed2, _ := UnderscoreToSpace(bs)
191 | 			assert.Equal(t, v.out, string(bs), v.msg)
192 | 			assert.Equal(t, v.changed, changed2)
193 | 		}
194 | 	})
195 | 
196 | 	t.Run("does not remove spaces", func(t *testing.T) {
197 | 		name := "    Asplenium       × inexpectatum(E. L. Braun ex Friesner      )Morton"
198 | 		ppr := preparser.New()
199 | 		res := Preprocess(ppr, []byte(name))
200 | 		assert.Equal(t, name, string(res.Body))
201 | 	})
202 | }
203 | 


--------------------------------------------------------------------------------
/ent/internal/preprocess/virus.rl:
--------------------------------------------------------------------------------
 1 | package preprocess
 2 | 
// IsVirus reports whether data contains a word that is typical for names of
// viruses and similar entities: a word ending with "virus"/"viruses" or
// "npv", the words "phage", "vector", "viroid", "particle", "prion",
// "satellite" (some with optional prefixes and an optional plural "s"), or
// the abbreviation "ICTV"/"Ictv". Apart from the abbreviation, matching is
// case-insensitive.
//
// The word has to start at the beginning of data or right after a space or
// a punctuation character, and has to be followed by a space, a punctuation
// character, or the end of data.
//
// The matching is described by the Ragel specification embedded below.
func IsVirus(data []byte) bool {
  %%{
    machine virus;
    write data;
  }%%

  cs, p, pe, eof := 0, 0, len(data), len(data)
  // Blank assignments prevent "declared and not used" compile errors for
  // names that the rest of the function might not reference (the virus_*
  // names are presumably emitted by `write data` -- confirm in virus.go).
  _ = eof
  _ = virus_en_main
  _ = virus_error
  _ = virus_first_final

  // match is switched on by the setMatch action of the state machine.
  var match bool

  %%{
    action setMatch {match = true}

    vir_str = (alnum* "virus"i "es"i?) |
              'ICTV' | 'Ictv' |
              ("cyano"i | "bacterio"i | "viro"i)? "phage"i "s"i? |
              ("vector"i | "viroid"i | "particle"i | "prion"i) "s"i? |
              alnum* "npv"i |
              ("alpha"i | "beta"i)? "satellite"i "s"i?;


    main := ('' | any* (space | punct))
            vir_str %/setMatch
            ((space | punct) >setMatch);

    write init;
    write exec;

  }%%

  return match
}
39 | 


--------------------------------------------------------------------------------
/ent/nameidx/nameidx.go:
--------------------------------------------------------------------------------
 1 | // Package nameidx provides a structure that preserves original position
 2 | // of a name-string in an input slice.
 3 | package nameidx
 4 | 
// NameIdx pairs an input name-string with its position in the input
// slice, so that the original order of the input is not lost.
type NameIdx struct {
	// Index is the position of a string in the input slice.
	Index int

	// NameString is the input string.
	NameString string
}
14 | 


--------------------------------------------------------------------------------
/ent/parsed/annotation.go:
--------------------------------------------------------------------------------
 1 | package parsed
 2 | 
 3 | import (
 4 | 	"errors"
 5 | 	"strings"
 6 | )
 7 | 
// Annotation is an additional description of a name type. Its textual
// form is defined by annotMap.
type Annotation int

const (
	// NoAnnot is absence of additional descriptions.
	NoAnnot Annotation = iota
	// SurrogateAnnot is a miscellaneous informal name.
	SurrogateAnnot
	// ComparisonAnnot is a name with a comparison marker (cf.).
	ComparisonAnnot
	// ApproximationAnnot is a name with approximation annotation (sp., spp etc.)
	ApproximationAnnot
	// BOLDAnnot is a surrogate name created by BOLD project.
	BOLDAnnot
	// HybridAnnot is a miscellaneous hybrid name.
	HybridAnnot
	// NamedHybridAnnot is a stable hybrid in botany with registered name.
	NamedHybridAnnot
	// HybridFormulaAnnot is a hybrid created by combination of 2 or more names.
	HybridFormulaAnnot
	// NothoHybridAnnot is a hybrid with notho- 'ranks'.
	NothoHybridAnnot
	// GraftChimeraAnnot is a miscellaneous graft-chimera name.
	GraftChimeraAnnot
	// GraftChimeraFormulaAnnot is a graft-chimera created by the combination
	// of 2 or more names.
	GraftChimeraFormulaAnnot
	// NamedGraftChimeraAnnot is a stable graft-chimera in botany with registered name.
	NamedGraftChimeraAnnot
)
37 | 
38 | var annotMap = map[Annotation]string{
39 | 	NoAnnot:                  "",
40 | 	SurrogateAnnot:           "SURROGATE",
41 | 	ComparisonAnnot:          "COMPARISON",
42 | 	ApproximationAnnot:       "APPROXIMATION",
43 | 	BOLDAnnot:                "BOLD_SURROGATE",
44 | 	HybridAnnot:              "HYBRID",
45 | 	NamedHybridAnnot:         "NAMED_HYBRID",
46 | 	HybridFormulaAnnot:       "HYBRID_FORMULA",
47 | 	NothoHybridAnnot:         "NOTHO_HYBRID",
48 | 	GraftChimeraFormulaAnnot: "GRAFT_CHIMERA_FORMULA",
49 | 	NamedGraftChimeraAnnot:   "NAMED_GRAFT_CHIMERA",
50 | }
51 | 
52 | var annotStrMap = func() map[string]Annotation {
53 | 	res := make(map[string]Annotation)
54 | 	for k, v := range annotMap {
55 | 		res[v] = k
56 | 	}
57 | 	return res
58 | }()
59 | 
60 | // String is an implementation of fmt.Stringer interface.
61 | func (a Annotation) String() string {
62 | 	return annotMap[a]
63 | }
64 | 
65 | // MarshalJSON implements json.Marshaler.
66 | // It will encode null if this Int is null.
67 | func (a Annotation) MarshalJSON() ([]byte, error) {
68 | 	return []byte("\"" + a.String() + "\""), nil
69 | }
70 | 
71 | // UnmarshalJSON implements json.Unmarshaller.
72 | func (a *Annotation) UnmarshalJSON(bs []byte) error {
73 | 	var err error
74 | 	var ok bool
75 | 	// strings.Trim seems to be ~10 time faster here than
76 | 	// json-iter Unmarshal
77 | 	s := strings.Trim(string(bs), `"`)
78 | 	*a, ok = annotStrMap[s]
79 | 	if !ok {
80 | 		err = errors.New("cannot decode Annotation")
81 | 	}
82 | 	return err
83 | }
84 | 


--------------------------------------------------------------------------------
/ent/parsed/annotation_test.go:
--------------------------------------------------------------------------------
 1 | package parsed_test
 2 | 
 3 | import (
 4 | 	"testing"
 5 | 
 6 | 	"github.com/gnames/gnfmt"
 7 | 	"github.com/gnames/gnparser/ent/parsed"
 8 | 	"github.com/stretchr/testify/assert"
 9 | )
10 | 
11 | func TestStringAnnot(t *testing.T) {
12 | 	data := []struct {
13 | 		annot parsed.Annotation
14 | 		res   string
15 | 	}{
16 | 		{parsed.NoAnnot, ""},
17 | 		{parsed.ComparisonAnnot, "COMPARISON"},
18 | 		{parsed.ApproximationAnnot, "APPROXIMATION"},
19 | 		{parsed.SurrogateAnnot, "SURROGATE"},
20 | 	}
21 | 
22 | 	for i := range data {
23 | 		assert.Equal(t, data[i].res, data[i].annot.String())
24 | 	}
25 | }
26 | 
27 | func TestJSONAnnot(t *testing.T) {
28 | 	type dataOb struct {
29 | 		Field1 string            `json:"f1"`
30 | 		Annot  parsed.Annotation `json:"annot"`
31 | 		Field2 []int             `json:"f2"`
32 | 	}
33 | 	data := []struct {
34 | 		dob dataOb
35 | 		res string
36 | 	}{
37 | 		{dataOb{"None", parsed.NoAnnot, []int{}},
38 | 			`{"f1":"None","annot":"","f2":[]}`},
39 | 		{dataOb{"Comparison", parsed.ComparisonAnnot, []int{2, 3, 4}},
40 | 			`{"f1":"Comparison","annot":"COMPARISON","f2":[2,3,4]}`},
41 | 	}
42 | 	enc := gnfmt.GNjson{}
43 | 	var dob dataOb
44 | 	for i := range data {
45 | 		res, err := enc.Encode(data[i].dob)
46 | 		assert.Nil(t, err)
47 | 		assert.Equal(t, data[i].res, string(res))
48 | 		err = enc.Decode(res, &dob)
49 | 		assert.Nil(t, err)
50 | 		assert.Equal(t, data[i].dob.Annot, dob.Annot)
51 | 	}
52 | }
53 | 


--------------------------------------------------------------------------------
/ent/parsed/details.go:
--------------------------------------------------------------------------------
  1 | package parsed
  2 | 
// Uninomial contains details for names with cardinality 1. In JSON only
// the "uninomial" field is always present, the other fields are omitted
// when they are empty.
type Uninomial struct {
	// Value is the uninomial name.
	Value string `json:"uninomial"`
	// Rank of the uninomial in a combination name, for example
	// "Pereskia subg. Maihuenia Philippi ex F.A.C.Weber, 1898"
	Rank string `json:"rank,omitempty"`
	// Cultivar is a value of a cultivar of a uninomial.
	Cultivar string `json:"cultivar,omitempty"`
	// Parent of a uninomial in a combination name.
	Parent string `json:"parent,omitempty"`
	// Authorship of the uninomial.
	Authorship *Authorship `json:"authorship,omitempty"`
}
 17 | 
// Species contains details for binomial names with cardinality 2. In JSON
// the "genus" and "species" fields are always present, the other fields are
// omitted when they are empty.
type Species struct {
	// Genus is a value of a genus of a binomial.
	Genus string `json:"genus"`
	// Subgenus is a value of subgenus of binomial.
	Subgenus string `json:"subgenus,omitempty"`
	// Species is a value of a specific epithet.
	Species string `json:"species"`
	// Cultivar is a value of a cultivar of a binomial.
	Cultivar string `json:"cultivar,omitempty"`
	// Authorship of the binomial.
	Authorship *Authorship `json:"authorship,omitempty"`
}
 31 | 
// Infraspecies contains details for names with cardinality higher than 2.
// It embeds Species, so the fields of the binomial part are available
// directly on an Infraspecies value.
type Infraspecies struct {
	// Species are details for the binomial part of a name.
	Species
	// Infraspecies is a slice of infraspecific epithets of a name.
	Infraspecies []InfraspeciesElem `json:"infraspecies,omitempty"`
}
 39 | 
// InfraspeciesElem contains details for one infraspecific epithet of an
// Infraspecies name: the epithet itself and, optionally, its rank and
// authorship.
type InfraspeciesElem struct {
	// Value of an infraspecific epithet.
	Value string `json:"value"`
	// Rank of the infraspecific epithet.
	Rank string `json:"rank,omitempty"`
	// Authorship of the infraspecific epithet.
	Authorship *Authorship `json:"authorship,omitempty"`
}
 50 | 
// Comparison contains details for a surrogate comparison name.
//
// Note that both the Genus field and the embedded Species (via its own
// Genus field) use the "genus" JSON key.
type Comparison struct {
	// Genus is used if no species information is given.
	Genus string `json:"genus,omitempty"`

	// Species are details for the binomial part of a name. The pointer is
	// embedded, so it has to be non-nil before its fields are accessed.
	*Species

	// InfraSpecies is an infraspecific epithet of a name.
	InfraSpecies *InfraspeciesElem `json:"infraspecies,omitempty"`

	// CompMarker, usually "cf.".
	CompMarker string `json:"comparisonMarker"`
}
 65 | 
// Approximation contains details for a surrogate approximation name. In
// JSON only the "genus" field is always present.
type Approximation struct {
	// Genus is the genus of a name.
	Genus string `json:"genus"`
	// Species is a specific epithet of a name.
	Species string `json:"species,omitempty"`
	// Cultivar is a value of a cultivar of a binomial.
	Cultivar string `json:"cultivar,omitempty"`
	// SpeciesAuthorship is the authorship of Species. It is serialized
	// under the "authorship" key.
	SpeciesAuthorship *Authorship `json:"authorship,omitempty"`
	// ApproxMarker describes what kind of approximation it is (sp., spp. etc.).
	ApproxMarker string `json:"approximationMarker,omitempty"`
	// Ignored is the part of a name after ApproxMarker.
	Ignored string `json:"ignored,omitempty"`
}
 81 | 
// DetailsHybridFormula contains details for hybrid formula names: one
// Details value per name of the formula.
type DetailsHybridFormula struct {
	HybridFormula []Details `json:"hybridFormula"`
}

// DetailsGraftChimeraFormula contains details for graft-chimera formula
// names: one Details value per name of the formula.
type DetailsGraftChimeraFormula struct {
	GraftChimeraFormula []Details `json:"graftChimeraFormula"`
}

// isDetails marks DetailsHybridFormula as an implementation of the Details
// interface.
func (DetailsHybridFormula) isDetails() {}

// isDetails marks DetailsGraftChimeraFormula as an implementation of the
// Details interface.
func (DetailsGraftChimeraFormula) isDetails() {}
 97 | 
// DetailsUninomial wraps Uninomial details, so they can be used as a
// Details value.
type DetailsUninomial struct {
	// Uninomial details.
	Uninomial Uninomial `json:"uninomial"`
}

// isDetails marks DetailsUninomial as an implementation of the Details
// interface.
func (DetailsUninomial) isDetails() {}
106 | 
// DetailsSpecies wraps binomial details, so they can be used as a Details
// value.
type DetailsSpecies struct {
	// Species is details for binomial names.
	Species Species `json:"species"`
}

// isDetails marks DetailsSpecies as an implementation of the Details
// interface.
func (DetailsSpecies) isDetails() {}
115 | 
// DetailsInfraspecies wraps multinomial details, so they can be used as a
// Details value.
type DetailsInfraspecies struct {
	// Infraspecies details.
	Infraspecies Infraspecies `json:"infraspecies"`
}

// isDetails marks DetailsInfraspecies as an implementation of the Details
// interface.
func (DetailsInfraspecies) isDetails() {}
124 | 
// DetailsComparison wraps details of comparison surrogate names, so they
// can be used as a Details value.
type DetailsComparison struct {
	// Comparison details.
	Comparison Comparison `json:"comparison"`
}

// isDetails marks DetailsComparison as an implementation of the Details
// interface.
func (DetailsComparison) isDetails() {}
133 | 
// DetailsApproximation wraps details of approximation surrogate names, so
// they can be used as a Details value.
type DetailsApproximation struct {
	// Approximation details.
	Approximation Approximation `json:"approximation"`
}

// isDetails marks DetailsApproximation as an implementation of the Details
// interface.
func (DetailsApproximation) isDetails() {}
142 | 


--------------------------------------------------------------------------------
/ent/parsed/flatten_test.go:
--------------------------------------------------------------------------------
  1 | package parsed_test
  2 | 
  3 | import (
  4 | 	"testing"
  5 | 
  6 | 	"github.com/gnames/gnparser/ent/parsed"
  7 | 	"github.com/stretchr/testify/assert"
  8 | )
  9 | 
// TestFlatten checks the conversion of a nested Parsed structure to the
// flat ParsedFlat structure made by Parsed.Flatten. Every case is executed
// as a subtest that compares the whole resulting structure with the
// expected one.
func TestFlatten(t *testing.T) {
	tests := []struct {
		name     string
		input    parsed.Parsed
		expected parsed.ParsedFlat
	}{
		{
			// A parsed binomial with canonical forms, both authorship
			// groups and species details. Nil annotations are expected to
			// become empty strings.
			name: "Parsed with all fields",
			input: parsed.Parsed{
				Parsed:        true,
				NomCode:       "ICZN",
				ParseQuality:  1,
				Verbatim:      "Aus bus",
				Normalized:    "Aus bus",
				Cardinality:   2,
				Rank:          "species",
				Candidatus:    true,
				Virus:         false,
				Cultivar:      false,
				DaggerChar:    false,
				Hybrid:        nil,
				GraftChimera:  nil,
				Surrogate:     nil,
				Tail:          "tail",
				VerbatimID:    "12345",
				ParserVersion: "1.0.0",
				Canonical: &parsed.Canonical{
					Simple:  "Aus bus",
					Full:    "Aus bus",
					Stemmed: "Aus bus",
				},
				Authorship: &parsed.Authorship{
					Verbatim: "L.",
					Original: &parsed.AuthGroup{
						Authors: []string{"Linnaeus"},
						Year:    &parsed.Year{Value: "1758"},
					},
					Combination: &parsed.AuthGroup{
						Authors: []string{"Smith"},
						Year:    &parsed.Year{Value: "1800"},
					},
				},
				Details: parsed.DetailsSpecies{
					Species: parsed.Species{
						Genus:   "Aus",
						Species: "bus",
					},
				},
			},
			expected: parsed.ParsedFlat{
				Parsed:                    true,
				NomCode:                   "ICZN",
				ParseQuality:              1,
				Verbatim:                  "Aus bus",
				Normalized:                "Aus bus",
				Cardinality:               2,
				Rank:                      "species",
				Candidatus:                true,
				Virus:                     false,
				Cultivar:                  false,
				DaggerChar:                false,
				Hybrid:                    "",
				GraftChimera:              "",
				Surrogate:                 "",
				Tail:                      "tail",
				VerbatimID:                "12345",
				ParserVersion:             "1.0.0",
				CanonicalSimple:           "Aus bus",
				CanonicalFull:             "Aus bus",
				CanonicalStemmed:          "Aus bus",
				Authorship:                "L.",
				BasionymAuthorship:        "Linnaeus",
				BasionymAuthorshipYear:    "1758",
				CombinationAuthorship:     "Smith",
				CombinationAuthorshipYear: "1800",
				Genus:                     "Aus",
				Subgenus:                  "",
				Species:                   "bus",
			},
		},
		{
			// An unparsed name: Canonical, Authorship and Details are not
			// set, and the corresponding flat fields are expected to keep
			// their zero values.
			name: "Parsed with minimal fields",
			input: parsed.Parsed{
				Parsed:        false,
				NomCode:       "ICZN",
				ParseQuality:  0,
				Verbatim:      "Unknown",
				VerbatimID:    "67890",
				ParserVersion: "1.0.0",
			},
			expected: parsed.ParsedFlat{
				Parsed:        false,
				NomCode:       "ICZN",
				ParseQuality:  0,
				Verbatim:      "Unknown",
				VerbatimID:    "67890",
				ParserVersion: "1.0.0",
			},
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			result := tt.input.Flatten()
			assert.Equal(t, tt.expected, result)
		})
	}
}
118 | 


--------------------------------------------------------------------------------
/ent/parsed/interface.go:
--------------------------------------------------------------------------------
1 | package parsed
2 | 
// Details is a placeholder interface that allows to unify details of
// various name types. Its only method is unexported, so the interface can
// be implemented only by types of this package.
type Details interface {
	// isDetails is a placeholder method.
	isDetails()
}
9 | 


--------------------------------------------------------------------------------
/ent/parsed/output.go:
--------------------------------------------------------------------------------
 1 | package parsed
 2 | 
 3 | import (
 4 | 	"strconv"
 5 | 
 6 | 	"github.com/gnames/gnfmt"
 7 | )
 8 | 
 9 | // Output creates a JSON or CSV representation of Parsed results.
10 | func (p Parsed) Output(f gnfmt.Format) string {
11 | 	switch f {
12 | 	case gnfmt.CSV:
13 | 		return p.csvOutput(',')
14 | 	case gnfmt.TSV:
15 | 		return p.csvOutput('\t')
16 | 	case gnfmt.CompactJSON:
17 | 		return p.jsonOutput(false)
18 | 	case gnfmt.PrettyJSON:
19 | 		return p.jsonOutput(true)
20 | 	default:
21 | 		return "N/A"
22 | 	}
23 | }
24 | 
25 | // HeadersCSV returns the CSV header for parsing output.
26 | func HeaderCSV(f gnfmt.Format) string {
27 | 	header := []string{"Id", "Verbatim", "Cardinality", "CanonicalStem",
28 | 		"CanonicalSimple", "CanonicalFull", "Authorship", "Year", "Quality"}
29 | 	switch f {
30 | 	case gnfmt.CSV:
31 | 		return gnfmt.ToCSV(header, ',')
32 | 	case gnfmt.TSV:
33 | 		return gnfmt.ToCSV(header, '\t')
34 | 	default:
35 | 		return ""
36 | 	}
37 | }
38 | 
39 | func (p Parsed) csvOutput(sep rune) string {
40 | 	var stem, simple, full, authorship, year string
41 | 	if p.Canonical != nil {
42 | 		stem = p.Canonical.Stemmed
43 | 		simple = p.Canonical.Simple
44 | 		full = p.Canonical.Full
45 | 	}
46 | 
47 | 	if p.Authorship != nil {
48 | 		authorship = p.Authorship.Verbatim
49 | 		year = p.Authorship.Year
50 | 	}
51 | 
52 | 	res := []string{
53 | 		p.VerbatimID,
54 | 		p.Verbatim,
55 | 		strconv.Itoa(p.Cardinality),
56 | 		stem,
57 | 		simple,
58 | 		full,
59 | 		authorship,
60 | 		year,
61 | 		strconv.Itoa(p.ParseQuality),
62 | 	}
63 | 	return gnfmt.ToCSV(res, sep)
64 | }
65 | 
66 | func (p Parsed) jsonOutput(pretty bool) string {
67 | 	enc := gnfmt.GNjson{Pretty: pretty}
68 | 	res, _ := enc.Encode(p)
69 | 	return string(res)
70 | }
71 | 


--------------------------------------------------------------------------------
/ent/parsed/parsed_result.go:
--------------------------------------------------------------------------------
 1 | package parsed
 2 | 
 3 | import "fmt"
 4 | 
// ParsedWithIdx structure contains parsing output, its place in the
// slice, and an unexpected error, if it happened during the parsing.
// The Index and Unpack methods give access to these data.
type ParsedWithIdx struct {
	Idx    int
	Parsed Parsed
	Error  error
}
12 | 
// Index returns the value of the Idx field.
func (pr ParsedWithIdx) Index() int {
	return pr.Idx
}
16 | 
17 | func (pr ParsedWithIdx) Unpack(v interface{}) error {
18 | 	if pr.Error != nil {
19 | 		return pr.Error
20 | 	}
21 | 	switch p := v.(type) {
22 | 	case *Parsed:
23 | 		*p = pr.Parsed
24 | 		return nil
25 | 	default:
26 | 		return fmt.Errorf("cannot use %T as Parsed", v)
27 | 	}
28 | }
29 | 


--------------------------------------------------------------------------------
/ent/parsed/restore_ambiguous.go:
--------------------------------------------------------------------------------
 1 | package parsed
 2 | 
 3 | import (
 4 | 	"strings"
 5 | 
 6 | 	"github.com/gnames/gnparser/ent/stemmer"
 7 | )
 8 | 
 9 | // RestoreAmbiguous method is used for cases where specific or infra-specific
10 | // epithets had to be changed to be parsed sucessfully. Such situation
11 | // arises when an epithet is the same as some word that is also an
12 | // annotation, a prefix/suffix of an author name etc.
13 | func (p *Parsed) RestoreAmbiguous(epithet, subst string) {
14 | 	stem := stemmer.Stem(epithet).Stem
15 | 	stemSubst := stemmer.Stem(subst).Stem
16 | 	p.Normalized = restoreString(p.Normalized, epithet, subst)
17 | 	p.Canonical.Full = restoreString(p.Canonical.Full, epithet, subst)
18 | 	p.Canonical.Simple = restoreString(p.Canonical.Simple, epithet, subst)
19 | 	p.Canonical.Stemmed = restoreString(p.Canonical.Stemmed, stem, stemSubst)
20 | 
21 | 	for i := range p.Words {
22 | 		p.Words[i].Verbatim = restoreWord(p.Words[i].Verbatim, epithet, subst)
23 | 		p.Words[i].Normalized = restoreWord(p.Words[i].Normalized, epithet, subst)
24 | 	}
25 | 
26 | 	if sp, ok := p.Details.(DetailsSpecies); ok {
27 | 		sp.Species.Species = restoreWord(sp.Species.Species, epithet, subst)
28 | 		p.Details = sp
29 | 	}
30 | }
31 | 
// restoreString finds the first space-separated word of s that starts with
// subst and replaces this prefix with epithet, keeping the rest of the word.
// Only the first such word is changed. If no word starts with subst, s is
// returned as is.
func restoreString(s, epithet, subst string) string {
	words := strings.Split(s, " ")
	for i := range words {
		if strings.HasPrefix(words[i], subst) {
			// The matched prefix is subst, so its length (not the length
			// of epithet) determines where the rest of the word starts.
			words[i] = epithet + words[i][len(subst):]
			return strings.Join(words, " ")
		}
	}
	return s
}
42 | 
// restoreWord replaces the subst prefix of w with epithet, keeping the rest
// of the word. If w does not start with subst, it is returned as is.
func restoreWord(w, epithet, subst string) string {
	if !strings.HasPrefix(w, subst) {
		return w
	}
	// The matched prefix is subst, so its length (not the length of
	// epithet) determines where the rest of the word starts.
	return epithet + w[len(subst):]
}
49 | 


--------------------------------------------------------------------------------
/ent/parsed/warning_test.go:
--------------------------------------------------------------------------------
 1 | package parsed_test
 2 | 
 3 | import (
 4 | 	"testing"
 5 | 
 6 | 	"github.com/gnames/gnfmt"
 7 | 	"github.com/gnames/gnparser/ent/parsed"
 8 | 	"github.com/stretchr/testify/assert"
 9 | )
10 | 
11 | func TestStringWarn(t *testing.T) {
12 | 	data := []struct {
13 | 		annot parsed.Warning
14 | 		res   string
15 | 	}{
16 | 		{parsed.TailWarn, "Unparsed tail"},
17 | 	}
18 | 
19 | 	for i := range data {
20 | 		assert.Equal(t, data[i].res, data[i].annot.String())
21 | 	}
22 | }
23 | 
24 | func TestJSONWarn(t *testing.T) {
25 | 	type dataOb struct {
26 | 		Field1 string         `json:"f1"`
27 | 		Warn   parsed.Warning `json:"warning"`
28 | 		Field2 []int          `json:"f2"`
29 | 	}
30 | 	data := []struct {
31 | 		dob dataOb
32 | 		res string
33 | 	}{
34 | 		{dataOb{"Tail", parsed.TailWarn, []int{}},
35 | 			`{"f1":"Tail","warning":"Unparsed tail","f2":[]}`},
36 | 		{dataOb{"AuthEx", parsed.AuthExWarn, []int{2, 3, 4}},
37 | 			`{"f1":"AuthEx","warning":"` + "`ex`" + ` authors are not required (ICZN only)","f2":[2,3,4]}`},
38 | 	}
39 | 	enc := gnfmt.GNjson{}
40 | 	var dob dataOb
41 | 	for i := range data {
42 | 		res, err := enc.Encode(data[i].dob)
43 | 		assert.Nil(t, err)
44 | 		assert.Equal(t, data[i].res, string(res))
45 | 		err = enc.Decode(res, &dob)
46 | 		assert.Nil(t, err)
47 | 		assert.Equal(t, data[i].dob.Warn, dob.Warn)
48 | 	}
49 | }
50 | 


--------------------------------------------------------------------------------
/ent/parsed/words.go:
--------------------------------------------------------------------------------
  1 | package parsed
  2 | 
  3 | import (
  4 | 	"errors"
  5 | 	"strings"
  6 | 
  7 | 	"github.com/gnames/gnparser/ent/stemmer"
  8 | )
  9 | 
// Word represents a parsed word and its meaning in the name-string.
// The word is described by its verbatim and normalized values, semantic
// type, and start/end positions.
type Word struct {
	// Verbatim is unmodified value of a word.
	Verbatim string `json:"verbatim"`
	// Normalized is normalized value of a word.
	Normalized string `json:"normalized"`
	// Type is a semantic meaning of a word. In JSON it is stored in the
	// "wordType" field.
	Type WordType `json:"wordType"`
	// Start is the index of the first letter of a word.
	Start int `json:"start"`
	// End is the index of the end of a word.
	End int `json:"end"`
}
 23 | 
 24 | // NormalizeByType is useful when searching for a word alone.
 25 | // In such cases specific epithets will match better when stemmed,
 26 | // authors and genera low-cased, authors with stripped periods.
 27 | //
 28 | // The wrd argument supposed to be taken from `Words` field of the
 29 | // `Parsed` output.
 30 | func NormalizeByType(wrd string, wt WordType) string {
 31 | 	var res string
 32 | 	switch wt {
 33 | 	case SpEpithetType, InfraspEpithetType:
 34 | 		res = stemmer.Stem(wrd).Stem
 35 | 	case UninomialType, GenusType:
 36 | 		res = strings.ToLower(wrd)
 37 | 	case AuthorWordType:
 38 | 		res = strings.ToLower(wrd)
 39 | 	default:
 40 | 		res = strings.ToLower(wrd)
 41 | 	}
 42 | 	return res
 43 | }
 44 | 
// WordType designates semantic meaning of a word. The textual form of
// every type is defined by wordTypeMap.
type WordType int

// The numeric values of word types are assigned by iota, so they depend on
// the order of the constants below.
const (
	UnknownType WordType = iota
	ComparisonMarkerType
	CultivarType
	ApproxMarkerType
	AuthorWordType
	AuthorWordFiliusType
	CandidatusType
	GenusType
	InfraspEpithetType
	HybridCharType
	GraftChimeraCharType
	RankType
	SpEpithetType
	SubgenusType
	SuperspType
	UninomialType
	YearApproximateType
	YearType
)
 68 | 
 69 | var wordTypeMap = map[WordType]string{
 70 | 	UnknownType:          "WORD",
 71 | 	ComparisonMarkerType: "COMPARISON_MARKER",
 72 | 	CultivarType:         "CULTIVAR",
 73 | 	ApproxMarkerType:     "APPROXIMATION_MARKER",
 74 | 	AuthorWordType:       "AUTHOR_WORD",
 75 | 	AuthorWordFiliusType: "AUTHOR_WORD_FILIUS",
 76 | 	CandidatusType:       "CANDIDATUS",
 77 | 	GenusType:            "GENUS",
 78 | 	HybridCharType:       "HYBRID_CHAR",
 79 | 	GraftChimeraCharType: "GRAFT_CHIMERA_CHAR",
 80 | 	InfraspEpithetType:   "INFRASPECIES",
 81 | 	RankType:             "RANK",
 82 | 	SpEpithetType:        "SPECIES",
 83 | 	SubgenusType:         "INFRA_GENUS",
 84 | 	UninomialType:        "UNINOMIAL",
 85 | 	YearApproximateType:  "APPROXIMATE_YEAR",
 86 | 	YearType:             "YEAR",
 87 | }
 88 | 
 89 | var wordTypeStrMap = func() map[string]WordType {
 90 | 	res := make(map[string]WordType)
 91 | 	for k, v := range wordTypeMap {
 92 | 		res[v] = k
 93 | 	}
 94 | 	return res
 95 | }()
 96 | 
 97 | // String is an implementation of fmt.Stringer interface.
 98 | func (wt WordType) String() string {
 99 | 	return wordTypeMap[wt]
100 | }
101 | 
102 | // MarshalJSON implements json.Marshaler.
103 | func (wt WordType) MarshalJSON() ([]byte, error) {
104 | 	return []byte("\"" + wt.String() + "\""), nil
105 | }
106 | 
107 | // UnmarshalJSON implements json.Unmarshaller.
108 | func (wt *WordType) UnmarshalJSON(bs []byte) error {
109 | 	var err error
110 | 	var ok bool
111 | 	// strings.Trim seems to be ~10 time faster here than
112 | 	// json-iter Unmarshal
113 | 	s := strings.Trim(string(bs), `"`)
114 | 	*wt, ok = wordTypeStrMap[s]
115 | 	if !ok {
116 | 		err = errors.New("cannot decode WordType")
117 | 	}
118 | 	return err
119 | }
120 | 


--------------------------------------------------------------------------------
/ent/parser/engine.go:
--------------------------------------------------------------------------------
  1 | package parser
  2 | 
  3 | import (
  4 | 	"io"
  5 | 
  6 | 	"github.com/gnames/gnlib/ent/nomcode"
  7 | 	"github.com/gnames/gnparser/ent/internal/preparser"
  8 | 	"github.com/gnames/gnparser/ent/parsed"
  9 | 	"github.com/gnames/gnparser/io/dict"
 10 | 	"github.com/gnames/tribool"
 11 | )
 12 | 
// baseEngine keeps the state that is collected during parsing of one
// name-string. Fields that describe a parsing result (cardinality, rank,
// error, hybrid, graftChimera, surrogate, bacteria, candidatus, warnings,
// tail, cultivar) are cleared by fullReset before they are reused; the
// other fields are not touched by fullReset. The warnings map is used as
// a set and is allocated on demand by addWarn.
type baseEngine struct {
	preParser         *preparser.PreParser
	sn                *scientificNameNode
	root              *node32
	code              nomcode.Code
	cardinality       int
	rank              string
	error             error
	hybrid            *parsed.Annotation
	graftChimera      *parsed.Annotation
	surrogate         *parsed.Annotation
	bacteria          *tribool.Tribool
	candidatus        bool
	warnings          map[parsed.Warning]struct{}
	tail              string
	cultivar          bool
	preserveDiaereses bool
}
 31 | 
 32 | // New creates implementation of Parser interface.
 33 | func New() Parser {
 34 | 	p := Engine{}
 35 | 	p.Init()
 36 | 	p.preParser = preparser.New()
 37 | 	return &p
 38 | }
 39 | 
 40 | // fullReset must set all fields to empty, or results from the previous
 41 | // parse might bleed into new results.
 42 | func (p *Engine) fullReset() {
 43 | 	p.cardinality = 0
 44 | 	p.rank = ""
 45 | 	p.error = nil
 46 | 	p.hybrid = nil
 47 | 	p.graftChimera = nil
 48 | 	p.surrogate = nil
 49 | 	p.bacteria = nil
 50 | 	p.candidatus = false
 51 | 	var warnReset map[parsed.Warning]struct{}
 52 | 	p.warnings = warnReset
 53 | 	p.tail = ""
 54 | 	p.cultivar = false
 55 | 	p.Reset()
 56 | }
 57 | 
 58 | func (p *Engine) addWarn(w parsed.Warning) {
 59 | 	if p.warnings == nil {
 60 | 		p.warnings = make(map[parsed.Warning]struct{})
 61 | 	}
 62 | 	if _, ok := p.warnings[w]; !ok {
 63 | 		p.warnings[w] = struct{}{}
 64 | 	}
 65 | }
 66 | 
 67 | func (p *Engine) isBacteria(gen string) {
 68 | 	if p.code == nomcode.Bacterial {
 69 | 		bac := tribool.New(1)
 70 | 		p.bacteria = &bac
 71 | 	}
 72 | 	if hom, ok := dict.Dict.Bacteria[gen]; ok {
 73 | 		if hom {
 74 | 			p.addWarn(parsed.BacteriaMaybeWarn)
 75 | 			bac := tribool.New(0)
 76 | 			p.bacteria = &bac
 77 | 		} else {
 78 | 			bac := tribool.New(1)
 79 | 			p.bacteria = &bac
 80 | 		}
 81 | 	}
 82 | }
 83 | 
// outputAST assembles PEG nodes' AST structure.
//
// It goes through the tokens returned by p.Tokens, creates nodes for the
// tokens that newNode does not skip, and links the nodes with the help of
// a stack. The node left on the top of the stack after the last token
// becomes p.root; p.root is not changed if no node was created.
func (p *Engine) outputAST() {
	// element is an item of a stack implemented as a linked list.
	type element struct {
		node *node32
		down *element
	}
	var node *node32
	var skip bool
	var stack *element
	for _, token := range p.Tokens() {
		if node, skip = p.newNode(token); skip {
			continue
		}
		// Nodes from the top of the stack that lie within the range of the
		// current token are removed from the stack and prepended to the
		// list that starts at node.up (presumably the list of children --
		// confirm with the definition of node32).
		for stack != nil && stackNodeIsWithin(stack.node, token) {
			stack.node.next = node.up
			node.up = stack.node
			stack = stack.down
		}
		stack = &element{node: node, down: stack}
	}
	if stack != nil {
		p.root = stack.node
	}
}
108 | 
109 | func stackNodeIsWithin(n *node32, t token32) bool {
110 | 	return n.begin >= t.begin && n.end <= t.end
111 | }
112 | 
113 | // PrintOutputSyntaxTree outputs a simplified version of a nodes
114 | // Abstract Syntax Tree. This method can be used for debugging purposes.
115 | func (p *Engine) PrintOutputSyntaxTree(w io.Writer) {
116 | 	if p.root == nil || p.root.pegRule != ruleSciName {
117 | 		return
118 | 	}
119 | 	p.root.print(w, true, p.Buffer)
120 | }
121 | 
122 | func (p *Engine) newNode(t token32) (*node32, bool) {
123 | 	var node *node32
124 | 	var annot parsed.Annotation
125 | 	switch t.pegRule {
126 | 	case ruleHybridChar:
127 | 		annot = parsed.HybridAnnot
128 | 		p.hybrid = &annot
129 | 	case ruleGraftChimeraChar:
130 | 		annot = parsed.GraftChimeraAnnot
131 | 		p.hybrid = &annot
132 | 	case ruleRankNotho, ruleRankUninomialNotho:
133 | 		annot = parsed.NothoHybridAnnot
134 | 		p.hybrid = &annot
135 | 		p.addWarn(parsed.HybridNamedWarn)
136 | 	case ruleOtherSpace:
137 | 		p.addWarn(parsed.SpaceNonStandardWarn)
138 | 	case ruleMiscodedChar:
139 | 		p.addWarn(parsed.UTF8ConvBadWarn)
140 | 	case ruleAbbrSubgenus:
141 | 		p.addWarn(parsed.SubgenusAbbrWarn)
142 | 	case ruleBasionymAuthorship2Parens:
143 | 		p.addWarn(parsed.AuthDoubleParensWarn)
144 | 	case ruleBasionymAuthorshipMissingParens:
145 | 		p.addWarn(parsed.AuthMissingOneParensWarn)
146 | 	case ruleUpperAfterDash:
147 | 		p.addWarn(parsed.GenusUpperCharAfterDash)
148 | 	case ruleLowerGreek:
149 | 		p.addWarn(parsed.GreekLetterInRank)
150 | 	case ruleAuthorSepSpanish:
151 | 		p.addWarn(parsed.SpanishAndAsSeparator)
152 | 	case ruleIgnoredWord:
153 | 		p.addWarn(parsed.ContainsIgnoredAnnotation)
154 | 	}
155 | 	if _, ok := nodeRules[t.pegRule]; ok {
156 | 		node = &node32{token32: t}
157 | 		return node, false
158 | 	}
159 | 
160 | 	return node, true
161 | }
162 | 
163 | func (p *Engine) nodeValue(n *node32) string {
164 | 	t := n.token32
165 | 	v := string([]rune(p.Buffer)[t.begin:t.end])
166 | 	return v
167 | }
168 | 
169 | // ParseName returns the name the nodes. In case of parsing errors
170 | // returns string 'noparse'.
171 | func (p *Engine) ParsedName() string {
172 | 	if p.error != nil {
173 | 		return "noparse"
174 | 	}
175 | 	for i := len(p.tree) - 1; i >= 0; i-- {
176 | 		t := p.tree[i]
177 | 		if t.pegRule == ruleName {
178 | 			return string(p.buffer[t.begin:t.end])
179 | 		}
180 | 	}
181 | 	return "noparse"
182 | }
183 | 
// nodeRules is the set of PEG rules whose tokens are turned into nodes of
// the output syntax tree. newNode consults it: a token whose rule is absent
// from this set is skipped.
var nodeRules = map[pegRule]struct{}{
	ruleAbbrGenus:                       {},
	ruleAbbrSubgenus:                    {},
	ruleAllCapsAuthorWord:               {},
	ruleApostrOther:                     {},
	ruleApproxNameIgnored:               {},
	ruleApproximation:                   {},
	ruleAuthor:                          {},
	ruleAuthorEmend:                     {},
	ruleAuthorEtAl:                      {},
	ruleAuthorEx:                        {},
	ruleAuthorIn:                        {},
	ruleAuthorPrefix:                    {},
	ruleAuthorSep:                       {},
	ruleAuthorSuffix:                    {},
	ruleAuthorWord:                      {},
	ruleAuthorsGroup:                    {},
	ruleAuthorsTeam:                     {},
	ruleAuthorship:                      {},
	ruleBasionymAuthorship:              {},
	ruleBasionymAuthorshipMissingParens: {},
	ruleBasionymAuthorshipYearMisformed: {},
	ruleCandidatus:                      {},
	ruleCandidatusName:                  {},
	ruleCombinationAuthorship:           {},
	ruleComparison:                      {},
	ruleCultivar:                        {},
	ruleCultivarRecursive:               {},
	ruleDashOther:                       {},
	ruleDotPrefix:                       {},
	ruleFilius:                          {},
	ruleFiliusFNoSpace:                  {},
	ruleGenusWord:                       {},
	ruleGraftChimeraChar:                {},
	ruleGraftChimeraFormula:             {},
	ruleHybridChar:                      {},
	ruleHybridFormula:                   {},
	ruleInfraspEpithet:                  {},
	ruleInfraspGroup:                    {},
	ruleLowerCharExtended:               {},
	ruleName:                            {},
	ruleNameApprox:                      {},
	ruleNameComp:                        {},
	ruleNameCompSp:                      {},
	ruleNameCompIsp:                     {},
	ruleNameSpecies:                     {},
	ruleNamedGenusGraftChimera:          {},
	ruleNamedGenusHybrid:                {},
	ruleNamedSpeciesHybrid:              {},
	ruleOriginalAuthorship:              {},
	ruleOriginalAuthorshipComb:          {},
	ruleRank:                            {},
	ruleRankCultivar:                    {},
	ruleRankForma:                       {},
	ruleRankOtherUncommon:               {},
	ruleRankSsp:                         {},
	ruleRankUninomial:                   {},
	ruleRankVar:                         {},
	ruleSciName:                         {},
	ruleSingleName:                      {},
	ruleSpeciesEpithet:                  {},
	ruleSubgenus:                        {},
	ruleSubgenusOrSuperspecies:          {},
	ruleTail:                            {},
	ruleUninomial:                       {},
	ruleUninomialCombo:                  {},
	ruleUninomialWord:                   {},
	ruleUnknownAuthor:                   {},
	ruleUpperCharExtended:               {},
	ruleWord:                            {},
	ruleWordApostr:                      {},
	ruleWordStartsWithDigit:             {},
	ruleYear:                            {},
	ruleYearApprox:                      {},
	ruleYearNum:                         {},
	ruleYearRange:                       {},
	ruleYearWithChar:                    {},
	ruleYearWithDot:                     {},
	ruleYearWithPage:                    {},
	ruleYearWithParens:                  {},
}
265 | 


--------------------------------------------------------------------------------
/ent/parser/interfaces.go:
--------------------------------------------------------------------------------
 1 | package parser
 2 | 
 3 | import (
 4 | 	"github.com/gnames/gnlib/ent/nomcode"
 5 | 	"github.com/gnames/gnparser/ent/parsed"
 6 | )
 7 | 
// Parser is an interface that is responsible for parsing of a scientific
// name and creation of the Abstract Syntax Tree of the name-string.
type Parser interface {
	// PreprocessAndParse takes a scientific name and returns back Abstract
	// Syntax Tree of the name-string.
	//
	// version is stored in the result as the parser version and code is the
	// nomenclatural code to apply. keepHTML disables stripping of HTML tags,
	// capitalize enables capitalization of the name before parsing, and
	// preserveDiaereses is stored on the parser for later use.
	PreprocessAndParse(
		name, version string,
		code nomcode.Code,
		keepHTML, capitalize, preserveDiaereses bool,
	) ScientificNameNode
	// Debug parses a name-string and returns a byte representation of its
	// syntax trees; it is meant for debugging.
	Debug(name string) []byte
}
20 | 
// ScientificNameNode is the Abstract Syntax Tree of a name-string.
// It contains a method to convert AST into final output.
type ScientificNameNode interface {
	// ToOutput converts AST into final output object. withDetails adds
	// details and words to the output; withSpGr is passed on to the
	// canonical form generation.
	ToOutput(withDetails, withSpGr bool) parsed.Parsed
}
27 | 
// nameData is the interface for converting AST to output elements. It is
// the union of the small single-purpose interfaces declared in this file.
type nameData interface {
	valuer
	canonizer
	worder
	authorFinder
	outputter
}
36 | 
// valuer is implemented by nodes that can render their own textual value.
type valuer interface {
	// value function returns the complete composite value of a node.
	// for low level nodes it would be the same as Value field, for higher
	// nodes it will be a value made from all their components.
	value() string
}
43 | 
// canonizer is implemented by nodes that can report their part of the
// canonical form of a name.
type canonizer interface {
	// canonical function would return something only for nodes that do
	// contribute to canonical representation. For other nodes the return
	// value is an empty canonical structure.
	canonical() *canonical
}
50 | 
// worder is implemented by nodes that can list the words they consist of.
type worder interface {
	// words function returns a meaning of words in a string and their positions
	words() []parsed.Word
}
55 | 
// authorFinder is implemented by nodes that can point to an authorship.
type authorFinder interface {
	// lastAuthorship returns an authorship node of the element.
	// NOTE(review): judging by the name this is the last authorship found in
	// the element — confirm against the implementations.
	lastAuthorship() *authorshipNode
}
59 | 
// outputter is implemented by nodes that can describe themselves in the
// detailed output.
type outputter interface {
	// details creates a details structure for JSON-based outputs
	details() parsed.Details
}
64 | 


--------------------------------------------------------------------------------
/ent/parser/output.go:
--------------------------------------------------------------------------------
  1 | package parser
  2 | 
  3 | import (
  4 | 	"cmp"
  5 | 	"slices"
  6 | 	"strings"
  7 | 
  8 | 	"github.com/gnames/gnparser/ent/parsed"
  9 | )
 10 | 
 11 | // ToOutput converts Abstract Syntax Tree of scientific name to a
 12 | // final output object.
 13 | func (sn *scientificNameNode) ToOutput(
 14 | 	withDetails, withSpGr bool) parsed.Parsed {
 15 | 	res := parsed.Parsed{
 16 | 		NomCode:       sn.code.Abbr(),
 17 | 		Verbatim:      sn.verbatim,
 18 | 		Canonical:     sn.Canonical(withSpGr),
 19 | 		Virus:         sn.virus,
 20 | 		DaggerChar:    sn.daggerChar,
 21 | 		VerbatimID:    sn.verbatimID,
 22 | 		ParserVersion: sn.parserVersion,
 23 | 	}
 24 | 
 25 | 	if res.Canonical == nil {
 26 | 		return res
 27 | 	}
 28 | 
 29 | 	res.Parsed = true
 30 | 	res.ParseQuality, res.QualityWarnings = sn.qualityWarnings()
 31 | 	res.Normalized = sn.Normalized()
 32 | 	res.Cardinality = sn.cardinality
 33 | 	res.Candidatus = sn.candidatus
 34 | 	res.Rank = sn.rank
 35 | 	res.Authorship = sn.LastAuthorship(withDetails)
 36 | 	res.Hybrid = sn.hybrid
 37 | 	res.Surrogate = sn.surrogate
 38 | 	res.Bacteria = sn.bacteria
 39 | 	res.Cultivar = sn.cultivar
 40 | 	res.Tail = sn.tail
 41 | 	if withDetails {
 42 | 		res.Details = sn.Details()
 43 | 		res.Words = sn.Words()
 44 | 	}
 45 | 
 46 | 	if sn.ambiguousEpithet != "" {
 47 | 		res.RestoreAmbiguous(sn.ambiguousEpithet, sn.ambiguousModif)
 48 | 	}
 49 | 	return res
 50 | }
 51 | 
 52 | func (sn *scientificNameNode) qualityWarnings() (int, []parsed.QualityWarning) {
 53 | 	if sn.cardinality > 2 && sn.maybeFilius() {
 54 | 		if sn.warnings == nil {
 55 | 			sn.warnings = make(map[parsed.Warning]struct{})
 56 | 		}
 57 | 		sn.warnings[parsed.AuthAmbiguousFiliusWarn] = struct{}{}
 58 | 	}
 59 | 
 60 | 	warns := prepareWarnings(sn.warnings)
 61 | 	quality := 1
 62 | 	if len(warns) > 0 {
 63 | 		quality = warns[0].Quality
 64 | 	}
 65 | 	return quality, warns
 66 | }
 67 | 
 68 | func (sn *scientificNameNode) maybeFilius() bool {
 69 | 	words := sn.Words()
 70 | 	for i := range words {
 71 | 		if words[i].Verbatim != "f." {
 72 | 			continue
 73 | 		}
 74 | 		if i == 0 || i == len(words)-1 {
 75 | 			continue
 76 | 		}
 77 | 
 78 | 		betweenChars := sn.verbatim[words[i-1].End:words[i+1].Start]
 79 | 
 80 | 		if words[i-1].Type == parsed.AuthorWordType &&
 81 | 			words[i+1].Type == parsed.InfraspEpithetType &&
 82 | 			!strings.Contains(betweenChars, ")") {
 83 | 			return true
 84 | 		}
 85 | 	}
 86 | 	return false
 87 | }
 88 | 
 89 | func prepareWarnings(ws map[parsed.Warning]struct{}) []parsed.QualityWarning {
 90 | 	res := make([]parsed.QualityWarning, len(ws))
 91 | 	var i int
 92 | 	for k := range ws {
 93 | 		res[i] = k.NewQualityWarning()
 94 | 		i++
 95 | 	}
 96 | 
 97 | 	slices.SortFunc(res, func(a, b parsed.QualityWarning) int {
 98 | 		res := cmp.Compare(b.Quality, a.Quality)
 99 | 		if res != 0 {
100 | 			return res
101 | 		}
102 | 		return cmp.Compare(a.Warning.String(), b.Warning.String())
103 | 	})
104 | 
105 | 	return res
106 | }
107 | 


--------------------------------------------------------------------------------
/ent/parser/parser.go:
--------------------------------------------------------------------------------
// Package parser provides entities and methods to perform Parsing
// Expression Grammar parsing on scientific names.
  3 | package parser
  4 | 
  5 | import (
  6 | 	"bytes"
  7 | 	"fmt"
  8 | 
  9 | 	"github.com/gnames/gnlib/ent/nomcode"
 10 | 	"github.com/gnames/gnparser/ent/internal/preprocess"
 11 | 	"github.com/gnames/gnparser/ent/parsed"
 12 | 	"github.com/gnames/gnparser/ent/str"
 13 | )
 14 | 
 15 | // Debug takes a string, parsers it, and returns a byte representation of
 16 | // the node tree
 17 | func (p *Engine) Debug(s string) []byte {
 18 | 	ppr := preprocess.Preprocess(p.preParser, []byte(s))
 19 | 	var b bytes.Buffer
 20 | 	if ppr.NoParse || ppr.Virus {
 21 | 		b.WriteString("\n*** Preprocessing: NO PARSE ***\n")
 22 | 		b.WriteString(fmt.Sprintf("\n%s\n", s))
 23 | 		return b.Bytes()
 24 | 	}
 25 | 	p.Buffer = string(ppr.Body)
 26 | 	fmt.Println(p.Buffer)
 27 | 	p.fullReset()
 28 | 	p.parse()
 29 | 	p.outputAST()
 30 | 	b.WriteString("\n*** Complete Syntax Tree ***\n")
 31 | 	p.AST().PrettyPrint(&b, p.Buffer)
 32 | 	b.WriteString("\n*** Output Syntax Tree ***\n")
 33 | 	p.PrintOutputSyntaxTree(&b)
 34 | 	return b.Bytes()
 35 | }
 36 | 
 37 | // PreprocessAndParse takes a string and returns back the Abstract
 38 | // Syntax Tree of the scientific names. The AST is later used to
 39 | // create the final output.
 40 | func (p *Engine) PreprocessAndParse(
 41 | 	s, ver string,
 42 | 	code nomcode.Code,
 43 | 	keepHTML bool,
 44 | 	capitalize bool,
 45 | 	preserveDiaereses bool,
 46 | ) ScientificNameNode {
 47 | 	p.code = code
 48 | 	p.preserveDiaereses = preserveDiaereses
 49 | 
 50 | 	originalString := s
 51 | 	var tagsOrEntities, lowCase bool
 52 | 	if !keepHTML {
 53 | 		s = preprocess.StripTags(s)
 54 | 		if originalString != s {
 55 | 			tagsOrEntities = true
 56 | 		}
 57 | 	}
 58 | 
 59 | 	if capitalize {
 60 | 		s = str.CapitalizeName(s)
 61 | 		if s != originalString {
 62 | 			lowCase = true
 63 | 		}
 64 | 	}
 65 | 
 66 | 	preproc := preprocess.Preprocess(p.preParser, []byte(s))
 67 | 
 68 | 	defer func() {
 69 | 		p.sn.daggerChar = preproc.DaggerChar
 70 | 		if len(preproc.Tail) > 0 {
 71 | 			p.sn.tail += string(preproc.Tail)
 72 | 		}
 73 | 		if len(p.sn.tail) > 0 {
 74 | 			p.addWarn(parsed.TailWarn)
 75 | 			if str.IsBoldSurrogate(p.sn.tail) {
 76 | 				p.sn.cardinality = 0
 77 | 				annot := parsed.BOLDAnnot
 78 | 				p.sn.surrogate = &annot
 79 | 			}
 80 | 		}
 81 | 
 82 | 		p.sn.ambiguousEpithet = preproc.Ambiguous.Orig
 83 | 		p.sn.ambiguousModif = preproc.Ambiguous.Subst
 84 | 
 85 | 		p.sn.warnings = p.warnings
 86 | 		p.sn.addVerbatim(originalString)
 87 | 		p.sn.parserVersion = ver
 88 | 	}()
 89 | 
 90 | 	if preproc.NoParse {
 91 | 		p.newNotParsedScientificNameNode(preproc)
 92 | 		return p.sn
 93 | 	}
 94 | 
 95 | 	p.Buffer = string(preproc.Body)
 96 | 	p.fullReset()
 97 | 
 98 | 	if tagsOrEntities {
 99 | 		p.addWarn(parsed.HTMLTagsEntitiesWarn)
100 | 	}
101 | 
102 | 	if lowCase {
103 | 		p.addWarn(parsed.LowCaseWarn)
104 | 	}
105 | 
106 | 	if preproc.Underscore {
107 | 		p.addWarn(parsed.SpaceNonStandardWarn)
108 | 	}
109 | 	err := p.Parse()
110 | 
111 | 	if err != nil {
112 | 		p.error = err
113 | 		p.newNotParsedScientificNameNode(preproc)
114 | 		return p.sn
115 | 	}
116 | 
117 | 	p.outputAST()
118 | 	p.newScientificNameNode()
119 | 	return p.sn
120 | }
121 | 


--------------------------------------------------------------------------------
/ent/parser/parser_test.go:
--------------------------------------------------------------------------------
  1 | package parser_test
  2 | 
  3 | import (
  4 | 	"fmt"
  5 | 	"testing"
  6 | 
  7 | 	"github.com/gnames/gnlib/ent/nomcode"
  8 | 	"github.com/gnames/gnparser/ent/parser"
  9 | 	"github.com/stretchr/testify/assert"
 10 | )
 11 | 
 12 | // TTestPreNParse tests PreprocessAndParse method
 13 | func TestPreNParse(t *testing.T) {
 14 | 	p := parser.New()
 15 | 	testData := []struct {
 16 | 		name, can string
 17 | 	}{
 18 | 		{"Pardosa moesta L.", "Pardosa moesta"},
 19 | 		{"something", ""},
 20 | 	}
 21 | 	for _, v := range testData {
 22 | 		sn := p.PreprocessAndParse(
 23 | 			v.name, "test_version", nomcode.Unknown, true, false, false,
 24 | 		)
 25 | 		parsed := sn.ToOutput(false, false)
 26 | 		can := parsed.Canonical
 27 | 		msg := v.name
 28 | 		if v.can == "" {
 29 | 			assert.Nil(t, can, msg)
 30 | 			continue
 31 | 		}
 32 | 		assert.Equal(t, v.can, can.Simple, msg)
 33 | 	}
 34 | }
 35 | 
 36 | // TestToOutput tests ToOutput method of ScientificNameNode
 37 | func TestToOutput(t *testing.T) {
 38 | 	p := parser.New()
 39 | 	testData := []struct {
 40 | 		name, can, au string
 41 | 		det, parsed   bool
 42 | 	}{
 43 | 		{"Pardosa moesta L.", "Pardosa moesta", "L.", false, true},
 44 | 		{
 45 | 			"Bacillus subtilis (Ehrenberg, 1835) Cohn, 1872",
 46 | 			"Bacillus subtilis", "(Ehrenberg 1835) Cohn 1872",
 47 | 			false, true,
 48 | 		},
 49 | 		{
 50 | 			"Bacillus subtilis (Ehrenberg, 1835) Cohn, 1872 sec. Miller",
 51 | 			"Bacillus subtilis", "(Ehrenberg 1835) Cohn 1872",
 52 | 			false, true,
 53 | 		},
 54 | 		{
 55 | 			"Aconitum napellus var. formosum (Rchb.) W. D. J. Koch (nom. ambig.)",
 56 | 			"Aconitum napellus formosum", "(Rchb.) W. D. J. Koch",
 57 | 			true, true,
 58 | 		},
 59 | 		{"something", "", "", false, false},
 60 | 	}
 61 | 	for _, v := range testData {
 62 | 		sn := p.PreprocessAndParse(
 63 | 			v.name, "test_version", nomcode.Unknown, true, false, false,
 64 | 		)
 65 | 		out := sn.ToOutput(v.det, false)
 66 | 		msg := v.name
 67 | 		if !out.Parsed {
 68 | 			assert.Nil(t, out.Canonical, msg)
 69 | 			continue
 70 | 		}
 71 | 		assert.Equal(t, v.can, out.Canonical.Simple, msg)
 72 | 		assert.Equal(t, v.au, out.Authorship.Normalized, msg)
 73 | 	}
 74 | }
 75 | 
 76 | // TestSpecGroupOption checks if stem is cut when WithSpeciesGroupCut is true.
 77 | func TestSpecGroupOption(t *testing.T) {
 78 | 	assert := assert.New(t)
 79 | 
 80 | 	p := parser.New()
 81 | 	testData := []struct {
 82 | 		name, stemmed string
 83 | 		spGrp         bool
 84 | 	}{
 85 | 		{"Aus alba alba", "Aus alb alb", false},
 86 | 		{"Aus alba alba", "Aus alb", true},
 87 | 		{"Aus alba albus", "Aus alb alb", true},
 88 | 		{
 89 | 			"Bacillus subtilis subtilis (Ehrenberg, 1835) Cohn, 1872",
 90 | 			"Bacillus subtil subtil", false,
 91 | 		},
 92 | 		{
 93 | 			"Bacillus subtilis subtilis (Ehrenberg, 1835) Cohn, 1872",
 94 | 			"Bacillus subtil", true,
 95 | 		},
 96 | 		{
 97 | 			"Bacillus subtila subtilis (Ehrenberg, 1835) Cohn, 1872",
 98 | 			"Bacillus subtil subtil", true,
 99 | 		},
100 | 	}
101 | 	for _, v := range testData {
102 | 		sn := p.PreprocessAndParse(
103 | 			v.name, "test_version",
104 | 			nomcode.Unknown,
105 | 			true, false, false,
106 | 		)
107 | 		out := sn.ToOutput(false, v.spGrp)
108 | 		msg := v.name
109 | 		fmt.Println(out.Canonical.Simple)
110 | 		assert.Equal(v.stemmed, out.Canonical.Stemmed, msg)
111 | 	}
112 | }
113 | 


--------------------------------------------------------------------------------
/ent/stemmer/stemmer.go:
--------------------------------------------------------------------------------
// Package stemmer is responsible for extracting a stem of a latinized word. It
// is used to create a stem for latinized specific epithets in scientific names.
// Specific epithets are always nouns, so we need to take this into account.
//
  5 | // http://snowballstem.org/otherapps/schinke/
  6 | // http://caio.ueberalles.net/a_stemming_algorithm_for_latin_text_databases-schinke_et_al.pdf
  7 | //
  8 | // The Schinke Latin stemming algorithm is described in,
  9 | // Schinke R, Greengrass M, Robertson AM and Willett P (1996)
 10 | // A stemming algorithm for Latin text databases. Journal of Documentation, 52: 172-187.
 11 | //
 12 | // It has the feature that it stems each word to two forms, noun and verb. For example,
 13 | //
 14 | //	            NOUN        VERB
 15 | //	            ----        ----
 16 | //	aquila      aquil       aquila
 17 | //	portat      portat      porta
 18 | //	portis      port        por
 19 | //
 20 | // Here (slightly reformatted) are the rules of the stemmer,
 21 | //
 22 | // 1. (start)
 23 | //
 24 | //  2. Convert all occurrences of the letters 'j' or 'v' to 'i' or 'u',
 25 | //     respectively.
 26 | //
 27 | //  3. If the word ends in '-que' then
 28 | //     if the word is on the list shown in Figure 4, then
 29 | //     write the original word to both the noun-based and verb-based
 30 | //     stem dictionaries and go to 8.
 31 | //     else remove '-que'
 32 | //
 33 | //     [Figure 4 was
 34 | //
 35 | //     atque quoque neque itaque absque apsque abusque adaeque adusque denique
 36 | //     deque susque oblique peraeque plenisque quandoque quisque quaeque
 37 | //     cuiusque cuique quemque quamque quaque quique quorumque quarumque
 38 | //     quibusque quosque quasque quotusquisque quousque ubique undique usque
 39 | //     uterque utique utroque utribique torque coque concoque contorque
 40 | //     detorque decoque excoque extorque obtorque optorque retorque recoque
 41 | //     attorque incoque intorque praetorque]
 42 | //
//  4. Match the end of the word against the suffix list shown in Figure 6(a),
 44 | //     removing the longest matching suffix, (if any).
 45 | //
 46 | //     [Figure 6(a) was
 47 | //
 48 | //     -ibus -ius  -ae   -am   -as   -em   -es   -ia
 49 | //     -is   -nt   -os   -ud   -um   -us   -a    -e
 50 | //     -i    -o    -u]
 51 | //
 52 | //  5. If the resulting stem contains at least two characters then write this stem
 53 | //     to the noun-based stem dictionary.
 54 | //
//  6. Match the end of the word against the suffix list shown in Figure 6(b),
 56 | //     identifying the longest matching suffix, (if any).
 57 | //
 58 | //     [Figure 6(b) was
 59 | //
 60 | //     -iuntur-beris -erunt -untur -iunt  -mini  -ntur  -stis
 61 | //     -bor   -ero   -mur   -mus   -ris   -sti   -tis   -tur
 62 | //     -unt   -bo    -ns    -nt    -ri    -m     -r     -s
 63 | //     -t]
 64 | //
 65 | //     If any of the following suffixes are found then convert them as shown:
 66 | //
 67 | //     '-iuntur', '-erunt', '-untur', '-iunt', and '-unt', to '-i';
 68 | //     '-beris', '-bor', and '-bo' to '-bi';
 69 | //     '-ero' to '-eri'
 70 | //
 71 | //     else remove the suffix in the normal way.
 72 | //
 73 | //  7. If the resulting stem contains at least two characters then write this stem
 74 | //     to the verb-based stem dictionary.
 75 | //
 76 | // 8.  (end)
 77 | //
 78 | // Addendum: adding -ii to Step 4.
 79 | package stemmer
 80 | 
 81 | import (
 82 | 	"strings"
 83 | 
 84 | 	"github.com/gnames/gnparser/ent/str"
 85 | )
 86 | 
// empty is the zero-size value stored for every key of the set-like maps
// below.
var empty = struct{}{}

// queExceptions is the set of words ending with '-que' that must be kept as
// they are (Figure 4 of the algorithm description in the package comment).
var queExceptions = map[string]struct{}{
	"atque": empty, "quoque": empty, "neque": empty, "itaque": empty,
	"absque": empty, "apsque": empty, "abusque": empty, "adaeque": empty,
	"adusque": empty, "denique": empty, "deque": empty, "susque": empty,
	"oblique": empty, "peraeque": empty, "plenisque": empty, "quandoque": empty,
	"quisque": empty, "quaeque": empty, "cuiusque": empty, "cuique": empty,
	"quemque": empty, "quamque": empty, "quaque": empty, "quique": empty,
	"quorumque": empty, "quarumque": empty, "quibusque": empty,
	"quosque": empty, "quasque": empty, "quotusquisque": empty,
	"quousque": empty, "ubique": empty, "undique": empty, "usque": empty,
	"uterque": empty, "utique": empty, "utroque": empty, "utribique": empty,
	"torque": empty, "coque": empty, "concoque": empty, "contorque": empty,
	"detorque": empty, "decoque": empty, "excoque": empty, "extorque": empty,
	"obtorque": empty, "optorque": empty, "retorque": empty, "recoque": empty,
	"attorque": empty, "incoque": empty, "intorque": empty, "praetorque": empty,
}

// nounSuffixes lists the noun suffixes removed by the stemmer (Figure 6(a)
// plus the '-ii' addendum). The order matters: the list is scanned from the
// start and the first match is used, so a longer suffix must come before
// any shorter suffix it ends with (e.g. "ii" before "i").
var nounSuffixes = []string{
	"ibus", "ius", "ae", "am", "as",
	"em", "es", "ia", "is",
	"nt", "os", "ud", "um", "us",
	"a", "e", "ii", "i", "o", "u",
}
112 | 
// StemmedWord is the output of stemming algorithm applied to a word.
type StemmedWord struct {
	// Orig is the original word (input).
	Orig string
	// Stem is the stemmed version of the original word. The letters 'j' and
	// 'v' of the input appear in it as 'i' and 'u'.
	Stem string
	// Suffix is the 'tail' left after stemming. It stays empty when no noun
	// suffix was removed.
	Suffix string
}
122 | 
123 | // StemCanonical takes a short form of a canonical name and returns back
124 | // stemmed specific and infraspecific epithets, and an unstemmed cultivar
125 | // epithet.
126 | // It assumes the following properties of a string:
127 | //
128 | //  1. There are no empty spaces over any side of a string.
129 | //  2. All spaces within the string are single.
130 | //  3. All characters in the string are ASCII with exception of the
131 | //     hybrid sign.
132 | //  4. The string always starts with a capitalized word.
133 | func StemCanonical(c string) string {
134 | 	graftChimeraFormulaParts := strings.Split(c, " + ")
135 | 	for gci, gcv := range graftChimeraFormulaParts {
136 | 		hybridFormulaParts := strings.Split(gcv, " × ")
137 | 		for hi, hv := range hybridFormulaParts {
138 | 			nameParts := strings.Split(hv, "‘")
139 | 			latinPart := nameParts[0]
140 | 			words := strings.Split(latinPart, " ")
141 | 			if len(words) == 1 {
142 | 				hybridFormulaParts[hi] = hv
143 | 				continue
144 | 			}
145 | 			formulaPartsRes := make([]string, len(words))
146 | 			for wi, wv := range words {
147 | 				if wi == 0 || len(wv) < 3 {
148 | 					formulaPartsRes[wi] = wv
149 | 				} else {
150 | 					formulaPartsRes[wi] = Stem(wv).Stem
151 | 				}
152 | 			}
153 | 			nameParts[0] = strings.Join(formulaPartsRes, " ")
154 | 			hybridFormulaParts[hi] = strings.Join(nameParts, "‘")
155 | 		}
156 | 		graftChimeraFormulaParts[gci] = strings.Join(hybridFormulaParts, " × ")
157 | 	}
158 | 	//return strings.Join(graftChimeraFormulaParts, " + ")
159 | 	return str.TransliterateDiaereses(strings.Join(graftChimeraFormulaParts, " + "))
160 | }
161 | 
162 | // Stem takes a word and, assuming the word is noun, removes its latin suffix
163 | // if such suffix is detected.
164 | func Stem(wrd string) StemmedWord {
165 | 	wrdR := []rune(wrd)
166 | 	for i, v := range wrdR {
167 | 		switch v {
168 | 		case 'j':
169 | 			wrdR[i] = 'i'
170 | 		case 'v':
171 | 			wrdR[i] = 'u'
172 | 		}
173 | 	}
174 | 	var sw StemmedWord
175 | 	var isException bool
176 | 	if sw, isException = processEndsWithQue(wrd, wrdR); isException {
177 | 		return sw
178 | 	}
179 | 	return checkNounSuffix(sw)
180 | }
181 | 
182 | func processEndsWithQue(wrd string, wrdR []rune) (StemmedWord, bool) {
183 | 	sw := StemmedWord{Orig: wrd, Stem: string(wrdR)}
184 | 
185 | 	if len(wrdR) < 3 {
186 | 		return sw, false
187 | 	}
188 | 	suffix := string(wrdR[len(wrdR)-3:])
189 | 	endsWithQue := suffix == "que"
190 | 	if endsWithQue {
191 | 		if _, ok := queExceptions[sw.Stem]; ok {
192 | 			return sw, true
193 | 		} else {
194 | 			sw.Stem = string(wrdR[:len(wrdR)-3])
195 | 		}
196 | 	}
197 | 	return sw, false
198 | }
199 | 
200 | func checkNounSuffix(sw StemmedWord) StemmedWord {
201 | 	var found bool
202 | 	for _, v := range nounSuffixes {
203 | 		if strings.HasSuffix(sw.Stem, v) {
204 | 			if found {
205 | 				break
206 | 			}
207 | 			found = true
208 | 			wrdR := []rune(sw.Stem)
209 | 			stem := string(wrdR[:len(wrdR)-len(v)])
210 | 			if len(stem) >= 2 {
211 | 				sw.Stem = stem
212 | 				sw.Suffix = string(wrdR[len(v):])
213 | 			}
214 | 		}
215 | 	}
216 | 	return sw
217 | }
218 | 


--------------------------------------------------------------------------------
/ent/stemmer/stemmer_test.go:
--------------------------------------------------------------------------------
 1 | package stemmer_test
 2 | 
 3 | import (
 4 | 	"bufio"
 5 | 	"os"
 6 | 	"path/filepath"
 7 | 	"regexp"
 8 | 	"strings"
 9 | 	"testing"
10 | 
11 | 	"github.com/gnames/gnparser/ent/stemmer"
12 | 	"github.com/stretchr/testify/assert"
13 | )
14 | 
15 | func TestStemmer(t *testing.T) {
16 | 	stemsDict := stemData(t)
17 | 	t.Run("treats que suffix with exceptions", func(t *testing.T) {
18 | 		assert.Equal(t, "detorque", stemmer.Stem("detorque").Stem)
19 | 		assert.Equal(t, "something", stemmer.Stem("somethingque").Stem)
20 | 	})
21 | 	t.Run("removes suffixes correctly", func(t *testing.T) {
22 | 		for k, v := range stemsDict {
23 | 			assert.Equal(t, v, stemmer.Stem(k).Stem)
24 | 		}
25 | 	})
26 | 
27 | 	t.Run("StemCanonical", func(t *testing.T) {
28 | 		data := []struct {
29 | 			msg string
30 | 			in  string
31 | 			out string
32 | 		}{
33 | 			{"Uninomial", "Pomatomus", "Pomatomus"},
34 | 			{"Binomial1", "Betula naturae", "Betula natur"},
35 | 			{"Binomial2", "Betula alba", "Betula alb"},
36 | 			{"Binomial3", "Leptochloöpsis virgata", "Leptochloopsis uirgat"},
37 | 			{"Trinomial", "Betula alba naturae", "Betula alb natur"},
38 | 			{"SpGroup", "Betula alba alba", "Betula alb alb"},
39 | 			{"SpGroup", "Betula alba albus", "Betula alb alb"},
40 | 			{"GraftChimeraFormula", "Crataegus + Mespilus", "Crataegus + Mespilus"},
41 | 			{"GraftChimeraFormula2", "Cytisus purpureus + Laburnum anagyroides", "Cytisus purpure + Laburnum anagyroid"},
42 | 		}
43 | 		for _, v := range data {
44 | 			assert.Equal(t, v.out, stemmer.StemCanonical(v.in), v.msg)
45 | 		}
46 | 	})
47 | }
48 | 
// stemData loads the word-to-stem fixture from testdata/stems.txt. Every
// non-blank line holds a word and its expected stem separated by
// whitespace. The test is aborted if the file cannot be read or a line
// does not have both columns.
func stemData(t *testing.T) map[string]string {
	t.Helper()

	res := make(map[string]string)
	path := filepath.Join("..", "..", "testdata", "stems.txt")
	f, err := os.Open(path)
	if err != nil {
		t.Fatalf("cannot open %s: %v", path, err)
	}
	defer f.Close()

	// Compiled once instead of once per line.
	spaces := regexp.MustCompile(`\s+`)
	scan := bufio.NewScanner(f)
	for scan.Scan() {
		l := strings.TrimSpace(scan.Text())
		if l == "" {
			continue
		}
		ws := spaces.Split(l, 2)
		if len(ws) != 2 {
			t.Fatalf("malformed line %q in %s", l, path)
		}
		res[ws[0]] = ws[1]
	}

	if err := scan.Err(); err != nil {
		t.Fatalf("cannot read %s: %v", path, err)
	}

	return res
}
66 | 


--------------------------------------------------------------------------------
/ent/str/str.go:
--------------------------------------------------------------------------------
  1 | // Package str provides functions for manipulating scientific name-strings.
  2 | package str
  3 | 
  4 | import (
  5 | 	"bytes"
  6 | 	"fmt"
  7 | 	"strings"
  8 | 	"unicode"
  9 | 	"unicode/utf8"
 10 | )
 11 | 
 12 | // CapitalizeName function capitalizes the first character of a name-string.
 13 | // It can be a useful option if the data is known to contain 'real' names, for
 14 | // example canonical forms, but they are provided with all letters in lower
 15 | // case.
 16 | func CapitalizeName(name string) string {
 17 | 	runes := []rune(name)
 18 | 	if len(runes) < 2 {
 19 | 		return name
 20 | 	}
 21 | 
 22 | 	one := runes[0]
 23 | 	two := runes[1]
 24 | 	if unicode.IsUpper(one) || !unicode.IsLetter(one) {
 25 | 		return name
 26 | 	}
 27 | 	if one == 'x' && (two == ' ' || unicode.IsUpper(two)) {
 28 | 		return name
 29 | 	}
 30 | 	runes[0] = unicode.ToUpper(one)
 31 | 	return string(runes)
 32 | }
 33 | 
// Normalize takes a string and returns normalized version of it: every
// character found in Transliterations is replaced with its substitute.
// Normalize function should be idempotent.
func Normalize(s string) string {
	return ToASCII(s, Transliterations)
}
 39 | 
// TransliterateDiaereses transliterates diaereses (ä, ë, ï, ö, ü) to their
// ASCII equivalents using the DiaeresesTransliterations table.
// Note that this is a straight replacement and doesn't check for the
// existence of a vowel preceding them.
func TransliterateDiaereses(s string) string {
	return ToASCII(s, DiaeresesTransliterations)
}
 46 | 
 47 | // NormalizePreservingDiaereses converts diacritics in a UTF8 string to their ASCII
 48 | // equivalents, but preserves diaereses (i.e. ä, ë, ï, ö, ü that occur after a vowel)
 49 | func NormalizePreservingDiaereses(s string) string {
 50 | 	if s == "" {
 51 | 		return s
 52 | 	}
 53 | 	b := []byte(s)
 54 | 	var r rune
 55 | 	var width int
 56 | 	tlBuf := bytes.NewBuffer(make([]byte, 0, len(b)*125/100))
 57 | 	for i, w := 0, 0; i < len(b); i += w {
 58 | 		prevRuneIsVowel := Vowels[r] // r is the rune from the last invocation (or empty)
 59 | 		r, width = utf8.DecodeRune(b[i:])
 60 | 		s, runeIsTransliterable := Transliterations[r]
 61 | 		_, runeIsDiaeresis := DiaeresesTransliterations[r]
 62 | 		// replace with transliteration if one is found, and it's not a diaeresis
 63 | 		if runeIsTransliterable && !(runeIsDiaeresis && prevRuneIsVowel) {
 64 | 			tlBuf.WriteString(s)
 65 | 		} else {
 66 | 			tlBuf.WriteRune(r)
 67 | 		}
 68 | 		w = width
 69 | 	}
 70 | 	return tlBuf.String()
 71 | }
 72 | 
 73 | // ToASCII converts a UTF-8 diacritics to corresponding ASCII chars.
 74 | func ToASCII(s string, m map[rune]string) string {
 75 | 	if s == "" {
 76 | 		return s
 77 | 	}
 78 | 	b := []byte(s)
 79 | 	tlBuf := bytes.NewBuffer(make([]byte, 0, len(b)*125/100))
 80 | 	for i, w := 0, 0; i < len(b); i += w {
 81 | 		r, width := utf8.DecodeRune(b[i:])
 82 | 		if s, ok := m[r]; ok {
 83 | 			tlBuf.WriteString(s)
 84 | 		} else {
 85 | 			tlBuf.WriteRune(r)
 86 | 		}
 87 | 		w = width
 88 | 	}
 89 | 	return tlBuf.String()
 90 | }
 91 | 
 92 | func IsBoldSurrogate(s string) bool {
 93 | 	if len(s) < 5 {
 94 | 		return false
 95 | 	}
 96 | 	s = strings.ToLower(s)
 97 | 	return strings.Contains(s, "bold:")
 98 | }
 99 | 
// JoinStrings concatenates s1 and s2 with sep placed between them. When one
// of the two strings is empty, the other one is returned on its own and the
// separator is not used.
func JoinStrings(s1 string, s2 string, sep string) string {
	switch {
	case s1 == "":
		return s2
	case s2 == "":
		return s1
	default:
		return fmt.Sprintf("%s%s%s", s1, sep, s2)
	}
}
112 | 
// FixAllCaps converts all-caps author names to a capitalized form: the first
// rune, and every rune that directly follows a '-', are kept as they are;
// all other runes are lower-cased.
func FixAllCaps(s string) string {
	src := []rune(s)
	out := make([]rune, len(src))
	copy(out, src)

	// Index 0 is always kept; start from the second rune.
	for i := 1; i < len(src); i++ {
		if src[i-1] == '-' {
			continue
		}
		out[i] = unicode.ToLower(src[i])
	}
	return string(out)
}
129 | 
130 | // NumToString converts numbers in old-style species names to their
131 | // word equivalents.
132 | func NumToStr(num string) string {
133 | 	if v, ok := nameNums[num]; ok {
134 | 		return v
135 | 	}
136 | 	return num
137 | }
138 | 
// DiaeresesTransliterations maps lower-case vowels carrying a diaeresis to
// the corresponding plain ASCII vowel.
var DiaeresesTransliterations = map[rune]string{'ä': "a", 'ë': "e", 'ï': "i", 'ö': "o", 'ü': "u"}

// Vowels is the set of ASCII vowels in both upper and lower case. Looking up
// any other rune yields false.
var Vowels = map[rune]bool{'A': true, 'a': true, 'E': true, 'e': true,
	'I': true, 'i': true, 'O': true, 'o': true, 'U': true, 'u': true}
143 | 
// Transliterations is the table used to convert diacritical characters to
// their latin letter equivalents. Some runes expand to two letters (for
// example 'ä' becomes "ae"), while apostrophes, curly single quotes and the
// period are mapped to an empty string, i.e. removed.
var Transliterations = map[rune]string{

	'À': "A", 'Â': "A", 'Ã': "A", 'Á': "A", 'Ç': "C", 'Č': "C", 'Ð': "D",
	'Ë': "E", 'É': "E", 'È': "E", 'Í': "I", 'Ì': "I", 'Ï': "I", 'Ł': "L",
	'Ň': "N", 'Ñ': "N", 'Ó': "O", 'Ò': "O", 'Ô': "O", 'Õ': "O", 'Ú': "U",
	'Ù': "U", 'Ŕ': "R", 'Ř': "R", 'Ŗ': "R", 'Š': "S", 'Ş': "S", 'Ž': "Z",
	'à': "a", 'â': "a", 'ã': "a", 'á': "a", 'ç': "c", 'č': "c", 'ë': "e",
	'é': "e", 'è': "e", 'ð': "d", 'í': "i", 'ì': "i", 'ï': "i", 'ł': "l",
	'ň': "n", 'ñ': "n", 'ó': "o", 'ò': "o", 'ô': "o", 'õ': "o", 'ú': "u",
	'ù': "u", 'û': "u", 'ŕ': "r", 'ř': "r", 'ŗ': "r", 'ſ': "s", 'š': "s",
	'ş': "s", 'ž': "z", '\'': "", '‘': "", '’': "", '.': "",
	'Æ': "Ae", 'Å': "Ao", 'Ä': "Ae", 'Ø': "Oe", 'Ö': "Oe", 'Þ': "Th",
	'Ü': "Ue", 'ß': "ss", 'æ': "ae", 'å': "ao", 'ä': "ae", 'ø': "oe",
	'ö': "oe", 'þ': "th", 'Œ': "Oe", 'œ': "oe", 'ü': "ue",
}
161 | 
// GlobalTransliterations are applied not only to scientific names, but
// to the whole name-string. The table maps curly single quotes, the backtick
// and the acute accent to a plain ASCII apostrophe.
var GlobalTransliterations = map[rune]string{
	'‘': "'", '’': "'", '`': "'", '´': "'",
}
167 | 
// nameNums maps decimal numbers, written as strings, to the word forms used
// by NumToStr. The table is not contiguous: only the numbers listed here are
// converted (for example "29" has no entry).
var nameNums = map[string]string{
	"1":  "uni",
	"2":  "bi",
	"3":  "tri",
	"4":  "quadri",
	"5":  "quinque",
	"6":  "sex",
	"7":  "septem",
	"8":  "octo",
	"9":  "novem",
	"10": "decem",
	"11": "undecim",
	"12": "duodecim",
	"13": "tredecim",
	"14": "quatuordecim",
	"15": "quindecim",
	"16": "sedecim",
	"17": "septendecim",
	"18": "octodecim",
	"19": "novemdecim",
	"20": "viginti",
	"21": "vigintiuno",
	"22": "vigintiduo",
	"23": "vigintitre",
	"24": "vigintiquatuor",
	"25": "vigintiquinque",
	"26": "vigintisex",
	"27": "vigintiseptem",
	"28": "vigintiocto",
	"30": "triginta",
	"31": "trigintauno",
	"32": "trigintaduo",
	"38": "trigintaocto",
	"40": "quadraginta",
}
203 | 
// Uniq removes duplicates from a slice of strings without changing the order
// of the remaining elements: the first occurrence of every value is kept.
//
// The input slice is not modified. The result is always a non-nil slice; for
// an empty or nil input it is empty.
func Uniq(strs []string) []string {
	seen := make(map[string]struct{}, len(strs))
	res := make([]string, 0, len(strs))
	for _, s := range strs {
		if _, dup := seen[s]; dup {
			continue
		}
		seen[s] = struct{}{}
		res = append(res, s)
	}
	return res
}
221 | 


--------------------------------------------------------------------------------
/ent/str/str_test.go:
--------------------------------------------------------------------------------
  1 | package str_test
  2 | 
  3 | import (
  4 | 	"testing"
  5 | 
  6 | 	"github.com/gnames/gnparser/ent/str"
  7 | 	"github.com/stretchr/testify/assert"
  8 | )
  9 | 
// TestStringTools runs table-driven subtests for the exported helpers of the
// str package. Every table row holds a message used on failure, an input and
// the expected output.
func TestStringTools(t *testing.T) {
	// CapitalizeName: only a leading lower-case letter is upper-cased;
	// hybrid signs and non-cased scripts must stay untouched.
	t.Run("CapitalizeName", func(t *testing.T) {
		tests := []struct {
			msg string
			in  string
			out string
		}{
			{"common canonical", "Pomatomus saltator", "Pomatomus saltator"},
			{"low-case canonical", "pomatomus saltator", "Pomatomus saltator"},
			{"hybrid sign", "× Hydnellum scrobiculatum", "× Hydnellum scrobiculatum"},
			{"hybrid sign2", "×Hydnellum scrobiculatum", "×Hydnellum scrobiculatum"},
			{"hybrid x", "xHydnellum scrobiculatum", "xHydnellum scrobiculatum"},
			{"first x", "xhydnellum scrobiculatum", "Xhydnellum scrobiculatum"},
			{"first x", "x hydnellum scrobiculatum", "x hydnellum scrobiculatum"},
			{"uninomial", "bubo", "Bubo"},
			{"greek", "ß-Goma-dimeroceras Sobolew", "ß-Goma-dimeroceras Sobolew"},
			{"hindi", "खपृष्ठ म", "खपृष्ठ म"},
		}
		for _, v := range tests {
			res := str.CapitalizeName(v.in)
			assert.Equal(t, v.out, res, v.msg)
		}
	})

	// NormalizePreservingDiaereses: a diaeresis right after a vowel is kept
	// (paraënsis), other diacritics are transliterated.
	t.Run("NormalizePreservingDiaereses", func(t *testing.T) {
		tests := []struct {
			msg string
			in  string
			out string
		}{
			{"Döringina", "Döringina", "Doeringina"},
			{"Lecythis paraënsis", "Lecythis paraënsis", "Lecythis paraënsis"},
			{"thomæ", "thomæ", "thomae"},
			{"many ö", "ööö", "oeoeoe"},
			{"’", "’", ""},
			{"‘", "‘", ""},
		}
		for _, v := range tests {
			res := str.NormalizePreservingDiaereses(v.in)
			assert.Equal(t, v.out, res, v.msg)
		}
	})

	// ToASCII: the same input gives different output depending on the
	// transliteration table passed in.
	t.Run("ToASCII", func(t *testing.T) {
		tests := []struct {
			msg string
			in  string
			out string
			tbl map[rune]string
		}{
			{"Döringina", "Döringina", "Doeringina", str.Transliterations},
			{"Aëtosaurus", "Aëtosaurus", "Aetosaurus", str.Transliterations},
			{"thomæ", "thomæ", "thomae", str.Transliterations},
			{"many ö", "ööö", "oeoeoe", str.Transliterations},
			{"’", "’", "'", str.GlobalTransliterations},
			{"‘", "‘", "'", str.GlobalTransliterations},
			{"’", "’", "", str.Transliterations},
			{"‘", "‘", "", str.Transliterations},
		}
		for _, v := range tests {
			res := str.ToASCII(v.in, v.tbl)
			assert.Equal(t, v.out, res, v.msg)
		}
	})

	// NumToStr: known numbers are converted to words, anything else is
	// returned unchanged (last two rows).
	t.Run("NumToStr", func(t *testing.T) {
		tests := []struct {
			msg string
			in  string
			out string
		}{
			{"1", "1", "uni"},
			{"2", "2", "bi"},
			{"3", "3", "tri"},
			{"4", "4", "quadri"},
			{"5", "5", "quinque"},
			{"6", "6", "sex"},
			{"7", "7", "septem"},
			{"8", "8", "octo"},
			{"9", "9", "novem"},
			{"10", "10", "decem"},
			{"11", "11", "undecim"},
			{"12", "12", "duodecim"},
			{"13", "13", "tredecim"},
			{"14", "14", "quatuordecim"},
			{"15", "15", "quindecim"},
			{"16", "16", "sedecim"},
			{"17", "17", "septendecim"},
			{"18", "18", "octodecim"},
			{"19", "19", "novemdecim"},
			{"20", "20", "viginti"},
			{"21", "21", "vigintiuno"},
			{"22", "22", "vigintiduo"},
			{"23", "23", "vigintitre"},
			{"24", "24", "vigintiquatuor"},
			{"25", "25", "vigintiquinque"},
			{"26", "26", "vigintisex"},
			{"27", "27", "vigintiseptem"},
			{"28", "28", "vigintiocto"},
			{"30", "30", "triginta"},
			{"31", "31", "trigintauno"},
			{"32", "32", "trigintaduo"},
			{"38", "38", "trigintaocto"},
			{"40", "40", "quadraginta"},
			{"400", "400", "400"},
			{"something", "something", "something"},
		}
		for _, v := range tests {
			res := str.NumToStr(v.in)
			assert.Equal(t, v.out, res, v.msg)
		}
	})

	// FixAllCaps: the first letter and letters following a hyphen keep
	// their case, the rest is lower-cased.
	t.Run("FixAllCaps", func(t *testing.T) {
		tests := []struct {
			msg string
			in  string
			out string
		}{
			{"KURNAKOV", "KURNAKOV", "Kurnakov"},
			{"GÓMEZ-BOLEA", "GÓMEZ-BOLEA", "Gómez-Bolea"},
			{"hello", "hello", "hello"},
		}
		for _, v := range tests {
			res := str.FixAllCaps(v.in)
			assert.Equal(t, v.out, res, v.msg)
		}
	})
}
139 | 


--------------------------------------------------------------------------------
/flake.lock:
--------------------------------------------------------------------------------
 1 | {
 2 |   "nodes": {
 3 |     "nixpkgs": {
 4 |       "locked": {
 5 |         "lastModified": 1652657345,
 6 |         "narHash": "sha256-XCpw463PdFzpj1QCn9yjiBxhaD2Tq6GElwD8PYVRLT8=",
 7 |         "owner": "NixOS",
 8 |         "repo": "nixpkgs",
 9 |         "rev": "aaf0d5d4364b6b7330b4748abebe1785388b7576",
10 |         "type": "github"
11 |       },
12 |       "original": {
13 |         "owner": "NixOS",
14 |         "repo": "nixpkgs",
15 |         "type": "github"
16 |       }
17 |     },
18 |     "root": {
19 |       "inputs": {
20 |         "nixpkgs": "nixpkgs"
21 |       }
22 |     }
23 |   },
24 |   "root": "root",
25 |   "version": 7
26 | }
27 | 


--------------------------------------------------------------------------------
/flake.nix:
--------------------------------------------------------------------------------
 1 | {
 2 |   description = "Parser for bio scientific names";
 3 |   inputs.nixpkgs.url = github:NixOS/nixpkgs;
 4 | 
 5 |   outputs = { self, nixpkgs }:
 6 |   let
 7 |     system = "x86_64-linux";
 8 |     pkgs = nixpkgs.legacyPackages.${system};
 9 |     lib = pkgs.lib;
10 |   in {
11 |     defaultPackage.${system} = pkgs.callPackage ./default.nix {};
12 |     devShell.${system} = pkgs.callPackage ./shell.nix {};
13 |   };
14 | }


--------------------------------------------------------------------------------
/gnparser.go:
--------------------------------------------------------------------------------
  1 | // Package gnparser implements the main use-case of the project -- parsing
  2 | // scientific names. There are methods to parse one name at a time,
  3 | // a slice of names, or a stream of names. All methods return results in the
  4 | // same order as input. It is achieved by restoring the order after concurrent
  5 | // execution of the parsing process.
  6 | package gnparser
  7 | 
  8 | import (
  9 | 	"context"
 10 | 	"sync"
 11 | 
 12 | 	"github.com/gnames/gnfmt"
 13 | 	"github.com/gnames/gnlib/ent/gnvers"
 14 | 	"github.com/gnames/gnparser/ent/nameidx"
 15 | 	"github.com/gnames/gnparser/ent/parsed"
 16 | 	"github.com/gnames/gnparser/ent/parser"
 17 | )
 18 | 
// gnparser is an implementation of the GNparser interface.
// It is responsible for the main parsing operations.
type gnparser struct {
	// cfg keeps gnparser settings.
	cfg Config

	// parser keeps the parsing engine used by Debug and ParseName.
	parser parser.Parser
}
 28 | 
 29 | // New constructor function takes options organized into a
 30 | // configuration struct and returns an object that implements GNparser
 31 | // interface.
 32 | func New(cfg Config) GNparser {
 33 | 	gnp := gnparser{cfg: cfg}
 34 | 	gnp.parser = parser.New()
 35 | 	return gnp
 36 | }
 37 | 
 38 | // NewPool creates a pool of GNparser objects. It is useful for concurrent
 39 | // parsing of many names. The function takes a configuration object and the
 40 | // size of the pool. It returns a channel GNparser objects with the
 41 | // corresponding buffer size.
 42 | func NewPool(cfg Config, size int) chan GNparser {
 43 | 	res := make(chan GNparser, size)
 44 | 	for range size {
 45 | 		gnp := New(cfg)
 46 | 		res <- gnp
 47 | 	}
 48 | 	return res
 49 | }
 50 | 
// Debug returns byte representation of complete and 'output' syntax trees
// for the name-string s. The work is delegated to the parsing engine.
func (gnp gnparser) Debug(s string) []byte {
	return gnp.parser.Debug(s)
}
 55 | 
 56 | // Parse function parses input string according to configurations.
 57 | // It takes a string and returns an parsed.Parsed object.
 58 | func (gnp gnparser) ParseName(s string) parsed.Parsed {
 59 | 	ver := Version
 60 | 	if gnp.cfg.IsTest {
 61 | 		ver = "test_version"
 62 | 	}
 63 | 	sciNameNode := gnp.parser.PreprocessAndParse(
 64 | 		s,
 65 | 		ver,
 66 | 		gnp.cfg.Code,
 67 | 		gnp.cfg.IgnoreHTMLTags,
 68 | 		gnp.cfg.WithCapitalization,
 69 | 		gnp.cfg.WithPreserveDiaereses,
 70 | 	)
 71 | 	res := sciNameNode.ToOutput(
 72 | 		gnp.cfg.WithDetails,
 73 | 		gnp.cfg.WithSpeciesGroupCut,
 74 | 	)
 75 | 	return res
 76 | }
 77 | 
// ParseNames takes input names and returns parsed results. Parsing is spread
// over cfg.JobsNum worker goroutines. The result slice has the same length
// as names; by default every result is stored at the index of its input
// name, and when cfg.WithNoOrder is set results are stored in the order they
// arrive from the workers.
func (gnp gnparser) ParseNames(names []string) []parsed.Parsed {
	res := make([]parsed.Parsed, len(names))
	jobsNum := gnp.cfg.JobsNum
	chOut := make(chan parsed.ParsedWithIdx)
	// wgIn tracks the parsing workers, wgOut tracks the collecting goroutine.
	var wgIn, wgOut sync.WaitGroup
	wgIn.Add(jobsNum)
	wgOut.Add(1)

	// The deferred cancel releases the loader and any goroutine still
	// selecting on ctx when this function returns.
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	chIn := loadNames(ctx, names)

	for i := jobsNum; i > 0; i-- {
		go gnp.parseWorker(ctx, chIn, chOut, &wgIn)
	}

	// Collect results until chOut is closed or the context is cancelled.
	go func() {
		defer wgOut.Done()
		var count int
		for {
			select {
			case <-ctx.Done():
				return
			case v, ok := <-chOut:
				if !ok {
					return
				}
				if gnp.cfg.WithNoOrder {
					// arrival order
					res[count] = v.Parsed
					count++
				} else {
					// input order
					res[v.Idx] = v.Parsed
				}
			}
		}
	}()

	// All workers are done once chIn is drained; only then is it safe to
	// close chOut and wait for the collector to finish writing into res.
	wgIn.Wait()
	close(chOut)
	wgOut.Wait()
	return res
}
122 | 
// Format returns the output format value stored in the configuration.
func (gnp gnparser) Format() gnfmt.Format {
	return gnp.cfg.Format
}
127 | 
// WebLogs returns the WithWebLogs configuration value: a boolean that tells
// whether the web-service logs should be shown.
func (gnp gnparser) WebLogs() bool {
	return gnp.cfg.WithWebLogs
}
132 | 
133 | // ChangeConfig allows change configuration of already created
134 | // GNparser object.
135 | func (gnp gnparser) ChangeConfig(opts ...Option) GNparser {
136 | 	for i := range opts {
137 | 		opts[i](&gnp.cfg)
138 | 	}
139 | 	return gnp
140 | }
141 | 
142 | // Version function returns version number of `gnparser` and the timestamp
143 | // of its build.
144 | func (gnp gnparser) GetVersion() gnvers.Version {
145 | 	version := Version
146 | 	build := Build
147 | 	if gnp.cfg.IsTest {
148 | 		version = "test_version"
149 | 	}
150 | 	return gnvers.Version{Version: version, Build: build}
151 | }
152 | 
153 | func (gnp gnparser) parseWorker(
154 | 	ctx context.Context,
155 | 	chIn <-chan nameidx.NameIdx,
156 | 	chOut chan<- parsed.ParsedWithIdx,
157 | 	wgIn *sync.WaitGroup,
158 | ) {
159 | 	defer wgIn.Done()
160 | 	gnp.parser = parser.New()
161 | 
162 | 	for v := range chIn {
163 | 		parseRes := gnp.ParseName(v.NameString)
164 | 		select {
165 | 		case <-ctx.Done():
166 | 			return
167 | 		case chOut <- parsed.ParsedWithIdx{Idx: v.Index, Parsed: parseRes}:
168 | 		}
169 | 	}
170 | }
171 | 
172 | func loadNames(ctx context.Context, names []string) <-chan nameidx.NameIdx {
173 | 	chIn := make(chan nameidx.NameIdx)
174 | 	go func() {
175 | 		defer close(chIn)
176 | 		for i := range names {
177 | 			select {
178 | 			case <-ctx.Done():
179 | 				return
180 | 			case chIn <- nameidx.NameIdx{Index: i, NameString: names[i]}:
181 | 			}
182 | 		}
183 | 	}()
184 | 	return chIn
185 | }
186 | 


--------------------------------------------------------------------------------
/gnparser/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright © 2019 Dmitry Mozzherin <dmozzherin@gmail.com>
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in
13 | all copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | THE SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/gnparser/cmd/flags.go:
--------------------------------------------------------------------------------
  1 | package cmd
  2 | 
  3 | import (
  4 | 	"fmt"
  5 | 	"log/slog"
  6 | 	"os"
  7 | 
  8 | 	"github.com/gnames/gnfmt"
  9 | 	"github.com/gnames/gnlib/ent/nomcode"
 10 | 	"github.com/gnames/gnparser"
 11 | 	"github.com/spf13/cobra"
 12 | )
 13 | 
 14 | func batchSizeFlag(cmd *cobra.Command) {
 15 | 	bs, err := cmd.Flags().GetInt("batch_size")
 16 | 	if err != nil {
 17 | 		fmt.Println(err)
 18 | 		os.Exit(1)
 19 | 	}
 20 | 	if bs > 0 {
 21 | 		opts = append(opts, gnparser.OptBatchSize(bs))
 22 | 	}
 23 | }
 24 | 
 25 | func codeFlag(cmd *cobra.Command) {
 26 | 	s, _ := cmd.Flags().GetString("nomenclatural-code")
 27 | 	if s == "" {
 28 | 		return
 29 | 	}
 30 | 	code := nomcode.New(s)
 31 | 	if code == nomcode.Unknown && s != "any" {
 32 | 		slog.Warn("Cannot determine nomenclatural-code from input", "input", s)
 33 | 	}
 34 | 	opts = append(opts, gnparser.OptCode(code))
 35 | }
 36 | 
 37 | func formatFlag(cmd *cobra.Command) {
 38 | 	s, _ := cmd.Flags().GetString("format")
 39 | 	if s != "" {
 40 | 		frmt, err := gnfmt.NewFormat(s)
 41 | 		if err != nil {
 42 | 			slog.Warn("Unknown format input, using default: CSV", "inut", s)
 43 | 			frmt = gnfmt.CSV
 44 | 		}
 45 | 		opts = append(opts, gnparser.OptFormat(frmt))
 46 | 	}
 47 | }
 48 | 
 49 | func jobsNumFlag(cmd *cobra.Command) {
 50 | 	jn, err := cmd.Flags().GetInt("jobs")
 51 | 	if err != nil {
 52 | 		fmt.Println(err)
 53 | 		os.Exit(1)
 54 | 	}
 55 | 	if jn > 0 {
 56 | 		opts = append(opts, gnparser.OptJobsNum(jn))
 57 | 	}
 58 | }
 59 | 
 60 | func ignoreHTMLTagsFlag(cmd *cobra.Command) {
 61 | 	ignoreTags, err := cmd.Flags().GetBool("ignore_tags")
 62 | 	if err != nil {
 63 | 		fmt.Println(err)
 64 | 		os.Exit(1)
 65 | 	}
 66 | 	if ignoreTags {
 67 | 		opts = append(opts, gnparser.OptIgnoreHTMLTags(true))
 68 | 	}
 69 | }
 70 | 
 71 | func portFlag(cmd *cobra.Command) int {
 72 | 	webPort, err := cmd.Flags().GetInt("port")
 73 | 	if err != nil {
 74 | 		fmt.Println(err)
 75 | 		os.Exit(1)
 76 | 	}
 77 | 	if webPort > 0 {
 78 | 		opts = append(opts, gnparser.OptPort(webPort))
 79 | 	}
 80 | 	return webPort
 81 | }
 82 | 
 83 | func versionFlag(cmd *cobra.Command) bool {
 84 | 	version, _ := cmd.Flags().GetBool("version")
 85 | 	if version {
 86 | 		fmt.Printf("\nversion: %s\n\nbuild:   %s\n\n",
 87 | 			gnparser.Version, gnparser.Build)
 88 | 		return true
 89 | 	}
 90 | 	return false
 91 | }
 92 | 
 93 | func withCapitalizeFlag(cmd *cobra.Command) {
 94 | 	b, err := cmd.Flags().GetBool("capitalize")
 95 | 	if err != nil {
 96 | 		fmt.Println(err)
 97 | 		os.Exit(1)
 98 | 	}
 99 | 	if b {
100 | 		opts = append(opts, gnparser.OptWithCapitaliation(true))
101 | 	}
102 | }
103 | 
104 | func withDetailsFlag(cmd *cobra.Command) {
105 | 	withDet, err := cmd.Flags().GetBool("details")
106 | 	if err != nil {
107 | 		fmt.Println(err)
108 | 		os.Exit(1)
109 | 	}
110 | 	if withDet {
111 | 		opts = append(opts, gnparser.OptWithDetails(true))
112 | 	}
113 | }
114 | 
115 | func withEnableCultivarsFlag(cmd *cobra.Command) {
116 | 	b, _ := cmd.Flags().GetBool("cultivar")
117 | 	if b {
118 | 		opts = append(opts, gnparser.OptCode(nomcode.Cultivars))
119 | 	}
120 | }
121 | 
122 | func withNoOrderFlag(cmd *cobra.Command) {
123 | 	withOrd, err := cmd.Flags().GetBool("unordered")
124 | 	if err != nil {
125 | 		fmt.Println(err)
126 | 		os.Exit(1)
127 | 	}
128 | 	if withOrd {
129 | 		opts = append(opts, gnparser.OptWithNoOrder(true))
130 | 	}
131 | }
132 | 
133 | func withPreserveDiaeresesFlag(cmd *cobra.Command) {
134 | 	b, err := cmd.Flags().GetBool("diaereses")
135 | 	if err != nil {
136 | 		fmt.Println(err)
137 | 		os.Exit(1)
138 | 	}
139 | 	if b {
140 | 		opts = append(opts, gnparser.OptWithPreserveDiaereses(true))
141 | 	}
142 | }
143 | 
144 | func spGrCutFlag(cmd *cobra.Command) {
145 | 	b, err := cmd.Flags().GetBool("species-group-cut")
146 | 	if err != nil {
147 | 		fmt.Println(err)
148 | 		os.Exit(1)
149 | 	}
150 | 	if b {
151 | 		opts = append(opts, gnparser.OptWithSpeciesGroupCut(true))
152 | 	}
153 | 
154 | }
155 | 
156 | func withStreamFlag(cmd *cobra.Command) {
157 | 	withDet, err := cmd.Flags().GetBool("stream")
158 | 	if err != nil {
159 | 		fmt.Println(err)
160 | 		os.Exit(1)
161 | 	}
162 | 	if withDet {
163 | 		opts = append(opts, gnparser.OptWithStream(true))
164 | 	}
165 | }
166 | 
167 | func withWebLogsFlag(cmd *cobra.Command) bool {
168 | 	withLogs, err := cmd.Flags().GetBool("web-logs")
169 | 	if err != nil {
170 | 		fmt.Println(err)
171 | 		os.Exit(1)
172 | 	}
173 | 	return withLogs
174 | }
175 | 


--------------------------------------------------------------------------------
/gnparser/cmd/parse_batch.go:
--------------------------------------------------------------------------------
 1 | package cmd
 2 | 
 3 | import (
 4 | 	"bufio"
 5 | 	"fmt"
 6 | 	"io"
 7 | 	"log/slog"
 8 | 	"sync"
 9 | 	"time"
10 | 
11 | 	"github.com/gnames/gnfmt"
12 | 	"github.com/gnames/gnparser"
13 | 	"github.com/gnames/gnparser/ent/parsed"
14 | )
15 | 
// parseBatch reads name-strings from f, one per line, groups them into
// batches of batchSize names, parses every batch with gnp.ParseNames and
// hands the results to processResults for printing. The function returns
// after all results were processed.
func parseBatch(
	gnp gnparser.GNparser,
	f io.Reader,
) {
	batch := make([]string, batchSize)
	chOut := make(chan []parsed.Parsed)
	start := time.Now()
	var wg sync.WaitGroup

	// Results are printed by a separate goroutine while parsing goes on.
	wg.Add(1)
	go processResults(chOut, &wg, gnp.Format())

	sc := bufio.NewScanner(f)
	// i counts full batches, count is the fill level of the current batch.
	var i, count int
	for sc.Scan() {
		batch[count] = sc.Text()
		count++
		if count == batchSize {
			i++
			progressLog(start, count*i)
			chOut <- gnp.ParseNames(batch)
			batch = make([]string, batchSize)
			count = 0
		}
	}
	// Flush the last, partially filled batch (it may be empty).
	chOut <- gnp.ParseNames(batch[:count])
	close(chOut)
	// Scanner errors are only visible after Scan returned false.
	if err := sc.Err(); err != nil {
		slog.Error("File reading failed", "error", err)
	}
	wg.Wait()
}
48 | 
49 | func processResults(
50 | 	out <-chan []parsed.Parsed,
51 | 	wg *sync.WaitGroup,
52 | 	f gnfmt.Format,
53 | ) {
54 | 	defer wg.Done()
55 | 
56 | 	header := parsed.HeaderCSV(f)
57 | 	if header != "" {
58 | 		fmt.Println(header)
59 | 	}
60 | 
61 | 	for pr := range out {
62 | 		for i := range pr {
63 | 			fmt.Println(pr[i].Output(f))
64 | 		}
65 | 	}
66 | }
67 | 


--------------------------------------------------------------------------------
/gnparser/cmd/parse_stream.go:
--------------------------------------------------------------------------------
 1 | package cmd
 2 | 
 3 | import (
 4 | 	"bufio"
 5 | 	"context"
 6 | 	"fmt"
 7 | 	"io"
 8 | 	"log/slog"
 9 | 	"sync"
10 | 	"time"
11 | 
12 | 	"github.com/gnames/gnparser"
13 | 	"github.com/gnames/gnparser/ent/nameidx"
14 | 	"github.com/gnames/gnparser/ent/parsed"
15 | )
16 | 
17 | func getNames(
18 | 	ctx context.Context,
19 | 	f io.Reader,
20 | ) <-chan nameidx.NameIdx {
21 | 	chIn := make(chan nameidx.NameIdx)
22 | 	sc := bufio.NewScanner(f)
23 | 
24 | 	go func() {
25 | 		defer close(chIn)
26 | 		var count int
27 | 		for sc.Scan() {
28 | 			nameString := sc.Text()
29 | 			select {
30 | 			case <-ctx.Done():
31 | 				return
32 | 			case chIn <- nameidx.NameIdx{Index: count, NameString: nameString}:
33 | 			}
34 | 			count++
35 | 		}
36 | 	}()
37 | 	if err := sc.Err(); err != nil {
38 | 		slog.Error("Cannot read data", "error", err)
39 | 	}
40 | 	return chIn
41 | }
42 | 
// parseStream reads name-strings from f, feeds them to gnp.ParseNameStream
// and prints every parsed result to standard output in the format configured
// for gnp. It returns when the printing goroutine has finished.
func parseStream(
	gnp gnparser.GNparser,
	f io.Reader,
) {
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	chIn := getNames(ctx, f)
	chOut := make(chan parsed.Parsed)
	var wg sync.WaitGroup
	wg.Add(1)

	// NOTE(review): presumably ParseNameStream closes chOut once chIn is
	// drained, which is what ends the loop below — confirm.
	go gnp.ParseNameStream(ctx, chIn, chOut)

	// process parsing results
	go func() {
		// Deferred calls run in reverse order: wg.Done first, then cancel.
		defer cancel()
		defer wg.Done()
		start := time.Now()

		header := parsed.HeaderCSV(gnp.Format())
		if header != "" {
			fmt.Println(header)
		}

		var count int
		for {
			// count is incremented before a result is received, so the
			// progress log fires just before the 50,000th result is read.
			count++
			if count%50_000 == 0 {
				progressLog(start, count)
			}
			select {
			case <-ctx.Done():
				return
			case v, ok := <-chOut:
				if !ok {
					return
				}
				fmt.Println(v.Output(gnp.Format()))
			}
		}
	}()
	wg.Wait()
}
86 | 


--------------------------------------------------------------------------------
/gnparser/cmd/root.go:
--------------------------------------------------------------------------------
  1 | // Package cmd creates a command line application for parsing scientific names.
  2 | package cmd
  3 | 
  4 | import (
  5 | 	"fmt"
  6 | 	"log/slog"
  7 | 	"os"
  8 | 	"time"
  9 | 
 10 | 	"github.com/dustin/go-humanize"
 11 | 	"github.com/gnames/gnfmt"
 12 | 	"github.com/gnames/gnparser"
 13 | 	"github.com/gnames/gnparser/ent/parsed"
 14 | 	"github.com/gnames/gnparser/io/web"
 15 | 	"github.com/gnames/gnsys"
 16 | 	"github.com/spf13/cobra"
 17 | )
 18 | 
// debug is true when output shows Abstract Syntax Tree instead of
// parsed results.
const debug = false

var (
	// opts is a container for configuration options. It is filled by the
	// flag helper functions and passed to gnparser.NewConfig.
	opts []gnparser.Option

	// batchSize determines the size of a batch sent to gnparser workers.
	batchSize int
)
 30 | 
 31 | // rootCmd represents the base command when called without any subcommands
 32 | var rootCmd = &cobra.Command{
 33 | 	Use:   "gnparser file_or_name",
 34 | 	Short: "Parses scientific names into their semantic elements.",
 35 | 	Long: `
 36 | Parses scientific names into their semantic elements.
 37 | 
 38 | To see version:
 39 | gnparser -V
 40 | 
 41 | To parse one name in CSV format
 42 | gnparser "Homo sapiens Linnaeus 1758" [flags]
 43 | or (the same)
 44 | gnparser "Homo sapiens Linnaeus 1758" -f csv [flags]
 45 | 
 46 | To parse one name using JSON format:
 47 | gnparser "Homo sapiens Linnaeus 1758" -f compact [flags]
 48 | or
 49 | gnparser "Homo sapiens Linnaeus 1758" -f pretty [flags]
 50 | 
 51 | To parse with maximum amount of details:
 52 | gnparser "Homo sapiens Linnaeus 1758" -d -f pretty
 53 | 
 54 | To parse many names from a file (one name per line):
 55 | gnparser names.txt [flags] > parsed_names.txt
 56 | 
 57 | To leave HTML tags and entities intact when parsing (faster)
 58 | gnparser names.txt -n > parsed_names.txt
 59 | 
 60 | To start web service on port 8080 with 5 concurrent jobs:
 61 | gnparser -j 5 -p 8080
 62 |  `,
 63 | 
 64 | 	Run: func(cmd *cobra.Command, args []string) {
 65 | 		if versionFlag(cmd) {
 66 | 			os.Exit(0)
 67 | 		}
 68 | 
 69 | 		if debug {
 70 | 			opts = append(opts, gnparser.OptDebug(true))
 71 | 		}
 72 | 
 73 | 		formatFlag(cmd)
 74 | 		jobsNumFlag(cmd)
 75 | 		ignoreHTMLTagsFlag(cmd)
 76 | 		withDetailsFlag(cmd)
 77 | 		withStreamFlag(cmd)
 78 | 		withNoOrderFlag(cmd)
 79 | 		withCapitalizeFlag(cmd)
 80 | 		withEnableCultivarsFlag(cmd)
 81 | 		// overrides Cultivar flag
 82 | 		codeFlag(cmd)
 83 | 		withPreserveDiaeresesFlag(cmd)
 84 | 		batchSizeFlag(cmd)
 85 | 		spGrCutFlag(cmd)
 86 | 		port := portFlag(cmd)
 87 | 		cfg := gnparser.NewConfig(opts...)
 88 | 		batchSize = cfg.BatchSize
 89 | 
 90 | 		if port != 0 {
 91 | 
 92 | 			// Create a JSON handler
 93 | 			handler := slog.NewJSONHandler(os.Stdout, nil)
 94 | 			logger := slog.New(handler).With(
 95 | 				slog.String("gnApp", "gnparser"),
 96 | 			)
 97 | 			slog.SetDefault(logger)
 98 | 
 99 | 			webopts := []gnparser.Option{
100 | 				gnparser.OptFormat(gnfmt.CompactJSON),
101 | 				gnparser.OptWithWebLogs(withWebLogsFlag(cmd)),
102 | 			}
103 | 			cfg = gnparser.NewConfig(webopts...)
104 | 			gnp := gnparser.New(cfg)
105 | 			gnps := web.NewGNparserService(gnp, port)
106 | 			web.Run(gnps)
107 | 			os.Exit(0)
108 | 		}
109 | 
110 | 		quiet, _ := cmd.Flags().GetBool("quiet")
111 | 		if quiet {
112 | 			slog.SetLogLoggerLevel(10)
113 | 		}
114 | 
115 | 		if len(args) == 0 {
116 | 			processStdin(cmd, cfg)
117 | 			os.Exit(0)
118 | 		}
119 | 		data := getInput(cmd, args)
120 | 
121 | 		if debug {
122 | 			debugName(data, cfg)
123 | 			os.Exit(0)
124 | 		}
125 | 		parse(data, cfg)
126 | 	},
127 | }
128 | 
129 | // Execute adds all child commands to the root command and sets flags
130 | // appropriately. This is called by main.main(). It only needs to happen once to
131 | // the rootCmd.
132 | func Execute() {
133 | 	if err := rootCmd.Execute(); err != nil {
134 | 		fmt.Println(err)
135 | 		os.Exit(1)
136 | 	}
137 | }
138 | 
139 | func init() {
140 | 	rootCmd.Flags().IntP("batch_size", "b", 0,
141 | 		"maximum number of names in a batch send for processing.")
142 | 
143 | 	rootCmd.Flags().BoolP("cultivar", "C", false,
144 | 		`parse according to  cultivar code ICNCP
145 | (DEPRECATED, use nomenclatural-code instead)`,
146 | 	)
147 | 
148 | 	codeHelp := `Modifies the parser's behavior in ambiguous cases, sometimes 
149 | introducing additional parsing rules.
150 | 
151 | Accepted values are:
152 |   - 'bact', 'icnp', 'bacterial' for bacterial code
153 |   - 'bot', 'icn', 'botanical' for botanical code
154 |   - 'cult', 'icncp', 'cultivar' for cultivar code
155 |   - 'zoo', 'iczn', 'zoological' for zoological code
156 | 
157 | If not set, the parser will attempt to determine the appropriate code/s.`
158 | 	rootCmd.Flags().StringP("nomenclatural-code", "n", "", codeHelp)
159 | 
160 | 	rootCmd.Flags().BoolP("capitalize", "c", false,
161 | 		"capitalize the first letter of input name-strings")
162 | 
163 | 	rootCmd.Flags().BoolP("diaereses", "D", false,
164 | 		"preserve diaereses in names")
165 | 
166 | 	rootCmd.Flags().BoolP("details", "d", false, "provides more details")
167 | 
168 | 	formatHelp := `Sets the output format.
169 | 
170 | Accepted values are:
171 |   - 'csv': Comma-separated values
172 |   - 'tsv': Tab-separated values
173 |   - 'compact': Compact JSON format
174 |   - 'pretty': Human-readable JSON format
175 | 
176 | If not set, the output format defaults to 'csv'.`
177 | 	rootCmd.Flags().StringP("format", "f", "", formatHelp)
178 | 
179 | 	rootCmd.Flags().BoolP("ignore_tags", "i", false,
180 | 		"ignore HTML entities and tags when parsing.")
181 | 
182 | 	rootCmd.Flags().IntP("jobs", "j", 0,
183 | 		"number of threads to run. CPU's threads number is the default.")
184 | 
185 | 	rootCmd.Flags().IntP("port", "p", 0,
186 | 		"starts web site and REST server on the port.")
187 | 
188 | 	rootCmd.Flags().BoolP("quiet", "q", false, "do not show progress")
189 | 
190 | 	rootCmd.Flags().BoolP("stream", "s", false,
191 | 		"parse one name at a time in a stream instead of a batch parsing")
192 | 
193 | 	rootCmd.Flags().BoolP("unordered", "u", false,
194 | 		"output and input are in different order")
195 | 
196 | 	rootCmd.PersistentFlags().BoolP("version", "V", false,
197 | 		"shows build version and date, ignores other flags.")
198 | 
199 | 	rootCmd.Flags().BoolP("web-logs", "", false, "enable logs for the web service")
200 | 
201 | 	rootCmd.Flags().
202 | 		BoolP("species-group-cut", "", false, "cut autonym/species group names to species for stemmed version")
203 | }
204 | 
205 | func processStdin(cmd *cobra.Command, cfg gnparser.Config) {
206 | 	if !checkStdin() {
207 | 		_ = cmd.Help()
208 | 		return
209 | 	}
210 | 	gnp := gnparser.New(cfg)
211 | 
212 | 	if cfg.WithStream {
213 | 		parseStream(gnp, os.Stdin)
214 | 	} else {
215 | 		parseBatch(gnp, os.Stdin)
216 | 	}
217 | }
218 | 
// checkStdin reports whether standard input is something other than a
// character device (for example a pipe or a redirected file), which is taken
// to mean that input data is available. If stdin cannot be inspected, the
// error is logged and false is returned.
func checkStdin() bool {
	stat, err := os.Stdin.Stat()
	if err != nil {
		// stat is nil when Stat fails, so it must not be dereferenced.
		slog.Error("No stdin input", "error", err)
		return false
	}
	return stat.Mode()&os.ModeCharDevice == 0
}
227 | 
228 | func getInput(cmd *cobra.Command, args []string) string {
229 | 	var data string
230 | 	switch len(args) {
231 | 	case 1:
232 | 		data = args[0]
233 | 	default:
234 | 		_ = cmd.Help()
235 | 		os.Exit(0)
236 | 	}
237 | 	return data
238 | }
239 | 
240 | func debugName(
241 | 	data string,
242 | 	cfg gnparser.Config,
243 | ) {
244 | 	gnp := gnparser.New(cfg)
245 | 	res := gnp.Debug(data)
246 | 	fmt.Println(string(res))
247 | }
248 | 
249 | func parse(
250 | 	data string,
251 | 	cfg gnparser.Config,
252 | ) {
253 | 	gnp := gnparser.New(cfg)
254 | 
255 | 	path := string(data)
256 | 	exists, _ := gnsys.FileExists(path)
257 | 	if exists {
258 | 		f, err := os.OpenFile(path, os.O_RDONLY, os.ModePerm)
259 | 		if err != nil {
260 | 			slog.Error("Cannot open file", "error", err, "path", path)
261 | 		}
262 | 		if cfg.WithStream {
263 | 			parseStream(gnp, f)
264 | 		} else {
265 | 			parseBatch(gnp, f)
266 | 		}
267 | 		f.Close()
268 | 	} else {
269 | 		parseString(gnp, data)
270 | 	}
271 | }
272 | 
273 | func parseString(gnp gnparser.GNparser, name string) {
274 | 	res := gnp.ParseName(name)
275 | 	f := gnp.Format()
276 | 
277 | 	header := parsed.HeaderCSV(f)
278 | 	if header != "" {
279 | 		fmt.Println(header)
280 | 	}
281 | 
282 | 	fmt.Println(res.Output(f))
283 | }
284 | 
285 | func progressLog(start time.Time, namesNum int) {
286 | 	dur := float64(time.Since(start)) / float64(time.Second)
287 | 	rate := float64(namesNum) / dur
288 | 	rateStr := humanize.Comma(int64(rate))
289 | 	slog.Info("File parsing",
290 | 		"names/sec", rateStr,
291 | 		"count", humanize.Comma(int64(namesNum)),
292 | 	)
293 | }
294 | 


--------------------------------------------------------------------------------
/gnparser/main.go:
--------------------------------------------------------------------------------
 1 | // Copyright © 2019 Dmitry Mozzherin <dmozzherin@gmail.com>
 2 | //
 3 | // Permission is hereby granted, free of charge, to any person obtaining a copy
 4 | // of this software and associated documentation files (the "Software"), to deal
 5 | // in the Software without restriction, including without limitation the rights
 6 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 7 | // copies of the Software, and to permit persons to whom the Software is
 8 | // furnished to do so, subject to the following conditions:
 9 | //
10 | // The above copyright notice and this permission notice shall be included in
11 | // all copies or substantial portions of the Software.
12 | //
13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | // THE SOFTWARE.
20 | 
21 | package main
22 | 
23 | import (
24 | 	"log/slog"
25 | 	"os"
26 | 
27 | 	"github.com/gnames/gnparser/gnparser/cmd"
28 | 	"github.com/lmittmann/tint"
29 | )
30 | 
31 | func main() {
32 | 	handle := slog.New(tint.NewHandler(os.Stderr, nil))
33 | 	slog.SetDefault(handle)
34 | 
35 | 	cmd.Execute()
36 | }
37 | 


--------------------------------------------------------------------------------
/gnparser/tools.go:
--------------------------------------------------------------------------------
 1 | //go:build tools
 2 | // +build tools
 3 | 
 4 | package main
 5 | 
 6 | import (
 7 | 	_ "github.com/pointlander/peg"
 8 | 	_ "github.com/spf13/cobra"
 9 | 	_ "golang.org/x/perf/cmd/benchstat"
10 | 	_ "golang.org/x/tools/cmd/goimports"
11 | )
12 | 


--------------------------------------------------------------------------------
/gnparser_stream.go:
--------------------------------------------------------------------------------
  1 | package gnparser
  2 | 
  3 | import (
  4 | 	"context"
  5 | 	"log/slog"
  6 | 	"sync"
  7 | 
  8 | 	"github.com/gnames/gnparser/ent/nameidx"
  9 | 	"github.com/gnames/gnparser/ent/parsed"
 10 | 	"github.com/gnames/gnparser/ent/parser"
 11 | 	"github.com/gnames/organizer"
 12 | )
 13 | 
 14 | // ParseNameStream takes an input channel of input.Name and
 15 | // returns back a stream of parsed data following the same order as
 16 | // the input.
 17 | func (gnp gnparser) ParseNameStream(
 18 | 	ctx context.Context,
 19 | 	chIn <-chan nameidx.NameIdx,
 20 | 	chOut chan<- parsed.Parsed,
 21 | ) {
 22 | 	chUnordered := make(chan organizer.Ordered)
 23 | 	chOrdered := make(chan organizer.Ordered)
 24 | 	var wgWorker, wgOutput sync.WaitGroup
 25 | 	jobs := gnp.cfg.JobsNum
 26 | 	wgWorker.Add(jobs)
 27 | 	wgOutput.Add(1)
 28 | 
 29 | 	for i := jobs; i > 0; i-- {
 30 | 		go gnp.parseStreamWorker(ctx, chIn, chUnordered, &wgWorker)
 31 | 	}
 32 | 
 33 | 	if gnp.cfg.WithNoOrder {
 34 | 		close(chOrdered)
 35 | 		go sendUnordered(ctx, chUnordered, chOut, &wgOutput)
 36 | 	} else {
 37 | 		go organizer.Organize(ctx, chUnordered, chOrdered)
 38 | 		go sendOrdered(ctx, chOrdered, chOut, &wgOutput)
 39 | 	}
 40 | 
 41 | 	wgWorker.Wait()
 42 | 	close(chUnordered)
 43 | 	wgOutput.Wait()
 44 | }
 45 | 
 46 | func (gnp gnparser) parseStreamWorker(
 47 | 	ctx context.Context,
 48 | 	chIn <-chan nameidx.NameIdx,
 49 | 	chOut chan<- organizer.Ordered,
 50 | 	wg *sync.WaitGroup,
 51 | ) {
 52 | 	defer wg.Done()
 53 | 	gnp.parser = parser.New()
 54 | 	for v := range chIn {
 55 | 		parseRes := gnp.ParseName(v.NameString)
 56 | 		select {
 57 | 		case <-ctx.Done():
 58 | 			return
 59 | 		case chOut <- parsed.ParsedWithIdx{Parsed: parseRes, Error: nil, Idx: v.Index}:
 60 | 		}
 61 | 	}
 62 | }
 63 | 
 64 | func sendOrdered(
 65 | 	ctx context.Context,
 66 | 	chOrdered <-chan organizer.Ordered,
 67 | 	chOut chan<- parsed.Parsed,
 68 | 	wg *sync.WaitGroup,
 69 | ) {
 70 | 	defer wg.Done()
 71 | 	for v := range chOrdered {
 72 | 		var p parsed.Parsed
 73 | 		err := v.Unpack(&p)
 74 | 		if err != nil {
 75 | 			slog.Error("Cannot reorganize data", "error", err)
 76 | 		}
 77 | 		select {
 78 | 		case <-ctx.Done():
 79 | 			return
 80 | 		case chOut <- p:
 81 | 		}
 82 | 	}
 83 | 	close(chOut)
 84 | }
 85 | 
 86 | func sendUnordered(
 87 | 	ctx context.Context,
 88 | 	chUnordered <-chan organizer.Ordered,
 89 | 	chOut chan<- parsed.Parsed,
 90 | 	wg *sync.WaitGroup,
 91 | ) {
 92 | 	defer wg.Done()
 93 | 	for v := range chUnordered {
 94 | 		var p parsed.Parsed
 95 | 		err := v.Unpack(&p)
 96 | 		if err != nil {
 97 | 			slog.Error("Cannot reorganize data", "error", err)
 98 | 		}
 99 | 		select {
100 | 		case <-ctx.Done():
101 | 			return
102 | 		case chOut <- p:
103 | 		}
104 | 	}
105 | 	close(chOut)
106 | }
107 | 


--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
 1 | module github.com/gnames/gnparser
 2 | 
 3 | go 1.24.1
 4 | 
 5 | require (
 6 | 	github.com/dustin/go-humanize v1.0.1
 7 | 	github.com/gnames/gnfmt v0.6.0
 8 | 	github.com/gnames/gnlib v0.48.0
 9 | 	github.com/gnames/gnsys v0.3.8
10 | 	github.com/gnames/gnuuid v0.2.0
11 | 	github.com/gnames/organizer v0.1.1
12 | 	github.com/gnames/tribool v0.1.1
13 | 	github.com/labstack/echo/v4 v4.13.3
14 | 	github.com/lmittmann/tint v1.0.7
15 | 	github.com/pointlander/peg v1.0.1
16 | 	github.com/rendon/testcli v1.0.0
17 | 	github.com/spf13/cobra v1.9.1
18 | 	github.com/stretchr/testify v1.10.0
19 | 	golang.org/x/net v0.39.0
20 | 	golang.org/x/perf v0.0.0-20250414141303-3fc2b901edf3
21 | 	golang.org/x/text v0.24.0
22 | 	golang.org/x/tools v0.32.0
23 | )
24 | 
25 | require (
26 | 	github.com/VividCortex/ewma v1.2.0 // indirect
27 | 	github.com/aclements/go-moremath v0.0.0-20241023150245-c8bbc672ef66 // indirect
28 | 	github.com/cheggaaa/pb/v3 v3.1.7 // indirect
29 | 	github.com/davecgh/go-spew v1.1.1 // indirect
30 | 	github.com/fatih/color v1.18.0 // indirect
31 | 	github.com/google/uuid v1.6.0 // indirect
32 | 	github.com/inconshreveable/mousetrap v1.1.0 // indirect
33 | 	github.com/json-iterator/go v1.1.12 // indirect
34 | 	github.com/labstack/gommon v0.4.2 // indirect
35 | 	github.com/mattn/go-colorable v0.1.14 // indirect
36 | 	github.com/mattn/go-isatty v0.0.20 // indirect
37 | 	github.com/mattn/go-runewidth v0.0.16 // indirect
38 | 	github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
39 | 	github.com/modern-go/reflect2 v1.0.2 // indirect
40 | 	github.com/pmezard/go-difflib v1.0.0 // indirect
41 | 	github.com/pointlander/compress v1.1.1-0.20190518213731-ff44bd196cc3 // indirect
42 | 	github.com/pointlander/jetset v1.0.1-0.20190518214125-eee7eff80bd4 // indirect
43 | 	github.com/rivo/uniseg v0.4.7 // indirect
44 | 	github.com/spf13/pflag v1.0.6 // indirect
45 | 	github.com/ulikunitz/xz v0.5.12 // indirect
46 | 	github.com/valyala/bytebufferpool v1.0.0 // indirect
47 | 	github.com/valyala/fasttemplate v1.2.2 // indirect
48 | 	golang.org/x/crypto v0.37.0 // indirect
49 | 	golang.org/x/mod v0.24.0 // indirect
50 | 	golang.org/x/sync v0.13.0 // indirect
51 | 	golang.org/x/sys v0.32.0 // indirect
52 | 	golang.org/x/time v0.11.0 // indirect
53 | 	gopkg.in/yaml.v3 v3.0.1 // indirect
54 | )
55 | 


--------------------------------------------------------------------------------
/interface.go:
--------------------------------------------------------------------------------
 1 | package gnparser
 2 | 
 3 | import (
 4 | 	"context"
 5 | 
 6 | 	"github.com/gnames/gnfmt"
 7 | 	"github.com/gnames/gnlib/ent/gnvers"
 8 | 	"github.com/gnames/gnparser/ent/nameidx"
 9 | 	"github.com/gnames/gnparser/ent/parsed"
10 | )
11 | 
// GNparser is the main use-case interface. It provides methods required
// for parsing scientific names.
type GNparser interface {
	// ChangeConfig allows to modify settings of GNparser. Changing settings
	// might modify parsing process, and the final output of results.
	ChangeConfig(opts ...Option) GNparser

	// Debug parses a string and outputs raw AST tree from PEG engine.
	Debug(s string) []byte

	// Format returns the currently chosen output format (one of the JSON or
	// CSV/TSV formats).
	Format() gnfmt.Format

	// GetVersion provides a version and a build timestamp of gnparser.
	GetVersion() gnvers.Version

	// ParseName takes a name-string, and returns parsed results for the name.
	ParseName(string) parsed.Parsed

	// ParseNameStream takes a context, an input channel that provides
	// name-strings with their positions in the input, and an output channel
	// that receives the parsed results. Results come in the same order as
	// the input, unless the parser is configured to skip ordering.
	ParseNameStream(context.Context, <-chan nameidx.NameIdx, chan<- parsed.Parsed)

	// ParseNames takes a slice of name-strings, and returns a slice of
	// parsed results in the same order as the input.
	ParseNames([]string) []parsed.Parsed

	// WebLogs returns a boolean to show or not the web-service logs.
	WebLogs() bool
}
44 | 


--------------------------------------------------------------------------------
/io/dict/data/README.md:
--------------------------------------------------------------------------------
 1 | ## Creation of genera_auth_icn.txt
 2 | 
 3 | 1. Get the latest IRMNG file.
 4 | 2. Extract authors of ICN genera
 5 | 3. Parse the authors and keep only "basionym" authors (this makes the list 500 authors shorter).
 6 | 4. Break authors into words; collect words that are capitalized, have no periods, and are longer than 2 characters.
 7 | 5. Clean up authors from spaces, commas, parentheses.
 8 | 6. Create list of all genera (canonical form)
 9 | 7. Remove from authors list all genera names.
10 | 


--------------------------------------------------------------------------------
/io/dict/data/bacteria_genera_homonyms.txt:
--------------------------------------------------------------------------------
  1 | Acaciella
  2 | Actinocephalus
  3 | Actinomyces
  4 | Ahrensia
  5 | Amalia
  6 | Amphithrix
  7 | Ampullaria
  8 | Anabaena
  9 | Anabaenella
 10 | Arizona
 11 | Asterococcus
 12 | Bacillus
 13 | Bacteridium
 14 | Bacularia
 15 | Baicalia
 16 | Balfouria
 17 | Bartonia
 18 | Belmontia
 19 | Belonia
 20 | Bosea
 21 | Brachydactylus
 22 | Brachynema
 23 | Brachythrix
 24 | Bryantella
 25 | Buchnera
 26 | Calothrix
 27 | Catinella
 28 | Caulococcus
 29 | Celyphus
 30 | Centipeda
 31 | Chlamydia
 32 | Chondrococcus
 33 | Chondrostroma
 34 | Cladothrix
 35 | Coenonia
 36 | Cohnia
 37 | Coleonema
 38 | Corbularia
 39 | Coxiella
 40 | Cystocoleus
 41 | Dalmatella
 42 | Demetria
 43 | Dendractis
 44 | Desmonema
 45 | Dichothrix
 46 | Dietzia
 47 | Dillwynella
 48 | Diplocalyx
 49 | Diplocystis
 50 | Diplonema
 51 | Discomyces
 52 | Donovania
 53 | Edwardsiella
 54 | Ehrenbergia
 55 | Eleutheronema
 56 | Endonema
 57 | Endospora
 58 | Fergusonia
 59 | Fischera
 60 | Fistularia
 61 | Frondicola
 62 | Gemella
 63 | Girvanella
 64 | Godlewskia
 65 | Gordonia
 66 | Grahamia
 67 | Haematococcus
 68 | Hedstromia
 69 | Heteractis
 70 | Heterochroa
 71 | Holopedium
 72 | Homoeothrix
 73 | Hydrococcus
 74 | Hydrocoryne
 75 | Jonesia
 76 | Kingella
 77 | Lamprocystis
 78 | Lauterbornia
 79 | Lawsonia
 80 | Leptobasis
 81 | Leptochaete
 82 | Leptonema
 83 | Leptopogon
 84 | Leptothrix
 85 | Leucothrix
 86 | Linckia
 87 | Listerella
 88 | Lithococcus
 89 | Macrospora
 90 | Malacostroma
 91 | Mantellum
 92 | Mastigonema
 93 | Mazaea
 94 | Microchaete
 95 | Micrococcus
 96 | Microcyclus
 97 | Microcystis
 98 | Micromyces
 99 | Microsphaera
100 | Mojavia
101 | Montanoa
102 | Moorella
103 | Morganella
104 | Muellerina
105 | Muricauda
106 | Murraya
107 | Mycoderma
108 | Myxoderma
109 | Nevskia
110 | Nocardia
111 | Nodularia
112 | Not
113 | Oospora
114 | Ophiothrix
115 | Palmella
116 | Palmula
117 | Paracoccus
118 | Paraplectonema
119 | Pelagothrix
120 | Pellicularia
121 | Pelosphaera
122 | Petalonema
123 | Pirella
124 | Planococcus
125 | Podocapsa
126 | Polycystis
127 | Polythrix
128 | Proteus
129 | Pulvinaria
130 | Pycnostroma
131 | Raciborskia
132 | Reichenbachia
133 | Rhabdoderma
134 | Rhodobium
135 | Rhodococcus
136 | Rhodosphaera
137 | Ristella
138 | Rivularia
139 | Rosaria
140 | Rothia
141 | Sapromyces
142 | Sarcomyces
143 | Schineria
144 | Schizosiphon
145 | Schizothrix
146 | Schwartzia
147 | Sclerothrix
148 | Scytonema
149 | Serpula
150 | Shuttleworthia
151 | Sinaiella
152 | Siphonosphaera
153 | Slackia
154 | Sphaeronema
155 | Sphaerophorus
156 | Spirillum
157 | Spirochaeta
158 | Spironema
159 | Staurocladia
160 | Stella
161 | Stigmatella
162 | Stigonema
163 | Streptomyces
164 | Streptothrix
165 | Stylobasis
166 | Symbiotes
167 | Symphyonema
168 | Syntrophus
169 | Tetracoccus
170 | Thalassobius
171 | Thalpophila
172 | Thermus
173 | Tildenia
174 | Trichococcus
175 | Trichodesmium
176 | Trichophora
177 | Trichophorus
178 | Trichospira
179 | Tubercularia
180 | Undina
181 | Vaginaria
182 | Vesicularia
183 | Williamsia
184 | Xenococcus
185 | Xylophilus
186 | Yersinia
187 | Zonotrichia
188 | Zoogloea
189 | 


--------------------------------------------------------------------------------
/io/dict/dict.go:
--------------------------------------------------------------------------------
 1 | // Package dict provides lookup data for gnparser.
 2 | package dict
 3 | 
 4 | import (
 5 | 	"bufio"
 6 | 	"embed"
 7 | 	"fmt"
 8 | 	"log/slog"
 9 | 	"os"
10 | )
11 | 
// data holds the content of the data directory, embedded into the binary at
// build time. The dictionary text files are read from it.
//
//go:embed data
var data embed.FS

// Dict contains loaded dictionaries. It is initialized with the result of
// LoadDictionary when the package is initialized.
var Dict *Dictionary = LoadDictionary()
17 | 
// Dictionary contains dictionaries used for detecting information
// about scientific names.
type Dictionary struct {
	// Bacteria contains bacterial genera as keys. The boolean value is true
	// if we are aware of homonyms of the genus from other codes.
	Bacteria map[string]bool
	// AuthorICN contains family names of ICN authors of genera names.
	// This list is used to detect ICN name-strings, so that a word in
	// parentheses after the genus word can be parsed as an author instead of
	// a subgenus.
	AuthorICN map[string]struct{}
}
29 | 
30 | // LoadDictionary creates dictionary from text files.
31 | func LoadDictionary() *Dictionary {
32 | 	d := Dictionary{
33 | 		Bacteria:  readBacterialData(),
34 | 		AuthorICN: readAuthorICNData(),
35 | 	}
36 | 	return &d
37 | }
38 | 
39 | func readBacterialData() map[string]bool {
40 | 	m := make(map[string]bool)
41 | 	scanBacterialFile("bacteria_genera.txt", false, m)
42 | 	scanBacterialFile("bacteria_genera_homonyms.txt", true, m)
43 | 	return m
44 | }
45 | 
46 | func readAuthorICNData() map[string]struct{} {
47 | 	m := make(map[string]struct{})
48 | 	scanAuthorICNFIle("genera_auth_icn.txt", m)
49 | 	return m
50 | }
51 | 
52 | func scanAuthorICNFIle(path string, m map[string]struct{}) {
53 | 	path = fmt.Sprintf("data/%s", path)
54 | 	f, err := data.Open(path)
55 | 	if err != nil {
56 | 		slog.Error("Cannot open authors' file", "error", err, "path", path)
57 | 		os.Exit(1)
58 | 	}
59 | 	sc := bufio.NewScanner(f)
60 | 	for sc.Scan() {
61 | 		m[sc.Text()] = struct{}{}
62 | 	}
63 | }
64 | 
65 | func scanBacterialFile(path string, isHomonym bool, m map[string]bool) {
66 | 	path = fmt.Sprintf("data/%s", path)
67 | 	f, err := data.Open(path)
68 | 	if err != nil {
69 | 		slog.Error("Cannot open bacteria file", "error", err, "path", path)
70 | 	}
71 | 	sc := bufio.NewScanner(f)
72 | 	for sc.Scan() {
73 | 		m[sc.Text()] = isHomonym
74 | 	}
75 | }
76 | 


--------------------------------------------------------------------------------
/io/dict/dict_test.go:
--------------------------------------------------------------------------------
 1 | package dict_test
 2 | 
 3 | import (
 4 | 	"testing"
 5 | 
 6 | 	"github.com/gnames/gnparser/io/dict"
 7 | 	"github.com/stretchr/testify/assert"
 8 | )
 9 | 
10 | func TestLoadDictionary(t *testing.T) {
11 | 	d := dict.LoadDictionary()
12 | 	t.Run("loads bacterial dictionary", func(t *testing.T) {
13 | 		assert.Greater(t, len(d.Bacteria), 100)
14 | 	})
15 | 	t.Run("finds non homopypic genus", func(t *testing.T) {
16 | 		hom, ok := d.Bacteria["Sphingomonas"]
17 | 		assert.True(t, ok)
18 | 		assert.False(t, hom)
19 | 	})
20 | 	t.Run("finds homotypic genus", func(t *testing.T) {
21 | 		hom, ok := d.Bacteria["Arizona"]
22 | 		assert.True(t, ok)
23 | 		assert.True(t, hom)
24 | 	})
25 | 	t.Run("does not find non-bacterial genus", func(t *testing.T) {
26 | 		hom, ok := d.Bacteria["Homo"]
27 | 		assert.False(t, ok)
28 | 		assert.False(t, hom)
29 | 	})
30 | 	t.Run("does not find not ICN author", func(t *testing.T) {
31 | 		_, ok := d.AuthorICN["Arizona"]
32 | 		assert.False(t, ok)
33 | 	})
34 | 	t.Run("finds ICN author", func(t *testing.T) {
35 | 		_, ok := d.AuthorICN["Abramov"]
36 | 		assert.True(t, ok)
37 | 	})
38 | }
39 | 


--------------------------------------------------------------------------------
/io/web/gnparser_service.go:
--------------------------------------------------------------------------------
 1 | package web
 2 | 
 3 | import (
 4 | 	"github.com/gnames/gnparser"
 5 | )
 6 | 
// gnparserService combines a GNparser with the port on which the web service
// is served.
type gnparserService struct {
	gnparser.GNparser
	port int
}
11 | 
12 | // NewGNparserService creates a new object that implements GNparserService
13 | // interface.
14 | func NewGNparserService(gnp gnparser.GNparser, port int) GNparserService {
15 | 	res := gnparserService{
16 | 		GNparser: gnp,
17 | 		port:     port,
18 | 	}
19 | 	return &res
20 | }
21 | 
22 | // Ping is a method to check a liveliness of the service, returns "pong".
23 | func (gnps *gnparserService) Ping() string {
24 | 	return "pong"
25 | }
26 | 
27 | // Port returns the port of the service.
28 | func (gnps *gnparserService) Port() int {
29 | 	return gnps.port
30 | }
31 | 


--------------------------------------------------------------------------------
/io/web/interface.go:
--------------------------------------------------------------------------------
 1 | package web
 2 | 
 3 | import (
 4 | 	"github.com/gnames/gnparser"
 5 | )
 6 | 
// GNparserService is an interface that provides functionality for
// GNparser RESTful service. It extends gnparser.GNparser with the methods
// used by the web server.
type GNparserService interface {
	gnparser.GNparser
	// Ping is a method to check if the service is running. Returns "pong".
	Ping() string
	// Port returns the port of the service.
	Port() int
}
16 | 


--------------------------------------------------------------------------------
/io/web/server.go:
--------------------------------------------------------------------------------
  1 | package web
  2 | 
  3 | import (
  4 | 	"embed"
  5 | 	"fmt"
  6 | 	"log/slog"
  7 | 	"net/http"
  8 | 	"net/url"
  9 | 	"strings"
 10 | 	"time"
 11 | 
 12 | 	"github.com/gnames/gnfmt"
 13 | 	"github.com/gnames/gnlib/ent/nomcode"
 14 | 	"github.com/gnames/gnparser"
 15 | 	"github.com/gnames/gnparser/ent/parsed"
 16 | 	"github.com/labstack/echo/v4"
 17 | 	"github.com/labstack/echo/v4/middleware"
 18 | )
 19 | 
// static holds the content of the static directory, embedded into the binary
// at build time. It is served under the /static/ path.
//
//go:embed static
var static embed.FS
 22 | 
// inputREST is the body of a POST request to the parsing API.
//
// Names are the name-strings to parse. CSV selects CSV output instead of
// compact JSON. WithDetails and PreserveDiaereses are passed to the parser as
// the corresponding options. Code is a nomenclatural code given as a string.
type inputREST struct {
	Names             []string `json:"names"`
	CSV               bool     `json:"csv"`
	WithDetails       bool     `json:"withDetails"`
	PreserveDiaereses bool     `json:"preserveDiaereses"`
	Code              string   `json:"code"`

	// WithCultivars is deprecated in favor of Code and is overridden by it:
	// it takes effect only when Code does not resolve to a known code.
	WithCultivars bool `json:"withCultivars"`
}
 33 | 
 34 | // Run starts the GNparser web service and servies both RESTful API and
 35 | // a website.
 36 | func Run(gnps GNparserService) {
 37 | 	var err error
 38 | 
 39 | 	e := echo.New()
 40 | 
 41 | 	e.Renderer, err = NewTemplate()
 42 | 	if err != nil {
 43 | 		e.Logger.Fatal(err)
 44 | 	}
 45 | 
 46 | 	e.Use(middleware.Gzip())
 47 | 	e.Use(middleware.CORS())
 48 | 
 49 | 	e.GET("/", homeGET(gnps))
 50 | 	e.POST("/", homePOST(gnps))
 51 | 	e.GET("/doc/api", docAPI())
 52 | 	e.GET("/api", info())
 53 | 	e.GET("/api/v1", info())
 54 | 	e.GET("/api/v1/ping", ping(gnps))
 55 | 	e.GET("/api/v1/version", ver(gnps))
 56 | 	e.GET("/api/v1/:names", parseNamesGET(gnps))
 57 | 	e.GET("/api/:names", parseNamesGET(gnps))
 58 | 	e.POST("/api/v1/", parseNamesPOST(gnps))
 59 | 	e.POST("/api/", parseNamesPOST(gnps))
 60 | 
 61 | 	fs := http.FileServer(http.FS(static))
 62 | 	e.GET("/static/*", echo.WrapHandler(fs))
 63 | 
 64 | 	addr := fmt.Sprintf(":%d", gnps.Port())
 65 | 	s := &http.Server{
 66 | 		Addr:         addr,
 67 | 		ReadTimeout:  5 * time.Minute,
 68 | 		WriteTimeout: 5 * time.Minute,
 69 | 	}
 70 | 	e.Logger.Fatal(e.StartServer(s))
 71 | }
 72 | 
 73 | func info() func(c echo.Context) error {
 74 | 	return func(c echo.Context) error {
 75 | 		return c.String(
 76 | 			http.StatusOK,
 77 | 			`OpenAPI for gnparser is described at
 78 | 
 79 | https://apidoc.globalnames.org/gnparser`,
 80 | 		)
 81 | 	}
 82 | }
 83 | 
 84 | func ping(gnps GNparserService) func(echo.Context) error {
 85 | 	return func(c echo.Context) error {
 86 | 		result := gnps.Ping()
 87 | 		return c.String(http.StatusOK, result)
 88 | 	}
 89 | }
 90 | 
 91 | func ver(gnps GNparserService) func(echo.Context) error {
 92 | 	return func(c echo.Context) error {
 93 | 		version := gnps.GetVersion()
 94 | 		return c.JSON(http.StatusOK, version)
 95 | 	}
 96 | }
 97 | 
 98 | func parseNamesGET(gnps GNparserService) func(echo.Context) error {
 99 | 	return func(c echo.Context) error {
100 | 		nameStr, _ := url.QueryUnescape(c.Param("names"))
101 | 		csv := c.QueryParam("csv") == "true"
102 | 		det := c.QueryParam("with_details") == "true"
103 | 		cultivars := c.QueryParam("cultivars") == "true"
104 | 		diaereses := c.QueryParam("diaereses") == "true"
105 | 		codeStr := c.QueryParam("code")
106 | 
107 | 		code := getCode(codeStr, cultivars)
108 | 
109 | 		gnp := gnps.ChangeConfig(opts(code, csv, det, diaereses)...)
110 | 		names := strings.Split(nameStr, "|")
111 | 		res := gnp.ParseNames(names)
112 | 		if l := len(names); l > 0 {
113 | 			slog.Info("Parsed",
114 | 				"namesNum", l, "example", names[0],
115 | 				"parsedBy", "REST API", "method", "GET",
116 | 			)
117 | 		}
118 | 		return formatNames(c, res, gnp.Format())
119 | 	}
120 | }
121 | 
122 | func parseNamesPOST(gnps GNparserService) func(echo.Context) error {
123 | 	return func(c echo.Context) error {
124 | 		var input inputREST
125 | 		if err := c.Bind(&input); err != nil {
126 | 			return err
127 | 		}
128 | 
129 | 		if l := len(input.Names); l > 0 {
130 | 			slog.Info("Parsed",
131 | 				"namesNum", l,
132 | 				"example", input.Names[0],
133 | 				"parsedBy", "REST API",
134 | 				"method", "POST",
135 | 			)
136 | 		}
137 | 		code := getCode(input.Code, input.WithCultivars)
138 | 
139 | 		gnp := gnps.ChangeConfig(
140 | 			opts(code, input.CSV, input.WithDetails, input.PreserveDiaereses)...)
141 | 		res := gnp.ParseNames(input.Names)
142 | 		return formatNames(c, res, gnp.Format())
143 | 	}
144 | }
145 | 
146 | func getCode(codeStr string, cultivars bool) nomcode.Code {
147 | 	code := nomcode.Unknown
148 | 	if cultivars {
149 | 		code = nomcode.Cultivars
150 | 	}
151 | 	code2 := nomcode.New(codeStr)
152 | 	if code2 == nomcode.Unknown {
153 | 		return code
154 | 	}
155 | 	return code2
156 | }
157 | 
158 | func formatNames(
159 | 	c echo.Context,
160 | 	res []parsed.Parsed,
161 | 	f gnfmt.Format,
162 | ) error {
163 | 
164 | 	switch f {
165 | 	case gnfmt.CSV, gnfmt.TSV:
166 | 		resCSV := make([]string, 0, len(res)+1)
167 | 		resCSV = append(resCSV, parsed.HeaderCSV(f))
168 | 		for i := range res {
169 | 			resCSV = append(resCSV, res[i].Output(f))
170 | 		}
171 | 		return c.String(http.StatusOK, strings.Join(resCSV, "\n"))
172 | 	default:
173 | 		return c.JSON(http.StatusOK, res)
174 | 	}
175 | }
176 | 
177 | func opts(code nomcode.Code, csv, details, diaereses bool) []gnparser.Option {
178 | 	res := []gnparser.Option{
179 | 		gnparser.OptWithDetails(details),
180 | 		gnparser.OptCode(code),
181 | 		gnparser.OptWithPreserveDiaereses(diaereses),
182 | 	}
183 | 	if csv {
184 | 		res = append(res, gnparser.OptFormat(gnfmt.CSV))
185 | 	} else {
186 | 		res = append(res, gnparser.OptFormat(gnfmt.CompactJSON))
187 | 	}
188 | 
189 | 	return res
190 | }
191 | 


--------------------------------------------------------------------------------
/io/web/static/images/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gnames/gnparser/a2bee074a6d9f6b23be70c0d760540863645fa74/io/web/static/images/favicon.ico


--------------------------------------------------------------------------------
/io/web/static/images/github-mark.svg:
--------------------------------------------------------------------------------
1 | <svg width="98" height="96" xmlns="http://www.w3.org/2000/svg"><path fill-rule="evenodd" clip-rule="evenodd" d="M48.854 0C21.839 0 0 22 0 49.217c0 21.756 13.993 40.172 33.405 46.69 2.427.49 3.316-1.059 3.316-2.362 0-1.141-.08-5.052-.08-9.127-13.59 2.934-16.42-5.867-16.42-5.867-2.184-5.704-5.42-7.17-5.42-7.17-4.448-3.015.324-3.015.324-3.015 4.934.326 7.523 5.052 7.523 5.052 4.367 7.496 11.404 5.378 14.235 4.074.404-3.178 1.699-5.378 3.074-6.6-10.839-1.141-22.243-5.378-22.243-24.283 0-5.378 1.94-9.778 5.014-13.2-.485-1.222-2.184-6.275.486-13.038 0 0 4.125-1.304 13.426 5.052a46.97 46.97 0 0 1 12.214-1.63c4.125 0 8.33.571 12.213 1.63 9.302-6.356 13.427-5.052 13.427-5.052 2.67 6.763.97 11.816.485 13.038 3.155 3.422 5.015 7.822 5.015 13.2 0 18.905-11.404 23.06-22.324 24.283 1.78 1.548 3.316 4.481 3.316 9.126 0 6.6-.08 11.897-.08 13.526 0 1.304.89 2.853 3.316 2.364 19.412-6.52 33.405-24.935 33.405-46.691C97.707 22 75.788 0 48.854 0z" fill="#45704d"/></svg>
2 | 


--------------------------------------------------------------------------------
/io/web/static/images/gna.svg:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="utf-8"?>
 2 | <!-- Generator: Adobe Illustrator 19.1.0, SVG Export Plug-In . SVG Version: 6.00 Build 0)  -->
 3 | <svg version="1.1" id="Layer_1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" x="0px" y="0px"
 4 | 	 viewBox="-130 172 350 450" style="enable-background:new -130 172 350 450;" xml:space="preserve">
 5 | <style type="text/css">
 6 | 	.st0{stroke:#FFFFFF;stroke-width:2;stroke-miterlimit:10;}
 7 | 	.st1{fill:#007934;stroke:#FFFFFF;stroke-width:2;stroke-miterlimit:10;}
 8 | 	.st2{fill:#FDBA12;stroke:#FFFFFF;stroke-width:2;stroke-miterlimit:10;}
 9 | 	.st3{fill:#001689;stroke:#FFFFFF;stroke-width:2;stroke-miterlimit:10;}
10 | </style>
11 | <path class="st0" d="M-111.2,481.3c0-12.7,4.7-22.7,13.9-30.1c9.3-7.4,22-11,38-11h53.1v18.4h-28.1c4.7,4.8,7.9,9,9.7,12.9
12 | 	c1.8,3.9,2.7,8.4,2.7,13.4c0,6.2-1.8,12.3-5.4,18.3c-3.6,6-8.1,10.6-13.7,13.8c-5.6,3.2-14.7,5.7-27.3,7.7
13 | 	c-8.9,1.3-13.4,4.3-13.4,9.1c0,2.8,1.7,5,5,6.8c3.3,1.8,9.4,3.6,18.1,5.5c14.6,3.2,24,5.7,28.1,7.5c4.2,1.8,7.9,4.5,11.4,7.8
14 | 	c5.7,5.7,8.6,12.8,8.6,21.4c0,11.3-5,20.3-15,27s-23.5,10.1-40.4,10.1c-16.9,0-30.5-3.3-40.7-10.1s-15.2-15.8-15.2-27.2
15 | 	c0-16.1,9.9-26.4,29.9-31.1c-7.9-5.1-11.9-10.1-11.9-15.2c0-3.8,1.7-7.2,5.1-10.4c3.5-3.1,8-5.4,13.8-6.9
16 | 	C-102.5,511.2-111.2,498.6-111.2,481.3z M-72.5,564.7c-7.9,0-14.4,1.7-19.5,5c-5,3.3-7.6,7.7-7.6,12.9c0,12.3,11,18.4,33.1,18.4
17 | 	c10.5,0,18.5-1.6,24.3-4.6c5.7-3,8.6-7.4,8.6-12.9c0-5.5-3.6-10-10.8-13.5C-51.6,566.5-61,564.7-72.5,564.7z M-66.9,460.6
18 | 	c-6.4,0-11.8,2.1-16.3,6.5c-4.5,4.3-6.7,9.5-6.7,15.5c0,6.1,2.2,11.1,6.6,15.2c4.5,4,9.9,6,16.7,6c6.7,0,12.1-2,16.6-6.1
19 | 	c4.5-4.1,6.7-9.3,6.7-15.4c0-6.2-2.2-11.4-6.7-15.5C-54.6,462.7-60.2,460.6-66.9,460.6z"/>
20 | <path class="st0" d="M22.8,440.2v15.2c10.5-11.6,22.5-17.5,35.9-17.5c7.5,0,14.3,1.9,20.7,5.8c6.4,3.8,11.3,9.1,14.6,15.8
21 | 	c3.3,6.7,5,17.3,5,31.8v68.1H75.5v-67.8c0-12.1-1.9-20.8-5.6-26.1c-3.7-5.2-9.9-7.8-18.6-7.8c-11.1,0-20.6,5.6-28.3,16.7v85H-1
22 | 	V440.2H22.8z"/>
23 | <path class="st0" d="M188.6,487.7v50.6c0,4,1.3,6.1,4.1,6.1c2.8,0,7.2-2.1,13.3-6.4v14.4c-5.4,3.5-9.6,5.8-12.8,7
24 | 	c-3.2,1.2-6.6,1.9-10.1,1.9c-10.1,0-16.1-3.9-17.8-11.9c-10,7.8-20.6,11.6-32,11.6c-8.2,0-15.2-2.7-20.7-8.2
25 | 	c-5.6-5.5-8.2-12.4-8.2-20.6c0-7.5,2.7-14.2,8.1-20.1c5.4-5.9,13-10.6,23-14l30.1-10.4v-6.4c0-14.3-7.1-21.5-21.5-21.5
26 | 	c-12.8,0-27,5.2-39.2,18.5c0,0-0.7-7-2.2-13.3c-1-4.1-3.2-9-3.2-9c9.1-10.8,29.4-18.3,46.6-18.3c12.8,0,23.2,3.3,30.9,10.1
27 | 	c2.6,2.1,4.9,5,7,8.6c2.1,3.6,3.3,7.1,3.9,10.7C188.3,470.8,188.6,477.6,188.6,487.7z M165.5,535.6v-35.3l-15.8,6.1
28 | 	c-8,3.2-13.7,6.4-17.1,9.6c-3.3,3.2-5,7.2-5,12.1s1.6,8.9,4.7,12c3.1,3.1,7.2,4.7,12.3,4.7C152.1,544.9,159,541.7,165.5,535.6z"/>
29 | <path class="st1" d="M80.4,305.5c0-14-10.1-32.4-33-32.4c-22.1,0-33,15.5-33,32c0,14.2,11.9,29.6,11.9,29.6s6.4,8,13.3,26.6
30 | 	c6.5,17.2,7.4,39.9,7.5,35c0.1,5.1,1-17.7,7.5-35c6.9-18.6,13.3-26.6,13.3-26.6S80.4,319.4,80.4,305.5z"/>
31 | <path class="st2" d="M0.9,311.5c-6.9-12.3-24.6-23.4-44.6-12.3C-63,310-65,328.8-57,343.2c6.9,12.4,24.9,20.2,24.9,20.2
32 | 	s9.5,3.9,24.5,16.8c13.9,11.9,25.9,31.3,23.5,27c2.6,4.5-7.7-15.9-10.5-34.2c-3-19.6-1.3-29.8-1.3-29.8S7.5,323.5,0.9,311.5z"/>
33 | <path class="st3" d="M93.8,311.5c6.9-12.3,24.6-23.4,44.6-12.3c19.3,10.8,21.3,29.6,13.3,44c-6.9,12.4-24.9,20.2-24.9,20.2
34 | 	s-9.5,3.9-24.6,16.8c-13.9,11.9-25.9,31.3-23.5,27c-2.6,4.5,7.7-15.9,10.5-34.2c3-19.6,1.3-29.8,1.3-29.8S87,323.5,93.8,311.5z"/>
35 | <path class="st2" d="M-37.5,251.8c0,22.5-18.3,40.8-40.8,40.8s-40.8-18.3-40.8-40.8s18.3-40.8,40.8-40.8S-37.5,229.3-37.5,251.8z"/>
36 | <circle class="st1" cx="46.3" cy="214.9" r="40.8"/>
37 | <path class="st3" d="M211.8,249.8c0,22.5-18.3,40.8-40.8,40.8s-40.8-18.3-40.8-40.8S148.4,209,171,209
38 | 	C193.5,209,211.8,227.3,211.8,249.8z"/>
39 | </svg>
40 | 


--------------------------------------------------------------------------------
/io/web/static/styles/parser.css:
--------------------------------------------------------------------------------
/* Text area inside a .parser section: a full-width block, 7em tall,
   with 1em of space below it. */
.parser textarea {
  display: block;
  width: 100%;
  height: 7em;
  margin-bottom: 1em;
}
7 | 


--------------------------------------------------------------------------------
/io/web/templates.go:
--------------------------------------------------------------------------------
 1 | package web
 2 | 
 3 | import (
 4 | 	"embed"
 5 | 	"fmt"
 6 | 	"html/template"
 7 | 	"io"
 8 | 	"path"
 9 | 
10 | 	"github.com/gnames/gnfmt"
11 | 	"github.com/gnames/gnparser/ent/parsed"
12 | 	"github.com/labstack/echo/v4"
13 | )
14 | 
15 | //go:embed templates
16 | var tmpls embed.FS
17 | 
// echoTempl implements echo.Renderer interface by delegating to a parsed
// html/template set.
type echoTempl struct {
	// templates is the template set that Render executes named templates
	// from.
	templates *template.Template
}
22 | 
23 | // Render implements echo.Renderer interface.
24 | func (t *echoTempl) Render(
25 | 	w io.Writer,
26 | 	name string,
27 | 	data interface{},
28 | 	c echo.Context,
29 | ) error {
30 | 	return t.templates.ExecuteTemplate(w, name, data)
31 | }
32 | 
33 | func NewTemplate() (*echoTempl, error) {
34 | 	t, err := parseFiles()
35 | 	if err != nil {
36 | 		return nil, fmt.Errorf("cannot parse file %w", err)
37 | 	}
38 | 	return &echoTempl{t}, nil
39 | }
40 | 
41 | func parseFiles() (*template.Template, error) {
42 | 	var err error
43 | 	var t *template.Template
44 | 
45 | 	var filenames []string
46 | 	dir := "templates"
47 | 	entries, _ := tmpls.ReadDir(dir)
48 | 	for i := range entries {
49 | 		if entries[i].Type().IsRegular() {
50 | 			filenames = append(
51 | 				filenames,
52 | 				fmt.Sprintf("%s/%s", dir, entries[i].Name()),
53 | 			)
54 | 		}
55 | 	}
56 | 
57 | 	for _, filename := range filenames {
58 | 		name := path.Base(filename)
59 | 		var tmpl *template.Template
60 | 		if t == nil {
61 | 			t = template.New(name)
62 | 		}
63 | 		if name == t.Name() {
64 | 			tmpl = t
65 | 		} else {
66 | 			tmpl = t.New(name)
67 | 		}
68 | 		addFuncs(tmpl)
69 | 		_, err = tmpl.ParseFS(tmpls, filename)
70 | 		if err != nil {
71 | 			return nil, err
72 | 		}
73 | 	}
74 | 	return t, nil
75 | }
76 | 
77 | func addFuncs(tmpl *template.Template) {
78 | 	tmpl.Funcs(template.FuncMap{
79 | 		"parsedJSON": func(p parsed.Parsed) string {
80 | 			return p.Output(gnfmt.PrettyJSON)
81 | 		},
82 | 	})
83 | }
84 | 


--------------------------------------------------------------------------------
/io/web/templates/doc_api.html:
--------------------------------------------------------------------------------
 1 | {{ define "doc" }}
 2 |   <section class="parser api">
 3 |     <div class="grid">
 4 |       <div class="unit whole">
 5 |         <h2 id="api">Application Programming Interface (API)</h2>
 6 | 
        <p>The web-based parser service includes a RESTful interface to the
        parsing functionality. Both GET and POST methods are supported.</p>
 9 | 
10 |         <h3 id="get">GET</h3>
11 | 
12 |         <p>
        Append a list of name-strings separated by vertical lines ('|') to
        the API URL. Make sure that any '&amp;' in the names is escaped as
        '%26', and that spaces are escaped as '+'.
16 |         </p>
17 | 
18 |         <p>
19 |         <code>/api/v1/Aus+bus|Aus+bus+D.+%26+M.,+1870</code>
20 |         </p>
21 | 
22 |         <h3 id="post">POST</h3>
23 | 
24 |         <p><code>/api/v1</code></p>
25 | 
26 |         <p>
27 |         with request body of JSON array of strings
28 |         </p>
29 | 
30 |         <h3> OpenAPI Schema</h3>
31 |         <p>
32 |         Read the GNparser's
33 |         <a href="https://apidoc.globalnames.org/gnparser">
34 |           OpenAPI documentation
35 |         </a> to learn about all options and the output schema.
36 |         </p>
37 |       </div>
38 |     </div>
39 |   </section>
40 |   {{ end }}
41 | 


--------------------------------------------------------------------------------
/io/web/templates/home.html:
--------------------------------------------------------------------------------
 1 | {{ define "home" }}
 2 | <section class="parser">
 3 |   <div class="grid">
 4 |     <div class="unit whole">
 5 |       <form action="/" method="post">
 6 |         <div class="form-elements">
 7 |           <label for="format">Output format</label>
 8 |           <select id="format" name="format">
 9 |             <option value="html">HTML</option>
10 |             <option value="json">JSON</option>
11 |             <option value="csv">CSV</option>
12 |             <option value="tsv">TSV</option>
13 |           </select>
14 |           <label for="with_details">Show details</label>
15 |           <input
16 |             type="checkbox"
17 |             id="with_details"
18 |             name="with_details"
19 |             checked="checked"
20 |           />
21 |           <label for="code">Nomenclatural Code</label>
22 |           <select id="code" name="code">
23 |             <option value="">Any</option>
24 |             <option value="bacterial" {{ if eq .Code "bacterial" }}selected="selected"{{ end }}>Bacterial</option>
25 |             <option value="botanical" {{ if eq .Code "botanical" }}selected="selected"{{ end }}>Botanical</option>
26 |             <option value="cultivar" {{ if eq .Code "cultivar" }}selected="selected"{{ end }}>Cultivar</option>
27 |             <option value="zoological" {{ if eq .Code "zoological" }}selected="selected"{{ end }}>Zoological</option>
28 |           </select>
29 |           <label for="diaereses">Preserve diaereses</label>
30 |           <input type="checkbox" id="diaereses" name="diaereses" />
31 |         </div>
32 |         <textarea
33 |           autofocus
34 |           id="names"
35 |           name="names"
36 |           placeholder="Add up to 5000 names, one per line"
37 |         >
38 | {{.Input}}</textarea
39 |         >
40 |         <input type="submit" value="Parse" />
41 |       </form>
42 |     </div>
43 |   </div>
44 | </section>
45 | {{ if .Parsed }} {{if eq .Format "html" }}
46 | <section class="parser results">
47 |   <div class="grid">
48 |     <div class="unit whole">
49 |       <h4>Results:</h4>
50 |       {{ range .Parsed }}
51 |       <p>
52 |         <code class="unit whole" style="margin-bottom: 1em"
53 |           >{{ parsedJSON . }}</code
54 |         >
55 |       </p>
56 |       <p></p>
57 |       {{ end }}
58 |     </div>
59 |   </div>
60 | </section>
61 | {{ end }} {{ end }} {{ end }}
62 | 


--------------------------------------------------------------------------------
/io/web/templates/layout.html:
--------------------------------------------------------------------------------
  1 | {{ define "layout" }}
  2 | <!doctype html>
  3 | <html>
  4 |   <head>
  5 |     <meta charset="UTF-8" />
  6 |     <title>GNparser</title>
  7 |     <meta content="width=device-width,initial-scale=1" name="viewport" />
  8 |     <link href="/static/styles/screen.css" rel="stylesheet" />
  9 |     <link href="/static/styles/parser.css" rel="stylesheet" />
 10 |     <link href="/static/images/favicon.ico" rel="icon" type="image/x-icon" />
 11 |   </head>
 12 | 
 13 |   <body class="wrap">
 14 |     <header role="banner">
 15 |       <nav class="mobile-nav show-on-mobiles">
 16 |         <ul>
          <li {{ if .HomePage }}class="current"{{ end }}>
 21 |             <a href="/">Parser</a>
 22 |           </li>
          <li {{ if not .HomePage }}class="current"{{ end }}>
 27 |             <a href="/doc/api">API</a>
 28 |           </li>
 29 |           <li>
 30 |             <a href="https://github.com/gnames/gnparser/blob/master/README.md"
 31 |               ><span class="hide-on-mobiles">Doc on</span> GitHub</a
 32 |             >
 33 |           </li>
 34 |           <li>
 35 |             <a href="https://globalnames.org/apps">Projects</a>
 36 |           </li>
 37 |         </ul>
 38 |       </nav>
 39 |       <div class="grid">
 40 |         <div class="unit one-quarter center-on-mobiles">
 41 |           <div class="logo">
 42 |             <a href="/">
 43 |               <span class="sr-only">GlobalNames</span>
 44 |               <img alt="GNA Logo" src="/static/images/gna.svg" width="72" />
 45 |             </a>
 46 |           </div>
 47 |         </div>
 48 |         <nav class="main-nav unit three-quarters hide-on-mobiles">
 49 |           <ul>
            <li {{ if .HomePage }}class="current"{{ end }}>
 54 |               <a href="/">Parser</a>
 55 |             </li>
            <li {{ if not .HomePage }}class="current"{{ end }}>
 60 |               <a href="/doc/api">API</a>
 61 |             </li>
 62 |             <li>
 63 |               <a href="https://github.com/gnames/gnparser/blob/master/README.md"
 64 |                 ><span class="hide-on-mobiles">Doc on</span> GitHub</a
 65 |               >
 66 |             </li>
 67 |             <li>
 68 |               <a href="https://github.com/gnames/gnparser/issues">Issues</a>
 69 |             </li>
 70 |           </ul>
 71 |         </nav>
 72 |       </div>
 73 |     </header>
 74 |     <section class="intro">
 75 |       <div class="grid">
 76 |         <div class="unit whole center-on-mobiles">
 77 |           <h1>Global Names Parser</h1>
 78 |           <h4>Scientific Names in Detail</h4>
 79 |         </div>
 80 |       </div>
 81 |     </section>
 82 | 
 83 |     {{ if .HomePage }} {{ template "home" . }} {{ else }} {{ template "doc" .}}
 84 |     {{ end }}
 85 | 
 86 |     <section class="footer">
 87 |       <div class="grid">
 88 |         <div class="unit whole center-on-mobiles" style="margin: 1em 0 1em 0" />
 89 |         <div align="center" id="version">
 90 |           <a href="https://github.com/gnames/gnparser">
 91 |             <img src="/static/images/github-mark.svg" alt="GitHub link" />
 92 |             Version {{ .Version }}
 93 |           </a>
 94 |         </div>
 95 |         <div
 96 |           class="unit whole center-on-mobiles"
 97 |           style="border-top: 1px solid grey; margin: 1em 0 1em 0"
 98 |         />
 99 |         <div class="unit whole center-on-mobiles">
100 |           <div align="center">
101 |             <span><a href="https://globalnames.org">Global Names</a></span> |
102 |             <span
103 |               ><a href="https://parser.globalnames.org"
104 |                 >Global Names Parser</a
105 |               ></span
106 |             >
107 |             |
108 |             <span
109 |               ><a href="https://finder.globalnames.org"
110 |                 >Global Names Finder</a
111 |               ></span
112 |             >
113 |             |
114 |             <span
115 |               ><a href="https://verifier.globalnames.org"
116 |                 >Global Names Verifier</a
117 |               ></span
118 |             >
119 |           </div>
120 |         </div>
121 |       </div>
122 |     </section>
123 |   </body>
124 | </html>
125 | {{ end }}
126 | 


--------------------------------------------------------------------------------
/io/web/web.go:
--------------------------------------------------------------------------------
  1 | // Package web provides RESTful API service and a website for gnparser.
  2 | package web
  3 | 
  4 | import (
  5 | 	"fmt"
  6 | 	"log/slog"
  7 | 	"net/http"
  8 | 	"net/url"
  9 | 	"strings"
 10 | 
 11 | 	"github.com/gnames/gnfmt"
 12 | 	"github.com/gnames/gnlib/ent/nomcode"
 13 | 	"github.com/gnames/gnparser"
 14 | 	"github.com/gnames/gnparser/ent/parsed"
 15 | 	"github.com/labstack/echo/v4"
 16 | )
 17 | 
// inputFORM is used to collect data from HTML form. Every field can be
// bound either from URL query parameters or from form values. The
// checkbox-like fields (WithDetails, WithCultivars, PreserveDiaereses) are
// kept as raw strings and are treated as enabled only when their value
// is "on".
type inputFORM struct {
	Names       string `query:"names"        form:"names"`
	Code        string `query:"code"         form:"code"`
	Format      string `query:"format"       form:"format"`
	WithDetails string `query:"with_details" form:"with_details"`

	// WithCultivars is deprecated and overridden by Code.
	WithCultivars     string `query:"cultivars" form:"cultivars"`
	PreserveDiaereses string `query:"diaereses" form:"diaereses"`
}
 29 | 
// Data contains information required to render web-pages. HomePage selects
// between the parser page and the API documentation page in the layout
// template; Input, Code, Format and Parsed carry the submitted names, the
// requested nomenclatural code, the output format and the parsing results;
// Version is shown in the page footer.
type Data struct {
	Input       string
	Parsed      []parsed.Parsed
	Code        string
	Format      string
	HomePage    bool
	Version     string
	WithDetails bool
	// WithCultivars is deprecated in favor of the Code field.
	WithCultivars     bool
	PreserveDiaereses bool
}
 43 | 
 44 | // NewData creates new Data for web-page templates.
 45 | func newData(isHome bool) *Data {
 46 | 	return &Data{HomePage: isHome, Format: "html", Version: gnparser.Version}
 47 | }
 48 | 
 49 | func homePOST(gnps GNparserService) func(echo.Context) error {
 50 | 	return func(c echo.Context) error {
 51 | 		inp := new(inputFORM)
 52 | 		data := newData(true)
 53 | 
 54 | 		err := c.Bind(inp)
 55 | 		if err != nil {
 56 | 			return err
 57 | 		}
 58 | 
 59 | 		if strings.TrimSpace(inp.Names) == "" {
 60 | 			return c.Redirect(http.StatusFound, "")
 61 | 		}
 62 | 
 63 | 		if strings.Count(inp.Names, "\n") < 20 {
 64 | 			return redirectToHomeGET(c, inp)
 65 | 		}
 66 | 
 67 | 		return parsingResults(c, gnps, inp, data)
 68 | 	}
 69 | }
 70 | 
 71 | func redirectToHomeGET(c echo.Context, inp *inputFORM) error {
 72 | 	withDetails := inp.WithDetails == "on"
 73 | 	withCultivars := inp.WithCultivars == "on"
 74 | 	preserveDiaereses := inp.PreserveDiaereses == "on"
 75 | 	q := make(url.Values)
 76 | 	q.Set("names", inp.Names)
 77 | 	q.Set("format", inp.Format)
 78 | 	if withDetails {
 79 | 		q.Set("with_details", inp.WithDetails)
 80 | 	}
 81 | 	if withCultivars {
 82 | 		q.Set("cultivars", inp.WithCultivars)
 83 | 	}
 84 | 	if preserveDiaereses {
 85 | 		q.Set("diaereses", inp.PreserveDiaereses)
 86 | 	}
 87 | 	q.Set("code", inp.Code)
 88 | 
 89 | 	url := fmt.Sprintf("/?%s", q.Encode())
 90 | 	return c.Redirect(http.StatusFound, url)
 91 | }
 92 | 
 93 | func homeGET(gnps GNparserService) func(echo.Context) error {
 94 | 	return func(c echo.Context) error {
 95 | 		data := newData(true)
 96 | 
 97 | 		inp := new(inputFORM)
 98 | 		err := c.Bind(inp)
 99 | 		if err != nil {
100 | 			return err
101 | 		}
102 | 
103 | 		if strings.TrimSpace(inp.Names) == "" {
104 | 			return c.Render(http.StatusOK, "layout", data)
105 | 		}
106 | 
107 | 		return parsingResults(c, gnps, inp, data)
108 | 	}
109 | }
110 | 
111 | func parsingResults(
112 | 	c echo.Context,
113 | 	gnps GNparserService,
114 | 	inp *inputFORM,
115 | 	data *Data,
116 | ) error {
117 | 	var names []string
118 | 	data.WithDetails = inp.WithDetails == "on"
119 | 	data.WithCultivars = inp.WithCultivars == "on"
120 | 	data.PreserveDiaereses = inp.PreserveDiaereses == "on"
121 | 	data.Code = inp.Code
122 | 
123 | 	format := inp.Format
124 | 	if format == "csv" || format == "tsv" || format == "json" {
125 | 		data.Format = format
126 | 	}
127 | 
128 | 	data.Input = strings.TrimSpace(inp.Names)
129 | 	split := strings.Split(data.Input, "\n")
130 | 	if len(split) > 5_000 {
131 | 		split = split[0:5_000]
132 | 	}
133 | 
134 | 	names = make([]string, len(split))
135 | 	for i := range split {
136 | 		names[i] = strings.TrimSpace(split[i])
137 | 	}
138 | 	if l := len(names); l > 0 {
139 | 		slog.Info("Parsed",
140 | 			"namesNum", l,
141 | 			"example", names[0],
142 | 			"parsedBy", "WEB GUI",
143 | 		)
144 | 	}
145 | 	data.Input = strings.Join(names, "\n")
146 | 
147 | 	opts := []gnparser.Option{
148 | 		gnparser.OptWithDetails(data.WithDetails),
149 | 		gnparser.OptWithPreserveDiaereses(data.PreserveDiaereses),
150 | 	}
151 | 
152 | 	if data.WithCultivars {
153 | 		opts = append(opts, gnparser.OptCode(nomcode.Cultivars))
154 | 	}
155 | 
156 | 	code := nomcode.New(data.Code)
157 | 	if code != nomcode.Unknown {
158 | 		// overrides data.WithCultivars
159 | 		opts = append(opts, gnparser.OptCode(code))
160 | 	}
161 | 
162 | 	gnp := gnps.ChangeConfig(opts...)
163 | 	data.Parsed = gnp.ParseNames(names)
164 | 
165 | 	switch data.Format {
166 | 	case "json":
167 | 		return c.JSON(http.StatusOK, data.Parsed)
168 | 	case "csv", "tsv":
169 | 		f := gnfmt.CSV
170 | 		if data.Format == "tsv" {
171 | 			f = gnfmt.TSV
172 | 		}
173 | 
174 | 		res := make([]string, len(data.Parsed)+1)
175 | 		res[0] = parsed.HeaderCSV(f)
176 | 		for i := range data.Parsed {
177 | 			res[i+1] = data.Parsed[i].Output(f)
178 | 		}
179 | 		return c.String(http.StatusOK, strings.Join(res, "\n"))
180 | 	default:
181 | 		return c.Render(http.StatusOK, "layout", data)
182 | 	}
183 | }
184 | 
185 | func docAPI() func(echo.Context) error {
186 | 	return func(c echo.Context) error {
187 | 		data := newData(false)
188 | 		return c.Render(http.StatusOK, "layout", data)
189 | 	}
190 | }
191 | 


--------------------------------------------------------------------------------
/io/web/web_internal_test.go:
--------------------------------------------------------------------------------
  1 | package web
  2 | 
  3 | import (
  4 | 	"bytes"
  5 | 	"net/http"
  6 | 	"net/http/httptest"
  7 | 	"net/url"
  8 | 	"strings"
  9 | 	"testing"
 10 | 
 11 | 	"github.com/gnames/gnfmt"
 12 | 	"github.com/gnames/gnlib/ent/gnvers"
 13 | 	"github.com/gnames/gnparser"
 14 | 	"github.com/gnames/gnparser/ent/parsed"
 15 | 	"github.com/labstack/echo/v4"
 16 | 	"github.com/stretchr/testify/assert"
 17 | )
 18 | 
 19 | func handlerGET(path string) (echo.Context, *httptest.ResponseRecorder) {
 20 | 	req := httptest.NewRequest(http.MethodGet, path, nil)
 21 | 	rec := httptest.NewRecorder()
 22 | 	e := echo.New()
 23 | 	e.Renderer, _ = NewTemplate()
 24 | 	c := e.NewContext(req, rec)
 25 | 	return c, rec
 26 | }
 27 | 
 28 | func TestHome(t *testing.T) {
 29 | 	var err error
 30 | 	cfg := gnparser.NewConfig(gnparser.OptFormat(gnfmt.CompactJSON))
 31 | 	gnp := gnparser.New(cfg)
 32 | 	gnps := NewGNparserService(gnp, 0)
 33 | 
 34 | 	req := httptest.NewRequest(http.MethodGet, "/", nil)
 35 | 	rec := httptest.NewRecorder()
 36 | 	e := echo.New()
 37 | 	c := e.NewContext(req, rec)
 38 | 	e.Renderer, err = NewTemplate()
 39 | 	assert.Nil(t, err)
 40 | 
 41 | 	assert.Nil(t, homePOST(gnps)(c))
 42 | 	assert.Equal(t, http.StatusFound, rec.Code)
 43 | }
 44 | 
 45 | // func TestDocAPI(t *testing.T) {
 46 | // 	c, rec := handlerGET("/doc/api")
 47 | // 	assert.Nil(t, docAPI()(c))
 48 | // 	assert.Equal(t, http.StatusOK, rec.Code)
 49 | // 	assert.Contains(t, rec.Body.String(), "Application Programming Interface")
 50 | // }
 51 | 
 52 | func TestInfo(t *testing.T) {
 53 | 	c, rec := handlerGET("/")
 54 | 
 55 | 	assert.Nil(t, info()(c))
 56 | 	assert.Equal(t, http.StatusOK, rec.Code)
 57 | 	assert.Contains(t, rec.Body.String(), "OpenAPI")
 58 | }
 59 | 
 60 | func TestPing(t *testing.T) {
 61 | 	cfg := gnparser.NewConfig(gnparser.OptFormat(gnfmt.CompactJSON))
 62 | 	gnp := gnparser.New(cfg)
 63 | 	gnps := NewGNparserService(gnp, 0)
 64 | 	c, rec := handlerGET("/ping")
 65 | 
 66 | 	assert.Nil(t, ping(gnps)(c))
 67 | 	assert.Equal(t, http.StatusOK, rec.Code)
 68 | 	assert.Equal(t, "pong", rec.Body.String())
 69 | }
 70 | 
 71 | func TestVer(t *testing.T) {
 72 | 	cfg := gnparser.NewConfig(gnparser.OptFormat(gnfmt.CompactJSON))
 73 | 	gnp := gnparser.New(cfg)
 74 | 	gnps := NewGNparserService(gnp, 0)
 75 | 	c, rec := handlerGET("/version")
 76 | 
 77 | 	assert.Nil(t, ver(gnps)(c))
 78 | 	enc := gnfmt.GNjson{}
 79 | 	var response gnvers.Version
 80 | 	err := enc.Decode(rec.Body.Bytes(), &response)
 81 | 	assert.Nil(t, err)
 82 | 	assert.Regexp(t, `^v\d+\.\d+\.\d+`, response.Version)
 83 | }
 84 | 
 85 | func TestParseGET(t *testing.T) {
 86 | 	cfg := gnparser.NewConfig(gnparser.OptFormat(gnfmt.CompactJSON))
 87 | 	gnp := gnparser.New(cfg)
 88 | 	gnps := NewGNparserService(gnp, 0)
 89 | 
 90 | 	var response []parsed.Parsed
 91 | 	names := []string{
 92 | 		"Not name", "Bubo bubo", "Pomatomus",
 93 | 		"Pardosa moesta", "Plantago major var major",
 94 | 		"Cytospora ribis mitovirus 2",
 95 | 		"A-shaped rods", "Alb. alba",
 96 | 		"Pisonia grandis", "Acacia vestita may",
 97 | 	}
 98 | 	request := strings.Join(
 99 | 		names,
100 | 		"|",
101 | 	)
102 | 	namesQuery := url.QueryEscape(request)
103 | 	path := "/" + namesQuery
104 | 
105 | 	c, rec := handlerGET(path)
106 | 	c.SetPath("/:names")
107 | 	c.SetParamNames("names")
108 | 	c.SetParamValues(namesQuery)
109 | 
110 | 	assert.Nil(t, parseNamesGET(gnps)(c))
111 | 
112 | 	enc := gnfmt.GNjson{}
113 | 	err := enc.Decode(rec.Body.Bytes(), &response)
114 | 	assert.Nil(t, err)
115 | 
116 | 	assert.Equal(t, len(names), len(response))
117 | 	for i, v := range response {
118 | 		switch i {
119 | 		case 0:
120 | 			assert.Equal(t, "Not name", v.Verbatim, v.Verbatim)
121 | 			assert.False(t, v.Parsed, v.Verbatim)
122 | 		case 1:
123 | 			assert.Equal(t, "Bubo bubo", v.Verbatim, v.Verbatim)
124 | 			assert.Equal(t, "Bubo bubo", v.Canonical.Simple)
125 | 		}
126 | 	}
127 | }
128 | 
129 | func TestParseParamsGET(t *testing.T) {
130 | 	cfg := gnparser.NewConfig(gnparser.OptFormat(gnfmt.CompactJSON))
131 | 	gnp := gnparser.New(cfg)
132 | 	gnps := NewGNparserService(gnp, 0)
133 | 
134 | 	name := url.QueryEscape("Bubo bubo")
135 | 	tests := []struct {
136 | 		csv, det, startsWith, pattern string
137 | 		contains                      bool
138 | 	}{
139 | 		{"true", "false", "Id", "[", false},
140 | 		{"true", "true", "Id", "[", false},
141 | 		{"false", "false", "[", "details", false},
142 | 		{"false", "true", "[", "details", true},
143 | 	}
144 | 
145 | 	for _, v := range tests {
146 | 		e := echo.New()
147 | 		q := make(url.Values)
148 | 		q.Set("csv", v.csv)
149 | 		q.Set("with_details", v.det)
150 | 		req := httptest.NewRequest(http.MethodGet, "/?"+q.Encode(), nil)
151 | 		rec := httptest.NewRecorder()
152 | 		c := e.NewContext(req, rec)
153 | 		c.SetPath("/:names")
154 | 		c.SetParamNames("names")
155 | 		c.SetParamValues(name)
156 | 
157 | 		assert.Nil(t, parseNamesGET(gnps)(c))
158 | 
159 | 		body := rec.Body.String()
160 | 		assert.True(t, strings.HasPrefix(body, v.startsWith))
161 | 		if v.contains {
162 | 			assert.True(t, strings.Contains(body, v.pattern))
163 | 		} else {
164 | 			assert.False(t, strings.HasPrefix(body, v.pattern))
165 | 		}
166 | 	}
167 | }
168 | 
169 | func TestParsePOST(t *testing.T) {
170 | 	cfg := gnparser.NewConfig(gnparser.OptFormat(gnfmt.CompactJSON))
171 | 	gnp := gnparser.New(cfg)
172 | 	gnps := NewGNparserService(gnp, 0)
173 | 
174 | 	var response []parsed.Parsed
175 | 	names := []string{
176 | 		"Not name", "Bubo bubo", "Leptochloöpsis virgata",
177 | 		"Pomatomus", "Pardosa moesta",
178 | 		"Plantago major var major",
179 | 		"Cytospora ribis mitovirus 2",
180 | 		"A-shaped rods", "Alb. alba",
181 | 		"Pisonia grandis", "Acacia vestita may",
182 | 		"Sarracenia flava 'Maxima'",
183 | 	}
184 | 	params := inputREST{
185 | 		Names:             names,
186 | 		CSV:               false,
187 | 		WithDetails:       false,
188 | 		WithCultivars:     true,
189 | 		PreserveDiaereses: true,
190 | 	}
191 | 	reqBody, err := gnfmt.GNjson{}.Encode(params)
192 | 	assert.Nil(t, err)
193 | 	r := bytes.NewReader(reqBody)
194 | 	req := httptest.NewRequest(http.MethodPost, "/api/v1", r)
195 | 	req.Header.Set(echo.HeaderContentType, echo.MIMEApplicationJSON)
196 | 	rec := httptest.NewRecorder()
197 | 	e := echo.New()
198 | 	c := e.NewContext(req, rec)
199 | 
200 | 	assert.Nil(t, parseNamesPOST(gnps)(c))
201 | 
202 | 	enc := gnfmt.GNjson{}
203 | 	err = enc.Decode(rec.Body.Bytes(), &response)
204 | 	assert.Nil(t, err)
205 | 
206 | 	assert.Equal(t, len(names), len(response))
207 | 	for i, v := range response {
208 | 		switch i {
209 | 		case 0:
210 | 			assert.Equal(t, "Not name", v.Verbatim, v.Verbatim)
211 | 			assert.False(t, v.Parsed, v.Verbatim)
212 | 		case 1:
213 | 			assert.Equal(t, "Bubo bubo", v.Verbatim, v.Verbatim)
214 | 			assert.Equal(t, "Bubo bubo", v.Canonical.Simple)
215 | 		case 2:
216 | 			assert.Equal(t, "Leptochloöpsis virgata", v.Verbatim, v.Verbatim)
217 | 			assert.Equal(t, "Leptochloöpsis virgata", v.Canonical.Simple)
218 | 		case 11:
219 | 			assert.Equal(t, "Sarracenia flava ‘Maxima’", v.Normalized)
220 | 			assert.Equal(t, 3, v.Cardinality)
221 | 		}
222 | 
223 | 	}
224 | 
225 | 	params = inputREST{
226 | 		Names:             names,
227 | 		CSV:               true,
228 | 		WithDetails:       false,
229 | 		WithCultivars:     false,
230 | 		PreserveDiaereses: false,
231 | 	}
232 | 	reqBody, err = gnfmt.GNjson{}.Encode(params)
233 | 	r = bytes.NewReader(reqBody)
234 | 	req = httptest.NewRequest(http.MethodPost, "/api/v1", r)
235 | 	req.Header.Set(echo.HeaderContentType, echo.MIMEApplicationJSON)
236 | 	rec = httptest.NewRecorder()
237 | 	c = e.NewContext(req, rec)
238 | 	assert.Nil(t, parseNamesPOST(gnps)(c))
239 | 	assert.True(t, strings.HasPrefix(rec.Body.String(), "Id"))
240 | }
241 | 


--------------------------------------------------------------------------------
/man/gnparser.1:
--------------------------------------------------------------------------------
  1 | .\" generated with Ronn/v0.7.3
  2 | .\" http://github.com/rtomayko/ronn/tree/0.7.3
  3 | .
  4 | .TH "GNPARSER" "1" "November 2021" "" ""
  5 | .
  6 | .SH "NAME"
  7 | \fBgnparser\fR \- parse biodiversity scientific names
  8 | .
  9 | .SH "SYNOPSIS"
 10 | \fBgnparser\fR [OPTION\.\.\.] [TERM/FILE]
 11 | .
 12 | .SH "DESCRIPTION"
 13 | \fBGNparser\fR breaks biodiversity scientific names into their structural elements\. For example it finds that a genus in \fIHomo sapiens\fR is \fIHomo\fR\.
 14 | .
 15 | .P
 16 | It can be used for one name, or for many names in a file (one name per line)\.
 17 | .
 18 | .SH "USAGE"
 19 | .
 20 | .SS "Usage for one name"
 21 | .
 22 | .nf
 23 | 
 24 | gnparser "Pleurosigma vitrea var\. kjellmanii H\.Peragallo, 1891"
 25 | 
 26 | # CSV output (default)
 27 | gnparser "Parus major Linnaeus, 1788"
 28 | # or
 29 | gnparser \-f csv "Parus major Linnaeus, 1788"
 30 | 
# TSV output
 32 | gnparser \-f tsv "Parus major Linnaeus, 1788"
 33 | 
 34 | # JSON compact format
 35 | gnparser "Parus major Linnaeus, 1788" \-f compact
 36 | 
 37 | # pretty format
 38 | gnparser \-f pretty "Parus major Linnaeus, 1788"
 39 | 
 40 | # to parse a name from the standard input
 41 | echo "Parus major Linnaeus, 1788" | gnparser
 42 | .
 43 | .fi
 44 | .
 45 | .SS "Usage for many names in a file"
 46 | There is no flag for parsing a file\. If parser finds the given file path on your computer, it will parse the content of the file, assuming that every line is a new scientific name\. If the file path is not found, gnparser will try to parse the "path" as a scientific name\.
 47 | .
 48 | .P
 49 | Parsed results will stream to STDOUT, while progress of the parsing will be directed to STDERR\.
 50 | .
 51 | .IP "" 4
 52 | .
 53 | .nf
 54 | 
 55 | # to parse with 200 parallel processes
 56 | gnparser \-j 200 names\.txt > names_parsed\.csv
 57 | 
 58 | # to parse file with more detailed output
 59 | gnparser names\.txt \-d \-f compact > names_parsed\.txt
 60 | 
 61 | # to parse files using pipes
 62 | cat names\.txt | gnparser \-f csv \-j 200 > names_parsed\.csv
 63 | 
 64 | # to parse using stream method instead of batch method\.
 65 | cat names\.txt | gnparser \-s > names_parsed\.csv
 66 | 
 67 | # to not remove html tags and entities during parsing\. You gain a bit of
 68 | # performance with this option if your data does not contain HTML tags or
 69 | # entities\.
 70 | gnparser "<i>Pomatomus</i>&nbsp;<i>saltator</i>"
 71 | gnparser \-i "<i>Pomatomus</i>&nbsp;<i>saltator</i>"
 72 | gnparser \-i "Pomatomus saltator"
 73 | .
 74 | .fi
 75 | .
 76 | .IP "" 0
 77 | .
 78 | .SH "GNPARSER SETTINGS"
 79 | .
 80 | .SS "\-h, \-\-help"
 81 | Prints help information:
 82 | .
 83 | .IP "" 4
 84 | .
 85 | .nf
 86 | 
 87 | gnparser \-h
 88 | .
 89 | .fi
 90 | .
 91 | .IP "" 0
 92 | .
 93 | .SS "\-b, \-\-batch_size (values: positive integers, default 50,000)"
 94 | Sets a maximum number of names collected into a batch before processing\. This flag is ignored, if parsing is applied to only one name or if parsing mode is set to streaming with \-s flag:
 95 | .
 96 | .P
 97 | gnparser \-b 100 names\.txt
 98 | .
 99 | .SS "\-c, \-\-capitalize"
100 | Capitalizes the first letter of a name\-string before parsing:
101 | .
102 | .P
103 | gnparser "homo sapiens" \-c
104 | .
105 | .SS "\-C, \-\-cultivar"
Parses given name/s according to the International Code of Nomenclature for Cultivated Plants (ICNCP):
107 | .
108 | .P
gnparser "Sarracenia flava \'Maxima\'" \-C
.br
gnparser "Cytisus purpureus + Laburnum anagyroides" \-C
110 | .
111 | .SS "\-D, \-\-diaereses"
112 | Preserves diaereses present in names:
113 | .
114 | .IP "" 4
115 | .
116 | .nf
117 | 
118 | gnparser "Leptochloöpsis virgata" \-D
119 | .
120 | .fi
121 | .
122 | .IP "" 0
123 | .
124 | .P
125 | The stemmed canonical name will be generated without diaereses\.
126 | .
127 | .SS "\-d, \-\-details"
128 | Return more details for a parsed name\. This flag is ignored for CSV formatting:
129 | .
130 | .IP "" 4
131 | .
132 | .nf
133 | 
134 | gnparser "Pardosa moesta Banks, 1982" \-d \-f pretty
135 | .
136 | .fi
137 | .
138 | .IP "" 0
139 | .
140 | .SS "\-f, \-\-format"
141 | Determines an output format\. Can be \fBcsv\fR, \fBtsv\fR, \fBcompact\fR, \fBpretty\fR\. Default is \fBcsv\fR\.
142 | .
143 | .P
144 | The default \fBcsv\fR format returns a header row and the CSV\-compatible parsed result:
145 | .
146 | .IP "" 4
147 | .
148 | .nf
149 | 
150 | gnparser "Pardosa moesta"
151 | .
152 | .fi
153 | .
154 | .IP "" 0
155 | .
156 | .P
157 | The \fBtsv\fR format returns a header row and a tab\-delimited output:
158 | .
159 | .IP "" 4
160 | .
161 | .nf
162 | 
163 | gnparser "Pardosa moesta" \-f tsv
164 | .
165 | .fi
166 | .
167 | .IP "" 0
168 | .
169 | .P
170 | The \fBcompact\fR format returns a JSON\-encoded result without indentations and new lines:
171 | .
172 | .IP "" 4
173 | .
174 | .nf
175 | 
176 | gnparser "Pardosa moesta" \-f compact
177 | .
178 | .fi
179 | .
180 | .IP "" 0
181 | .
182 | .P
183 | The \fBpretty\fR format returns a JSON\-encoded result in a more human\-readable form:
184 | .
185 | .IP "" 4
186 | .
187 | .nf
188 | 
189 | gnparser "Pardosa moesta" \-f pretty
190 | .
191 | .fi
192 | .
193 | .IP "" 0
194 | .
195 | .SS "\-i, \-\-ignore_tags"
196 | By default \fBgnparser\fR scans names for HTML tags and removes them before parsing\. It slows the process slightly\. If there are no HTML tags in names (no names are like \fB<i>Aus bus</i> L\.\fR), this flag allows skipping the HTML removal step, increasing performance slightly:
197 | .
198 | .IP "" 4
199 | .
200 | .nf
201 | 
202 | gnparser \-i plain\-text\-names\.txt
203 | .
204 | .fi
205 | .
206 | .IP "" 0
207 | .
208 | .SS "\-j, \-\-jobs (positive integer, default is a number of CPUs on a machine)"
209 | The number of jobs running concurrently\. This flag is ignored when parsing one name:
210 | .
211 | .IP "" 4
212 | .
213 | .nf
214 | 
215 | gnparser \-j 200 names\.txt
216 | .
217 | .fi
218 | .
219 | .IP "" 0
220 | .
221 | .SS "\-p, \-\-port (port number)"
222 | Sets a port for the web\-interface and RESTful API, and starts an HTTP service on this port:
223 | .
224 | .IP "" 4
225 | .
226 | .nf
227 | 
228 | gnparser \-p 80
229 | .
230 | .fi
231 | .
232 | .IP "" 0
233 | .
234 | .SS "\-s, \-\-stream"
235 | Changes parsing method for large number of names from \fBbatch\fR to \fBstream\fR\. If this flag is set, gnparser can be used from any language application using pipe\-in/pipe\-out methods\. Such an approach requires sending 1 name at a time to gnparser instead of sending names in batches\. Streaming allows to achieve that, but there is a slight decrease in performance:
236 | .
237 | .IP "" 4
238 | .
239 | .nf
240 | 
241 | gnparser \-s names\.json
242 | .
243 | .fi
244 | .
245 | .IP "" 0
246 | .
247 | .SS "\-u, \-\-unordered"
248 | If this flag is on, output and input order will not be synchronized\. If there is only one parsing job running (\fB\-j\fR flag), the input and output will be of the same order even if \fB\-u\fR flag is given\.
249 | .
250 | .IP "" 4
251 | .
252 | .nf
253 | 
254 | gnparser \-u \-j 100 names\.txt
255 | .
256 | .fi
257 | .
258 | .IP "" 0
259 | .
260 | .SS "\-V, \-\-version"
261 | Shows the version number of gnparser\.
262 | .
263 | .SH "COPYRIGHT"
264 | The MIT License (MIT)
265 | .
266 | .P
267 | Copyright (c) 2018\-2022 Dmitry Mozzherin
268 | .
269 | .SH "Contributors"
270 | Toby Marsden, Geoffrey Ower, Hernan Lucas Pereira
271 | 


--------------------------------------------------------------------------------
/man/gnparser.1.ronn:
--------------------------------------------------------------------------------
  1 | # gnparser -- parse biodiversity scientific names
  2 | 
  3 | ## SYNOPSIS
  4 | 
  5 | **gnparser** [OPTION...] [TERM/FILE]
  6 | 
  7 | ## DESCRIPTION
  8 | 
  9 | **GNparser** breaks biodiversity scientific names into their structural
 10 | elements. For example it finds that a genus in *Homo sapiens* is *Homo*.
 11 | 
 12 | It can be used for one name, or for many names in a file (one name per line).
 13 | 
 14 | ## USAGE
 15 | 
 16 | ### Usage for one name
 17 | 
 18 |     gnparser "Pleurosigma vitrea var. kjellmanii H.Peragallo, 1891"
 19 | 
 20 |     # CSV output (default)
 21 |     gnparser "Parus major Linnaeus, 1788"
 22 |     # or
 23 |     gnparser -f csv "Parus major Linnaeus, 1788"
 24 | 
 25 |     # TSV output
 26 |     gnparser -f tsv "Parus major Linnaeus, 1788"
 27 | 
 28 |     # JSON compact format
 29 |     gnparser "Parus major Linnaeus, 1788" -f compact
 30 | 
 31 |     # pretty format
 32 |     gnparser -f pretty "Parus major Linnaeus, 1788"
 33 | 
 34 |     # to parse a name from the standard input
 35 |     echo "Parus major Linnaeus, 1788" | gnparser
 36 | 
 37 | ### Usage for many names in a file
 38 | 
 39 | There is no flag for parsing a file. If parser finds the given file path on
 40 | your computer, it will parse the content of the file, assuming that every line
 41 | is a new scientific name. If the file path is not found, gnparser will try to
 42 | parse the "path" as a scientific name.
 43 | 
 44 | Parsed results will stream to STDOUT, while progress of the parsing will be
 45 | directed to STDERR.
 46 | 
 47 |     # to parse with 200 parallel processes
 48 |     gnparser -j 200 names.txt > names_parsed.csv
 49 | 
 50 |     # to parse file with more detailed output
 51 |     gnparser names.txt -d -f compact > names_parsed.txt
 52 | 
 53 |     # to parse files using pipes
 54 |     cat names.txt | gnparser -f csv -j 200 > names_parsed.csv
 55 | 
 56 |     # to parse using stream method instead of batch method.
 57 |     cat names.txt | gnparser -s > names_parsed.csv
 58 | 
 59 |     # to not remove html tags and entities during parsing. You gain a bit of
 60 |     # performance with this option if your data does not contain HTML tags or
 61 |     # entities.
 62 |     gnparser "<i>Pomatomus</i>&nbsp;<i>saltator</i>"
 63 |     gnparser -i "<i>Pomatomus</i>&nbsp;<i>saltator</i>"
 64 |     gnparser -i "Pomatomus saltator"
 65 | 
 66 | ## GNPARSER SETTINGS
 67 | 
 68 | ### -h, --help
 69 | 
 70 | Prints help information:
 71 | 
 72 |     gnparser -h
 73 | 
 74 | ### -b, --batch_size (values: positive integers, default 50,000)
 75 | 
 76 | Sets a maximum number of names collected into a batch before processing.
 77 | This flag is ignored, if parsing is applied to only one name or
 78 | if parsing mode is set to streaming with -s flag:
 79 | 
 80 |     gnparser -b 100 names.txt
 81 | 
 82 | ### -c, --capitalize
 83 | 
 84 | Capitalizes the first letter of a name-string before parsing:
 85 | 
 86 |     gnparser "homo sapiens" -c
 87 | 
 88 | ### -C, --cultivar
 89 | 
 90 | Parses given name/s according to the Code of Cultivar Plants:
 91 | 
 92 |     gnparser "Sarracenia flava 'Maxima'" -C
 93 |     gnparser "Cytisus purpureus + Laburnum anagyroides" -C
 94 | 
 95 | ### -D, --diaereses
 96 | 
 97 | Preserves diaereses present in names:
 98 | 
 99 |     gnparser "Leptochloöpsis virgata" -D
100 | 
101 | The stemmed canonical name will be generated without diaereses.
102 | 
103 | ### -d, --details
104 | 
105 | Return more details for a parsed name. This flag is ignored for CSV formatting:
106 | 
107 |     gnparser "Pardosa moesta Banks, 1982" -d -f pretty
108 | 
109 | ### -f, --format
110 | 
111 | Determines an output format. Can be `csv`, `tsv`, `compact`, `pretty`.
112 | Default is `csv`.
113 | 
114 | The default `csv` format returns a header row and the CSV-compatible
115 | parsed result:
116 | 
117 |     gnparser "Pardosa moesta"
118 | 
119 | The `tsv` format returns a header row and a tab-delimited output:
120 | 
121 |     gnparser "Pardosa moesta" -f tsv
122 | 
123 | The `compact` format returns a JSON-encoded result without indentations and
124 | new lines:
125 | 
126 |     gnparser "Pardosa moesta" -f compact
127 | 
128 | The `pretty` format returns a JSON-encoded result in a more human-readable
129 | form:
130 | 
131 |     gnparser "Pardosa moesta" -f pretty
132 | 
133 | ### -i, --ignore_tags
134 | 
135 | By default `gnparser` scans names for HTML tags and removes them before
136 | parsing. It slows the process slightly. If there are no HTML tags in names
137 | (no names are like `<i>Aus bus</i> L.`), this flag allows skipping the HTML
138 | removal step, increasing performance slightly:
139 | 
140 |     gnparser -i plain-text-names.txt
141 | 
142 | ### -j, --jobs (positive integer, default is a number of CPUs on a machine)
143 | 
144 | The number of jobs running concurrently. This flag is ignored when parsing
145 | one name:
146 | 
147 |     gnparser -j 200 names.txt
148 | 
149 | ### -p, --port (port number)
150 | 
151 | Sets a port for the web-interface and RESTful API, and starts an HTTP service
152 | on this port:
153 | 
154 |     gnparser -p 80
155 | 
156 | ### -s, --stream
157 | 
158 | Changes parsing method for large number of names from `batch` to `stream`.
159 | If this flag is set, gnparser can be used from any language application
160 | using pipe-in/pipe-out methods. Such an approach requires sending 1 name
161 | at a time to gnparser instead of sending names in batches. Streaming makes
162 | that possible, but there is a slight decrease in performance:
163 | 
164 |     gnparser -s names.json
165 | 
166 | ### -u, --unordered
167 | 
168 | If this flag is on, output and input order will not be synchronized. If there
169 | is only one parsing job running (`-j` flag), the input and output will be of
170 | the same order even if `-u` flag is given.
171 | 
172 |     gnparser -u -j 100 names.txt
173 | 
174 | ### -V, --version
175 | 
176 | Shows the version number of gnparser.
177 | 
178 | 
179 | ## COPYRIGHT
180 | 
181 | The MIT License (MIT)
182 | 
183 | Copyright (c) 2018-2022 Dmitry Mozzherin
184 | 
185 | ## Contributors
186 | 
187 | Toby Marsden, Geoffrey Ower, Hernan Lucas Pereira
188 | 


--------------------------------------------------------------------------------
/nsqd.dat:
--------------------------------------------------------------------------------
1 | {"topics":[{"channels":[],"name":"test","paused":false}],"version":"1.2.1"}


--------------------------------------------------------------------------------
/quality.md:
--------------------------------------------------------------------------------
 1 | # Quality categories
 2 | 
 3 | ## Quality 0
 4 | 
 5 | Parsing failed.
 6 | 
 7 | ## Quality 1
 8 | 
 9 | Parsing finished without detecting any problems.
10 | 
11 | ## Quality 2
12 | 
13 | - Abbreviated subgenus
14 | - Ambiguity: subgenus or superspecies found
15 | - Ambiguous f. (filius or forma)
16 | - Apparent genus with capital character after hyphen
17 | - Author in upper case
18 | - Author is unknown
19 | - Bacterial `Candidatus` name
20 | - Combination of two uninomials
21 | - Cultivar epithet
22 | - Deprecated Greek letter enumeration in rank
23 | - Emend authors are not required
24 | - `ex` authors are not required
25 | - Hybrid formula
26 | - Misplaced basionym year
27 | - Multiple adjacent space characters
28 | - Named hybrid
29 | - Non-standard characters in canonical
30 | - Non-standard space characters
31 | - Ambiguity: ICN author or subgenus
32 | - Probably incomplete hybrid formula
33 | - Spanish 'y' is used instead of '&'
34 | - Trailing whitespace
35 | - Year with latin character
36 | - Year with page info
37 | - Year with parentheses
38 | - Year with period
39 | - Year with question mark
40 | 
41 | ## Quality 3
42 | 
43 | - Apostrophe is not allowed in canonical
44 | - Author is too short
45 | - HTML tags or entities in the name
46 | - Hybrid char is not separated by space
47 | - Not an ASCII apostrophe
48 | - Numeric prefix
49 | - Uncommon rank
50 | - Year with square brackets
51 | - Years range
52 | - `emend` without a period
53 | - `ex` ends with a period
54 | - `in` ends with a period
55 | 
56 | ## Quality 4
57 | 
58 | - Abbreviated uninomial word
59 | - Author as a question mark
60 | - Authorship in double parentheses
61 | - Authorship is missing one parenthesis
62 | - Incomplete hybrid formula
63 | - Incorrect conversion to UTF-8
64 | - Name comparison
65 | - Name is approximate
66 | - Name starts with low-case character
67 | - Uninomial word with question mark
68 | - Unparsed tail
69 | 


--------------------------------------------------------------------------------
/shell.nix:
--------------------------------------------------------------------------------
1 | { mkShell, go, gopls }:
2 | mkShell rec {
3 |   buildInputs = [ go gopls ];
4 | }
5 | 


--------------------------------------------------------------------------------
/testdata/exceptions.txt:
--------------------------------------------------------------------------------
 1 | Aleuroclava complex Singh, 1931
 2 | Allawrencius complex Lawrence, 1953
 3 | Bolbodeomyia complex Theobald, 1910
 4 | Castelnaudia spec (Darlington, 1962)
 5 | Cicada complex Walker, 1850
 6 | Dichostasia complex Yochelson, 1956
 7 | Dimorphoceras complex (Moore, 1939)
 8 | Dischidia complex Griff.
 9 | Ecnomus complex Mosely, 1932
10 | Fusinus complex M. A. Snyder, 2000
11 | Fusinus pauciliratus complex M. A. Snyder, 2000
12 | Gonatobotrys complex Jane Walker & Minter
13 | Heizmannia (Heizmannia) complex (Theobald, 1910)
14 | Hemicloeina spec Platnick, 2002
15 | Libystica complex Holland, 1894
16 | Notozomus spec (Harvey, 1992)
17 | Ochodaeus complex LeConte, 1868
18 | Odontella do J Najt, & WM Weiner
19 | Oecetis complex Hwang, 1957
20 | Oedipina complex (Dunn, 1924)
21 | Oedipus complex Dunn, 1924
22 | Oedopinola complex (Dunn, 1924)
23 | Paradimorphoceras complex (Moore, 1939)
24 | Parentia do Bickel, 2002
25 | Phyllospongia complex de Laubenfels, 1954
26 | Plectrocnemia complex Hwang, 1958
27 | Rubus complex L. H. Bailey
28 | Sceliphron complex Kohl, 1918
29 | Sceliphron fossuliferum complex Kohl, 1918
30 | Scopaeus (Scopaeus) complex Sharp, 1874
31 | Scopaeus complex Sharp, 1874
32 | Sigipinius complex Golovatch, 2013
33 | Stegosoladidus complex Berge, 2001
34 | Tetracis complex Sharp, 1874
35 | Tetramorium do Forel, 1914
36 | Trichosternus spec Darlington, 1962
37 | Trisephena complex Medler, 1990
38 | 


--------------------------------------------------------------------------------
/tools/gentest.go:
--------------------------------------------------------------------------------
  1 | //go:build ignore
  2 | // +build ignore
  3 | 
  4 | // Generates new *_new.md files out of test_data.md and test_data_cultivars.md
  5 | // using current parser output. We need to do this in cases when parser output
  6 | // is modified. Run `go run gentest.go` from the tools directory.
  7 | package main
  8 | 
  9 | import (
 10 | 	"bufio"
 11 | 	"os"
 12 | 	"path/filepath"
 13 | 	"strings"
 14 | 
 15 | 	"github.com/gnames/gnfmt"
 16 | 	"github.com/gnames/gnlib/ent/nomcode"
 17 | 	"github.com/gnames/gnparser"
 18 | 	"github.com/gnames/gnparser/ent/parsed"
 19 | )
 20 | 
 21 | // genTestData rebuilds the "_new" counterpart of each known test-data file
 22 | // and stops at the first file that cannot be regenerated.
 23 | func genTestData() error {
 24 | 	for _, name := range []string{"test_data", "test_data_cultivars"} {
 25 | 		if err := newTestFile(name); err != nil {
 26 | 			return err
 27 | 		}
 28 | 	}
 29 | 	return nil
 30 | }
 31 | 
 32 | // newTestFile regenerates ../testdata/<file>_new.md from ../testdata/<file>.md
 33 | // using the current parser output. Each "Name: " line starts a record: the
 34 | // 2nd, 4th and 7th lines after it are replaced with the canonical form, the
 35 | // authorship and the JSON output. It returns the first error encountered.
 36 | func newTestFile(file string) (err error) {
 37 | 	enc := gnfmt.GNjson{}
 38 | 	f, err := os.Open(filepath.Join("..", "testdata", file+".md"))
 39 | 	if err != nil {
 40 | 		return err
 41 | 	}
 42 | 	defer f.Close() // deferred first so a failed os.Create cannot leak f
 43 | 	out, err := os.Create(filepath.Join("..", "testdata", file+"_new.md"))
 44 | 	if err != nil {
 45 | 		return err
 46 | 	}
 47 | 	// Flush and Close errors mean a truncated output file, so report them.
 48 | 	w := bufio.NewWriter(out)
 49 | 	defer func() {
 50 | 		if e := w.Flush(); err == nil {
 51 | 			err = e
 52 | 		}
 53 | 		if e := out.Close(); err == nil {
 54 | 			err = e
 55 | 		}
 56 | 	}()
 57 | 	opts := []gnparser.Option{gnparser.OptIsTest(true), gnparser.OptWithDetails(true)}
 58 | 	if file == "test_data_cultivars" {
 59 | 		opts = append(opts, gnparser.OptCode(nomcode.Cultivars))
 60 | 	}
 61 | 	gnp := gnparser.New(gnparser.NewConfig(opts...))
 62 | 	var res parsed.Parsed
 63 | 	var count int // lines consumed since the current "Name: " line
 64 | 	var can, au string
 65 | 	var jsonData []byte
 66 | 	isName := false // true while inside a record
 67 | 	sc := bufio.NewScanner(f)
 68 | 	for sc.Scan() {
 69 | 		line := sc.Text()
 70 | 		if !isName {
 71 | 			if strings.HasPrefix(line, "Name: ") {
 72 | 				isName = true
 73 | 				res = gnp.ParseName(strings.TrimPrefix(line, "Name: "))
 74 | 				if jsonData, err = enc.Encode(res); err != nil {
 75 | 					return err
 76 | 				}
 77 | 				if res.Parsed {
 78 | 					can = res.Canonical.Full
 79 | 					if res.Authorship != nil {
 80 | 						au = res.Authorship.Normalized
 81 | 					}
 82 | 				}
 83 | 			}
 84 | 		} else {
 85 | 			count++
 86 | 			switch count {
 87 | 			case 2: // Canonical: name_here
 88 | 				line = strings.TrimRight("Canonical: "+can, " ")
 89 | 			case 4: // Authorship
 90 | 				line = strings.TrimRight("Authorship: "+au, " ")
 91 | 			case 7: // JSON output; this line also ends the record
 92 | 				line = string(jsonData)
 93 | 				count, isName, can, au, jsonData = 0, false, "", "", nil
 94 | 			}
 95 | 		}
 96 | 		if _, err = w.WriteString(line + "\n"); err != nil {
 97 | 			return err
 98 | 		}
 99 | 	}
100 | 	return sc.Err()
101 | }
102 | 
103 | // main regenerates the test-data files. On failure it prints the error to
104 | // stderr and exits with status 1 instead of silently succeeding.
105 | func main() {
106 | 	if err := genTestData(); err != nil {
107 | 		os.Stderr.WriteString(err.Error() + "\n")
108 | 		os.Exit(1)
109 | 	}
110 | }
111 | 


--------------------------------------------------------------------------------
/tools/quality.go:
--------------------------------------------------------------------------------
 1 | //go:build ignore
 2 | // +build ignore
 3 | 
 4 | // quality.go generates a markdown file that describes meaning of each quality
 5 | // category.
 6 | package main
 7 | 
 8 | import (
 9 | 	"fmt"
10 | 	"slices"
11 | 
12 | 	"github.com/gnames/gnparser/ent/parsed"
13 | )
14 | // body is the static head of the generated markdown; main appends to it.
15 | var body = `# Quality categories
16 | 
17 | ## Quality 0
18 | 
19 | Parsing failed.
20 | 
21 | ## Quality 1
22 | 
23 | Parsing finished without detecting any problems.`
24 | 
25 | // main groups warning descriptions by quality level and prints the markdown
26 | // document: the static header followed by a sorted list for levels 2 to 4.
27 | func main() {
28 | 	byQuality := make(map[int][]string)
29 | 	for warn, quality := range parsed.WarningQualityMap {
30 | 		byQuality[quality] = append(byQuality[quality], warn.String())
31 | 	}
32 | 
33 | 	for quality := 2; quality <= 4; quality++ {
34 | 		names := byQuality[quality]
35 | 		slices.Sort(names)
36 | 		body += fmt.Sprintf("\n\n## Quality %d\n", quality)
37 | 		for _, name := range names {
38 | 			body += "\n- " + name
39 | 		}
40 | 	}
41 | 	fmt.Println(body)
42 | }
43 | 


--------------------------------------------------------------------------------
/version.go:
--------------------------------------------------------------------------------
 1 | package gnparser
 2 | 
 3 | var (
 4 | 	// Version is the version of the gnparser package. When the Makefile is
 5 | 	// used for a build, the version is calculated from Git tags.
 6 | 	Version = "v1.11.6"
 7 | 	// Build is a timestamp of when the Makefile was used to compile
 8 | 	// the gnparser code. If plain `go build` was used, Build stays empty.
 9 | 	Build string
10 | )
11 | 


--------------------------------------------------------------------------------