├── .github ├── CONTRIBUTOR_AGREEMENT.md ├── FUNDING.yml ├── ISSUE_TEMPLATE │ ├── 01_bugs.md │ ├── 02_docs.md │ └── config.yml ├── PULL_REQUEST_TEMPLATE.md ├── contributors │ ├── 0x2b3bfa0.md │ ├── 5hirish.md │ ├── ALSchwalm.md │ ├── AMArostegui.md │ ├── AlJohri.md │ ├── Arvindcheenu.md │ ├── AyushExel.md │ ├── Azagh3l.md │ ├── Baciccin.md │ ├── Bharat123rox.md │ ├── BigstickCarpet.md │ ├── BramVanroy.md │ ├── BreakBB.md │ ├── Bri-Will.md │ ├── Brixjohn.md │ ├── Cinnamy.md │ ├── DeNeutoy.md │ ├── DimaBryuhanov.md │ ├── Dobita21.md │ ├── DoomCoder.md │ ├── DuyguA.md │ ├── EARL_GREYT.md │ ├── Eleni170.md │ ├── EmilStenstrom.md │ ├── F0rge1cE.md │ ├── FallakAsad.md │ ├── GiorgioPorgio.md │ ├── Gizzio.md │ ├── GuiGel.md │ ├── Hazoom.md │ ├── HiromuHota.md │ ├── ICLRandD.md │ ├── IsaacHaze.md │ ├── JKhakpour.md │ ├── Jan-711.md │ ├── JannisTriesToCode.md │ ├── Jette16.md │ ├── KKsharma99.md │ ├── KennethEnevoldsen.md │ ├── Kimahriman.md │ ├── LRAbbade.md │ ├── Loghijiaha.md │ ├── Lucaterre.md │ ├── MartinoMensio.md │ ├── MateuszOlko.md │ ├── MathiasDesch.md │ ├── MiniLau.md │ ├── MisterKeefe.md │ ├── Mlawrence95.md │ ├── NSchrading.md │ ├── NirantK.md │ ├── Nuccy90.md │ ├── Olamyy.md │ ├── Pantalaymon.md │ ├── Pavle992.md │ ├── PeterGilles.md │ ├── PluieElectrique.md │ ├── Poluglottos.md │ ├── PolyglotOpenstreetmap.md │ ├── R1j1t.md │ ├── RvanNieuwpoort.md │ ├── SamEdwardes.md │ ├── SamuelLKane.md │ ├── Schibsted.png │ ├── Stannislav.md │ ├── Tiljander.md │ ├── YohannesDatasci.md │ ├── ZeeD.md │ ├── aajanki.md │ ├── aaronkub.md │ ├── aashishg.md │ ├── abchapman93.md │ ├── abhi18av.md │ ├── adrianeboyd.md │ ├── adrienball.md │ ├── ajrader.md │ ├── akki2825.md │ ├── akornilo.md │ ├── alexcombessie.md │ ├── alexvy86.md │ ├── aliiae.md │ ├── alldefector.md │ ├── alvaroabascar.md │ ├── alvations.md │ ├── ameyuuno.md │ ├── amitness.md │ ├── amperinet.md │ ├── aniruddha-adhikary.md │ ├── ansgar-t.md │ ├── aongko.md │ ├── aristorinjuang.md │ ├── armsp.md │ ├── aryaprabhudesai.md │ ├── askhogan.md │ ├── avadhpatel.md │ ├── avi197.md │ ├── avramandrei.md │ ├── azarezade.md │ ├── b1uec0in.md │ ├── bbieniek.md │ ├── bdewilde.md │ ├── beatesi.md │ ├── bellabie.md │ ├── bintay.md │ ├── bittlingmayer.md │ ├── bjascob.md │ ├── bodak.md │ ├── boena.md │ ├── borijang.md │ ├── bratao.md │ ├── broaddeep.md │ ├── bryant1410.md │ ├── bsweileh.md │ ├── btrungchi.md │ ├── calumcalder.md │ ├── cbilgili.md │ ├── cclauss.md │ ├── cedar101.md │ ├── celikomer.md │ ├── ceteri.md │ ├── charlax.md │ ├── chezou.md │ ├── chopeen.md │ ├── chrisdubois.md │ ├── cicorias.md │ ├── clarus.md │ ├── clippered.md │ ├── connorbrinton.md │ ├── coryhurst.md │ ├── cristianasp.md │ ├── d99kris.md │ ├── danielhers.md │ ├── danielkingai2.md │ ├── danielruf.md │ ├── danielvasic.md │ ├── dardoria.md │ ├── darindf.md │ ├── delzac.md │ ├── demfier.md │ ├── demongolem.md │ ├── dhpollack.md │ ├── dhruvrnaik.md │ ├── doug-descombaz.md │ ├── drndos.md │ ├── dvsrepo.md │ ├── elbaulp.md │ ├── elben10 │ ├── emulbreh.md │ ├── enerrio.md │ ├── er-raoniz.md │ ├── erip.md │ ├── estr4ng7d.md │ ├── ezorita.md │ ├── fgaim.md │ ├── filipecaixeta.md │ ├── fizban99.md │ ├── florijanstamenkovic.md │ ├── fonfonx.md │ ├── forest1988.md │ ├── foufaster.md │ ├── frascuchon.md │ ├── free-variation.md │ ├── fsonntag.md │ ├── fucking-signup.md │ ├── gandersen101.md │ ├── gavrieltal.md │ ├── giannisdaras.md │ ├── graue70.md │ ├── graus.md │ ├── greenriverrus.md │ ├── grivaz.md │ ├── gtoffoli.md │ ├── guerda.md │ ├── gustavengstrom.md │ ├── henry860916.md │ ├── hertelm.md │ 
├── himkt.md │ ├── hiroshi-matsuda-rit.md │ ├── hlasse.md │ ├── holubvl3.md │ ├── honnibal.md │ ├── howl-anderson.md │ ├── hugovk.md │ ├── iann0036.md │ ├── idealley.md │ ├── idoshr.md │ ├── iechevarria.md │ ├── ilivans.md │ ├── ines.md │ ├── intrafindBreno.md │ ├── isaric.md │ ├── iurshina.md │ ├── ivigamberdiev.md │ ├── ivyleavedtoadflax.md │ ├── jabortell.md │ ├── jacopofar.md │ ├── jacse.md │ ├── janimo.md │ ├── jankrepl.md │ ├── jarib.md │ ├── jaydeepborkar.md │ ├── jbesomi.md │ ├── jeannefukumaru.md │ ├── jenojp.md │ ├── jerbob92.md │ ├── jganseman.md │ ├── jgutix.md │ ├── jimregan.md │ ├── jklaise.md │ ├── jmargeta.md │ ├── jmyerston.md │ ├── johnhaley81.md │ ├── jonesmartins.md │ ├── juliamakogon.md │ ├── julien-talkair.md │ ├── juliensalinas.md │ ├── jumasheff.md │ ├── justindujardin.md │ ├── kabirkhan.md │ ├── katarkor.md │ ├── katrinleinweber.md │ ├── kbulygin.md │ ├── keshan.md │ ├── keshav.md │ ├── kevinlu1248.md │ ├── khellan.md │ ├── kimfalk.md │ ├── knoxdw.md │ ├── koaning.md │ ├── kognate.md │ ├── kororo.md │ ├── kowaalczyk.md │ ├── kwhumphreys.md │ ├── laszabine.md │ ├── lauraBaakman.md │ ├── ldorigo.md │ ├── leicmi.md │ ├── leomrocha.md │ ├── leyendecker.md │ ├── lfiedler.md │ ├── ligser.md │ ├── lizhe2004.md │ ├── lorenanda.md │ ├── louisguitton.md │ ├── luvogels.md │ ├── mabraham.md │ ├── magnusburton.md │ ├── mahnerak.md │ ├── mariosasko.md │ ├── markulrich.md │ ├── mauryaland.md │ ├── mbkupfer.md │ ├── mdaudali.md │ ├── mdcclv.md │ ├── mdda.md │ ├── meghanabhange.md │ ├── melanuria.pdf │ ├── merrcury.md │ ├── michael-k.md │ ├── mihaigliga21.md │ ├── mikeizbicki.md │ ├── mikelibg.md │ ├── mirfan899.md │ ├── miroli.md │ ├── mmaybeno.md │ ├── mn3mos.md │ ├── mollerhoj.md │ ├── moreymat.md │ ├── mpszumowski.md │ ├── mpuig.md │ ├── mr-bjerre.md │ ├── msklvsk.md │ ├── munozbravo.md │ ├── myavrum.md │ ├── narayanacharya6.md │ ├── neelkamath.md │ ├── nikhilsaldanha.md │ ├── nipunsadvilkar.md │ ├── njsmith.md │ ├── nlptown.md │ ├── nourshalabi.md │ ├── nsorros.md │ ├── ohenrik.md │ ├── onlyanegg.md │ ├── ophelielacroix.md │ ├── oroszgy.md │ ├── osori.md │ ├── ottosulin.md │ ├── oxinabox.md │ ├── ozcankasal.md │ ├── paoloq.md │ ├── pberba.md │ ├── pbnsilva.md │ ├── peter-exos.md │ ├── phiedulxp.md │ ├── philipvollet.md │ ├── phojnacki.md │ ├── pickfire.md │ ├── pinealan.md │ ├── pktippa.md │ ├── plison.md │ ├── pmbaumgartner.md │ ├── polm.md │ ├── prilopes.md │ ├── punitvara.md │ ├── pzelasko.md │ ├── questoph.md │ ├── rafguns.md │ ├── rahul1990gupta.md │ ├── ramananbalakrishnan.md │ ├── rameshhpathak.md │ ├── rasyidf.md │ ├── reneoctavio.md │ ├── retnuh.md │ ├── revuel.md │ ├── richardliaw.md │ ├── richardpaulhudson.md │ ├── robertsipek.md │ ├── rokasramas.md │ ├── roshni-b.md │ ├── ryanzhe.md │ ├── sabiqueqb.md │ ├── sainathadapa.md │ ├── sammous.md │ ├── savkov.md │ ├── seanBE.md │ ├── sebastienharinck.md │ ├── sevdimali.md │ ├── shigapov.md │ ├── shuvanon.md │ ├── skrcode.md │ ├── sloev.md │ ├── snsten.md │ ├── socool.md │ ├── solarmist.md │ ├── sorenlind.md │ ├── suchow.md │ ├── svlandeg.md │ ├── swfarnsworth.md │ ├── syrull.md │ ├── tamuhey.md │ ├── therealronnie.md │ ├── theudas.md │ ├── thomasbird.md │ ├── thomashacker.md │ ├── thomasopsomer.md │ ├── thomasthiebaud.md │ ├── thoppe.md │ ├── tiangolo.md │ ├── tilusnet.md │ ├── tjkemp.md │ ├── tmetzl.md │ ├── tokestermw.md │ ├── tommilligan.md │ ├── trungtv.md │ ├── tupui.md │ ├── tyburam.md │ ├── tzano.md │ ├── ujwal-narayan.md │ ├── umarbutler.md │ ├── ursachec.md │ ├── uwol.md │ ├── veer-bains.md │ ├── vha14.md │ ├── 
vikaskyadav.md │ ├── vishnumenon.md │ ├── vishnupriyavr.md │ ├── vondersam.md │ ├── vsolovyov.md │ ├── w4nderlust.md │ ├── wallinm1.md │ ├── walterhenry.md │ ├── wannaphongcom.md │ ├── werew.md │ ├── willismonroe.md │ ├── willprice.md │ ├── wojtuch.md │ ├── wxv.md │ ├── x-ji.md │ ├── xadrianzetx.md │ ├── xssChauhan.md │ ├── yanaiela.md │ ├── yaph.md │ ├── yashpatadia.md │ ├── yohasebe.md │ ├── yosiasz.md │ ├── yuukos.md │ ├── zaibacu.md │ ├── zhuorulin.md │ ├── zqhZY.md │ └── zqianem.md ├── spacy_universe_alert.py ├── validate_universe_json.py └── workflows │ ├── cibuildwheel.yml │ ├── explosionbot.yml │ ├── gputests.yml.disabled │ ├── issue-manager.yml │ ├── lock.yml │ ├── publish_pypi.yml │ ├── slowtests.yml.disabled │ ├── spacy_universe_alert.yml │ ├── tests.yml │ └── universe_validation.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CITATION.cff ├── CONTRIBUTING.md ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── bin ├── get-package.sh ├── get-version.sh ├── push-tag.sh └── release.sh ├── build-constraints.txt ├── examples ├── README.md └── training │ └── README.md ├── extra ├── DEVELOPER_DOCS │ ├── Code Conventions.md │ ├── ExplosionBot.md │ ├── Language.md │ ├── Listeners.md │ ├── README.md │ ├── Satellite Packages.md │ └── StringStore-Vocab.md └── example_data │ ├── ner_example_data │ ├── README.md │ ├── ner-sent-per-line.iob │ ├── ner-sent-per-line.json │ ├── ner-token-per-line-conll2003.iob │ ├── ner-token-per-line-conll2003.json │ ├── ner-token-per-line-with-pos.iob │ ├── ner-token-per-line-with-pos.json │ ├── ner-token-per-line.iob │ └── ner-token-per-line.json │ ├── textcat_example_data │ ├── CC0.txt │ ├── CC_BY-SA-3.0.txt │ ├── CC_BY-SA-4.0.txt │ ├── README.md │ ├── cooking.json │ ├── cooking.jsonl │ ├── jigsaw-toxic-comment.json │ ├── jigsaw-toxic-comment.jsonl │ └── textcatjsonl_to_trainjson.py │ ├── training-data.json │ └── vocab-data.jsonl ├── licenses └── 3rd_party_licenses.txt ├── netlify.toml ├── pyproject.toml ├── requirements.txt ├── setup.cfg ├── setup.py ├── spacy ├── __init__.pxd ├── __init__.py ├── __main__.py ├── about.py ├── attrs.pxd ├── attrs.pyx ├── cli │ ├── __init__.py │ ├── _util.py │ ├── apply.py │ ├── assemble.py │ ├── benchmark_speed.py │ ├── convert.py │ ├── debug_config.py │ ├── debug_data.py │ ├── debug_diff.py │ ├── debug_model.py │ ├── download.py │ ├── evaluate.py │ ├── find_function.py │ ├── find_threshold.py │ ├── info.py │ ├── init_config.py │ ├── init_pipeline.py │ ├── package.py │ ├── pretrain.py │ ├── profile.py │ ├── project │ │ ├── __init__.py │ │ ├── assets.py │ │ ├── clone.py │ │ ├── document.py │ │ ├── dvc.py │ │ ├── pull.py │ │ ├── push.py │ │ ├── remote_storage.py │ │ └── run.py │ ├── templates │ │ ├── quickstart_training.jinja │ │ └── quickstart_training_recommendations.yml │ ├── train.py │ └── validate.py ├── compat.py ├── default_config.cfg ├── default_config_pretraining.cfg ├── displacy │ ├── __init__.py │ ├── render.py │ └── templates.py ├── errors.py ├── glossary.py ├── kb │ ├── __init__.py │ ├── candidate.pxd │ ├── candidate.pyx │ ├── kb.pxd │ ├── kb.pyx │ ├── kb_in_memory.pxd │ └── kb_in_memory.pyx ├── lang │ ├── __init__.py │ ├── af │ │ ├── __init__.py │ │ └── stop_words.py │ ├── am │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lex_attrs.py │ │ ├── punctuation.py │ │ ├── stop_words.py │ │ └── tokenizer_exceptions.py │ ├── ar │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lex_attrs.py │ │ ├── punctuation.py │ │ ├── stop_words.py │ │ └── tokenizer_exceptions.py │ ├── az │ │ ├── __init__.py │ │ ├── examples.py │ 
│ ├── lex_attrs.py │ │ └── stop_words.py │ ├── bg │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lex_attrs.py │ │ ├── stop_words.py │ │ └── tokenizer_exceptions.py │ ├── bn │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── punctuation.py │ │ ├── stop_words.py │ │ └── tokenizer_exceptions.py │ ├── bo │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lex_attrs.py │ │ └── stop_words.py │ ├── ca │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lemmatizer.py │ │ ├── lex_attrs.py │ │ ├── punctuation.py │ │ ├── stop_words.py │ │ ├── syntax_iterators.py │ │ └── tokenizer_exceptions.py │ ├── char_classes.py │ ├── cs │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lex_attrs.py │ │ └── stop_words.py │ ├── da │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lex_attrs.py │ │ ├── punctuation.py │ │ ├── stop_words.py │ │ ├── syntax_iterators.py │ │ └── tokenizer_exceptions.py │ ├── de │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── punctuation.py │ │ ├── stop_words.py │ │ ├── syntax_iterators.py │ │ └── tokenizer_exceptions.py │ ├── dsb │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lex_attrs.py │ │ └── stop_words.py │ ├── el │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── get_pos_from_wiktionary.py │ │ ├── lemmatizer.py │ │ ├── lex_attrs.py │ │ ├── punctuation.py │ │ ├── stop_words.py │ │ ├── syntax_iterators.py │ │ └── tokenizer_exceptions.py │ ├── en │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lemmatizer.py │ │ ├── lex_attrs.py │ │ ├── punctuation.py │ │ ├── stop_words.py │ │ ├── syntax_iterators.py │ │ └── tokenizer_exceptions.py │ ├── es │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lemmatizer.py │ │ ├── lex_attrs.py │ │ ├── punctuation.py │ │ ├── stop_words.py │ │ ├── syntax_iterators.py │ │ └── tokenizer_exceptions.py │ ├── et │ │ ├── __init__.py │ │ └── stop_words.py │ ├── eu │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lex_attrs.py │ │ ├── punctuation.py │ │ └── stop_words.py │ ├── fa │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── generate_verbs_exc.py │ │ ├── lex_attrs.py │ │ ├── punctuation.py │ │ ├── stop_words.py │ │ ├── syntax_iterators.py │ │ └── tokenizer_exceptions.py │ ├── fi │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lex_attrs.py │ │ ├── punctuation.py │ │ ├── stop_words.py │ │ ├── syntax_iterators.py │ │ └── tokenizer_exceptions.py │ ├── fo │ │ ├── __init__.py │ │ └── tokenizer_exceptions.py │ ├── fr │ │ ├── __init__.py │ │ ├── _tokenizer_exceptions_list.py │ │ ├── examples.py │ │ ├── lemmatizer.py │ │ ├── lex_attrs.py │ │ ├── punctuation.py │ │ ├── stop_words.py │ │ ├── syntax_iterators.py │ │ └── tokenizer_exceptions.py │ ├── ga │ │ ├── __init__.py │ │ ├── lemmatizer.py │ │ ├── stop_words.py │ │ └── tokenizer_exceptions.py │ ├── gd │ │ ├── __init__.py │ │ ├── stop_words.py │ │ └── tokenizer_exceptions.py │ ├── grc │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lex_attrs.py │ │ ├── punctuation.py │ │ ├── stop_words.py │ │ └── tokenizer_exceptions.py │ ├── gu │ │ ├── __init__.py │ │ ├── examples.py │ │ └── stop_words.py │ ├── he │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lex_attrs.py │ │ └── stop_words.py │ ├── hi │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lex_attrs.py │ │ └── stop_words.py │ ├── hr │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lemma_lookup_license.txt │ │ └── stop_words.py │ ├── hsb │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lex_attrs.py │ │ ├── stop_words.py │ │ └── tokenizer_exceptions.py │ ├── ht │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lemmatizer.py │ │ ├── lex_attrs.py │ │ ├── punctuation.py │ │ ├── stop_words.py │ │ ├── 
syntax_iterators.py │ │ ├── tag_map.py │ │ └── tokenizer_exceptions.py │ ├── hu │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── punctuation.py │ │ ├── stop_words.py │ │ └── tokenizer_exceptions.py │ ├── hy │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lex_attrs.py │ │ └── stop_words.py │ ├── id │ │ ├── __init__.py │ │ ├── _tokenizer_exceptions_list.py │ │ ├── examples.py │ │ ├── lex_attrs.py │ │ ├── punctuation.py │ │ ├── stop_words.py │ │ ├── syntax_iterators.py │ │ └── tokenizer_exceptions.py │ ├── is │ │ ├── __init__.py │ │ └── stop_words.py │ ├── it │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lemmatizer.py │ │ ├── punctuation.py │ │ ├── stop_words.py │ │ ├── syntax_iterators.py │ │ └── tokenizer_exceptions.py │ ├── ja │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── stop_words.py │ │ ├── syntax_iterators.py │ │ ├── tag_bigram_map.py │ │ ├── tag_map.py │ │ └── tag_orth_map.py │ ├── kmr │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lex_attrs.py │ │ └── stop_words.py │ ├── kn │ │ ├── __init__.py │ │ ├── examples.py │ │ └── stop_words.py │ ├── ko │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lex_attrs.py │ │ ├── punctuation.py │ │ ├── stop_words.py │ │ └── tag_map.py │ ├── ky │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lex_attrs.py │ │ ├── punctuation.py │ │ ├── stop_words.py │ │ └── tokenizer_exceptions.py │ ├── la │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lex_attrs.py │ │ ├── stop_words.py │ │ ├── syntax_iterators.py │ │ └── tokenizer_exceptions.py │ ├── lb │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lex_attrs.py │ │ ├── punctuation.py │ │ ├── stop_words.py │ │ └── tokenizer_exceptions.py │ ├── lex_attrs.py │ ├── lg │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lex_attrs.py │ │ ├── punctuation.py │ │ └── stop_words.py │ ├── lij │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── punctuation.py │ │ ├── stop_words.py │ │ └── tokenizer_exceptions.py │ ├── lt │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lex_attrs.py │ │ ├── punctuation.py │ │ ├── stop_words.py │ │ └── tokenizer_exceptions.py │ ├── lv │ │ ├── __init__.py │ │ └── stop_words.py │ ├── mk │ │ ├── __init__.py │ │ ├── lemmatizer.py │ │ ├── lex_attrs.py │ │ ├── stop_words.py │ │ └── tokenizer_exceptions.py │ ├── ml │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lex_attrs.py │ │ └── stop_words.py │ ├── mr │ │ ├── __init__.py │ │ └── stop_words.py │ ├── ms │ │ ├── __init__.py │ │ ├── _tokenizer_exceptions_list.py │ │ ├── examples.py │ │ ├── lex_attrs.py │ │ ├── punctuation.py │ │ ├── stop_words.py │ │ ├── syntax_iterators.py │ │ └── tokenizer_exceptions.py │ ├── nb │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── punctuation.py │ │ ├── stop_words.py │ │ ├── syntax_iterators.py │ │ └── tokenizer_exceptions.py │ ├── ne │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lex_attrs.py │ │ └── stop_words.py │ ├── nl │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lemmatizer.py │ │ ├── lex_attrs.py │ │ ├── punctuation.py │ │ ├── stop_words.py │ │ ├── syntax_iterators.py │ │ └── tokenizer_exceptions.py │ ├── nn │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── punctuation.py │ │ └── tokenizer_exceptions.py │ ├── norm_exceptions.py │ ├── pl │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lemmatizer.py │ │ ├── lex_attrs.py │ │ ├── punctuation.py │ │ └── stop_words.py │ ├── pt │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lex_attrs.py │ │ ├── punctuation.py │ │ ├── stop_words.py │ │ ├── syntax_iterators.py │ │ └── tokenizer_exceptions.py │ ├── punctuation.py │ ├── ro │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── 
lex_attrs.py │ │ ├── punctuation.py │ │ ├── stop_words.py │ │ └── tokenizer_exceptions.py │ ├── ru │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lemmatizer.py │ │ ├── lex_attrs.py │ │ ├── stop_words.py │ │ └── tokenizer_exceptions.py │ ├── sa │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lex_attrs.py │ │ └── stop_words.py │ ├── si │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lex_attrs.py │ │ └── stop_words.py │ ├── sk │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lex_attrs.py │ │ └── stop_words.py │ ├── sl │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lex_attrs.py │ │ ├── punctuation.py │ │ ├── stop_words.py │ │ └── tokenizer_exceptions.py │ ├── sq │ │ ├── __init__.py │ │ ├── examples.py │ │ └── stop_words.py │ ├── sr │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lemma_lookup_licence.txt │ │ ├── lex_attrs.py │ │ ├── punctuation.py │ │ ├── stop_words.py │ │ └── tokenizer_exceptions.py │ ├── sv │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lex_attrs.py │ │ ├── punctuation.py │ │ ├── stop_words.py │ │ ├── syntax_iterators.py │ │ └── tokenizer_exceptions.py │ ├── ta │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lex_attrs.py │ │ └── stop_words.py │ ├── te │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lex_attrs.py │ │ └── stop_words.py │ ├── th │ │ ├── __init__.py │ │ ├── lex_attrs.py │ │ ├── stop_words.py │ │ └── tokenizer_exceptions.py │ ├── ti │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lex_attrs.py │ │ ├── punctuation.py │ │ ├── stop_words.py │ │ └── tokenizer_exceptions.py │ ├── tl │ │ ├── __init__.py │ │ ├── lex_attrs.py │ │ ├── stop_words.py │ │ └── tokenizer_exceptions.py │ ├── tn │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lex_attrs.py │ │ ├── punctuation.py │ │ └── stop_words.py │ ├── tokenizer_exceptions.py │ ├── tr │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lex_attrs.py │ │ ├── stop_words.py │ │ ├── syntax_iterators.py │ │ └── tokenizer_exceptions.py │ ├── tt │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lex_attrs.py │ │ ├── punctuation.py │ │ ├── stop_words.py │ │ └── tokenizer_exceptions.py │ ├── uk │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lemmatizer.py │ │ ├── lex_attrs.py │ │ ├── stop_words.py │ │ └── tokenizer_exceptions.py │ ├── ur │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lex_attrs.py │ │ ├── punctuation.py │ │ └── stop_words.py │ ├── vi │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lex_attrs.py │ │ └── stop_words.py │ ├── xx │ │ ├── __init__.py │ │ └── examples.py │ ├── yo │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lex_attrs.py │ │ └── stop_words.py │ └── zh │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lex_attrs.py │ │ └── stop_words.py ├── language.py ├── lexeme.pxd ├── lexeme.pyi ├── lexeme.pyx ├── lookups.py ├── matcher │ ├── __init__.py │ ├── dependencymatcher.pyi │ ├── dependencymatcher.pyx │ ├── levenshtein.pyx │ ├── matcher.pxd │ ├── matcher.pyi │ ├── matcher.pyx │ ├── phrasematcher.pxd │ ├── phrasematcher.pyi │ ├── phrasematcher.pyx │ └── polyleven.c ├── ml │ ├── __init__.py │ ├── _character_embed.py │ ├── _precomputable_affine.py │ ├── callbacks.py │ ├── extract_ngrams.py │ ├── extract_spans.py │ ├── featureextractor.py │ ├── models │ │ ├── __init__.py │ │ ├── entity_linker.py │ │ ├── multi_task.py │ │ ├── parser.py │ │ ├── span_finder.py │ │ ├── spancat.py │ │ ├── tagger.py │ │ ├── textcat.py │ │ └── tok2vec.py │ ├── parser_model.pxd │ ├── parser_model.pyx │ ├── staticvectors.py │ └── tb_framework.py ├── morphology.pxd ├── morphology.pyx ├── parts_of_speech.pxd ├── parts_of_speech.pyx ├── pipe_analysis.py ├── 
pipeline │ ├── __init__.py │ ├── _edit_tree_internals │ │ ├── __init__.py │ │ ├── edit_trees.pxd │ │ ├── edit_trees.pyx │ │ └── schemas.py │ ├── _parser_internals │ │ ├── __init__.pxd │ │ ├── __init__.py │ │ ├── _beam_utils.pxd │ │ ├── _beam_utils.pyx │ │ ├── _state.pxd │ │ ├── _state.pyx │ │ ├── arc_eager.pxd │ │ ├── arc_eager.pyx │ │ ├── ner.pxd │ │ ├── ner.pyx │ │ ├── nonproj.hh │ │ ├── nonproj.pxd │ │ ├── nonproj.pyx │ │ ├── stateclass.pxd │ │ ├── stateclass.pyx │ │ ├── transition_system.pxd │ │ └── transition_system.pyx │ ├── attributeruler.py │ ├── dep_parser.pyx │ ├── edit_tree_lemmatizer.py │ ├── entity_linker.py │ ├── entityruler.py │ ├── factories.py │ ├── functions.py │ ├── legacy │ │ ├── __init__.py │ │ └── entity_linker.py │ ├── lemmatizer.py │ ├── morphologizer.pyx │ ├── multitask.pyx │ ├── ner.pyx │ ├── pipe.pxd │ ├── pipe.pyi │ ├── pipe.pyx │ ├── sentencizer.pyx │ ├── senter.pyx │ ├── span_finder.py │ ├── span_ruler.py │ ├── spancat.py │ ├── tagger.pyx │ ├── textcat.py │ ├── textcat_multilabel.py │ ├── tok2vec.py │ ├── trainable_pipe.pxd │ ├── trainable_pipe.pyx │ ├── transition_parser.pxd │ └── transition_parser.pyx ├── py.typed ├── registrations.py ├── schemas.py ├── scorer.py ├── strings.pxd ├── strings.pyi ├── strings.pyx ├── structs.pxd ├── symbols.pxd ├── symbols.pyx ├── tests │ ├── README.md │ ├── __init__.py │ ├── conftest.py │ ├── doc │ │ ├── __init__.py │ │ ├── test_add_entities.py │ │ ├── test_array.py │ │ ├── test_creation.py │ │ ├── test_doc_api.py │ │ ├── test_graph.py │ │ ├── test_json_doc_conversion.py │ │ ├── test_morphanalysis.py │ │ ├── test_pickle_doc.py │ │ ├── test_retokenize_merge.py │ │ ├── test_retokenize_split.py │ │ ├── test_span.py │ │ ├── test_span_group.py │ │ ├── test_token_api.py │ │ └── test_underscore.py │ ├── enable_gpu.py │ ├── factory_registrations.json │ ├── lang │ │ ├── __init__.py │ │ ├── af │ │ │ ├── __init__.py │ │ │ ├── test_text.py │ │ │ └── test_tokenizer.py │ │ ├── am │ │ │ ├── __init__.py │ │ │ ├── test_exception.py │ │ │ └── test_text.py │ │ ├── ar │ │ │ ├── __init__.py │ │ │ ├── test_exceptions.py │ │ │ └── test_text.py │ │ ├── bg │ │ │ ├── test_text.py │ │ │ └── test_tokenizer.py │ │ ├── bn │ │ │ ├── __init__.py │ │ │ └── test_tokenizer.py │ │ ├── bo │ │ │ ├── __init__.py │ │ │ └── test_text.py │ │ ├── ca │ │ │ ├── __init__.py │ │ │ ├── test_exception.py │ │ │ ├── test_prefix_suffix_infix.py │ │ │ └── test_text.py │ │ ├── cs │ │ │ ├── __init__.py │ │ │ └── test_text.py │ │ ├── da │ │ │ ├── __init__.py │ │ │ ├── test_exceptions.py │ │ │ ├── test_noun_chunks.py │ │ │ ├── test_prefix_suffix_infix.py │ │ │ └── test_text.py │ │ ├── de │ │ │ ├── __init__.py │ │ │ ├── test_exceptions.py │ │ │ ├── test_noun_chunks.py │ │ │ ├── test_parser.py │ │ │ ├── test_prefix_suffix_infix.py │ │ │ └── test_text.py │ │ ├── dsb │ │ │ ├── __init__.py │ │ │ ├── test_text.py │ │ │ └── test_tokenizer.py │ │ ├── el │ │ │ ├── __init__.py │ │ │ ├── test_exception.py │ │ │ ├── test_noun_chunks.py │ │ │ └── test_text.py │ │ ├── en │ │ │ ├── __init__.py │ │ │ ├── test_customized_tokenizer.py │ │ │ ├── test_exceptions.py │ │ │ ├── test_indices.py │ │ │ ├── test_noun_chunks.py │ │ │ ├── test_parser.py │ │ │ ├── test_prefix_suffix_infix.py │ │ │ ├── test_punct.py │ │ │ ├── test_sbd.py │ │ │ ├── test_text.py │ │ │ └── test_tokenizer.py │ │ ├── es │ │ │ ├── __init__.py │ │ │ ├── test_exception.py │ │ │ ├── test_noun_chunks.py │ │ │ └── test_text.py │ │ ├── et │ │ │ ├── __init__.py │ │ │ ├── test_text.py │ │ │ └── test_tokenizer.py │ │ ├── eu │ │ │ ├── 
__init__.py │ │ │ └── test_text.py │ │ ├── fa │ │ │ ├── __init__.py │ │ │ └── test_noun_chunks.py │ │ ├── fi │ │ │ ├── __init__.py │ │ │ ├── test_noun_chunks.py │ │ │ ├── test_text.py │ │ │ └── test_tokenizer.py │ │ ├── fo │ │ │ ├── __init__.py │ │ │ └── test_tokenizer.py │ │ ├── fr │ │ │ ├── __init__.py │ │ │ ├── test_exceptions.py │ │ │ ├── test_noun_chunks.py │ │ │ ├── test_prefix_suffix_infix.py │ │ │ └── test_text.py │ │ ├── ga │ │ │ ├── __init__.py │ │ │ └── test_tokenizer.py │ │ ├── grc │ │ │ ├── __init__.py │ │ │ ├── test_text.py │ │ │ └── test_tokenizer.py │ │ ├── gu │ │ │ ├── __init__.py │ │ │ └── test_text.py │ │ ├── he │ │ │ ├── __init__.py │ │ │ └── test_tokenizer.py │ │ ├── hi │ │ │ ├── __init__.py │ │ │ ├── test_lex_attrs.py │ │ │ └── test_text.py │ │ ├── hr │ │ │ ├── __init__.py │ │ │ ├── test_text.py │ │ │ └── test_tokenizer.py │ │ ├── hsb │ │ │ ├── __init__.py │ │ │ ├── test_text.py │ │ │ └── test_tokenizer.py │ │ ├── ht │ │ │ ├── __init__.py │ │ │ ├── test_exceptions.py │ │ │ ├── test_noun_chunks.py │ │ │ ├── test_prefix_suffix_infix.py │ │ │ └── test_text.py │ │ ├── hu │ │ │ ├── __init__.py │ │ │ └── test_tokenizer.py │ │ ├── hy │ │ │ ├── __init__.py │ │ │ ├── test_text.py │ │ │ └── test_tokenizer.py │ │ ├── id │ │ │ ├── __init__.py │ │ │ ├── test_noun_chunks.py │ │ │ ├── test_prefix_suffix_infix.py │ │ │ └── test_text.py │ │ ├── is │ │ │ ├── __init__.py │ │ │ ├── test_text.py │ │ │ └── test_tokenizer.py │ │ ├── it │ │ │ ├── __init__.py │ │ │ ├── test_noun_chunks.py │ │ │ ├── test_prefix_suffix_infix.py │ │ │ ├── test_stopwords.py │ │ │ └── test_text.py │ │ ├── ja │ │ │ ├── __init__.py │ │ │ ├── test_lemmatization.py │ │ │ ├── test_morphologizer_factory.py │ │ │ ├── test_serialize.py │ │ │ └── test_tokenizer.py │ │ ├── kmr │ │ │ ├── __init__.py │ │ │ └── test_text.py │ │ ├── ko │ │ │ ├── __init__.py │ │ │ ├── test_lemmatization.py │ │ │ ├── test_serialize.py │ │ │ └── test_tokenizer.py │ │ ├── ky │ │ │ ├── __init__.py │ │ │ └── test_tokenizer.py │ │ ├── la │ │ │ ├── __init__.py │ │ │ ├── test_exception.py │ │ │ ├── test_noun_chunks.py │ │ │ └── test_text.py │ │ ├── lb │ │ │ ├── __init__.py │ │ │ ├── test_exceptions.py │ │ │ ├── test_prefix_suffix_infix.py │ │ │ └── test_text.py │ │ ├── lg │ │ │ ├── __init__.py │ │ │ └── test_tokenizer.py │ │ ├── lt │ │ │ ├── __init__.py │ │ │ └── test_text.py │ │ ├── lv │ │ │ ├── __init__.py │ │ │ ├── test_text.py │ │ │ └── test_tokenizer.py │ │ ├── mk │ │ │ ├── __init__.py │ │ │ └── test_text.py │ │ ├── ml │ │ │ ├── __init__.py │ │ │ └── test_text.py │ │ ├── ms │ │ │ ├── __init__.py │ │ │ ├── test_noun_chunks.py │ │ │ ├── test_prefix_suffix_infix.py │ │ │ └── test_text.py │ │ ├── nb │ │ │ ├── __init__.py │ │ │ ├── test_noun_chunks.py │ │ │ └── test_tokenizer.py │ │ ├── ne │ │ │ ├── __init__.py │ │ │ └── test_text.py │ │ ├── nl │ │ │ ├── __init__.py │ │ │ ├── test_noun_chunks.py │ │ │ └── test_text.py │ │ ├── nn │ │ │ ├── __init__.py │ │ │ └── test_tokenizer.py │ │ ├── pl │ │ │ ├── __init__.py │ │ │ ├── test_text.py │ │ │ └── test_tokenizer.py │ │ ├── pt │ │ │ ├── __init__.py │ │ │ ├── test_noun_chunks.py │ │ │ └── test_text.py │ │ ├── ro │ │ │ ├── __init__.py │ │ │ └── test_tokenizer.py │ │ ├── ru │ │ │ ├── __init__.py │ │ │ ├── test_exceptions.py │ │ │ ├── test_lemmatizer.py │ │ │ ├── test_text.py │ │ │ └── test_tokenizer.py │ │ ├── sa │ │ │ ├── __init__.py │ │ │ └── test_text.py │ │ ├── sk │ │ │ ├── __init__.py │ │ │ ├── test_text.py │ │ │ └── test_tokenizer.py │ │ ├── sl │ │ │ ├── __init__.py │ │ │ ├── test_text.py │ │ │ └── 
test_tokenizer.py │ │ ├── sq │ │ │ ├── __init__.py │ │ │ ├── test_text.py │ │ │ └── test_tokenizer.py │ │ ├── sr │ │ │ ├── __init__.py │ │ │ ├── test_exceptions.py │ │ │ └── test_tokenizer.py │ │ ├── sv │ │ │ ├── __init__.py │ │ │ ├── test_exceptions.py │ │ │ ├── test_lex_attrs.py │ │ │ ├── test_noun_chunks.py │ │ │ ├── test_prefix_suffix_infix.py │ │ │ ├── test_text.py │ │ │ └── test_tokenizer.py │ │ ├── ta │ │ │ ├── __init__.py │ │ │ ├── test_text.py │ │ │ └── test_tokenizer.py │ │ ├── test_attrs.py │ │ ├── test_initialize.py │ │ ├── test_lemmatizers.py │ │ ├── th │ │ │ ├── __init__.py │ │ │ ├── test_serialize.py │ │ │ └── test_tokenizer.py │ │ ├── ti │ │ │ ├── __init__.py │ │ │ ├── test_exception.py │ │ │ └── test_text.py │ │ ├── tl │ │ │ ├── __init__.py │ │ │ ├── test_indices.py │ │ │ ├── test_punct.py │ │ │ └── test_text.py │ │ ├── tr │ │ │ ├── __init__.py │ │ │ ├── test_noun_chunks.py │ │ │ ├── test_parser.py │ │ │ ├── test_text.py │ │ │ └── test_tokenizer.py │ │ ├── tt │ │ │ ├── __init__.py │ │ │ └── test_tokenizer.py │ │ ├── uk │ │ │ ├── __init__.py │ │ │ ├── test_lemmatizer.py │ │ │ ├── test_tokenizer.py │ │ │ └── test_tokenizer_exc.py │ │ ├── ur │ │ │ ├── __init__.py │ │ │ ├── test_prefix_suffix_infix.py │ │ │ └── test_text.py │ │ ├── vi │ │ │ ├── __init__.py │ │ │ ├── test_serialize.py │ │ │ └── test_tokenizer.py │ │ ├── xx │ │ │ ├── __init__.py │ │ │ ├── test_text.py │ │ │ └── test_tokenizer.py │ │ ├── yo │ │ │ ├── __init__.py │ │ │ └── test_text.py │ │ └── zh │ │ │ ├── __init__.py │ │ │ ├── test_serialize.py │ │ │ ├── test_text.py │ │ │ └── test_tokenizer.py │ ├── matcher │ │ ├── __init__.py │ │ ├── test_dependency_matcher.py │ │ ├── test_levenshtein.py │ │ ├── test_matcher_api.py │ │ ├── test_matcher_logic.py │ │ ├── test_pattern_validation.py │ │ └── test_phrase_matcher.py │ ├── morphology │ │ ├── __init__.py │ │ ├── test_morph_converters.py │ │ ├── test_morph_features.py │ │ └── test_morph_pickle.py │ ├── package │ │ ├── __init__.py │ │ └── test_requirements.py │ ├── parser │ │ ├── __init__.py │ │ ├── test_add_label.py │ │ ├── test_arc_eager_oracle.py │ │ ├── test_ner.py │ │ ├── test_neural_parser.py │ │ ├── test_nn_beam.py │ │ ├── test_nonproj.py │ │ ├── test_parse.py │ │ ├── test_parse_navigate.py │ │ ├── test_preset_sbd.py │ │ ├── test_space_attachment.py │ │ └── test_state.py │ ├── pipeline │ │ ├── __init__.py │ │ ├── test_analysis.py │ │ ├── test_annotates_on_update.py │ │ ├── test_attributeruler.py │ │ ├── test_edit_tree_lemmatizer.py │ │ ├── test_entity_linker.py │ │ ├── test_entity_ruler.py │ │ ├── test_functions.py │ │ ├── test_initialize.py │ │ ├── test_lemmatizer.py │ │ ├── test_models.py │ │ ├── test_morphologizer.py │ │ ├── test_pipe_factories.py │ │ ├── test_pipe_methods.py │ │ ├── test_sentencizer.py │ │ ├── test_senter.py │ │ ├── test_span_finder.py │ │ ├── test_span_ruler.py │ │ ├── test_spancat.py │ │ ├── test_tagger.py │ │ ├── test_textcat.py │ │ └── test_tok2vec.py │ ├── registry_contents.json │ ├── serialize │ │ ├── __init__.py │ │ ├── test_resource_warning.py │ │ ├── test_serialize_config.py │ │ ├── test_serialize_doc.py │ │ ├── test_serialize_docbin.py │ │ ├── test_serialize_extension_attrs.py │ │ ├── test_serialize_kb.py │ │ ├── test_serialize_language.py │ │ ├── test_serialize_pipeline.py │ │ ├── test_serialize_span_groups.py │ │ ├── test_serialize_tokenizer.py │ │ └── test_serialize_vocab_strings.py │ ├── test_architectures.py │ ├── test_cli.py │ ├── test_cli_app.py │ ├── test_displacy.py │ ├── test_errors.py │ ├── test_factory_imports.py │ ├── 
test_factory_registrations.py │ ├── test_language.py │ ├── test_misc.py │ ├── test_models.py │ ├── test_pickles.py │ ├── test_registry_population.py │ ├── test_scorer.py │ ├── test_ty.py │ ├── tok2vec.py │ ├── tokenizer │ │ ├── __init__.py │ │ ├── sun.txt │ │ ├── test_exceptions.py │ │ ├── test_explain.py │ │ ├── test_naughty_strings.py │ │ ├── test_tokenizer.py │ │ ├── test_urls.py │ │ └── test_whitespace.py │ ├── training │ │ ├── __init__.py │ │ ├── test_augmenters.py │ │ ├── test_corpus.py │ │ ├── test_logger.py │ │ ├── test_new_example.py │ │ ├── test_pretraining.py │ │ ├── test_readers.py │ │ ├── test_rehearse.py │ │ └── test_training.py │ ├── util.py │ └── vocab_vectors │ │ ├── __init__.py │ │ ├── test_lexeme.py │ │ ├── test_lookups.py │ │ ├── test_memory_zone.py │ │ ├── test_similarity.py │ │ ├── test_stringstore.py │ │ ├── test_vectors.py │ │ └── test_vocab_api.py ├── tokenizer.pxd ├── tokenizer.pyx ├── tokens │ ├── __init__.pxd │ ├── __init__.py │ ├── _dict_proxies.py │ ├── _retokenize.pyi │ ├── _retokenize.pyx │ ├── _serialize.py │ ├── doc.pxd │ ├── doc.pyi │ ├── doc.pyx │ ├── graph.pxd │ ├── graph.pyx │ ├── morphanalysis.pxd │ ├── morphanalysis.pyi │ ├── morphanalysis.pyx │ ├── span.pxd │ ├── span.pyi │ ├── span.pyx │ ├── span_group.pxd │ ├── span_group.pyi │ ├── span_group.pyx │ ├── token.pxd │ ├── token.pyi │ ├── token.pyx │ └── underscore.py ├── training │ ├── __init__.pxd │ ├── __init__.py │ ├── align.pyx │ ├── alignment.py │ ├── alignment_array.pxd │ ├── alignment_array.pyx │ ├── augment.py │ ├── batchers.py │ ├── callbacks.py │ ├── converters │ │ ├── __init__.py │ │ ├── conll_ner_to_docs.py │ │ ├── conllu_to_docs.py │ │ ├── iob_to_docs.py │ │ └── json_to_docs.py │ ├── corpus.py │ ├── example.pxd │ ├── example.pyi │ ├── example.pyx │ ├── gold_io.pyx │ ├── initialize.py │ ├── iob_utils.py │ ├── loggers.py │ ├── loop.py │ └── pretrain.py ├── ty.py ├── typedefs.pxd ├── typedefs.pyx ├── util.py ├── vectors.pyx ├── vocab.pxd ├── vocab.pyi └── vocab.pyx └── website ├── .dockerignore ├── .eslintrc ├── .eslintrc.json ├── .gitignore ├── .nvmrc ├── .prettierignore ├── .prettierrc ├── .vscode └── extensions.json ├── Dockerfile ├── README.md ├── UNIVERSE.md ├── docs ├── api │ ├── architectures.mdx │ ├── attributeruler.mdx │ ├── attributes.mdx │ ├── basevectors.mdx │ ├── cli.mdx │ ├── coref.mdx │ ├── corpus.mdx │ ├── curatedtransformer.mdx │ ├── cython-classes.mdx │ ├── cython-structs.mdx │ ├── cython.mdx │ ├── data-formats.mdx │ ├── dependencymatcher.mdx │ ├── dependencyparser.mdx │ ├── doc.mdx │ ├── docbin.mdx │ ├── edittreelemmatizer.mdx │ ├── entitylinker.mdx │ ├── entityrecognizer.mdx │ ├── entityruler.mdx │ ├── example.mdx │ ├── index.mdx │ ├── inmemorylookupkb.mdx │ ├── kb.mdx │ ├── language.mdx │ ├── large-language-models.mdx │ ├── legacy.mdx │ ├── lemmatizer.mdx │ ├── lexeme.mdx │ ├── lookups.mdx │ ├── matcher.mdx │ ├── morphologizer.mdx │ ├── morphology.mdx │ ├── phrasematcher.mdx │ ├── pipe.mdx │ ├── pipeline-functions.mdx │ ├── scorer.mdx │ ├── sentencerecognizer.mdx │ ├── sentencizer.mdx │ ├── span-resolver.mdx │ ├── span.mdx │ ├── spancategorizer.mdx │ ├── spanfinder.mdx │ ├── spangroup.mdx │ ├── spanruler.mdx │ ├── stringstore.mdx │ ├── tagger.mdx │ ├── textcategorizer.mdx │ ├── tok2vec.mdx │ ├── token.mdx │ ├── tokenizer.mdx │ ├── top-level.mdx │ ├── transformer.mdx │ ├── vectors.mdx │ └── vocab.mdx ├── images │ └── displacy-long2.html ├── models │ └── index.mdx ├── styleguide.mdx └── usage │ ├── 101 │ ├── _architecture.mdx │ ├── _language-data.mdx │ ├── 
_named-entities.mdx │ ├── _pipelines.mdx │ ├── _pos-deps.mdx │ ├── _serialization.mdx │ ├── _tokenization.mdx │ ├── _training.mdx │ └── _vectors-similarity.mdx │ ├── _benchmarks-models.mdx │ ├── embeddings-transformers.mdx │ ├── facts-figures.mdx │ ├── index.mdx │ ├── large-language-models.mdx │ ├── layers-architectures.mdx │ ├── linguistic-features.mdx │ ├── memory-management.mdx │ ├── models.mdx │ ├── processing-pipelines.mdx │ ├── projects.mdx │ ├── rule-based-matching.mdx │ ├── saving-loading.mdx │ ├── spacy-101.mdx │ ├── training.mdx │ ├── v2-1.mdx │ ├── v2-2.mdx │ ├── v2-3.mdx │ ├── v2.mdx │ ├── v3-1.mdx │ ├── v3-2.mdx │ ├── v3-3.mdx │ ├── v3-4.mdx │ ├── v3-5.mdx │ ├── v3-6.mdx │ ├── v3-7.mdx │ ├── v3.mdx │ └── visualizers.mdx ├── meta ├── dynamicMeta.mjs ├── languageSorted.tsx ├── languages.json ├── recordLanguages.tsx ├── recordSections.tsx ├── recordUniverse.tsx ├── sidebarFlat.tsx ├── sidebars.json ├── site.json ├── type-annotations.json └── universe.json ├── netlify.toml ├── next-sitemap.config.mjs ├── next.config.mjs ├── package-lock.json ├── package.json ├── pages ├── 404.js ├── [...listPathPage].tsx ├── _app.tsx ├── _document.tsx ├── index.tsx ├── models │ └── [slug].tsx └── universe │ ├── category │ └── [slug].tsx │ ├── index.tsx │ └── project │ └── [slug].tsx ├── plugins ├── getProps.mjs ├── index.mjs ├── remarkCodeBlocks.mjs ├── remarkCustomAttrs.mjs ├── remarkFindAndReplace.mjs └── remarkWrapSections.mjs ├── public ├── favicon.ico ├── icons │ ├── icon-192x192.png │ ├── icon-256x256.png │ ├── icon-384x384.png │ └── icon-512x512.png ├── images │ ├── architecture.svg │ ├── cli_init_fill-config_diff.jpg │ ├── course.jpg │ ├── dep-match-diagram.svg │ ├── displacy-compact.svg │ ├── displacy-custom-parser.svg │ ├── displacy-dep-founded.svg │ ├── displacy-long.svg │ ├── displacy-long2.svg │ ├── displacy-model-rules.svg │ ├── displacy-model-rules2.svg │ ├── displacy-small.svg │ ├── displacy.svg │ ├── displacy_jupyter.jpg │ ├── huggingface_hub.jpg │ ├── lifecycle.svg │ ├── matcher-demo.jpg │ ├── pipeline-design.svg │ ├── pipeline.svg │ ├── pipeline_transformer.svg │ ├── prodigy.jpg │ ├── prodigy_overview.jpg │ ├── prodigy_spans-manual.jpg │ ├── prodigy_train_curve.jpg │ ├── project_document.jpg │ ├── projects.png │ ├── projects.svg │ ├── sense2vec.jpg │ ├── spacy-extension-demo.gif │ ├── spacy-ray.svg │ ├── spacy-streamlit.png │ ├── spacy-tailored-pipelines_wide.png │ ├── thinc_mypy.jpg │ ├── tok2vec-listener.svg │ ├── tok2vec.svg │ ├── tokenization.svg │ ├── trainable_component.svg │ ├── training.svg │ ├── vocab_stringstore.svg │ ├── wandb1.jpg │ └── wandb2.jpg ├── manifest.webmanifest └── vercel.svg ├── runtime.txt ├── setup ├── jinja_to_js.py ├── requirements.txt └── setup.sh ├── src ├── components │ ├── accordion.js │ ├── alert.js │ ├── aside.js │ ├── button.js │ ├── card.js │ ├── code.js │ ├── codeBlock.js │ ├── codeDynamic.js │ ├── copy.js │ ├── dropdown.js │ ├── embed.js │ ├── footer.js │ ├── github.js │ ├── grid.js │ ├── htmlToReact.js │ ├── icon.js │ ├── infobox.js │ ├── inlineCode.js │ ├── juniper.js │ ├── landing.js │ ├── link.js │ ├── list.js │ ├── main.js │ ├── markdownToReact.js │ ├── markdownToReactDynamic.js │ ├── navigation.js │ ├── newsletter.js │ ├── progress.js │ ├── quickstart.js │ ├── readnext.js │ ├── search.js │ ├── section.js │ ├── seo.js │ ├── sidebar.js │ ├── table.js │ ├── tag.js │ ├── title.js │ ├── typeAnnotation.js │ ├── typography.js │ └── util.js ├── fonts │ ├── hkgrotesk-bold.woff │ ├── hkgrotesk-bold.woff2 │ ├── hkgrotesk-bolditalic.woff │ ├── 
hkgrotesk-bolditalic.woff2 │ ├── hkgrotesk-semibold.woff │ ├── hkgrotesk-semibold.woff2 │ ├── hkgrotesk-semibolditalic.woff │ ├── hkgrotesk-semibolditalic.woff2 │ ├── jetbrainsmono-italic.woff │ ├── jetbrainsmono-italic.woff2 │ ├── jetbrainsmono-regular.woff │ └── jetbrainsmono-regular.woff2 ├── images │ ├── explosion.svg │ ├── icon.png │ ├── icon_legacy.png │ ├── icon_nightly.png │ ├── icons │ │ ├── accept.svg │ │ ├── arrow-right.svg │ │ ├── clipboard.svg │ │ ├── code.svg │ │ ├── docs.svg │ │ ├── download.svg │ │ ├── github.svg │ │ ├── help-outline.svg │ │ ├── help.svg │ │ ├── info.svg │ │ ├── moon.svg │ │ ├── network.svg │ │ ├── neutral.svg │ │ ├── no.svg │ │ ├── offline.svg │ │ ├── package.svg │ │ ├── reject.svg │ │ ├── search.svg │ │ ├── twitter.svg │ │ ├── warning.svg │ │ ├── website.svg │ │ └── yes.svg │ ├── logo.svg │ ├── logos │ │ ├── dvc.svg │ │ ├── fastapi.svg │ │ ├── huggingface_hub.svg │ │ ├── prodigy.svg │ │ ├── ray.svg │ │ ├── streamlit.svg │ │ └── wandb.svg │ ├── pattern_blue.png │ ├── pattern_green.png │ ├── pattern_landing.png │ ├── pattern_landing_legacy.png │ ├── pattern_landing_nightly.png │ ├── pattern_legacy.png │ ├── pattern_nightly.png │ ├── pattern_purple.png │ ├── social_api.jpg │ ├── social_default.jpg │ ├── social_legacy.jpg │ ├── social_nightly.jpg │ ├── social_universe.jpg │ └── spacy-irl.jpg ├── remark.js ├── styles │ ├── accordion.module.sass │ ├── alert.module.sass │ ├── aside.module.sass │ ├── base.sass │ ├── button.module.sass │ ├── card.module.sass │ ├── code.module.sass │ ├── copy.module.sass │ ├── dropdown.module.sass │ ├── embed.module.sass │ ├── footer.module.sass │ ├── grid.module.sass │ ├── icon.module.sass │ ├── infobox.module.sass │ ├── landing.module.sass │ ├── layout.sass │ ├── link.module.sass │ ├── list.module.sass │ ├── main.module.sass │ ├── navigation.module.sass │ ├── newsletter.module.sass │ ├── progress.module.sass │ ├── quickstart.module.sass │ ├── readnext.module.sass │ ├── search.sass │ ├── section.module.sass │ ├── sidebar.module.sass │ ├── table.module.sass │ ├── tag.module.sass │ ├── title.module.sass │ └── typography.module.sass ├── templates │ ├── docs.js │ ├── index.js │ ├── models.js │ └── universe.js └── widgets │ ├── changelog.js │ ├── features.js │ ├── integration.js │ ├── languages.js │ ├── project.js │ ├── quickstart-install.js │ ├── quickstart-models.js │ ├── quickstart-training.js │ └── styleguide.js └── tsconfig.json /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | custom: [https://explosion.ai/merch, https://explosion.ai/tailored-solutions] 2 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/01_bugs.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: "\U0001F6A8 Submit a Bug Report" 3 | about: Use this template if you came across a bug or unexpected behaviour differing from the docs. 4 | 5 | --- 6 | 7 | 8 | 9 | ## How to reproduce the behaviour 10 | 11 | 12 | ## Your Environment 13 | 14 | * Operating System: 15 | * Python Version Used: 16 | * spaCy Version Used: 17 | * Environment Information: 18 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/02_docs.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: "\U0001F4DA Submit a Documentation Report" 3 | about: Did you spot a mistake in the docs, is anything unclear or do you have a 4 | suggestion? 
5 | 6 | --- 7 | 8 | 9 | ## Which page or section is this issue related to? 10 | 11 | -------------------------------------------------------------------------------- /.github/contributors/Bri-Will.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/.github/contributors/Bri-Will.md -------------------------------------------------------------------------------- /.github/contributors/Schibsted.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/.github/contributors/Schibsted.png -------------------------------------------------------------------------------- /.github/contributors/melanuria.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/.github/contributors/melanuria.pdf -------------------------------------------------------------------------------- /.github/contributors/svlandeg.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/.github/contributors/svlandeg.md -------------------------------------------------------------------------------- /.github/validate_universe_json.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | import sys 4 | from pathlib import Path 5 | 6 | 7 | def validate_json(document): 8 | universe_file = Path(document) 9 | with universe_file.open() as f: 10 | universe_data = json.load(f) 11 | for entry in universe_data["resources"]: 12 | if "github" in entry: 13 | assert not re.match( 14 | r"^(http:)|^(https:)", entry["github"] 15 | ), "Github field should be user/repo, not a url" 16 | 17 | 18 | if __name__ == "__main__": 19 | validate_json(str(sys.argv[1])) 20 | -------------------------------------------------------------------------------- /.github/workflows/gputests.yml.disabled: -------------------------------------------------------------------------------- 1 | name: Weekly GPU tests 2 | 3 | on: 4 | schedule: 5 | - cron: '0 1 * * MON' 6 | 7 | jobs: 8 | weekly-gputests: 9 | strategy: 10 | fail-fast: false 11 | matrix: 12 | branch: [master, v4] 13 | if: github.repository_owner == 'explosion' 14 | runs-on: ubuntu-latest 15 | steps: 16 | - name: Trigger buildkite build 17 | uses: buildkite/trigger-pipeline-action@v1.2.0 18 | env: 19 | PIPELINE: explosion-ai/spacy-slow-gpu-tests 20 | BRANCH: ${{ matrix.branch }} 21 | MESSAGE: ":github: Weekly GPU + slow tests - triggered from a GitHub Action" 22 | BUILDKITE_API_ACCESS_TOKEN: ${{ secrets.BUILDKITE_SECRET }} 23 | -------------------------------------------------------------------------------- /.github/workflows/lock.yml: -------------------------------------------------------------------------------- 1 | name: 'Lock Threads' 2 | 3 | on: 4 | schedule: 5 | - cron: '0 0 * * *' # check every day 6 | workflow_dispatch: 7 | 8 | permissions: 9 | issues: write 10 | 11 | concurrency: 12 | group: lock 13 | 14 | jobs: 15 | action: 16 | if: github.repository_owner == 'explosion' 17 | runs-on: ubuntu-latest 18 | steps: 19 | - uses: dessant/lock-threads@v5 20 | with: 21 | process-only: 'issues' 22 | issue-inactive-days: '30' 23 | issue-comment: > 24 | This thread has been 
automatically locked since there 25 | has not been any recent activity after it was closed. 26 | Please open a new issue for related bugs. 27 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/ambv/black 3 | rev: 22.3.0 4 | hooks: 5 | - id: black 6 | language_version: python3.7 7 | additional_dependencies: ['click==8.0.4'] 8 | - repo: https://github.com/pycqa/flake8 9 | rev: 5.0.4 10 | hooks: 11 | - id: flake8 12 | args: 13 | - "--config=setup.cfg" 14 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | preferred-citation: 3 | type: article 4 | message: "If you use spaCy, please cite it as below." 5 | authors: 6 | - family-names: "Honnibal" 7 | given-names: "Matthew" 8 | - family-names: "Montani" 9 | given-names: "Ines" 10 | - family-names: "Van Landeghem" 11 | given-names: "Sofie" 12 | - family-names: "Boyd" 13 | given-names: "Adriane" 14 | title: "spaCy: Industrial-strength Natural Language Processing in Python" 15 | doi: "10.5281/zenodo.1212303" 16 | year: 2020 17 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include spacy *.pyi *.pyx *.pxd *.txt *.cfg *.jinja *.toml *.hh 2 | include LICENSE 3 | include README.md 4 | include pyproject.toml 5 | include spacy/py.typed 6 | recursive-include spacy/cli *.yml 7 | recursive-include spacy/tests *.json 8 | recursive-include licenses * 9 | recursive-exclude spacy *.cpp 10 | -------------------------------------------------------------------------------- /bin/get-package.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | version=$(grep "__title__ = " spacy/about.py) 6 | version=${version/__title__ = } 7 | version=${version/\'/} 8 | version=${version/\'/} 9 | version=${version/\"/} 10 | version=${version/\"/} 11 | 12 | echo $version 13 | -------------------------------------------------------------------------------- /bin/get-version.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | version=$(grep "__version__ = " spacy/about.py) 6 | version=${version/__version__ = } 7 | version=${version/\'/} 8 | version=${version/\'/} 9 | version=${version/\"/} 10 | version=${version/\"/} 11 | 12 | echo $version 13 | -------------------------------------------------------------------------------- /bin/push-tag.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | # Insist repository is clean 6 | git diff-index --quiet HEAD 7 | 8 | git checkout $1 9 | git pull origin $1 10 | git push origin $1 11 | 12 | version=$(grep "__version__ = " spacy/about.py) 13 | version=${version/__version__ = } 14 | version=${version/\'/} 15 | version=${version/\'/} 16 | version=${version/\"/} 17 | version=${version/\"/} 18 | git tag "v$version" 19 | git push origin "v$version" 20 | -------------------------------------------------------------------------------- /bin/release.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env 
bash 2 | 3 | set -e 4 | 5 | # Insist repository is clean 6 | git diff-index --quiet HEAD 7 | 8 | version=$(grep "__version__ = " spacy/about.py) 9 | version=${version/__version__ = } 10 | version=${version/\'/} 11 | version=${version/\'/} 12 | version=${version/\"/} 13 | version=${version/\"/} 14 | 15 | echo "Pushing release-v"$version 16 | 17 | git tag -d release-v$version || true 18 | git push origin :release-v$version || true 19 | git tag release-v$version 20 | git push origin release-v$version 21 | -------------------------------------------------------------------------------- /build-constraints.txt: -------------------------------------------------------------------------------- 1 | # build version constraints for use with wheelwright 2 | numpy>=2.0.0,<3.0.0 3 | -------------------------------------------------------------------------------- /examples/training/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # spaCy examples 4 | 5 | See [examples/README.md](../README.md) 6 | -------------------------------------------------------------------------------- /extra/example_data/ner_example_data/README.md: -------------------------------------------------------------------------------- 1 | ## Examples of NER/IOB data that can be converted with `spacy convert` 2 | 3 | To convert an IOB file to `.spacy` ([`DocBin`](https://spacy.io/api/docbin)) 4 | for spaCy v3: 5 | 6 | ```bash 7 | python -m spacy convert -c iob -s -n 10 -b en_core_web_sm file.iob . 8 | ``` 9 | 10 | See all the `spacy convert` options: https://spacy.io/api/cli#convert 11 | 12 | --- 13 | 14 | The spaCy v2 JSON training files were generated using **spaCy v2** with: 15 | 16 | ```bash 17 | python -m spacy convert -c iob -s -n 10 -b en file.iob 18 | ``` 19 | 20 | To convert an existing JSON training file to `.spacy` for spaCy v3, convert 21 | with **spaCy v3**: 22 | 23 | ```bash 24 | python -m spacy convert file.json . 
25 | ``` 26 | -------------------------------------------------------------------------------- /extra/example_data/ner_example_data/ner-sent-per-line.iob: -------------------------------------------------------------------------------- 1 | When|WRB|O Sebastian|NNP|B-PERSON Thrun|NNP|I-PERSON started|VBD|O working|VBG|O on|IN|O self|NN|O -|HYPH|O driving|VBG|O cars|NNS|O at|IN|O Google|NNP|B-ORG in|IN|O 2007|CD|B-DATE ,|,|O few|JJ|O people|NNS|O outside|RB|O of|IN|O the|DT|O company|NN|O took|VBD|O him|PRP|O seriously|RB|O .|.|O 2 | “|''|O I|PRP|O can|MD|O tell|VB|O you|PRP|O very|RB|O senior|JJ|O CEOs|NNS|O of|IN|O major|JJ|O American|JJ|B-NORP car|NN|O companies|NNS|O would|MD|O shake|VB|O my|PRP$|O hand|NN|O and|CC|O turn|VB|O away|RB|O because|IN|O I|PRP|O was|VBD|O n’t|RB|O worth|JJ|O talking|VBG|O to|IN|O ,|,|O ”|''|O said|VBD|O Thrun|NNP|B-PERSON ,|,|O in|IN|O an|DT|O interview|NN|O with|IN|O Recode|NNP|B-ORG earlier|RBR|B-DATE this|DT|I-DATE week|NN|I-DATE .|.|O 3 | -------------------------------------------------------------------------------- /spacy/__init__.pxd: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/__init__.pxd -------------------------------------------------------------------------------- /spacy/__main__.py: -------------------------------------------------------------------------------- 1 | if __name__ == "__main__": 2 | from spacy.cli import setup_cli 3 | 4 | setup_cli() 5 | -------------------------------------------------------------------------------- /spacy/about.py: -------------------------------------------------------------------------------- 1 | # fmt: off 2 | __title__ = "spacy" 3 | __version__ = "3.8.7" 4 | __download_url__ = "https://github.com/explosion/spacy-models/releases/download" 5 | __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" 6 | -------------------------------------------------------------------------------- /spacy/cli/project/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/cli/project/__init__.py -------------------------------------------------------------------------------- /spacy/cli/project/assets.py: -------------------------------------------------------------------------------- 1 | from weasel.cli.assets import * 2 | -------------------------------------------------------------------------------- /spacy/cli/project/clone.py: -------------------------------------------------------------------------------- 1 | from weasel.cli.clone import * 2 | -------------------------------------------------------------------------------- /spacy/cli/project/document.py: -------------------------------------------------------------------------------- 1 | from weasel.cli.document import * 2 | -------------------------------------------------------------------------------- /spacy/cli/project/dvc.py: -------------------------------------------------------------------------------- 1 | from weasel.cli.dvc import * 2 | -------------------------------------------------------------------------------- /spacy/cli/project/pull.py: -------------------------------------------------------------------------------- 1 | from weasel.cli.pull import * 2 | -------------------------------------------------------------------------------- 
/spacy/cli/project/push.py: -------------------------------------------------------------------------------- 1 | from weasel.cli.push import * 2 | -------------------------------------------------------------------------------- /spacy/cli/project/remote_storage.py: -------------------------------------------------------------------------------- 1 | from weasel.cli.remote_storage import * 2 | -------------------------------------------------------------------------------- /spacy/cli/project/run.py: -------------------------------------------------------------------------------- 1 | from weasel.cli.run import * 2 | -------------------------------------------------------------------------------- /spacy/kb/__init__.py: -------------------------------------------------------------------------------- 1 | from .candidate import Candidate, get_candidates, get_candidates_batch 2 | from .kb import KnowledgeBase 3 | from .kb_in_memory import InMemoryLookupKB 4 | 5 | __all__ = [ 6 | "Candidate", 7 | "KnowledgeBase", 8 | "InMemoryLookupKB", 9 | "get_candidates", 10 | "get_candidates_batch", 11 | ] 12 | -------------------------------------------------------------------------------- /spacy/kb/candidate.pxd: -------------------------------------------------------------------------------- 1 | from libcpp.vector cimport vector 2 | 3 | from ..typedefs cimport hash_t 4 | from .kb cimport KnowledgeBase 5 | 6 | 7 | # Object used by the Entity Linker that summarizes one entity-alias candidate 8 | # combination. 9 | cdef class Candidate: 10 | cdef readonly KnowledgeBase kb 11 | cdef hash_t entity_hash 12 | cdef float entity_freq 13 | cdef vector[float] entity_vector 14 | cdef hash_t alias_hash 15 | cdef float prior_prob 16 | -------------------------------------------------------------------------------- /spacy/kb/kb.pxd: -------------------------------------------------------------------------------- 1 | """Knowledge-base for entity or concept linking.""" 2 | 3 | from cymem.cymem cimport Pool 4 | from libc.stdint cimport int64_t 5 | 6 | from ..vocab cimport Vocab 7 | 8 | 9 | cdef class KnowledgeBase: 10 | cdef Pool mem 11 | cdef readonly Vocab vocab 12 | cdef readonly int64_t entity_vector_length 13 | -------------------------------------------------------------------------------- /spacy/lang/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/lang/__init__.py -------------------------------------------------------------------------------- /spacy/lang/af/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .stop_words import STOP_WORDS 3 | 4 | 5 | class AfrikaansDefaults(BaseDefaults): 6 | stop_words = STOP_WORDS 7 | 8 | 9 | class Afrikaans(Language): 10 | lang = "af" 11 | Defaults = AfrikaansDefaults 12 | 13 | 14 | __all__ = ["Afrikaans"] 15 | -------------------------------------------------------------------------------- /spacy/lang/af/stop_words.py: -------------------------------------------------------------------------------- 1 | # Source: https://github.com/stopwords-iso/stopwords-af 2 | 3 | STOP_WORDS = set( 4 | """ 5 | 'n 6 | aan 7 | af 8 | al 9 | as 10 | baie 11 | by 12 | daar 13 | dag 14 | dat 15 | die 16 | dit 17 | een 18 | ek 19 | en 20 | gaan 21 | gesê 22 | haar 23 | het 24 | hom 25 | hulle 26 | hy 27 | in 28 | is 29 | jou 30 | jy 31 | kan 32 | kom 33 | ma 34 | 
maar 35 | met 36 | my 37 | na 38 | nie 39 | om 40 | ons 41 | op 42 | saam 43 | sal 44 | se 45 | sien 46 | so 47 | sy 48 | te 49 | toe 50 | uit 51 | van 52 | vir 53 | was 54 | wat 55 | ʼn 56 | """.split() 57 | ) 58 | -------------------------------------------------------------------------------- /spacy/lang/am/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 3 | 4 | >>> from spacy.lang.am.examples import sentences 5 | >>> docs = nlp.pipe(sentences) 6 | """ 7 | 8 | 9 | sentences = [ 10 | "አፕል የዩኬን ጅምር ድርጅት በ 1 ቢሊዮን ዶላር ለመግዛት አስቧል።", 11 | "የራስ ገዝ መኪኖች የኢንሹራንስ ኃላፊነትን ወደ አምራቾች ያዛውራሉ", 12 | "ሳን ፍራንሲስኮ የእግረኛ መንገድ አቅርቦት ሮቦቶችን ማገድን ይመለከታል", 13 | "ለንደን በእንግሊዝ የምትገኝ ትልቅ ከተማ ናት።", 14 | "የት ነህ?", 15 | "የፈረንሳይ ፕሬዝዳንት ማናቸው?", 16 | "የአሜሪካ ዋና ከተማ ምንድነው?", 17 | "ባራክ ኦባማ መቼ ተወለደ?", 18 | ] 19 | -------------------------------------------------------------------------------- /spacy/lang/am/punctuation.py: -------------------------------------------------------------------------------- 1 | from ..char_classes import ( 2 | ALPHA_UPPER, 3 | CURRENCY, 4 | LIST_ELLIPSES, 5 | LIST_PUNCT, 6 | LIST_QUOTES, 7 | UNITS, 8 | ) 9 | 10 | _list_punct = LIST_PUNCT + "፡ ። ፣ ፤ ፥ ፦ ፧ ፠ ፨".strip().split() 11 | 12 | _suffixes = ( 13 | _list_punct 14 | + LIST_ELLIPSES 15 | + LIST_QUOTES 16 | + [ 17 | r"(?<=[0-9])\+", 18 | # Amharic is written from Left-To-Right 19 | r"(?<=[0-9])(?:{c})".format(c=CURRENCY), 20 | r"(?<=[0-9])(?:{u})".format(u=UNITS), 21 | r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER), 22 | ] 23 | ) 24 | 25 | TOKENIZER_SUFFIXES = _suffixes 26 | -------------------------------------------------------------------------------- /spacy/lang/am/tokenizer_exceptions.py: -------------------------------------------------------------------------------- 1 | from ...symbols import NORM, ORTH 2 | 3 | _exc = {} 4 | 5 | 6 | for exc_data in [ 7 | {ORTH: "ት/ቤት"}, 8 | {ORTH: "ወ/ሮ", NORM: "ወይዘሮ"}, 9 | ]: 10 | _exc[exc_data[ORTH]] = [exc_data] 11 | 12 | 13 | for orth in [ 14 | "ዓ.ም.", 15 | "ኪ.ሜ.", 16 | ]: 17 | _exc[orth] = [{ORTH: orth}] 18 | 19 | 20 | TOKENIZER_EXCEPTIONS = _exc 21 | -------------------------------------------------------------------------------- /spacy/lang/ar/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .lex_attrs import LEX_ATTRS 3 | from .punctuation import TOKENIZER_SUFFIXES 4 | from .stop_words import STOP_WORDS 5 | from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS 6 | 7 | 8 | class ArabicDefaults(BaseDefaults): 9 | tokenizer_exceptions = TOKENIZER_EXCEPTIONS 10 | suffixes = TOKENIZER_SUFFIXES 11 | stop_words = STOP_WORDS 12 | lex_attr_getters = LEX_ATTRS 13 | writing_system = {"direction": "rtl", "has_case": False, "has_letters": True} 14 | 15 | 16 | class Arabic(Language): 17 | Defaults = ArabicDefaults 18 | lang = "ar" 19 | 20 | 21 | __all__ = ["Arabic"] 22 | -------------------------------------------------------------------------------- /spacy/lang/ar/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 
3 | 4 | >>> from spacy.lang.ar.examples import sentences 5 | >>> docs = nlp.pipe(sentences) 6 | """ 7 | 8 | sentences = [ 9 | "نال الكاتب خالد توفيق جائزة الرواية العربية في معرض الشارقة الدولي للكتاب", 10 | "أين تقع دمشق ؟", 11 | "كيف حالك ؟", 12 | "هل يمكن ان نلتقي على الساعة الثانية عشرة ظهرا ؟", 13 | "ماهي أبرز التطورات السياسية، الأمنية والاجتماعية في العالم ؟", 14 | "هل بالإمكان أن نلتقي غدا؟", 15 | "هناك نحو 382 مليون شخص مصاب بداء السكَّري في العالم", 16 | "كشفت دراسة حديثة أن الخيل تقرأ تعبيرات الوجه وتستطيع أن تتذكر مشاعر الناس وعواطفهم", 17 | ] 18 | -------------------------------------------------------------------------------- /spacy/lang/ar/punctuation.py: -------------------------------------------------------------------------------- 1 | from ..char_classes import ( 2 | ALPHA_UPPER, 3 | CURRENCY, 4 | LIST_ELLIPSES, 5 | LIST_PUNCT, 6 | LIST_QUOTES, 7 | UNITS, 8 | ) 9 | 10 | _suffixes = ( 11 | LIST_PUNCT 12 | + LIST_ELLIPSES 13 | + LIST_QUOTES 14 | + [ 15 | r"(?<=[0-9])\+", 16 | # Arabic is written from Right-To-Left 17 | r"(?<=[0-9])(?:{c})".format(c=CURRENCY), 18 | r"(?<=[0-9])(?:{u})".format(u=UNITS), 19 | r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER), 20 | ] 21 | ) 22 | 23 | TOKENIZER_SUFFIXES = _suffixes 24 | -------------------------------------------------------------------------------- /spacy/lang/az/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .lex_attrs import LEX_ATTRS 3 | from .stop_words import STOP_WORDS 4 | 5 | 6 | class AzerbaijaniDefaults(BaseDefaults): 7 | lex_attr_getters = LEX_ATTRS 8 | stop_words = STOP_WORDS 9 | 10 | 11 | class Azerbaijani(Language): 12 | lang = "az" 13 | Defaults = AzerbaijaniDefaults 14 | 15 | 16 | __all__ = ["Azerbaijani"] 17 | -------------------------------------------------------------------------------- /spacy/lang/az/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 3 | >>> from spacy.lang.az.examples import sentences 4 | >>> docs = nlp.pipe(sentences) 5 | """ 6 | 7 | 8 | sentences = [ 9 | "Bu bir cümlədir.", 10 | "Necəsən?", 11 | "Qarabağ ordeni vətən müharibəsində qələbə münasibəti ilə təsis edilmişdir.", 12 | "Məktəbimizə Bakıdan bir tarix müəllimi gəlmişdi.", 13 | "Atılan növbəti mərmilər lap yaxınlıqda partladı.", 14 | "Sinqapur koronavirus baxımından ən təhlükəsiz ölkələr sırasındadır.", 15 | "Marsda ilk sınaq uçuşu həyata keçirilib.", 16 | "SSRİ dağılandan bəri 5 sahil dövləti Xəzərin statusunu müəyyən edə bilməyiblər.", 17 | "Videoda beyninə xüsusi çip yerləşdirilmiş meymun əks olunub.", 18 | ] 19 | -------------------------------------------------------------------------------- /spacy/lang/bg/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 3 | 4 | >>> from spacy.lang.bg.examples import sentences 5 | >>> docs = nlp.pipe(sentences) 6 | """ 7 | 8 | sentences = [ 9 | "Епъл иска да купи английски стартъп за 1 милиард долара.", 10 | "Автономните коли прехвърлят застрахователната отговорност към производителите.", 11 | "Сан Франциско обмисля забрана на роботи доставящи по тротоари.", 12 | "Лондон е голям град в Обединеното Кралство.", 
13 | ] 14 | -------------------------------------------------------------------------------- /spacy/lang/bn/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 3 | 4 | >>> from spacy.lang.bn.examples import sentences 5 | >>> docs = nlp.pipe(sentences) 6 | """ 7 | 8 | 9 | sentences = ["তুই খুব ভালো", "আজ আমরা ডাক্তার দেখতে যাবো", "আমি জানি না "] 10 | -------------------------------------------------------------------------------- /spacy/lang/bo/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .lex_attrs import LEX_ATTRS 3 | from .stop_words import STOP_WORDS 4 | 5 | 6 | class TibetanDefaults(BaseDefaults): 7 | lex_attr_getters = LEX_ATTRS 8 | stop_words = STOP_WORDS 9 | 10 | 11 | class Tibetan(Language): 12 | lang = "bo" 13 | Defaults = TibetanDefaults 14 | 15 | 16 | __all__ = ["Tibetan"] 17 | -------------------------------------------------------------------------------- /spacy/lang/bo/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 3 | 4 | >>> from spacy.lang.bo.examples import sentences 5 | >>> docs = nlp.pipe(sentences) 6 | """ 7 | 8 | 9 | sentences = [ 10 | "དོན་དུ་རྒྱ་མཚོ་བླ་མ་ཞེས་བྱ་ཞིང༌།", 11 | "ཏཱ་ལའི་ཞེས་པ་ནི་སོག་སྐད་ཡིན་པ་དེ་བོད་སྐད་དུ་རྒྱ་མཚོའི་དོན་དུ་འཇུག", 12 | "སོག་པོ་ཨལ་ཐན་རྒྱལ་པོས་རྒྱལ་དབང་བསོད་ནམས་རྒྱ་མཚོར་ཆེ་བསྟོད་ཀྱི་མཚན་གསོལ་བ་ཞིག་ཡིན་ཞིང༌།", 13 | "རྗེས་སུ་རྒྱལ་བ་དགེ་འདུན་གྲུབ་དང༌། དགེ་འདུན་རྒྱ་མཚོ་སོ་སོར་ཡང་ཏཱ་ལའི་བླ་མའི་སྐུ་ཕྲེང་དང་པོ་དང༌།", 14 | "གཉིས་པའི་མཚན་དེ་གསོལ་ཞིང༌།༸རྒྱལ་དབང་སྐུ་ཕྲེང་ལྔ་པས་དགའ་ལྡན་ཕོ་བྲང་གི་སྲིད་དབང་བཙུགས་པ་ནས་ཏཱ་ལའི་བླ་མ་ནི་བོད་ཀྱི་ཆོས་སྲིད་གཉིས་ཀྱི་དབུ་ཁྲིད་དུ་གྱུར་ཞིང་།", 15 | "ད་ལྟའི་བར་ཏཱ་ལའི་བླ་མ་སྐུ་ཕྲེང་བཅུ་བཞི་བྱོན་ཡོད།", 16 | ] 17 | -------------------------------------------------------------------------------- /spacy/lang/ca/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 
3 | 4 | >>> from spacy.lang.ca.examples import sentences 5 | >>> docs = nlp.pipe(sentences) 6 | """ 7 | 8 | 9 | sentences = [ 10 | "Apple està buscant comprar una startup del Regne Unit per mil milions de dòlars", 11 | "Els cotxes autònoms deleguen la responsabilitat de l'assegurança als seus fabricants", 12 | "San Francisco analitza prohibir els robots de repartiment", 13 | "Londres és una gran ciutat del Regne Unit", 14 | "El gat menja peix", 15 | "Veig a l'home amb el telescopi", 16 | "L'aranya menja mosques", 17 | "El pingüí incuba en el seu niu", 18 | ] 19 | -------------------------------------------------------------------------------- /spacy/lang/cs/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .lex_attrs import LEX_ATTRS 3 | from .stop_words import STOP_WORDS 4 | 5 | 6 | class CzechDefaults(BaseDefaults): 7 | lex_attr_getters = LEX_ATTRS 8 | stop_words = STOP_WORDS 9 | 10 | 11 | class Czech(Language): 12 | lang = "cs" 13 | Defaults = CzechDefaults 14 | 15 | 16 | __all__ = ["Czech"] 17 | -------------------------------------------------------------------------------- /spacy/lang/da/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .lex_attrs import LEX_ATTRS 3 | from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES 4 | from .stop_words import STOP_WORDS 5 | from .syntax_iterators import SYNTAX_ITERATORS 6 | from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS 7 | 8 | 9 | class DanishDefaults(BaseDefaults): 10 | tokenizer_exceptions = TOKENIZER_EXCEPTIONS 11 | infixes = TOKENIZER_INFIXES 12 | suffixes = TOKENIZER_SUFFIXES 13 | lex_attr_getters = LEX_ATTRS 14 | stop_words = STOP_WORDS 15 | syntax_iterators = SYNTAX_ITERATORS 16 | 17 | 18 | class Danish(Language): 19 | lang = "da" 20 | Defaults = DanishDefaults 21 | 22 | 23 | __all__ = ["Danish"] 24 | -------------------------------------------------------------------------------- /spacy/lang/da/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 
3 | 4 | >>> from spacy.lang.da.examples import sentences 5 | >>> docs = nlp.pipe(sentences) 6 | """ 7 | 8 | sentences = [ 9 | "Apple overvejer at købe et britisk startup for 1 milliard dollar.", 10 | "Selvkørende biler flytter forsikringsansvaret over på producenterne.", 11 | "San Francisco overvejer at forbyde udbringningsrobotter på fortovet.", 12 | "London er en storby i Storbritannien.", 13 | "Hvor er du?", 14 | "Hvem er Frankrigs præsident?", 15 | "Hvad er hovedstaden i USA?", 16 | "Hvornår blev Barack Obama født?", 17 | ] 18 | -------------------------------------------------------------------------------- /spacy/lang/de/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES 3 | from .stop_words import STOP_WORDS 4 | from .syntax_iterators import SYNTAX_ITERATORS 5 | from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS 6 | 7 | 8 | class GermanDefaults(BaseDefaults): 9 | tokenizer_exceptions = TOKENIZER_EXCEPTIONS 10 | prefixes = TOKENIZER_PREFIXES 11 | suffixes = TOKENIZER_SUFFIXES 12 | infixes = TOKENIZER_INFIXES 13 | syntax_iterators = SYNTAX_ITERATORS 14 | stop_words = STOP_WORDS 15 | 16 | 17 | class German(Language): 18 | lang = "de" 19 | Defaults = GermanDefaults 20 | 21 | 22 | __all__ = ["German"] 23 | -------------------------------------------------------------------------------- /spacy/lang/de/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 3 | 4 | >>> from spacy.lang.de.examples import sentences 5 | >>> docs = nlp.pipe(sentences) 6 | """ 7 | 8 | 9 | sentences = [ 10 | "Die ganze Stadt ist ein Startup: Shenzhen ist das Silicon Valley für Hardware-Firmen", 11 | "Wie deutsche Startups die Technologie vorantreiben wollen: Künstliche Intelligenz", 12 | "Trend zum Urlaub in Deutschland beschert Gastwirten mehr Umsatz", 13 | "Bundesanwaltschaft erhebt Anklage gegen mutmaßlichen Schweizer Spion", 14 | "San Francisco erwägt Verbot von Lieferrobotern", 15 | "Autonome Fahrzeuge verlagern Haftpflicht auf Hersteller", 16 | "Wo bist du?", 17 | "Was ist die Hauptstadt von Deutschland?", 18 | ] 19 | -------------------------------------------------------------------------------- /spacy/lang/dsb/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .lex_attrs import LEX_ATTRS 3 | from .stop_words import STOP_WORDS 4 | 5 | 6 | class LowerSorbianDefaults(BaseDefaults): 7 | lex_attr_getters = LEX_ATTRS 8 | stop_words = STOP_WORDS 9 | 10 | 11 | class LowerSorbian(Language): 12 | lang = "dsb" 13 | Defaults = LowerSorbianDefaults 14 | 15 | 16 | __all__ = ["LowerSorbian"] 17 | -------------------------------------------------------------------------------- /spacy/lang/dsb/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 
3 | 4 | >>> from spacy.lang.dsb.examples import sentences 5 | >>> docs = nlp.pipe(sentences) 6 | """ 7 | 8 | 9 | sentences = [ 10 | "Z tym stwori so wuměnjenje a zakład za dalše wobdźěłanje přez analyzu tekstoweje struktury a semantisku anotaciju a z tym tež za tu předstajenu digitalnu online-wersiju.", 11 | "Mi so tu jara derje spodoba.", 12 | "Kotre nowniny chceće měć?", 13 | "Tak ako w slědnem lěśe jo teke lětosa jano doma zapustowaś móžno.", 14 | "Zwóstanjo pótakem hyšći wjele źěła.", 15 | ] 16 | -------------------------------------------------------------------------------- /spacy/lang/dsb/stop_words.py: -------------------------------------------------------------------------------- 1 | STOP_WORDS = set( 2 | """ 3 | a abo aby ako ale až 4 | 5 | daniž dokulaž 6 | 7 | gaž 8 | 9 | jolic 10 | 11 | pak pótom 12 | 13 | teke togodla 14 | """.split() 15 | ) 16 | -------------------------------------------------------------------------------- /spacy/lang/en/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 3 | 4 | >>> from spacy.lang.en.examples import sentences 5 | >>> docs = nlp.pipe(sentences) 6 | """ 7 | 8 | 9 | sentences = [ 10 | "Apple is looking at buying U.K. startup for $1 billion", 11 | "Autonomous cars shift insurance liability toward manufacturers", 12 | "San Francisco considers banning sidewalk delivery robots", 13 | "London is a big city in the United Kingdom.", 14 | "Where are you?", 15 | "Who is the president of France?", 16 | "What is the capital of the United States?", 17 | "When was Barack Obama born?", 18 | ] 19 | -------------------------------------------------------------------------------- /spacy/lang/en/punctuation.py: -------------------------------------------------------------------------------- 1 | from ..char_classes import ( 2 | ALPHA, 3 | ALPHA_LOWER, 4 | ALPHA_UPPER, 5 | CONCAT_QUOTES, 6 | HYPHENS, 7 | LIST_ELLIPSES, 8 | LIST_ICONS, 9 | ) 10 | 11 | _infixes = ( 12 | LIST_ELLIPSES 13 | + LIST_ICONS 14 | + [ 15 | r"(?<=[0-9])[+\-\*^](?=[0-9-])", 16 | r"(?<=[{al}{q}])\.(?=[{au}{q}])".format( 17 | al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES 18 | ), 19 | r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA), 20 | r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS), 21 | r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA), 22 | ] 23 | ) 24 | 25 | 26 | TOKENIZER_INFIXES = _infixes 27 | -------------------------------------------------------------------------------- /spacy/lang/et/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .stop_words import STOP_WORDS 3 | 4 | 5 | class EstonianDefaults(BaseDefaults): 6 | stop_words = STOP_WORDS 7 | 8 | 9 | class Estonian(Language): 10 | lang = "et" 11 | Defaults = EstonianDefaults 12 | 13 | 14 | __all__ = ["Estonian"] 15 | -------------------------------------------------------------------------------- /spacy/lang/et/stop_words.py: -------------------------------------------------------------------------------- 1 | # Source: https://github.com/stopwords-iso/stopwords-et 2 | 3 | STOP_WORDS = set( 4 | """ 5 | aga 6 | ei 7 | et 8 | ja 9 | jah 10 | kas 11 | kui 12 | kõik 13 | ma 14 | me 15 | mida 16 | midagi 17 | mind 18 | minu 19 | mis 20 | mu 21 | mul 22 | mulle 23 | nad 24 | nii 25 | oled 26 | olen 27 | oli 28 | oma 29 | on 30 | pole 31 | sa 32 | seda 33 | see 34 | selle 35 | siin 36 | siis 37 | ta 38 | te 39 | ära 40 
| """.split() 41 | ) 42 | -------------------------------------------------------------------------------- /spacy/lang/eu/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .lex_attrs import LEX_ATTRS 3 | from .punctuation import TOKENIZER_SUFFIXES 4 | from .stop_words import STOP_WORDS 5 | 6 | 7 | class BasqueDefaults(BaseDefaults): 8 | suffixes = TOKENIZER_SUFFIXES 9 | stop_words = STOP_WORDS 10 | lex_attr_getters = LEX_ATTRS 11 | 12 | 13 | class Basque(Language): 14 | lang = "eu" 15 | Defaults = BasqueDefaults 16 | 17 | 18 | __all__ = ["Basque"] 19 | -------------------------------------------------------------------------------- /spacy/lang/eu/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 3 | 4 | >>> from spacy.lang.eu.examples import sentences 5 | >>> docs = nlp.pipe(sentences) 6 | """ 7 | 8 | sentences = [ 9 | "bilbon ko castinga egin da eta nik jakin ez zuetako inork egin al du edota parte hartu duen ezagunik ba al du", 10 | "gaur telebistan entzunda denok martetik gatoz hortaz martzianoak gara beno nire ustez batzuk beste batzuk baino martzianoagoak dira", 11 | ] 12 | -------------------------------------------------------------------------------- /spacy/lang/eu/punctuation.py: -------------------------------------------------------------------------------- 1 | from ..punctuation import TOKENIZER_SUFFIXES 2 | 3 | _suffixes = TOKENIZER_SUFFIXES 4 | -------------------------------------------------------------------------------- /spacy/lang/fa/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 
3 | 4 | >>> from spacy.lang.fa.examples import sentences 5 | >>> docs = nlp.pipe(sentences) 6 | """ 7 | 8 | 9 | sentences = [ 10 | "این یک جمله نمونه می باشد.", 11 | "قرار ما، امروز ساعت ۲:۳۰ بعدازظهر هست!", 12 | "دیروز علی به من ۲۰۰۰.۱﷼ پول نقد داد.", 13 | "چطور می‌توان از تهران به کاشان رفت؟", 14 | "حدود ۸۰٪ هوا از نیتروژن تشکیل شده است.", 15 | ] 16 | -------------------------------------------------------------------------------- /spacy/lang/fa/punctuation.py: -------------------------------------------------------------------------------- 1 | from ..char_classes import ( 2 | ALPHA_UPPER, 3 | CURRENCY, 4 | LIST_ELLIPSES, 5 | LIST_PUNCT, 6 | LIST_QUOTES, 7 | UNITS, 8 | ) 9 | 10 | _suffixes = ( 11 | LIST_PUNCT 12 | + LIST_ELLIPSES 13 | + LIST_QUOTES 14 | + [ 15 | r"(?<=[0-9])\+", 16 | r"(?<=[0-9])%", # 4% -> ["4", "%"] 17 | # Persian is written from Right-To-Left 18 | r"(?<=[0-9])(?:{c})".format(c=CURRENCY), 19 | r"(?<=[0-9])(?:{u})".format(u=UNITS), 20 | r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER), 21 | ] 22 | ) 23 | 24 | TOKENIZER_SUFFIXES = _suffixes 25 | -------------------------------------------------------------------------------- /spacy/lang/fi/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .lex_attrs import LEX_ATTRS 3 | from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES 4 | from .stop_words import STOP_WORDS 5 | from .syntax_iterators import SYNTAX_ITERATORS 6 | from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS 7 | 8 | 9 | class FinnishDefaults(BaseDefaults): 10 | infixes = TOKENIZER_INFIXES 11 | suffixes = TOKENIZER_SUFFIXES 12 | tokenizer_exceptions = TOKENIZER_EXCEPTIONS 13 | lex_attr_getters = LEX_ATTRS 14 | stop_words = STOP_WORDS 15 | syntax_iterators = SYNTAX_ITERATORS 16 | 17 | 18 | class Finnish(Language): 19 | lang = "fi" 20 | Defaults = FinnishDefaults 21 | 22 | 23 | __all__ = ["Finnish"] 24 | -------------------------------------------------------------------------------- /spacy/lang/fi/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 
3 | >>> from spacy.lang.fi.examples import sentences 4 | >>> docs = nlp.pipe(sentences) 5 | """ 6 | 7 | sentences = [ 8 | "Itseajavat autot siirtävät vakuutusvastuun autojen valmistajille", 9 | "San Francisco harkitsee toimitusrobottien liikkumisen kieltämistä jalkakäytävillä", 10 | "Lontoo on suuri kaupunki Yhdistyneessä Kuningaskunnassa.", 11 | "Missä sinä olet?", 12 | "Mikä on Yhdysvaltojen pääkaupunki?", 13 | "Kuka on Suomen presidentti?", 14 | "Milloin Sauli Niinistö on syntynyt?", 15 | ] 16 | -------------------------------------------------------------------------------- /spacy/lang/fo/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from ..punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES 3 | from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS 4 | 5 | 6 | class FaroeseDefaults(BaseDefaults): 7 | tokenizer_exceptions = TOKENIZER_EXCEPTIONS 8 | infixes = TOKENIZER_INFIXES 9 | suffixes = TOKENIZER_SUFFIXES 10 | prefixes = TOKENIZER_PREFIXES 11 | 12 | 13 | class Faroese(Language): 14 | lang = "fo" 15 | Defaults = FaroeseDefaults 16 | 17 | 18 | __all__ = ["Faroese"] 19 | -------------------------------------------------------------------------------- /spacy/lang/ga/stop_words.py: -------------------------------------------------------------------------------- 1 | STOP_WORDS = set( 2 | """ 3 | a ach ag agus an aon ar arna as 4 | 5 | ba beirt bhúr 6 | 7 | caoga ceathair ceathrar chomh chuig chun cois céad cúig cúigear 8 | 9 | daichead dar de deich deichniúr den dhá do don dtí dá dár dó 10 | 11 | faoi faoin faoina faoinár fara fiche 12 | 13 | gach gan go gur 14 | 15 | haon hocht 16 | 17 | i iad idir in ina ins inár is 18 | 19 | le leis lena lenár 20 | 21 | mar mo muid mé 22 | 23 | na nach naoi naonúr ná ní níor nó nócha 24 | 25 | ocht ochtar ochtó os 26 | 27 | roimh 28 | 29 | sa seacht seachtar seachtó seasca seisear siad sibh sinn sna sé sí 30 | 31 | tar thar thú triúr trí trína trínár tríocha tú 32 | 33 | um 34 | 35 | ár 36 | 37 | é éis 38 | 39 | í 40 | 41 | ó ón óna ónár 42 | """.split() 43 | ) 44 | -------------------------------------------------------------------------------- /spacy/lang/gd/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from ...language import BaseDefaults, Language 4 | from .stop_words import STOP_WORDS 5 | from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS 6 | 7 | 8 | class ScottishDefaults(BaseDefaults): 9 | tokenizer_exceptions = TOKENIZER_EXCEPTIONS 10 | stop_words = STOP_WORDS 11 | 12 | 13 | class Scottish(Language): 14 | lang = "gd" 15 | Defaults = ScottishDefaults 16 | 17 | 18 | __all__ = ["Scottish"] 19 | -------------------------------------------------------------------------------- /spacy/lang/grc/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .lex_attrs import LEX_ATTRS 3 | from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES 4 | from .stop_words import STOP_WORDS 5 | from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS 6 | 7 | 8 | class AncientGreekDefaults(BaseDefaults): 9 | tokenizer_exceptions = TOKENIZER_EXCEPTIONS 10 | prefixes = TOKENIZER_PREFIXES 11 | suffixes = TOKENIZER_SUFFIXES 12 | infixes = TOKENIZER_INFIXES 13 | lex_attr_getters = LEX_ATTRS 14 | stop_words = STOP_WORDS 15 | 16 | 17 | 
class AncientGreek(Language): 18 | lang = "grc" 19 | Defaults = AncientGreekDefaults 20 | 21 | 22 | __all__ = ["AncientGreek"] 23 | -------------------------------------------------------------------------------- /spacy/lang/grc/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 3 | 4 | >>> from spacy.lang.grc.examples import sentences 5 | >>> docs = nlp.pipe(sentences) 6 | """ 7 | 8 | 9 | sentences = [ 10 | "ἐρᾷ μὲν ἁγνὸς οὐρανὸς τρῶσαι χθόνα, ἔρως δὲ γαῖαν λαμβάνει γάμου τυχεῖν·", 11 | "εὐδαίμων Χαρίτων καὶ Μελάνιππος ἔφυ, θείας ἁγητῆρες ἐφαμερίοις φιλότατος.", 12 | "ὃ μὲν δὴ ἀπόστολος ἐς τὴν Μίλητον ἦν.", 13 | "Θρασύβουλος δὲ σαφέως προπεπυσμένος πάντα λόγον καὶ εἰδὼς τὰ Ἀλυάττης μέλλοι ποιήσειν μηχανᾶται τοιάδε.", 14 | "φιλόπαις δ' ἦν ἐκμανῶς καὶ Ἀλέξανδρος ὁ βασιλεύς.", 15 | "Ἀντίγονος ὁ βασιλεὺς ἐπεκώμαζε τῷ Ζήνωνι", 16 | "αὐτὰρ ὃ δεύτατος ἦλθεν ἄναξ ἀνδρῶν Ἀγαμέμνων ἕλκος ἔχων", 17 | ] 18 | -------------------------------------------------------------------------------- /spacy/lang/gu/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .stop_words import STOP_WORDS 3 | 4 | 5 | class GujaratiDefaults(BaseDefaults): 6 | stop_words = STOP_WORDS 7 | 8 | 9 | class Gujarati(Language): 10 | lang = "gu" 11 | Defaults = GujaratiDefaults 12 | 13 | 14 | __all__ = ["Gujarati"] 15 | -------------------------------------------------------------------------------- /spacy/lang/gu/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 3 | 4 | >>> from spacy.lang.gu.examples import sentences 5 | >>> docs = nlp.pipe(sentences) 6 | """ 7 | 8 | 9 | sentences = [ 10 | "લોકશાહી એ સરકારનું એક એવું તંત્ર છે જ્યાં નાગરિકો મત દ્વારા સત્તાનો ઉપયોગ કરે છે.", 11 | "તે ગુજરાત રાજ્યના ધરમપુર શહેરમાં આવેલું હતું", 12 | "કર્ણદેવ પહેલો સોલંકી વંશનો રાજા હતો", 13 | "તેજપાળને બે પત્ની હતી", 14 | "ગુજરાતમાં ભારતીય જનતા પક્ષનો ઉદય આ સમયગાળા દરમિયાન થયો", 15 | "આંદોલનકારીઓએ ચીમનભાઇ પટેલના રાજીનામાની માંગણી કરી.", 16 | "અહિયાં શું જોડાય છે?", 17 | "મંદિરનો પૂર્વાભિમુખ ભાગ નાના મંડપ સાથે થોડો લંબચોરસ આકારનો છે.", 18 | ] 19 | -------------------------------------------------------------------------------- /spacy/lang/he/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .lex_attrs import LEX_ATTRS 3 | from .stop_words import STOP_WORDS 4 | 5 | 6 | class HebrewDefaults(BaseDefaults): 7 | stop_words = STOP_WORDS 8 | lex_attr_getters = LEX_ATTRS 9 | writing_system = {"direction": "rtl", "has_case": False, "has_letters": True} 10 | 11 | 12 | class Hebrew(Language): 13 | lang = "he" 14 | Defaults = HebrewDefaults 15 | 16 | 17 | __all__ = ["Hebrew"] 18 | -------------------------------------------------------------------------------- /spacy/lang/hi/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .lex_attrs import LEX_ATTRS 3 | from .stop_words import STOP_WORDS 4 | 5 | 6 | class HindiDefaults(BaseDefaults): 7 | stop_words = STOP_WORDS 8 | lex_attr_getters = LEX_ATTRS 9 | 10 | 11 | class Hindi(Language): 12 | lang = "hi" 13 | Defaults = HindiDefaults 14 | 15 | 16 | __all__ = ["Hindi"] 17 | 
-------------------------------------------------------------------------------- /spacy/lang/hr/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .stop_words import STOP_WORDS 3 | 4 | 5 | class CroatianDefaults(BaseDefaults): 6 | stop_words = STOP_WORDS 7 | 8 | 9 | class Croatian(Language): 10 | lang = "hr" 11 | Defaults = CroatianDefaults 12 | 13 | 14 | __all__ = ["Croatian"] 15 | -------------------------------------------------------------------------------- /spacy/lang/hr/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 3 | 4 | >>> from spacy.lang.hr.examples import sentences 5 | >>> docs = nlp.pipe(sentences) 6 | """ 7 | 8 | sentences = [ 9 | "Ovo je rečenica.", 10 | "Kako se popravlja auto?", 11 | "Zagreb je udaljen od Ljubljane svega 150 km.", 12 | "Nećete vjerovati što se dogodilo na ovogodišnjem festivalu!", 13 | "Budućnost Apple je upitna nakon dugotrajnog pada vrijednosti dionica firme.", 14 | "Trgovina oružjem predstavlja prijetnju za globalni mir.", 15 | ] 16 | -------------------------------------------------------------------------------- /spacy/lang/hsb/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .lex_attrs import LEX_ATTRS 3 | from .stop_words import STOP_WORDS 4 | from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS 5 | 6 | 7 | class UpperSorbianDefaults(BaseDefaults): 8 | lex_attr_getters = LEX_ATTRS 9 | stop_words = STOP_WORDS 10 | tokenizer_exceptions = TOKENIZER_EXCEPTIONS 11 | 12 | 13 | class UpperSorbian(Language): 14 | lang = "hsb" 15 | Defaults = UpperSorbianDefaults 16 | 17 | 18 | __all__ = ["UpperSorbian"] 19 | -------------------------------------------------------------------------------- /spacy/lang/hsb/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 3 | 4 | >>> from spacy.lang.hsb.examples import sentences 5 | >>> docs = nlp.pipe(sentences) 6 | """ 7 | 8 | 9 | sentences = [ 10 | "To běšo wjelgin raźone a jo se wót luźi derje pśiwzeło. 
Tak som dožywiła wjelgin", 11 | "Jogo pśewóźowarce stej groniłej, až how w serbskich stronach njama Santa Claus nic pytaś.", 12 | "A ten sobuźěłaśeŕ Statneje biblioteki w Barlinju jo pśimjeł drogotne knigły bźez rukajcowu z nagima rukoma!", 13 | "Take wobchadanje z našym kulturnym derbstwom zewšym njejźo.", 14 | "Wopśimjeśe drugich pśinoskow jo było na wusokem niwowje, ako pśecej.", 15 | ] 16 | -------------------------------------------------------------------------------- /spacy/lang/hsb/stop_words.py: -------------------------------------------------------------------------------- 1 | STOP_WORDS = set( 2 | """ 3 | a abo ale ani 4 | 5 | dokelž 6 | 7 | hdyž 8 | 9 | jeli jelizo 10 | 11 | kaž 12 | 13 | pak potom 14 | 15 | tež tohodla 16 | 17 | zo zoby 18 | """.split() 19 | ) 20 | -------------------------------------------------------------------------------- /spacy/lang/hsb/tokenizer_exceptions.py: -------------------------------------------------------------------------------- 1 | from ...symbols import NORM, ORTH 2 | from ...util import update_exc 3 | from ..tokenizer_exceptions import BASE_EXCEPTIONS 4 | 5 | _exc = dict() 6 | for exc_data in [ 7 | {ORTH: "mil.", NORM: "milion"}, 8 | {ORTH: "wob.", NORM: "wobydler"}, 9 | ]: 10 | _exc[exc_data[ORTH]] = [exc_data] 11 | 12 | for orth in [ 13 | "resp.", 14 | ]: 15 | _exc[orth] = [{ORTH: orth}] 16 | 17 | 18 | TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc) 19 | -------------------------------------------------------------------------------- /spacy/lang/ht/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 3 | 4 | >>> from spacy.lang.ht.examples import sentences 5 | >>> docs = nlp.pipe(sentences) 6 | """ 7 | 8 | 9 | sentences = [ 10 | "Apple ap panse achte yon demaraj nan Wayòm Ini pou $1 milya dola", 11 | "Machin otonòm fè responsablite asirans lan ale sou men fabrikan yo", 12 | "San Francisco ap konsidere entèdi robo ki livre sou twotwa yo", 13 | "Lond se yon gwo vil nan Wayòm Ini", 14 | "Kote ou ye?", 15 | "Kilès ki prezidan Lafrans?", 16 | "Ki kapital Etazini?", 17 | "Kile Barack Obama te fèt?", 18 | ] 19 | -------------------------------------------------------------------------------- /spacy/lang/ht/tag_map.py: -------------------------------------------------------------------------------- 1 | from spacy.symbols import NOUN, VERB, AUX, ADJ, ADV, PRON, DET, ADP, SCONJ, CCONJ, PART, INTJ, NUM, PROPN, PUNCT, SYM, X 2 | 3 | TAG_MAP = { 4 | "NOUN": {"pos": NOUN}, 5 | "VERB": {"pos": VERB}, 6 | "AUX": {"pos": AUX}, 7 | "ADJ": {"pos": ADJ}, 8 | "ADV": {"pos": ADV}, 9 | "PRON": {"pos": PRON}, 10 | "DET": {"pos": DET}, 11 | "ADP": {"pos": ADP}, 12 | "SCONJ": {"pos": SCONJ}, 13 | "CCONJ": {"pos": CCONJ}, 14 | "PART": {"pos": PART}, 15 | "INTJ": {"pos": INTJ}, 16 | "NUM": {"pos": NUM}, 17 | "PROPN": {"pos": PROPN}, 18 | "PUNCT": {"pos": PUNCT}, 19 | "SYM": {"pos": SYM}, 20 | "X": {"pos": X}, 21 | } 22 | -------------------------------------------------------------------------------- /spacy/lang/hu/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES 3 | from .stop_words import STOP_WORDS 4 | from .tokenizer_exceptions import TOKEN_MATCH, TOKENIZER_EXCEPTIONS 5 | 6 | 7 | class HungarianDefaults(BaseDefaults): 8 | tokenizer_exceptions = 
TOKENIZER_EXCEPTIONS 9 | prefixes = TOKENIZER_PREFIXES 10 | suffixes = TOKENIZER_SUFFIXES 11 | infixes = TOKENIZER_INFIXES 12 | token_match = TOKEN_MATCH 13 | stop_words = STOP_WORDS 14 | 15 | 16 | class Hungarian(Language): 17 | lang = "hu" 18 | Defaults = HungarianDefaults 19 | 20 | 21 | __all__ = ["Hungarian"] 22 | -------------------------------------------------------------------------------- /spacy/lang/hu/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 3 | 4 | >>> from spacy.lang.hu.examples import sentences 5 | >>> docs = nlp.pipe(sentences) 6 | """ 7 | 8 | 9 | sentences = [ 10 | "Az Apple egy brit startup vásárlását tervezi 1 milliárd dollár értékben.", 11 | "San Francisco vezetése mérlegeli a járdát használó szállító robotok betiltását.", 12 | "London az Egyesült Királyság egy nagy városa.", 13 | ] 14 | -------------------------------------------------------------------------------- /spacy/lang/hy/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .lex_attrs import LEX_ATTRS 3 | from .stop_words import STOP_WORDS 4 | 5 | 6 | class ArmenianDefaults(BaseDefaults): 7 | lex_attr_getters = LEX_ATTRS 8 | stop_words = STOP_WORDS 9 | 10 | 11 | class Armenian(Language): 12 | lang = "hy" 13 | Defaults = ArmenianDefaults 14 | 15 | 16 | __all__ = ["Armenian"] 17 | -------------------------------------------------------------------------------- /spacy/lang/hy/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 3 | >>> from spacy.lang.hy.examples import sentences 4 | >>> docs = nlp.pipe(sentences) 5 | """ 6 | 7 | 8 | sentences = [ 9 | "Լոնդոնը Միացյալ Թագավորության մեծ քաղաք է։", 10 | "Ո՞վ է Ֆրանսիայի նախագահը։", 11 | "Ո՞րն է Միացյալ Նահանգների մայրաքաղաքը։", 12 | "Ե՞րբ է ծնվել Բարաք Օբաման։", 13 | ] 14 | -------------------------------------------------------------------------------- /spacy/lang/is/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .stop_words import STOP_WORDS 3 | 4 | 5 | class IcelandicDefaults(BaseDefaults): 6 | stop_words = STOP_WORDS 7 | 8 | 9 | class Icelandic(Language): 10 | lang = "is" 11 | Defaults = IcelandicDefaults 12 | 13 | 14 | __all__ = ["Icelandic"] 15 | -------------------------------------------------------------------------------- /spacy/lang/it/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 3 | 4 | >>> from spacy.lang.it.examples import sentences 5 | >>> docs = nlp.pipe(sentences) 6 | """ 7 | 8 | 9 | sentences = [ 10 | "Apple vuole comprare una startup del Regno Unito per un miliardo di dollari", 11 | "Le automobili a guida autonoma spostano la responsabilità assicurativa verso i produttori", 12 | "San Francisco prevede di bandire i robot di consegna porta a porta", 13 | "Londra è una grande città del Regno Unito.", 14 | ] 15 | -------------------------------------------------------------------------------- /spacy/lang/ja/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 
3 | 4 | >>> from spacy.lang.ja.examples import sentences 5 | >>> docs = nlp.pipe(sentences) 6 | """ 7 | 8 | 9 | sentences = [ 10 | "アップルがイギリスの新興企業を10億ドルで購入を検討", 11 | "自動運転車の損害賠償責任、自動車メーカーに一定の負担を求める", 12 | "歩道を走る自動配達ロボ、サンフランシスコ市が走行禁止を検討", 13 | "ロンドンはイギリスの大都市です。", 14 | ] 15 | -------------------------------------------------------------------------------- /spacy/lang/ja/tag_orth_map.py: -------------------------------------------------------------------------------- 1 | from ...symbols import DET, PART, PRON, SPACE, X 2 | 3 | # mapping from tag bi-gram to pos of previous token 4 | TAG_ORTH_MAP = { 5 | "空白": {" ": SPACE, " ": X}, 6 | "助詞-副助詞": {"たり": PART}, 7 | "連体詞": { 8 | "あの": DET, 9 | "かの": DET, 10 | "この": DET, 11 | "その": DET, 12 | "どの": DET, 13 | "彼の": DET, 14 | "此の": DET, 15 | "其の": DET, 16 | "ある": PRON, 17 | "こんな": PRON, 18 | "そんな": PRON, 19 | "どんな": PRON, 20 | "あらゆる": PRON, 21 | }, 22 | } 23 | -------------------------------------------------------------------------------- /spacy/lang/kmr/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .lex_attrs import LEX_ATTRS 3 | from .stop_words import STOP_WORDS 4 | 5 | 6 | class KurmanjiDefaults(BaseDefaults): 7 | stop_words = STOP_WORDS 8 | lex_attr_getters = LEX_ATTRS 9 | 10 | 11 | class Kurmanji(Language): 12 | lang = "kmr" 13 | Defaults = KurmanjiDefaults 14 | 15 | 16 | __all__ = ["Kurmanji"] 17 | -------------------------------------------------------------------------------- /spacy/lang/kmr/stop_words.py: -------------------------------------------------------------------------------- 1 | STOP_WORDS = set( 2 | """ 3 | û 4 | li 5 | bi 6 | di 7 | da 8 | de 9 | ji 10 | ku 11 | ew 12 | ez 13 | tu 14 | em 15 | hûn 16 | ew 17 | ev 18 | min 19 | te 20 | wî 21 | wê 22 | me 23 | we 24 | wan 25 | vê 26 | vî 27 | va 28 | çi 29 | kî 30 | kê 31 | çawa 32 | çima 33 | kengî 34 | li ku 35 | çend 36 | çiqas 37 | her 38 | hin 39 | gelek 40 | hemû 41 | kes 42 | tişt 43 | """.split() 44 | ) 45 | -------------------------------------------------------------------------------- /spacy/lang/kn/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .stop_words import STOP_WORDS 3 | 4 | 5 | class KannadaDefaults(BaseDefaults): 6 | stop_words = STOP_WORDS 7 | 8 | 9 | class Kannada(Language): 10 | lang = "kn" 11 | Defaults = KannadaDefaults 12 | 13 | 14 | __all__ = ["Kannada"] 15 | -------------------------------------------------------------------------------- /spacy/lang/kn/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 3 | 4 | >>> from spacy.lang.en.examples import sentences 5 | >>> docs = nlp.pipe(sentences) 6 | """ 7 | 8 | 9 | sentences = [ 10 | "ಆಪಲ್ ಒಂದು ಯು.ಕೆ. 
ಸ್ಟಾರ್ಟ್ಅಪ್ ಅನ್ನು ೧ ಶತಕೋಟಿ ಡಾಲರ್ಗಳಿಗೆ ಖರೀದಿಸಲು ನೋಡುತ್ತಿದೆ.", 11 | "ಸ್ವಾಯತ್ತ ಕಾರುಗಳು ವಿಮಾ ಹೊಣೆಗಾರಿಕೆಯನ್ನು ತಯಾರಕರ ಕಡೆಗೆ ಬದಲಾಯಿಸುತ್ತವೆ.", 12 | "ಕಾಲುದಾರಿ ವಿತರಣಾ ರೋಬೋಟ್‌ಗಳನ್ನು ನಿಷೇಧಿಸುವುದನ್ನು ಸ್ಯಾನ್ ಫ್ರಾನ್ಸಿಸ್ಕೊ ​​ಪರಿಗಣಿಸುತ್ತದೆ.", 13 | "ಲಂಡನ್ ಯುನೈಟೆಡ್ ಕಿಂಗ್‌ಡಂನ ದೊಡ್ಡ ನಗರ.", 14 | "ನೀನು ಎಲ್ಲಿದಿಯಾ?", 15 | "ಫ್ರಾನ್ಸಾದ ಅಧ್ಯಕ್ಷರು ಯಾರು?", 16 | "ಯುನೈಟೆಡ್ ಸ್ಟೇಟ್ಸ್ನ ರಾಜಧಾನಿ ಯಾವುದು?", 17 | "ಬರಾಕ್ ಒಬಾಮ ಯಾವಾಗ ಜನಿಸಿದರು?", 18 | ] 19 | -------------------------------------------------------------------------------- /spacy/lang/ko/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 3 | 4 | >>> from spacy.lang.ko.examples import sentences 5 | >>> docs = nlp.pipe(sentences) 6 | """ 7 | 8 | sentences = [ 9 | "애플이 영국의 스타트업을 10억 달러에 인수하는 것을 알아보고 있다.", 10 | "자율주행 자동차의 손해 배상 책임이 제조 업체로 옮겨 가다", 11 | "샌프란시스코 시가 자동 배달 로봇의 보도 주행 금지를 검토 중이라고 합니다.", 12 | "런던은 영국의 수도이자 가장 큰 도시입니다.", 13 | ] 14 | -------------------------------------------------------------------------------- /spacy/lang/ko/punctuation.py: -------------------------------------------------------------------------------- 1 | from ..char_classes import LIST_QUOTES 2 | from ..punctuation import TOKENIZER_INFIXES as BASE_TOKENIZER_INFIXES 3 | 4 | _infixes = ( 5 | ["·", "ㆍ", r"\(", r"\)"] 6 | + [r"(?<=[0-9])~(?=[0-9-])"] 7 | + LIST_QUOTES 8 | + BASE_TOKENIZER_INFIXES 9 | ) 10 | 11 | TOKENIZER_INFIXES = _infixes 12 | -------------------------------------------------------------------------------- /spacy/lang/ko/stop_words.py: -------------------------------------------------------------------------------- 1 | STOP_WORDS = set( 2 | """ 3 | 이 4 | 있 5 | 하 6 | 것 7 | 들 8 | 그 9 | 되 10 | 수 11 | 이 12 | 보 13 | 않 14 | 없 15 | 나 16 | 주 17 | 아니 18 | 등 19 | 같 20 | 때 21 | 년 22 | 가 23 | 한 24 | 지 25 | 오 26 | 말 27 | 일 28 | 그렇 29 | 위하 30 | 때문 31 | 그것 32 | 두 33 | 말하 34 | 알 35 | 그러나 36 | 받 37 | 못하 38 | 일 39 | 그런 40 | 또 41 | 더 42 | 많 43 | 그리고 44 | 좋 45 | 크 46 | 시키 47 | 그러 48 | 하나 49 | 살 50 | 데 51 | 안 52 | 어떤 53 | 번 54 | 나 55 | 다른 56 | 어떻 57 | 들 58 | 이렇 59 | 점 60 | 싶 61 | 말 62 | 좀 63 | 원 64 | 잘 65 | 놓 66 | """.split() 67 | ) 68 | -------------------------------------------------------------------------------- /spacy/lang/ky/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .lex_attrs import LEX_ATTRS 3 | from .punctuation import TOKENIZER_INFIXES 4 | from .stop_words import STOP_WORDS 5 | from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS 6 | 7 | 8 | class KyrgyzDefaults(BaseDefaults): 9 | tokenizer_exceptions = TOKENIZER_EXCEPTIONS 10 | infixes = TOKENIZER_INFIXES 11 | lex_attr_getters = LEX_ATTRS 12 | stop_words = STOP_WORDS 13 | 14 | 15 | class Kyrgyz(Language): 16 | lang = "ky" 17 | Defaults = KyrgyzDefaults 18 | 19 | 20 | __all__ = ["Kyrgyz"] 21 | -------------------------------------------------------------------------------- /spacy/lang/ky/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 
3 | >>> from spacy.lang.ky.examples import sentences 4 | >>> docs = nlp.pipe(sentences) 5 | """ 6 | 7 | sentences = [ 8 | "Apple Улуу Британия стартабын $1 миллиардга сатып алууну көздөөдө.", 9 | "Автоном автомобилдерди камсыздоо жоопкерчилиги өндүрүүчүлөргө артылды.", 10 | "Сан-Франциско тротуар менен жүрүүчү робот-курьерлерге тыю салууну караштырууда.", 11 | "Лондон - Улуу Британияда жайгашкан ири шаар.", 12 | "Кайдасың?", 13 | "Франциянын президенти ким?", 14 | "Америка Кошмо Штаттарынын борбор калаасы кайсы шаар?", 15 | "Барак Обама качан төрөлгөн?", 16 | ] 17 | -------------------------------------------------------------------------------- /spacy/lang/la/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .lex_attrs import LEX_ATTRS 3 | from .stop_words import STOP_WORDS 4 | from .syntax_iterators import SYNTAX_ITERATORS 5 | from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS 6 | 7 | 8 | class LatinDefaults(BaseDefaults): 9 | tokenizer_exceptions = TOKENIZER_EXCEPTIONS 10 | stop_words = STOP_WORDS 11 | lex_attr_getters = LEX_ATTRS 12 | syntax_iterators = SYNTAX_ITERATORS 13 | 14 | 15 | class Latin(Language): 16 | lang = "la" 17 | Defaults = LatinDefaults 18 | 19 | 20 | __all__ = ["Latin"] 21 | -------------------------------------------------------------------------------- /spacy/lang/lb/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .lex_attrs import LEX_ATTRS 3 | from .punctuation import TOKENIZER_INFIXES 4 | from .stop_words import STOP_WORDS 5 | from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS 6 | 7 | 8 | class LuxembourgishDefaults(BaseDefaults): 9 | tokenizer_exceptions = TOKENIZER_EXCEPTIONS 10 | infixes = TOKENIZER_INFIXES 11 | lex_attr_getters = LEX_ATTRS 12 | stop_words = STOP_WORDS 13 | 14 | 15 | class Luxembourgish(Language): 16 | lang = "lb" 17 | Defaults = LuxembourgishDefaults 18 | 19 | 20 | __all__ = ["Luxembourgish"] 21 | -------------------------------------------------------------------------------- /spacy/lang/lb/punctuation.py: -------------------------------------------------------------------------------- 1 | from ..char_classes import ALPHA, ALPHA_LOWER, ALPHA_UPPER, LIST_ELLIPSES, LIST_ICONS 2 | 3 | ELISION = " ' ’ ".strip().replace(" ", "") 4 | 5 | abbrev = ("d", "D") 6 | 7 | _infixes = ( 8 | LIST_ELLIPSES 9 | + LIST_ICONS 10 | + [ 11 | r"(?<=^[{ab}][{el}])(?=[{a}])".format(ab=abbrev, a=ALPHA, el=ELISION), 12 | r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER), 13 | r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA), 14 | r"(?<=[{a}])[:<>=](?=[{a}])".format(a=ALPHA), 15 | r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA), 16 | r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA), 17 | r"(?<=[0-9])-(?=[0-9])", 18 | ] 19 | ) 20 | 21 | TOKENIZER_INFIXES = _infixes 22 | -------------------------------------------------------------------------------- /spacy/lang/lg/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .lex_attrs import LEX_ATTRS 3 | from .punctuation import TOKENIZER_INFIXES 4 | from .stop_words import STOP_WORDS 5 | 6 | 7 | class LugandaDefaults(BaseDefaults): 8 | lex_attr_getters = LEX_ATTRS 9 | infixes = TOKENIZER_INFIXES 10 | stop_words = STOP_WORDS 11 | 12 | 13 | class Luganda(Language): 14 | lang = "lg" 15 | Defaults = LugandaDefaults 16 | 
17 | 18 | __all__ = ["Luganda"] 19 | -------------------------------------------------------------------------------- /spacy/lang/lg/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 3 | 4 | >>> from spacy.lang.lg.examples import sentences 5 | >>> docs = nlp.pipe(sentences) 6 | """ 7 | 8 | sentences = [ 9 | "Mpa ebyafaayo ku byalo Nakatu ne Nkajja", 10 | "Okuyita Ttembo kitegeeza kugwa ddalu", 11 | "Ekifumu kino kyali kya mulimu ki?", 12 | "Ekkovu we liyise wayitibwa mukululo", 13 | "Akola mulimu ki oguvaamu ssente?", 14 | "Emisumaali egikomerera embaawo giyitibwa nninga", 15 | "Abooluganda ab’emmamba ababiri", 16 | "Ekisaawe ky'ebyenjigiriza kya mugaso nnyo", 17 | ] 18 | -------------------------------------------------------------------------------- /spacy/lang/lg/punctuation.py: -------------------------------------------------------------------------------- 1 | from ..char_classes import ( 2 | ALPHA, 3 | ALPHA_LOWER, 4 | ALPHA_UPPER, 5 | CONCAT_QUOTES, 6 | HYPHENS, 7 | LIST_ELLIPSES, 8 | LIST_ICONS, 9 | ) 10 | 11 | _infixes = ( 12 | LIST_ELLIPSES 13 | + LIST_ICONS 14 | + [ 15 | r"(?<=[0-9])[+\-\*^](?=[0-9-])", 16 | r"(?<=[{al}{q}])\.(?=[{au}{q}])".format( 17 | al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES 18 | ), 19 | r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA), 20 | r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS), 21 | r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA), 22 | ] 23 | ) 24 | 25 | 26 | TOKENIZER_INFIXES = _infixes 27 | -------------------------------------------------------------------------------- /spacy/lang/lij/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .punctuation import TOKENIZER_INFIXES 3 | from .stop_words import STOP_WORDS 4 | from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS 5 | 6 | 7 | class LigurianDefaults(BaseDefaults): 8 | tokenizer_exceptions = TOKENIZER_EXCEPTIONS 9 | infixes = TOKENIZER_INFIXES 10 | stop_words = STOP_WORDS 11 | 12 | 13 | class Ligurian(Language): 14 | lang = "lij" 15 | Defaults = LigurianDefaults 16 | 17 | 18 | __all__ = ["Ligurian"] 19 | -------------------------------------------------------------------------------- /spacy/lang/lij/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 
3 | 4 | >>> from spacy.lang.lij.examples import sentences 5 | >>> docs = nlp.pipe(sentences) 6 | """ 7 | 8 | 9 | sentences = [ 10 | "Sciusciâ e sciorbî no se peu.", 11 | "Graçie di çetroin, che me son arrivæ.", 12 | "Vegnime apreuvo, che ve fasso pescâ di òmmi.", 13 | "Bella pe sempre l'ægua inta conchetta quande unn'agoggia d'ægua a se â trapaña.", 14 | ] 15 | -------------------------------------------------------------------------------- /spacy/lang/lij/punctuation.py: -------------------------------------------------------------------------------- 1 | from ..char_classes import ALPHA 2 | from ..punctuation import TOKENIZER_INFIXES 3 | 4 | ELISION = " ' ’ ".strip().replace(" ", "").replace("\n", "") 5 | 6 | 7 | _infixes = TOKENIZER_INFIXES + [ 8 | r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION) 9 | ] 10 | 11 | TOKENIZER_INFIXES = _infixes 12 | -------------------------------------------------------------------------------- /spacy/lang/lt/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .lex_attrs import LEX_ATTRS 3 | from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES 4 | from .stop_words import STOP_WORDS 5 | from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS 6 | 7 | 8 | class LithuanianDefaults(BaseDefaults): 9 | infixes = TOKENIZER_INFIXES 10 | suffixes = TOKENIZER_SUFFIXES 11 | tokenizer_exceptions = TOKENIZER_EXCEPTIONS 12 | stop_words = STOP_WORDS 13 | lex_attr_getters = LEX_ATTRS 14 | 15 | 16 | class Lithuanian(Language): 17 | lang = "lt" 18 | Defaults = LithuanianDefaults 19 | 20 | 21 | __all__ = ["Lithuanian"] 22 | -------------------------------------------------------------------------------- /spacy/lang/lt/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 
3 | 4 | >>> from spacy.lang.lt.examples import sentences 5 | >>> docs = nlp.pipe(sentences) 6 | """ 7 | 8 | 9 | sentences = [ 10 | "Jaunikis pirmąją vestuvinę naktį iškeitė į areštinės gultą", 11 | "Bepiločiai automobiliai išnaikins vairavimo mokyklas, autoservisus ir eismo nelaimes", 12 | "Vilniuje galvojama uždrausti naudoti skėčius", 13 | "Londonas yra didelis miestas Jungtinėje Karalystėje", 14 | "Kur tu?", 15 | "Kas yra Prancūzijos prezidentas?", 16 | "Kokia yra Jungtinių Amerikos Valstijų sostinė?", 17 | "Kada gimė Dalia Grybauskaitė?", 18 | ] 19 | -------------------------------------------------------------------------------- /spacy/lang/lt/tokenizer_exceptions.py: -------------------------------------------------------------------------------- 1 | from ...symbols import ORTH 2 | from ...util import update_exc 3 | from ..tokenizer_exceptions import BASE_EXCEPTIONS 4 | 5 | _exc = {} 6 | 7 | for orth in ["n-tosios", "?!"]: 8 | _exc[orth] = [{ORTH: orth}] 9 | 10 | mod_base_exceptions = { 11 | exc: val for exc, val in BASE_EXCEPTIONS.items() if not exc.endswith(".") 12 | } 13 | del mod_base_exceptions["8)"] 14 | TOKENIZER_EXCEPTIONS = update_exc(mod_base_exceptions, _exc) 15 | -------------------------------------------------------------------------------- /spacy/lang/lv/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .stop_words import STOP_WORDS 3 | 4 | 5 | class LatvianDefaults(BaseDefaults): 6 | stop_words = STOP_WORDS 7 | 8 | 9 | class Latvian(Language): 10 | lang = "lv" 11 | Defaults = LatvianDefaults 12 | 13 | 14 | __all__ = ["Latvian"] 15 | -------------------------------------------------------------------------------- /spacy/lang/ml/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .lex_attrs import LEX_ATTRS 3 | from .stop_words import STOP_WORDS 4 | 5 | 6 | class MalayalamDefaults(BaseDefaults): 7 | lex_attr_getters = LEX_ATTRS 8 | stop_words = STOP_WORDS 9 | 10 | 11 | class Malayalam(Language): 12 | lang = "ml" 13 | Defaults = MalayalamDefaults 14 | 15 | 16 | __all__ = ["Malayalam"] 17 | -------------------------------------------------------------------------------- /spacy/lang/ml/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 3 | 4 | >>> from spacy.lang.ml.examples import sentences 5 | >>> docs = nlp.pipe(sentences) 6 | """ 7 | 8 | 9 | sentences = [ 10 | "അനാവശ്യമായി കണ്ണിലും മൂക്കിലും വായിലും സ്പർശിക്കാതിരിക്കുക", 11 | "പൊതുരംഗത്ത് മലയാള ഭാഷയുടെ സമഗ്രപുരോഗതി ലക്ഷ്യമാക്കി പ്രവർത്തിക്കുന്ന സംഘടനയായ മലയാളഐക്യവേദിയുടെ വിദ്യാർത്ഥിക്കൂട്ടായ്മയാണ് വിദ്യാർത്ഥി മലയാളവേദി", 12 | "എന്താണ്‌ കവാടങ്ങൾ?", 13 | "ചുരുക്കത്തിൽ വിക്കിപീഡിയയുടെ ഉള്ളടക്കത്തിലേക്കുള്ള പടിപ്പുരകളാണ്‌‌ കവാടങ്ങൾ. 
അവ ലളിതവും വായനക്കാരനെ ആകർഷിക്കുന്നതുമായിരിക്കും", 14 | "പതിനൊന്നുപേർ വീതമുള്ള രണ്ടു ടീമുകൾ കളിക്കുന്ന സംഘകായിക വിനോദമാണു ക്രിക്കറ്റ്", 15 | ] 16 | -------------------------------------------------------------------------------- /spacy/lang/ml/stop_words.py: -------------------------------------------------------------------------------- 1 | STOP_WORDS = set( 2 | """ 3 | അത് 4 | ഇത് 5 | ആയിരുന്നു 6 | ആകുന്നു 7 | വരെ 8 | അന്നേരം 9 | അന്ന് 10 | ഇന്ന് 11 | ആണ് 12 | """.split() 13 | ) 14 | -------------------------------------------------------------------------------- /spacy/lang/mr/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .stop_words import STOP_WORDS 3 | 4 | 5 | class MarathiDefaults(BaseDefaults): 6 | stop_words = STOP_WORDS 7 | 8 | 9 | class Marathi(Language): 10 | lang = "mr" 11 | Defaults = MarathiDefaults 12 | 13 | 14 | __all__ = ["Marathi"] 15 | -------------------------------------------------------------------------------- /spacy/lang/ms/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 3 | 4 | >>> from spacy.lang.ms.examples import sentences 5 | >>> docs = nlp.pipe(sentences) 6 | """ 7 | 8 | 9 | sentences = [ 10 | "Malaysia ialah sebuah negara yang terletak di Asia Tenggara.", 11 | "Berapa banyak pelajar yang akan menghadiri majlis perpisahan sekolah?", 12 | "Pengeluaran makanan berasal dari beberapa lokasi termasuk Cameron Highlands, Johor Bahru, dan Kuching.", 13 | "Syarikat XYZ telah menghasilkan 20,000 unit produk baharu dalam setahun terakhir", 14 | "Kuala Lumpur merupakan ibu negara Malaysia.", "Kau berada di mana semalam?", 15 | "Siapa yang akan memimpin projek itu?", 16 | "Siapa perdana menteri Malaysia sekarang?", 17 | ] 18 | -------------------------------------------------------------------------------- /spacy/lang/nb/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 3 | 4 | >>> from spacy.lang.nb.examples import sentences 5 | >>> docs = nlp.pipe(sentences) 6 | """ 7 | 8 | 9 | sentences = [ 10 | "Apple vurderer å kjøpe britisk oppstartfirma for en milliard dollar.", 11 | "Selvkjørende biler flytter forsikringsansvaret over på produsentene.", 12 | "San Francisco vurderer å forby robotbud på fortauene.", 13 | "London er en stor by i Storbritannia.", 14 | ] 15 | -------------------------------------------------------------------------------- /spacy/lang/ne/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .lex_attrs import LEX_ATTRS 3 | from .stop_words import STOP_WORDS 4 | 5 | 6 | class NepaliDefaults(BaseDefaults): 7 | stop_words = STOP_WORDS 8 | lex_attr_getters = LEX_ATTRS 9 | 10 | 11 | class Nepali(Language): 12 | lang = "ne" 13 | Defaults = NepaliDefaults 14 | 15 | 16 | __all__ = ["Nepali"] 17 | -------------------------------------------------------------------------------- /spacy/lang/ne/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 
3 | 4 | >>> from spacy.lang.ne.examples import sentences 5 | >>> docs = nlp.pipe(sentences) 6 | """ 7 | 8 | 9 | sentences = [ 10 | "एप्पलले अमेरिकी स्टार्टअप १ अर्ब डलरमा किन्ने सोच्दै छ", 11 | "स्वायत्त कारहरूले बीमा दायित्व निर्माताहरु तिर बदल्छन्", 12 | "स्यान फ्रांसिस्कोले फुटपाथ वितरण रोबोटहरु प्रतिबंध गर्ने विचार गर्दै छ", 13 | "लन्डन यूनाइटेड किंगडमको एक ठूलो शहर हो।", 14 | "तिमी कहाँ छौ?", 15 | "फ्रान्स को राष्ट्रपति को हो?", 16 | "संयुक्त राज्यको राजधानी के हो?", 17 | "बराक ओबामा कहिले कहिले जन्मेका हुन्?", 18 | ] 19 | -------------------------------------------------------------------------------- /spacy/lang/nl/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 3 | 4 | >>> from spacy.lang.nl.examples import sentences 5 | >>> docs = nlp.pipe(sentences) 6 | """ 7 | 8 | 9 | sentences = [ 10 | "Apple overweegt om voor 1 miljard een U.K. startup te kopen", 11 | "Autonome auto's verschuiven de verzekeringverantwoordelijkheid naar producenten", 12 | "San Francisco overweegt robots op voetpaden te verbieden", 13 | "Londen is een grote stad in het Verenigd Koninkrijk", 14 | ] 15 | -------------------------------------------------------------------------------- /spacy/lang/nn/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from ..nb import SYNTAX_ITERATORS 3 | from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES 4 | from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS 5 | 6 | 7 | class NorwegianNynorskDefaults(BaseDefaults): 8 | tokenizer_exceptions = TOKENIZER_EXCEPTIONS 9 | prefixes = TOKENIZER_PREFIXES 10 | infixes = TOKENIZER_INFIXES 11 | suffixes = TOKENIZER_SUFFIXES 12 | syntax_iterators = SYNTAX_ITERATORS 13 | 14 | 15 | class NorwegianNynorsk(Language): 16 | lang = "nn" 17 | Defaults = NorwegianNynorskDefaults 18 | 19 | 20 | __all__ = ["NorwegianNynorsk"] 21 | -------------------------------------------------------------------------------- /spacy/lang/nn/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 3 | 4 | >>> from spacy.lang.nn.examples import sentences 5 | >>> docs = nlp.pipe(sentences) 6 | """ 7 | 8 | 9 | # sentences taken from Omsetjingsminne frå Nynorsk pressekontor 2022 (https://www.nb.no/sprakbanken/en/resource-catalogue/oai-nb-no-sbr-80/) 10 | sentences = [ 11 | "Konseptet går ut på at alle tre omgangar tel, alle hopparar må stille i kvalifiseringa og poengsummen skal telje.", 12 | "Det er ein meir enn i same periode i fjor.", 13 | "Det har lava ned enorme snømengder i store delar av Europa den siste tida.", 14 | "Akhtar Chaudhry er ikkje innstilt på Oslo-lista til SV, men utfordrar Heikki Holmås om førsteplassen.", 15 | ] 16 | -------------------------------------------------------------------------------- /spacy/lang/pl/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 
3 | 4 | >>> from spacy.lang.pl.examples import sentences 5 | >>> docs = nlp.pipe(sentences) 6 | """ 7 | 8 | 9 | sentences = [ 10 | "Poczuł przyjemną woń mocnej kawy.", 11 | "Istnieje wiele dróg oddziaływania substancji psychoaktywnej na układ nerwowy.", 12 | "Powitał mnie biało-czarny kot, płosząc siedzące na płocie trzy dorodne dudki.", 13 | "Nowy abonament pod lupą Komisji Europejskiej", 14 | "Czy w ciągu ostatnich 48 godzin spożyłeś leki zawierające paracetamol?", 15 | "Kto ma ochotę zapoznać się z innymi niż w książkach przygodami Muminków i ich przyjaciół, temu polecam komiks Tove Jansson „Muminki i morze”.", 16 | ] 17 | -------------------------------------------------------------------------------- /spacy/lang/pt/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .lex_attrs import LEX_ATTRS 3 | from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES 4 | from .stop_words import STOP_WORDS 5 | from .syntax_iterators import SYNTAX_ITERATORS 6 | from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS 7 | 8 | 9 | class PortugueseDefaults(BaseDefaults): 10 | tokenizer_exceptions = TOKENIZER_EXCEPTIONS 11 | infixes = TOKENIZER_INFIXES 12 | prefixes = TOKENIZER_PREFIXES 13 | lex_attr_getters = LEX_ATTRS 14 | syntax_iterators = SYNTAX_ITERATORS 15 | stop_words = STOP_WORDS 16 | 17 | 18 | class Portuguese(Language): 19 | lang = "pt" 20 | Defaults = PortugueseDefaults 21 | 22 | 23 | __all__ = ["Portuguese"] 24 | -------------------------------------------------------------------------------- /spacy/lang/pt/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 3 | 4 | >>> from spacy.lang.pt.examples import sentences 5 | >>> docs = nlp.pipe(sentences) 6 | """ 7 | 8 | 9 | sentences = [ 10 | "Apple está querendo comprar uma startup do Reino Unido por 100 milhões de dólares", 11 | "Carros autônomos empurram a responsabilidade do seguro para os fabricantes." 12 | "São Francisco considera banir os robôs de entrega que andam pelas calçadas", 13 | "Londres é a maior cidade do Reino Unido", 14 | ] 15 | -------------------------------------------------------------------------------- /spacy/lang/pt/punctuation.py: -------------------------------------------------------------------------------- 1 | from ..punctuation import TOKENIZER_INFIXES as BASE_TOKENIZER_INFIXES 2 | from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES 3 | from ..punctuation import TOKENIZER_SUFFIXES as BASE_TOKENIZER_SUFFIXES 4 | 5 | _prefixes = [r"\w{1,3}\$"] + BASE_TOKENIZER_PREFIXES 6 | 7 | _suffixes = BASE_TOKENIZER_SUFFIXES 8 | 9 | _infixes = [r"(\w+-\w+(-\w+)*)"] + BASE_TOKENIZER_INFIXES 10 | 11 | TOKENIZER_PREFIXES = _prefixes 12 | TOKENIZER_SUFFIXES = _suffixes 13 | TOKENIZER_INFIXES = _infixes 14 | -------------------------------------------------------------------------------- /spacy/lang/ro/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 
3 | 4 | >>> from spacy.lang.ro import Romanian 5 | >>> from spacy.lang.ro.examples import sentences 6 | >>> nlp = Romanian() 7 | >>> docs = nlp.pipe(sentences) 8 | """ 9 | 10 | 11 | sentences = [ 12 | "Apple plănuiește să cumpere o companie britanică pentru un miliard de dolari", 13 | "Municipalitatea din San Francisco ia în calcul interzicerea roboților curieri pe trotuar", 14 | "Londra este un oraș mare în Regatul Unit", 15 | "Unde ești?", 16 | "Cine este președintele Franței?", 17 | "Care este capitala Statelor Unite?", 18 | "Când s-a născut Barack Obama?", 19 | ] 20 | -------------------------------------------------------------------------------- /spacy/lang/sa/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .lex_attrs import LEX_ATTRS 3 | from .stop_words import STOP_WORDS 4 | 5 | 6 | class SanskritDefaults(BaseDefaults): 7 | lex_attr_getters = LEX_ATTRS 8 | stop_words = STOP_WORDS 9 | 10 | 11 | class Sanskrit(Language): 12 | lang = "sa" 13 | Defaults = SanskritDefaults 14 | 15 | 16 | __all__ = ["Sanskrit"] 17 | -------------------------------------------------------------------------------- /spacy/lang/sa/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 3 | 4 | >>> from spacy.lang.sa.examples import sentences 5 | >>> docs = nlp.pipe(sentences) 6 | """ 7 | 8 | 9 | sentences = [ 10 | "अभ्यावहति कल्याणं विविधं वाक् सुभाषिता ।", 11 | "मनसि व्याकुले चक्षुः पश्यन्नपि न पश्यति ।", 12 | "यस्य बुद्धिर्बलं तस्य निर्बुद्धेस्तु कुतो बलम्?", 13 | "परो अपि हितवान् बन्धुः बन्धुः अपि अहितः परः ।", 14 | "अहितः देहजः व्याधिः हितम् आरण्यं औषधम् ॥", 15 | ] 16 | -------------------------------------------------------------------------------- /spacy/lang/si/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .lex_attrs import LEX_ATTRS 3 | from .stop_words import STOP_WORDS 4 | 5 | 6 | class SinhalaDefaults(BaseDefaults): 7 | lex_attr_getters = LEX_ATTRS 8 | stop_words = STOP_WORDS 9 | 10 | 11 | class Sinhala(Language): 12 | lang = "si" 13 | Defaults = SinhalaDefaults 14 | 15 | 16 | __all__ = ["Sinhala"] 17 | -------------------------------------------------------------------------------- /spacy/lang/si/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 
3 | 4 | >>> from spacy.lang.si.examples import sentences 5 | >>> docs = nlp.pipe(sentences) 6 | """ 7 | 8 | 9 | sentences = [ 10 | "මෙය වාක්‍යයකි.", 11 | "ඔබ කවුද?", 12 | "ගූගල් සමාගම ඩොලර් මිලියන 500 කට එම ආයතනය මිලදී ගන්නා ලදී.", 13 | "කොළඹ ශ්‍රී ලංකාවේ ප්‍රධානතම නගරය යි.", 14 | "ප්‍රංශයේ ජනාධිපති කවරෙක් ද?", 15 | "මට බිස්කට් 1 ක් දෙන්න", 16 | "ඔවුන් ලකුණු 59 කින් තරඟය පරාජයට පත් විය.", 17 | "1 ත් 10 ත් අතර සංඛ්‍යාවක් කියන්න", 18 | "ඔහු සහ ඇය නුවර හෝ කොළඹ පදිංචි කරුවන් නොවේ", 19 | ] 20 | -------------------------------------------------------------------------------- /spacy/lang/sk/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .lex_attrs import LEX_ATTRS 3 | from .stop_words import STOP_WORDS 4 | 5 | 6 | class SlovakDefaults(BaseDefaults): 7 | lex_attr_getters = LEX_ATTRS 8 | stop_words = STOP_WORDS 9 | 10 | 11 | class Slovak(Language): 12 | lang = "sk" 13 | Defaults = SlovakDefaults 14 | 15 | 16 | __all__ = ["Slovak"] 17 | -------------------------------------------------------------------------------- /spacy/lang/sl/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .lex_attrs import LEX_ATTRS 3 | from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES 4 | from .stop_words import STOP_WORDS 5 | from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS 6 | 7 | 8 | class SlovenianDefaults(BaseDefaults): 9 | stop_words = STOP_WORDS 10 | tokenizer_exceptions = TOKENIZER_EXCEPTIONS 11 | prefixes = TOKENIZER_PREFIXES 12 | infixes = TOKENIZER_INFIXES 13 | suffixes = TOKENIZER_SUFFIXES 14 | lex_attr_getters = LEX_ATTRS 15 | 16 | 17 | class Slovenian(Language): 18 | lang = "sl" 19 | Defaults = SlovenianDefaults 20 | 21 | 22 | __all__ = ["Slovenian"] 23 | -------------------------------------------------------------------------------- /spacy/lang/sl/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 3 | 4 | >>> from spacy.lang.sl.examples import sentences 5 | >>> docs = nlp.pipe(sentences) 6 | """ 7 | 8 | 9 | sentences = [ 10 | "Apple načrtuje nakup britanskega startupa za 1 bilijon dolarjev", 11 | "France Prešeren je umrl 8. februarja 1849 v Kranju", 12 | "Staro ljubljansko letališče Moste bo obnovila družba BTC", 13 | "London je največje mesto v Združenem kraljestvu.", 14 | "Kje se skrivaš?", 15 | "Kdo je predsednik Francije?", 16 | "Katero je glavno mesto Združenih držav Amerike?", 17 | "Kdaj je bil rojen Milan Kučan?", 18 | ] 19 | -------------------------------------------------------------------------------- /spacy/lang/sq/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .stop_words import STOP_WORDS 3 | 4 | 5 | class AlbanianDefaults(BaseDefaults): 6 | stop_words = STOP_WORDS 7 | 8 | 9 | class Albanian(Language): 10 | lang = "sq" 11 | Defaults = AlbanianDefaults 12 | 13 | 14 | __all__ = ["Albanian"] 15 | -------------------------------------------------------------------------------- /spacy/lang/sq/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 
3 | 4 | >>> from spacy.lang.sq.examples import sentences 5 | >>> docs = nlp.pipe(sentences) 6 | """ 7 | 8 | 9 | sentences = [ 10 | "Apple po shqyrton blerjen e nje shoqërie të U.K. për 1 miliard dollarë", 11 | "Makinat autonome ndryshojnë përgjegjësinë e sigurimit ndaj prodhuesve", 12 | "San Francisko konsideron ndalimin e robotëve të shpërndarjes", 13 | "Londra është një qytet i madh në Mbretërinë e Bashkuar.", 14 | ] 15 | -------------------------------------------------------------------------------- /spacy/lang/sr/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .lex_attrs import LEX_ATTRS 3 | from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES 4 | from .stop_words import STOP_WORDS 5 | from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS 6 | 7 | 8 | class SerbianDefaults(BaseDefaults): 9 | tokenizer_exceptions = TOKENIZER_EXCEPTIONS 10 | infixes = TOKENIZER_INFIXES 11 | suffixes = TOKENIZER_SUFFIXES 12 | lex_attr_getters = LEX_ATTRS 13 | stop_words = STOP_WORDS 14 | 15 | 16 | class Serbian(Language): 17 | lang = "sr" 18 | Defaults = SerbianDefaults 19 | 20 | 21 | __all__ = ["Serbian"] 22 | -------------------------------------------------------------------------------- /spacy/lang/sr/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 3 | 4 | >>> from spacy.lang.sr.examples import sentences 5 | >>> docs = nlp.pipe(sentences) 6 | """ 7 | 8 | 9 | sentences = [ 10 | # Translations from English 11 | "Apple планира куповину америчког стартапа за $1 милијарду.", 12 | "Беспилотни аутомобили пребацују одговорност осигурања на произвођаче.", 13 | "Лондон је велики град у Уједињеном Краљевству.", 14 | "Где си ти?", 15 | "Ко је председник Француске?", 16 | # Serbian common and slang 17 | "Moj ћале је инжењер!", 18 | "Новак Ђоковић је најбољи тенисер света.", 19 | "У Пироту има добрих кафана!", 20 | "Музеј Николе Тесле се налази у Београду.", 21 | ] 22 | -------------------------------------------------------------------------------- /spacy/lang/sv/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 
3 | 4 | >>> from spacy.lang.sv.examples import sentences 5 | >>> docs = nlp.pipe(sentences) 6 | """ 7 | 8 | 9 | sentences = [ 10 | "Apple överväger att köpa brittisk startup för 1 miljard dollar.", 11 | "Självkörande bilar förskjuter försäkringsansvar mot tillverkare.", 12 | "San Fransisco överväger förbud mot leveransrobotar på trottoarer.", 13 | "London är en storstad i Storbritannien.", 14 | ] 15 | -------------------------------------------------------------------------------- /spacy/lang/ta/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .lex_attrs import LEX_ATTRS 3 | from .stop_words import STOP_WORDS 4 | 5 | 6 | class TamilDefaults(BaseDefaults): 7 | lex_attr_getters = LEX_ATTRS 8 | stop_words = STOP_WORDS 9 | 10 | 11 | class Tamil(Language): 12 | lang = "ta" 13 | Defaults = TamilDefaults 14 | 15 | 16 | __all__ = ["Tamil"] 17 | -------------------------------------------------------------------------------- /spacy/lang/te/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .lex_attrs import LEX_ATTRS 3 | from .stop_words import STOP_WORDS 4 | 5 | 6 | class TeluguDefaults(BaseDefaults): 7 | lex_attr_getters = LEX_ATTRS 8 | stop_words = STOP_WORDS 9 | 10 | 11 | class Telugu(Language): 12 | lang = "te" 13 | Defaults = TeluguDefaults 14 | 15 | 16 | __all__ = ["Telugu"] 17 | -------------------------------------------------------------------------------- /spacy/lang/te/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 3 | 4 | >>> from spacy.lang.te import Telugu 5 | >>> nlp = Telugu() 6 | >>> from spacy.lang.te.examples import sentences 7 | >>> docs = nlp.pipe(sentences) 8 | """ 9 | 10 | 11 | sentences = [ 12 | "ఆపిల్ 1 బిలియన్ డాలర్స్ కి యూ.కె. 
స్టార్ట్అప్ ని కొనాలని అనుకుంటుంది.", 13 | "ఆటోనోమోస్ కార్లు భీమా బాధ్యతను తయారీదారులపైకి మళ్లిస్తాయి.", 14 | "సాన్ ఫ్రాన్సిస్కో కాలిబాట డెలివరీ రోబోట్లను నిషేధించడానికి ఆలోచిస్తుంది.", 15 | "లండన్ యునైటెడ్ కింగ్డమ్ లో పెద్ద సిటీ.", 16 | "నువ్వు ఎక్కడ ఉన్నావ్?", 17 | "ఫ్రాన్స్ అధ్యక్షుడు ఎవరు?", 18 | "యునైటెడ్ స్టేట్స్ యొక్క రాజధాని ఏంటి?", 19 | "బరాక్ ఒబామా ఎప్పుడు జన్మించారు?", 20 | ] 21 | -------------------------------------------------------------------------------- /spacy/lang/te/stop_words.py: -------------------------------------------------------------------------------- 1 | # Source: https://github.com/Xangis/extra-stopwords (MIT License) 2 | 3 | STOP_WORDS = set( 4 | """ 5 | అందరూ 6 | అందుబాటులో 7 | అడగండి 8 | అడగడం 9 | అడ్డంగా 10 | అనుగుణంగా 11 | అనుమతించు 12 | అనుమతిస్తుంది 13 | అయితే 14 | ఇప్పటికే 15 | ఉన్నారు 16 | ఎక్కడైనా 17 | ఎప్పుడు 18 | ఎవరైనా 19 | ఎవరో ఒకరు 20 | ఏ 21 | ఏదైనా 22 | ఏమైనప్పటికి 23 | ఏమైనప్పటికి 24 | ఒక 25 | ఒక ప్రక్కన 26 | కనిపిస్తాయి 27 | కాదు 28 | కాదు 29 | కూడా 30 | గా 31 | గురించి 32 | చుట్టూ 33 | చేయగలిగింది 34 | తగిన 35 | తర్వాత 36 | తర్వాత 37 | దాదాపు 38 | దూరంగా 39 | నిజంగా 40 | పై 41 | ప్రకారం 42 | మధ్య 43 | మధ్య 44 | మరియు 45 | మరొక 46 | మళ్ళీ 47 | మాత్రమే 48 | మెచ్చుకో 49 | వద్ద 50 | వద్ద 51 | వెంట 52 | వేరుగా 53 | వ్యతిరేకంగా 54 | సంబంధం 55 | """.split() 56 | ) 57 | -------------------------------------------------------------------------------- /spacy/lang/ti/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 3 | 4 | >>> from spacy.lang.ti.examples import sentences 5 | >>> docs = nlp.pipe(sentences) 6 | """ 7 | 8 | 9 | sentences = [ 10 | "አፕል ብዩኬ ትርከብ ንግድ ብ1 ቢሊዮን ዶላር ንምግዛዕ ሐሲባ።", 11 | "ፈላማይ ክታበት ኮቪድ 19 ተጀሚሩ፤ሓዱሽ ተስፋ ሂቡ ኣሎ", 12 | "ቻንስለር ጀርመን ኣንገላ መርከል ዝርግሓ ቫይረስ ኮሮና ንምክልካል ጽኑዕ እገዳ ክግበር ጸዊዓ", 13 | "ለንደን ብዓዲ እንግሊዝ ትርከብ ዓባይ ከተማ እያ።", 14 | "ናበይ አለኻ፧", 15 | "ናይ ፈረንሳይ ፕሬዝዳንት መን እዩ፧", 16 | "ናይ አሜሪካ ዋና ከተማ እንታይ እያ፧", 17 | "ኦባማ መዓስ ተወሊዱ፧", 18 | ] 19 | -------------------------------------------------------------------------------- /spacy/lang/ti/punctuation.py: -------------------------------------------------------------------------------- 1 | from ..char_classes import ( 2 | ALPHA_UPPER, 3 | CURRENCY, 4 | LIST_ELLIPSES, 5 | LIST_PUNCT, 6 | LIST_QUOTES, 7 | UNITS, 8 | ) 9 | 10 | _list_punct = LIST_PUNCT + "፡ ። ፣ ፤ ፥ ፦ ፧ ፠ ፨".strip().split() 11 | 12 | _suffixes = ( 13 | _list_punct 14 | + LIST_ELLIPSES 15 | + LIST_QUOTES 16 | + [ 17 | r"(?<=[0-9])\+", 18 | # Tigrinya is written from Left-To-Right 19 | r"(?<=[0-9])(?:{c})".format(c=CURRENCY), 20 | r"(?<=[0-9])(?:{u})".format(u=UNITS), 21 | r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER), 22 | ] 23 | ) 24 | 25 | TOKENIZER_SUFFIXES = _suffixes 26 | -------------------------------------------------------------------------------- /spacy/lang/ti/tokenizer_exceptions.py: -------------------------------------------------------------------------------- 1 | from ...symbols import NORM, ORTH 2 | 3 | _exc = {} 4 | 5 | 6 | for exc_data in [ 7 | {ORTH: "ት/ቤት"}, 8 | {ORTH: "ወ/ሮ", NORM: "ወይዘሮ"}, 9 | {ORTH: "ወ/ሪ", NORM: "ወይዘሪት"}, 10 | ]: 11 | _exc[exc_data[ORTH]] = [exc_data] 12 | 13 | 14 | for orth in [ 15 | "ዓ.ም.", 16 | "ኪ.ሜ.", 17 | ]: 18 | _exc[orth] = [{ORTH: orth}] 19 | 20 | 21 | TOKENIZER_EXCEPTIONS = _exc 22 | -------------------------------------------------------------------------------- /spacy/lang/tl/__init__.py: -------------------------------------------------------------------------------- 1 | from 
...language import BaseDefaults, Language 2 | from .lex_attrs import LEX_ATTRS 3 | from .stop_words import STOP_WORDS 4 | from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS 5 | 6 | 7 | class TagalogDefaults(BaseDefaults): 8 | tokenizer_exceptions = TOKENIZER_EXCEPTIONS 9 | lex_attr_getters = LEX_ATTRS 10 | stop_words = STOP_WORDS 11 | 12 | 13 | class Tagalog(Language): 14 | lang = "tl" 15 | Defaults = TagalogDefaults 16 | 17 | 18 | __all__ = ["Tagalog"] 19 | -------------------------------------------------------------------------------- /spacy/lang/tn/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .lex_attrs import LEX_ATTRS 3 | from .punctuation import TOKENIZER_INFIXES 4 | from .stop_words import STOP_WORDS 5 | 6 | 7 | class SetswanaDefaults(BaseDefaults): 8 | infixes = TOKENIZER_INFIXES 9 | stop_words = STOP_WORDS 10 | lex_attr_getters = LEX_ATTRS 11 | 12 | 13 | class Setswana(Language): 14 | lang = "tn" 15 | Defaults = SetswanaDefaults 16 | 17 | 18 | __all__ = ["Setswana"] 19 | -------------------------------------------------------------------------------- /spacy/lang/tn/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 3 | >>> from spacy.lang.tn.examples import sentences 4 | >>> docs = nlp.pipe(sentences) 5 | """ 6 | 7 | 8 | sentences = [ 9 | "Apple e nyaka go reka JSE ka tlhwatlhwa ta R1 billion", 10 | "Johannesburg ke toropo e kgolo mo Afrika Borwa.", 11 | "O ko kae?", 12 | "ke mang presidente ya Afrika Borwa?", 13 | "ke eng toropo kgolo ya Afrika Borwa?", 14 | "Nelson Mandela o belegwe leng?", 15 | ] 16 | -------------------------------------------------------------------------------- /spacy/lang/tn/punctuation.py: -------------------------------------------------------------------------------- 1 | from ..char_classes import ( 2 | ALPHA, 3 | ALPHA_LOWER, 4 | ALPHA_UPPER, 5 | CONCAT_QUOTES, 6 | HYPHENS, 7 | LIST_ELLIPSES, 8 | LIST_ICONS, 9 | ) 10 | 11 | _infixes = ( 12 | LIST_ELLIPSES 13 | + LIST_ICONS 14 | + [ 15 | r"(?<=[0-9])[+\-\*^](?=[0-9-])", 16 | r"(?<=[{al}{q}])\.(?=[{au}{q}])".format( 17 | al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES 18 | ), 19 | r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA), 20 | r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS), 21 | r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA), 22 | ] 23 | ) 24 | 25 | 26 | TOKENIZER_INFIXES = _infixes 27 | -------------------------------------------------------------------------------- /spacy/lang/tr/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .lex_attrs import LEX_ATTRS 3 | from .stop_words import STOP_WORDS 4 | from .syntax_iterators import SYNTAX_ITERATORS 5 | from .tokenizer_exceptions import TOKEN_MATCH, TOKENIZER_EXCEPTIONS 6 | 7 | 8 | class TurkishDefaults(BaseDefaults): 9 | tokenizer_exceptions = TOKENIZER_EXCEPTIONS 10 | lex_attr_getters = LEX_ATTRS 11 | stop_words = STOP_WORDS 12 | token_match = TOKEN_MATCH 13 | syntax_iterators = SYNTAX_ITERATORS 14 | 15 | 16 | class Turkish(Language): 17 | lang = "tr" 18 | Defaults = TurkishDefaults 19 | 20 | 21 | __all__ = ["Turkish"] 22 | -------------------------------------------------------------------------------- /spacy/lang/tt/__init__.py: -------------------------------------------------------------------------------- 1 | from 
...language import BaseDefaults, Language 2 | from .lex_attrs import LEX_ATTRS 3 | from .punctuation import TOKENIZER_INFIXES 4 | from .stop_words import STOP_WORDS 5 | from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS 6 | 7 | 8 | class TatarDefaults(BaseDefaults): 9 | tokenizer_exceptions = TOKENIZER_EXCEPTIONS 10 | infixes = TOKENIZER_INFIXES 11 | lex_attr_getters = LEX_ATTRS 12 | stop_words = STOP_WORDS 13 | 14 | 15 | class Tatar(Language): 16 | lang = "tt" 17 | Defaults = TatarDefaults 18 | 19 | 20 | __all__ = ["Tatar"] 21 | -------------------------------------------------------------------------------- /spacy/lang/tt/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 3 | >>> from spacy.lang.tt.examples import sentences 4 | >>> docs = nlp.pipe(sentences) 5 | """ 6 | 7 | sentences = [ 8 | "Apple Бөекбритания стартабын $1 миллиард өчен сатып алыун исәпли.", 9 | "Автоном автомобильләр иминият җаваплылыкны җитештерүчеләргә күчерә.", 10 | "Сан-Франциско тротуар буенча йөри торган робот-курьерларны тыю мөмкинлеген карый.", 11 | "Лондон - Бөекбританиядә урнашкан зур шәһәр.", 12 | "Син кайда?", 13 | "Францияда кем президент?", 14 | "Америка Кушма Штатларының башкаласы нинди шәһәр?", 15 | "Барак Обама кайчан туган?", 16 | ] 17 | -------------------------------------------------------------------------------- /spacy/lang/ur/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .lex_attrs import LEX_ATTRS 3 | from .punctuation import TOKENIZER_SUFFIXES 4 | from .stop_words import STOP_WORDS 5 | 6 | 7 | class UrduDefaults(BaseDefaults): 8 | suffixes = TOKENIZER_SUFFIXES 9 | lex_attr_getters = LEX_ATTRS 10 | stop_words = STOP_WORDS 11 | writing_system = {"direction": "rtl", "has_case": False, "has_letters": True} 12 | 13 | 14 | class Urdu(Language): 15 | lang = "ur" 16 | Defaults = UrduDefaults 17 | 18 | 19 | __all__ = ["Urdu"] 20 | -------------------------------------------------------------------------------- /spacy/lang/ur/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 3 | 4 | >>> from spacy.lang.da.examples import sentences 5 | >>> docs = nlp.pipe(sentences) 6 | """ 7 | 8 | 9 | sentences = [ 10 | "اردو ہے جس کا نام ہم جانتے ہیں داغ", 11 | "سارے جہاں میں دھوم ہماری زباں کی ہے", 12 | ] 13 | -------------------------------------------------------------------------------- /spacy/lang/ur/punctuation.py: -------------------------------------------------------------------------------- 1 | from ..punctuation import TOKENIZER_SUFFIXES 2 | 3 | _suffixes = TOKENIZER_SUFFIXES 4 | -------------------------------------------------------------------------------- /spacy/lang/vi/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 
3 | >>> from spacy.lang.vi.examples import sentences 4 | >>> docs = nlp.pipe(sentences) 5 | """ 6 | 7 | 8 | sentences = [ 9 | "Đây là đâu, tôi là ai?", 10 | "Căn phòng có nhiều cửa sổ nên nó khá sáng", 11 | "Đại dịch COVID vừa qua đã gây ảnh hưởng rất lớn tới nhiều doanh nghiệp lớn nhỏ.", 12 | "Thành phố Hồ Chí Minh đã bị ảnh hưởng nặng nề trong thời gian vừa qua.", 13 | "Ông bạn đang ở đâu thế?", 14 | "Ai là người giải phóng đất nước Việt Nam khỏi ách đô hộ?", 15 | "Vị tướng nào là người đã làm nên chiến thắng lịch sử Điện Biên Phủ?", 16 | "Làm việc nhiều chán quá, đi chơi đâu đi?", 17 | ] 18 | -------------------------------------------------------------------------------- /spacy/lang/xx/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import Language 2 | 3 | 4 | class MultiLanguage(Language): 5 | """Language class to be used for models that support multiple languages. 6 | This module allows models to specify their language ID as 'xx'. 7 | """ 8 | 9 | lang = "xx" 10 | 11 | 12 | __all__ = ["MultiLanguage"] 13 | -------------------------------------------------------------------------------- /spacy/lang/yo/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .lex_attrs import LEX_ATTRS 3 | from .stop_words import STOP_WORDS 4 | 5 | 6 | class YorubaDefaults(BaseDefaults): 7 | lex_attr_getters = LEX_ATTRS 8 | stop_words = STOP_WORDS 9 | 10 | 11 | class Yoruba(Language): 12 | lang = "yo" 13 | Defaults = YorubaDefaults 14 | 15 | 16 | __all__ = ["Yoruba"] 17 | -------------------------------------------------------------------------------- /spacy/lang/yo/stop_words.py: -------------------------------------------------------------------------------- 1 | # stop words as whitespace-separated list. 2 | # Source: https://raw.githubusercontent.com/dohliam/more-stoplists/master/yo/yo.txt 3 | 4 | STOP_WORDS = set( 5 | "a an b bá bí bẹ̀rẹ̀ d e f fún fẹ́ g gbogbo i inú j jù jẹ jẹ́ k kan kì kí kò " 6 | "l láti lè lọ m mi mo máa mọ̀ n ni náà ní nígbà nítorí nǹkan o p padà pé " 7 | "púpọ̀ pẹ̀lú r rẹ̀ s sì sí sínú t ti tí u w wà wá wọn wọ́n y yìí à àti àwọn á " 8 | "è é ì í ò òun ó ù ú ń ńlá ǹ ̀ ́ ̣ ṣ ṣe ṣé ṣùgbọ́n ẹ ẹmọ́ ọ ọjọ́ ọ̀pọ̀lọpọ̀".split() 9 | ) 10 | -------------------------------------------------------------------------------- /spacy/lang/zh/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 
3 | 4 | >>> from spacy.lang.zh.examples import sentences 5 | >>> docs = nlp.pipe(sentences) 6 | """ 7 | 8 | # from https://zh.wikipedia.org/wiki/汉语 9 | sentences = [ 10 | "作为语言而言,为世界使用人数最多的语言,目前世界有五分之一人口做为母语。", 11 | "汉语有多种分支,当中官话最为流行,为中华人民共和国的国家通用语言(又称为普通话)、以及中华民国的国语。", 12 | "此外,中文还是联合国正式语文,并被上海合作组织等国际组织采用为官方语言。", 13 | "在中国大陆,汉语通称为“汉语”。", 14 | "在联合国、台湾、香港及澳门,通称为“中文”。", 15 | "在新加坡及马来西亚,通称为“华语”。", 16 | ] 17 | -------------------------------------------------------------------------------- /spacy/matcher/__init__.py: -------------------------------------------------------------------------------- 1 | from .dependencymatcher import DependencyMatcher 2 | from .levenshtein import levenshtein 3 | from .matcher import Matcher 4 | from .phrasematcher import PhraseMatcher 5 | 6 | __all__ = ["DependencyMatcher", "Matcher", "PhraseMatcher", "levenshtein"] 7 | -------------------------------------------------------------------------------- /spacy/matcher/phrasematcher.pxd: -------------------------------------------------------------------------------- 1 | from cymem.cymem cimport Pool 2 | from libcpp.vector cimport vector 3 | from preshed.maps cimport MapStruct, key_t 4 | 5 | from ..attrs cimport attr_id_t 6 | from ..structs cimport SpanC 7 | from ..tokens.doc cimport Doc 8 | from ..vocab cimport Vocab 9 | 10 | 11 | cdef class PhraseMatcher: 12 | cdef readonly Vocab vocab 13 | cdef attr_id_t attr 14 | cdef object _callbacks 15 | cdef object _docs 16 | cdef bint _validate 17 | cdef MapStruct* c_map 18 | cdef Pool mem 19 | cdef key_t _terminal_hash 20 | 21 | cdef void find_matches(self, Doc doc, int start_idx, int end_idx, vector[SpanC] *matches) nogil 22 | -------------------------------------------------------------------------------- /spacy/ml/__init__.py: -------------------------------------------------------------------------------- 1 | from .callbacks import create_models_with_nvtx_range # noqa: F401 2 | from .models import * # noqa: F401, F403 3 | -------------------------------------------------------------------------------- /spacy/ml/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .entity_linker import * # noqa 2 | from .multi_task import * # noqa 3 | from .parser import * # noqa 4 | from .span_finder import * # noqa 5 | from .spancat import * # noqa 6 | from .tagger import * # noqa 7 | from .textcat import * # noqa 8 | from .tok2vec import * # noqa 9 | -------------------------------------------------------------------------------- /spacy/parts_of_speech.pxd: -------------------------------------------------------------------------------- 1 | from . 
cimport symbols 2 | 3 | 4 | cpdef enum univ_pos_t: 5 | NO_TAG = 0 6 | ADJ = symbols.ADJ 7 | ADP 8 | ADV 9 | AUX 10 | CONJ 11 | CCONJ # U20 12 | DET 13 | INTJ 14 | NOUN 15 | NUM 16 | PART 17 | PRON 18 | PROPN 19 | PUNCT 20 | SCONJ 21 | SYM 22 | VERB 23 | X 24 | EOL 25 | SPACE 26 | -------------------------------------------------------------------------------- /spacy/parts_of_speech.pyx: -------------------------------------------------------------------------------- 1 | # cython: profile=False 2 | IDS = { 3 | "": NO_TAG, 4 | "ADJ": ADJ, 5 | "ADP": ADP, 6 | "ADV": ADV, 7 | "AUX": AUX, 8 | "CONJ": CONJ, # U20 9 | "CCONJ": CCONJ, 10 | "DET": DET, 11 | "INTJ": INTJ, 12 | "NOUN": NOUN, 13 | "NUM": NUM, 14 | "PART": PART, 15 | "PRON": PRON, 16 | "PROPN": PROPN, 17 | "PUNCT": PUNCT, 18 | "SCONJ": SCONJ, 19 | "SYM": SYM, 20 | "VERB": VERB, 21 | "X": X, 22 | "EOL": EOL, 23 | "SPACE": SPACE 24 | } 25 | 26 | 27 | NAMES = {value: key for key, value in IDS.items()} 28 | 29 | # As of Cython 3.1, the global Python namespace no longer has the enum 30 | # contents by default. 31 | globals().update(IDS) 32 | 33 | -------------------------------------------------------------------------------- /spacy/pipeline/_edit_tree_internals/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/pipeline/_edit_tree_internals/__init__.py -------------------------------------------------------------------------------- /spacy/pipeline/_parser_internals/__init__.pxd: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/pipeline/_parser_internals/__init__.pxd -------------------------------------------------------------------------------- /spacy/pipeline/_parser_internals/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/pipeline/_parser_internals/__init__.py -------------------------------------------------------------------------------- /spacy/pipeline/_parser_internals/_beam_utils.pxd: -------------------------------------------------------------------------------- 1 | from ...typedefs cimport class_t, hash_t 2 | 3 | 4 | # These are passed as callbacks to thinc.search.Beam 5 | cdef int transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1 6 | 7 | cdef int check_final_state(void* _state, void* extra_args) except -1 8 | -------------------------------------------------------------------------------- /spacy/pipeline/_parser_internals/_state.pyx: -------------------------------------------------------------------------------- 1 | # cython: profile=False 2 | -------------------------------------------------------------------------------- /spacy/pipeline/_parser_internals/arc_eager.pxd: -------------------------------------------------------------------------------- 1 | from ...typedefs cimport attr_t, weight_t 2 | from ._state cimport StateC 3 | from .transition_system cimport Transition, TransitionSystem 4 | 5 | 6 | cdef class ArcEager(TransitionSystem): 7 | cdef get_arcs(self, StateC* state) 8 | -------------------------------------------------------------------------------- /spacy/pipeline/_parser_internals/ner.pxd: -------------------------------------------------------------------------------- 1 | from 
.transition_system cimport TransitionSystem 2 | 3 | 4 | cdef class BiluoPushDown(TransitionSystem): 5 | pass 6 | -------------------------------------------------------------------------------- /spacy/pipeline/_parser_internals/nonproj.hh: -------------------------------------------------------------------------------- 1 | #ifndef NONPROJ_HH 2 | #define NONPROJ_HH 3 | 4 | #include 5 | #include 6 | 7 | void raise_domain_error(std::string const &msg) { 8 | throw std::domain_error(msg); 9 | } 10 | 11 | #endif // NONPROJ_HH 12 | -------------------------------------------------------------------------------- /spacy/pipeline/_parser_internals/nonproj.pxd: -------------------------------------------------------------------------------- 1 | from libcpp.string cimport string 2 | 3 | 4 | cdef extern from "nonproj.hh": 5 | cdef void raise_domain_error(const string& msg) nogil except + 6 | -------------------------------------------------------------------------------- /spacy/pipeline/legacy/__init__.py: -------------------------------------------------------------------------------- 1 | from .entity_linker import EntityLinker_v1 2 | 3 | __all__ = ["EntityLinker_v1"] 4 | -------------------------------------------------------------------------------- /spacy/pipeline/pipe.pxd: -------------------------------------------------------------------------------- 1 | cdef class Pipe: 2 | cdef public str name 3 | -------------------------------------------------------------------------------- /spacy/pipeline/trainable_pipe.pxd: -------------------------------------------------------------------------------- 1 | from ..vocab cimport Vocab 2 | from .pipe cimport Pipe 3 | 4 | 5 | cdef class TrainablePipe(Pipe): 6 | cdef public Vocab vocab 7 | cdef public object model 8 | cdef public object cfg 9 | cdef public object scorer 10 | -------------------------------------------------------------------------------- /spacy/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/py.typed -------------------------------------------------------------------------------- /spacy/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/__init__.py -------------------------------------------------------------------------------- /spacy/tests/doc/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/doc/__init__.py -------------------------------------------------------------------------------- /spacy/tests/enable_gpu.py: -------------------------------------------------------------------------------- 1 | from spacy import require_gpu 2 | 3 | require_gpu() 4 | -------------------------------------------------------------------------------- /spacy/tests/lang/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/af/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/af/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/am/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/am/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/am/test_exception.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/am/test_exception.py -------------------------------------------------------------------------------- /spacy/tests/lang/ar/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/ar/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/ar/test_exceptions.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.mark.parametrize("text", ["ق.م", "إلخ", "ص.ب", "ت."]) 5 | def test_ar_tokenizer_handles_abbr(ar_tokenizer, text): 6 | tokens = ar_tokenizer(text) 7 | assert len(tokens) == 1 8 | 9 | 10 | def test_ar_tokenizer_handles_exc_in_text(ar_tokenizer): 11 | text = "تعود الكتابة الهيروغليفية إلى سنة 3200 ق.م" 12 | tokens = ar_tokenizer(text) 13 | assert len(tokens) == 7 14 | assert tokens[6].text == "ق.م" 15 | 16 | 17 | def test_ar_tokenizer_handles_exc_in_text_2(ar_tokenizer): 18 | text = "يبلغ طول مضيق طارق 14كم " 19 | tokens = ar_tokenizer(text) 20 | assert len(tokens) == 6 21 | -------------------------------------------------------------------------------- /spacy/tests/lang/ar/test_text.py: -------------------------------------------------------------------------------- 1 | def test_ar_tokenizer_handles_long_text(ar_tokenizer): 2 | text = """نجيب محفوظ مؤلف و كاتب روائي عربي، يعد من أهم الأدباء العرب خلال القرن العشرين. 3 | ولد نجيب محفوظ في مدينة القاهرة، حيث ترعرع و تلقى تعليمه الجامعي في جامعتها، 4 | فتمكن من نيل شهادة في الفلسفة. ألف محفوظ على مدار حياته الكثير من الأعمال الأدبية، و في مقدمتها ثلاثيته الشهيرة. 5 | و قد نجح في الحصول على جائزة نوبل للآداب، ليكون بذلك العربي الوحيد الذي فاز بها.""" 6 | 7 | tokens = ar_tokenizer(text) 8 | assert tokens[3].is_stop is True 9 | assert len(tokens) == 77 10 | -------------------------------------------------------------------------------- /spacy/tests/lang/bg/test_tokenizer.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | def test_bg_tokenizer_handles_final_diacritics(bg_tokenizer): 5 | text = "Ня̀маше яйца̀. Ня̀маше яйца̀." 6 | tokens = bg_tokenizer(text) 7 | assert tokens[1].text == "яйца̀" 8 | assert tokens[2].text == "." 
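# --- Illustrative sketch (added for clarity; not part of the original test file) ---
# The bg_tokenizer fixture used above is provided by the test suite's conftest.py.
# An equivalent standalone check can be written with spacy.blank, a documented
# spaCy entry point; the sentence is taken from the test above.
def _demo_bg_final_diacritics():
    import spacy

    nlp = spacy.blank("bg")
    doc = nlp("Ня̀маше яйца̀.")
    # The trailing combining diacritic stays on the token; the period is split off.
    return [t.text for t in doc] == ["Ня̀маше", "яйца̀", "."]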
9 | -------------------------------------------------------------------------------- /spacy/tests/lang/bn/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/bn/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/bo/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/bo/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/bo/test_text.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.mark.parametrize( 5 | "text,match", 6 | [ 7 | ("10", True), 8 | ("1", True), 9 | ("999.0", True), 10 | ("གཅིག་", True), 11 | ("གཉིས་", True), 12 | ("ཀླད་ཀོར་", True), 13 | ("བཅུ་གཅིག་", True), 14 | ("ཁྱི་", False), 15 | (",", False), 16 | ], 17 | ) 18 | def test_lex_attrs_like_number(bo_tokenizer, text, match): 19 | tokens = bo_tokenizer(text) 20 | assert len(tokens) == 1 21 | assert tokens[0].like_num == match 22 | -------------------------------------------------------------------------------- /spacy/tests/lang/ca/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/ca/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/ca/test_exception.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.mark.parametrize( 5 | "text,lemma", 6 | [("aprox.", "aproximadament"), ("pàg.", "pàgina"), ("p.ex.", "per exemple")], 7 | ) 8 | def test_ca_tokenizer_handles_abbr(ca_tokenizer, text, lemma): 9 | tokens = ca_tokenizer(text) 10 | assert len(tokens) == 1 11 | 12 | 13 | def test_ca_tokenizer_handles_exc_in_text(ca_tokenizer): 14 | text = "La Dra. Puig viu a la pl. dels Til·lers." 
15 | doc = ca_tokenizer(text) 16 | assert [t.text for t in doc] == [ 17 | "La", 18 | "Dra.", 19 | "Puig", 20 | "viu", 21 | "a", 22 | "la", 23 | "pl.", 24 | "d", 25 | "els", 26 | "Til·lers", 27 | ".", 28 | ] 29 | -------------------------------------------------------------------------------- /spacy/tests/lang/ca/test_prefix_suffix_infix.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.mark.parametrize( 5 | "text,expected_tokens", 6 | [ 7 | ("d'un", ["d'", "un"]), 8 | ("s'ha", ["s'", "ha"]), 9 | ("del", ["d", "el"]), 10 | ("cantar-te", ["cantar", "-te"]), 11 | ("-hola", ["-", "hola"]), 12 | ], 13 | ) 14 | def test_contractions(ca_tokenizer, text, expected_tokens): 15 | """Test that the contractions are split into two tokens""" 16 | tokens = ca_tokenizer(text) 17 | assert len(tokens) == 2 18 | assert [t.text for t in tokens] == expected_tokens 19 | -------------------------------------------------------------------------------- /spacy/tests/lang/cs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/cs/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/cs/test_text.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.mark.parametrize( 5 | "text,match", 6 | [ 7 | ("10", True), 8 | ("1", True), 9 | ("10.000", True), 10 | ("1000", True), 11 | ("999,0", True), 12 | ("devatenáct", True), 13 | ("osmdesát", True), 14 | ("kvadrilion", True), 15 | ("Pes", False), 16 | (",", False), 17 | ("1/2", True), 18 | ], 19 | ) 20 | def test_lex_attrs_like_number(cs_tokenizer, text, match): 21 | tokens = cs_tokenizer(text) 22 | assert len(tokens) == 1 23 | assert tokens[0].like_num == match 24 | -------------------------------------------------------------------------------- /spacy/tests/lang/da/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/da/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/de/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/de/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/de/test_exceptions.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.mark.parametrize("text", ["auf'm", "du's", "über'm", "wir's"]) 5 | def test_de_tokenizer_splits_contractions(de_tokenizer, text): 6 | tokens = de_tokenizer(text) 7 | assert len(tokens) == 2 8 | 9 | 10 | @pytest.mark.parametrize("text", ["z.B.", "d.h.", "Jan.", "Dez.", "Chr."]) 11 | def test_de_tokenizer_handles_abbr(de_tokenizer, text): 12 | tokens = de_tokenizer(text) 13 | assert len(tokens) == 1 14 | 15 | 16 | def test_de_tokenizer_handles_exc_in_text(de_tokenizer): 17 | text = "Ich bin z.Zt. im Urlaub." 18 | tokens = de_tokenizer(text) 19 | assert len(tokens) == 6 20 | assert tokens[2].text == "z.Zt." 
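# --- Illustrative sketch (added for clarity; not part of the original test file) ---
# Abbreviations such as "z.Zt." stay whole because they are registered as tokenizer
# exceptions (see the lang/*/tokenizer_exceptions.py modules shown earlier). The same
# effect can be achieved at runtime through the documented Tokenizer.add_special_case
# API; the abbreviation below is only an example and may already be covered by the
# built-in German exceptions.
def _demo_add_special_case():
    import spacy
    from spacy.symbols import ORTH

    nlp = spacy.blank("de")
    nlp.tokenizer.add_special_case("u.Ä.", [{ORTH: "u.Ä."}])
    doc = nlp("Obst, Gemüse u.Ä. sind gesund.")
    return [t.text for t in doc]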
21 | -------------------------------------------------------------------------------- /spacy/tests/lang/de/test_noun_chunks.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | def test_noun_chunks_is_parsed_de(de_tokenizer): 5 | """Test that noun_chunks raises Value Error for 'de' language if Doc is not parsed.""" 6 | doc = de_tokenizer("Er lag auf seinem") 7 | with pytest.raises(ValueError): 8 | list(doc.noun_chunks) 9 | -------------------------------------------------------------------------------- /spacy/tests/lang/dsb/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/dsb/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/dsb/test_text.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.mark.parametrize( 5 | "text,match", 6 | [ 7 | ("10", True), 8 | ("1", True), 9 | ("10,000", True), 10 | ("10,00", True), 11 | ("jadno", True), 12 | ("dwanassćo", True), 13 | ("milion", True), 14 | ("sto", True), 15 | ("ceła", False), 16 | ("kopica", False), 17 | ("narěcow", False), 18 | (",", False), 19 | ("1/2", True), 20 | ], 21 | ) 22 | def test_lex_attrs_like_number(dsb_tokenizer, text, match): 23 | tokens = dsb_tokenizer(text) 24 | assert len(tokens) == 1 25 | assert tokens[0].like_num == match 26 | -------------------------------------------------------------------------------- /spacy/tests/lang/el/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/el/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/el/test_exception.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.mark.parametrize("text", ["αριθ.", "τρισ.", "δισ.", "σελ."]) 5 | def test_el_tokenizer_handles_abbr(el_tokenizer, text): 6 | tokens = el_tokenizer(text) 7 | assert len(tokens) == 1 8 | 9 | 10 | def test_el_tokenizer_handles_exc_in_text(el_tokenizer): 11 | text = "Στα 14 τρισ. δολάρια το κόστος από την άνοδο της στάθμης της θάλασσας." 12 | tokens = el_tokenizer(text) 13 | assert len(tokens) == 14 14 | assert tokens[2].text == "τρισ." 
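# --- Illustrative sketch (added for clarity; not part of the original test file) ---
# The lang/*/examples.py modules shown earlier all follow the same pattern: a
# module-level `sentences` list plus a docstring that assumes an `nlp` object already
# exists. A complete, minimal version of that usage could look like this; Greek is
# chosen only because it matches the tests in this file.
def _demo_examples_pipe():
    from spacy.lang.el import Greek
    from spacy.lang.el.examples import sentences

    nlp = Greek()  # blank pipeline: tokenizer only, no trained components
    docs = list(nlp.pipe(sentences))
    return [len(doc) for doc in docs]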
15 | -------------------------------------------------------------------------------- /spacy/tests/lang/el/test_noun_chunks.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | def test_noun_chunks_is_parsed_el(el_tokenizer): 5 | """Test that noun_chunks raises Value Error for 'el' language if Doc is not parsed.""" 6 | doc = el_tokenizer("είναι χώρα της νοτιοανατολικής") 7 | with pytest.raises(ValueError): 8 | list(doc.noun_chunks) 9 | -------------------------------------------------------------------------------- /spacy/tests/lang/en/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/en/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/es/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/es/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/es/test_exception.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.mark.parametrize( 5 | "text,lemma", 6 | [ 7 | ("aprox.", "aproximadamente"), 8 | ("esq.", "esquina"), 9 | ("pág.", "página"), 10 | ("p.ej.", "por ejemplo"), 11 | ], 12 | ) 13 | def test_es_tokenizer_handles_abbr(es_tokenizer, text, lemma): 14 | tokens = es_tokenizer(text) 15 | assert len(tokens) == 1 16 | 17 | 18 | def test_es_tokenizer_handles_exc_in_text(es_tokenizer): 19 | text = "Mariano Rajoy ha corrido aprox. medio kilómetro" 20 | tokens = es_tokenizer(text) 21 | assert len(tokens) == 7 22 | assert tokens[4].text == "aprox." 
23 | -------------------------------------------------------------------------------- /spacy/tests/lang/et/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/et/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/eu/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/eu/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/eu/test_text.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | def test_eu_tokenizer_handles_long_text(eu_tokenizer): 5 | text = """ta nere guitarra estrenatu ondoren""" 6 | tokens = eu_tokenizer(text) 7 | assert len(tokens) == 5 8 | 9 | 10 | @pytest.mark.parametrize( 11 | "text,length", 12 | [ 13 | ("milesker ederra joan zen hitzaldia plazer hutsa", 7), 14 | ("astelehen guztia sofan pasau biot", 5), 15 | ], 16 | ) 17 | def test_eu_tokenizer_handles_cnts(eu_tokenizer, text, length): 18 | tokens = eu_tokenizer(text) 19 | assert len(tokens) == length 20 | -------------------------------------------------------------------------------- /spacy/tests/lang/fa/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/fa/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/fa/test_noun_chunks.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | def test_noun_chunks_is_parsed_fa(fa_tokenizer): 5 | """Test that noun_chunks raises Value Error for 'fa' language if Doc is not parsed.""" 6 | 7 | doc = fa_tokenizer("این یک جمله نمونه می باشد.") 8 | with pytest.raises(ValueError): 9 | list(doc.noun_chunks) 10 | -------------------------------------------------------------------------------- /spacy/tests/lang/fi/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/fi/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/fi/test_text.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.mark.parametrize( 5 | "text,match", 6 | [ 7 | ("10", True), 8 | ("1", True), 9 | ("10000", True), 10 | ("10,00", True), 11 | ("-999,0", True), 12 | ("yksi", True), 13 | ("kolmetoista", True), 14 | ("viisikymmentä", True), 15 | ("tuhat", True), 16 | ("1/2", True), 17 | ("hevonen", False), 18 | (",", False), 19 | ], 20 | ) 21 | def test_fi_lex_attrs_like_number(fi_tokenizer, text, match): 22 | tokens = fi_tokenizer(text) 23 | assert len(tokens) == 1 24 | assert tokens[0].like_num == match 25 | -------------------------------------------------------------------------------- /spacy/tests/lang/fo/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/fo/__init__.py 
-------------------------------------------------------------------------------- /spacy/tests/lang/fr/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/fr/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/ga/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/ga/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/ga/test_tokenizer.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | # fmt: off 4 | GA_TOKEN_EXCEPTION_TESTS = [ 5 | ("Niall Ó Domhnaill, Rialtas na hÉireann 1977 (lch. 600).", ["Niall", "Ó", "Domhnaill", ",", "Rialtas", "na", "hÉireann", "1977", "(", "lch.", "600", ")", "."]), 6 | ("Daoine a bhfuil Gaeilge acu, m.sh. tusa agus mise", ["Daoine", "a", "bhfuil", "Gaeilge", "acu", ",", "m.sh.", "tusa", "agus", "mise"]) 7 | ] 8 | # fmt: on 9 | 10 | 11 | @pytest.mark.parametrize("text,expected_tokens", GA_TOKEN_EXCEPTION_TESTS) 12 | def test_ga_tokenizer_handles_exception_cases(ga_tokenizer, text, expected_tokens): 13 | tokens = ga_tokenizer(text) 14 | token_list = [token.text for token in tokens if not token.is_space] 15 | assert expected_tokens == token_list 16 | -------------------------------------------------------------------------------- /spacy/tests/lang/grc/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/grc/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/grc/test_text.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.mark.parametrize( 5 | "text,match", 6 | [ 7 | ("ι", True), 8 | ("α", True), 9 | ("ϟα", True), 10 | ("ἑκατόν", True), 11 | ("ἐνακόσια", True), 12 | ("δισχίλια", True), 13 | ("μύρια", True), 14 | ("εἷς", True), 15 | ("λόγος", False), 16 | (",", False), 17 | ("λβ", True), 18 | ], 19 | ) 20 | def test_lex_attrs_like_number(grc_tokenizer, text, match): 21 | tokens = grc_tokenizer(text) 22 | assert len(tokens) == 1 23 | assert tokens[0].like_num == match 24 | -------------------------------------------------------------------------------- /spacy/tests/lang/gu/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/gu/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/gu/test_text.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | def test_gu_tokenizer_handlers_long_text(gu_tokenizer): 5 | text = """પશ્ચિમ ભારતમાં આવેલું ગુજરાત રાજ્ય જે વ્યક્તિઓની માતૃભૂમિ છે""" 6 | tokens = gu_tokenizer(text) 7 | assert len(tokens) == 9 8 | 9 | 10 | @pytest.mark.parametrize( 11 | "text,length", 12 | [("ગુજરાતીઓ ખાવાના શોખીન માનવામાં આવે છે", 6), ("ખેતરની ખેડ કરવામાં આવે છે.", 5)], 13 | ) 14 | def test_gu_tokenizer_handles_cnts(gu_tokenizer, text, length): 15 | 
tokens = gu_tokenizer(text) 16 | assert len(tokens) == length 17 | -------------------------------------------------------------------------------- /spacy/tests/lang/he/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/he/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/hi/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/hi/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/hi/test_text.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from spacy.lang.hi import Hindi 4 | 5 | 6 | @pytest.mark.issue(3625) 7 | def test_issue3625(): 8 | """Test that default punctuation rules apply to Hindi Unicode characters""" 9 | nlp = Hindi() 10 | doc = nlp("hi. how हुए. होटल, होटल") 11 | expected = ["hi", ".", "how", "हुए", ".", "होटल", ",", "होटल"] 12 | assert [token.text for token in doc] == expected 13 | -------------------------------------------------------------------------------- /spacy/tests/lang/hr/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/hr/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/hsb/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/hsb/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/hsb/test_text.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.mark.parametrize( 5 | "text,match", 6 | [ 7 | ("10", True), 8 | ("1", True), 9 | ("10,000", True), 10 | ("10,00", True), 11 | ("jedne", True), 12 | ("dwanaće", True), 13 | ("milion", True), 14 | ("sto", True), 15 | ("załožene", False), 16 | ("wona", False), 17 | ("powšitkownej", False), 18 | (",", False), 19 | ("1/2", True), 20 | ], 21 | ) 22 | def test_lex_attrs_like_number(hsb_tokenizer, text, match): 23 | tokens = hsb_tokenizer(text) 24 | assert len(tokens) == 1 25 | assert tokens[0].like_num == match 26 | -------------------------------------------------------------------------------- /spacy/tests/lang/ht/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/ht/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/hu/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/hu/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/hy/__init__.py: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/hy/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/hy/test_text.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from spacy.lang.hy.lex_attrs import like_num 4 | 5 | 6 | @pytest.mark.parametrize("word", ["հիսուն"]) 7 | def test_hy_lex_attrs_capitals(word): 8 | assert like_num(word) 9 | assert like_num(word.upper()) 10 | -------------------------------------------------------------------------------- /spacy/tests/lang/id/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/id/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/id/test_noun_chunks.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | def test_noun_chunks_is_parsed_id(id_tokenizer): 5 | """Test that noun_chunks raises ValueError for 'id' language if Doc is not parsed.""" 6 | doc = id_tokenizer("sebelas") 7 | with pytest.raises(ValueError): 8 | list(doc.noun_chunks) 9 | -------------------------------------------------------------------------------- /spacy/tests/lang/id/test_text.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from spacy.lang.id.lex_attrs import like_num 4 | 5 | 6 | @pytest.mark.parametrize("word", ["sebelas"]) 7 | def test_id_lex_attrs_capitals(word): 8 | assert like_num(word) 9 | assert like_num(word.upper()) 10 | -------------------------------------------------------------------------------- /spacy/tests/lang/is/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/is/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/it/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/it/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/it/test_prefix_suffix_infix.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.mark.parametrize( 5 | "text,expected_tokens", [("c'è", ["c'", "è"]), ("l'ha", ["l'", "ha"])] 6 | ) 7 | def test_contractions(it_tokenizer, text, expected_tokens): 8 | """Test that the contractions are split into two tokens""" 9 | tokens = it_tokenizer(text) 10 | assert len(tokens) == 2 11 | assert [t.text for t in tokens] == expected_tokens 12 | -------------------------------------------------------------------------------- /spacy/tests/lang/it/test_stopwords.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.mark.parametrize( 5 | "word", ["un", "lo", "dell", "dall", "si", "ti", "mi", "quest", "quel", "quello"] 6 | ) 7 | def test_stopwords_basic(it_tokenizer, word): 8 | tok = it_tokenizer(word)[0] 9 | assert tok.is_stop 10 | 11 | 12 | @pytest.mark.parametrize( 13 | "word", ["quest'uomo", "l'ho",
"un'amica", "dell'olio", "s'arrende", "m'ascolti"] 14 | ) 15 | def test_stopwords_elided(it_tokenizer, word): 16 | tok = it_tokenizer(word)[0] 17 | assert tok.is_stop 18 | -------------------------------------------------------------------------------- /spacy/tests/lang/it/test_text.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.mark.issue(2822) 5 | def test_issue2822(it_tokenizer): 6 | """Test that the abbreviation of poco is kept as one word.""" 7 | doc = it_tokenizer("Vuoi un po' di zucchero?") 8 | assert len(doc) == 6 9 | assert doc[0].text == "Vuoi" 10 | assert doc[1].text == "un" 11 | assert doc[2].text == "po'" 12 | assert doc[3].text == "di" 13 | assert doc[4].text == "zucchero" 14 | assert doc[5].text == "?" 15 | -------------------------------------------------------------------------------- /spacy/tests/lang/ja/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/ja/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/ja/test_morphologizer_factory.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from spacy.lang.ja import Japanese 4 | 5 | 6 | def test_ja_morphologizer_factory(): 7 | pytest.importorskip("sudachipy") 8 | nlp = Japanese() 9 | morphologizer = nlp.add_pipe("morphologizer") 10 | assert morphologizer.cfg["extend"] is True 11 | -------------------------------------------------------------------------------- /spacy/tests/lang/kmr/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/kmr/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/kmr/test_text.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from spacy.lang.kmr.lex_attrs import like_num 4 | 5 | 6 | @pytest.mark.parametrize( 7 | "word", 8 | [ 9 | "yekem", 10 | "duyemîn", 11 | "100em", 12 | "dehem", 13 | "sedemîn", 14 | "34em", 15 | "30yem", 16 | "20emîn", 17 | "50yemîn", 18 | ], 19 | ) 20 | def test_kmr_lex_attrs_like_number_for_ordinal(word): 21 | assert like_num(word) 22 | 23 | 24 | @pytest.mark.parametrize("word", ["deh"]) 25 | def test_kmr_lex_attrs_capitals(word): 26 | assert like_num(word) 27 | assert like_num(word.upper()) 28 | -------------------------------------------------------------------------------- /spacy/tests/lang/ko/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/ko/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/ko/test_lemmatization.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.mark.parametrize( 5 | "word,lemma", 6 | [ 7 | ("새로운", "새롭"), 8 | ("빨간", "빨갛"), 9 | ("클수록", "크"), 10 | ("뭡니까", "뭣"), 11 | ("됐다", "되"), 12 | ], 13 | ) 14 | def test_ko_lemmatizer_assigns(ko_tokenizer, word, lemma): 15 | test_lemma = ko_tokenizer(word)[0].lemma_ 16 | assert test_lemma == lemma 17 | 
-------------------------------------------------------------------------------- /spacy/tests/lang/ky/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/ky/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/la/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/la/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/la/test_exception.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | def test_la_tokenizer_handles_exc_in_text(la_tokenizer): 5 | text = "scio te omnia facturum, ut nobiscum quam primum sis" 6 | tokens = la_tokenizer(text) 7 | assert len(tokens) == 11 8 | assert tokens[6].text == "nobis" 9 | -------------------------------------------------------------------------------- /spacy/tests/lang/lb/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/lb/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/lb/test_exceptions.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.mark.parametrize("text", ["z.B.", "Jan."]) 5 | def test_lb_tokenizer_handles_abbr(lb_tokenizer, text): 6 | tokens = lb_tokenizer(text) 7 | assert len(tokens) == 1 8 | 9 | 10 | @pytest.mark.parametrize("text", ["d'Saach", "d'Kanner", "d’Welt", "d’Suen"]) 11 | def test_lb_tokenizer_splits_contractions(lb_tokenizer, text): 12 | tokens = lb_tokenizer(text) 13 | assert len(tokens) == 2 14 | 15 | 16 | def test_lb_tokenizer_handles_exc_in_text(lb_tokenizer): 17 | text = "Mee 't ass net evident, d'Liewen." 
18 | tokens = lb_tokenizer(text) 19 | assert len(tokens) == 9 20 | assert tokens[1].text == "'t" 21 | -------------------------------------------------------------------------------- /spacy/tests/lang/lb/test_prefix_suffix_infix.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.mark.parametrize("text,length", [("z.B.", 1), ("zb.", 2), ("(z.B.", 2)]) 5 | def test_lb_tokenizer_splits_prefix_interact(lb_tokenizer, text, length): 6 | tokens = lb_tokenizer(text) 7 | assert len(tokens) == length 8 | 9 | 10 | @pytest.mark.parametrize("text", ["z.B.)"]) 11 | def test_lb_tokenizer_splits_suffix_interact(lb_tokenizer, text): 12 | tokens = lb_tokenizer(text) 13 | assert len(tokens) == 2 14 | 15 | 16 | @pytest.mark.parametrize("text", ["(z.B.)"]) 17 | def test_lb_tokenizer_splits_even_wrap_interact(lb_tokenizer, text): 18 | tokens = lb_tokenizer(text) 19 | assert len(tokens) == 3 20 | -------------------------------------------------------------------------------- /spacy/tests/lang/lg/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/lg/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/lg/test_tokenizer.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | LG_BASIC_TOKENIZATION_TESTS = [ 4 | ( 5 | "Abooluganda ab’emmamba ababiri", 6 | ["Abooluganda", "ab’emmamba", "ababiri"], 7 | ), 8 | ] 9 | 10 | 11 | @pytest.mark.parametrize("text,expected_tokens", LG_BASIC_TOKENIZATION_TESTS) 12 | def test_lg_tokenizer_basic(lg_tokenizer, text, expected_tokens): 13 | tokens = lg_tokenizer(text) 14 | token_list = [token.text for token in tokens if not token.is_space] 15 | assert expected_tokens == token_list 16 | -------------------------------------------------------------------------------- /spacy/tests/lang/lt/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/lt/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/lv/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/lv/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/mk/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/mk/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/ml/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/ml/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/ml/test_text.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | def test_ml_tokenizer_handles_long_text(ml_tokenizer): 5 | text = """അനാവശ്യമായി കണ്ണിലും 
മൂക്കിലും വായിലും സ്പർശിക്കാതിരിക്കുക""" 6 | tokens = ml_tokenizer(text) 7 | assert len(tokens) == 5 8 | 9 | 10 | @pytest.mark.parametrize( 11 | "text,length", 12 | [ 13 | ( 14 | "എന്നാൽ അച്ചടിയുടെ ആവിർഭാവം ലിപിയിൽ കാര്യമായ മാറ്റങ്ങൾ വരുത്തിയത് കൂട്ടക്ഷരങ്ങളെ അണുഅക്ഷരങ്ങളായി പിരിച്ചുകൊണ്ടായിരുന്നു", 15 | 10, 16 | ), 17 | ("പരമ്പരാഗതമായി മലയാളം ഇടത്തുനിന്ന് വലത്തോട്ടാണ് എഴുതുന്നത്", 5), 18 | ], 19 | ) 20 | def test_ml_tokenizer_handles_cnts(ml_tokenizer, text, length): 21 | tokens = ml_tokenizer(text) 22 | assert len(tokens) == length 23 | -------------------------------------------------------------------------------- /spacy/tests/lang/ms/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/ms/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/ms/test_noun_chunks.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | def test_noun_chunks_is_parsed_ms(ms_tokenizer): 5 | """Test that noun_chunks raises Value Error for 'ms' language if Doc is not parsed.""" 6 | doc = ms_tokenizer("sebelas") 7 | with pytest.raises(ValueError): 8 | list(doc.noun_chunks) 9 | -------------------------------------------------------------------------------- /spacy/tests/lang/ms/test_text.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from spacy.lang.ms.lex_attrs import like_num 4 | 5 | 6 | @pytest.mark.parametrize("word", ["sebelas"]) 7 | def test_ms_lex_attrs_capitals(word): 8 | assert like_num(word) 9 | assert like_num(word.upper()) 10 | -------------------------------------------------------------------------------- /spacy/tests/lang/nb/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/nb/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/nb/test_noun_chunks.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | def test_noun_chunks_is_parsed_nb(nb_tokenizer): 5 | """Test that noun_chunks raises Value Error for 'nb' language if Doc is not parsed.""" 6 | doc = nb_tokenizer("Smørsausen brukes bl.a. til") 7 | with pytest.raises(ValueError): 8 | list(doc.noun_chunks) 9 | -------------------------------------------------------------------------------- /spacy/tests/lang/nb/test_tokenizer.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | NB_TOKEN_EXCEPTION_TESTS = [ 4 | ( 5 | "Smørsausen brukes bl.a. til fisk", 6 | ["Smørsausen", "brukes", "bl.a.", "til", "fisk"], 7 | ), 8 | ( 9 | "Jeg kommer først kl. 13 pga. 
diverse forsinkelser", 10 | ["Jeg", "kommer", "først", "kl.", "13", "pga.", "diverse", "forsinkelser"], 11 | ), 12 | ] 13 | 14 | 15 | @pytest.mark.parametrize("text,expected_tokens", NB_TOKEN_EXCEPTION_TESTS) 16 | def test_nb_tokenizer_handles_exception_cases(nb_tokenizer, text, expected_tokens): 17 | tokens = nb_tokenizer(text) 18 | token_list = [token.text for token in tokens if not token.is_space] 19 | assert expected_tokens == token_list 20 | -------------------------------------------------------------------------------- /spacy/tests/lang/ne/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/ne/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/ne/test_text.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | def test_ne_tokenizer_handlers_long_text(ne_tokenizer): 5 | text = """मैले पाएको सर्टिफिकेटलाई म त बोक्रो सम्झन्छु र अभ्यास तब सुरु भयो, जब मैले कलेज पार गरेँ र जीवनको पढाइ सुरु गरेँ ।""" 6 | tokens = ne_tokenizer(text) 7 | assert len(tokens) == 24 8 | 9 | 10 | @pytest.mark.parametrize( 11 | "text,length", [("समय जान कति पनि बेर लाग्दैन ।", 7), ("म ठूलो हुँदै थिएँ ।", 5)] 12 | ) 13 | def test_ne_tokenizer_handles_cnts(ne_tokenizer, text, length): 14 | tokens = ne_tokenizer(text) 15 | assert len(tokens) == length 16 | -------------------------------------------------------------------------------- /spacy/tests/lang/nl/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/nl/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/nn/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/nn/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/pl/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/pl/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/pl/test_text.py: -------------------------------------------------------------------------------- 1 | """Words like numbers are recognized correctly.""" 2 | 3 | import pytest 4 | 5 | 6 | @pytest.mark.parametrize( 7 | "text,match", 8 | [ 9 | ("10", True), 10 | ("1", True), 11 | ("10,000", True), 12 | ("10,00", True), 13 | ("jeden", True), 14 | ("dwa", True), 15 | ("milion", True), 16 | ("pies", False), 17 | (",", False), 18 | ("1/2", True), 19 | ], 20 | ) 21 | def test_lex_attrs_like_number(pl_tokenizer, text, match): 22 | tokens = pl_tokenizer(text) 23 | assert len(tokens) == 1 24 | assert tokens[0].like_num == match 25 | -------------------------------------------------------------------------------- /spacy/tests/lang/pl/test_tokenizer.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | DOT_TESTS = [ 4 | ("tel.", ["tel", "."]), 5 | ("0 zł 99 gr", ["0", "zł", "99", "gr"]), 6 | ] 7 | 8 | HYPHEN_TESTS = [ 9 
| ("cztero-", ["cztero-"]), 10 | ("jedno-", ["jedno-"]), 11 | ("dwu-", ["dwu-"]), 12 | ("trzy-", ["trzy-"]), 13 | ] 14 | 15 | 16 | TESTCASES = DOT_TESTS + HYPHEN_TESTS 17 | 18 | 19 | @pytest.mark.parametrize("text,expected_tokens", TESTCASES) 20 | def test_tokenizer_handles_testcases(pl_tokenizer, text, expected_tokens): 21 | tokens = pl_tokenizer(text) 22 | token_list = [token.text for token in tokens if not token.is_space] 23 | assert expected_tokens == token_list 24 | -------------------------------------------------------------------------------- /spacy/tests/lang/pt/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/pt/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/pt/test_text.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from spacy.lang.pt.lex_attrs import like_num 4 | 5 | 6 | @pytest.mark.parametrize("word", ["onze", "quadragésimo"]) 7 | def test_pt_lex_attrs_capitals(word): 8 | assert like_num(word) 9 | assert like_num(word.upper()) 10 | -------------------------------------------------------------------------------- /spacy/tests/lang/ro/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/ro/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/ru/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/ru/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/ru/test_exceptions.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.mark.parametrize( 5 | "text,norms", 6 | [("пн.", ["понедельник"]), ("пт.", ["пятница"]), ("дек.", ["декабрь"])], 7 | ) 8 | def test_ru_tokenizer_abbrev_exceptions(ru_tokenizer, text, norms): 9 | tokens = ru_tokenizer(text) 10 | assert len(tokens) == 1 11 | assert [token.norm_ for token in tokens] == norms 12 | -------------------------------------------------------------------------------- /spacy/tests/lang/ru/test_text.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from spacy.lang.ru.lex_attrs import like_num 4 | 5 | 6 | @pytest.mark.parametrize("word", ["одиннадцать"]) 7 | def test_ru_lex_attrs_capitals(word): 8 | assert like_num(word) 9 | assert like_num(word.upper()) 10 | -------------------------------------------------------------------------------- /spacy/tests/lang/sa/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/sa/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/sk/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/sk/__init__.py 
-------------------------------------------------------------------------------- /spacy/tests/lang/sk/test_tokenizer.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | SK_BASIC_TOKENIZATION_TESTS = [ 4 | ( 5 | "Kedy sa narodil Andrej Kiska?", 6 | ["Kedy", "sa", "narodil", "Andrej", "Kiska", "?"], 7 | ), 8 | ] 9 | 10 | 11 | @pytest.mark.parametrize("text,expected_tokens", SK_BASIC_TOKENIZATION_TESTS) 12 | def test_sk_tokenizer_basic(sk_tokenizer, text, expected_tokens): 13 | tokens = sk_tokenizer(text) 14 | token_list = [token.text for token in tokens if not token.is_space] 15 | assert expected_tokens == token_list 16 | -------------------------------------------------------------------------------- /spacy/tests/lang/sl/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/sl/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/sq/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/sq/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/sr/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/sr/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/sr/test_exceptions.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.mark.parametrize( 5 | "text,norms,lemmas", 6 | [ 7 | ("о.г.", ["ове године"], ["ова година"]), 8 | ("чет.", ["четвртак"], ["четвртак"]), 9 | ("гђа", ["госпођа"], ["госпођа"]), 10 | ("ил'", ["или"], ["или"]), 11 | ], 12 | ) 13 | def test_sr_tokenizer_abbrev_exceptions(sr_tokenizer, text, norms, lemmas): 14 | tokens = sr_tokenizer(text) 15 | assert len(tokens) == 1 16 | assert [token.norm_ for token in tokens] == norms 17 | -------------------------------------------------------------------------------- /spacy/tests/lang/sv/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/sv/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/sv/test_text.py: -------------------------------------------------------------------------------- 1 | def test_sv_tokenizer_handles_long_text(sv_tokenizer): 2 | text = """Det var så härligt ute på landet. Det var sommar, majsen var gul, havren grön, 3 | höet var uppställt i stackar nere vid den gröna ängen, och där gick storken på sina långa, 4 | röda ben och snackade engelska, för det språket hade han lärt sig av sin mor. 5 | 6 | Runt om åkrar och äng låg den stora skogen, och mitt i skogen fanns djupa sjöar; jo, det var verkligen trevligt ute på landet!""" 7 | tokens = sv_tokenizer(text) 8 | assert len(tokens) == 86 9 | 10 | 11 | def test_sv_tokenizer_handles_trailing_dot_for_i_in_sentence(sv_tokenizer): 12 | text = "Provar att tokenisera en mening med ord i." 
13 | tokens = sv_tokenizer(text) 14 | assert len(tokens) == 9 15 | -------------------------------------------------------------------------------- /spacy/tests/lang/ta/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/ta/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/th/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/th/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/th/test_tokenizer.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.mark.parametrize( 5 | "text,expected_tokens", [("คุณรักผมไหม", ["คุณ", "รัก", "ผม", "ไหม"])] 6 | ) 7 | def test_th_tokenizer(th_tokenizer, text, expected_tokens): 8 | tokens = [token.text for token in th_tokenizer(text)] 9 | assert tokens == expected_tokens 10 | -------------------------------------------------------------------------------- /spacy/tests/lang/ti/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/ti/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/ti/test_exception.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/ti/test_exception.py -------------------------------------------------------------------------------- /spacy/tests/lang/tl/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/tl/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/tl/test_indices.py: -------------------------------------------------------------------------------- 1 | def test_tl_simple_punct(tl_tokenizer): 2 | text = "Sige, punta ka dito" 3 | tokens = tl_tokenizer(text) 4 | assert tokens[0].idx == 0 5 | assert tokens[1].idx == 4 6 | assert tokens[2].idx == 6 7 | assert tokens[3].idx == 12 8 | assert tokens[4].idx == 15 9 | -------------------------------------------------------------------------------- /spacy/tests/lang/tr/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/tr/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/tr/test_noun_chunks.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | def test_noun_chunks_is_parsed(tr_tokenizer): 5 | """Test that noun_chunks raises Value Error for 'tr' language if Doc is not parsed. 6 | To check this test, we're constructing a Doc 7 | with a new Vocab here and forcing is_parsed to 'False' 8 | to make sure the noun chunks don't run. 
9 | """ 10 | doc = tr_tokenizer("Dün seni gördüm.") 11 | with pytest.raises(ValueError): 12 | list(doc.noun_chunks) 13 | -------------------------------------------------------------------------------- /spacy/tests/lang/tt/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/tt/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/uk/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/uk/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/uk/test_tokenizer_exc.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.mark.parametrize( 5 | "text,norms,lemmas", 6 | [("ім.", ["імені"], ["ім'я"]), ("проф.", ["професор"], ["професор"])], 7 | ) 8 | def test_uk_tokenizer_abbrev_exceptions(uk_tokenizer, text, norms, lemmas): 9 | tokens = uk_tokenizer(text) 10 | assert len(tokens) == 1 11 | assert [token.norm_ for token in tokens] == norms 12 | -------------------------------------------------------------------------------- /spacy/tests/lang/ur/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/ur/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/ur/test_prefix_suffix_infix.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.mark.parametrize("text", ["ہےں۔", "کیا۔"]) 5 | def test_contractions(ur_tokenizer, text): 6 | """Test specific Urdu punctuation character""" 7 | tokens = ur_tokenizer(text) 8 | assert len(tokens) == 2 9 | -------------------------------------------------------------------------------- /spacy/tests/lang/ur/test_text.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | def test_ur_tokenizer_handles_long_text(ur_tokenizer): 5 | text = """اصل میں، رسوا ہونے کی ہمیں کچھ عادت سی ہو گئی ہے۔""" 6 | tokens = ur_tokenizer(text) 7 | assert len(tokens) == 14 8 | 9 | 10 | @pytest.mark.parametrize("text,length", [("تحریر باسط حبیب", 3), ("میرا پاکستان", 2)]) 11 | def test_ur_tokenizer_handles_cnts(ur_tokenizer, text, length): 12 | tokens = ur_tokenizer(text) 13 | assert len(tokens) == length 14 | -------------------------------------------------------------------------------- /spacy/tests/lang/vi/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/vi/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/xx/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/xx/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/xx/test_tokenizer.py: 
-------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | XX_BASIC_TOKENIZATION_TESTS = [ 4 | ( 5 | "Lääʹddjânnmest lie nuʹtt 10 000 säʹmmliʹžžed. Seeʹst pâʹjjel", 6 | [ 7 | "Lääʹddjânnmest", 8 | "lie", 9 | "nuʹtt", 10 | "10", 11 | "000", 12 | "säʹmmliʹžžed", 13 | ".", 14 | "Seeʹst", 15 | "pâʹjjel", 16 | ], 17 | ), 18 | ] 19 | 20 | 21 | @pytest.mark.parametrize("text,expected_tokens", XX_BASIC_TOKENIZATION_TESTS) 22 | def test_xx_tokenizer_basic(xx_tokenizer, text, expected_tokens): 23 | tokens = xx_tokenizer(text) 24 | token_list = [token.text for token in tokens if not token.is_space] 25 | assert expected_tokens == token_list 26 | -------------------------------------------------------------------------------- /spacy/tests/lang/yo/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/yo/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/zh/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/zh/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/zh/test_text.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.mark.parametrize( 5 | "text,match", 6 | [ 7 | ("10", True), 8 | ("1", True), 9 | ("999.0", True), 10 | ("一", True), 11 | ("二", True), 12 | ("〇", True), 13 | ("十一", True), 14 | ("狗", False), 15 | (",", False), 16 | ], 17 | ) 18 | def test_lex_attrs_like_number(zh_tokenizer_jieba, text, match): 19 | tokens = zh_tokenizer_jieba(text) 20 | assert len(tokens) == 1 21 | assert tokens[0].like_num == match 22 | -------------------------------------------------------------------------------- /spacy/tests/matcher/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/matcher/__init__.py -------------------------------------------------------------------------------- /spacy/tests/morphology/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/morphology/__init__.py -------------------------------------------------------------------------------- /spacy/tests/package/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/package/__init__.py -------------------------------------------------------------------------------- /spacy/tests/parser/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/parser/__init__.py -------------------------------------------------------------------------------- /spacy/tests/pipeline/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/pipeline/__init__.py -------------------------------------------------------------------------------- /spacy/tests/serialize/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/serialize/__init__.py -------------------------------------------------------------------------------- /spacy/tests/test_architectures.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from catalogue import RegistryError 3 | from thinc.api import Linear 4 | 5 | from spacy import registry 6 | 7 | 8 | def test_get_architecture(): 9 | @registry.architectures("my_test_function") 10 | def create_model(nr_in, nr_out): 11 | return Linear(nr_in, nr_out) 12 | 13 | arch = registry.architectures.get("my_test_function") 14 | assert arch is create_model 15 | with pytest.raises(RegistryError): 16 | registry.architectures.get("not_an_existing_key") 17 | -------------------------------------------------------------------------------- /spacy/tests/test_errors.py: -------------------------------------------------------------------------------- 1 | from inspect import isclass 2 | 3 | import pytest 4 | 5 | from spacy.errors import ErrorsWithCodes 6 | 7 | 8 | class Errors(metaclass=ErrorsWithCodes): 9 | E001 = "error description" 10 | 11 | 12 | def test_add_codes(): 13 | assert Errors.E001 == "[E001] error description" 14 | with pytest.raises(AttributeError): 15 | Errors.E002 16 | assert isclass(Errors.__class__) 17 | -------------------------------------------------------------------------------- /spacy/tests/tokenizer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/tokenizer/__init__.py -------------------------------------------------------------------------------- /spacy/tests/training/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/training/__init__.py -------------------------------------------------------------------------------- /spacy/tests/training/test_logger.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import spacy 4 | from spacy.training import loggers 5 | 6 | 7 | @pytest.fixture() 8 | def nlp(): 9 | nlp = spacy.blank("en") 10 | nlp.add_pipe("ner") 11 | return nlp 12 | 13 | 14 | @pytest.fixture() 15 | def info(): 16 | return { 17 | "losses": {"ner": 100}, 18 | "other_scores": {"ENTS_F": 0.85, "ENTS_P": 0.90, "ENTS_R": 0.80}, 19 | "epoch": 100, 20 | "step": 125, 21 | "score": 85, 22 | } 23 | 24 | 25 | def test_console_logger(nlp, info): 26 | console_logger = loggers.console_logger( 27 | progress_bar=True, console_output=True, output_file=None 28 | ) 29 | log_step, finalize = console_logger(nlp) 30 | log_step(info) 31 | -------------------------------------------------------------------------------- /spacy/tests/vocab_vectors/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/vocab_vectors/__init__.py 
-------------------------------------------------------------------------------- /spacy/tokens/__init__.pxd: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tokens/__init__.pxd -------------------------------------------------------------------------------- /spacy/tokens/__init__.py: -------------------------------------------------------------------------------- 1 | from ._serialize import DocBin 2 | from .doc import Doc 3 | from .morphanalysis import MorphAnalysis 4 | from .span import Span 5 | from .span_group import SpanGroup 6 | from .token import Token 7 | 8 | __all__ = ["Doc", "DocBin", "MorphAnalysis", "Span", "SpanGroup", "Token"] 9 | -------------------------------------------------------------------------------- /spacy/tokens/graph.pxd: -------------------------------------------------------------------------------- 1 | from cymem.cymem cimport Pool 2 | from libcpp.vector cimport vector 3 | from preshed.maps cimport PreshMap 4 | 5 | from ..structs cimport EdgeC, GraphC 6 | 7 | 8 | cdef class Graph: 9 | cdef GraphC c 10 | cdef Pool mem 11 | cdef PreshMap node_map 12 | cdef PreshMap edge_map 13 | cdef object doc_ref 14 | cdef public str name 15 | -------------------------------------------------------------------------------- /spacy/tokens/morphanalysis.pxd: -------------------------------------------------------------------------------- 1 | from ..structs cimport MorphAnalysisC 2 | from ..typedefs cimport hash_t 3 | from ..vocab cimport Vocab 4 | 5 | 6 | cdef class MorphAnalysis: 7 | cdef readonly Vocab vocab 8 | cdef readonly hash_t key 9 | cdef MorphAnalysisC c 10 | -------------------------------------------------------------------------------- /spacy/tokens/span.pxd: -------------------------------------------------------------------------------- 1 | cimport numpy as np 2 | 3 | from ..structs cimport SpanC 4 | from ..typedefs cimport attr_t 5 | from .doc cimport Doc 6 | 7 | 8 | cdef class Span: 9 | cdef readonly Doc doc 10 | cdef SpanC c 11 | cdef public _vector 12 | cdef public _vector_norm 13 | 14 | @staticmethod 15 | cdef inline Span cinit(Doc doc, SpanC span): 16 | cdef Span self = Span.__new__( 17 | Span, 18 | doc, 19 | start=span.start, 20 | end=span.end 21 | ) 22 | self.c = span 23 | return self 24 | 25 | cpdef np.ndarray to_array(self, object features) 26 | -------------------------------------------------------------------------------- /spacy/tokens/span_group.pxd: -------------------------------------------------------------------------------- 1 | from libcpp.vector cimport vector 2 | 3 | from ..structs cimport SpanC 4 | 5 | 6 | cdef class SpanGroup: 7 | cdef public object _doc_ref 8 | cdef public str name 9 | cdef public dict attrs 10 | cdef vector[SpanC] c 11 | 12 | cdef void push_back(self, SpanC span) nogil 13 | -------------------------------------------------------------------------------- /spacy/training/__init__.pxd: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/training/__init__.pxd -------------------------------------------------------------------------------- /spacy/training/alignment.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import List 3 | 4 | from .align import get_alignments 5 | from 
.alignment_array import AlignmentArray 6 | 7 | 8 | @dataclass 9 | class Alignment: 10 | x2y: AlignmentArray 11 | y2x: AlignmentArray 12 | 13 | @classmethod 14 | def from_indices(cls, x2y: List[List[int]], y2x: List[List[int]]) -> "Alignment": 15 | x2y = AlignmentArray(x2y) 16 | y2x = AlignmentArray(y2x) 17 | return Alignment(x2y=x2y, y2x=y2x) 18 | 19 | @classmethod 20 | def from_strings(cls, A: List[str], B: List[str]) -> "Alignment": 21 | x2y, y2x = get_alignments(A, B) 22 | return Alignment.from_indices(x2y=x2y, y2x=y2x) 23 | -------------------------------------------------------------------------------- /spacy/training/alignment_array.pxd: -------------------------------------------------------------------------------- 1 | cimport numpy as np 2 | from libcpp.vector cimport vector 3 | 4 | 5 | cdef class AlignmentArray: 6 | cdef np.ndarray _data 7 | cdef np.ndarray _lengths 8 | cdef np.ndarray _starts_ends 9 | -------------------------------------------------------------------------------- /spacy/training/converters/__init__.py: -------------------------------------------------------------------------------- 1 | from .conll_ner_to_docs import conll_ner_to_docs # noqa: F401 2 | from .conllu_to_docs import conllu_to_docs # noqa: F401 3 | from .iob_to_docs import iob_to_docs # noqa: F401 4 | from .json_to_docs import json_to_docs # noqa: F401 5 | -------------------------------------------------------------------------------- /spacy/training/example.pxd: -------------------------------------------------------------------------------- 1 | from libc.stdint cimport uint64_t 2 | 3 | from ..tokens.doc cimport Doc 4 | 5 | 6 | cdef class Example: 7 | cdef readonly Doc x 8 | cdef readonly Doc y 9 | cdef readonly object _cached_alignment 10 | cdef readonly object _cached_words_x 11 | cdef readonly object _cached_words_y 12 | cdef readonly uint64_t _x_sig 13 | cdef readonly uint64_t _y_sig 14 | -------------------------------------------------------------------------------- /spacy/typedefs.pxd: -------------------------------------------------------------------------------- 1 | from libc.stdint cimport int32_t, uint8_t, uint16_t, uint32_t, uint64_t, uintptr_t 2 | 3 | ctypedef float weight_t 4 | ctypedef uint64_t hash_t 5 | ctypedef uint64_t class_t 6 | ctypedef uint64_t attr_t 7 | ctypedef uint64_t flags_t 8 | ctypedef uint16_t len_t 9 | ctypedef uint16_t tag_t 10 | -------------------------------------------------------------------------------- /spacy/typedefs.pyx: -------------------------------------------------------------------------------- 1 | # cython: profile=False 2 | -------------------------------------------------------------------------------- /website/.dockerignore: -------------------------------------------------------------------------------- 1 | .cache/ 2 | .next/ 3 | public/ 4 | node_modules 5 | .npm 6 | logs 7 | *.log 8 | npm-debug.log* 9 | quickstart-training-generator.js 10 | -------------------------------------------------------------------------------- /website/.eslintrc.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": "next/core-web-vitals" 3 | } 4 | -------------------------------------------------------------------------------- /website/.gitignore: -------------------------------------------------------------------------------- 1 | # See https://help.github.com/articles/ignoring-files/ for more about ignoring files. 
2 | 3 | quickstart-training-generator.js 4 | 5 | # dependencies 6 | /node_modules 7 | /.pnp 8 | .pnp.js 9 | 10 | # testing 11 | /coverage 12 | 13 | # next.js 14 | /.next/ 15 | /out/ 16 | 17 | # production 18 | /build 19 | 20 | # misc 21 | .DS_Store 22 | *.pem 23 | 24 | # debug 25 | npm-debug.log* 26 | yarn-debug.log* 27 | yarn-error.log* 28 | .pnpm-debug.log* 29 | 30 | # local env files 31 | .env*.local 32 | 33 | # vercel 34 | .vercel 35 | 36 | # typescript 37 | *.tsbuildinfo 38 | next-env.d.ts 39 | 40 | !.vscode/extensions.json 41 | !public 42 | 43 | public/robots.txt 44 | public/sitemap* 45 | public/sw.js* 46 | public/workbox* 47 | -------------------------------------------------------------------------------- /website/.nvmrc: -------------------------------------------------------------------------------- 1 | 18 2 | -------------------------------------------------------------------------------- /website/.prettierignore: -------------------------------------------------------------------------------- 1 | .next -------------------------------------------------------------------------------- /website/.vscode/extensions.json: -------------------------------------------------------------------------------- 1 | { 2 | "recommendations": [ 3 | "dbaeumer.vscode-eslint", 4 | "unifiedjs.vscode-mdx", 5 | "esbenp.prettier-vscode", 6 | "syler.sass-indented" 7 | ] 8 | } 9 | -------------------------------------------------------------------------------- /website/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM node:18 2 | 3 | USER node 4 | 5 | # This is so the installed node_modules will be up one directory 6 | # from where a user mounts files, so that they don't accidentally mount 7 | # their own node_modules from a different build 8 | # https://nodejs.org/api/modules.html#modules_loading_from_node_modules_folders 9 | WORKDIR /home/node 10 | COPY --chown=node package.json . 11 | COPY --chown=node package-lock.json . 12 | RUN npm install 13 | 14 | WORKDIR /home/node/website/ 15 | -------------------------------------------------------------------------------- /website/docs/api/index.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | title: Library Architecture 3 | next: /api/architectures 4 | --- 5 | 6 | 7 | -------------------------------------------------------------------------------- /website/meta/dynamicMeta.mjs: -------------------------------------------------------------------------------- 1 | import site from './site.json' assert { type: 'json' } 2 | 3 | export const domain = process.env.BRANCH || site.domain 4 | export const siteUrl = `https://${domain}` 5 | export const nightly = site.nightlyBranches.includes(domain) 6 | export const legacy = site.legacy || !!+process.env.SPACY_LEGACY 7 | export const binderBranch = domain 8 | export const branch = nightly ? 'develop' : 'master' 9 | export const replacements = { 10 | GITHUB_SPACY: `https://github.com/explosion/spaCy/tree/${branch}`, 11 | GITHUB_PROJECTS: `https://github.com/${site.projectsRepo}`, 12 | SPACY_PKG_NAME: nightly ? 'spacy-nightly' : 'spacy', 13 | SPACY_PKG_FLAGS: nightly ? 
' --pre' : '', 14 | } 15 | -------------------------------------------------------------------------------- /website/meta/languageSorted.tsx: -------------------------------------------------------------------------------- 1 | import models from './languages.json' 2 | 3 | export const languagesSorted = models.languages 4 | .filter(({ models }) => models && models.length) 5 | .sort((a, b) => a.name.localeCompare(b.name)) 6 | -------------------------------------------------------------------------------- /website/meta/recordLanguages.tsx: -------------------------------------------------------------------------------- 1 | import models from './languages.json' 2 | 3 | const recordLanguages = Object.fromEntries( 4 | models.languages.map((language, index) => [language.code, language]) 5 | ) 6 | 7 | export default recordLanguages 8 | -------------------------------------------------------------------------------- /website/meta/recordSections.tsx: -------------------------------------------------------------------------------- 1 | import siteMetadata from './site.json' 2 | 3 | const recordSections = Object.fromEntries(siteMetadata.sections.map((s) => [s.id, s])) 4 | 5 | export default recordSections 6 | -------------------------------------------------------------------------------- /website/meta/recordUniverse.tsx: -------------------------------------------------------------------------------- 1 | import universe from './universe.json' 2 | 3 | export const recordUniverseCategories = Object.fromEntries( 4 | universe.categories.flatMap((category) => category.items.map((item) => [item.id, item])) 5 | ) 6 | 7 | export const recordUniverseResources = Object.fromEntries( 8 | universe.resources.map((resource) => [resource.id, resource]) 9 | ) 10 | -------------------------------------------------------------------------------- /website/meta/sidebarFlat.tsx: -------------------------------------------------------------------------------- 1 | import sidebars from './sidebars.json' 2 | 3 | export const sidebarUsageFlat = sidebars 4 | .find((sidebar) => sidebar.section === 'usage') 5 | .items.flatMap((item) => item.items) 6 | -------------------------------------------------------------------------------- /website/next-sitemap.config.mjs: -------------------------------------------------------------------------------- 1 | import { siteUrl } from './meta/dynamicMeta.mjs' 2 | 3 | /** @type {import('next-sitemap').IConfig} */ 4 | const config = { 5 | siteUrl, 6 | generateRobotsTxt: true, 7 | autoLastmod: false, 8 | } 9 | 10 | export default config 11 | -------------------------------------------------------------------------------- /website/pages/_document.tsx: -------------------------------------------------------------------------------- 1 | import { Html, Head, Main, NextScript } from 'next/document' 2 | 3 | export default function Document() { 4 | return ( 5 | <Html lang="en"> 6 | <Head /> 7 | <body> 8 | <Main />
9 | <NextScript /> 10 | </body> 11 | </Html> 12 | ) 13 | } 14 | -------------------------------------------------------------------------------- /website/pages/universe/index.tsx: -------------------------------------------------------------------------------- 1 | import recordSections from '../../meta/recordSections' 2 | import Layout from '../../src/templates' 3 | 4 | const Universe = () => { 5 | return ( 6 | 14 | ) 15 | } 16 | 17 | export default Universe 18 | -------------------------------------------------------------------------------- /website/plugins/index.mjs: -------------------------------------------------------------------------------- 1 | import remarkGfm from 'remark-gfm' 2 | import remarkUnwrapImages from 'remark-unwrap-images' 3 | import remarkSmartypants from 'remark-smartypants' 4 | 5 | import remarkCustomAttrs from './remarkCustomAttrs.mjs' 6 | import remarkWrapSections from './remarkWrapSections.mjs' 7 | import remarkCodeBlocks from './remarkCodeBlocks.mjs' 8 | import remarkFindAndReplace from './remarkFindAndReplace.mjs' 9 | 10 | const remarkPlugins = [ 11 | remarkGfm, 12 | remarkSmartypants, 13 | remarkFindAndReplace, 14 | remarkUnwrapImages, 15 | remarkCustomAttrs, 16 | remarkCodeBlocks, 17 | remarkWrapSections, 18 | ] 19 | 20 | export default remarkPlugins 21 | -------------------------------------------------------------------------------- /website/public/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/public/favicon.ico -------------------------------------------------------------------------------- /website/public/icons/icon-192x192.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/public/icons/icon-192x192.png -------------------------------------------------------------------------------- /website/public/icons/icon-256x256.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/public/icons/icon-256x256.png -------------------------------------------------------------------------------- /website/public/icons/icon-384x384.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/public/icons/icon-384x384.png -------------------------------------------------------------------------------- /website/public/icons/icon-512x512.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/public/icons/icon-512x512.png -------------------------------------------------------------------------------- /website/public/images/cli_init_fill-config_diff.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/public/images/cli_init_fill-config_diff.jpg -------------------------------------------------------------------------------- /website/public/images/course.jpg: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/public/images/course.jpg -------------------------------------------------------------------------------- /website/public/images/displacy_jupyter.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/public/images/displacy_jupyter.jpg -------------------------------------------------------------------------------- /website/public/images/huggingface_hub.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/public/images/huggingface_hub.jpg -------------------------------------------------------------------------------- /website/public/images/matcher-demo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/public/images/matcher-demo.jpg -------------------------------------------------------------------------------- /website/public/images/prodigy.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/public/images/prodigy.jpg -------------------------------------------------------------------------------- /website/public/images/prodigy_overview.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/public/images/prodigy_overview.jpg -------------------------------------------------------------------------------- /website/public/images/prodigy_spans-manual.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/public/images/prodigy_spans-manual.jpg -------------------------------------------------------------------------------- /website/public/images/prodigy_train_curve.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/public/images/prodigy_train_curve.jpg -------------------------------------------------------------------------------- /website/public/images/project_document.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/public/images/project_document.jpg -------------------------------------------------------------------------------- /website/public/images/projects.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/public/images/projects.png -------------------------------------------------------------------------------- /website/public/images/sense2vec.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/public/images/sense2vec.jpg 
-------------------------------------------------------------------------------- /website/public/images/spacy-extension-demo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/public/images/spacy-extension-demo.gif -------------------------------------------------------------------------------- /website/public/images/spacy-streamlit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/public/images/spacy-streamlit.png -------------------------------------------------------------------------------- /website/public/images/spacy-tailored-pipelines_wide.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/public/images/spacy-tailored-pipelines_wide.png -------------------------------------------------------------------------------- /website/public/images/thinc_mypy.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/public/images/thinc_mypy.jpg -------------------------------------------------------------------------------- /website/public/images/wandb1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/public/images/wandb1.jpg -------------------------------------------------------------------------------- /website/public/images/wandb2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/public/images/wandb2.jpg -------------------------------------------------------------------------------- /website/runtime.txt: -------------------------------------------------------------------------------- 1 | 3.8 2 | -------------------------------------------------------------------------------- /website/setup/requirements.txt: -------------------------------------------------------------------------------- 1 | # These are used to compile the training quickstart config 2 | jinja2>=3.1.0 3 | srsly 4 | -------------------------------------------------------------------------------- /website/setup/setup.sh: -------------------------------------------------------------------------------- 1 | python setup/jinja_to_js.py ../spacy/cli/templates/quickstart_training.jinja src/widgets/quickstart-training-generator.js ../spacy/cli/templates/quickstart_training_recommendations.yml 2 | -------------------------------------------------------------------------------- /website/src/components/aside.js: -------------------------------------------------------------------------------- 1 | import React from 'react' 2 | import PropTypes from 'prop-types' 3 | 4 | import classes from '../styles/aside.module.sass' 5 | 6 | export default function Aside({ title, children }) { 7 | return ( 8 | 16 | ) 17 | } 18 | 19 | Aside.propTypes = { 20 | title: PropTypes.string, 21 | children: PropTypes.node.isRequired, 22 | } 23 | -------------------------------------------------------------------------------- /website/src/components/codeBlock.js: 
-------------------------------------------------------------------------------- 1 | import React from 'react' 2 | import Code from './codeDynamic' 3 | import classes from '../styles/code.module.sass' 4 | 5 | export const Pre = (props) => { 6 | return
<pre className={classes.pre}>{props.children}</pre>
7 | } 8 | 9 | const CodeBlock = (props) => ( 10 | <Pre>
11 |     <Code {...props} />
12 | </Pre>
13 | ) 14 | export default CodeBlock 15 | -------------------------------------------------------------------------------- /website/src/components/codeDynamic.js: -------------------------------------------------------------------------------- 1 | import dynamic from 'next/dynamic' 2 | 3 | export default dynamic(() => import('./code'), { 4 | loading: () =>
<div>Loading...</div>
, 5 | }) 6 | -------------------------------------------------------------------------------- /website/src/components/htmlToReact.js: -------------------------------------------------------------------------------- 1 | import { Parser as HtmlToReactParser } from 'html-to-react' 2 | 3 | const htmlToReactParser = new HtmlToReactParser() 4 | /** 5 | * Convert raw HTML to React elements 6 | * @param {string} html - The HTML markup to convert. 7 | * @returns {Node} - The converted React elements. 8 | */ 9 | 10 | export default function HtmlToReact(props) { 11 | return htmlToReactParser.parse(props.children) 12 | } 13 | -------------------------------------------------------------------------------- /website/src/components/list.js: -------------------------------------------------------------------------------- 1 | import React from 'react' 2 | import classNames from 'classnames' 3 | 4 | import classes from '../styles/list.module.sass' 5 | import { replaceEmoji } from './icon' 6 | 7 | export const Ol = (props) =>
    8 | export const Ul = (props) =>
9 | export const Li = ({ children, emoji, ...props }) => { 10 | const { hasIcon, content } = replaceEmoji(children) 11 | const liClassNames = classNames(classes.li, { 12 | [classes['li-icon']]: hasIcon, 13 | [classes.emoji]: emoji, 14 | }) 15 | return ( 16 | <li className={liClassNames} {...props}>
17 |     {content}
18 | </li>
19 | ) 20 | } 21 | -------------------------------------------------------------------------------- /website/src/components/markdownToReactDynamic.js: -------------------------------------------------------------------------------- 1 | import dynamic from 'next/dynamic' 2 | 3 | export default dynamic(() => import('./markdownToReact'), { 4 | loading: () => <div>Loading...</div>
      , 5 | }) 6 | -------------------------------------------------------------------------------- /website/src/components/search.js: -------------------------------------------------------------------------------- 1 | import React from 'react' 2 | import PropTypes from 'prop-types' 3 | import { DocSearch } from '@docsearch/react' 4 | import '@docsearch/css' 5 | 6 | import siteMetadata from '../../meta/site.json' 7 | 8 | export default function Search({ placeholder = 'Search docs' }) { 9 | const apiKey = process.env.DOCSEARCH_API_KEY 10 | const { indexName, appId } = siteMetadata.docSearch 11 | return ( 12 | 13 | ) 14 | } 15 | 16 | Search.propTypes = { 17 | id: PropTypes.string, 18 | placeholder: PropTypes.string, 19 | } 20 | -------------------------------------------------------------------------------- /website/src/fonts/hkgrotesk-bold.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/src/fonts/hkgrotesk-bold.woff -------------------------------------------------------------------------------- /website/src/fonts/hkgrotesk-bold.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/src/fonts/hkgrotesk-bold.woff2 -------------------------------------------------------------------------------- /website/src/fonts/hkgrotesk-bolditalic.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/src/fonts/hkgrotesk-bolditalic.woff -------------------------------------------------------------------------------- /website/src/fonts/hkgrotesk-bolditalic.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/src/fonts/hkgrotesk-bolditalic.woff2 -------------------------------------------------------------------------------- /website/src/fonts/hkgrotesk-semibold.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/src/fonts/hkgrotesk-semibold.woff -------------------------------------------------------------------------------- /website/src/fonts/hkgrotesk-semibold.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/src/fonts/hkgrotesk-semibold.woff2 -------------------------------------------------------------------------------- /website/src/fonts/hkgrotesk-semibolditalic.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/src/fonts/hkgrotesk-semibolditalic.woff -------------------------------------------------------------------------------- /website/src/fonts/hkgrotesk-semibolditalic.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/src/fonts/hkgrotesk-semibolditalic.woff2 -------------------------------------------------------------------------------- 
/website/src/fonts/jetbrainsmono-italic.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/src/fonts/jetbrainsmono-italic.woff -------------------------------------------------------------------------------- /website/src/fonts/jetbrainsmono-italic.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/src/fonts/jetbrainsmono-italic.woff2 -------------------------------------------------------------------------------- /website/src/fonts/jetbrainsmono-regular.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/src/fonts/jetbrainsmono-regular.woff -------------------------------------------------------------------------------- /website/src/fonts/jetbrainsmono-regular.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/src/fonts/jetbrainsmono-regular.woff2 -------------------------------------------------------------------------------- /website/src/images/icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/src/images/icon.png -------------------------------------------------------------------------------- /website/src/images/icon_legacy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/src/images/icon_legacy.png -------------------------------------------------------------------------------- /website/src/images/icon_nightly.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/src/images/icon_nightly.png -------------------------------------------------------------------------------- /website/src/images/icons/accept.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /website/src/images/icons/clipboard.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | -------------------------------------------------------------------------------- /website/src/images/icons/code.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /website/src/images/icons/docs.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /website/src/images/icons/info.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /website/src/images/icons/moon.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 
| -------------------------------------------------------------------------------- /website/src/images/icons/neutral.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /website/src/images/icons/no.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /website/src/images/icons/offline.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | -------------------------------------------------------------------------------- /website/src/images/icons/package.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /website/src/images/icons/reject.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /website/src/images/icons/search.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /website/src/images/icons/twitter.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /website/src/images/icons/warning.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /website/src/images/icons/yes.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /website/src/images/pattern_blue.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/src/images/pattern_blue.png -------------------------------------------------------------------------------- /website/src/images/pattern_green.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/src/images/pattern_green.png -------------------------------------------------------------------------------- /website/src/images/pattern_landing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/src/images/pattern_landing.png -------------------------------------------------------------------------------- /website/src/images/pattern_landing_legacy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/src/images/pattern_landing_legacy.png -------------------------------------------------------------------------------- /website/src/images/pattern_landing_nightly.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/src/images/pattern_landing_nightly.png -------------------------------------------------------------------------------- /website/src/images/pattern_legacy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/src/images/pattern_legacy.png -------------------------------------------------------------------------------- /website/src/images/pattern_nightly.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/src/images/pattern_nightly.png -------------------------------------------------------------------------------- /website/src/images/pattern_purple.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/src/images/pattern_purple.png -------------------------------------------------------------------------------- /website/src/images/social_api.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/src/images/social_api.jpg -------------------------------------------------------------------------------- /website/src/images/social_default.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/src/images/social_default.jpg -------------------------------------------------------------------------------- /website/src/images/social_legacy.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/src/images/social_legacy.jpg -------------------------------------------------------------------------------- /website/src/images/social_nightly.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/src/images/social_nightly.jpg -------------------------------------------------------------------------------- /website/src/images/social_universe.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/src/images/social_universe.jpg -------------------------------------------------------------------------------- /website/src/images/spacy-irl.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/src/images/spacy-irl.jpg -------------------------------------------------------------------------------- /website/src/styles/alert.module.sass: -------------------------------------------------------------------------------- 1 | .root 2 | position: fixed 3 | bottom: 0 4 | left: 0 5 | width: 100% 6 | background: var(--color-back) 7 | z-index: 100 8 | font: var(--font-size-sm)/var(--line-height-md) var(--font-primary) 9 | text-align: center 10 | padding: 1rem 11 | 
box-shadow: var(--box-shadow) 12 | border-top: 2px solid 13 | color: var(--color-theme-dark) 14 | 15 | .warning 16 | --alert-bg: var(--color-yellow-light) 17 | --color-theme: var(--color-yellow-dark) 18 | --color-theme-dark: var(--color-yellow-dark) 19 | --color-inline-code-bg: var(--color-yellow-opaque) 20 | background: var(--color-yellow-light) 21 | color: var(--color-yellow-dark) 22 | 23 | .clickable 24 | cursor: pointer 25 | -------------------------------------------------------------------------------- /website/src/styles/card.module.sass: -------------------------------------------------------------------------------- 1 | .root 2 | background: var(--color-subtle-light) 3 | border-radius: var(--border-radius) 4 | padding: 2rem 5 | font: var(--font-size-md)/var(--line-height-md) var(--font-primary) 6 | margin-bottom: var(--spacing-sm) 7 | 8 | .small 9 | padding: 1.5rem 10 | font-size: var(--font-size-sm) 11 | line-height: var(--line-height-sm) 12 | color: var(--color-dark) 13 | 14 | .title 15 | margin-bottom: var(--spacing-xs) 16 | 17 | .image 18 | $image-size: 35px 19 | width: $image-size 20 | height: $image-size 21 | overflow: hidden 22 | float: right 23 | border-radius: 50% 24 | -------------------------------------------------------------------------------- /website/src/styles/copy.module.sass: -------------------------------------------------------------------------------- 1 | .root 2 | background: var(--color-back) 3 | border-radius: 2em 4 | border: 1px solid var(--color-subtle) 5 | width: 100% 6 | padding: 0.25em 1em 7 | display: inline-flex 8 | margin: var(--spacing-xs) 0 9 | font: var(--font-size-code)/var(--line-height-code) var(--font-code) 10 | -webkit-font-smoothing: subpixel-antialiased 11 | -moz-osx-font-smoothing: auto 12 | 13 | .textarea 14 | flex: 100% 15 | background: transparent 16 | resize: none 17 | font: inherit 18 | overflow: hidden 19 | white-space: nowrap 20 | text-overflow: ellipsis 21 | margin-right: 1rem 22 | 23 | .prefix 24 | margin-right: 0.75em 25 | color: var(--color-subtle-dark) 26 | -------------------------------------------------------------------------------- /website/src/styles/icon.module.sass: -------------------------------------------------------------------------------- 1 | .root 2 | vertical-align: middle 3 | 4 | .inline 5 | margin: 0 0.55em 0 0.1em 6 | 7 | .tag 8 | vertical-align: bottom 9 | height: 100% 10 | position: relative 11 | top: 1px 12 | 13 | .success 14 | color: var(--color-green-medium) 15 | 16 | .error 17 | color: var(--color-red-medium) 18 | 19 | .subtle 20 | color: var(--color-subtle-dark) 21 | -------------------------------------------------------------------------------- /website/src/styles/link.module.sass: -------------------------------------------------------------------------------- 1 | .root 2 | color: var(--color-theme-dark) 3 | border-bottom: 1px solid 4 | transition: color 0.2s ease 5 | cursor: pointer 6 | 7 | &:hover 8 | color: var(--color-front) 9 | 10 | .no-link-layout 11 | border: none 12 | color: inherit 13 | 14 | &:hover 15 | color: inherit 16 | 17 | .icon 18 | margin-left: 0.5em 19 | width: 1.1em 20 | height: 1.1em 21 | 22 | .nowrap 23 | white-space: nowrap 24 | display: inline-block 25 | 26 | .with-icon 27 | border: none 28 | 29 | .source-text 30 | border-bottom: 1px solid 31 | -------------------------------------------------------------------------------- /website/src/styles/newsletter.module.sass: -------------------------------------------------------------------------------- 1 | .root 2 | font: 
var(--font-size-sm)/var(--line-height-sm) var(--font-primary) 3 | margin: var(--spacing-xs) 0 4 | background: var(--color-back) 5 | border-radius: 2em 6 | border: 1px solid var(--color-subtle) 7 | padding-right: 1em 8 | display: inline-flex 9 | max-width: 300px 10 | 11 | .input 12 | font: inherit 13 | background: transparent 14 | padding: 0.5em 1em 15 | margin: 0 0 0.25rem 0.25rem 16 | flex: 100% 17 | 18 | .button 19 | font: bold var(--font-size-lg)/var(--line-height-md) var(--font-secondary) 20 | text-transform: uppercase 21 | color: var(--color-theme-dark) 22 | white-space: nowrap 23 | -------------------------------------------------------------------------------- /website/src/styles/progress.module.sass: -------------------------------------------------------------------------------- 1 | .root 2 | display: block 3 | flex: 105% 4 | width: 105% 5 | height: 3px 6 | color: var(--color-theme) 7 | background: transparent 8 | border: none 9 | position: absolute 10 | bottom: 0 11 | left: -2.5% 12 | 13 | &::-webkit-progress-bar 14 | background: var(--color-back) 15 | border-radius: none 16 | 17 | &::-webkit-progress-value 18 | background: var(--color-theme) 19 | border-radius: none 20 | 21 | &::-moz-progress-bar 22 | background: var(--color-theme) 23 | -------------------------------------------------------------------------------- /website/src/styles/readnext.module.sass: -------------------------------------------------------------------------------- 1 | .root 2 | display: flex 3 | justify-content: flex-end 4 | align-items: center 5 | text-align: right 6 | font: var(--font-size-sm)/var(--line-height-md) var(--font-primary) 7 | 8 | .icon 9 | $icon-size: 35px 10 | width: $icon-size 11 | height: $icon-size 12 | background: var(--color-subtle-light) 13 | color: var(--color-subtle-dark) 14 | border-radius: 50% 15 | padding: 0.5rem 0.65rem 0.5rem 0 16 | transition: color 0.2s ease 17 | float: right 18 | margin-left: 3rem 19 | 20 | &:hover 21 | color: var(--color-theme-dark) 22 | -------------------------------------------------------------------------------- /website/src/styles/section.module.sass: -------------------------------------------------------------------------------- 1 | .root 2 | &:not(:last-child):not(:last-of-type) 3 | margin-bottom: var(--spacing-md) 4 | padding-bottom: var(--spacing-md) 5 | border-bottom: 1px dotted var(--color-subtle) 6 | 7 | .hr 8 | border: 0 9 | padding: var(--spacing-sm) 0 10 | -------------------------------------------------------------------------------- /website/src/styles/tag.module.sass: -------------------------------------------------------------------------------- 1 | .root 2 | display: inline-block 3 | font: bold var(--font-size-xs)/#{1} var(--font-secondary) 4 | background: var(--color-theme-dark) 5 | color: var(--color-back) 6 | padding: 2px 6px 4px 7 | border-radius: 1em 8 | text-transform: uppercase 9 | vertical-align: middle 10 | 11 | .spaced 12 | margin-left: 0.75em 13 | margin-right: 0.5em 14 | 15 | .icon 16 | margin-left: 0.5em 17 | -------------------------------------------------------------------------------- /website/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "target": "es5", 4 | "lib": ["dom", "dom.iterable", "esnext"], 5 | "allowJs": true, 6 | "skipLibCheck": true, 7 | "strict": false, 8 | "forceConsistentCasingInFileNames": true, 9 | "noEmit": true, 10 | "esModuleInterop": true, 11 | "module": "esnext", 12 | "moduleResolution": "node", 13 | 
"resolveJsonModule": true, 14 | "isolatedModules": true, 15 | "jsx": "preserve", 16 | "incremental": true 17 | }, 18 | "include": ["next-env.d.ts", "**/*.ts", "**/*.tsx"], 19 | "exclude": ["node_modules"] 20 | } 21 | --------------------------------------------------------------------------------