├── .github ├── CONTRIBUTOR_AGREEMENT.md ├── FUNDING.yml ├── ISSUE_TEMPLATE │ ├── 01_bugs.md │ ├── 02_docs.md │ └── config.yml ├── PULL_REQUEST_TEMPLATE.md ├── contributors │ ├── 0x2b3bfa0.md │ ├── 5hirish.md │ ├── ALSchwalm.md │ ├── AMArostegui.md │ ├── AlJohri.md │ ├── Arvindcheenu.md │ ├── AyushExel.md │ ├── Azagh3l.md │ ├── Baciccin.md │ ├── Bharat123rox.md │ ├── BigstickCarpet.md │ ├── BramVanroy.md │ ├── BreakBB.md │ ├── Bri-Will.md │ ├── Brixjohn.md │ ├── Cinnamy.md │ ├── DeNeutoy.md │ ├── DimaBryuhanov.md │ ├── Dobita21.md │ ├── DoomCoder.md │ ├── DuyguA.md │ ├── EARL_GREYT.md │ ├── Eleni170.md │ ├── EmilStenstrom.md │ ├── F0rge1cE.md │ ├── FallakAsad.md │ ├── GiorgioPorgio.md │ ├── Gizzio.md │ ├── GuiGel.md │ ├── Hazoom.md │ ├── HiromuHota.md │ ├── ICLRandD.md │ ├── IsaacHaze.md │ ├── JKhakpour.md │ ├── Jan-711.md │ ├── JannisTriesToCode.md │ ├── Jette16.md │ ├── KKsharma99.md │ ├── KennethEnevoldsen.md │ ├── Kimahriman.md │ ├── LRAbbade.md │ ├── Loghijiaha.md │ ├── Lucaterre.md │ ├── MartinoMensio.md │ ├── MateuszOlko.md │ ├── MathiasDesch.md │ ├── MiniLau.md │ ├── MisterKeefe.md │ ├── Mlawrence95.md │ ├── NSchrading.md │ ├── NirantK.md │ ├── Nuccy90.md │ ├── Olamyy.md │ ├── Pantalaymon.md │ ├── Pavle992.md │ ├── PeterGilles.md │ ├── PluieElectrique.md │ ├── Poluglottos.md │ ├── PolyglotOpenstreetmap.md │ ├── R1j1t.md │ ├── RvanNieuwpoort.md │ ├── SamEdwardes.md │ ├── SamuelLKane.md │ ├── Schibsted.png │ ├── Stannislav.md │ ├── Tiljander.md │ ├── YohannesDatasci.md │ ├── ZeeD.md │ ├── aajanki.md │ ├── aaronkub.md │ ├── aashishg.md │ ├── abchapman93.md │ ├── abhi18av.md │ ├── adrianeboyd.md │ ├── adrienball.md │ ├── ajrader.md │ ├── akki2825.md │ ├── akornilo.md │ ├── alexcombessie.md │ ├── alexvy86.md │ ├── aliiae.md │ ├── alldefector.md │ ├── alvaroabascar.md │ ├── alvations.md │ ├── ameyuuno.md │ ├── amitness.md │ ├── amperinet.md │ ├── aniruddha-adhikary.md │ ├── ansgar-t.md │ ├── aongko.md │ ├── aristorinjuang.md │ ├── armsp.md │ ├── aryaprabhudesai.md │ ├── askhogan.md │ ├── avadhpatel.md │ ├── avi197.md │ ├── avramandrei.md │ ├── azarezade.md │ ├── b1uec0in.md │ ├── bbieniek.md │ ├── bdewilde.md │ ├── beatesi.md │ ├── bellabie.md │ ├── bintay.md │ ├── bittlingmayer.md │ ├── bjascob.md │ ├── bodak.md │ ├── boena.md │ ├── borijang.md │ ├── bratao.md │ ├── broaddeep.md │ ├── bryant1410.md │ ├── bsweileh.md │ ├── btrungchi.md │ ├── calumcalder.md │ ├── cbilgili.md │ ├── cclauss.md │ ├── cedar101.md │ ├── celikomer.md │ ├── ceteri.md │ ├── charlax.md │ ├── chezou.md │ ├── chopeen.md │ ├── chrisdubois.md │ ├── cicorias.md │ ├── clarus.md │ ├── clippered.md │ ├── connorbrinton.md │ ├── coryhurst.md │ ├── cristianasp.md │ ├── d99kris.md │ ├── danielhers.md │ ├── danielkingai2.md │ ├── danielruf.md │ ├── danielvasic.md │ ├── dardoria.md │ ├── darindf.md │ ├── delzac.md │ ├── demfier.md │ ├── demongolem.md │ ├── dhpollack.md │ ├── dhruvrnaik.md │ ├── doug-descombaz.md │ ├── drndos.md │ ├── dvsrepo.md │ ├── elbaulp.md │ ├── elben10 │ ├── emulbreh.md │ ├── enerrio.md │ ├── er-raoniz.md │ ├── erip.md │ ├── estr4ng7d.md │ ├── ezorita.md │ ├── fgaim.md │ ├── filipecaixeta.md │ ├── fizban99.md │ ├── florijanstamenkovic.md │ ├── fonfonx.md │ ├── forest1988.md │ ├── foufaster.md │ ├── frascuchon.md │ ├── free-variation.md │ ├── fsonntag.md │ ├── fucking-signup.md │ ├── gandersen101.md │ ├── gavrieltal.md │ ├── giannisdaras.md │ ├── graue70.md │ ├── graus.md │ ├── greenriverrus.md │ ├── grivaz.md │ ├── gtoffoli.md │ ├── guerda.md │ ├── gustavengstrom.md │ ├── henry860916.md │ ├── hertelm.md │ 
├── himkt.md │ ├── hiroshi-matsuda-rit.md │ ├── hlasse.md │ ├── holubvl3.md │ ├── honnibal.md │ ├── howl-anderson.md │ ├── hugovk.md │ ├── iann0036.md │ ├── idealley.md │ ├── idoshr.md │ ├── iechevarria.md │ ├── ilivans.md │ ├── ines.md │ ├── intrafindBreno.md │ ├── isaric.md │ ├── iurshina.md │ ├── ivigamberdiev.md │ ├── ivyleavedtoadflax.md │ ├── jabortell.md │ ├── jacopofar.md │ ├── jacse.md │ ├── janimo.md │ ├── jankrepl.md │ ├── jarib.md │ ├── jaydeepborkar.md │ ├── jbesomi.md │ ├── jeannefukumaru.md │ ├── jenojp.md │ ├── jerbob92.md │ ├── jganseman.md │ ├── jgutix.md │ ├── jimregan.md │ ├── jklaise.md │ ├── jmargeta.md │ ├── jmyerston.md │ ├── johnhaley81.md │ ├── jonesmartins.md │ ├── juliamakogon.md │ ├── julien-talkair.md │ ├── juliensalinas.md │ ├── jumasheff.md │ ├── justindujardin.md │ ├── kabirkhan.md │ ├── katarkor.md │ ├── katrinleinweber.md │ ├── kbulygin.md │ ├── keshan.md │ ├── keshav.md │ ├── kevinlu1248.md │ ├── khellan.md │ ├── kimfalk.md │ ├── knoxdw.md │ ├── koaning.md │ ├── kognate.md │ ├── kororo.md │ ├── kowaalczyk.md │ ├── kwhumphreys.md │ ├── laszabine.md │ ├── lauraBaakman.md │ ├── ldorigo.md │ ├── leicmi.md │ ├── leomrocha.md │ ├── leyendecker.md │ ├── lfiedler.md │ ├── ligser.md │ ├── lizhe2004.md │ ├── lorenanda.md │ ├── louisguitton.md │ ├── luvogels.md │ ├── mabraham.md │ ├── magnusburton.md │ ├── mahnerak.md │ ├── mariosasko.md │ ├── markulrich.md │ ├── mauryaland.md │ ├── mbkupfer.md │ ├── mdaudali.md │ ├── mdcclv.md │ ├── mdda.md │ ├── meghanabhange.md │ ├── melanuria.pdf │ ├── merrcury.md │ ├── michael-k.md │ ├── mihaigliga21.md │ ├── mikeizbicki.md │ ├── mikelibg.md │ ├── mirfan899.md │ ├── miroli.md │ ├── mmaybeno.md │ ├── mn3mos.md │ ├── mollerhoj.md │ ├── moreymat.md │ ├── mpszumowski.md │ ├── mpuig.md │ ├── mr-bjerre.md │ ├── msklvsk.md │ ├── munozbravo.md │ ├── myavrum.md │ ├── narayanacharya6.md │ ├── neelkamath.md │ ├── nikhilsaldanha.md │ ├── nipunsadvilkar.md │ ├── njsmith.md │ ├── nlptown.md │ ├── nourshalabi.md │ ├── nsorros.md │ ├── ohenrik.md │ ├── onlyanegg.md │ ├── ophelielacroix.md │ ├── oroszgy.md │ ├── osori.md │ ├── ottosulin.md │ ├── oxinabox.md │ ├── ozcankasal.md │ ├── paoloq.md │ ├── pberba.md │ ├── pbnsilva.md │ ├── peter-exos.md │ ├── phiedulxp.md │ ├── philipvollet.md │ ├── phojnacki.md │ ├── pickfire.md │ ├── pinealan.md │ ├── pktippa.md │ ├── plison.md │ ├── pmbaumgartner.md │ ├── polm.md │ ├── prilopes.md │ ├── punitvara.md │ ├── pzelasko.md │ ├── questoph.md │ ├── rafguns.md │ ├── rahul1990gupta.md │ ├── ramananbalakrishnan.md │ ├── rameshhpathak.md │ ├── rasyidf.md │ ├── reneoctavio.md │ ├── retnuh.md │ ├── revuel.md │ ├── richardliaw.md │ ├── richardpaulhudson.md │ ├── robertsipek.md │ ├── rokasramas.md │ ├── roshni-b.md │ ├── ryanzhe.md │ ├── sabiqueqb.md │ ├── sainathadapa.md │ ├── sammous.md │ ├── savkov.md │ ├── seanBE.md │ ├── sebastienharinck.md │ ├── sevdimali.md │ ├── shigapov.md │ ├── shuvanon.md │ ├── skrcode.md │ ├── sloev.md │ ├── snsten.md │ ├── socool.md │ ├── solarmist.md │ ├── sorenlind.md │ ├── suchow.md │ ├── svlandeg.md │ ├── swfarnsworth.md │ ├── syrull.md │ ├── tamuhey.md │ ├── therealronnie.md │ ├── theudas.md │ ├── thomasbird.md │ ├── thomashacker.md │ ├── thomasopsomer.md │ ├── thomasthiebaud.md │ ├── thoppe.md │ ├── tiangolo.md │ ├── tilusnet.md │ ├── tjkemp.md │ ├── tmetzl.md │ ├── tokestermw.md │ ├── tommilligan.md │ ├── trungtv.md │ ├── tupui.md │ ├── tyburam.md │ ├── tzano.md │ ├── ujwal-narayan.md │ ├── umarbutler.md │ ├── ursachec.md │ ├── uwol.md │ ├── veer-bains.md │ ├── vha14.md │ ├── 
vikaskyadav.md │ ├── vishnumenon.md │ ├── vishnupriyavr.md │ ├── vondersam.md │ ├── vsolovyov.md │ ├── w4nderlust.md │ ├── wallinm1.md │ ├── walterhenry.md │ ├── wannaphongcom.md │ ├── werew.md │ ├── willismonroe.md │ ├── willprice.md │ ├── wojtuch.md │ ├── wxv.md │ ├── x-ji.md │ ├── xadrianzetx.md │ ├── xssChauhan.md │ ├── yanaiela.md │ ├── yaph.md │ ├── yashpatadia.md │ ├── yohasebe.md │ ├── yosiasz.md │ ├── yuukos.md │ ├── zaibacu.md │ ├── zhuorulin.md │ ├── zqhZY.md │ └── zqianem.md ├── spacy_universe_alert.py ├── validate_universe_json.py └── workflows │ ├── cibuildwheel.yml │ ├── explosionbot.yml │ ├── gputests.yml.disabled │ ├── issue-manager.yml │ ├── lock.yml │ ├── publish_pypi.yml │ ├── slowtests.yml.disabled │ ├── spacy_universe_alert.yml │ ├── tests.yml │ └── universe_validation.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CITATION.cff ├── CONTRIBUTING.md ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── bin ├── get-package.sh ├── get-version.sh ├── push-tag.sh └── release.sh ├── build-constraints.txt ├── examples ├── README.md └── training │ └── README.md ├── extra ├── DEVELOPER_DOCS │ ├── Code Conventions.md │ ├── ExplosionBot.md │ ├── Language.md │ ├── Listeners.md │ ├── README.md │ ├── Satellite Packages.md │ └── StringStore-Vocab.md └── example_data │ ├── ner_example_data │ ├── README.md │ ├── ner-sent-per-line.iob │ ├── ner-sent-per-line.json │ ├── ner-token-per-line-conll2003.iob │ ├── ner-token-per-line-conll2003.json │ ├── ner-token-per-line-with-pos.iob │ ├── ner-token-per-line-with-pos.json │ ├── ner-token-per-line.iob │ └── ner-token-per-line.json │ ├── textcat_example_data │ ├── CC0.txt │ ├── CC_BY-SA-3.0.txt │ ├── CC_BY-SA-4.0.txt │ ├── README.md │ ├── cooking.json │ ├── cooking.jsonl │ ├── jigsaw-toxic-comment.json │ ├── jigsaw-toxic-comment.jsonl │ └── textcatjsonl_to_trainjson.py │ ├── training-data.json │ └── vocab-data.jsonl ├── licenses └── 3rd_party_licenses.txt ├── netlify.toml ├── pyproject.toml ├── requirements.txt ├── setup.cfg ├── setup.py ├── spacy ├── __init__.pxd ├── __init__.py ├── __main__.py ├── about.py ├── attrs.pxd ├── attrs.pyx ├── cli │ ├── __init__.py │ ├── _util.py │ ├── apply.py │ ├── assemble.py │ ├── benchmark_speed.py │ ├── convert.py │ ├── debug_config.py │ ├── debug_data.py │ ├── debug_diff.py │ ├── debug_model.py │ ├── download.py │ ├── evaluate.py │ ├── find_function.py │ ├── find_threshold.py │ ├── info.py │ ├── init_config.py │ ├── init_pipeline.py │ ├── package.py │ ├── pretrain.py │ ├── profile.py │ ├── project │ │ ├── __init__.py │ │ ├── assets.py │ │ ├── clone.py │ │ ├── document.py │ │ ├── dvc.py │ │ ├── pull.py │ │ ├── push.py │ │ ├── remote_storage.py │ │ └── run.py │ ├── templates │ │ ├── quickstart_training.jinja │ │ └── quickstart_training_recommendations.yml │ ├── train.py │ └── validate.py ├── compat.py ├── default_config.cfg ├── default_config_pretraining.cfg ├── displacy │ ├── __init__.py │ ├── render.py │ └── templates.py ├── errors.py ├── glossary.py ├── kb │ ├── __init__.py │ ├── candidate.pxd │ ├── candidate.pyx │ ├── kb.pxd │ ├── kb.pyx │ ├── kb_in_memory.pxd │ └── kb_in_memory.pyx ├── lang │ ├── __init__.py │ ├── af │ │ ├── __init__.py │ │ └── stop_words.py │ ├── am │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lex_attrs.py │ │ ├── punctuation.py │ │ ├── stop_words.py │ │ └── tokenizer_exceptions.py │ ├── ar │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lex_attrs.py │ │ ├── punctuation.py │ │ ├── stop_words.py │ │ └── tokenizer_exceptions.py │ ├── az │ │ ├── __init__.py │ │ ├── examples.py │ 
│ ├── lex_attrs.py │ │ └── stop_words.py │ ├── bg │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lex_attrs.py │ │ ├── stop_words.py │ │ └── tokenizer_exceptions.py │ ├── bn │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── punctuation.py │ │ ├── stop_words.py │ │ └── tokenizer_exceptions.py │ ├── bo │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lex_attrs.py │ │ └── stop_words.py │ ├── ca │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lemmatizer.py │ │ ├── lex_attrs.py │ │ ├── punctuation.py │ │ ├── stop_words.py │ │ ├── syntax_iterators.py │ │ └── tokenizer_exceptions.py │ ├── char_classes.py │ ├── cs │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lex_attrs.py │ │ └── stop_words.py │ ├── da │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lex_attrs.py │ │ ├── punctuation.py │ │ ├── stop_words.py │ │ ├── syntax_iterators.py │ │ └── tokenizer_exceptions.py │ ├── de │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── punctuation.py │ │ ├── stop_words.py │ │ ├── syntax_iterators.py │ │ └── tokenizer_exceptions.py │ ├── dsb │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lex_attrs.py │ │ └── stop_words.py │ ├── el │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── get_pos_from_wiktionary.py │ │ ├── lemmatizer.py │ │ ├── lex_attrs.py │ │ ├── punctuation.py │ │ ├── stop_words.py │ │ ├── syntax_iterators.py │ │ └── tokenizer_exceptions.py │ ├── en │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lemmatizer.py │ │ ├── lex_attrs.py │ │ ├── punctuation.py │ │ ├── stop_words.py │ │ ├── syntax_iterators.py │ │ └── tokenizer_exceptions.py │ ├── es │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lemmatizer.py │ │ ├── lex_attrs.py │ │ ├── punctuation.py │ │ ├── stop_words.py │ │ ├── syntax_iterators.py │ │ └── tokenizer_exceptions.py │ ├── et │ │ ├── __init__.py │ │ └── stop_words.py │ ├── eu │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lex_attrs.py │ │ ├── punctuation.py │ │ └── stop_words.py │ ├── fa │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── generate_verbs_exc.py │ │ ├── lex_attrs.py │ │ ├── punctuation.py │ │ ├── stop_words.py │ │ ├── syntax_iterators.py │ │ └── tokenizer_exceptions.py │ ├── fi │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lex_attrs.py │ │ ├── punctuation.py │ │ ├── stop_words.py │ │ ├── syntax_iterators.py │ │ └── tokenizer_exceptions.py │ ├── fo │ │ ├── __init__.py │ │ └── tokenizer_exceptions.py │ ├── fr │ │ ├── __init__.py │ │ ├── _tokenizer_exceptions_list.py │ │ ├── examples.py │ │ ├── lemmatizer.py │ │ ├── lex_attrs.py │ │ ├── punctuation.py │ │ ├── stop_words.py │ │ ├── syntax_iterators.py │ │ └── tokenizer_exceptions.py │ ├── ga │ │ ├── __init__.py │ │ ├── lemmatizer.py │ │ ├── stop_words.py │ │ └── tokenizer_exceptions.py │ ├── gd │ │ ├── __init__.py │ │ ├── stop_words.py │ │ └── tokenizer_exceptions.py │ ├── grc │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lex_attrs.py │ │ ├── punctuation.py │ │ ├── stop_words.py │ │ └── tokenizer_exceptions.py │ ├── gu │ │ ├── __init__.py │ │ ├── examples.py │ │ └── stop_words.py │ ├── he │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lex_attrs.py │ │ └── stop_words.py │ ├── hi │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lex_attrs.py │ │ └── stop_words.py │ ├── hr │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lemma_lookup_license.txt │ │ └── stop_words.py │ ├── hsb │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lex_attrs.py │ │ ├── stop_words.py │ │ └── tokenizer_exceptions.py │ ├── ht │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lemmatizer.py │ │ ├── lex_attrs.py │ │ ├── punctuation.py │ │ ├── stop_words.py │ │ ├── 
syntax_iterators.py │ │ ├── tag_map.py │ │ └── tokenizer_exceptions.py │ ├── hu │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── punctuation.py │ │ ├── stop_words.py │ │ └── tokenizer_exceptions.py │ ├── hy │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lex_attrs.py │ │ └── stop_words.py │ ├── id │ │ ├── __init__.py │ │ ├── _tokenizer_exceptions_list.py │ │ ├── examples.py │ │ ├── lex_attrs.py │ │ ├── punctuation.py │ │ ├── stop_words.py │ │ ├── syntax_iterators.py │ │ └── tokenizer_exceptions.py │ ├── is │ │ ├── __init__.py │ │ └── stop_words.py │ ├── it │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lemmatizer.py │ │ ├── punctuation.py │ │ ├── stop_words.py │ │ ├── syntax_iterators.py │ │ └── tokenizer_exceptions.py │ ├── ja │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── stop_words.py │ │ ├── syntax_iterators.py │ │ ├── tag_bigram_map.py │ │ ├── tag_map.py │ │ └── tag_orth_map.py │ ├── kmr │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lex_attrs.py │ │ └── stop_words.py │ ├── kn │ │ ├── __init__.py │ │ ├── examples.py │ │ └── stop_words.py │ ├── ko │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lex_attrs.py │ │ ├── punctuation.py │ │ ├── stop_words.py │ │ └── tag_map.py │ ├── ky │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lex_attrs.py │ │ ├── punctuation.py │ │ ├── stop_words.py │ │ └── tokenizer_exceptions.py │ ├── la │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lex_attrs.py │ │ ├── stop_words.py │ │ ├── syntax_iterators.py │ │ └── tokenizer_exceptions.py │ ├── lb │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lex_attrs.py │ │ ├── punctuation.py │ │ ├── stop_words.py │ │ └── tokenizer_exceptions.py │ ├── lex_attrs.py │ ├── lg │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lex_attrs.py │ │ ├── punctuation.py │ │ └── stop_words.py │ ├── lij │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── punctuation.py │ │ ├── stop_words.py │ │ └── tokenizer_exceptions.py │ ├── lt │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lex_attrs.py │ │ ├── punctuation.py │ │ ├── stop_words.py │ │ └── tokenizer_exceptions.py │ ├── lv │ │ ├── __init__.py │ │ └── stop_words.py │ ├── mk │ │ ├── __init__.py │ │ ├── lemmatizer.py │ │ ├── lex_attrs.py │ │ ├── stop_words.py │ │ └── tokenizer_exceptions.py │ ├── ml │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lex_attrs.py │ │ └── stop_words.py │ ├── mr │ │ ├── __init__.py │ │ └── stop_words.py │ ├── ms │ │ ├── __init__.py │ │ ├── _tokenizer_exceptions_list.py │ │ ├── examples.py │ │ ├── lex_attrs.py │ │ ├── punctuation.py │ │ ├── stop_words.py │ │ ├── syntax_iterators.py │ │ └── tokenizer_exceptions.py │ ├── nb │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── punctuation.py │ │ ├── stop_words.py │ │ ├── syntax_iterators.py │ │ └── tokenizer_exceptions.py │ ├── ne │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lex_attrs.py │ │ └── stop_words.py │ ├── nl │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lemmatizer.py │ │ ├── lex_attrs.py │ │ ├── punctuation.py │ │ ├── stop_words.py │ │ ├── syntax_iterators.py │ │ └── tokenizer_exceptions.py │ ├── nn │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── punctuation.py │ │ └── tokenizer_exceptions.py │ ├── norm_exceptions.py │ ├── pl │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lemmatizer.py │ │ ├── lex_attrs.py │ │ ├── punctuation.py │ │ └── stop_words.py │ ├── pt │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lex_attrs.py │ │ ├── punctuation.py │ │ ├── stop_words.py │ │ ├── syntax_iterators.py │ │ └── tokenizer_exceptions.py │ ├── punctuation.py │ ├── ro │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── 
lex_attrs.py │ │ ├── punctuation.py │ │ ├── stop_words.py │ │ └── tokenizer_exceptions.py │ ├── ru │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lemmatizer.py │ │ ├── lex_attrs.py │ │ ├── stop_words.py │ │ └── tokenizer_exceptions.py │ ├── sa │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lex_attrs.py │ │ └── stop_words.py │ ├── si │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lex_attrs.py │ │ └── stop_words.py │ ├── sk │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lex_attrs.py │ │ └── stop_words.py │ ├── sl │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lex_attrs.py │ │ ├── punctuation.py │ │ ├── stop_words.py │ │ └── tokenizer_exceptions.py │ ├── sq │ │ ├── __init__.py │ │ ├── examples.py │ │ └── stop_words.py │ ├── sr │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lemma_lookup_licence.txt │ │ ├── lex_attrs.py │ │ ├── punctuation.py │ │ ├── stop_words.py │ │ └── tokenizer_exceptions.py │ ├── sv │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lex_attrs.py │ │ ├── punctuation.py │ │ ├── stop_words.py │ │ ├── syntax_iterators.py │ │ └── tokenizer_exceptions.py │ ├── ta │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lex_attrs.py │ │ └── stop_words.py │ ├── te │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lex_attrs.py │ │ └── stop_words.py │ ├── th │ │ ├── __init__.py │ │ ├── lex_attrs.py │ │ ├── stop_words.py │ │ └── tokenizer_exceptions.py │ ├── ti │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lex_attrs.py │ │ ├── punctuation.py │ │ ├── stop_words.py │ │ └── tokenizer_exceptions.py │ ├── tl │ │ ├── __init__.py │ │ ├── lex_attrs.py │ │ ├── stop_words.py │ │ └── tokenizer_exceptions.py │ ├── tn │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lex_attrs.py │ │ ├── punctuation.py │ │ └── stop_words.py │ ├── tokenizer_exceptions.py │ ├── tr │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lex_attrs.py │ │ ├── stop_words.py │ │ ├── syntax_iterators.py │ │ └── tokenizer_exceptions.py │ ├── tt │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lex_attrs.py │ │ ├── punctuation.py │ │ ├── stop_words.py │ │ └── tokenizer_exceptions.py │ ├── uk │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lemmatizer.py │ │ ├── lex_attrs.py │ │ ├── stop_words.py │ │ └── tokenizer_exceptions.py │ ├── ur │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lex_attrs.py │ │ ├── punctuation.py │ │ └── stop_words.py │ ├── vi │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lex_attrs.py │ │ └── stop_words.py │ ├── xx │ │ ├── __init__.py │ │ └── examples.py │ ├── yo │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lex_attrs.py │ │ └── stop_words.py │ └── zh │ │ ├── __init__.py │ │ ├── examples.py │ │ ├── lex_attrs.py │ │ └── stop_words.py ├── language.py ├── lexeme.pxd ├── lexeme.pyi ├── lexeme.pyx ├── lookups.py ├── matcher │ ├── __init__.py │ ├── dependencymatcher.pyi │ ├── dependencymatcher.pyx │ ├── levenshtein.pyx │ ├── matcher.pxd │ ├── matcher.pyi │ ├── matcher.pyx │ ├── phrasematcher.pxd │ ├── phrasematcher.pyi │ ├── phrasematcher.pyx │ └── polyleven.c ├── ml │ ├── __init__.py │ ├── _character_embed.py │ ├── _precomputable_affine.py │ ├── callbacks.py │ ├── extract_ngrams.py │ ├── extract_spans.py │ ├── featureextractor.py │ ├── models │ │ ├── __init__.py │ │ ├── entity_linker.py │ │ ├── multi_task.py │ │ ├── parser.py │ │ ├── span_finder.py │ │ ├── spancat.py │ │ ├── tagger.py │ │ ├── textcat.py │ │ └── tok2vec.py │ ├── parser_model.pxd │ ├── parser_model.pyx │ ├── staticvectors.py │ └── tb_framework.py ├── morphology.pxd ├── morphology.pyx ├── parts_of_speech.pxd ├── parts_of_speech.pyx ├── pipe_analysis.py ├── 
pipeline │ ├── __init__.py │ ├── _edit_tree_internals │ │ ├── __init__.py │ │ ├── edit_trees.pxd │ │ ├── edit_trees.pyx │ │ └── schemas.py │ ├── _parser_internals │ │ ├── __init__.pxd │ │ ├── __init__.py │ │ ├── _beam_utils.pxd │ │ ├── _beam_utils.pyx │ │ ├── _state.pxd │ │ ├── _state.pyx │ │ ├── arc_eager.pxd │ │ ├── arc_eager.pyx │ │ ├── ner.pxd │ │ ├── ner.pyx │ │ ├── nonproj.hh │ │ ├── nonproj.pxd │ │ ├── nonproj.pyx │ │ ├── stateclass.pxd │ │ ├── stateclass.pyx │ │ ├── transition_system.pxd │ │ └── transition_system.pyx │ ├── attributeruler.py │ ├── dep_parser.pyx │ ├── edit_tree_lemmatizer.py │ ├── entity_linker.py │ ├── entityruler.py │ ├── factories.py │ ├── functions.py │ ├── legacy │ │ ├── __init__.py │ │ └── entity_linker.py │ ├── lemmatizer.py │ ├── morphologizer.pyx │ ├── multitask.pyx │ ├── ner.pyx │ ├── pipe.pxd │ ├── pipe.pyi │ ├── pipe.pyx │ ├── sentencizer.pyx │ ├── senter.pyx │ ├── span_finder.py │ ├── span_ruler.py │ ├── spancat.py │ ├── tagger.pyx │ ├── textcat.py │ ├── textcat_multilabel.py │ ├── tok2vec.py │ ├── trainable_pipe.pxd │ ├── trainable_pipe.pyx │ ├── transition_parser.pxd │ └── transition_parser.pyx ├── py.typed ├── registrations.py ├── schemas.py ├── scorer.py ├── strings.pxd ├── strings.pyi ├── strings.pyx ├── structs.pxd ├── symbols.pxd ├── symbols.pyx ├── tests │ ├── README.md │ ├── __init__.py │ ├── conftest.py │ ├── doc │ │ ├── __init__.py │ │ ├── test_add_entities.py │ │ ├── test_array.py │ │ ├── test_creation.py │ │ ├── test_doc_api.py │ │ ├── test_graph.py │ │ ├── test_json_doc_conversion.py │ │ ├── test_morphanalysis.py │ │ ├── test_pickle_doc.py │ │ ├── test_retokenize_merge.py │ │ ├── test_retokenize_split.py │ │ ├── test_span.py │ │ ├── test_span_group.py │ │ ├── test_token_api.py │ │ └── test_underscore.py │ ├── enable_gpu.py │ ├── factory_registrations.json │ ├── lang │ │ ├── __init__.py │ │ ├── af │ │ │ ├── __init__.py │ │ │ ├── test_text.py │ │ │ └── test_tokenizer.py │ │ ├── am │ │ │ ├── __init__.py │ │ │ ├── test_exception.py │ │ │ └── test_text.py │ │ ├── ar │ │ │ ├── __init__.py │ │ │ ├── test_exceptions.py │ │ │ └── test_text.py │ │ ├── bg │ │ │ ├── test_text.py │ │ │ └── test_tokenizer.py │ │ ├── bn │ │ │ ├── __init__.py │ │ │ └── test_tokenizer.py │ │ ├── bo │ │ │ ├── __init__.py │ │ │ └── test_text.py │ │ ├── ca │ │ │ ├── __init__.py │ │ │ ├── test_exception.py │ │ │ ├── test_prefix_suffix_infix.py │ │ │ └── test_text.py │ │ ├── cs │ │ │ ├── __init__.py │ │ │ └── test_text.py │ │ ├── da │ │ │ ├── __init__.py │ │ │ ├── test_exceptions.py │ │ │ ├── test_noun_chunks.py │ │ │ ├── test_prefix_suffix_infix.py │ │ │ └── test_text.py │ │ ├── de │ │ │ ├── __init__.py │ │ │ ├── test_exceptions.py │ │ │ ├── test_noun_chunks.py │ │ │ ├── test_parser.py │ │ │ ├── test_prefix_suffix_infix.py │ │ │ └── test_text.py │ │ ├── dsb │ │ │ ├── __init__.py │ │ │ ├── test_text.py │ │ │ └── test_tokenizer.py │ │ ├── el │ │ │ ├── __init__.py │ │ │ ├── test_exception.py │ │ │ ├── test_noun_chunks.py │ │ │ └── test_text.py │ │ ├── en │ │ │ ├── __init__.py │ │ │ ├── test_customized_tokenizer.py │ │ │ ├── test_exceptions.py │ │ │ ├── test_indices.py │ │ │ ├── test_noun_chunks.py │ │ │ ├── test_parser.py │ │ │ ├── test_prefix_suffix_infix.py │ │ │ ├── test_punct.py │ │ │ ├── test_sbd.py │ │ │ ├── test_text.py │ │ │ └── test_tokenizer.py │ │ ├── es │ │ │ ├── __init__.py │ │ │ ├── test_exception.py │ │ │ ├── test_noun_chunks.py │ │ │ └── test_text.py │ │ ├── et │ │ │ ├── __init__.py │ │ │ ├── test_text.py │ │ │ └── test_tokenizer.py │ │ ├── eu │ │ │ ├── 
__init__.py │ │ │ └── test_text.py │ │ ├── fa │ │ │ ├── __init__.py │ │ │ └── test_noun_chunks.py │ │ ├── fi │ │ │ ├── __init__.py │ │ │ ├── test_noun_chunks.py │ │ │ ├── test_text.py │ │ │ └── test_tokenizer.py │ │ ├── fo │ │ │ ├── __init__.py │ │ │ └── test_tokenizer.py │ │ ├── fr │ │ │ ├── __init__.py │ │ │ ├── test_exceptions.py │ │ │ ├── test_noun_chunks.py │ │ │ ├── test_prefix_suffix_infix.py │ │ │ └── test_text.py │ │ ├── ga │ │ │ ├── __init__.py │ │ │ └── test_tokenizer.py │ │ ├── grc │ │ │ ├── __init__.py │ │ │ ├── test_text.py │ │ │ └── test_tokenizer.py │ │ ├── gu │ │ │ ├── __init__.py │ │ │ └── test_text.py │ │ ├── he │ │ │ ├── __init__.py │ │ │ └── test_tokenizer.py │ │ ├── hi │ │ │ ├── __init__.py │ │ │ ├── test_lex_attrs.py │ │ │ └── test_text.py │ │ ├── hr │ │ │ ├── __init__.py │ │ │ ├── test_text.py │ │ │ └── test_tokenizer.py │ │ ├── hsb │ │ │ ├── __init__.py │ │ │ ├── test_text.py │ │ │ └── test_tokenizer.py │ │ ├── ht │ │ │ ├── __init__.py │ │ │ ├── test_exceptions.py │ │ │ ├── test_noun_chunks.py │ │ │ ├── test_prefix_suffix_infix.py │ │ │ └── test_text.py │ │ ├── hu │ │ │ ├── __init__.py │ │ │ └── test_tokenizer.py │ │ ├── hy │ │ │ ├── __init__.py │ │ │ ├── test_text.py │ │ │ └── test_tokenizer.py │ │ ├── id │ │ │ ├── __init__.py │ │ │ ├── test_noun_chunks.py │ │ │ ├── test_prefix_suffix_infix.py │ │ │ └── test_text.py │ │ ├── is │ │ │ ├── __init__.py │ │ │ ├── test_text.py │ │ │ └── test_tokenizer.py │ │ ├── it │ │ │ ├── __init__.py │ │ │ ├── test_noun_chunks.py │ │ │ ├── test_prefix_suffix_infix.py │ │ │ ├── test_stopwords.py │ │ │ └── test_text.py │ │ ├── ja │ │ │ ├── __init__.py │ │ │ ├── test_lemmatization.py │ │ │ ├── test_morphologizer_factory.py │ │ │ ├── test_serialize.py │ │ │ └── test_tokenizer.py │ │ ├── kmr │ │ │ ├── __init__.py │ │ │ └── test_text.py │ │ ├── ko │ │ │ ├── __init__.py │ │ │ ├── test_lemmatization.py │ │ │ ├── test_serialize.py │ │ │ └── test_tokenizer.py │ │ ├── ky │ │ │ ├── __init__.py │ │ │ └── test_tokenizer.py │ │ ├── la │ │ │ ├── __init__.py │ │ │ ├── test_exception.py │ │ │ ├── test_noun_chunks.py │ │ │ └── test_text.py │ │ ├── lb │ │ │ ├── __init__.py │ │ │ ├── test_exceptions.py │ │ │ ├── test_prefix_suffix_infix.py │ │ │ └── test_text.py │ │ ├── lg │ │ │ ├── __init__.py │ │ │ └── test_tokenizer.py │ │ ├── lt │ │ │ ├── __init__.py │ │ │ └── test_text.py │ │ ├── lv │ │ │ ├── __init__.py │ │ │ ├── test_text.py │ │ │ └── test_tokenizer.py │ │ ├── mk │ │ │ ├── __init__.py │ │ │ └── test_text.py │ │ ├── ml │ │ │ ├── __init__.py │ │ │ └── test_text.py │ │ ├── ms │ │ │ ├── __init__.py │ │ │ ├── test_noun_chunks.py │ │ │ ├── test_prefix_suffix_infix.py │ │ │ └── test_text.py │ │ ├── nb │ │ │ ├── __init__.py │ │ │ ├── test_noun_chunks.py │ │ │ └── test_tokenizer.py │ │ ├── ne │ │ │ ├── __init__.py │ │ │ └── test_text.py │ │ ├── nl │ │ │ ├── __init__.py │ │ │ ├── test_noun_chunks.py │ │ │ └── test_text.py │ │ ├── nn │ │ │ ├── __init__.py │ │ │ └── test_tokenizer.py │ │ ├── pl │ │ │ ├── __init__.py │ │ │ ├── test_text.py │ │ │ └── test_tokenizer.py │ │ ├── pt │ │ │ ├── __init__.py │ │ │ ├── test_noun_chunks.py │ │ │ └── test_text.py │ │ ├── ro │ │ │ ├── __init__.py │ │ │ └── test_tokenizer.py │ │ ├── ru │ │ │ ├── __init__.py │ │ │ ├── test_exceptions.py │ │ │ ├── test_lemmatizer.py │ │ │ ├── test_text.py │ │ │ └── test_tokenizer.py │ │ ├── sa │ │ │ ├── __init__.py │ │ │ └── test_text.py │ │ ├── sk │ │ │ ├── __init__.py │ │ │ ├── test_text.py │ │ │ └── test_tokenizer.py │ │ ├── sl │ │ │ ├── __init__.py │ │ │ ├── test_text.py │ │ │ └── 
test_tokenizer.py │ │ ├── sq │ │ │ ├── __init__.py │ │ │ ├── test_text.py │ │ │ └── test_tokenizer.py │ │ ├── sr │ │ │ ├── __init__.py │ │ │ ├── test_exceptions.py │ │ │ └── test_tokenizer.py │ │ ├── sv │ │ │ ├── __init__.py │ │ │ ├── test_exceptions.py │ │ │ ├── test_lex_attrs.py │ │ │ ├── test_noun_chunks.py │ │ │ ├── test_prefix_suffix_infix.py │ │ │ ├── test_text.py │ │ │ └── test_tokenizer.py │ │ ├── ta │ │ │ ├── __init__.py │ │ │ ├── test_text.py │ │ │ └── test_tokenizer.py │ │ ├── test_attrs.py │ │ ├── test_initialize.py │ │ ├── test_lemmatizers.py │ │ ├── th │ │ │ ├── __init__.py │ │ │ ├── test_serialize.py │ │ │ └── test_tokenizer.py │ │ ├── ti │ │ │ ├── __init__.py │ │ │ ├── test_exception.py │ │ │ └── test_text.py │ │ ├── tl │ │ │ ├── __init__.py │ │ │ ├── test_indices.py │ │ │ ├── test_punct.py │ │ │ └── test_text.py │ │ ├── tr │ │ │ ├── __init__.py │ │ │ ├── test_noun_chunks.py │ │ │ ├── test_parser.py │ │ │ ├── test_text.py │ │ │ └── test_tokenizer.py │ │ ├── tt │ │ │ ├── __init__.py │ │ │ └── test_tokenizer.py │ │ ├── uk │ │ │ ├── __init__.py │ │ │ ├── test_lemmatizer.py │ │ │ ├── test_tokenizer.py │ │ │ └── test_tokenizer_exc.py │ │ ├── ur │ │ │ ├── __init__.py │ │ │ ├── test_prefix_suffix_infix.py │ │ │ └── test_text.py │ │ ├── vi │ │ │ ├── __init__.py │ │ │ ├── test_serialize.py │ │ │ └── test_tokenizer.py │ │ ├── xx │ │ │ ├── __init__.py │ │ │ ├── test_text.py │ │ │ └── test_tokenizer.py │ │ ├── yo │ │ │ ├── __init__.py │ │ │ └── test_text.py │ │ └── zh │ │ │ ├── __init__.py │ │ │ ├── test_serialize.py │ │ │ ├── test_text.py │ │ │ └── test_tokenizer.py │ ├── matcher │ │ ├── __init__.py │ │ ├── test_dependency_matcher.py │ │ ├── test_levenshtein.py │ │ ├── test_matcher_api.py │ │ ├── test_matcher_logic.py │ │ ├── test_pattern_validation.py │ │ └── test_phrase_matcher.py │ ├── morphology │ │ ├── __init__.py │ │ ├── test_morph_converters.py │ │ ├── test_morph_features.py │ │ └── test_morph_pickle.py │ ├── package │ │ ├── __init__.py │ │ └── test_requirements.py │ ├── parser │ │ ├── __init__.py │ │ ├── test_add_label.py │ │ ├── test_arc_eager_oracle.py │ │ ├── test_ner.py │ │ ├── test_neural_parser.py │ │ ├── test_nn_beam.py │ │ ├── test_nonproj.py │ │ ├── test_parse.py │ │ ├── test_parse_navigate.py │ │ ├── test_preset_sbd.py │ │ ├── test_space_attachment.py │ │ └── test_state.py │ ├── pipeline │ │ ├── __init__.py │ │ ├── test_analysis.py │ │ ├── test_annotates_on_update.py │ │ ├── test_attributeruler.py │ │ ├── test_edit_tree_lemmatizer.py │ │ ├── test_entity_linker.py │ │ ├── test_entity_ruler.py │ │ ├── test_functions.py │ │ ├── test_initialize.py │ │ ├── test_lemmatizer.py │ │ ├── test_models.py │ │ ├── test_morphologizer.py │ │ ├── test_pipe_factories.py │ │ ├── test_pipe_methods.py │ │ ├── test_sentencizer.py │ │ ├── test_senter.py │ │ ├── test_span_finder.py │ │ ├── test_span_ruler.py │ │ ├── test_spancat.py │ │ ├── test_tagger.py │ │ ├── test_textcat.py │ │ └── test_tok2vec.py │ ├── registry_contents.json │ ├── serialize │ │ ├── __init__.py │ │ ├── test_resource_warning.py │ │ ├── test_serialize_config.py │ │ ├── test_serialize_doc.py │ │ ├── test_serialize_docbin.py │ │ ├── test_serialize_extension_attrs.py │ │ ├── test_serialize_kb.py │ │ ├── test_serialize_language.py │ │ ├── test_serialize_pipeline.py │ │ ├── test_serialize_span_groups.py │ │ ├── test_serialize_tokenizer.py │ │ └── test_serialize_vocab_strings.py │ ├── test_architectures.py │ ├── test_cli.py │ ├── test_cli_app.py │ ├── test_displacy.py │ ├── test_errors.py │ ├── test_factory_imports.py │ ├── 
test_factory_registrations.py │ ├── test_language.py │ ├── test_misc.py │ ├── test_models.py │ ├── test_pickles.py │ ├── test_registry_population.py │ ├── test_scorer.py │ ├── test_ty.py │ ├── tok2vec.py │ ├── tokenizer │ │ ├── __init__.py │ │ ├── sun.txt │ │ ├── test_exceptions.py │ │ ├── test_explain.py │ │ ├── test_naughty_strings.py │ │ ├── test_tokenizer.py │ │ ├── test_urls.py │ │ └── test_whitespace.py │ ├── training │ │ ├── __init__.py │ │ ├── test_augmenters.py │ │ ├── test_corpus.py │ │ ├── test_logger.py │ │ ├── test_new_example.py │ │ ├── test_pretraining.py │ │ ├── test_readers.py │ │ ├── test_rehearse.py │ │ └── test_training.py │ ├── util.py │ └── vocab_vectors │ │ ├── __init__.py │ │ ├── test_lexeme.py │ │ ├── test_lookups.py │ │ ├── test_memory_zone.py │ │ ├── test_similarity.py │ │ ├── test_stringstore.py │ │ ├── test_vectors.py │ │ └── test_vocab_api.py ├── tokenizer.pxd ├── tokenizer.pyx ├── tokens │ ├── __init__.pxd │ ├── __init__.py │ ├── _dict_proxies.py │ ├── _retokenize.pyi │ ├── _retokenize.pyx │ ├── _serialize.py │ ├── doc.pxd │ ├── doc.pyi │ ├── doc.pyx │ ├── graph.pxd │ ├── graph.pyx │ ├── morphanalysis.pxd │ ├── morphanalysis.pyi │ ├── morphanalysis.pyx │ ├── span.pxd │ ├── span.pyi │ ├── span.pyx │ ├── span_group.pxd │ ├── span_group.pyi │ ├── span_group.pyx │ ├── token.pxd │ ├── token.pyi │ ├── token.pyx │ └── underscore.py ├── training │ ├── __init__.pxd │ ├── __init__.py │ ├── align.pyx │ ├── alignment.py │ ├── alignment_array.pxd │ ├── alignment_array.pyx │ ├── augment.py │ ├── batchers.py │ ├── callbacks.py │ ├── converters │ │ ├── __init__.py │ │ ├── conll_ner_to_docs.py │ │ ├── conllu_to_docs.py │ │ ├── iob_to_docs.py │ │ └── json_to_docs.py │ ├── corpus.py │ ├── example.pxd │ ├── example.pyi │ ├── example.pyx │ ├── gold_io.pyx │ ├── initialize.py │ ├── iob_utils.py │ ├── loggers.py │ ├── loop.py │ └── pretrain.py ├── ty.py ├── typedefs.pxd ├── typedefs.pyx ├── util.py ├── vectors.pyx ├── vocab.pxd ├── vocab.pyi └── vocab.pyx └── website ├── .dockerignore ├── .eslintrc ├── .eslintrc.json ├── .gitignore ├── .nvmrc ├── .prettierignore ├── .prettierrc ├── .vscode └── extensions.json ├── Dockerfile ├── README.md ├── UNIVERSE.md ├── docs ├── api │ ├── architectures.mdx │ ├── attributeruler.mdx │ ├── attributes.mdx │ ├── basevectors.mdx │ ├── cli.mdx │ ├── coref.mdx │ ├── corpus.mdx │ ├── curatedtransformer.mdx │ ├── cython-classes.mdx │ ├── cython-structs.mdx │ ├── cython.mdx │ ├── data-formats.mdx │ ├── dependencymatcher.mdx │ ├── dependencyparser.mdx │ ├── doc.mdx │ ├── docbin.mdx │ ├── edittreelemmatizer.mdx │ ├── entitylinker.mdx │ ├── entityrecognizer.mdx │ ├── entityruler.mdx │ ├── example.mdx │ ├── index.mdx │ ├── inmemorylookupkb.mdx │ ├── kb.mdx │ ├── language.mdx │ ├── large-language-models.mdx │ ├── legacy.mdx │ ├── lemmatizer.mdx │ ├── lexeme.mdx │ ├── lookups.mdx │ ├── matcher.mdx │ ├── morphologizer.mdx │ ├── morphology.mdx │ ├── phrasematcher.mdx │ ├── pipe.mdx │ ├── pipeline-functions.mdx │ ├── scorer.mdx │ ├── sentencerecognizer.mdx │ ├── sentencizer.mdx │ ├── span-resolver.mdx │ ├── span.mdx │ ├── spancategorizer.mdx │ ├── spanfinder.mdx │ ├── spangroup.mdx │ ├── spanruler.mdx │ ├── stringstore.mdx │ ├── tagger.mdx │ ├── textcategorizer.mdx │ ├── tok2vec.mdx │ ├── token.mdx │ ├── tokenizer.mdx │ ├── top-level.mdx │ ├── transformer.mdx │ ├── vectors.mdx │ └── vocab.mdx ├── images │ └── displacy-long2.html ├── models │ └── index.mdx ├── styleguide.mdx └── usage │ ├── 101 │ ├── _architecture.mdx │ ├── _language-data.mdx │ ├── 
_named-entities.mdx │ ├── _pipelines.mdx │ ├── _pos-deps.mdx │ ├── _serialization.mdx │ ├── _tokenization.mdx │ ├── _training.mdx │ └── _vectors-similarity.mdx │ ├── _benchmarks-models.mdx │ ├── embeddings-transformers.mdx │ ├── facts-figures.mdx │ ├── index.mdx │ ├── large-language-models.mdx │ ├── layers-architectures.mdx │ ├── linguistic-features.mdx │ ├── memory-management.mdx │ ├── models.mdx │ ├── processing-pipelines.mdx │ ├── projects.mdx │ ├── rule-based-matching.mdx │ ├── saving-loading.mdx │ ├── spacy-101.mdx │ ├── training.mdx │ ├── v2-1.mdx │ ├── v2-2.mdx │ ├── v2-3.mdx │ ├── v2.mdx │ ├── v3-1.mdx │ ├── v3-2.mdx │ ├── v3-3.mdx │ ├── v3-4.mdx │ ├── v3-5.mdx │ ├── v3-6.mdx │ ├── v3-7.mdx │ ├── v3.mdx │ └── visualizers.mdx ├── meta ├── dynamicMeta.mjs ├── languageSorted.tsx ├── languages.json ├── recordLanguages.tsx ├── recordSections.tsx ├── recordUniverse.tsx ├── sidebarFlat.tsx ├── sidebars.json ├── site.json ├── type-annotations.json └── universe.json ├── netlify.toml ├── next-sitemap.config.mjs ├── next.config.mjs ├── package-lock.json ├── package.json ├── pages ├── 404.js ├── [...listPathPage].tsx ├── _app.tsx ├── _document.tsx ├── index.tsx ├── models │ └── [slug].tsx └── universe │ ├── category │ └── [slug].tsx │ ├── index.tsx │ └── project │ └── [slug].tsx ├── plugins ├── getProps.mjs ├── index.mjs ├── remarkCodeBlocks.mjs ├── remarkCustomAttrs.mjs ├── remarkFindAndReplace.mjs └── remarkWrapSections.mjs ├── public ├── favicon.ico ├── icons │ ├── icon-192x192.png │ ├── icon-256x256.png │ ├── icon-384x384.png │ └── icon-512x512.png ├── images │ ├── architecture.svg │ ├── cli_init_fill-config_diff.jpg │ ├── course.jpg │ ├── dep-match-diagram.svg │ ├── displacy-compact.svg │ ├── displacy-custom-parser.svg │ ├── displacy-dep-founded.svg │ ├── displacy-long.svg │ ├── displacy-long2.svg │ ├── displacy-model-rules.svg │ ├── displacy-model-rules2.svg │ ├── displacy-small.svg │ ├── displacy.svg │ ├── displacy_jupyter.jpg │ ├── huggingface_hub.jpg │ ├── lifecycle.svg │ ├── matcher-demo.jpg │ ├── pipeline-design.svg │ ├── pipeline.svg │ ├── pipeline_transformer.svg │ ├── prodigy.jpg │ ├── prodigy_overview.jpg │ ├── prodigy_spans-manual.jpg │ ├── prodigy_train_curve.jpg │ ├── project_document.jpg │ ├── projects.png │ ├── projects.svg │ ├── sense2vec.jpg │ ├── spacy-extension-demo.gif │ ├── spacy-ray.svg │ ├── spacy-streamlit.png │ ├── spacy-tailored-pipelines_wide.png │ ├── thinc_mypy.jpg │ ├── tok2vec-listener.svg │ ├── tok2vec.svg │ ├── tokenization.svg │ ├── trainable_component.svg │ ├── training.svg │ ├── vocab_stringstore.svg │ ├── wandb1.jpg │ └── wandb2.jpg ├── manifest.webmanifest └── vercel.svg ├── runtime.txt ├── setup ├── jinja_to_js.py ├── requirements.txt └── setup.sh ├── src ├── components │ ├── accordion.js │ ├── alert.js │ ├── aside.js │ ├── button.js │ ├── card.js │ ├── code.js │ ├── codeBlock.js │ ├── codeDynamic.js │ ├── copy.js │ ├── dropdown.js │ ├── embed.js │ ├── footer.js │ ├── github.js │ ├── grid.js │ ├── htmlToReact.js │ ├── icon.js │ ├── infobox.js │ ├── inlineCode.js │ ├── juniper.js │ ├── landing.js │ ├── link.js │ ├── list.js │ ├── main.js │ ├── markdownToReact.js │ ├── markdownToReactDynamic.js │ ├── navigation.js │ ├── newsletter.js │ ├── progress.js │ ├── quickstart.js │ ├── readnext.js │ ├── search.js │ ├── section.js │ ├── seo.js │ ├── sidebar.js │ ├── table.js │ ├── tag.js │ ├── title.js │ ├── typeAnnotation.js │ ├── typography.js │ └── util.js ├── fonts │ ├── hkgrotesk-bold.woff │ ├── hkgrotesk-bold.woff2 │ ├── hkgrotesk-bolditalic.woff │ ├── 
hkgrotesk-bolditalic.woff2 │ ├── hkgrotesk-semibold.woff │ ├── hkgrotesk-semibold.woff2 │ ├── hkgrotesk-semibolditalic.woff │ ├── hkgrotesk-semibolditalic.woff2 │ ├── jetbrainsmono-italic.woff │ ├── jetbrainsmono-italic.woff2 │ ├── jetbrainsmono-regular.woff │ └── jetbrainsmono-regular.woff2 ├── images │ ├── explosion.svg │ ├── icon.png │ ├── icon_legacy.png │ ├── icon_nightly.png │ ├── icons │ │ ├── accept.svg │ │ ├── arrow-right.svg │ │ ├── clipboard.svg │ │ ├── code.svg │ │ ├── docs.svg │ │ ├── download.svg │ │ ├── github.svg │ │ ├── help-outline.svg │ │ ├── help.svg │ │ ├── info.svg │ │ ├── moon.svg │ │ ├── network.svg │ │ ├── neutral.svg │ │ ├── no.svg │ │ ├── offline.svg │ │ ├── package.svg │ │ ├── reject.svg │ │ ├── search.svg │ │ ├── twitter.svg │ │ ├── warning.svg │ │ ├── website.svg │ │ └── yes.svg │ ├── logo.svg │ ├── logos │ │ ├── dvc.svg │ │ ├── fastapi.svg │ │ ├── huggingface_hub.svg │ │ ├── prodigy.svg │ │ ├── ray.svg │ │ ├── streamlit.svg │ │ └── wandb.svg │ ├── pattern_blue.png │ ├── pattern_green.png │ ├── pattern_landing.png │ ├── pattern_landing_legacy.png │ ├── pattern_landing_nightly.png │ ├── pattern_legacy.png │ ├── pattern_nightly.png │ ├── pattern_purple.png │ ├── social_api.jpg │ ├── social_default.jpg │ ├── social_legacy.jpg │ ├── social_nightly.jpg │ ├── social_universe.jpg │ └── spacy-irl.jpg ├── remark.js ├── styles │ ├── accordion.module.sass │ ├── alert.module.sass │ ├── aside.module.sass │ ├── base.sass │ ├── button.module.sass │ ├── card.module.sass │ ├── code.module.sass │ ├── copy.module.sass │ ├── dropdown.module.sass │ ├── embed.module.sass │ ├── footer.module.sass │ ├── grid.module.sass │ ├── icon.module.sass │ ├── infobox.module.sass │ ├── landing.module.sass │ ├── layout.sass │ ├── link.module.sass │ ├── list.module.sass │ ├── main.module.sass │ ├── navigation.module.sass │ ├── newsletter.module.sass │ ├── progress.module.sass │ ├── quickstart.module.sass │ ├── readnext.module.sass │ ├── search.sass │ ├── section.module.sass │ ├── sidebar.module.sass │ ├── table.module.sass │ ├── tag.module.sass │ ├── title.module.sass │ └── typography.module.sass ├── templates │ ├── docs.js │ ├── index.js │ ├── models.js │ └── universe.js └── widgets │ ├── changelog.js │ ├── features.js │ ├── integration.js │ ├── languages.js │ ├── project.js │ ├── quickstart-install.js │ ├── quickstart-models.js │ ├── quickstart-training.js │ └── styleguide.js └── tsconfig.json /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | custom: [https://explosion.ai/merch, https://explosion.ai/tailored-solutions] 2 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/01_bugs.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: "\U0001F6A8 Submit a Bug Report" 3 | about: Use this template if you came across a bug or unexpected behaviour differing from the docs. 4 | 5 | --- 6 | 7 | 8 | 9 | ## How to reproduce the behaviour 10 | 11 | 12 | ## Your Environment 13 | 14 | * Operating System: 15 | * Python Version Used: 16 | * spaCy Version Used: 17 | * Environment Information: 18 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/02_docs.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: "\U0001F4DA Submit a Documentation Report" 3 | about: Did you spot a mistake in the docs, is anything unclear or do you have a 4 | suggestion? 
5 | 6 | --- 7 | 8 | 9 | ## Which page or section is this issue related to? 10 | 11 | -------------------------------------------------------------------------------- /.github/contributors/Bri-Will.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/.github/contributors/Bri-Will.md -------------------------------------------------------------------------------- /.github/contributors/Schibsted.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/.github/contributors/Schibsted.png -------------------------------------------------------------------------------- /.github/contributors/melanuria.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/.github/contributors/melanuria.pdf -------------------------------------------------------------------------------- /.github/contributors/svlandeg.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/.github/contributors/svlandeg.md -------------------------------------------------------------------------------- /.github/validate_universe_json.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | import sys 4 | from pathlib import Path 5 | 6 | 7 | def validate_json(document): 8 | universe_file = Path(document) 9 | with universe_file.open() as f: 10 | universe_data = json.load(f) 11 | for entry in universe_data["resources"]: 12 | if "github" in entry: 13 | assert not re.match( 14 | r"^(http:)|^(https:)", entry["github"] 15 | ), "Github field should be user/repo, not a url" 16 | 17 | 18 | if __name__ == "__main__": 19 | validate_json(str(sys.argv[1])) 20 | -------------------------------------------------------------------------------- /.github/workflows/gputests.yml.disabled: -------------------------------------------------------------------------------- 1 | name: Weekly GPU tests 2 | 3 | on: 4 | schedule: 5 | - cron: '0 1 * * MON' 6 | 7 | jobs: 8 | weekly-gputests: 9 | strategy: 10 | fail-fast: false 11 | matrix: 12 | branch: [master, v4] 13 | if: github.repository_owner == 'explosion' 14 | runs-on: ubuntu-latest 15 | steps: 16 | - name: Trigger buildkite build 17 | uses: buildkite/trigger-pipeline-action@v1.2.0 18 | env: 19 | PIPELINE: explosion-ai/spacy-slow-gpu-tests 20 | BRANCH: ${{ matrix.branch }} 21 | MESSAGE: ":github: Weekly GPU + slow tests - triggered from a GitHub Action" 22 | BUILDKITE_API_ACCESS_TOKEN: ${{ secrets.BUILDKITE_SECRET }} 23 | -------------------------------------------------------------------------------- /.github/workflows/lock.yml: -------------------------------------------------------------------------------- 1 | name: 'Lock Threads' 2 | 3 | on: 4 | schedule: 5 | - cron: '0 0 * * *' # check every day 6 | workflow_dispatch: 7 | 8 | permissions: 9 | issues: write 10 | 11 | concurrency: 12 | group: lock 13 | 14 | jobs: 15 | action: 16 | if: github.repository_owner == 'explosion' 17 | runs-on: ubuntu-latest 18 | steps: 19 | - uses: dessant/lock-threads@v5 20 | with: 21 | process-only: 'issues' 22 | issue-inactive-days: '30' 23 | issue-comment: > 24 | This thread has been 
automatically locked since there 25 | has not been any recent activity after it was closed. 26 | Please open a new issue for related bugs. 27 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/ambv/black 3 | rev: 22.3.0 4 | hooks: 5 | - id: black 6 | language_version: python3.7 7 | additional_dependencies: ['click==8.0.4'] 8 | - repo: https://github.com/pycqa/flake8 9 | rev: 5.0.4 10 | hooks: 11 | - id: flake8 12 | args: 13 | - "--config=setup.cfg" 14 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | preferred-citation: 3 | type: article 4 | message: "If you use spaCy, please cite it as below." 5 | authors: 6 | - family-names: "Honnibal" 7 | given-names: "Matthew" 8 | - family-names: "Montani" 9 | given-names: "Ines" 10 | - family-names: "Van Landeghem" 11 | given-names: "Sofie" 12 | - family-names: "Boyd" 13 | given-names: "Adriane" 14 | title: "spaCy: Industrial-strength Natural Language Processing in Python" 15 | doi: "10.5281/zenodo.1212303" 16 | year: 2020 17 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include spacy *.pyi *.pyx *.pxd *.txt *.cfg *.jinja *.toml *.hh 2 | include LICENSE 3 | include README.md 4 | include pyproject.toml 5 | include spacy/py.typed 6 | recursive-include spacy/cli *.yml 7 | recursive-include spacy/tests *.json 8 | recursive-include licenses * 9 | recursive-exclude spacy *.cpp 10 | -------------------------------------------------------------------------------- /bin/get-package.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | version=$(grep "__title__ = " spacy/about.py) 6 | version=${version/__title__ = } 7 | version=${version/\'/} 8 | version=${version/\'/} 9 | version=${version/\"/} 10 | version=${version/\"/} 11 | 12 | echo $version 13 | -------------------------------------------------------------------------------- /bin/get-version.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | version=$(grep "__version__ = " spacy/about.py) 6 | version=${version/__version__ = } 7 | version=${version/\'/} 8 | version=${version/\'/} 9 | version=${version/\"/} 10 | version=${version/\"/} 11 | 12 | echo $version 13 | -------------------------------------------------------------------------------- /bin/push-tag.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | # Insist repository is clean 6 | git diff-index --quiet HEAD 7 | 8 | git checkout $1 9 | git pull origin $1 10 | git push origin $1 11 | 12 | version=$(grep "__version__ = " spacy/about.py) 13 | version=${version/__version__ = } 14 | version=${version/\'/} 15 | version=${version/\'/} 16 | version=${version/\"/} 17 | version=${version/\"/} 18 | git tag "v$version" 19 | git push origin "v$version" 20 | -------------------------------------------------------------------------------- /bin/release.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env 
bash 2 | 3 | set -e 4 | 5 | # Insist repository is clean 6 | git diff-index --quiet HEAD 7 | 8 | version=$(grep "__version__ = " spacy/about.py) 9 | version=${version/__version__ = } 10 | version=${version/\'/} 11 | version=${version/\'/} 12 | version=${version/\"/} 13 | version=${version/\"/} 14 | 15 | echo "Pushing release-v"$version 16 | 17 | git tag -d release-v$version || true 18 | git push origin :release-v$version || true 19 | git tag release-v$version 20 | git push origin release-v$version 21 | -------------------------------------------------------------------------------- /build-constraints.txt: -------------------------------------------------------------------------------- 1 | # build version constraints for use with wheelwright 2 | numpy>=2.0.0,<3.0.0 3 | -------------------------------------------------------------------------------- /examples/training/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # spaCy examples 4 | 5 | See [examples/README.md](../README.md) 6 | -------------------------------------------------------------------------------- /extra/example_data/ner_example_data/README.md: -------------------------------------------------------------------------------- 1 | ## Examples of NER/IOB data that can be converted with `spacy convert` 2 | 3 | To convert an IOB file to `.spacy` ([`DocBin`](https://spacy.io/api/docbin)) 4 | for spaCy v3: 5 | 6 | ```bash 7 | python -m spacy convert -c iob -s -n 10 -b en_core_web_sm file.iob . 8 | ``` 9 | 10 | See all the `spacy convert` options: https://spacy.io/api/cli#convert 11 | 12 | --- 13 | 14 | The spaCy v2 JSON training files were generated using **spaCy v2** with: 15 | 16 | ```bash 17 | python -m spacy convert -c iob -s -n 10 -b en file.iob 18 | ``` 19 | 20 | To convert an existing JSON training file to `.spacy` for spaCy v3, convert 21 | with **spaCy v3**: 22 | 23 | ```bash 24 | python -m spacy convert file.json . 
25 | ``` 26 | -------------------------------------------------------------------------------- /extra/example_data/ner_example_data/ner-sent-per-line.iob: -------------------------------------------------------------------------------- 1 | When|WRB|O Sebastian|NNP|B-PERSON Thrun|NNP|I-PERSON started|VBD|O working|VBG|O on|IN|O self|NN|O -|HYPH|O driving|VBG|O cars|NNS|O at|IN|O Google|NNP|B-ORG in|IN|O 2007|CD|B-DATE ,|,|O few|JJ|O people|NNS|O outside|RB|O of|IN|O the|DT|O company|NN|O took|VBD|O him|PRP|O seriously|RB|O .|.|O 2 | “|''|O I|PRP|O can|MD|O tell|VB|O you|PRP|O very|RB|O senior|JJ|O CEOs|NNS|O of|IN|O major|JJ|O American|JJ|B-NORP car|NN|O companies|NNS|O would|MD|O shake|VB|O my|PRP$|O hand|NN|O and|CC|O turn|VB|O away|RB|O because|IN|O I|PRP|O was|VBD|O n’t|RB|O worth|JJ|O talking|VBG|O to|IN|O ,|,|O ”|''|O said|VBD|O Thrun|NNP|B-PERSON ,|,|O in|IN|O an|DT|O interview|NN|O with|IN|O Recode|NNP|B-ORG earlier|RBR|B-DATE this|DT|I-DATE week|NN|I-DATE .|.|O 3 | -------------------------------------------------------------------------------- /spacy/__init__.pxd: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/__init__.pxd -------------------------------------------------------------------------------- /spacy/__main__.py: -------------------------------------------------------------------------------- 1 | if __name__ == "__main__": 2 | from spacy.cli import setup_cli 3 | 4 | setup_cli() 5 | -------------------------------------------------------------------------------- /spacy/about.py: -------------------------------------------------------------------------------- 1 | # fmt: off 2 | __title__ = "spacy" 3 | __version__ = "3.8.7" 4 | __download_url__ = "https://github.com/explosion/spacy-models/releases/download" 5 | __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" 6 | -------------------------------------------------------------------------------- /spacy/cli/project/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/cli/project/__init__.py -------------------------------------------------------------------------------- /spacy/cli/project/assets.py: -------------------------------------------------------------------------------- 1 | from weasel.cli.assets import * 2 | -------------------------------------------------------------------------------- /spacy/cli/project/clone.py: -------------------------------------------------------------------------------- 1 | from weasel.cli.clone import * 2 | -------------------------------------------------------------------------------- /spacy/cli/project/document.py: -------------------------------------------------------------------------------- 1 | from weasel.cli.document import * 2 | -------------------------------------------------------------------------------- /spacy/cli/project/dvc.py: -------------------------------------------------------------------------------- 1 | from weasel.cli.dvc import * 2 | -------------------------------------------------------------------------------- /spacy/cli/project/pull.py: -------------------------------------------------------------------------------- 1 | from weasel.cli.pull import * 2 | -------------------------------------------------------------------------------- 
/spacy/cli/project/push.py: -------------------------------------------------------------------------------- 1 | from weasel.cli.push import * 2 | -------------------------------------------------------------------------------- /spacy/cli/project/remote_storage.py: -------------------------------------------------------------------------------- 1 | from weasel.cli.remote_storage import * 2 | -------------------------------------------------------------------------------- /spacy/cli/project/run.py: -------------------------------------------------------------------------------- 1 | from weasel.cli.run import * 2 | -------------------------------------------------------------------------------- /spacy/kb/__init__.py: -------------------------------------------------------------------------------- 1 | from .candidate import Candidate, get_candidates, get_candidates_batch 2 | from .kb import KnowledgeBase 3 | from .kb_in_memory import InMemoryLookupKB 4 | 5 | __all__ = [ 6 | "Candidate", 7 | "KnowledgeBase", 8 | "InMemoryLookupKB", 9 | "get_candidates", 10 | "get_candidates_batch", 11 | ] 12 | -------------------------------------------------------------------------------- /spacy/kb/candidate.pxd: -------------------------------------------------------------------------------- 1 | from libcpp.vector cimport vector 2 | 3 | from ..typedefs cimport hash_t 4 | from .kb cimport KnowledgeBase 5 | 6 | 7 | # Object used by the Entity Linker that summarizes one entity-alias candidate 8 | # combination. 9 | cdef class Candidate: 10 | cdef readonly KnowledgeBase kb 11 | cdef hash_t entity_hash 12 | cdef float entity_freq 13 | cdef vector[float] entity_vector 14 | cdef hash_t alias_hash 15 | cdef float prior_prob 16 | -------------------------------------------------------------------------------- /spacy/kb/kb.pxd: -------------------------------------------------------------------------------- 1 | """Knowledge-base for entity or concept linking.""" 2 | 3 | from cymem.cymem cimport Pool 4 | from libc.stdint cimport int64_t 5 | 6 | from ..vocab cimport Vocab 7 | 8 | 9 | cdef class KnowledgeBase: 10 | cdef Pool mem 11 | cdef readonly Vocab vocab 12 | cdef readonly int64_t entity_vector_length 13 | -------------------------------------------------------------------------------- /spacy/lang/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/lang/__init__.py -------------------------------------------------------------------------------- /spacy/lang/af/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .stop_words import STOP_WORDS 3 | 4 | 5 | class AfrikaansDefaults(BaseDefaults): 6 | stop_words = STOP_WORDS 7 | 8 | 9 | class Afrikaans(Language): 10 | lang = "af" 11 | Defaults = AfrikaansDefaults 12 | 13 | 14 | __all__ = ["Afrikaans"] 15 | -------------------------------------------------------------------------------- /spacy/lang/af/stop_words.py: -------------------------------------------------------------------------------- 1 | # Source: https://github.com/stopwords-iso/stopwords-af 2 | 3 | STOP_WORDS = set( 4 | """ 5 | 'n 6 | aan 7 | af 8 | al 9 | as 10 | baie 11 | by 12 | daar 13 | dag 14 | dat 15 | die 16 | dit 17 | een 18 | ek 19 | en 20 | gaan 21 | gesê 22 | haar 23 | het 24 | hom 25 | hulle 26 | hy 27 | in 28 | is 29 | jou 30 | jy 31 | kan 32 | kom 33 | ma 34 | 
maar 35 | met 36 | my 37 | na 38 | nie 39 | om 40 | ons 41 | op 42 | saam 43 | sal 44 | se 45 | sien 46 | so 47 | sy 48 | te 49 | toe 50 | uit 51 | van 52 | vir 53 | was 54 | wat 55 | ʼn 56 | """.split() 57 | ) 58 | -------------------------------------------------------------------------------- /spacy/lang/am/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 3 | 4 | >>> from spacy.lang.am.examples import sentences 5 | >>> docs = nlp.pipe(sentences) 6 | """ 7 | 8 | 9 | sentences = [ 10 | "አፕል የዩኬን ጅምር ድርጅት በ 1 ቢሊዮን ዶላር ለመግዛት አስቧል።", 11 | "የራስ ገዝ መኪኖች የኢንሹራንስ ኃላፊነትን ወደ አምራቾች ያዛውራሉ", 12 | "ሳን ፍራንሲስኮ የእግረኛ መንገድ አቅርቦት ሮቦቶችን ማገድን ይመለከታል", 13 | "ለንደን በእንግሊዝ የምትገኝ ትልቅ ከተማ ናት።", 14 | "የት ነህ?", 15 | "የፈረንሳይ ፕሬዝዳንት ማናቸው?", 16 | "የአሜሪካ ዋና ከተማ ምንድነው?", 17 | "ባራክ ኦባማ መቼ ተወለደ?", 18 | ] 19 | -------------------------------------------------------------------------------- /spacy/lang/am/punctuation.py: -------------------------------------------------------------------------------- 1 | from ..char_classes import ( 2 | ALPHA_UPPER, 3 | CURRENCY, 4 | LIST_ELLIPSES, 5 | LIST_PUNCT, 6 | LIST_QUOTES, 7 | UNITS, 8 | ) 9 | 10 | _list_punct = LIST_PUNCT + "፡ ። ፣ ፤ ፥ ፦ ፧ ፠ ፨".strip().split() 11 | 12 | _suffixes = ( 13 | _list_punct 14 | + LIST_ELLIPSES 15 | + LIST_QUOTES 16 | + [ 17 | r"(?<=[0-9])\+", 18 | # Amharic is written from Left-To-Right 19 | r"(?<=[0-9])(?:{c})".format(c=CURRENCY), 20 | r"(?<=[0-9])(?:{u})".format(u=UNITS), 21 | r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER), 22 | ] 23 | ) 24 | 25 | TOKENIZER_SUFFIXES = _suffixes 26 | -------------------------------------------------------------------------------- /spacy/lang/am/tokenizer_exceptions.py: -------------------------------------------------------------------------------- 1 | from ...symbols import NORM, ORTH 2 | 3 | _exc = {} 4 | 5 | 6 | for exc_data in [ 7 | {ORTH: "ት/ቤት"}, 8 | {ORTH: "ወ/ሮ", NORM: "ወይዘሮ"}, 9 | ]: 10 | _exc[exc_data[ORTH]] = [exc_data] 11 | 12 | 13 | for orth in [ 14 | "ዓ.ም.", 15 | "ኪ.ሜ.", 16 | ]: 17 | _exc[orth] = [{ORTH: orth}] 18 | 19 | 20 | TOKENIZER_EXCEPTIONS = _exc 21 | -------------------------------------------------------------------------------- /spacy/lang/ar/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .lex_attrs import LEX_ATTRS 3 | from .punctuation import TOKENIZER_SUFFIXES 4 | from .stop_words import STOP_WORDS 5 | from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS 6 | 7 | 8 | class ArabicDefaults(BaseDefaults): 9 | tokenizer_exceptions = TOKENIZER_EXCEPTIONS 10 | suffixes = TOKENIZER_SUFFIXES 11 | stop_words = STOP_WORDS 12 | lex_attr_getters = LEX_ATTRS 13 | writing_system = {"direction": "rtl", "has_case": False, "has_letters": True} 14 | 15 | 16 | class Arabic(Language): 17 | Defaults = ArabicDefaults 18 | lang = "ar" 19 | 20 | 21 | __all__ = ["Arabic"] 22 | -------------------------------------------------------------------------------- /spacy/lang/ar/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 
3 | 4 | >>> from spacy.lang.ar.examples import sentences 5 | >>> docs = nlp.pipe(sentences) 6 | """ 7 | 8 | sentences = [ 9 | "نال الكاتب خالد توفيق جائزة الرواية العربية في معرض الشارقة الدولي للكتاب", 10 | "أين تقع دمشق ؟", 11 | "كيف حالك ؟", 12 | "هل يمكن ان نلتقي على الساعة الثانية عشرة ظهرا ؟", 13 | "ماهي أبرز التطورات السياسية، الأمنية والاجتماعية في العالم ؟", 14 | "هل بالإمكان أن نلتقي غدا؟", 15 | "هناك نحو 382 مليون شخص مصاب بداء السكَّري في العالم", 16 | "كشفت دراسة حديثة أن الخيل تقرأ تعبيرات الوجه وتستطيع أن تتذكر مشاعر الناس وعواطفهم", 17 | ] 18 | -------------------------------------------------------------------------------- /spacy/lang/ar/punctuation.py: -------------------------------------------------------------------------------- 1 | from ..char_classes import ( 2 | ALPHA_UPPER, 3 | CURRENCY, 4 | LIST_ELLIPSES, 5 | LIST_PUNCT, 6 | LIST_QUOTES, 7 | UNITS, 8 | ) 9 | 10 | _suffixes = ( 11 | LIST_PUNCT 12 | + LIST_ELLIPSES 13 | + LIST_QUOTES 14 | + [ 15 | r"(?<=[0-9])\+", 16 | # Arabic is written from Right-To-Left 17 | r"(?<=[0-9])(?:{c})".format(c=CURRENCY), 18 | r"(?<=[0-9])(?:{u})".format(u=UNITS), 19 | r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER), 20 | ] 21 | ) 22 | 23 | TOKENIZER_SUFFIXES = _suffixes 24 | -------------------------------------------------------------------------------- /spacy/lang/az/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .lex_attrs import LEX_ATTRS 3 | from .stop_words import STOP_WORDS 4 | 5 | 6 | class AzerbaijaniDefaults(BaseDefaults): 7 | lex_attr_getters = LEX_ATTRS 8 | stop_words = STOP_WORDS 9 | 10 | 11 | class Azerbaijani(Language): 12 | lang = "az" 13 | Defaults = AzerbaijaniDefaults 14 | 15 | 16 | __all__ = ["Azerbaijani"] 17 | -------------------------------------------------------------------------------- /spacy/lang/az/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 3 | >>> from spacy.lang.az.examples import sentences 4 | >>> docs = nlp.pipe(sentences) 5 | """ 6 | 7 | 8 | sentences = [ 9 | "Bu bir cümlədir.", 10 | "Necəsən?", 11 | "Qarabağ ordeni vətən müharibəsində qələbə münasibəti ilə təsis edilmişdir.", 12 | "Məktəbimizə Bakıdan bir tarix müəllimi gəlmişdi.", 13 | "Atılan növbəti mərmilər lap yaxınlıqda partladı.", 14 | "Sinqapur koronavirus baxımından ən təhlükəsiz ölkələr sırasındadır.", 15 | "Marsda ilk sınaq uçuşu həyata keçirilib.", 16 | "SSRİ dağılandan bəri 5 sahil dövləti Xəzərin statusunu müəyyən edə bilməyiblər.", 17 | "Videoda beyninə xüsusi çip yerləşdirilmiş meymun əks olunub.", 18 | ] 19 | -------------------------------------------------------------------------------- /spacy/lang/bg/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 3 | 4 | >>> from spacy.lang.bg.examples import sentences 5 | >>> docs = nlp.pipe(sentences) 6 | """ 7 | 8 | sentences = [ 9 | "Епъл иска да купи английски стартъп за 1 милиард долара.", 10 | "Автономните коли прехвърлят застрахователната отговорност към производителите.", 11 | "Сан Франциско обмисля забрана на роботи доставящи по тротоари.", 12 | "Лондон е голям град в Обединеното Кралство.", 
13 | ] 14 | -------------------------------------------------------------------------------- /spacy/lang/bn/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 3 | 4 | >>> from spacy.lang.bn.examples import sentences 5 | >>> docs = nlp.pipe(sentences) 6 | """ 7 | 8 | 9 | sentences = ["তুই খুব ভালো", "আজ আমরা ডাক্তার দেখতে যাবো", "আমি জানি না "] 10 | -------------------------------------------------------------------------------- /spacy/lang/bo/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .lex_attrs import LEX_ATTRS 3 | from .stop_words import STOP_WORDS 4 | 5 | 6 | class TibetanDefaults(BaseDefaults): 7 | lex_attr_getters = LEX_ATTRS 8 | stop_words = STOP_WORDS 9 | 10 | 11 | class Tibetan(Language): 12 | lang = "bo" 13 | Defaults = TibetanDefaults 14 | 15 | 16 | __all__ = ["Tibetan"] 17 | -------------------------------------------------------------------------------- /spacy/lang/bo/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 3 | 4 | >>> from spacy.lang.bo.examples import sentences 5 | >>> docs = nlp.pipe(sentences) 6 | """ 7 | 8 | 9 | sentences = [ 10 | "དོན་དུ་རྒྱ་མཚོ་བླ་མ་ཞེས་བྱ་ཞིང༌།", 11 | "ཏཱ་ལའི་ཞེས་པ་ནི་སོག་སྐད་ཡིན་པ་དེ་བོད་སྐད་དུ་རྒྱ་མཚོའི་དོན་དུ་འཇུག", 12 | "སོག་པོ་ཨལ་ཐན་རྒྱལ་པོས་རྒྱལ་དབང་བསོད་ནམས་རྒྱ་མཚོར་ཆེ་བསྟོད་ཀྱི་མཚན་གསོལ་བ་ཞིག་ཡིན་ཞིང༌།", 13 | "རྗེས་སུ་རྒྱལ་བ་དགེ་འདུན་གྲུབ་དང༌། དགེ་འདུན་རྒྱ་མཚོ་སོ་སོར་ཡང་ཏཱ་ལའི་བླ་མའི་སྐུ་ཕྲེང་དང་པོ་དང༌།", 14 | "གཉིས་པའི་མཚན་དེ་གསོལ་ཞིང༌།༸རྒྱལ་དབང་སྐུ་ཕྲེང་ལྔ་པས་དགའ་ལྡན་ཕོ་བྲང་གི་སྲིད་དབང་བཙུགས་པ་ནས་ཏཱ་ལའི་བླ་མ་ནི་བོད་ཀྱི་ཆོས་སྲིད་གཉིས་ཀྱི་དབུ་ཁྲིད་དུ་གྱུར་ཞིང་།", 15 | "ད་ལྟའི་བར་ཏཱ་ལའི་བླ་མ་སྐུ་ཕྲེང་བཅུ་བཞི་བྱོན་ཡོད།", 16 | ] 17 | -------------------------------------------------------------------------------- /spacy/lang/ca/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 
3 | 4 | >>> from spacy.lang.ca.examples import sentences 5 | >>> docs = nlp.pipe(sentences) 6 | """ 7 | 8 | 9 | sentences = [ 10 | "Apple està buscant comprar una startup del Regne Unit per mil milions de dòlars", 11 | "Els cotxes autònoms deleguen la responsabilitat de l'assegurança als seus fabricants", 12 | "San Francisco analitza prohibir els robots de repartiment", 13 | "Londres és una gran ciutat del Regne Unit", 14 | "El gat menja peix", 15 | "Veig a l'home amb el telescopi", 16 | "L'aranya menja mosques", 17 | "El pingüí incuba en el seu niu", 18 | ] 19 | -------------------------------------------------------------------------------- /spacy/lang/cs/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .lex_attrs import LEX_ATTRS 3 | from .stop_words import STOP_WORDS 4 | 5 | 6 | class CzechDefaults(BaseDefaults): 7 | lex_attr_getters = LEX_ATTRS 8 | stop_words = STOP_WORDS 9 | 10 | 11 | class Czech(Language): 12 | lang = "cs" 13 | Defaults = CzechDefaults 14 | 15 | 16 | __all__ = ["Czech"] 17 | -------------------------------------------------------------------------------- /spacy/lang/da/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .lex_attrs import LEX_ATTRS 3 | from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES 4 | from .stop_words import STOP_WORDS 5 | from .syntax_iterators import SYNTAX_ITERATORS 6 | from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS 7 | 8 | 9 | class DanishDefaults(BaseDefaults): 10 | tokenizer_exceptions = TOKENIZER_EXCEPTIONS 11 | infixes = TOKENIZER_INFIXES 12 | suffixes = TOKENIZER_SUFFIXES 13 | lex_attr_getters = LEX_ATTRS 14 | stop_words = STOP_WORDS 15 | syntax_iterators = SYNTAX_ITERATORS 16 | 17 | 18 | class Danish(Language): 19 | lang = "da" 20 | Defaults = DanishDefaults 21 | 22 | 23 | __all__ = ["Danish"] 24 | -------------------------------------------------------------------------------- /spacy/lang/da/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 
3 | 4 | >>> from spacy.lang.da.examples import sentences 5 | >>> docs = nlp.pipe(sentences) 6 | """ 7 | 8 | sentences = [ 9 | "Apple overvejer at købe et britisk startup for 1 milliard dollar.", 10 | "Selvkørende biler flytter forsikringsansvaret over på producenterne.", 11 | "San Francisco overvejer at forbyde udbringningsrobotter på fortovet.", 12 | "London er en storby i Storbritannien.", 13 | "Hvor er du?", 14 | "Hvem er Frankrigs præsident?", 15 | "Hvad er hovedstaden i USA?", 16 | "Hvornår blev Barack Obama født?", 17 | ] 18 | -------------------------------------------------------------------------------- /spacy/lang/de/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES 3 | from .stop_words import STOP_WORDS 4 | from .syntax_iterators import SYNTAX_ITERATORS 5 | from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS 6 | 7 | 8 | class GermanDefaults(BaseDefaults): 9 | tokenizer_exceptions = TOKENIZER_EXCEPTIONS 10 | prefixes = TOKENIZER_PREFIXES 11 | suffixes = TOKENIZER_SUFFIXES 12 | infixes = TOKENIZER_INFIXES 13 | syntax_iterators = SYNTAX_ITERATORS 14 | stop_words = STOP_WORDS 15 | 16 | 17 | class German(Language): 18 | lang = "de" 19 | Defaults = GermanDefaults 20 | 21 | 22 | __all__ = ["German"] 23 | -------------------------------------------------------------------------------- /spacy/lang/de/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 3 | 4 | >>> from spacy.lang.de.examples import sentences 5 | >>> docs = nlp.pipe(sentences) 6 | """ 7 | 8 | 9 | sentences = [ 10 | "Die ganze Stadt ist ein Startup: Shenzhen ist das Silicon Valley für Hardware-Firmen", 11 | "Wie deutsche Startups die Technologie vorantreiben wollen: Künstliche Intelligenz", 12 | "Trend zum Urlaub in Deutschland beschert Gastwirten mehr Umsatz", 13 | "Bundesanwaltschaft erhebt Anklage gegen mutmaßlichen Schweizer Spion", 14 | "San Francisco erwägt Verbot von Lieferrobotern", 15 | "Autonome Fahrzeuge verlagern Haftpflicht auf Hersteller", 16 | "Wo bist du?", 17 | "Was ist die Hauptstadt von Deutschland?", 18 | ] 19 | -------------------------------------------------------------------------------- /spacy/lang/dsb/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .lex_attrs import LEX_ATTRS 3 | from .stop_words import STOP_WORDS 4 | 5 | 6 | class LowerSorbianDefaults(BaseDefaults): 7 | lex_attr_getters = LEX_ATTRS 8 | stop_words = STOP_WORDS 9 | 10 | 11 | class LowerSorbian(Language): 12 | lang = "dsb" 13 | Defaults = LowerSorbianDefaults 14 | 15 | 16 | __all__ = ["LowerSorbian"] 17 | -------------------------------------------------------------------------------- /spacy/lang/dsb/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 
3 | 4 | >>> from spacy.lang.dsb.examples import sentences 5 | >>> docs = nlp.pipe(sentences) 6 | """ 7 | 8 | 9 | sentences = [ 10 | "Z tym stwori so wuměnjenje a zakład za dalše wobdźěłanje přez analyzu tekstoweje struktury a semantisku anotaciju a z tym tež za tu předstajenu digitalnu online-wersiju.", 11 | "Mi so tu jara derje spodoba.", 12 | "Kotre nowniny chceće měć?", 13 | "Tak ako w slědnem lěśe jo teke lětosa jano doma zapustowaś móžno.", 14 | "Zwóstanjo pótakem hyšći wjele źěła.", 15 | ] 16 | -------------------------------------------------------------------------------- /spacy/lang/dsb/stop_words.py: -------------------------------------------------------------------------------- 1 | STOP_WORDS = set( 2 | """ 3 | a abo aby ako ale až 4 | 5 | daniž dokulaž 6 | 7 | gaž 8 | 9 | jolic 10 | 11 | pak pótom 12 | 13 | teke togodla 14 | """.split() 15 | ) 16 | -------------------------------------------------------------------------------- /spacy/lang/en/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 3 | 4 | >>> from spacy.lang.en.examples import sentences 5 | >>> docs = nlp.pipe(sentences) 6 | """ 7 | 8 | 9 | sentences = [ 10 | "Apple is looking at buying U.K. startup for $1 billion", 11 | "Autonomous cars shift insurance liability toward manufacturers", 12 | "San Francisco considers banning sidewalk delivery robots", 13 | "London is a big city in the United Kingdom.", 14 | "Where are you?", 15 | "Who is the president of France?", 16 | "What is the capital of the United States?", 17 | "When was Barack Obama born?", 18 | ] 19 | -------------------------------------------------------------------------------- /spacy/lang/en/punctuation.py: -------------------------------------------------------------------------------- 1 | from ..char_classes import ( 2 | ALPHA, 3 | ALPHA_LOWER, 4 | ALPHA_UPPER, 5 | CONCAT_QUOTES, 6 | HYPHENS, 7 | LIST_ELLIPSES, 8 | LIST_ICONS, 9 | ) 10 | 11 | _infixes = ( 12 | LIST_ELLIPSES 13 | + LIST_ICONS 14 | + [ 15 | r"(?<=[0-9])[+\-\*^](?=[0-9-])", 16 | r"(?<=[{al}{q}])\.(?=[{au}{q}])".format( 17 | al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES 18 | ), 19 | r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA), 20 | r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS), 21 | r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA), 22 | ] 23 | ) 24 | 25 | 26 | TOKENIZER_INFIXES = _infixes 27 | -------------------------------------------------------------------------------- /spacy/lang/et/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .stop_words import STOP_WORDS 3 | 4 | 5 | class EstonianDefaults(BaseDefaults): 6 | stop_words = STOP_WORDS 7 | 8 | 9 | class Estonian(Language): 10 | lang = "et" 11 | Defaults = EstonianDefaults 12 | 13 | 14 | __all__ = ["Estonian"] 15 | -------------------------------------------------------------------------------- /spacy/lang/et/stop_words.py: -------------------------------------------------------------------------------- 1 | # Source: https://github.com/stopwords-iso/stopwords-et 2 | 3 | STOP_WORDS = set( 4 | """ 5 | aga 6 | ei 7 | et 8 | ja 9 | jah 10 | kas 11 | kui 12 | kõik 13 | ma 14 | me 15 | mida 16 | midagi 17 | mind 18 | minu 19 | mis 20 | mu 21 | mul 22 | mulle 23 | nad 24 | nii 25 | oled 26 | olen 27 | oli 28 | oma 29 | on 30 | pole 31 | sa 32 | seda 33 | see 34 | selle 35 | siin 36 | siis 37 | ta 38 | te 39 | ära 40 
| """.split() 41 | ) 42 | -------------------------------------------------------------------------------- /spacy/lang/eu/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .lex_attrs import LEX_ATTRS 3 | from .punctuation import TOKENIZER_SUFFIXES 4 | from .stop_words import STOP_WORDS 5 | 6 | 7 | class BasqueDefaults(BaseDefaults): 8 | suffixes = TOKENIZER_SUFFIXES 9 | stop_words = STOP_WORDS 10 | lex_attr_getters = LEX_ATTRS 11 | 12 | 13 | class Basque(Language): 14 | lang = "eu" 15 | Defaults = BasqueDefaults 16 | 17 | 18 | __all__ = ["Basque"] 19 | -------------------------------------------------------------------------------- /spacy/lang/eu/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 3 | 4 | >>> from spacy.lang.eu.examples import sentences 5 | >>> docs = nlp.pipe(sentences) 6 | """ 7 | 8 | sentences = [ 9 | "bilbon ko castinga egin da eta nik jakin ez zuetako inork egin al du edota parte hartu duen ezagunik ba al du", 10 | "gaur telebistan entzunda denok martetik gatoz hortaz martzianoak gara beno nire ustez batzuk beste batzuk baino martzianoagoak dira", 11 | ] 12 | -------------------------------------------------------------------------------- /spacy/lang/eu/punctuation.py: -------------------------------------------------------------------------------- 1 | from ..punctuation import TOKENIZER_SUFFIXES 2 | 3 | _suffixes = TOKENIZER_SUFFIXES 4 | -------------------------------------------------------------------------------- /spacy/lang/fa/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 
3 | 4 | >>> from spacy.lang.fa.examples import sentences 5 | >>> docs = nlp.pipe(sentences) 6 | """ 7 | 8 | 9 | sentences = [ 10 | "این یک جمله نمونه می باشد.", 11 | "قرار ما، امروز ساعت ۲:۳۰ بعدازظهر هست!", 12 | "دیروز علی به من ۲۰۰۰.۱﷼ پول نقد داد.", 13 | "چطور می‌توان از تهران به کاشان رفت؟", 14 | "حدود ۸۰٪ هوا از نیتروژن تشکیل شده است.", 15 | ] 16 | -------------------------------------------------------------------------------- /spacy/lang/fa/punctuation.py: -------------------------------------------------------------------------------- 1 | from ..char_classes import ( 2 | ALPHA_UPPER, 3 | CURRENCY, 4 | LIST_ELLIPSES, 5 | LIST_PUNCT, 6 | LIST_QUOTES, 7 | UNITS, 8 | ) 9 | 10 | _suffixes = ( 11 | LIST_PUNCT 12 | + LIST_ELLIPSES 13 | + LIST_QUOTES 14 | + [ 15 | r"(?<=[0-9])\+", 16 | r"(?<=[0-9])%", # 4% -> ["4", "%"] 17 | # Persian is written from Right-To-Left 18 | r"(?<=[0-9])(?:{c})".format(c=CURRENCY), 19 | r"(?<=[0-9])(?:{u})".format(u=UNITS), 20 | r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER), 21 | ] 22 | ) 23 | 24 | TOKENIZER_SUFFIXES = _suffixes 25 | -------------------------------------------------------------------------------- /spacy/lang/fi/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .lex_attrs import LEX_ATTRS 3 | from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES 4 | from .stop_words import STOP_WORDS 5 | from .syntax_iterators import SYNTAX_ITERATORS 6 | from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS 7 | 8 | 9 | class FinnishDefaults(BaseDefaults): 10 | infixes = TOKENIZER_INFIXES 11 | suffixes = TOKENIZER_SUFFIXES 12 | tokenizer_exceptions = TOKENIZER_EXCEPTIONS 13 | lex_attr_getters = LEX_ATTRS 14 | stop_words = STOP_WORDS 15 | syntax_iterators = SYNTAX_ITERATORS 16 | 17 | 18 | class Finnish(Language): 19 | lang = "fi" 20 | Defaults = FinnishDefaults 21 | 22 | 23 | __all__ = ["Finnish"] 24 | -------------------------------------------------------------------------------- /spacy/lang/fi/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 
3 | >>> from spacy.lang.fi.examples import sentences 4 | >>> docs = nlp.pipe(sentences) 5 | """ 6 | 7 | sentences = [ 8 | "Itseajavat autot siirtävät vakuutusvastuun autojen valmistajille", 9 | "San Francisco harkitsee toimitusrobottien liikkumisen kieltämistä jalkakäytävillä", 10 | "Lontoo on suuri kaupunki Yhdistyneessä Kuningaskunnassa.", 11 | "Missä sinä olet?", 12 | "Mikä on Yhdysvaltojen pääkaupunki?", 13 | "Kuka on Suomen presidentti?", 14 | "Milloin Sauli Niinistö on syntynyt?", 15 | ] 16 | -------------------------------------------------------------------------------- /spacy/lang/fo/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from ..punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES 3 | from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS 4 | 5 | 6 | class FaroeseDefaults(BaseDefaults): 7 | tokenizer_exceptions = TOKENIZER_EXCEPTIONS 8 | infixes = TOKENIZER_INFIXES 9 | suffixes = TOKENIZER_SUFFIXES 10 | prefixes = TOKENIZER_PREFIXES 11 | 12 | 13 | class Faroese(Language): 14 | lang = "fo" 15 | Defaults = FaroeseDefaults 16 | 17 | 18 | __all__ = ["Faroese"] 19 | -------------------------------------------------------------------------------- /spacy/lang/ga/stop_words.py: -------------------------------------------------------------------------------- 1 | STOP_WORDS = set( 2 | """ 3 | a ach ag agus an aon ar arna as 4 | 5 | ba beirt bhúr 6 | 7 | caoga ceathair ceathrar chomh chuig chun cois céad cúig cúigear 8 | 9 | daichead dar de deich deichniúr den dhá do don dtí dá dár dó 10 | 11 | faoi faoin faoina faoinár fara fiche 12 | 13 | gach gan go gur 14 | 15 | haon hocht 16 | 17 | i iad idir in ina ins inár is 18 | 19 | le leis lena lenár 20 | 21 | mar mo muid mé 22 | 23 | na nach naoi naonúr ná ní níor nó nócha 24 | 25 | ocht ochtar ochtó os 26 | 27 | roimh 28 | 29 | sa seacht seachtar seachtó seasca seisear siad sibh sinn sna sé sí 30 | 31 | tar thar thú triúr trí trína trínár tríocha tú 32 | 33 | um 34 | 35 | ár 36 | 37 | é éis 38 | 39 | í 40 | 41 | ó ón óna ónár 42 | """.split() 43 | ) 44 | -------------------------------------------------------------------------------- /spacy/lang/gd/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from ...language import BaseDefaults, Language 4 | from .stop_words import STOP_WORDS 5 | from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS 6 | 7 | 8 | class ScottishDefaults(BaseDefaults): 9 | tokenizer_exceptions = TOKENIZER_EXCEPTIONS 10 | stop_words = STOP_WORDS 11 | 12 | 13 | class Scottish(Language): 14 | lang = "gd" 15 | Defaults = ScottishDefaults 16 | 17 | 18 | __all__ = ["Scottish"] 19 | -------------------------------------------------------------------------------- /spacy/lang/grc/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .lex_attrs import LEX_ATTRS 3 | from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES 4 | from .stop_words import STOP_WORDS 5 | from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS 6 | 7 | 8 | class AncientGreekDefaults(BaseDefaults): 9 | tokenizer_exceptions = TOKENIZER_EXCEPTIONS 10 | prefixes = TOKENIZER_PREFIXES 11 | suffixes = TOKENIZER_SUFFIXES 12 | infixes = TOKENIZER_INFIXES 13 | lex_attr_getters = LEX_ATTRS 14 | stop_words = STOP_WORDS 15 | 16 | 17 | 
class AncientGreek(Language): 18 | lang = "grc" 19 | Defaults = AncientGreekDefaults 20 | 21 | 22 | __all__ = ["AncientGreek"] 23 | -------------------------------------------------------------------------------- /spacy/lang/grc/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 3 | 4 | >>> from spacy.lang.grc.examples import sentences 5 | >>> docs = nlp.pipe(sentences) 6 | """ 7 | 8 | 9 | sentences = [ 10 | "ἐρᾷ μὲν ἁγνὸς οὐρανὸς τρῶσαι χθόνα, ἔρως δὲ γαῖαν λαμβάνει γάμου τυχεῖν·", 11 | "εὐδαίμων Χαρίτων καὶ Μελάνιππος ἔφυ, θείας ἁγητῆρες ἐφαμερίοις φιλότατος.", 12 | "ὃ μὲν δὴ ἀπόστολος ἐς τὴν Μίλητον ἦν.", 13 | "Θρασύβουλος δὲ σαφέως προπεπυσμένος πάντα λόγον καὶ εἰδὼς τὰ Ἀλυάττης μέλλοι ποιήσειν μηχανᾶται τοιάδε.", 14 | "φιλόπαις δ' ἦν ἐκμανῶς καὶ Ἀλέξανδρος ὁ βασιλεύς.", 15 | "Ἀντίγονος ὁ βασιλεὺς ἐπεκώμαζε τῷ Ζήνωνι", 16 | "αὐτὰρ ὃ δεύτατος ἦλθεν ἄναξ ἀνδρῶν Ἀγαμέμνων ἕλκος ἔχων", 17 | ] 18 | -------------------------------------------------------------------------------- /spacy/lang/gu/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .stop_words import STOP_WORDS 3 | 4 | 5 | class GujaratiDefaults(BaseDefaults): 6 | stop_words = STOP_WORDS 7 | 8 | 9 | class Gujarati(Language): 10 | lang = "gu" 11 | Defaults = GujaratiDefaults 12 | 13 | 14 | __all__ = ["Gujarati"] 15 | -------------------------------------------------------------------------------- /spacy/lang/gu/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 3 | 4 | >>> from spacy.lang.gu.examples import sentences 5 | >>> docs = nlp.pipe(sentences) 6 | """ 7 | 8 | 9 | sentences = [ 10 | "લોકશાહી એ સરકારનું એક એવું તંત્ર છે જ્યાં નાગરિકો મત દ્વારા સત્તાનો ઉપયોગ કરે છે.", 11 | "તે ગુજરાત રાજ્યના ધરમપુર શહેરમાં આવેલું હતું", 12 | "કર્ણદેવ પહેલો સોલંકી વંશનો રાજા હતો", 13 | "તેજપાળને બે પત્ની હતી", 14 | "ગુજરાતમાં ભારતીય જનતા પક્ષનો ઉદય આ સમયગાળા દરમિયાન થયો", 15 | "આંદોલનકારીઓએ ચીમનભાઇ પટેલના રાજીનામાની માંગણી કરી.", 16 | "અહિયાં શું જોડાય છે?", 17 | "મંદિરનો પૂર્વાભિમુખ ભાગ નાના મંડપ સાથે થોડો લંબચોરસ આકારનો છે.", 18 | ] 19 | -------------------------------------------------------------------------------- /spacy/lang/he/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .lex_attrs import LEX_ATTRS 3 | from .stop_words import STOP_WORDS 4 | 5 | 6 | class HebrewDefaults(BaseDefaults): 7 | stop_words = STOP_WORDS 8 | lex_attr_getters = LEX_ATTRS 9 | writing_system = {"direction": "rtl", "has_case": False, "has_letters": True} 10 | 11 | 12 | class Hebrew(Language): 13 | lang = "he" 14 | Defaults = HebrewDefaults 15 | 16 | 17 | __all__ = ["Hebrew"] 18 | -------------------------------------------------------------------------------- /spacy/lang/hi/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .lex_attrs import LEX_ATTRS 3 | from .stop_words import STOP_WORDS 4 | 5 | 6 | class HindiDefaults(BaseDefaults): 7 | stop_words = STOP_WORDS 8 | lex_attr_getters = LEX_ATTRS 9 | 10 | 11 | class Hindi(Language): 12 | lang = "hi" 13 | Defaults = HindiDefaults 14 | 15 | 16 | __all__ = ["Hindi"] 17 | 
-------------------------------------------------------------------------------- /spacy/lang/hr/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .stop_words import STOP_WORDS 3 | 4 | 5 | class CroatianDefaults(BaseDefaults): 6 | stop_words = STOP_WORDS 7 | 8 | 9 | class Croatian(Language): 10 | lang = "hr" 11 | Defaults = CroatianDefaults 12 | 13 | 14 | __all__ = ["Croatian"] 15 | -------------------------------------------------------------------------------- /spacy/lang/hr/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 3 | 4 | >>> from spacy.lang.hr.examples import sentences 5 | >>> docs = nlp.pipe(sentences) 6 | """ 7 | 8 | sentences = [ 9 | "Ovo je rečenica.", 10 | "Kako se popravlja auto?", 11 | "Zagreb je udaljen od Ljubljane svega 150 km.", 12 | "Nećete vjerovati što se dogodilo na ovogodišnjem festivalu!", 13 | "Budućnost Apple je upitna nakon dugotrajnog pada vrijednosti dionica firme.", 14 | "Trgovina oružjem predstavlja prijetnju za globalni mir.", 15 | ] 16 | -------------------------------------------------------------------------------- /spacy/lang/hsb/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .lex_attrs import LEX_ATTRS 3 | from .stop_words import STOP_WORDS 4 | from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS 5 | 6 | 7 | class UpperSorbianDefaults(BaseDefaults): 8 | lex_attr_getters = LEX_ATTRS 9 | stop_words = STOP_WORDS 10 | tokenizer_exceptions = TOKENIZER_EXCEPTIONS 11 | 12 | 13 | class UpperSorbian(Language): 14 | lang = "hsb" 15 | Defaults = UpperSorbianDefaults 16 | 17 | 18 | __all__ = ["UpperSorbian"] 19 | -------------------------------------------------------------------------------- /spacy/lang/hsb/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 3 | 4 | >>> from spacy.lang.hsb.examples import sentences 5 | >>> docs = nlp.pipe(sentences) 6 | """ 7 | 8 | 9 | sentences = [ 10 | "To běšo wjelgin raźone a jo se wót luźi derje pśiwzeło. 
Tak som dožywiła wjelgin", 11 | "Jogo pśewóźowarce stej groniłej, až how w serbskich stronach njama Santa Claus nic pytaś.", 12 | "A ten sobuźěłaśeŕ Statneje biblioteki w Barlinju jo pśimjeł drogotne knigły bźez rukajcowu z nagima rukoma!", 13 | "Take wobchadanje z našym kulturnym derbstwom zewšym njejźo.", 14 | "Wopśimjeśe drugich pśinoskow jo było na wusokem niwowje, ako pśecej.", 15 | ] 16 | -------------------------------------------------------------------------------- /spacy/lang/hsb/stop_words.py: -------------------------------------------------------------------------------- 1 | STOP_WORDS = set( 2 | """ 3 | a abo ale ani 4 | 5 | dokelž 6 | 7 | hdyž 8 | 9 | jeli jelizo 10 | 11 | kaž 12 | 13 | pak potom 14 | 15 | tež tohodla 16 | 17 | zo zoby 18 | """.split() 19 | ) 20 | -------------------------------------------------------------------------------- /spacy/lang/hsb/tokenizer_exceptions.py: -------------------------------------------------------------------------------- 1 | from ...symbols import NORM, ORTH 2 | from ...util import update_exc 3 | from ..tokenizer_exceptions import BASE_EXCEPTIONS 4 | 5 | _exc = dict() 6 | for exc_data in [ 7 | {ORTH: "mil.", NORM: "milion"}, 8 | {ORTH: "wob.", NORM: "wobydler"}, 9 | ]: 10 | _exc[exc_data[ORTH]] = [exc_data] 11 | 12 | for orth in [ 13 | "resp.", 14 | ]: 15 | _exc[orth] = [{ORTH: orth}] 16 | 17 | 18 | TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc) 19 | -------------------------------------------------------------------------------- /spacy/lang/ht/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 3 | 4 | >>> from spacy.lang.ht.examples import sentences 5 | >>> docs = nlp.pipe(sentences) 6 | """ 7 | 8 | 9 | sentences = [ 10 | "Apple ap panse achte yon demaraj nan Wayòm Ini pou $1 milya dola", 11 | "Machin otonòm fè responsablite asirans lan ale sou men fabrikan yo", 12 | "San Francisco ap konsidere entèdi robo ki livre sou twotwa yo", 13 | "Lond se yon gwo vil nan Wayòm Ini", 14 | "Kote ou ye?", 15 | "Kilès ki prezidan Lafrans?", 16 | "Ki kapital Etazini?", 17 | "Kile Barack Obama te fèt?", 18 | ] 19 | -------------------------------------------------------------------------------- /spacy/lang/ht/tag_map.py: -------------------------------------------------------------------------------- 1 | from spacy.symbols import NOUN, VERB, AUX, ADJ, ADV, PRON, DET, ADP, SCONJ, CCONJ, PART, INTJ, NUM, PROPN, PUNCT, SYM, X 2 | 3 | TAG_MAP = { 4 | "NOUN": {"pos": NOUN}, 5 | "VERB": {"pos": VERB}, 6 | "AUX": {"pos": AUX}, 7 | "ADJ": {"pos": ADJ}, 8 | "ADV": {"pos": ADV}, 9 | "PRON": {"pos": PRON}, 10 | "DET": {"pos": DET}, 11 | "ADP": {"pos": ADP}, 12 | "SCONJ": {"pos": SCONJ}, 13 | "CCONJ": {"pos": CCONJ}, 14 | "PART": {"pos": PART}, 15 | "INTJ": {"pos": INTJ}, 16 | "NUM": {"pos": NUM}, 17 | "PROPN": {"pos": PROPN}, 18 | "PUNCT": {"pos": PUNCT}, 19 | "SYM": {"pos": SYM}, 20 | "X": {"pos": X}, 21 | } 22 | -------------------------------------------------------------------------------- /spacy/lang/hu/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES 3 | from .stop_words import STOP_WORDS 4 | from .tokenizer_exceptions import TOKEN_MATCH, TOKENIZER_EXCEPTIONS 5 | 6 | 7 | class HungarianDefaults(BaseDefaults): 8 | tokenizer_exceptions = 
TOKENIZER_EXCEPTIONS 9 | prefixes = TOKENIZER_PREFIXES 10 | suffixes = TOKENIZER_SUFFIXES 11 | infixes = TOKENIZER_INFIXES 12 | token_match = TOKEN_MATCH 13 | stop_words = STOP_WORDS 14 | 15 | 16 | class Hungarian(Language): 17 | lang = "hu" 18 | Defaults = HungarianDefaults 19 | 20 | 21 | __all__ = ["Hungarian"] 22 | -------------------------------------------------------------------------------- /spacy/lang/hu/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 3 | 4 | >>> from spacy.lang.hu.examples import sentences 5 | >>> docs = nlp.pipe(sentences) 6 | """ 7 | 8 | 9 | sentences = [ 10 | "Az Apple egy brit startup vásárlását tervezi 1 milliárd dollár értékben.", 11 | "San Francisco vezetése mérlegeli a járdát használó szállító robotok betiltását.", 12 | "London az Egyesült Királyság egy nagy városa.", 13 | ] 14 | -------------------------------------------------------------------------------- /spacy/lang/hy/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .lex_attrs import LEX_ATTRS 3 | from .stop_words import STOP_WORDS 4 | 5 | 6 | class ArmenianDefaults(BaseDefaults): 7 | lex_attr_getters = LEX_ATTRS 8 | stop_words = STOP_WORDS 9 | 10 | 11 | class Armenian(Language): 12 | lang = "hy" 13 | Defaults = ArmenianDefaults 14 | 15 | 16 | __all__ = ["Armenian"] 17 | -------------------------------------------------------------------------------- /spacy/lang/hy/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 3 | >>> from spacy.lang.hy.examples import sentences 4 | >>> docs = nlp.pipe(sentences) 5 | """ 6 | 7 | 8 | sentences = [ 9 | "Լոնդոնը Միացյալ Թագավորության մեծ քաղաք է։", 10 | "Ո՞վ է Ֆրանսիայի նախագահը։", 11 | "Ո՞րն է Միացյալ Նահանգների մայրաքաղաքը։", 12 | "Ե՞րբ է ծնվել Բարաք Օբաման։", 13 | ] 14 | -------------------------------------------------------------------------------- /spacy/lang/is/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .stop_words import STOP_WORDS 3 | 4 | 5 | class IcelandicDefaults(BaseDefaults): 6 | stop_words = STOP_WORDS 7 | 8 | 9 | class Icelandic(Language): 10 | lang = "is" 11 | Defaults = IcelandicDefaults 12 | 13 | 14 | __all__ = ["Icelandic"] 15 | -------------------------------------------------------------------------------- /spacy/lang/it/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 3 | 4 | >>> from spacy.lang.it.examples import sentences 5 | >>> docs = nlp.pipe(sentences) 6 | """ 7 | 8 | 9 | sentences = [ 10 | "Apple vuole comprare una startup del Regno Unito per un miliardo di dollari", 11 | "Le automobili a guida autonoma spostano la responsabilità assicurativa verso i produttori", 12 | "San Francisco prevede di bandire i robot di consegna porta a porta", 13 | "Londra è una grande città del Regno Unito.", 14 | ] 15 | -------------------------------------------------------------------------------- /spacy/lang/ja/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 
3 | 4 | >>> from spacy.lang.ja.examples import sentences 5 | >>> docs = nlp.pipe(sentences) 6 | """ 7 | 8 | 9 | sentences = [ 10 | "アップルがイギリスの新興企業を10億ドルで購入を検討", 11 | "自動運転車の損害賠償責任、自動車メーカーに一定の負担を求める", 12 | "歩道を走る自動配達ロボ、サンフランシスコ市が走行禁止を検討", 13 | "ロンドンはイギリスの大都市です。", 14 | ] 15 | -------------------------------------------------------------------------------- /spacy/lang/ja/tag_orth_map.py: -------------------------------------------------------------------------------- 1 | from ...symbols import DET, PART, PRON, SPACE, X 2 | 3 | # mapping from tag bi-gram to pos of previous token 4 | TAG_ORTH_MAP = { 5 | "空白": {" ": SPACE, " ": X}, 6 | "助詞-副助詞": {"たり": PART}, 7 | "連体詞": { 8 | "あの": DET, 9 | "かの": DET, 10 | "この": DET, 11 | "その": DET, 12 | "どの": DET, 13 | "彼の": DET, 14 | "此の": DET, 15 | "其の": DET, 16 | "ある": PRON, 17 | "こんな": PRON, 18 | "そんな": PRON, 19 | "どんな": PRON, 20 | "あらゆる": PRON, 21 | }, 22 | } 23 | -------------------------------------------------------------------------------- /spacy/lang/kmr/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .lex_attrs import LEX_ATTRS 3 | from .stop_words import STOP_WORDS 4 | 5 | 6 | class KurmanjiDefaults(BaseDefaults): 7 | stop_words = STOP_WORDS 8 | lex_attr_getters = LEX_ATTRS 9 | 10 | 11 | class Kurmanji(Language): 12 | lang = "kmr" 13 | Defaults = KurmanjiDefaults 14 | 15 | 16 | __all__ = ["Kurmanji"] 17 | -------------------------------------------------------------------------------- /spacy/lang/kmr/stop_words.py: -------------------------------------------------------------------------------- 1 | STOP_WORDS = set( 2 | """ 3 | û 4 | li 5 | bi 6 | di 7 | da 8 | de 9 | ji 10 | ku 11 | ew 12 | ez 13 | tu 14 | em 15 | hûn 16 | ew 17 | ev 18 | min 19 | te 20 | wî 21 | wê 22 | me 23 | we 24 | wan 25 | vê 26 | vî 27 | va 28 | çi 29 | kî 30 | kê 31 | çawa 32 | çima 33 | kengî 34 | li ku 35 | çend 36 | çiqas 37 | her 38 | hin 39 | gelek 40 | hemû 41 | kes 42 | tişt 43 | """.split() 44 | ) 45 | -------------------------------------------------------------------------------- /spacy/lang/kn/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .stop_words import STOP_WORDS 3 | 4 | 5 | class KannadaDefaults(BaseDefaults): 6 | stop_words = STOP_WORDS 7 | 8 | 9 | class Kannada(Language): 10 | lang = "kn" 11 | Defaults = KannadaDefaults 12 | 13 | 14 | __all__ = ["Kannada"] 15 | -------------------------------------------------------------------------------- /spacy/lang/kn/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 3 | 4 | >>> from spacy.lang.en.examples import sentences 5 | >>> docs = nlp.pipe(sentences) 6 | """ 7 | 8 | 9 | sentences = [ 10 | "ಆಪಲ್ ಒಂದು ಯು.ಕೆ. 
ಸ್ಟಾರ್ಟ್ಅಪ್ ಅನ್ನು ೧ ಶತಕೋಟಿ ಡಾಲರ್ಗಳಿಗೆ ಖರೀದಿಸಲು ನೋಡುತ್ತಿದೆ.", 11 | "ಸ್ವಾಯತ್ತ ಕಾರುಗಳು ವಿಮಾ ಹೊಣೆಗಾರಿಕೆಯನ್ನು ತಯಾರಕರ ಕಡೆಗೆ ಬದಲಾಯಿಸುತ್ತವೆ.", 12 | "ಕಾಲುದಾರಿ ವಿತರಣಾ ರೋಬೋಟ್‌ಗಳನ್ನು ನಿಷೇಧಿಸುವುದನ್ನು ಸ್ಯಾನ್ ಫ್ರಾನ್ಸಿಸ್ಕೊ ​​ಪರಿಗಣಿಸುತ್ತದೆ.", 13 | "ಲಂಡನ್ ಯುನೈಟೆಡ್ ಕಿಂಗ್‌ಡಂನ ದೊಡ್ಡ ನಗರ.", 14 | "ನೀನು ಎಲ್ಲಿದಿಯಾ?", 15 | "ಫ್ರಾನ್ಸಾದ ಅಧ್ಯಕ್ಷರು ಯಾರು?", 16 | "ಯುನೈಟೆಡ್ ಸ್ಟೇಟ್ಸ್ನ ರಾಜಧಾನಿ ಯಾವುದು?", 17 | "ಬರಾಕ್ ಒಬಾಮ ಯಾವಾಗ ಜನಿಸಿದರು?", 18 | ] 19 | -------------------------------------------------------------------------------- /spacy/lang/ko/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 3 | 4 | >>> from spacy.lang.ko.examples import sentences 5 | >>> docs = nlp.pipe(sentences) 6 | """ 7 | 8 | sentences = [ 9 | "애플이 영국의 스타트업을 10억 달러에 인수하는 것을 알아보고 있다.", 10 | "자율주행 자동차의 손해 배상 책임이 제조 업체로 옮겨 가다", 11 | "샌프란시스코 시가 자동 배달 로봇의 보도 주행 금지를 검토 중이라고 합니다.", 12 | "런던은 영국의 수도이자 가장 큰 도시입니다.", 13 | ] 14 | -------------------------------------------------------------------------------- /spacy/lang/ko/punctuation.py: -------------------------------------------------------------------------------- 1 | from ..char_classes import LIST_QUOTES 2 | from ..punctuation import TOKENIZER_INFIXES as BASE_TOKENIZER_INFIXES 3 | 4 | _infixes = ( 5 | ["·", "ㆍ", r"\(", r"\)"] 6 | + [r"(?<=[0-9])~(?=[0-9-])"] 7 | + LIST_QUOTES 8 | + BASE_TOKENIZER_INFIXES 9 | ) 10 | 11 | TOKENIZER_INFIXES = _infixes 12 | -------------------------------------------------------------------------------- /spacy/lang/ko/stop_words.py: -------------------------------------------------------------------------------- 1 | STOP_WORDS = set( 2 | """ 3 | 이 4 | 있 5 | 하 6 | 것 7 | 들 8 | 그 9 | 되 10 | 수 11 | 이 12 | 보 13 | 않 14 | 없 15 | 나 16 | 주 17 | 아니 18 | 등 19 | 같 20 | 때 21 | 년 22 | 가 23 | 한 24 | 지 25 | 오 26 | 말 27 | 일 28 | 그렇 29 | 위하 30 | 때문 31 | 그것 32 | 두 33 | 말하 34 | 알 35 | 그러나 36 | 받 37 | 못하 38 | 일 39 | 그런 40 | 또 41 | 더 42 | 많 43 | 그리고 44 | 좋 45 | 크 46 | 시키 47 | 그러 48 | 하나 49 | 살 50 | 데 51 | 안 52 | 어떤 53 | 번 54 | 나 55 | 다른 56 | 어떻 57 | 들 58 | 이렇 59 | 점 60 | 싶 61 | 말 62 | 좀 63 | 원 64 | 잘 65 | 놓 66 | """.split() 67 | ) 68 | -------------------------------------------------------------------------------- /spacy/lang/ky/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .lex_attrs import LEX_ATTRS 3 | from .punctuation import TOKENIZER_INFIXES 4 | from .stop_words import STOP_WORDS 5 | from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS 6 | 7 | 8 | class KyrgyzDefaults(BaseDefaults): 9 | tokenizer_exceptions = TOKENIZER_EXCEPTIONS 10 | infixes = TOKENIZER_INFIXES 11 | lex_attr_getters = LEX_ATTRS 12 | stop_words = STOP_WORDS 13 | 14 | 15 | class Kyrgyz(Language): 16 | lang = "ky" 17 | Defaults = KyrgyzDefaults 18 | 19 | 20 | __all__ = ["Kyrgyz"] 21 | -------------------------------------------------------------------------------- /spacy/lang/ky/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 
3 | >>> from spacy.lang.ky.examples import sentences 4 | >>> docs = nlp.pipe(sentences) 5 | """ 6 | 7 | sentences = [ 8 | "Apple Улуу Британия стартабын $1 миллиардга сатып алууну көздөөдө.", 9 | "Автоном автомобилдерди камсыздоо жоопкерчилиги өндүрүүчүлөргө артылды.", 10 | "Сан-Франциско тротуар менен жүрүүчү робот-курьерлерге тыю салууну караштырууда.", 11 | "Лондон - Улуу Британияда жайгашкан ири шаар.", 12 | "Кайдасың?", 13 | "Франциянын президенти ким?", 14 | "Америка Кошмо Штаттарынын борбор калаасы кайсы шаар?", 15 | "Барак Обама качан төрөлгөн?", 16 | ] 17 | -------------------------------------------------------------------------------- /spacy/lang/la/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .lex_attrs import LEX_ATTRS 3 | from .stop_words import STOP_WORDS 4 | from .syntax_iterators import SYNTAX_ITERATORS 5 | from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS 6 | 7 | 8 | class LatinDefaults(BaseDefaults): 9 | tokenizer_exceptions = TOKENIZER_EXCEPTIONS 10 | stop_words = STOP_WORDS 11 | lex_attr_getters = LEX_ATTRS 12 | syntax_iterators = SYNTAX_ITERATORS 13 | 14 | 15 | class Latin(Language): 16 | lang = "la" 17 | Defaults = LatinDefaults 18 | 19 | 20 | __all__ = ["Latin"] 21 | -------------------------------------------------------------------------------- /spacy/lang/lb/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .lex_attrs import LEX_ATTRS 3 | from .punctuation import TOKENIZER_INFIXES 4 | from .stop_words import STOP_WORDS 5 | from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS 6 | 7 | 8 | class LuxembourgishDefaults(BaseDefaults): 9 | tokenizer_exceptions = TOKENIZER_EXCEPTIONS 10 | infixes = TOKENIZER_INFIXES 11 | lex_attr_getters = LEX_ATTRS 12 | stop_words = STOP_WORDS 13 | 14 | 15 | class Luxembourgish(Language): 16 | lang = "lb" 17 | Defaults = LuxembourgishDefaults 18 | 19 | 20 | __all__ = ["Luxembourgish"] 21 | -------------------------------------------------------------------------------- /spacy/lang/lb/punctuation.py: -------------------------------------------------------------------------------- 1 | from ..char_classes import ALPHA, ALPHA_LOWER, ALPHA_UPPER, LIST_ELLIPSES, LIST_ICONS 2 | 3 | ELISION = " ' ’ ".strip().replace(" ", "") 4 | 5 | abbrev = ("d", "D") 6 | 7 | _infixes = ( 8 | LIST_ELLIPSES 9 | + LIST_ICONS 10 | + [ 11 | r"(?<=^[{ab}][{el}])(?=[{a}])".format(ab=abbrev, a=ALPHA, el=ELISION), 12 | r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER), 13 | r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA), 14 | r"(?<=[{a}])[:<>=](?=[{a}])".format(a=ALPHA), 15 | r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA), 16 | r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA), 17 | r"(?<=[0-9])-(?=[0-9])", 18 | ] 19 | ) 20 | 21 | TOKENIZER_INFIXES = _infixes 22 | -------------------------------------------------------------------------------- /spacy/lang/lg/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .lex_attrs import LEX_ATTRS 3 | from .punctuation import TOKENIZER_INFIXES 4 | from .stop_words import STOP_WORDS 5 | 6 | 7 | class LugandaDefaults(BaseDefaults): 8 | lex_attr_getters = LEX_ATTRS 9 | infixes = TOKENIZER_INFIXES 10 | stop_words = STOP_WORDS 11 | 12 | 13 | class Luganda(Language): 14 | lang = "lg" 15 | Defaults = LugandaDefaults 16 | 
17 | 18 | __all__ = ["Luganda"] 19 | -------------------------------------------------------------------------------- /spacy/lang/lg/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 3 | 4 | >>> from spacy.lang.lg.examples import sentences 5 | >>> docs = nlp.pipe(sentences) 6 | """ 7 | 8 | sentences = [ 9 | "Mpa ebyafaayo ku byalo Nakatu ne Nkajja", 10 | "Okuyita Ttembo kitegeeza kugwa ddalu", 11 | "Ekifumu kino kyali kya mulimu ki?", 12 | "Ekkovu we liyise wayitibwa mukululo", 13 | "Akola mulimu ki oguvaamu ssente?", 14 | "Emisumaali egikomerera embaawo giyitibwa nninga", 15 | "Abooluganda ab’emmamba ababiri", 16 | "Ekisaawe ky'ebyenjigiriza kya mugaso nnyo", 17 | ] 18 | -------------------------------------------------------------------------------- /spacy/lang/lg/punctuation.py: -------------------------------------------------------------------------------- 1 | from ..char_classes import ( 2 | ALPHA, 3 | ALPHA_LOWER, 4 | ALPHA_UPPER, 5 | CONCAT_QUOTES, 6 | HYPHENS, 7 | LIST_ELLIPSES, 8 | LIST_ICONS, 9 | ) 10 | 11 | _infixes = ( 12 | LIST_ELLIPSES 13 | + LIST_ICONS 14 | + [ 15 | r"(?<=[0-9])[+\-\*^](?=[0-9-])", 16 | r"(?<=[{al}{q}])\.(?=[{au}{q}])".format( 17 | al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES 18 | ), 19 | r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA), 20 | r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS), 21 | r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA), 22 | ] 23 | ) 24 | 25 | 26 | TOKENIZER_INFIXES = _infixes 27 | -------------------------------------------------------------------------------- /spacy/lang/lij/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .punctuation import TOKENIZER_INFIXES 3 | from .stop_words import STOP_WORDS 4 | from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS 5 | 6 | 7 | class LigurianDefaults(BaseDefaults): 8 | tokenizer_exceptions = TOKENIZER_EXCEPTIONS 9 | infixes = TOKENIZER_INFIXES 10 | stop_words = STOP_WORDS 11 | 12 | 13 | class Ligurian(Language): 14 | lang = "lij" 15 | Defaults = LigurianDefaults 16 | 17 | 18 | __all__ = ["Ligurian"] 19 | -------------------------------------------------------------------------------- /spacy/lang/lij/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 
3 | 4 | >>> from spacy.lang.lij.examples import sentences 5 | >>> docs = nlp.pipe(sentences) 6 | """ 7 | 8 | 9 | sentences = [ 10 | "Sciusciâ e sciorbî no se peu.", 11 | "Graçie di çetroin, che me son arrivæ.", 12 | "Vegnime apreuvo, che ve fasso pescâ di òmmi.", 13 | "Bella pe sempre l'ægua inta conchetta quande unn'agoggia d'ægua a se â trapaña.", 14 | ] 15 | -------------------------------------------------------------------------------- /spacy/lang/lij/punctuation.py: -------------------------------------------------------------------------------- 1 | from ..char_classes import ALPHA 2 | from ..punctuation import TOKENIZER_INFIXES 3 | 4 | ELISION = " ' ’ ".strip().replace(" ", "").replace("\n", "") 5 | 6 | 7 | _infixes = TOKENIZER_INFIXES + [ 8 | r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION) 9 | ] 10 | 11 | TOKENIZER_INFIXES = _infixes 12 | -------------------------------------------------------------------------------- /spacy/lang/lt/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .lex_attrs import LEX_ATTRS 3 | from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES 4 | from .stop_words import STOP_WORDS 5 | from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS 6 | 7 | 8 | class LithuanianDefaults(BaseDefaults): 9 | infixes = TOKENIZER_INFIXES 10 | suffixes = TOKENIZER_SUFFIXES 11 | tokenizer_exceptions = TOKENIZER_EXCEPTIONS 12 | stop_words = STOP_WORDS 13 | lex_attr_getters = LEX_ATTRS 14 | 15 | 16 | class Lithuanian(Language): 17 | lang = "lt" 18 | Defaults = LithuanianDefaults 19 | 20 | 21 | __all__ = ["Lithuanian"] 22 | -------------------------------------------------------------------------------- /spacy/lang/lt/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 
3 | 4 | >>> from spacy.lang.lt.examples import sentences 5 | >>> docs = nlp.pipe(sentences) 6 | """ 7 | 8 | 9 | sentences = [ 10 | "Jaunikis pirmąją vestuvinę naktį iškeitė į areštinės gultą", 11 | "Bepiločiai automobiliai išnaikins vairavimo mokyklas, autoservisus ir eismo nelaimes", 12 | "Vilniuje galvojama uždrausti naudoti skėčius", 13 | "Londonas yra didelis miestas Jungtinėje Karalystėje", 14 | "Kur tu?", 15 | "Kas yra Prancūzijos prezidentas?", 16 | "Kokia yra Jungtinių Amerikos Valstijų sostinė?", 17 | "Kada gimė Dalia Grybauskaitė?", 18 | ] 19 | -------------------------------------------------------------------------------- /spacy/lang/lt/tokenizer_exceptions.py: -------------------------------------------------------------------------------- 1 | from ...symbols import ORTH 2 | from ...util import update_exc 3 | from ..tokenizer_exceptions import BASE_EXCEPTIONS 4 | 5 | _exc = {} 6 | 7 | for orth in ["n-tosios", "?!"]: 8 | _exc[orth] = [{ORTH: orth}] 9 | 10 | mod_base_exceptions = { 11 | exc: val for exc, val in BASE_EXCEPTIONS.items() if not exc.endswith(".") 12 | } 13 | del mod_base_exceptions["8)"] 14 | TOKENIZER_EXCEPTIONS = update_exc(mod_base_exceptions, _exc) 15 | -------------------------------------------------------------------------------- /spacy/lang/lv/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .stop_words import STOP_WORDS 3 | 4 | 5 | class LatvianDefaults(BaseDefaults): 6 | stop_words = STOP_WORDS 7 | 8 | 9 | class Latvian(Language): 10 | lang = "lv" 11 | Defaults = LatvianDefaults 12 | 13 | 14 | __all__ = ["Latvian"] 15 | -------------------------------------------------------------------------------- /spacy/lang/ml/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .lex_attrs import LEX_ATTRS 3 | from .stop_words import STOP_WORDS 4 | 5 | 6 | class MalayalamDefaults(BaseDefaults): 7 | lex_attr_getters = LEX_ATTRS 8 | stop_words = STOP_WORDS 9 | 10 | 11 | class Malayalam(Language): 12 | lang = "ml" 13 | Defaults = MalayalamDefaults 14 | 15 | 16 | __all__ = ["Malayalam"] 17 | -------------------------------------------------------------------------------- /spacy/lang/ml/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 3 | 4 | >>> from spacy.lang.ml.examples import sentences 5 | >>> docs = nlp.pipe(sentences) 6 | """ 7 | 8 | 9 | sentences = [ 10 | "അനാവശ്യമായി കണ്ണിലും മൂക്കിലും വായിലും സ്പർശിക്കാതിരിക്കുക", 11 | "പൊതുരംഗത്ത് മലയാള ഭാഷയുടെ സമഗ്രപുരോഗതി ലക്ഷ്യമാക്കി പ്രവർത്തിക്കുന്ന സംഘടനയായ മലയാളഐക്യവേദിയുടെ വിദ്യാർത്ഥിക്കൂട്ടായ്മയാണ് വിദ്യാർത്ഥി മലയാളവേദി", 12 | "എന്താണ്‌ കവാടങ്ങൾ?", 13 | "ചുരുക്കത്തിൽ വിക്കിപീഡിയയുടെ ഉള്ളടക്കത്തിലേക്കുള്ള പടിപ്പുരകളാണ്‌‌ കവാടങ്ങൾ. 
അവ ലളിതവും വായനക്കാരനെ ആകർഷിക്കുന്നതുമായിരിക്കും", 14 | "പതിനൊന്നുപേർ വീതമുള്ള രണ്ടു ടീമുകൾ കളിക്കുന്ന സംഘകായിക വിനോദമാണു ക്രിക്കറ്റ്", 15 | ] 16 | -------------------------------------------------------------------------------- /spacy/lang/ml/stop_words.py: -------------------------------------------------------------------------------- 1 | STOP_WORDS = set( 2 | """ 3 | അത് 4 | ഇത് 5 | ആയിരുന്നു 6 | ആകുന്നു 7 | വരെ 8 | അന്നേരം 9 | അന്ന് 10 | ഇന്ന് 11 | ആണ് 12 | """.split() 13 | ) 14 | -------------------------------------------------------------------------------- /spacy/lang/mr/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .stop_words import STOP_WORDS 3 | 4 | 5 | class MarathiDefaults(BaseDefaults): 6 | stop_words = STOP_WORDS 7 | 8 | 9 | class Marathi(Language): 10 | lang = "mr" 11 | Defaults = MarathiDefaults 12 | 13 | 14 | __all__ = ["Marathi"] 15 | -------------------------------------------------------------------------------- /spacy/lang/ms/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 3 | 4 | >>> from spacy.lang.ms.examples import sentences 5 | >>> docs = nlp.pipe(sentences) 6 | """ 7 | 8 | 9 | sentences = [ 10 | "Malaysia ialah sebuah negara yang terletak di Asia Tenggara.", 11 | "Berapa banyak pelajar yang akan menghadiri majlis perpisahan sekolah?", 12 | "Pengeluaran makanan berasal dari beberapa lokasi termasuk Cameron Highlands, Johor Bahru, dan Kuching.", 13 | "Syarikat XYZ telah menghasilkan 20,000 unit produk baharu dalam setahun terakhir", 14 | "Kuala Lumpur merupakan ibu negara Malaysia.", "Kau berada di mana semalam?", 15 | "Siapa yang akan memimpin projek itu?", 16 | "Siapa perdana menteri Malaysia sekarang?", 17 | ] 18 | -------------------------------------------------------------------------------- /spacy/lang/nb/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 3 | 4 | >>> from spacy.lang.nb.examples import sentences 5 | >>> docs = nlp.pipe(sentences) 6 | """ 7 | 8 | 9 | sentences = [ 10 | "Apple vurderer å kjøpe britisk oppstartfirma for en milliard dollar.", 11 | "Selvkjørende biler flytter forsikringsansvaret over på produsentene.", 12 | "San Francisco vurderer å forby robotbud på fortauene.", 13 | "London er en stor by i Storbritannia.", 14 | ] 15 | -------------------------------------------------------------------------------- /spacy/lang/ne/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .lex_attrs import LEX_ATTRS 3 | from .stop_words import STOP_WORDS 4 | 5 | 6 | class NepaliDefaults(BaseDefaults): 7 | stop_words = STOP_WORDS 8 | lex_attr_getters = LEX_ATTRS 9 | 10 | 11 | class Nepali(Language): 12 | lang = "ne" 13 | Defaults = NepaliDefaults 14 | 15 | 16 | __all__ = ["Nepali"] 17 | -------------------------------------------------------------------------------- /spacy/lang/ne/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 
3 | 4 | >>> from spacy.lang.ne.examples import sentences 5 | >>> docs = nlp.pipe(sentences) 6 | """ 7 | 8 | 9 | sentences = [ 10 | "एप्पलले अमेरिकी स्टार्टअप १ अर्ब डलरमा किन्ने सोच्दै छ", 11 | "स्वायत्त कारहरूले बीमा दायित्व निर्माताहरु तिर बदल्छन्", 12 | "स्यान फ्रांसिस्कोले फुटपाथ वितरण रोबोटहरु प्रतिबंध गर्ने विचार गर्दै छ", 13 | "लन्डन यूनाइटेड किंगडमको एक ठूलो शहर हो।", 14 | "तिमी कहाँ छौ?", 15 | "फ्रान्स को राष्ट्रपति को हो?", 16 | "संयुक्त राज्यको राजधानी के हो?", 17 | "बराक ओबामा कहिले कहिले जन्मेका हुन्?", 18 | ] 19 | -------------------------------------------------------------------------------- /spacy/lang/nl/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 3 | 4 | >>> from spacy.lang.nl.examples import sentences 5 | >>> docs = nlp.pipe(sentences) 6 | """ 7 | 8 | 9 | sentences = [ 10 | "Apple overweegt om voor 1 miljard een U.K. startup te kopen", 11 | "Autonome auto's verschuiven de verzekeringverantwoordelijkheid naar producenten", 12 | "San Francisco overweegt robots op voetpaden te verbieden", 13 | "Londen is een grote stad in het Verenigd Koninkrijk", 14 | ] 15 | -------------------------------------------------------------------------------- /spacy/lang/nn/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from ..nb import SYNTAX_ITERATORS 3 | from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES 4 | from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS 5 | 6 | 7 | class NorwegianNynorskDefaults(BaseDefaults): 8 | tokenizer_exceptions = TOKENIZER_EXCEPTIONS 9 | prefixes = TOKENIZER_PREFIXES 10 | infixes = TOKENIZER_INFIXES 11 | suffixes = TOKENIZER_SUFFIXES 12 | syntax_iterators = SYNTAX_ITERATORS 13 | 14 | 15 | class NorwegianNynorsk(Language): 16 | lang = "nn" 17 | Defaults = NorwegianNynorskDefaults 18 | 19 | 20 | __all__ = ["NorwegianNynorsk"] 21 | -------------------------------------------------------------------------------- /spacy/lang/nn/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 3 | 4 | >>> from spacy.lang.nn.examples import sentences 5 | >>> docs = nlp.pipe(sentences) 6 | """ 7 | 8 | 9 | # sentences taken from Omsetjingsminne frå Nynorsk pressekontor 2022 (https://www.nb.no/sprakbanken/en/resource-catalogue/oai-nb-no-sbr-80/) 10 | sentences = [ 11 | "Konseptet går ut på at alle tre omgangar tel, alle hopparar må stille i kvalifiseringa og poengsummen skal telje.", 12 | "Det er ein meir enn i same periode i fjor.", 13 | "Det har lava ned enorme snømengder i store delar av Europa den siste tida.", 14 | "Akhtar Chaudhry er ikkje innstilt på Oslo-lista til SV, men utfordrar Heikki Holmås om førsteplassen.", 15 | ] 16 | -------------------------------------------------------------------------------- /spacy/lang/pl/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 
3 | 4 | >>> from spacy.lang.pl.examples import sentences 5 | >>> docs = nlp.pipe(sentences) 6 | """ 7 | 8 | 9 | sentences = [ 10 | "Poczuł przyjemną woń mocnej kawy.", 11 | "Istnieje wiele dróg oddziaływania substancji psychoaktywnej na układ nerwowy.", 12 | "Powitał mnie biało-czarny kot, płosząc siedzące na płocie trzy dorodne dudki.", 13 | "Nowy abonament pod lupą Komisji Europejskiej", 14 | "Czy w ciągu ostatnich 48 godzin spożyłeś leki zawierające paracetamol?", 15 | "Kto ma ochotę zapoznać się z innymi niż w książkach przygodami Muminków i ich przyjaciół, temu polecam komiks Tove Jansson „Muminki i morze”.", 16 | ] 17 | -------------------------------------------------------------------------------- /spacy/lang/pt/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .lex_attrs import LEX_ATTRS 3 | from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES 4 | from .stop_words import STOP_WORDS 5 | from .syntax_iterators import SYNTAX_ITERATORS 6 | from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS 7 | 8 | 9 | class PortugueseDefaults(BaseDefaults): 10 | tokenizer_exceptions = TOKENIZER_EXCEPTIONS 11 | infixes = TOKENIZER_INFIXES 12 | prefixes = TOKENIZER_PREFIXES 13 | lex_attr_getters = LEX_ATTRS 14 | syntax_iterators = SYNTAX_ITERATORS 15 | stop_words = STOP_WORDS 16 | 17 | 18 | class Portuguese(Language): 19 | lang = "pt" 20 | Defaults = PortugueseDefaults 21 | 22 | 23 | __all__ = ["Portuguese"] 24 | -------------------------------------------------------------------------------- /spacy/lang/pt/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 3 | 4 | >>> from spacy.lang.pt.examples import sentences 5 | >>> docs = nlp.pipe(sentences) 6 | """ 7 | 8 | 9 | sentences = [ 10 | "Apple está querendo comprar uma startup do Reino Unido por 100 milhões de dólares", 11 | "Carros autônomos empurram a responsabilidade do seguro para os fabricantes." 12 | "São Francisco considera banir os robôs de entrega que andam pelas calçadas", 13 | "Londres é a maior cidade do Reino Unido", 14 | ] 15 | -------------------------------------------------------------------------------- /spacy/lang/pt/punctuation.py: -------------------------------------------------------------------------------- 1 | from ..punctuation import TOKENIZER_INFIXES as BASE_TOKENIZER_INFIXES 2 | from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES 3 | from ..punctuation import TOKENIZER_SUFFIXES as BASE_TOKENIZER_SUFFIXES 4 | 5 | _prefixes = [r"\w{1,3}\$"] + BASE_TOKENIZER_PREFIXES 6 | 7 | _suffixes = BASE_TOKENIZER_SUFFIXES 8 | 9 | _infixes = [r"(\w+-\w+(-\w+)*)"] + BASE_TOKENIZER_INFIXES 10 | 11 | TOKENIZER_PREFIXES = _prefixes 12 | TOKENIZER_SUFFIXES = _suffixes 13 | TOKENIZER_INFIXES = _infixes 14 | -------------------------------------------------------------------------------- /spacy/lang/ro/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 
3 | 4 | >>> from spacy.lang.ro import Romanian 5 | >>> from spacy.lang.ro.examples import sentences 6 | >>> nlp = Romanian() 7 | >>> docs = nlp.pipe(sentences) 8 | """ 9 | 10 | 11 | sentences = [ 12 | "Apple plănuiește să cumpere o companie britanică pentru un miliard de dolari", 13 | "Municipalitatea din San Francisco ia în calcul interzicerea roboților curieri pe trotuar", 14 | "Londra este un oraș mare în Regatul Unit", 15 | "Unde ești?", 16 | "Cine este președintele Franței?", 17 | "Care este capitala Statelor Unite?", 18 | "Când s-a născut Barack Obama?", 19 | ] 20 | -------------------------------------------------------------------------------- /spacy/lang/sa/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .lex_attrs import LEX_ATTRS 3 | from .stop_words import STOP_WORDS 4 | 5 | 6 | class SanskritDefaults(BaseDefaults): 7 | lex_attr_getters = LEX_ATTRS 8 | stop_words = STOP_WORDS 9 | 10 | 11 | class Sanskrit(Language): 12 | lang = "sa" 13 | Defaults = SanskritDefaults 14 | 15 | 16 | __all__ = ["Sanskrit"] 17 | -------------------------------------------------------------------------------- /spacy/lang/sa/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 3 | 4 | >>> from spacy.lang.sa.examples import sentences 5 | >>> docs = nlp.pipe(sentences) 6 | """ 7 | 8 | 9 | sentences = [ 10 | "अभ्यावहति कल्याणं विविधं वाक् सुभाषिता ।", 11 | "मनसि व्याकुले चक्षुः पश्यन्नपि न पश्यति ।", 12 | "यस्य बुद्धिर्बलं तस्य निर्बुद्धेस्तु कुतो बलम्?", 13 | "परो अपि हितवान् बन्धुः बन्धुः अपि अहितः परः ।", 14 | "अहितः देहजः व्याधिः हितम् आरण्यं औषधम् ॥", 15 | ] 16 | -------------------------------------------------------------------------------- /spacy/lang/si/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .lex_attrs import LEX_ATTRS 3 | from .stop_words import STOP_WORDS 4 | 5 | 6 | class SinhalaDefaults(BaseDefaults): 7 | lex_attr_getters = LEX_ATTRS 8 | stop_words = STOP_WORDS 9 | 10 | 11 | class Sinhala(Language): 12 | lang = "si" 13 | Defaults = SinhalaDefaults 14 | 15 | 16 | __all__ = ["Sinhala"] 17 | -------------------------------------------------------------------------------- /spacy/lang/si/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 
3 | 4 | >>> from spacy.lang.si.examples import sentences 5 | >>> docs = nlp.pipe(sentences) 6 | """ 7 | 8 | 9 | sentences = [ 10 | "මෙය වාක්‍යයකි.", 11 | "ඔබ කවුද?", 12 | "ගූගල් සමාගම ඩොලර් මිලියන 500 කට එම ආයතනය මිලදී ගන්නා ලදී.", 13 | "කොළඹ ශ්‍රී ලංකාවේ ප්‍රධානතම නගරය යි.", 14 | "ප්‍රංශයේ ජනාධිපති කවරෙක් ද?", 15 | "මට බිස්කට් 1 ක් දෙන්න", 16 | "ඔවුන් ලකුණු 59 කින් තරඟය පරාජයට පත් විය.", 17 | "1 ත් 10 ත් අතර සංඛ්‍යාවක් කියන්න", 18 | "ඔහු සහ ඇය නුවර හෝ කොළඹ පදිංචි කරුවන් නොවේ", 19 | ] 20 | -------------------------------------------------------------------------------- /spacy/lang/sk/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .lex_attrs import LEX_ATTRS 3 | from .stop_words import STOP_WORDS 4 | 5 | 6 | class SlovakDefaults(BaseDefaults): 7 | lex_attr_getters = LEX_ATTRS 8 | stop_words = STOP_WORDS 9 | 10 | 11 | class Slovak(Language): 12 | lang = "sk" 13 | Defaults = SlovakDefaults 14 | 15 | 16 | __all__ = ["Slovak"] 17 | -------------------------------------------------------------------------------- /spacy/lang/sl/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .lex_attrs import LEX_ATTRS 3 | from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES 4 | from .stop_words import STOP_WORDS 5 | from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS 6 | 7 | 8 | class SlovenianDefaults(BaseDefaults): 9 | stop_words = STOP_WORDS 10 | tokenizer_exceptions = TOKENIZER_EXCEPTIONS 11 | prefixes = TOKENIZER_PREFIXES 12 | infixes = TOKENIZER_INFIXES 13 | suffixes = TOKENIZER_SUFFIXES 14 | lex_attr_getters = LEX_ATTRS 15 | 16 | 17 | class Slovenian(Language): 18 | lang = "sl" 19 | Defaults = SlovenianDefaults 20 | 21 | 22 | __all__ = ["Slovenian"] 23 | -------------------------------------------------------------------------------- /spacy/lang/sl/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 3 | 4 | >>> from spacy.lang.sl.examples import sentences 5 | >>> docs = nlp.pipe(sentences) 6 | """ 7 | 8 | 9 | sentences = [ 10 | "Apple načrtuje nakup britanskega startupa za 1 bilijon dolarjev", 11 | "France Prešeren je umrl 8. februarja 1849 v Kranju", 12 | "Staro ljubljansko letališče Moste bo obnovila družba BTC", 13 | "London je največje mesto v Združenem kraljestvu.", 14 | "Kje se skrivaš?", 15 | "Kdo je predsednik Francije?", 16 | "Katero je glavno mesto Združenih držav Amerike?", 17 | "Kdaj je bil rojen Milan Kučan?", 18 | ] 19 | -------------------------------------------------------------------------------- /spacy/lang/sq/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .stop_words import STOP_WORDS 3 | 4 | 5 | class AlbanianDefaults(BaseDefaults): 6 | stop_words = STOP_WORDS 7 | 8 | 9 | class Albanian(Language): 10 | lang = "sq" 11 | Defaults = AlbanianDefaults 12 | 13 | 14 | __all__ = ["Albanian"] 15 | -------------------------------------------------------------------------------- /spacy/lang/sq/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 
3 | 4 | >>> from spacy.lang.sq.examples import sentences 5 | >>> docs = nlp.pipe(sentences) 6 | """ 7 | 8 | 9 | sentences = [ 10 | "Apple po shqyrton blerjen e nje shoqërie të U.K. për 1 miliard dollarë", 11 | "Makinat autonome ndryshojnë përgjegjësinë e sigurimit ndaj prodhuesve", 12 | "San Francisko konsideron ndalimin e robotëve të shpërndarjes", 13 | "Londra është një qytet i madh në Mbretërinë e Bashkuar.", 14 | ] 15 | -------------------------------------------------------------------------------- /spacy/lang/sr/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .lex_attrs import LEX_ATTRS 3 | from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES 4 | from .stop_words import STOP_WORDS 5 | from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS 6 | 7 | 8 | class SerbianDefaults(BaseDefaults): 9 | tokenizer_exceptions = TOKENIZER_EXCEPTIONS 10 | infixes = TOKENIZER_INFIXES 11 | suffixes = TOKENIZER_SUFFIXES 12 | lex_attr_getters = LEX_ATTRS 13 | stop_words = STOP_WORDS 14 | 15 | 16 | class Serbian(Language): 17 | lang = "sr" 18 | Defaults = SerbianDefaults 19 | 20 | 21 | __all__ = ["Serbian"] 22 | -------------------------------------------------------------------------------- /spacy/lang/sr/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 3 | 4 | >>> from spacy.lang.sr.examples import sentences 5 | >>> docs = nlp.pipe(sentences) 6 | """ 7 | 8 | 9 | sentences = [ 10 | # Translations from English 11 | "Apple планира куповину америчког стартапа за $1 милијарду.", 12 | "Беспилотни аутомобили пребацују одговорност осигурања на произвођаче.", 13 | "Лондон је велики град у Уједињеном Краљевству.", 14 | "Где си ти?", 15 | "Ко је председник Француске?", 16 | # Serbian common and slang 17 | "Moj ћале је инжењер!", 18 | "Новак Ђоковић је најбољи тенисер света.", 19 | "У Пироту има добрих кафана!", 20 | "Музеј Николе Тесле се налази у Београду.", 21 | ] 22 | -------------------------------------------------------------------------------- /spacy/lang/sv/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 
3 | 4 | >>> from spacy.lang.sv.examples import sentences 5 | >>> docs = nlp.pipe(sentences) 6 | """ 7 | 8 | 9 | sentences = [ 10 | "Apple överväger att köpa brittisk startup för 1 miljard dollar.", 11 | "Självkörande bilar förskjuter försäkringsansvar mot tillverkare.", 12 | "San Fransisco överväger förbud mot leveransrobotar på trottoarer.", 13 | "London är en storstad i Storbritannien.", 14 | ] 15 | -------------------------------------------------------------------------------- /spacy/lang/ta/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .lex_attrs import LEX_ATTRS 3 | from .stop_words import STOP_WORDS 4 | 5 | 6 | class TamilDefaults(BaseDefaults): 7 | lex_attr_getters = LEX_ATTRS 8 | stop_words = STOP_WORDS 9 | 10 | 11 | class Tamil(Language): 12 | lang = "ta" 13 | Defaults = TamilDefaults 14 | 15 | 16 | __all__ = ["Tamil"] 17 | -------------------------------------------------------------------------------- /spacy/lang/te/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .lex_attrs import LEX_ATTRS 3 | from .stop_words import STOP_WORDS 4 | 5 | 6 | class TeluguDefaults(BaseDefaults): 7 | lex_attr_getters = LEX_ATTRS 8 | stop_words = STOP_WORDS 9 | 10 | 11 | class Telugu(Language): 12 | lang = "te" 13 | Defaults = TeluguDefaults 14 | 15 | 16 | __all__ = ["Telugu"] 17 | -------------------------------------------------------------------------------- /spacy/lang/te/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 3 | 4 | >>> from spacy.lang.te import Telugu 5 | >>> nlp = Telugu() 6 | >>> from spacy.lang.te.examples import sentences 7 | >>> docs = nlp.pipe(sentences) 8 | """ 9 | 10 | 11 | sentences = [ 12 | "ఆపిల్ 1 బిలియన్ డాలర్స్ కి యూ.కె. 
స్టార్ట్అప్ ని కొనాలని అనుకుంటుంది.", 13 | "ఆటోనోమోస్ కార్లు భీమా బాధ్యతను తయారీదారులపైకి మళ్లిస్తాయి.", 14 | "సాన్ ఫ్రాన్సిస్కో కాలిబాట డెలివరీ రోబోట్లను నిషేధించడానికి ఆలోచిస్తుంది.", 15 | "లండన్ యునైటెడ్ కింగ్డమ్ లో పెద్ద సిటీ.", 16 | "నువ్వు ఎక్కడ ఉన్నావ్?", 17 | "ఫ్రాన్స్ అధ్యక్షుడు ఎవరు?", 18 | "యునైటెడ్ స్టేట్స్ యొక్క రాజధాని ఏంటి?", 19 | "బరాక్ ఒబామా ఎప్పుడు జన్మించారు?", 20 | ] 21 | -------------------------------------------------------------------------------- /spacy/lang/te/stop_words.py: -------------------------------------------------------------------------------- 1 | # Source: https://github.com/Xangis/extra-stopwords (MIT License) 2 | 3 | STOP_WORDS = set( 4 | """ 5 | అందరూ 6 | అందుబాటులో 7 | అడగండి 8 | అడగడం 9 | అడ్డంగా 10 | అనుగుణంగా 11 | అనుమతించు 12 | అనుమతిస్తుంది 13 | అయితే 14 | ఇప్పటికే 15 | ఉన్నారు 16 | ఎక్కడైనా 17 | ఎప్పుడు 18 | ఎవరైనా 19 | ఎవరో ఒకరు 20 | ఏ 21 | ఏదైనా 22 | ఏమైనప్పటికి 23 | ఏమైనప్పటికి 24 | ఒక 25 | ఒక ప్రక్కన 26 | కనిపిస్తాయి 27 | కాదు 28 | కాదు 29 | కూడా 30 | గా 31 | గురించి 32 | చుట్టూ 33 | చేయగలిగింది 34 | తగిన 35 | తర్వాత 36 | తర్వాత 37 | దాదాపు 38 | దూరంగా 39 | నిజంగా 40 | పై 41 | ప్రకారం 42 | మధ్య 43 | మధ్య 44 | మరియు 45 | మరొక 46 | మళ్ళీ 47 | మాత్రమే 48 | మెచ్చుకో 49 | వద్ద 50 | వద్ద 51 | వెంట 52 | వేరుగా 53 | వ్యతిరేకంగా 54 | సంబంధం 55 | """.split() 56 | ) 57 | -------------------------------------------------------------------------------- /spacy/lang/ti/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 3 | 4 | >>> from spacy.lang.ti.examples import sentences 5 | >>> docs = nlp.pipe(sentences) 6 | """ 7 | 8 | 9 | sentences = [ 10 | "አፕል ብዩኬ ትርከብ ንግድ ብ1 ቢሊዮን ዶላር ንምግዛዕ ሐሲባ።", 11 | "ፈላማይ ክታበት ኮቪድ 19 ተጀሚሩ፤ሓዱሽ ተስፋ ሂቡ ኣሎ", 12 | "ቻንስለር ጀርመን ኣንገላ መርከል ዝርግሓ ቫይረስ ኮሮና ንምክልካል ጽኑዕ እገዳ ክግበር ጸዊዓ", 13 | "ለንደን ብዓዲ እንግሊዝ ትርከብ ዓባይ ከተማ እያ።", 14 | "ናበይ አለኻ፧", 15 | "ናይ ፈረንሳይ ፕሬዝዳንት መን እዩ፧", 16 | "ናይ አሜሪካ ዋና ከተማ እንታይ እያ፧", 17 | "ኦባማ መዓስ ተወሊዱ፧", 18 | ] 19 | -------------------------------------------------------------------------------- /spacy/lang/ti/punctuation.py: -------------------------------------------------------------------------------- 1 | from ..char_classes import ( 2 | ALPHA_UPPER, 3 | CURRENCY, 4 | LIST_ELLIPSES, 5 | LIST_PUNCT, 6 | LIST_QUOTES, 7 | UNITS, 8 | ) 9 | 10 | _list_punct = LIST_PUNCT + "፡ ። ፣ ፤ ፥ ፦ ፧ ፠ ፨".strip().split() 11 | 12 | _suffixes = ( 13 | _list_punct 14 | + LIST_ELLIPSES 15 | + LIST_QUOTES 16 | + [ 17 | r"(?<=[0-9])\+", 18 | # Tigrinya is written from Left-To-Right 19 | r"(?<=[0-9])(?:{c})".format(c=CURRENCY), 20 | r"(?<=[0-9])(?:{u})".format(u=UNITS), 21 | r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER), 22 | ] 23 | ) 24 | 25 | TOKENIZER_SUFFIXES = _suffixes 26 | -------------------------------------------------------------------------------- /spacy/lang/ti/tokenizer_exceptions.py: -------------------------------------------------------------------------------- 1 | from ...symbols import NORM, ORTH 2 | 3 | _exc = {} 4 | 5 | 6 | for exc_data in [ 7 | {ORTH: "ት/ቤት"}, 8 | {ORTH: "ወ/ሮ", NORM: "ወይዘሮ"}, 9 | {ORTH: "ወ/ሪ", NORM: "ወይዘሪት"}, 10 | ]: 11 | _exc[exc_data[ORTH]] = [exc_data] 12 | 13 | 14 | for orth in [ 15 | "ዓ.ም.", 16 | "ኪ.ሜ.", 17 | ]: 18 | _exc[orth] = [{ORTH: orth}] 19 | 20 | 21 | TOKENIZER_EXCEPTIONS = _exc 22 | -------------------------------------------------------------------------------- /spacy/lang/tl/__init__.py: -------------------------------------------------------------------------------- 1 | from 
...language import BaseDefaults, Language 2 | from .lex_attrs import LEX_ATTRS 3 | from .stop_words import STOP_WORDS 4 | from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS 5 | 6 | 7 | class TagalogDefaults(BaseDefaults): 8 | tokenizer_exceptions = TOKENIZER_EXCEPTIONS 9 | lex_attr_getters = LEX_ATTRS 10 | stop_words = STOP_WORDS 11 | 12 | 13 | class Tagalog(Language): 14 | lang = "tl" 15 | Defaults = TagalogDefaults 16 | 17 | 18 | __all__ = ["Tagalog"] 19 | -------------------------------------------------------------------------------- /spacy/lang/tn/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .lex_attrs import LEX_ATTRS 3 | from .punctuation import TOKENIZER_INFIXES 4 | from .stop_words import STOP_WORDS 5 | 6 | 7 | class SetswanaDefaults(BaseDefaults): 8 | infixes = TOKENIZER_INFIXES 9 | stop_words = STOP_WORDS 10 | lex_attr_getters = LEX_ATTRS 11 | 12 | 13 | class Setswana(Language): 14 | lang = "tn" 15 | Defaults = SetswanaDefaults 16 | 17 | 18 | __all__ = ["Setswana"] 19 | -------------------------------------------------------------------------------- /spacy/lang/tn/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 3 | >>> from spacy.lang.tn.examples import sentences 4 | >>> docs = nlp.pipe(sentences) 5 | """ 6 | 7 | 8 | sentences = [ 9 | "Apple e nyaka go reka JSE ka tlhwatlhwa ta R1 billion", 10 | "Johannesburg ke toropo e kgolo mo Afrika Borwa.", 11 | "O ko kae?", 12 | "ke mang presidente ya Afrika Borwa?", 13 | "ke eng toropo kgolo ya Afrika Borwa?", 14 | "Nelson Mandela o belegwe leng?", 15 | ] 16 | -------------------------------------------------------------------------------- /spacy/lang/tn/punctuation.py: -------------------------------------------------------------------------------- 1 | from ..char_classes import ( 2 | ALPHA, 3 | ALPHA_LOWER, 4 | ALPHA_UPPER, 5 | CONCAT_QUOTES, 6 | HYPHENS, 7 | LIST_ELLIPSES, 8 | LIST_ICONS, 9 | ) 10 | 11 | _infixes = ( 12 | LIST_ELLIPSES 13 | + LIST_ICONS 14 | + [ 15 | r"(?<=[0-9])[+\-\*^](?=[0-9-])", 16 | r"(?<=[{al}{q}])\.(?=[{au}{q}])".format( 17 | al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES 18 | ), 19 | r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA), 20 | r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS), 21 | r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA), 22 | ] 23 | ) 24 | 25 | 26 | TOKENIZER_INFIXES = _infixes 27 | -------------------------------------------------------------------------------- /spacy/lang/tr/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .lex_attrs import LEX_ATTRS 3 | from .stop_words import STOP_WORDS 4 | from .syntax_iterators import SYNTAX_ITERATORS 5 | from .tokenizer_exceptions import TOKEN_MATCH, TOKENIZER_EXCEPTIONS 6 | 7 | 8 | class TurkishDefaults(BaseDefaults): 9 | tokenizer_exceptions = TOKENIZER_EXCEPTIONS 10 | lex_attr_getters = LEX_ATTRS 11 | stop_words = STOP_WORDS 12 | token_match = TOKEN_MATCH 13 | syntax_iterators = SYNTAX_ITERATORS 14 | 15 | 16 | class Turkish(Language): 17 | lang = "tr" 18 | Defaults = TurkishDefaults 19 | 20 | 21 | __all__ = ["Turkish"] 22 | -------------------------------------------------------------------------------- /spacy/lang/tt/__init__.py: -------------------------------------------------------------------------------- 1 | from 
...language import BaseDefaults, Language 2 | from .lex_attrs import LEX_ATTRS 3 | from .punctuation import TOKENIZER_INFIXES 4 | from .stop_words import STOP_WORDS 5 | from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS 6 | 7 | 8 | class TatarDefaults(BaseDefaults): 9 | tokenizer_exceptions = TOKENIZER_EXCEPTIONS 10 | infixes = TOKENIZER_INFIXES 11 | lex_attr_getters = LEX_ATTRS 12 | stop_words = STOP_WORDS 13 | 14 | 15 | class Tatar(Language): 16 | lang = "tt" 17 | Defaults = TatarDefaults 18 | 19 | 20 | __all__ = ["Tatar"] 21 | -------------------------------------------------------------------------------- /spacy/lang/tt/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 3 | >>> from spacy.lang.tt.examples import sentences 4 | >>> docs = nlp.pipe(sentences) 5 | """ 6 | 7 | sentences = [ 8 | "Apple Бөекбритания стартабын $1 миллиард өчен сатып алыун исәпли.", 9 | "Автоном автомобильләр иминият җаваплылыкны җитештерүчеләргә күчерә.", 10 | "Сан-Франциско тротуар буенча йөри торган робот-курьерларны тыю мөмкинлеген карый.", 11 | "Лондон - Бөекбританиядә урнашкан зур шәһәр.", 12 | "Син кайда?", 13 | "Францияда кем президент?", 14 | "Америка Кушма Штатларының башкаласы нинди шәһәр?", 15 | "Барак Обама кайчан туган?", 16 | ] 17 | -------------------------------------------------------------------------------- /spacy/lang/ur/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .lex_attrs import LEX_ATTRS 3 | from .punctuation import TOKENIZER_SUFFIXES 4 | from .stop_words import STOP_WORDS 5 | 6 | 7 | class UrduDefaults(BaseDefaults): 8 | suffixes = TOKENIZER_SUFFIXES 9 | lex_attr_getters = LEX_ATTRS 10 | stop_words = STOP_WORDS 11 | writing_system = {"direction": "rtl", "has_case": False, "has_letters": True} 12 | 13 | 14 | class Urdu(Language): 15 | lang = "ur" 16 | Defaults = UrduDefaults 17 | 18 | 19 | __all__ = ["Urdu"] 20 | -------------------------------------------------------------------------------- /spacy/lang/ur/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 3 | 4 | >>> from spacy.lang.da.examples import sentences 5 | >>> docs = nlp.pipe(sentences) 6 | """ 7 | 8 | 9 | sentences = [ 10 | "اردو ہے جس کا نام ہم جانتے ہیں داغ", 11 | "سارے جہاں میں دھوم ہماری زباں کی ہے", 12 | ] 13 | -------------------------------------------------------------------------------- /spacy/lang/ur/punctuation.py: -------------------------------------------------------------------------------- 1 | from ..punctuation import TOKENIZER_SUFFIXES 2 | 3 | _suffixes = TOKENIZER_SUFFIXES 4 | -------------------------------------------------------------------------------- /spacy/lang/vi/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 
3 | >>> from spacy.lang.vi.examples import sentences 4 | >>> docs = nlp.pipe(sentences) 5 | """ 6 | 7 | 8 | sentences = [ 9 | "Đây là đâu, tôi là ai?", 10 | "Căn phòng có nhiều cửa sổ nên nó khá sáng", 11 | "Đại dịch COVID vừa qua đã gây ảnh hưởng rất lớn tới nhiều doanh nghiệp lớn nhỏ.", 12 | "Thành phố Hồ Chí Minh đã bị ảnh hưởng nặng nề trong thời gian vừa qua.", 13 | "Ông bạn đang ở đâu thế?", 14 | "Ai là người giải phóng đất nước Việt Nam khỏi ách đô hộ?", 15 | "Vị tướng nào là người đã làm nên chiến thắng lịch sử Điện Biên Phủ?", 16 | "Làm việc nhiều chán quá, đi chơi đâu đi?", 17 | ] 18 | -------------------------------------------------------------------------------- /spacy/lang/xx/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import Language 2 | 3 | 4 | class MultiLanguage(Language): 5 | """Language class to be used for models that support multiple languages. 6 | This module allows models to specify their language ID as 'xx'. 7 | """ 8 | 9 | lang = "xx" 10 | 11 | 12 | __all__ = ["MultiLanguage"] 13 | -------------------------------------------------------------------------------- /spacy/lang/yo/__init__.py: -------------------------------------------------------------------------------- 1 | from ...language import BaseDefaults, Language 2 | from .lex_attrs import LEX_ATTRS 3 | from .stop_words import STOP_WORDS 4 | 5 | 6 | class YorubaDefaults(BaseDefaults): 7 | lex_attr_getters = LEX_ATTRS 8 | stop_words = STOP_WORDS 9 | 10 | 11 | class Yoruba(Language): 12 | lang = "yo" 13 | Defaults = YorubaDefaults 14 | 15 | 16 | __all__ = ["Yoruba"] 17 | -------------------------------------------------------------------------------- /spacy/lang/yo/stop_words.py: -------------------------------------------------------------------------------- 1 | # stop words as whitespace-separated list. 2 | # Source: https://raw.githubusercontent.com/dohliam/more-stoplists/master/yo/yo.txt 3 | 4 | STOP_WORDS = set( 5 | "a an b bá bí bẹ̀rẹ̀ d e f fún fẹ́ g gbogbo i inú j jù jẹ jẹ́ k kan kì kí kò " 6 | "l láti lè lọ m mi mo máa mọ̀ n ni náà ní nígbà nítorí nǹkan o p padà pé " 7 | "púpọ̀ pẹ̀lú r rẹ̀ s sì sí sínú t ti tí u w wà wá wọn wọ́n y yìí à àti àwọn á " 8 | "è é ì í ò òun ó ù ú ń ńlá ǹ ̀ ́ ̣ ṣ ṣe ṣé ṣùgbọ́n ẹ ẹmọ́ ọ ọjọ́ ọ̀pọ̀lọpọ̀".split() 9 | ) 10 | -------------------------------------------------------------------------------- /spacy/lang/zh/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example sentences to test spaCy and its language models. 
3 | 4 | >>> from spacy.lang.zh.examples import sentences 5 | >>> docs = nlp.pipe(sentences) 6 | """ 7 | 8 | # from https://zh.wikipedia.org/wiki/汉语 9 | sentences = [ 10 | "作为语言而言,为世界使用人数最多的语言,目前世界有五分之一人口做为母语。", 11 | "汉语有多种分支,当中官话最为流行,为中华人民共和国的国家通用语言(又称为普通话)、以及中华民国的国语。", 12 | "此外,中文还是联合国正式语文,并被上海合作组织等国际组织采用为官方语言。", 13 | "在中国大陆,汉语通称为“汉语”。", 14 | "在联合国、台湾、香港及澳门,通称为“中文”。", 15 | "在新加坡及马来西亚,通称为“华语”。", 16 | ] 17 | -------------------------------------------------------------------------------- /spacy/matcher/__init__.py: -------------------------------------------------------------------------------- 1 | from .dependencymatcher import DependencyMatcher 2 | from .levenshtein import levenshtein 3 | from .matcher import Matcher 4 | from .phrasematcher import PhraseMatcher 5 | 6 | __all__ = ["DependencyMatcher", "Matcher", "PhraseMatcher", "levenshtein"] 7 | -------------------------------------------------------------------------------- /spacy/matcher/phrasematcher.pxd: -------------------------------------------------------------------------------- 1 | from cymem.cymem cimport Pool 2 | from libcpp.vector cimport vector 3 | from preshed.maps cimport MapStruct, key_t 4 | 5 | from ..attrs cimport attr_id_t 6 | from ..structs cimport SpanC 7 | from ..tokens.doc cimport Doc 8 | from ..vocab cimport Vocab 9 | 10 | 11 | cdef class PhraseMatcher: 12 | cdef readonly Vocab vocab 13 | cdef attr_id_t attr 14 | cdef object _callbacks 15 | cdef object _docs 16 | cdef bint _validate 17 | cdef MapStruct* c_map 18 | cdef Pool mem 19 | cdef key_t _terminal_hash 20 | 21 | cdef void find_matches(self, Doc doc, int start_idx, int end_idx, vector[SpanC] *matches) nogil 22 | -------------------------------------------------------------------------------- /spacy/ml/__init__.py: -------------------------------------------------------------------------------- 1 | from .callbacks import create_models_with_nvtx_range # noqa: F401 2 | from .models import * # noqa: F401, F403 3 | -------------------------------------------------------------------------------- /spacy/ml/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .entity_linker import * # noqa 2 | from .multi_task import * # noqa 3 | from .parser import * # noqa 4 | from .span_finder import * # noqa 5 | from .spancat import * # noqa 6 | from .tagger import * # noqa 7 | from .textcat import * # noqa 8 | from .tok2vec import * # noqa 9 | -------------------------------------------------------------------------------- /spacy/parts_of_speech.pxd: -------------------------------------------------------------------------------- 1 | from . 
cimport symbols 2 | 3 | 4 | cpdef enum univ_pos_t: 5 | NO_TAG = 0 6 | ADJ = symbols.ADJ 7 | ADP 8 | ADV 9 | AUX 10 | CONJ 11 | CCONJ # U20 12 | DET 13 | INTJ 14 | NOUN 15 | NUM 16 | PART 17 | PRON 18 | PROPN 19 | PUNCT 20 | SCONJ 21 | SYM 22 | VERB 23 | X 24 | EOL 25 | SPACE 26 | -------------------------------------------------------------------------------- /spacy/parts_of_speech.pyx: -------------------------------------------------------------------------------- 1 | # cython: profile=False 2 | IDS = { 3 | "": NO_TAG, 4 | "ADJ": ADJ, 5 | "ADP": ADP, 6 | "ADV": ADV, 7 | "AUX": AUX, 8 | "CONJ": CONJ, # U20 9 | "CCONJ": CCONJ, 10 | "DET": DET, 11 | "INTJ": INTJ, 12 | "NOUN": NOUN, 13 | "NUM": NUM, 14 | "PART": PART, 15 | "PRON": PRON, 16 | "PROPN": PROPN, 17 | "PUNCT": PUNCT, 18 | "SCONJ": SCONJ, 19 | "SYM": SYM, 20 | "VERB": VERB, 21 | "X": X, 22 | "EOL": EOL, 23 | "SPACE": SPACE 24 | } 25 | 26 | 27 | NAMES = {value: key for key, value in IDS.items()} 28 | 29 | # As of Cython 3.1, the global Python namespace no longer has the enum 30 | # contents by default. 31 | globals().update(IDS) 32 | 33 | -------------------------------------------------------------------------------- /spacy/pipeline/_edit_tree_internals/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/pipeline/_edit_tree_internals/__init__.py -------------------------------------------------------------------------------- /spacy/pipeline/_parser_internals/__init__.pxd: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/pipeline/_parser_internals/__init__.pxd -------------------------------------------------------------------------------- /spacy/pipeline/_parser_internals/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/pipeline/_parser_internals/__init__.py -------------------------------------------------------------------------------- /spacy/pipeline/_parser_internals/_beam_utils.pxd: -------------------------------------------------------------------------------- 1 | from ...typedefs cimport class_t, hash_t 2 | 3 | 4 | # These are passed as callbacks to thinc.search.Beam 5 | cdef int transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1 6 | 7 | cdef int check_final_state(void* _state, void* extra_args) except -1 8 | -------------------------------------------------------------------------------- /spacy/pipeline/_parser_internals/_state.pyx: -------------------------------------------------------------------------------- 1 | # cython: profile=False 2 | -------------------------------------------------------------------------------- /spacy/pipeline/_parser_internals/arc_eager.pxd: -------------------------------------------------------------------------------- 1 | from ...typedefs cimport attr_t, weight_t 2 | from ._state cimport StateC 3 | from .transition_system cimport Transition, TransitionSystem 4 | 5 | 6 | cdef class ArcEager(TransitionSystem): 7 | cdef get_arcs(self, StateC* state) 8 | -------------------------------------------------------------------------------- /spacy/pipeline/_parser_internals/ner.pxd: -------------------------------------------------------------------------------- 1 | from 
.transition_system cimport TransitionSystem 2 | 3 | 4 | cdef class BiluoPushDown(TransitionSystem): 5 | pass 6 | -------------------------------------------------------------------------------- /spacy/pipeline/_parser_internals/nonproj.hh: -------------------------------------------------------------------------------- 1 | #ifndef NONPROJ_HH 2 | #define NONPROJ_HH 3 | 4 | #include 5 | #include 6 | 7 | void raise_domain_error(std::string const &msg) { 8 | throw std::domain_error(msg); 9 | } 10 | 11 | #endif // NONPROJ_HH 12 | -------------------------------------------------------------------------------- /spacy/pipeline/_parser_internals/nonproj.pxd: -------------------------------------------------------------------------------- 1 | from libcpp.string cimport string 2 | 3 | 4 | cdef extern from "nonproj.hh": 5 | cdef void raise_domain_error(const string& msg) nogil except + 6 | -------------------------------------------------------------------------------- /spacy/pipeline/legacy/__init__.py: -------------------------------------------------------------------------------- 1 | from .entity_linker import EntityLinker_v1 2 | 3 | __all__ = ["EntityLinker_v1"] 4 | -------------------------------------------------------------------------------- /spacy/pipeline/pipe.pxd: -------------------------------------------------------------------------------- 1 | cdef class Pipe: 2 | cdef public str name 3 | -------------------------------------------------------------------------------- /spacy/pipeline/trainable_pipe.pxd: -------------------------------------------------------------------------------- 1 | from ..vocab cimport Vocab 2 | from .pipe cimport Pipe 3 | 4 | 5 | cdef class TrainablePipe(Pipe): 6 | cdef public Vocab vocab 7 | cdef public object model 8 | cdef public object cfg 9 | cdef public object scorer 10 | -------------------------------------------------------------------------------- /spacy/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/py.typed -------------------------------------------------------------------------------- /spacy/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/__init__.py -------------------------------------------------------------------------------- /spacy/tests/doc/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/doc/__init__.py -------------------------------------------------------------------------------- /spacy/tests/enable_gpu.py: -------------------------------------------------------------------------------- 1 | from spacy import require_gpu 2 | 3 | require_gpu() 4 | -------------------------------------------------------------------------------- /spacy/tests/lang/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/af/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/af/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/am/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/am/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/am/test_exception.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/am/test_exception.py -------------------------------------------------------------------------------- /spacy/tests/lang/ar/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/ar/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/ar/test_exceptions.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.mark.parametrize("text", ["ق.م", "إلخ", "ص.ب", "ت."]) 5 | def test_ar_tokenizer_handles_abbr(ar_tokenizer, text): 6 | tokens = ar_tokenizer(text) 7 | assert len(tokens) == 1 8 | 9 | 10 | def test_ar_tokenizer_handles_exc_in_text(ar_tokenizer): 11 | text = "تعود الكتابة الهيروغليفية إلى سنة 3200 ق.م" 12 | tokens = ar_tokenizer(text) 13 | assert len(tokens) == 7 14 | assert tokens[6].text == "ق.م" 15 | 16 | 17 | def test_ar_tokenizer_handles_exc_in_text_2(ar_tokenizer): 18 | text = "يبلغ طول مضيق طارق 14كم " 19 | tokens = ar_tokenizer(text) 20 | assert len(tokens) == 6 21 | -------------------------------------------------------------------------------- /spacy/tests/lang/ar/test_text.py: -------------------------------------------------------------------------------- 1 | def test_ar_tokenizer_handles_long_text(ar_tokenizer): 2 | text = """نجيب محفوظ مؤلف و كاتب روائي عربي، يعد من أهم الأدباء العرب خلال القرن العشرين. 3 | ولد نجيب محفوظ في مدينة القاهرة، حيث ترعرع و تلقى تعليمه الجامعي في جامعتها، 4 | فتمكن من نيل شهادة في الفلسفة. ألف محفوظ على مدار حياته الكثير من الأعمال الأدبية، و في مقدمتها ثلاثيته الشهيرة. 5 | و قد نجح في الحصول على جائزة نوبل للآداب، ليكون بذلك العربي الوحيد الذي فاز بها.""" 6 | 7 | tokens = ar_tokenizer(text) 8 | assert tokens[3].is_stop is True 9 | assert len(tokens) == 77 10 | -------------------------------------------------------------------------------- /spacy/tests/lang/bg/test_tokenizer.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | def test_bg_tokenizer_handles_final_diacritics(bg_tokenizer): 5 | text = "Ня̀маше яйца̀. Ня̀маше яйца̀." 6 | tokens = bg_tokenizer(text) 7 | assert tokens[1].text == "яйца̀" 8 | assert tokens[2].text == "." 
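# --- Illustrative sketch (added for clarity; not part of the original test file) ---
# The bg_tokenizer fixture used above is provided by the test suite's conftest.py.
# An equivalent standalone check can be written with spacy.blank, a documented
# spaCy entry point; the sentence is taken from the test above.
def _demo_bg_final_diacritics():
    import spacy

    nlp = spacy.blank("bg")
    doc = nlp("Ня̀маше яйца̀.")
    # The trailing combining diacritic stays on the token; the period is split off.
    return [t.text for t in doc] == ["Ня̀маше", "яйца̀", "."]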
9 | -------------------------------------------------------------------------------- /spacy/tests/lang/bn/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/bn/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/bo/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/bo/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/bo/test_text.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.mark.parametrize( 5 | "text,match", 6 | [ 7 | ("10", True), 8 | ("1", True), 9 | ("999.0", True), 10 | ("གཅིག་", True), 11 | ("གཉིས་", True), 12 | ("ཀླད་ཀོར་", True), 13 | ("བཅུ་གཅིག་", True), 14 | ("ཁྱི་", False), 15 | (",", False), 16 | ], 17 | ) 18 | def test_lex_attrs_like_number(bo_tokenizer, text, match): 19 | tokens = bo_tokenizer(text) 20 | assert len(tokens) == 1 21 | assert tokens[0].like_num == match 22 | -------------------------------------------------------------------------------- /spacy/tests/lang/ca/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/ca/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/ca/test_exception.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.mark.parametrize( 5 | "text,lemma", 6 | [("aprox.", "aproximadament"), ("pàg.", "pàgina"), ("p.ex.", "per exemple")], 7 | ) 8 | def test_ca_tokenizer_handles_abbr(ca_tokenizer, text, lemma): 9 | tokens = ca_tokenizer(text) 10 | assert len(tokens) == 1 11 | 12 | 13 | def test_ca_tokenizer_handles_exc_in_text(ca_tokenizer): 14 | text = "La Dra. Puig viu a la pl. dels Til·lers." 
15 | doc = ca_tokenizer(text) 16 | assert [t.text for t in doc] == [ 17 | "La", 18 | "Dra.", 19 | "Puig", 20 | "viu", 21 | "a", 22 | "la", 23 | "pl.", 24 | "d", 25 | "els", 26 | "Til·lers", 27 | ".", 28 | ] 29 | -------------------------------------------------------------------------------- /spacy/tests/lang/ca/test_prefix_suffix_infix.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.mark.parametrize( 5 | "text,expected_tokens", 6 | [ 7 | ("d'un", ["d'", "un"]), 8 | ("s'ha", ["s'", "ha"]), 9 | ("del", ["d", "el"]), 10 | ("cantar-te", ["cantar", "-te"]), 11 | ("-hola", ["-", "hola"]), 12 | ], 13 | ) 14 | def test_contractions(ca_tokenizer, text, expected_tokens): 15 | """Test that the contractions are split into two tokens""" 16 | tokens = ca_tokenizer(text) 17 | assert len(tokens) == 2 18 | assert [t.text for t in tokens] == expected_tokens 19 | -------------------------------------------------------------------------------- /spacy/tests/lang/cs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/cs/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/cs/test_text.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.mark.parametrize( 5 | "text,match", 6 | [ 7 | ("10", True), 8 | ("1", True), 9 | ("10.000", True), 10 | ("1000", True), 11 | ("999,0", True), 12 | ("devatenáct", True), 13 | ("osmdesát", True), 14 | ("kvadrilion", True), 15 | ("Pes", False), 16 | (",", False), 17 | ("1/2", True), 18 | ], 19 | ) 20 | def test_lex_attrs_like_number(cs_tokenizer, text, match): 21 | tokens = cs_tokenizer(text) 22 | assert len(tokens) == 1 23 | assert tokens[0].like_num == match 24 | -------------------------------------------------------------------------------- /spacy/tests/lang/da/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/da/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/de/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/de/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/de/test_exceptions.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.mark.parametrize("text", ["auf'm", "du's", "über'm", "wir's"]) 5 | def test_de_tokenizer_splits_contractions(de_tokenizer, text): 6 | tokens = de_tokenizer(text) 7 | assert len(tokens) == 2 8 | 9 | 10 | @pytest.mark.parametrize("text", ["z.B.", "d.h.", "Jan.", "Dez.", "Chr."]) 11 | def test_de_tokenizer_handles_abbr(de_tokenizer, text): 12 | tokens = de_tokenizer(text) 13 | assert len(tokens) == 1 14 | 15 | 16 | def test_de_tokenizer_handles_exc_in_text(de_tokenizer): 17 | text = "Ich bin z.Zt. im Urlaub." 18 | tokens = de_tokenizer(text) 19 | assert len(tokens) == 6 20 | assert tokens[2].text == "z.Zt." 
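# --- Illustrative sketch (added for clarity; not part of the original test file) ---
# Abbreviations such as "z.Zt." stay whole because they are registered as tokenizer
# exceptions (see the lang/*/tokenizer_exceptions.py modules shown earlier). The same
# effect can be achieved at runtime through the documented Tokenizer.add_special_case
# API; the abbreviation below is only an example and may already be covered by the
# built-in German exceptions.
def _demo_add_special_case():
    import spacy
    from spacy.symbols import ORTH

    nlp = spacy.blank("de")
    nlp.tokenizer.add_special_case("u.Ä.", [{ORTH: "u.Ä."}])
    doc = nlp("Obst, Gemüse u.Ä. sind gesund.")
    return [t.text for t in doc]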
21 | -------------------------------------------------------------------------------- /spacy/tests/lang/de/test_noun_chunks.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | def test_noun_chunks_is_parsed_de(de_tokenizer): 5 | """Test that noun_chunks raises Value Error for 'de' language if Doc is not parsed.""" 6 | doc = de_tokenizer("Er lag auf seinem") 7 | with pytest.raises(ValueError): 8 | list(doc.noun_chunks) 9 | -------------------------------------------------------------------------------- /spacy/tests/lang/dsb/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/dsb/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/dsb/test_text.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.mark.parametrize( 5 | "text,match", 6 | [ 7 | ("10", True), 8 | ("1", True), 9 | ("10,000", True), 10 | ("10,00", True), 11 | ("jadno", True), 12 | ("dwanassćo", True), 13 | ("milion", True), 14 | ("sto", True), 15 | ("ceła", False), 16 | ("kopica", False), 17 | ("narěcow", False), 18 | (",", False), 19 | ("1/2", True), 20 | ], 21 | ) 22 | def test_lex_attrs_like_number(dsb_tokenizer, text, match): 23 | tokens = dsb_tokenizer(text) 24 | assert len(tokens) == 1 25 | assert tokens[0].like_num == match 26 | -------------------------------------------------------------------------------- /spacy/tests/lang/el/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/el/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/el/test_exception.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.mark.parametrize("text", ["αριθ.", "τρισ.", "δισ.", "σελ."]) 5 | def test_el_tokenizer_handles_abbr(el_tokenizer, text): 6 | tokens = el_tokenizer(text) 7 | assert len(tokens) == 1 8 | 9 | 10 | def test_el_tokenizer_handles_exc_in_text(el_tokenizer): 11 | text = "Στα 14 τρισ. δολάρια το κόστος από την άνοδο της στάθμης της θάλασσας." 12 | tokens = el_tokenizer(text) 13 | assert len(tokens) == 14 14 | assert tokens[2].text == "τρισ." 
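# --- Illustrative sketch (added for clarity; not part of the original test file) ---
# The lang/*/examples.py modules shown earlier all follow the same pattern: a
# module-level `sentences` list plus a docstring that assumes an `nlp` object already
# exists. A complete, minimal version of that usage could look like this; Greek is
# chosen only because it matches the tests in this file.
def _demo_examples_pipe():
    from spacy.lang.el import Greek
    from spacy.lang.el.examples import sentences

    nlp = Greek()  # blank pipeline: tokenizer only, no trained components
    docs = list(nlp.pipe(sentences))
    return [len(doc) for doc in docs]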
15 | -------------------------------------------------------------------------------- /spacy/tests/lang/el/test_noun_chunks.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | def test_noun_chunks_is_parsed_el(el_tokenizer): 5 | """Test that noun_chunks raises Value Error for 'el' language if Doc is not parsed.""" 6 | doc = el_tokenizer("είναι χώρα της νοτιοανατολικής") 7 | with pytest.raises(ValueError): 8 | list(doc.noun_chunks) 9 | -------------------------------------------------------------------------------- /spacy/tests/lang/en/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/en/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/es/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/es/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/es/test_exception.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.mark.parametrize( 5 | "text,lemma", 6 | [ 7 | ("aprox.", "aproximadamente"), 8 | ("esq.", "esquina"), 9 | ("pág.", "página"), 10 | ("p.ej.", "por ejemplo"), 11 | ], 12 | ) 13 | def test_es_tokenizer_handles_abbr(es_tokenizer, text, lemma): 14 | tokens = es_tokenizer(text) 15 | assert len(tokens) == 1 16 | 17 | 18 | def test_es_tokenizer_handles_exc_in_text(es_tokenizer): 19 | text = "Mariano Rajoy ha corrido aprox. medio kilómetro" 20 | tokens = es_tokenizer(text) 21 | assert len(tokens) == 7 22 | assert tokens[4].text == "aprox." 
23 | -------------------------------------------------------------------------------- /spacy/tests/lang/et/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/et/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/eu/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/eu/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/eu/test_text.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | def test_eu_tokenizer_handles_long_text(eu_tokenizer): 5 | text = """ta nere guitarra estrenatu ondoren""" 6 | tokens = eu_tokenizer(text) 7 | assert len(tokens) == 5 8 | 9 | 10 | @pytest.mark.parametrize( 11 | "text,length", 12 | [ 13 | ("milesker ederra joan zen hitzaldia plazer hutsa", 7), 14 | ("astelehen guztia sofan pasau biot", 5), 15 | ], 16 | ) 17 | def test_eu_tokenizer_handles_cnts(eu_tokenizer, text, length): 18 | tokens = eu_tokenizer(text) 19 | assert len(tokens) == length 20 | -------------------------------------------------------------------------------- /spacy/tests/lang/fa/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/fa/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/fa/test_noun_chunks.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | def test_noun_chunks_is_parsed_fa(fa_tokenizer): 5 | """Test that noun_chunks raises Value Error for 'fa' language if Doc is not parsed.""" 6 | 7 | doc = fa_tokenizer("این یک جمله نمونه می باشد.") 8 | with pytest.raises(ValueError): 9 | list(doc.noun_chunks) 10 | -------------------------------------------------------------------------------- /spacy/tests/lang/fi/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/fi/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/fi/test_text.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.mark.parametrize( 5 | "text,match", 6 | [ 7 | ("10", True), 8 | ("1", True), 9 | ("10000", True), 10 | ("10,00", True), 11 | ("-999,0", True), 12 | ("yksi", True), 13 | ("kolmetoista", True), 14 | ("viisikymmentä", True), 15 | ("tuhat", True), 16 | ("1/2", True), 17 | ("hevonen", False), 18 | (",", False), 19 | ], 20 | ) 21 | def test_fi_lex_attrs_like_number(fi_tokenizer, text, match): 22 | tokens = fi_tokenizer(text) 23 | assert len(tokens) == 1 24 | assert tokens[0].like_num == match 25 | -------------------------------------------------------------------------------- /spacy/tests/lang/fo/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/fo/__init__.py 
-------------------------------------------------------------------------------- /spacy/tests/lang/fr/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/fr/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/ga/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/ga/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/ga/test_tokenizer.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | # fmt: off 4 | GA_TOKEN_EXCEPTION_TESTS = [ 5 | ("Niall Ó Domhnaill, Rialtas na hÉireann 1977 (lch. 600).", ["Niall", "Ó", "Domhnaill", ",", "Rialtas", "na", "hÉireann", "1977", "(", "lch.", "600", ")", "."]), 6 | ("Daoine a bhfuil Gaeilge acu, m.sh. tusa agus mise", ["Daoine", "a", "bhfuil", "Gaeilge", "acu", ",", "m.sh.", "tusa", "agus", "mise"]) 7 | ] 8 | # fmt: on 9 | 10 | 11 | @pytest.mark.parametrize("text,expected_tokens", GA_TOKEN_EXCEPTION_TESTS) 12 | def test_ga_tokenizer_handles_exception_cases(ga_tokenizer, text, expected_tokens): 13 | tokens = ga_tokenizer(text) 14 | token_list = [token.text for token in tokens if not token.is_space] 15 | assert expected_tokens == token_list 16 | -------------------------------------------------------------------------------- /spacy/tests/lang/grc/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/grc/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/grc/test_text.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.mark.parametrize( 5 | "text,match", 6 | [ 7 | ("ι", True), 8 | ("α", True), 9 | ("ϟα", True), 10 | ("ἑκατόν", True), 11 | ("ἐνακόσια", True), 12 | ("δισχίλια", True), 13 | ("μύρια", True), 14 | ("εἷς", True), 15 | ("λόγος", False), 16 | (",", False), 17 | ("λβ", True), 18 | ], 19 | ) 20 | def test_lex_attrs_like_number(grc_tokenizer, text, match): 21 | tokens = grc_tokenizer(text) 22 | assert len(tokens) == 1 23 | assert tokens[0].like_num == match 24 | -------------------------------------------------------------------------------- /spacy/tests/lang/gu/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/gu/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/gu/test_text.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | def test_gu_tokenizer_handlers_long_text(gu_tokenizer): 5 | text = """પશ્ચિમ ભારતમાં આવેલું ગુજરાત રાજ્ય જે વ્યક્તિઓની માતૃભૂમિ છે""" 6 | tokens = gu_tokenizer(text) 7 | assert len(tokens) == 9 8 | 9 | 10 | @pytest.mark.parametrize( 11 | "text,length", 12 | [("ગુજરાતીઓ ખાવાના શોખીન માનવામાં આવે છે", 6), ("ખેતરની ખેડ કરવામાં આવે છે.", 5)], 13 | ) 14 | def test_gu_tokenizer_handles_cnts(gu_tokenizer, text, length): 15 | 
tokens = gu_tokenizer(text) 16 | assert len(tokens) == length 17 | -------------------------------------------------------------------------------- /spacy/tests/lang/he/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/he/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/hi/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/hi/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/hi/test_text.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from spacy.lang.hi import Hindi 4 | 5 | 6 | @pytest.mark.issue(3625) 7 | def test_issue3625(): 8 | """Test that default punctuation rules apply to Hindi Unicode characters""" 9 | nlp = Hindi() 10 | doc = nlp("hi. how हुए. होटल, होटल") 11 | expected = ["hi", ".", "how", "हुए", ".", "होटल", ",", "होटल"] 12 | assert [token.text for token in doc] == expected 13 | -------------------------------------------------------------------------------- /spacy/tests/lang/hr/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/hr/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/hsb/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/hsb/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/hsb/test_text.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.mark.parametrize( 5 | "text,match", 6 | [ 7 | ("10", True), 8 | ("1", True), 9 | ("10,000", True), 10 | ("10,00", True), 11 | ("jedne", True), 12 | ("dwanaće", True), 13 | ("milion", True), 14 | ("sto", True), 15 | ("załožene", False), 16 | ("wona", False), 17 | ("powšitkownej", False), 18 | (",", False), 19 | ("1/2", True), 20 | ], 21 | ) 22 | def test_lex_attrs_like_number(hsb_tokenizer, text, match): 23 | tokens = hsb_tokenizer(text) 24 | assert len(tokens) == 1 25 | assert tokens[0].like_num == match 26 | -------------------------------------------------------------------------------- /spacy/tests/lang/ht/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/ht/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/hu/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/hu/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/hy/__init__.py: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/hy/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/hy/test_text.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from spacy.lang.hy.lex_attrs import like_num 4 | 5 | 6 | @pytest.mark.parametrize("word", ["հիսուն"]) 7 | def test_hy_lex_attrs_capitals(word): 8 | assert like_num(word) 9 | assert like_num(word.upper()) 10 | -------------------------------------------------------------------------------- /spacy/tests/lang/id/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/id/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/id/test_noun_chunks.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | def test_noun_chunks_is_parsed_id(id_tokenizer): 5 | """Test that noun_chunks raises ValueError for 'id' language if Doc is not parsed.""" 6 | doc = id_tokenizer("sebelas") 7 | with pytest.raises(ValueError): 8 | list(doc.noun_chunks) 9 | -------------------------------------------------------------------------------- /spacy/tests/lang/id/test_text.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from spacy.lang.id.lex_attrs import like_num 4 | 5 | 6 | @pytest.mark.parametrize("word", ["sebelas"]) 7 | def test_id_lex_attrs_capitals(word): 8 | assert like_num(word) 9 | assert like_num(word.upper()) 10 | -------------------------------------------------------------------------------- /spacy/tests/lang/is/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/is/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/it/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/it/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/it/test_prefix_suffix_infix.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.mark.parametrize( 5 | "text,expected_tokens", [("c'è", ["c'", "è"]), ("l'ha", ["l'", "ha"])] 6 | ) 7 | def test_contractions(it_tokenizer, text, expected_tokens): 8 | """Test that the contractions are split into two tokens""" 9 | tokens = it_tokenizer(text) 10 | assert len(tokens) == 2 11 | assert [t.text for t in tokens] == expected_tokens 12 | -------------------------------------------------------------------------------- /spacy/tests/lang/it/test_stopwords.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.mark.parametrize( 5 | "word", ["un", "lo", "dell", "dall", "si", "ti", "mi", "quest", "quel", "quello"] 6 | ) 7 | def test_stopwords_basic(it_tokenizer, word): 8 | tok = it_tokenizer(word)[0] 9 | assert tok.is_stop 10 | 11 | 12 | @pytest.mark.parametrize( 13 | "word", ["quest'uomo", "l'ho",
"un'amica", "dell'olio", "s'arrende", "m'ascolti"] 14 | ) 15 | def test_stopwords_elided(it_tokenizer, word): 16 | tok = it_tokenizer(word)[0] 17 | assert tok.is_stop 18 | -------------------------------------------------------------------------------- /spacy/tests/lang/it/test_text.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.mark.issue(2822) 5 | def test_issue2822(it_tokenizer): 6 | """Test that the abbreviation of poco is kept as one word.""" 7 | doc = it_tokenizer("Vuoi un po' di zucchero?") 8 | assert len(doc) == 6 9 | assert doc[0].text == "Vuoi" 10 | assert doc[1].text == "un" 11 | assert doc[2].text == "po'" 12 | assert doc[3].text == "di" 13 | assert doc[4].text == "zucchero" 14 | assert doc[5].text == "?" 15 | -------------------------------------------------------------------------------- /spacy/tests/lang/ja/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/ja/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/ja/test_morphologizer_factory.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from spacy.lang.ja import Japanese 4 | 5 | 6 | def test_ja_morphologizer_factory(): 7 | pytest.importorskip("sudachipy") 8 | nlp = Japanese() 9 | morphologizer = nlp.add_pipe("morphologizer") 10 | assert morphologizer.cfg["extend"] is True 11 | -------------------------------------------------------------------------------- /spacy/tests/lang/kmr/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/kmr/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/kmr/test_text.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from spacy.lang.kmr.lex_attrs import like_num 4 | 5 | 6 | @pytest.mark.parametrize( 7 | "word", 8 | [ 9 | "yekem", 10 | "duyemîn", 11 | "100em", 12 | "dehem", 13 | "sedemîn", 14 | "34em", 15 | "30yem", 16 | "20emîn", 17 | "50yemîn", 18 | ], 19 | ) 20 | def test_kmr_lex_attrs_like_number_for_ordinal(word): 21 | assert like_num(word) 22 | 23 | 24 | @pytest.mark.parametrize("word", ["deh"]) 25 | def test_kmr_lex_attrs_capitals(word): 26 | assert like_num(word) 27 | assert like_num(word.upper()) 28 | -------------------------------------------------------------------------------- /spacy/tests/lang/ko/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/ko/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/ko/test_lemmatization.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.mark.parametrize( 5 | "word,lemma", 6 | [ 7 | ("새로운", "새롭"), 8 | ("빨간", "빨갛"), 9 | ("클수록", "크"), 10 | ("뭡니까", "뭣"), 11 | ("됐다", "되"), 12 | ], 13 | ) 14 | def test_ko_lemmatizer_assigns(ko_tokenizer, word, lemma): 15 | test_lemma = ko_tokenizer(word)[0].lemma_ 16 | assert test_lemma == lemma 17 | 
-------------------------------------------------------------------------------- /spacy/tests/lang/ky/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/ky/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/la/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/la/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/la/test_exception.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | def test_la_tokenizer_handles_exc_in_text(la_tokenizer): 5 | text = "scio te omnia facturum, ut nobiscum quam primum sis" 6 | tokens = la_tokenizer(text) 7 | assert len(tokens) == 11 8 | assert tokens[6].text == "nobis" 9 | -------------------------------------------------------------------------------- /spacy/tests/lang/lb/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/lb/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/lb/test_exceptions.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.mark.parametrize("text", ["z.B.", "Jan."]) 5 | def test_lb_tokenizer_handles_abbr(lb_tokenizer, text): 6 | tokens = lb_tokenizer(text) 7 | assert len(tokens) == 1 8 | 9 | 10 | @pytest.mark.parametrize("text", ["d'Saach", "d'Kanner", "d’Welt", "d’Suen"]) 11 | def test_lb_tokenizer_splits_contractions(lb_tokenizer, text): 12 | tokens = lb_tokenizer(text) 13 | assert len(tokens) == 2 14 | 15 | 16 | def test_lb_tokenizer_handles_exc_in_text(lb_tokenizer): 17 | text = "Mee 't ass net evident, d'Liewen." 
18 | tokens = lb_tokenizer(text) 19 | assert len(tokens) == 9 20 | assert tokens[1].text == "'t" 21 | -------------------------------------------------------------------------------- /spacy/tests/lang/lb/test_prefix_suffix_infix.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.mark.parametrize("text,length", [("z.B.", 1), ("zb.", 2), ("(z.B.", 2)]) 5 | def test_lb_tokenizer_splits_prefix_interact(lb_tokenizer, text, length): 6 | tokens = lb_tokenizer(text) 7 | assert len(tokens) == length 8 | 9 | 10 | @pytest.mark.parametrize("text", ["z.B.)"]) 11 | def test_lb_tokenizer_splits_suffix_interact(lb_tokenizer, text): 12 | tokens = lb_tokenizer(text) 13 | assert len(tokens) == 2 14 | 15 | 16 | @pytest.mark.parametrize("text", ["(z.B.)"]) 17 | def test_lb_tokenizer_splits_even_wrap_interact(lb_tokenizer, text): 18 | tokens = lb_tokenizer(text) 19 | assert len(tokens) == 3 20 | -------------------------------------------------------------------------------- /spacy/tests/lang/lg/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/lg/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/lg/test_tokenizer.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | LG_BASIC_TOKENIZATION_TESTS = [ 4 | ( 5 | "Abooluganda ab’emmamba ababiri", 6 | ["Abooluganda", "ab’emmamba", "ababiri"], 7 | ), 8 | ] 9 | 10 | 11 | @pytest.mark.parametrize("text,expected_tokens", LG_BASIC_TOKENIZATION_TESTS) 12 | def test_lg_tokenizer_basic(lg_tokenizer, text, expected_tokens): 13 | tokens = lg_tokenizer(text) 14 | token_list = [token.text for token in tokens if not token.is_space] 15 | assert expected_tokens == token_list 16 | -------------------------------------------------------------------------------- /spacy/tests/lang/lt/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/lt/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/lv/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/lv/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/mk/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/mk/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/ml/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/ml/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/ml/test_text.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | def test_ml_tokenizer_handles_long_text(ml_tokenizer): 5 | text = """അനാവശ്യമായി കണ്ണിലും 
മൂക്കിലും വായിലും സ്പർശിക്കാതിരിക്കുക""" 6 | tokens = ml_tokenizer(text) 7 | assert len(tokens) == 5 8 | 9 | 10 | @pytest.mark.parametrize( 11 | "text,length", 12 | [ 13 | ( 14 | "എന്നാൽ അച്ചടിയുടെ ആവിർഭാവം ലിപിയിൽ കാര്യമായ മാറ്റങ്ങൾ വരുത്തിയത് കൂട്ടക്ഷരങ്ങളെ അണുഅക്ഷരങ്ങളായി പിരിച്ചുകൊണ്ടായിരുന്നു", 15 | 10, 16 | ), 17 | ("പരമ്പരാഗതമായി മലയാളം ഇടത്തുനിന്ന് വലത്തോട്ടാണ് എഴുതുന്നത്", 5), 18 | ], 19 | ) 20 | def test_ml_tokenizer_handles_cnts(ml_tokenizer, text, length): 21 | tokens = ml_tokenizer(text) 22 | assert len(tokens) == length 23 | -------------------------------------------------------------------------------- /spacy/tests/lang/ms/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/ms/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/ms/test_noun_chunks.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | def test_noun_chunks_is_parsed_ms(ms_tokenizer): 5 | """Test that noun_chunks raises Value Error for 'ms' language if Doc is not parsed.""" 6 | doc = ms_tokenizer("sebelas") 7 | with pytest.raises(ValueError): 8 | list(doc.noun_chunks) 9 | -------------------------------------------------------------------------------- /spacy/tests/lang/ms/test_text.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from spacy.lang.ms.lex_attrs import like_num 4 | 5 | 6 | @pytest.mark.parametrize("word", ["sebelas"]) 7 | def test_ms_lex_attrs_capitals(word): 8 | assert like_num(word) 9 | assert like_num(word.upper()) 10 | -------------------------------------------------------------------------------- /spacy/tests/lang/nb/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/nb/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/nb/test_noun_chunks.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | def test_noun_chunks_is_parsed_nb(nb_tokenizer): 5 | """Test that noun_chunks raises Value Error for 'nb' language if Doc is not parsed.""" 6 | doc = nb_tokenizer("Smørsausen brukes bl.a. til") 7 | with pytest.raises(ValueError): 8 | list(doc.noun_chunks) 9 | -------------------------------------------------------------------------------- /spacy/tests/lang/nb/test_tokenizer.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | NB_TOKEN_EXCEPTION_TESTS = [ 4 | ( 5 | "Smørsausen brukes bl.a. til fisk", 6 | ["Smørsausen", "brukes", "bl.a.", "til", "fisk"], 7 | ), 8 | ( 9 | "Jeg kommer først kl. 13 pga. 
diverse forsinkelser", 10 | ["Jeg", "kommer", "først", "kl.", "13", "pga.", "diverse", "forsinkelser"], 11 | ), 12 | ] 13 | 14 | 15 | @pytest.mark.parametrize("text,expected_tokens", NB_TOKEN_EXCEPTION_TESTS) 16 | def test_nb_tokenizer_handles_exception_cases(nb_tokenizer, text, expected_tokens): 17 | tokens = nb_tokenizer(text) 18 | token_list = [token.text for token in tokens if not token.is_space] 19 | assert expected_tokens == token_list 20 | -------------------------------------------------------------------------------- /spacy/tests/lang/ne/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/ne/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/ne/test_text.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | def test_ne_tokenizer_handlers_long_text(ne_tokenizer): 5 | text = """मैले पाएको सर्टिफिकेटलाई म त बोक्रो सम्झन्छु र अभ्यास तब सुरु भयो, जब मैले कलेज पार गरेँ र जीवनको पढाइ सुरु गरेँ ।""" 6 | tokens = ne_tokenizer(text) 7 | assert len(tokens) == 24 8 | 9 | 10 | @pytest.mark.parametrize( 11 | "text,length", [("समय जान कति पनि बेर लाग्दैन ।", 7), ("म ठूलो हुँदै थिएँ ।", 5)] 12 | ) 13 | def test_ne_tokenizer_handles_cnts(ne_tokenizer, text, length): 14 | tokens = ne_tokenizer(text) 15 | assert len(tokens) == length 16 | -------------------------------------------------------------------------------- /spacy/tests/lang/nl/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/nl/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/nn/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/nn/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/pl/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/pl/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/pl/test_text.py: -------------------------------------------------------------------------------- 1 | """Words like numbers are recognized correctly.""" 2 | 3 | import pytest 4 | 5 | 6 | @pytest.mark.parametrize( 7 | "text,match", 8 | [ 9 | ("10", True), 10 | ("1", True), 11 | ("10,000", True), 12 | ("10,00", True), 13 | ("jeden", True), 14 | ("dwa", True), 15 | ("milion", True), 16 | ("pies", False), 17 | (",", False), 18 | ("1/2", True), 19 | ], 20 | ) 21 | def test_lex_attrs_like_number(pl_tokenizer, text, match): 22 | tokens = pl_tokenizer(text) 23 | assert len(tokens) == 1 24 | assert tokens[0].like_num == match 25 | -------------------------------------------------------------------------------- /spacy/tests/lang/pl/test_tokenizer.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | DOT_TESTS = [ 4 | ("tel.", ["tel", "."]), 5 | ("0 zł 99 gr", ["0", "zł", "99", "gr"]), 6 | ] 7 | 8 | HYPHEN_TESTS = [ 9 
| ("cztero-", ["cztero-"]), 10 | ("jedno-", ["jedno-"]), 11 | ("dwu-", ["dwu-"]), 12 | ("trzy-", ["trzy-"]), 13 | ] 14 | 15 | 16 | TESTCASES = DOT_TESTS + HYPHEN_TESTS 17 | 18 | 19 | @pytest.mark.parametrize("text,expected_tokens", TESTCASES) 20 | def test_tokenizer_handles_testcases(pl_tokenizer, text, expected_tokens): 21 | tokens = pl_tokenizer(text) 22 | token_list = [token.text for token in tokens if not token.is_space] 23 | assert expected_tokens == token_list 24 | -------------------------------------------------------------------------------- /spacy/tests/lang/pt/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/pt/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/pt/test_text.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from spacy.lang.pt.lex_attrs import like_num 4 | 5 | 6 | @pytest.mark.parametrize("word", ["onze", "quadragésimo"]) 7 | def test_pt_lex_attrs_capitals(word): 8 | assert like_num(word) 9 | assert like_num(word.upper()) 10 | -------------------------------------------------------------------------------- /spacy/tests/lang/ro/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/ro/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/ru/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/ru/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/ru/test_exceptions.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.mark.parametrize( 5 | "text,norms", 6 | [("пн.", ["понедельник"]), ("пт.", ["пятница"]), ("дек.", ["декабрь"])], 7 | ) 8 | def test_ru_tokenizer_abbrev_exceptions(ru_tokenizer, text, norms): 9 | tokens = ru_tokenizer(text) 10 | assert len(tokens) == 1 11 | assert [token.norm_ for token in tokens] == norms 12 | -------------------------------------------------------------------------------- /spacy/tests/lang/ru/test_text.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from spacy.lang.ru.lex_attrs import like_num 4 | 5 | 6 | @pytest.mark.parametrize("word", ["одиннадцать"]) 7 | def test_ru_lex_attrs_capitals(word): 8 | assert like_num(word) 9 | assert like_num(word.upper()) 10 | -------------------------------------------------------------------------------- /spacy/tests/lang/sa/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/sa/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/sk/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/sk/__init__.py 
-------------------------------------------------------------------------------- /spacy/tests/lang/sk/test_tokenizer.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | SK_BASIC_TOKENIZATION_TESTS = [ 4 | ( 5 | "Kedy sa narodil Andrej Kiska?", 6 | ["Kedy", "sa", "narodil", "Andrej", "Kiska", "?"], 7 | ), 8 | ] 9 | 10 | 11 | @pytest.mark.parametrize("text,expected_tokens", SK_BASIC_TOKENIZATION_TESTS) 12 | def test_sk_tokenizer_basic(sk_tokenizer, text, expected_tokens): 13 | tokens = sk_tokenizer(text) 14 | token_list = [token.text for token in tokens if not token.is_space] 15 | assert expected_tokens == token_list 16 | -------------------------------------------------------------------------------- /spacy/tests/lang/sl/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/sl/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/sq/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/sq/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/sr/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/sr/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/sr/test_exceptions.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.mark.parametrize( 5 | "text,norms,lemmas", 6 | [ 7 | ("о.г.", ["ове године"], ["ова година"]), 8 | ("чет.", ["четвртак"], ["четвртак"]), 9 | ("гђа", ["госпођа"], ["госпођа"]), 10 | ("ил'", ["или"], ["или"]), 11 | ], 12 | ) 13 | def test_sr_tokenizer_abbrev_exceptions(sr_tokenizer, text, norms, lemmas): 14 | tokens = sr_tokenizer(text) 15 | assert len(tokens) == 1 16 | assert [token.norm_ for token in tokens] == norms 17 | -------------------------------------------------------------------------------- /spacy/tests/lang/sv/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/sv/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/sv/test_text.py: -------------------------------------------------------------------------------- 1 | def test_sv_tokenizer_handles_long_text(sv_tokenizer): 2 | text = """Det var så härligt ute på landet. Det var sommar, majsen var gul, havren grön, 3 | höet var uppställt i stackar nere vid den gröna ängen, och där gick storken på sina långa, 4 | röda ben och snackade engelska, för det språket hade han lärt sig av sin mor. 5 | 6 | Runt om åkrar och äng låg den stora skogen, och mitt i skogen fanns djupa sjöar; jo, det var verkligen trevligt ute på landet!""" 7 | tokens = sv_tokenizer(text) 8 | assert len(tokens) == 86 9 | 10 | 11 | def test_sv_tokenizer_handles_trailing_dot_for_i_in_sentence(sv_tokenizer): 12 | text = "Provar att tokenisera en mening med ord i." 
13 | tokens = sv_tokenizer(text) 14 | assert len(tokens) == 9 15 | -------------------------------------------------------------------------------- /spacy/tests/lang/ta/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/ta/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/th/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/th/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/th/test_tokenizer.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.mark.parametrize( 5 | "text,expected_tokens", [("คุณรักผมไหม", ["คุณ", "รัก", "ผม", "ไหม"])] 6 | ) 7 | def test_th_tokenizer(th_tokenizer, text, expected_tokens): 8 | tokens = [token.text for token in th_tokenizer(text)] 9 | assert tokens == expected_tokens 10 | -------------------------------------------------------------------------------- /spacy/tests/lang/ti/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/ti/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/ti/test_exception.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/ti/test_exception.py -------------------------------------------------------------------------------- /spacy/tests/lang/tl/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/tl/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/tl/test_indices.py: -------------------------------------------------------------------------------- 1 | def test_tl_simple_punct(tl_tokenizer): 2 | text = "Sige, punta ka dito" 3 | tokens = tl_tokenizer(text) 4 | assert tokens[0].idx == 0 5 | assert tokens[1].idx == 4 6 | assert tokens[2].idx == 6 7 | assert tokens[3].idx == 12 8 | assert tokens[4].idx == 15 9 | -------------------------------------------------------------------------------- /spacy/tests/lang/tr/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/tr/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/tr/test_noun_chunks.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | def test_noun_chunks_is_parsed(tr_tokenizer): 5 | """Test that noun_chunks raises Value Error for 'tr' language if Doc is not parsed. 6 | To check this test, we're constructing a Doc 7 | with a new Vocab here and forcing is_parsed to 'False' 8 | to make sure the noun chunks don't run. 
9 | """ 10 | doc = tr_tokenizer("Dün seni gördüm.") 11 | with pytest.raises(ValueError): 12 | list(doc.noun_chunks) 13 | -------------------------------------------------------------------------------- /spacy/tests/lang/tt/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/tt/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/uk/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/uk/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/uk/test_tokenizer_exc.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.mark.parametrize( 5 | "text,norms,lemmas", 6 | [("ім.", ["імені"], ["ім'я"]), ("проф.", ["професор"], ["професор"])], 7 | ) 8 | def test_uk_tokenizer_abbrev_exceptions(uk_tokenizer, text, norms, lemmas): 9 | tokens = uk_tokenizer(text) 10 | assert len(tokens) == 1 11 | assert [token.norm_ for token in tokens] == norms 12 | -------------------------------------------------------------------------------- /spacy/tests/lang/ur/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/ur/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/ur/test_prefix_suffix_infix.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.mark.parametrize("text", ["ہےں۔", "کیا۔"]) 5 | def test_contractions(ur_tokenizer, text): 6 | """Test specific Urdu punctuation character""" 7 | tokens = ur_tokenizer(text) 8 | assert len(tokens) == 2 9 | -------------------------------------------------------------------------------- /spacy/tests/lang/ur/test_text.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | def test_ur_tokenizer_handles_long_text(ur_tokenizer): 5 | text = """اصل میں، رسوا ہونے کی ہمیں کچھ عادت سی ہو گئی ہے۔""" 6 | tokens = ur_tokenizer(text) 7 | assert len(tokens) == 14 8 | 9 | 10 | @pytest.mark.parametrize("text,length", [("تحریر باسط حبیب", 3), ("میرا پاکستان", 2)]) 11 | def test_ur_tokenizer_handles_cnts(ur_tokenizer, text, length): 12 | tokens = ur_tokenizer(text) 13 | assert len(tokens) == length 14 | -------------------------------------------------------------------------------- /spacy/tests/lang/vi/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/vi/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/xx/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/xx/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/xx/test_tokenizer.py: 
-------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | XX_BASIC_TOKENIZATION_TESTS = [ 4 | ( 5 | "Lääʹddjânnmest lie nuʹtt 10 000 säʹmmliʹžžed. Seeʹst pâʹjjel", 6 | [ 7 | "Lääʹddjânnmest", 8 | "lie", 9 | "nuʹtt", 10 | "10", 11 | "000", 12 | "säʹmmliʹžžed", 13 | ".", 14 | "Seeʹst", 15 | "pâʹjjel", 16 | ], 17 | ), 18 | ] 19 | 20 | 21 | @pytest.mark.parametrize("text,expected_tokens", XX_BASIC_TOKENIZATION_TESTS) 22 | def test_xx_tokenizer_basic(xx_tokenizer, text, expected_tokens): 23 | tokens = xx_tokenizer(text) 24 | token_list = [token.text for token in tokens if not token.is_space] 25 | assert expected_tokens == token_list 26 | -------------------------------------------------------------------------------- /spacy/tests/lang/yo/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/yo/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/zh/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/lang/zh/__init__.py -------------------------------------------------------------------------------- /spacy/tests/lang/zh/test_text.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.mark.parametrize( 5 | "text,match", 6 | [ 7 | ("10", True), 8 | ("1", True), 9 | ("999.0", True), 10 | ("一", True), 11 | ("二", True), 12 | ("〇", True), 13 | ("十一", True), 14 | ("狗", False), 15 | (",", False), 16 | ], 17 | ) 18 | def test_lex_attrs_like_number(zh_tokenizer_jieba, text, match): 19 | tokens = zh_tokenizer_jieba(text) 20 | assert len(tokens) == 1 21 | assert tokens[0].like_num == match 22 | -------------------------------------------------------------------------------- /spacy/tests/matcher/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/matcher/__init__.py -------------------------------------------------------------------------------- /spacy/tests/morphology/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/morphology/__init__.py -------------------------------------------------------------------------------- /spacy/tests/package/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/package/__init__.py -------------------------------------------------------------------------------- /spacy/tests/parser/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/parser/__init__.py -------------------------------------------------------------------------------- /spacy/tests/pipeline/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/pipeline/__init__.py -------------------------------------------------------------------------------- /spacy/tests/serialize/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/serialize/__init__.py -------------------------------------------------------------------------------- /spacy/tests/test_architectures.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from catalogue import RegistryError 3 | from thinc.api import Linear 4 | 5 | from spacy import registry 6 | 7 | 8 | def test_get_architecture(): 9 | @registry.architectures("my_test_function") 10 | def create_model(nr_in, nr_out): 11 | return Linear(nr_in, nr_out) 12 | 13 | arch = registry.architectures.get("my_test_function") 14 | assert arch is create_model 15 | with pytest.raises(RegistryError): 16 | registry.architectures.get("not_an_existing_key") 17 | -------------------------------------------------------------------------------- /spacy/tests/test_errors.py: -------------------------------------------------------------------------------- 1 | from inspect import isclass 2 | 3 | import pytest 4 | 5 | from spacy.errors import ErrorsWithCodes 6 | 7 | 8 | class Errors(metaclass=ErrorsWithCodes): 9 | E001 = "error description" 10 | 11 | 12 | def test_add_codes(): 13 | assert Errors.E001 == "[E001] error description" 14 | with pytest.raises(AttributeError): 15 | Errors.E002 16 | assert isclass(Errors.__class__) 17 | -------------------------------------------------------------------------------- /spacy/tests/tokenizer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/tokenizer/__init__.py -------------------------------------------------------------------------------- /spacy/tests/training/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/training/__init__.py -------------------------------------------------------------------------------- /spacy/tests/training/test_logger.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import spacy 4 | from spacy.training import loggers 5 | 6 | 7 | @pytest.fixture() 8 | def nlp(): 9 | nlp = spacy.blank("en") 10 | nlp.add_pipe("ner") 11 | return nlp 12 | 13 | 14 | @pytest.fixture() 15 | def info(): 16 | return { 17 | "losses": {"ner": 100}, 18 | "other_scores": {"ENTS_F": 0.85, "ENTS_P": 0.90, "ENTS_R": 0.80}, 19 | "epoch": 100, 20 | "step": 125, 21 | "score": 85, 22 | } 23 | 24 | 25 | def test_console_logger(nlp, info): 26 | console_logger = loggers.console_logger( 27 | progress_bar=True, console_output=True, output_file=None 28 | ) 29 | log_step, finalize = console_logger(nlp) 30 | log_step(info) 31 | -------------------------------------------------------------------------------- /spacy/tests/vocab_vectors/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tests/vocab_vectors/__init__.py 
-------------------------------------------------------------------------------- /spacy/tokens/__init__.pxd: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/tokens/__init__.pxd -------------------------------------------------------------------------------- /spacy/tokens/__init__.py: -------------------------------------------------------------------------------- 1 | from ._serialize import DocBin 2 | from .doc import Doc 3 | from .morphanalysis import MorphAnalysis 4 | from .span import Span 5 | from .span_group import SpanGroup 6 | from .token import Token 7 | 8 | __all__ = ["Doc", "DocBin", "MorphAnalysis", "Span", "SpanGroup", "Token"] 9 | -------------------------------------------------------------------------------- /spacy/tokens/graph.pxd: -------------------------------------------------------------------------------- 1 | from cymem.cymem cimport Pool 2 | from libcpp.vector cimport vector 3 | from preshed.maps cimport PreshMap 4 | 5 | from ..structs cimport EdgeC, GraphC 6 | 7 | 8 | cdef class Graph: 9 | cdef GraphC c 10 | cdef Pool mem 11 | cdef PreshMap node_map 12 | cdef PreshMap edge_map 13 | cdef object doc_ref 14 | cdef public str name 15 | -------------------------------------------------------------------------------- /spacy/tokens/morphanalysis.pxd: -------------------------------------------------------------------------------- 1 | from ..structs cimport MorphAnalysisC 2 | from ..typedefs cimport hash_t 3 | from ..vocab cimport Vocab 4 | 5 | 6 | cdef class MorphAnalysis: 7 | cdef readonly Vocab vocab 8 | cdef readonly hash_t key 9 | cdef MorphAnalysisC c 10 | -------------------------------------------------------------------------------- /spacy/tokens/span.pxd: -------------------------------------------------------------------------------- 1 | cimport numpy as np 2 | 3 | from ..structs cimport SpanC 4 | from ..typedefs cimport attr_t 5 | from .doc cimport Doc 6 | 7 | 8 | cdef class Span: 9 | cdef readonly Doc doc 10 | cdef SpanC c 11 | cdef public _vector 12 | cdef public _vector_norm 13 | 14 | @staticmethod 15 | cdef inline Span cinit(Doc doc, SpanC span): 16 | cdef Span self = Span.__new__( 17 | Span, 18 | doc, 19 | start=span.start, 20 | end=span.end 21 | ) 22 | self.c = span 23 | return self 24 | 25 | cpdef np.ndarray to_array(self, object features) 26 | -------------------------------------------------------------------------------- /spacy/tokens/span_group.pxd: -------------------------------------------------------------------------------- 1 | from libcpp.vector cimport vector 2 | 3 | from ..structs cimport SpanC 4 | 5 | 6 | cdef class SpanGroup: 7 | cdef public object _doc_ref 8 | cdef public str name 9 | cdef public dict attrs 10 | cdef vector[SpanC] c 11 | 12 | cdef void push_back(self, SpanC span) nogil 13 | -------------------------------------------------------------------------------- /spacy/training/__init__.pxd: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/spacy/training/__init__.pxd -------------------------------------------------------------------------------- /spacy/training/alignment.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import List 3 | 4 | from .align import get_alignments 5 | from 
.alignment_array import AlignmentArray 6 | 7 | 8 | @dataclass 9 | class Alignment: 10 | x2y: AlignmentArray 11 | y2x: AlignmentArray 12 | 13 | @classmethod 14 | def from_indices(cls, x2y: List[List[int]], y2x: List[List[int]]) -> "Alignment": 15 | x2y = AlignmentArray(x2y) 16 | y2x = AlignmentArray(y2x) 17 | return Alignment(x2y=x2y, y2x=y2x) 18 | 19 | @classmethod 20 | def from_strings(cls, A: List[str], B: List[str]) -> "Alignment": 21 | x2y, y2x = get_alignments(A, B) 22 | return Alignment.from_indices(x2y=x2y, y2x=y2x) 23 | -------------------------------------------------------------------------------- /spacy/training/alignment_array.pxd: -------------------------------------------------------------------------------- 1 | cimport numpy as np 2 | from libcpp.vector cimport vector 3 | 4 | 5 | cdef class AlignmentArray: 6 | cdef np.ndarray _data 7 | cdef np.ndarray _lengths 8 | cdef np.ndarray _starts_ends 9 | -------------------------------------------------------------------------------- /spacy/training/converters/__init__.py: -------------------------------------------------------------------------------- 1 | from .conll_ner_to_docs import conll_ner_to_docs # noqa: F401 2 | from .conllu_to_docs import conllu_to_docs # noqa: F401 3 | from .iob_to_docs import iob_to_docs # noqa: F401 4 | from .json_to_docs import json_to_docs # noqa: F401 5 | -------------------------------------------------------------------------------- /spacy/training/example.pxd: -------------------------------------------------------------------------------- 1 | from libc.stdint cimport uint64_t 2 | 3 | from ..tokens.doc cimport Doc 4 | 5 | 6 | cdef class Example: 7 | cdef readonly Doc x 8 | cdef readonly Doc y 9 | cdef readonly object _cached_alignment 10 | cdef readonly object _cached_words_x 11 | cdef readonly object _cached_words_y 12 | cdef readonly uint64_t _x_sig 13 | cdef readonly uint64_t _y_sig 14 | -------------------------------------------------------------------------------- /spacy/typedefs.pxd: -------------------------------------------------------------------------------- 1 | from libc.stdint cimport int32_t, uint8_t, uint16_t, uint32_t, uint64_t, uintptr_t 2 | 3 | ctypedef float weight_t 4 | ctypedef uint64_t hash_t 5 | ctypedef uint64_t class_t 6 | ctypedef uint64_t attr_t 7 | ctypedef uint64_t flags_t 8 | ctypedef uint16_t len_t 9 | ctypedef uint16_t tag_t 10 | -------------------------------------------------------------------------------- /spacy/typedefs.pyx: -------------------------------------------------------------------------------- 1 | # cython: profile=False 2 | -------------------------------------------------------------------------------- /website/.dockerignore: -------------------------------------------------------------------------------- 1 | .cache/ 2 | .next/ 3 | public/ 4 | node_modules 5 | .npm 6 | logs 7 | *.log 8 | npm-debug.log* 9 | quickstart-training-generator.js 10 | -------------------------------------------------------------------------------- /website/.eslintrc.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": "next/core-web-vitals" 3 | } 4 | -------------------------------------------------------------------------------- /website/.gitignore: -------------------------------------------------------------------------------- 1 | # See https://help.github.com/articles/ignoring-files/ for more about ignoring files. 
2 | 3 | quickstart-training-generator.js 4 | 5 | # dependencies 6 | /node_modules 7 | /.pnp 8 | .pnp.js 9 | 10 | # testing 11 | /coverage 12 | 13 | # next.js 14 | /.next/ 15 | /out/ 16 | 17 | # production 18 | /build 19 | 20 | # misc 21 | .DS_Store 22 | *.pem 23 | 24 | # debug 25 | npm-debug.log* 26 | yarn-debug.log* 27 | yarn-error.log* 28 | .pnpm-debug.log* 29 | 30 | # local env files 31 | .env*.local 32 | 33 | # vercel 34 | .vercel 35 | 36 | # typescript 37 | *.tsbuildinfo 38 | next-env.d.ts 39 | 40 | !.vscode/extensions.json 41 | !public 42 | 43 | public/robots.txt 44 | public/sitemap* 45 | public/sw.js* 46 | public/workbox* 47 | -------------------------------------------------------------------------------- /website/.nvmrc: -------------------------------------------------------------------------------- 1 | 18 2 | -------------------------------------------------------------------------------- /website/.prettierignore: -------------------------------------------------------------------------------- 1 | .next -------------------------------------------------------------------------------- /website/.vscode/extensions.json: -------------------------------------------------------------------------------- 1 | { 2 | "recommendations": [ 3 | "dbaeumer.vscode-eslint", 4 | "unifiedjs.vscode-mdx", 5 | "esbenp.prettier-vscode", 6 | "syler.sass-indented" 7 | ] 8 | } 9 | -------------------------------------------------------------------------------- /website/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM node:18 2 | 3 | USER node 4 | 5 | # This is so the installed node_modules will be up one directory 6 | # from where a user mounts files, so that they don't accidentally mount 7 | # their own node_modules from a different build 8 | # https://nodejs.org/api/modules.html#modules_loading_from_node_modules_folders 9 | WORKDIR /home/node 10 | COPY --chown=node package.json . 11 | COPY --chown=node package-lock.json . 12 | RUN npm install 13 | 14 | WORKDIR /home/node/website/ 15 | -------------------------------------------------------------------------------- /website/docs/api/index.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | title: Library Architecture 3 | next: /api/architectures 4 | --- 5 | 6 | 7 | -------------------------------------------------------------------------------- /website/meta/dynamicMeta.mjs: -------------------------------------------------------------------------------- 1 | import site from './site.json' assert { type: 'json' } 2 | 3 | export const domain = process.env.BRANCH || site.domain 4 | export const siteUrl = `https://${domain}` 5 | export const nightly = site.nightlyBranches.includes(domain) 6 | export const legacy = site.legacy || !!+process.env.SPACY_LEGACY 7 | export const binderBranch = domain 8 | export const branch = nightly ? 'develop' : 'master' 9 | export const replacements = { 10 | GITHUB_SPACY: `https://github.com/explosion/spaCy/tree/${branch}`, 11 | GITHUB_PROJECTS: `https://github.com/${site.projectsRepo}`, 12 | SPACY_PKG_NAME: nightly ? 'spacy-nightly' : 'spacy', 13 | SPACY_PKG_FLAGS: nightly ? 
' --pre' : '', 14 | } 15 | -------------------------------------------------------------------------------- /website/meta/languageSorted.tsx: -------------------------------------------------------------------------------- 1 | import models from './languages.json' 2 | 3 | export const languagesSorted = models.languages 4 | .filter(({ models }) => models && models.length) 5 | .sort((a, b) => a.name.localeCompare(b.name)) 6 | -------------------------------------------------------------------------------- /website/meta/recordLanguages.tsx: -------------------------------------------------------------------------------- 1 | import models from './languages.json' 2 | 3 | const recordLanguages = Object.fromEntries( 4 | models.languages.map((language, index) => [language.code, language]) 5 | ) 6 | 7 | export default recordLanguages 8 | -------------------------------------------------------------------------------- /website/meta/recordSections.tsx: -------------------------------------------------------------------------------- 1 | import siteMetadata from './site.json' 2 | 3 | const recordSections = Object.fromEntries(siteMetadata.sections.map((s) => [s.id, s])) 4 | 5 | export default recordSections 6 | -------------------------------------------------------------------------------- /website/meta/recordUniverse.tsx: -------------------------------------------------------------------------------- 1 | import universe from './universe.json' 2 | 3 | export const recordUniverseCategories = Object.fromEntries( 4 | universe.categories.flatMap((category) => category.items.map((item) => [item.id, item])) 5 | ) 6 | 7 | export const recordUniverseResources = Object.fromEntries( 8 | universe.resources.map((resource) => [resource.id, resource]) 9 | ) 10 | -------------------------------------------------------------------------------- /website/meta/sidebarFlat.tsx: -------------------------------------------------------------------------------- 1 | import sidebars from './sidebars.json' 2 | 3 | export const sidebarUsageFlat = sidebars 4 | .find((sidebar) => sidebar.section === 'usage') 5 | .items.flatMap((item) => item.items) 6 | -------------------------------------------------------------------------------- /website/next-sitemap.config.mjs: -------------------------------------------------------------------------------- 1 | import { siteUrl } from './meta/dynamicMeta.mjs' 2 | 3 | /** @type {import('next-sitemap').IConfig} */ 4 | const config = { 5 | siteUrl, 6 | generateRobotsTxt: true, 7 | autoLastmod: false, 8 | } 9 | 10 | export default config 11 | -------------------------------------------------------------------------------- /website/pages/_document.tsx: -------------------------------------------------------------------------------- 1 | import { Html, Head, Main, NextScript } from 'next/document' 2 | 3 | export default function Document() { 4 | return ( 5 | <Html lang="en"> 6 | <Head /> 7 | <body> 8 | <Main />
9 | <NextScript /> 10 | </body> 11 | </Html> 12 | ) 13 | } 14 | -------------------------------------------------------------------------------- /website/pages/universe/index.tsx: -------------------------------------------------------------------------------- 1 | import recordSections from '../../meta/recordSections' 2 | import Layout from '../../src/templates' 3 | 4 | const Universe = () => { 5 | return ( 6 | 14 | ) 15 | } 16 | 17 | export default Universe 18 | -------------------------------------------------------------------------------- /website/plugins/index.mjs: -------------------------------------------------------------------------------- 1 | import remarkGfm from 'remark-gfm' 2 | import remarkUnwrapImages from 'remark-unwrap-images' 3 | import remarkSmartypants from 'remark-smartypants' 4 | 5 | import remarkCustomAttrs from './remarkCustomAttrs.mjs' 6 | import remarkWrapSections from './remarkWrapSections.mjs' 7 | import remarkCodeBlocks from './remarkCodeBlocks.mjs' 8 | import remarkFindAndReplace from './remarkFindAndReplace.mjs' 9 | 10 | const remarkPlugins = [ 11 | remarkGfm, 12 | remarkSmartypants, 13 | remarkFindAndReplace, 14 | remarkUnwrapImages, 15 | remarkCustomAttrs, 16 | remarkCodeBlocks, 17 | remarkWrapSections, 18 | ] 19 | 20 | export default remarkPlugins 21 | -------------------------------------------------------------------------------- /website/public/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/public/favicon.ico -------------------------------------------------------------------------------- /website/public/icons/icon-192x192.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/public/icons/icon-192x192.png -------------------------------------------------------------------------------- /website/public/icons/icon-256x256.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/public/icons/icon-256x256.png -------------------------------------------------------------------------------- /website/public/icons/icon-384x384.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/public/icons/icon-384x384.png -------------------------------------------------------------------------------- /website/public/icons/icon-512x512.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/public/icons/icon-512x512.png -------------------------------------------------------------------------------- /website/public/images/cli_init_fill-config_diff.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/public/images/cli_init_fill-config_diff.jpg -------------------------------------------------------------------------------- /website/public/images/course.jpg: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/public/images/course.jpg -------------------------------------------------------------------------------- /website/public/images/displacy_jupyter.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/public/images/displacy_jupyter.jpg -------------------------------------------------------------------------------- /website/public/images/huggingface_hub.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/public/images/huggingface_hub.jpg -------------------------------------------------------------------------------- /website/public/images/matcher-demo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/public/images/matcher-demo.jpg -------------------------------------------------------------------------------- /website/public/images/prodigy.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/public/images/prodigy.jpg -------------------------------------------------------------------------------- /website/public/images/prodigy_overview.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/public/images/prodigy_overview.jpg -------------------------------------------------------------------------------- /website/public/images/prodigy_spans-manual.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/public/images/prodigy_spans-manual.jpg -------------------------------------------------------------------------------- /website/public/images/prodigy_train_curve.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/public/images/prodigy_train_curve.jpg -------------------------------------------------------------------------------- /website/public/images/project_document.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/public/images/project_document.jpg -------------------------------------------------------------------------------- /website/public/images/projects.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/public/images/projects.png -------------------------------------------------------------------------------- /website/public/images/sense2vec.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/public/images/sense2vec.jpg 
-------------------------------------------------------------------------------- /website/public/images/spacy-extension-demo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/public/images/spacy-extension-demo.gif -------------------------------------------------------------------------------- /website/public/images/spacy-streamlit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/public/images/spacy-streamlit.png -------------------------------------------------------------------------------- /website/public/images/spacy-tailored-pipelines_wide.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/public/images/spacy-tailored-pipelines_wide.png -------------------------------------------------------------------------------- /website/public/images/thinc_mypy.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/public/images/thinc_mypy.jpg -------------------------------------------------------------------------------- /website/public/images/wandb1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/public/images/wandb1.jpg -------------------------------------------------------------------------------- /website/public/images/wandb2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/public/images/wandb2.jpg -------------------------------------------------------------------------------- /website/runtime.txt: -------------------------------------------------------------------------------- 1 | 3.8 2 | -------------------------------------------------------------------------------- /website/setup/requirements.txt: -------------------------------------------------------------------------------- 1 | # These are used to compile the training quickstart config 2 | jinja2>=3.1.0 3 | srsly 4 | -------------------------------------------------------------------------------- /website/setup/setup.sh: -------------------------------------------------------------------------------- 1 | python setup/jinja_to_js.py ../spacy/cli/templates/quickstart_training.jinja src/widgets/quickstart-training-generator.js ../spacy/cli/templates/quickstart_training_recommendations.yml 2 | -------------------------------------------------------------------------------- /website/src/components/aside.js: -------------------------------------------------------------------------------- 1 | import React from 'react' 2 | import PropTypes from 'prop-types' 3 | 4 | import classes from '../styles/aside.module.sass' 5 | 6 | export default function Aside({ title, children }) { 7 | return ( 8 | 16 | ) 17 | } 18 | 19 | Aside.propTypes = { 20 | title: PropTypes.string, 21 | children: PropTypes.node.isRequired, 22 | } 23 | -------------------------------------------------------------------------------- /website/src/components/codeBlock.js: 
-------------------------------------------------------------------------------- 1 | import React from 'react' 2 | import Code from './codeDynamic' 3 | import classes from '../styles/code.module.sass' 4 | 5 | export const Pre = (props) => { 6 | return
<pre className={classes.pre}>{props.children}</pre>
7 | } 8 | 9 | const CodeBlock = (props) => ( 10 | <Pre>
11 |     <Code {...props} />
12 | </Pre>
13 | ) 14 | export default CodeBlock 15 | -------------------------------------------------------------------------------- /website/src/components/codeDynamic.js: -------------------------------------------------------------------------------- 1 | import dynamic from 'next/dynamic' 2 | 3 | export default dynamic(() => import('./code'), { 4 | loading: () =>
<div>Loading...</div>
, 5 | }) 6 | -------------------------------------------------------------------------------- /website/src/components/htmlToReact.js: -------------------------------------------------------------------------------- 1 | import { Parser as HtmlToReactParser } from 'html-to-react' 2 | 3 | const htmlToReactParser = new HtmlToReactParser() 4 | /** 5 | * Convert raw HTML to React elements 6 | * @param {string} html - The HTML markup to convert. 7 | * @returns {Node} - The converted React elements. 8 | */ 9 | 10 | export default function HtmlToReact(props) { 11 | return htmlToReactParser.parse(props.children) 12 | } 13 | -------------------------------------------------------------------------------- /website/src/components/list.js: -------------------------------------------------------------------------------- 1 | import React from 'react' 2 | import classNames from 'classnames' 3 | 4 | import classes from '../styles/list.module.sass' 5 | import { replaceEmoji } from './icon' 6 | 7 | export const Ol = (props) =>
    8 | export const Ul = (props) =>
9 | export const Li = ({ children, emoji, ...props }) => { 10 | const { hasIcon, content } = replaceEmoji(children) 11 | const liClassNames = classNames(classes.li, { 12 | [classes['li-icon']]: hasIcon, 13 | [classes.emoji]: emoji, 14 | }) 15 | return ( 16 | <li className={liClassNames} {...props}>
17 |     {content}
18 | </li>
19 | ) 20 | } 21 | -------------------------------------------------------------------------------- /website/src/components/markdownToReactDynamic.js: -------------------------------------------------------------------------------- 1 | import dynamic from 'next/dynamic' 2 | 3 | export default dynamic(() => import('./markdownToReact'), { 4 | loading: () => <div>Loading...</div>
      , 5 | }) 6 | -------------------------------------------------------------------------------- /website/src/components/search.js: -------------------------------------------------------------------------------- 1 | import React from 'react' 2 | import PropTypes from 'prop-types' 3 | import { DocSearch } from '@docsearch/react' 4 | import '@docsearch/css' 5 | 6 | import siteMetadata from '../../meta/site.json' 7 | 8 | export default function Search({ placeholder = 'Search docs' }) { 9 | const apiKey = process.env.DOCSEARCH_API_KEY 10 | const { indexName, appId } = siteMetadata.docSearch 11 | return ( 12 | 13 | ) 14 | } 15 | 16 | Search.propTypes = { 17 | id: PropTypes.string, 18 | placeholder: PropTypes.string, 19 | } 20 | -------------------------------------------------------------------------------- /website/src/fonts/hkgrotesk-bold.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/src/fonts/hkgrotesk-bold.woff -------------------------------------------------------------------------------- /website/src/fonts/hkgrotesk-bold.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/src/fonts/hkgrotesk-bold.woff2 -------------------------------------------------------------------------------- /website/src/fonts/hkgrotesk-bolditalic.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/src/fonts/hkgrotesk-bolditalic.woff -------------------------------------------------------------------------------- /website/src/fonts/hkgrotesk-bolditalic.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/src/fonts/hkgrotesk-bolditalic.woff2 -------------------------------------------------------------------------------- /website/src/fonts/hkgrotesk-semibold.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/src/fonts/hkgrotesk-semibold.woff -------------------------------------------------------------------------------- /website/src/fonts/hkgrotesk-semibold.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/src/fonts/hkgrotesk-semibold.woff2 -------------------------------------------------------------------------------- /website/src/fonts/hkgrotesk-semibolditalic.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/src/fonts/hkgrotesk-semibolditalic.woff -------------------------------------------------------------------------------- /website/src/fonts/hkgrotesk-semibolditalic.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/src/fonts/hkgrotesk-semibolditalic.woff2 -------------------------------------------------------------------------------- 
/website/src/fonts/jetbrainsmono-italic.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/src/fonts/jetbrainsmono-italic.woff -------------------------------------------------------------------------------- /website/src/fonts/jetbrainsmono-italic.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/src/fonts/jetbrainsmono-italic.woff2 -------------------------------------------------------------------------------- /website/src/fonts/jetbrainsmono-regular.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/src/fonts/jetbrainsmono-regular.woff -------------------------------------------------------------------------------- /website/src/fonts/jetbrainsmono-regular.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/src/fonts/jetbrainsmono-regular.woff2 -------------------------------------------------------------------------------- /website/src/images/icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/src/images/icon.png -------------------------------------------------------------------------------- /website/src/images/icon_legacy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/src/images/icon_legacy.png -------------------------------------------------------------------------------- /website/src/images/icon_nightly.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/src/images/icon_nightly.png -------------------------------------------------------------------------------- /website/src/images/icons/accept.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /website/src/images/icons/clipboard.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | -------------------------------------------------------------------------------- /website/src/images/icons/code.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /website/src/images/icons/docs.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /website/src/images/icons/info.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /website/src/images/icons/moon.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 
| -------------------------------------------------------------------------------- /website/src/images/icons/neutral.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /website/src/images/icons/no.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /website/src/images/icons/offline.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | -------------------------------------------------------------------------------- /website/src/images/icons/package.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /website/src/images/icons/reject.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /website/src/images/icons/search.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /website/src/images/icons/twitter.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /website/src/images/icons/warning.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /website/src/images/icons/yes.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /website/src/images/pattern_blue.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/src/images/pattern_blue.png -------------------------------------------------------------------------------- /website/src/images/pattern_green.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/src/images/pattern_green.png -------------------------------------------------------------------------------- /website/src/images/pattern_landing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/src/images/pattern_landing.png -------------------------------------------------------------------------------- /website/src/images/pattern_landing_legacy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/src/images/pattern_landing_legacy.png -------------------------------------------------------------------------------- /website/src/images/pattern_landing_nightly.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/src/images/pattern_landing_nightly.png -------------------------------------------------------------------------------- /website/src/images/pattern_legacy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/src/images/pattern_legacy.png -------------------------------------------------------------------------------- /website/src/images/pattern_nightly.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/src/images/pattern_nightly.png -------------------------------------------------------------------------------- /website/src/images/pattern_purple.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/src/images/pattern_purple.png -------------------------------------------------------------------------------- /website/src/images/social_api.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/src/images/social_api.jpg -------------------------------------------------------------------------------- /website/src/images/social_default.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/src/images/social_default.jpg -------------------------------------------------------------------------------- /website/src/images/social_legacy.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/src/images/social_legacy.jpg -------------------------------------------------------------------------------- /website/src/images/social_nightly.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/src/images/social_nightly.jpg -------------------------------------------------------------------------------- /website/src/images/social_universe.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/src/images/social_universe.jpg -------------------------------------------------------------------------------- /website/src/images/spacy-irl.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spaCy/41e07772dc5805594bab2997a090a9033e26bf56/website/src/images/spacy-irl.jpg -------------------------------------------------------------------------------- /website/src/styles/alert.module.sass: -------------------------------------------------------------------------------- 1 | .root 2 | position: fixed 3 | bottom: 0 4 | left: 0 5 | width: 100% 6 | background: var(--color-back) 7 | z-index: 100 8 | font: var(--font-size-sm)/var(--line-height-md) var(--font-primary) 9 | text-align: center 10 | padding: 1rem 11 | 
box-shadow: var(--box-shadow) 12 | border-top: 2px solid 13 | color: var(--color-theme-dark) 14 | 15 | .warning 16 | --alert-bg: var(--color-yellow-light) 17 | --color-theme: var(--color-yellow-dark) 18 | --color-theme-dark: var(--color-yellow-dark) 19 | --color-inline-code-bg: var(--color-yellow-opaque) 20 | background: var(--color-yellow-light) 21 | color: var(--color-yellow-dark) 22 | 23 | .clickable 24 | cursor: pointer 25 | -------------------------------------------------------------------------------- /website/src/styles/card.module.sass: -------------------------------------------------------------------------------- 1 | .root 2 | background: var(--color-subtle-light) 3 | border-radius: var(--border-radius) 4 | padding: 2rem 5 | font: var(--font-size-md)/var(--line-height-md) var(--font-primary) 6 | margin-bottom: var(--spacing-sm) 7 | 8 | .small 9 | padding: 1.5rem 10 | font-size: var(--font-size-sm) 11 | line-height: var(--line-height-sm) 12 | color: var(--color-dark) 13 | 14 | .title 15 | margin-bottom: var(--spacing-xs) 16 | 17 | .image 18 | $image-size: 35px 19 | width: $image-size 20 | height: $image-size 21 | overflow: hidden 22 | float: right 23 | border-radius: 50% 24 | -------------------------------------------------------------------------------- /website/src/styles/copy.module.sass: -------------------------------------------------------------------------------- 1 | .root 2 | background: var(--color-back) 3 | border-radius: 2em 4 | border: 1px solid var(--color-subtle) 5 | width: 100% 6 | padding: 0.25em 1em 7 | display: inline-flex 8 | margin: var(--spacing-xs) 0 9 | font: var(--font-size-code)/var(--line-height-code) var(--font-code) 10 | -webkit-font-smoothing: subpixel-antialiased 11 | -moz-osx-font-smoothing: auto 12 | 13 | .textarea 14 | flex: 100% 15 | background: transparent 16 | resize: none 17 | font: inherit 18 | overflow: hidden 19 | white-space: nowrap 20 | text-overflow: ellipsis 21 | margin-right: 1rem 22 | 23 | .prefix 24 | margin-right: 0.75em 25 | color: var(--color-subtle-dark) 26 | -------------------------------------------------------------------------------- /website/src/styles/icon.module.sass: -------------------------------------------------------------------------------- 1 | .root 2 | vertical-align: middle 3 | 4 | .inline 5 | margin: 0 0.55em 0 0.1em 6 | 7 | .tag 8 | vertical-align: bottom 9 | height: 100% 10 | position: relative 11 | top: 1px 12 | 13 | .success 14 | color: var(--color-green-medium) 15 | 16 | .error 17 | color: var(--color-red-medium) 18 | 19 | .subtle 20 | color: var(--color-subtle-dark) 21 | -------------------------------------------------------------------------------- /website/src/styles/link.module.sass: -------------------------------------------------------------------------------- 1 | .root 2 | color: var(--color-theme-dark) 3 | border-bottom: 1px solid 4 | transition: color 0.2s ease 5 | cursor: pointer 6 | 7 | &:hover 8 | color: var(--color-front) 9 | 10 | .no-link-layout 11 | border: none 12 | color: inherit 13 | 14 | &:hover 15 | color: inherit 16 | 17 | .icon 18 | margin-left: 0.5em 19 | width: 1.1em 20 | height: 1.1em 21 | 22 | .nowrap 23 | white-space: nowrap 24 | display: inline-block 25 | 26 | .with-icon 27 | border: none 28 | 29 | .source-text 30 | border-bottom: 1px solid 31 | -------------------------------------------------------------------------------- /website/src/styles/newsletter.module.sass: -------------------------------------------------------------------------------- 1 | .root 2 | font: 
var(--font-size-sm)/var(--line-height-sm) var(--font-primary) 3 | margin: var(--spacing-xs) 0 4 | background: var(--color-back) 5 | border-radius: 2em 6 | border: 1px solid var(--color-subtle) 7 | padding-right: 1em 8 | display: inline-flex 9 | max-width: 300px 10 | 11 | .input 12 | font: inherit 13 | background: transparent 14 | padding: 0.5em 1em 15 | margin: 0 0 0.25rem 0.25rem 16 | flex: 100% 17 | 18 | .button 19 | font: bold var(--font-size-lg)/var(--line-height-md) var(--font-secondary) 20 | text-transform: uppercase 21 | color: var(--color-theme-dark) 22 | white-space: nowrap 23 | -------------------------------------------------------------------------------- /website/src/styles/progress.module.sass: -------------------------------------------------------------------------------- 1 | .root 2 | display: block 3 | flex: 105% 4 | width: 105% 5 | height: 3px 6 | color: var(--color-theme) 7 | background: transparent 8 | border: none 9 | position: absolute 10 | bottom: 0 11 | left: -2.5% 12 | 13 | &::-webkit-progress-bar 14 | background: var(--color-back) 15 | border-radius: none 16 | 17 | &::-webkit-progress-value 18 | background: var(--color-theme) 19 | border-radius: none 20 | 21 | &::-moz-progress-bar 22 | background: var(--color-theme) 23 | -------------------------------------------------------------------------------- /website/src/styles/readnext.module.sass: -------------------------------------------------------------------------------- 1 | .root 2 | display: flex 3 | justify-content: flex-end 4 | align-items: center 5 | text-align: right 6 | font: var(--font-size-sm)/var(--line-height-md) var(--font-primary) 7 | 8 | .icon 9 | $icon-size: 35px 10 | width: $icon-size 11 | height: $icon-size 12 | background: var(--color-subtle-light) 13 | color: var(--color-subtle-dark) 14 | border-radius: 50% 15 | padding: 0.5rem 0.65rem 0.5rem 0 16 | transition: color 0.2s ease 17 | float: right 18 | margin-left: 3rem 19 | 20 | &:hover 21 | color: var(--color-theme-dark) 22 | -------------------------------------------------------------------------------- /website/src/styles/section.module.sass: -------------------------------------------------------------------------------- 1 | .root 2 | &:not(:last-child):not(:last-of-type) 3 | margin-bottom: var(--spacing-md) 4 | padding-bottom: var(--spacing-md) 5 | border-bottom: 1px dotted var(--color-subtle) 6 | 7 | .hr 8 | border: 0 9 | padding: var(--spacing-sm) 0 10 | -------------------------------------------------------------------------------- /website/src/styles/tag.module.sass: -------------------------------------------------------------------------------- 1 | .root 2 | display: inline-block 3 | font: bold var(--font-size-xs)/#{1} var(--font-secondary) 4 | background: var(--color-theme-dark) 5 | color: var(--color-back) 6 | padding: 2px 6px 4px 7 | border-radius: 1em 8 | text-transform: uppercase 9 | vertical-align: middle 10 | 11 | .spaced 12 | margin-left: 0.75em 13 | margin-right: 0.5em 14 | 15 | .icon 16 | margin-left: 0.5em 17 | -------------------------------------------------------------------------------- /website/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "target": "es5", 4 | "lib": ["dom", "dom.iterable", "esnext"], 5 | "allowJs": true, 6 | "skipLibCheck": true, 7 | "strict": false, 8 | "forceConsistentCasingInFileNames": true, 9 | "noEmit": true, 10 | "esModuleInterop": true, 11 | "module": "esnext", 12 | "moduleResolution": "node", 13 | 
"resolveJsonModule": true, 14 | "isolatedModules": true, 15 | "jsx": "preserve", 16 | "incremental": true 17 | }, 18 | "include": ["next-env.d.ts", "**/*.ts", "**/*.tsx"], 19 | "exclude": ["node_modules"] 20 | } 21 | --------------------------------------------------------------------------------