├── .agignore ├── .babelrc ├── .circleci └── config.yml ├── .eslintignore ├── .eslintrc ├── .gitattributes ├── .github ├── ISSUE_TEMPLATE.md └── PULL_REQUEST_TEMPLATE.md ├── .gitignore ├── .nvmrc ├── .prettierignore ├── .prettierrc ├── .remarkrc ├── CHANGELOG.md ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE-APACHE ├── LICENSE-MIT ├── README.md ├── RELEASE.md ├── assets └── parser-basic-usage.gif ├── cli.js ├── dist ├── generate-custom-parser.js ├── generate-custom-parser.js.map ├── mercury.esm.js ├── mercury.esm.js.map ├── mercury.js ├── mercury.js.map ├── mercury.web.js └── mercury.web.js.map ├── fixtures ├── 247sports.com.html ├── abcnews.go.com.html ├── arstechnica.com.html ├── biorxiv.org.html ├── blisterreview.com.html ├── bookwalker.jp.html ├── buzzap.jp.html ├── clinicaltrials.gov.html ├── deadline.com.html ├── deadspin.com--video.html ├── deadspin.com.html ├── epaper.zeit.de.html ├── fandom.wikia.com.html ├── fortune.com.html ├── forward.com.html ├── genius.com.html ├── getnews.jp.html ├── github.com.html ├── gothamist.com.html ├── hellogiggles.com.html ├── ici.radio-canada.ca.html ├── japan.cnet.com.html ├── japan.zdnet.com.html ├── jvndb.jvn.jp.html ├── ma.ttias.be.html ├── mashable.com.html ├── medium.com--another.html ├── medium.com.html ├── money.cnn.com.html ├── newrepublic.com--minutes.html ├── newrepublic.com.html ├── news.mynavi.jp.html ├── news.nationalgeographic.com.html ├── nock │ ├── fetch-resource-test.js │ ├── mercury-test.js │ └── resource-test.js ├── nymag.com.html ├── obamawhitehouse.archives.gov--blog.html ├── obamawhitehouse.archives.gov--empty.html ├── obamawhitehouse.archives.gov--speeches.html ├── obamawhitehouse.archives.gov.html ├── observer.com.html ├── otrs.com.html ├── pagesix.com.html ├── pastebin.com.html ├── people.com.html ├── phpspot.org.html ├── pitchfork.com.html ├── postlight.com.html ├── qz.com.html ├── sandiegouniontribune.com.html ├── scan.netsecurity.ne.jp.html ├── sciencefly.com.html ├── sect.iij.ad.jp.html 
├── takagi-hiromitsu.jp.html ├── techlog.iij.ad.jp.html ├── thefederalistpapers.org.html ├── thoughtcatalog.com.html ├── timesofindia.indiatimes.com.html ├── twitter.com.html ├── uproxx.com.html ├── weekly.ascii.jp.html ├── wired.jp.html ├── www.abendblatt.de.html ├── www.al.com.html ├── www.americanow.com.html ├── www.androidcentral.com.html ├── www.aol.com.html ├── www.apartmenttherapy.com.html ├── www.asahi.com.html ├── www.bloomberg.com--graphics.html ├── www.bloomberg.com--news.html ├── www.bloomberg.com.html ├── www.broadwayworld.com.html ├── www.bustle.com.html ├── www.buzzfeed.com--splash.html ├── www.buzzfeed.com.html ├── www.cbc.ca.html ├── www.cbssports.com.html ├── www.chicagotribune.com.html ├── www.cnbc.com--redesign.html ├── www.cnbc.com.html ├── www.cnet.com.html ├── www.cnn.com.html ├── www.dmagazine.com.html ├── www.elecom.co.jp.html ├── www.engadget.com.html ├── www.eonline.com.html ├── www.fastcompany.com.html ├── www.fool.com.html ├── www.fortinet.com.html ├── www.gizmodo.jp.html ├── www.gruene.de.html ├── www.huffingtonpost.com.html ├── www.infoq.com.html ├── www.inquisitr.com.html ├── www.investmentexecutive.com.html ├── www.ipa.go.jp.html ├── www.itmedia.co.jp.html ├── www.jnsa.org.html ├── www.ladbible.com.html ├── www.latimes.com--old.html ├── www.latimes.com.html ├── www.lemonde.fr.html ├── www.lifehacker.jp.html ├── www.linkedin.com.html ├── www.littlethings.com.html ├── www.macrumors.com.html ├── www.mentalfloss.com.html ├── www.miamiherald.com.html ├── www.moongift.jp.html ├── www.msn.com.html ├── www.msnbc.com.html ├── www.nationalgeographic.com.html ├── www.nbcnews.com.html ├── www.ndtv.com.html ├── www.newyorker.com--magazine.html ├── www.newyorker.com--multiple-authors.html ├── www.newyorker.com.html ├── www.npr.org.html ├── www.nydailynews.com.html ├── www.nytimes.com--feature.html ├── www.nytimes.com--recent.html ├── www.nytimes.com.html ├── www.opposingviews.com.html ├── www.oreilly.co.jp.html ├── www.ossnews.jp.html ├── 
www.phoronix.com.html ├── www.politico.com--test-case-2.html ├── www.politico.com--test-case-3.html ├── www.politico.com.html ├── www.popsugar.com.html ├── www.prospectmagazine.co.uk.html ├── www.publickey1.jp.html ├── www.qdaily.com.html ├── www.rawstory.com.html ├── www.rbbtoday.com.html ├── www.recode.net.html ├── www.reddit.com--embedded.html ├── www.reddit.com--external-image.html ├── www.reddit.com--external-link.html ├── www.reddit.com--image.html ├── www.reddit.com--title-only.html ├── www.reddit.com--video.html ├── www.reddit.com.html ├── www.refinery29.com.html ├── www.reuters.com.html ├── www.rollingstone.com.html ├── www.sanwa.co.jp.html ├── www.sbnation.com.html ├── www.si.com.html ├── www.slate.com.html ├── www.spektrum.de.html ├── www.theatlantic.com.html ├── www.theguardian.com.html ├── www.thepennyhoarder.com.html ├── www.thepoliticalinsider.com.html ├── www.theverge.com--feature.html ├── www.theverge.com.html ├── www.tmz.com.html ├── www.today.com.html ├── www.usmagazine.com.html ├── www.vox.com.html ├── www.vulture.com--content-test.html ├── www.vulture.com.html ├── www.washingtonpost.com.html ├── www.westernjournalism.com.html ├── www.wired.com--content-test.html ├── www.wired.com--other.html ├── www.wired.com.html ├── www.yahoo.com.html ├── www.yomiuri.co.jp.html └── www.youtube.com.html ├── karma.conf.js ├── package.json ├── preview ├── rollup.config.esm.js ├── rollup.config.js ├── rollup.config.web.js ├── score-move ├── scripts ├── check-build.test.js ├── comment-for-pr.js ├── find-and-replace.sh ├── generate-custom-parser.js ├── generate-fixture-preview.js ├── karma.conf.js ├── pr-parser-preview.sh ├── proxy-browser-test.js ├── rollup.config.js ├── templates │ ├── custom-extractor-test.js │ ├── custom-extractor.js │ ├── index.js │ └── insert-values.js ├── update-fixtures.js └── write-test-report.js ├── src ├── cleaners │ ├── author.js │ ├── author.test.js │ ├── constants.js │ ├── content.js │ ├── content.test.js │ ├── date-published.js │ ├── 
date-published.test.js │ ├── dek.js │ ├── dek.test.js │ ├── index.js │ ├── lead-image-url.js │ ├── lead-image-url.test.js │ ├── resolve-split-title.js │ ├── resolve-split-title.test.js │ ├── title.js │ └── title.test.js ├── extractors │ ├── add-extractor.js │ ├── add-extractor.test.js │ ├── all.js │ ├── collect-all-pages.js │ ├── constants.js │ ├── custom │ │ ├── 247sports.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── README.md │ │ ├── abcnews.go.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── arstechnica.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── biorxiv.org │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── blisterreview.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── blogspot.com │ │ │ └── index.js │ │ ├── bookwalker.jp │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── buzzap.jp │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── clinicaltrials.gov │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── deadline.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── deadspin.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── epaper.zeit.de │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── fandom.wikia.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── fortune.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── forward.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── genius.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── getnews.jp │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── github.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── gothamist.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── hellogiggles.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── ici.radio-canada.ca │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── index.js │ │ ├── japan.cnet.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── japan.zdnet.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── jvndb.jvn.jp │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── ma.ttias.be │ │ │ ├── index.js │ │ │ └── index.test.js │ 
│ ├── mashable.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── medium.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── money.cnn.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── newrepublic.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── news.mynavi.jp │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── news.nationalgeographic.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── nymag.com │ │ │ ├── fixtures │ │ │ │ └── test.html │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── obamawhitehouse.archives.gov │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── observer.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── otrs.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── pagesix.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── pastebin.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── people.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── phpspot.org │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── pitchfork.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── postlight.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── qz.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── scan.netsecurity.ne.jp │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── sciencefly.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── sect.iij.ad.jp │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── takagi-hiromitsu.jp │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── techlog.iij.ad.jp │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── thefederalistpapers.org │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── thoughtcatalog.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── timesofindia.indiatimes.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── twitter.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── uproxx.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── weekly.ascii.jp │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── wikipedia.org │ │ │ └── index.js │ │ ├── wired.jp │ │ │ ├── index.js │ │ │ └── 
index.test.js │ │ ├── www.abendblatt.de │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.al.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.americanow.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.androidcentral.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.aol.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.apartmenttherapy.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.asahi.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.bloomberg.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.broadwayworld.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.bustle.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.buzzfeed.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.cbc.ca │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.cbssports.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.chicagotribune.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.cnbc.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.cnet.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.cnn.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.dmagazine.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.elecom.co.jp │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.engadget.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.eonline.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.fastcompany.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.fool.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.fortinet.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.gizmodo.jp │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.gruene.de │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.huffingtonpost.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.infoq.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.inquisitr.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── 
www.investmentexecutive.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.ipa.go.jp │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.itmedia.co.jp │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.jnsa.org │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.ladbible.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.latimes.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.lemonde.fr │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.lifehacker.jp │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.linkedin.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.littlethings.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.macrumors.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.mentalfloss.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.miamiherald.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.moongift.jp │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.msn.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.msnbc.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.nationalgeographic.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.nbcnews.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.ndtv.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.newyorker.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.npr.org │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.nydailynews.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.nytimes.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.opposingviews.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.oreilly.co.jp │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.ossnews.jp │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.phoronix.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.politico.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.popsugar.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── 
www.prospectmagazine.co.uk │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.publickey1.jp │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.qdaily.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.rawstory.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.rbbtoday.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.recode.net │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.reddit.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.refinery29.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.reuters.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.rollingstone.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.sanwa.co.jp │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.sbnation.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.si.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.slate.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.spektrum.de │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.theatlantic.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.theguardian.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.thepennyhoarder.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.thepoliticalinsider.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.theverge.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.tmz.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.today.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.usmagazine.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.vox.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.washingtonpost.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.westernjournalism.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.wired.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.yahoo.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.yomiuri.co.jp │ │ │ ├── index.js │ │ │ └── index.test.js │ │ └── 
www.youtube.com │ │ │ ├── index.js │ │ │ └── index.test.js │ ├── detect-by-html.js │ ├── detect-by-html.test.js │ ├── fixtures │ │ └── postlight.com │ │ │ └── index.js │ ├── generic │ │ ├── author │ │ │ ├── constants.js │ │ │ ├── extractor.js │ │ │ └── extractor.test.js │ │ ├── content │ │ │ ├── extract-best-node.js │ │ │ ├── extract-best-node.test.js │ │ │ ├── extractor.js │ │ │ ├── extractor.test.js │ │ │ └── scoring │ │ │ │ ├── add-score.js │ │ │ │ ├── add-score.test.js │ │ │ │ ├── add-to-parent.js │ │ │ │ ├── add-to-parent.test.js │ │ │ │ ├── constants.js │ │ │ │ ├── find-top-candidate.js │ │ │ │ ├── find-top-candidate.test.js │ │ │ │ ├── get-or-init-score.js │ │ │ │ ├── get-or-init-score.test.js │ │ │ │ ├── get-score.js │ │ │ │ ├── get-score.test.js │ │ │ │ ├── get-weight.js │ │ │ │ ├── get-weight.test.js │ │ │ │ ├── index.js │ │ │ │ ├── merge-siblings.js │ │ │ │ ├── score-commas.js │ │ │ │ ├── score-commas.test.js │ │ │ │ ├── score-content.js │ │ │ │ ├── score-content.test.js │ │ │ │ ├── score-length.js │ │ │ │ ├── score-length.test.js │ │ │ │ ├── score-node.js │ │ │ │ ├── score-node.test.js │ │ │ │ ├── score-paragraph.js │ │ │ │ ├── score-paragraph.test.js │ │ │ │ ├── set-score.js │ │ │ │ └── set-score.test.js │ │ ├── date-published │ │ │ ├── constants.js │ │ │ ├── extractor.js │ │ │ └── extractor.test.js │ │ ├── dek │ │ │ ├── extractor.js │ │ │ └── extractor.test.js │ │ ├── excerpt │ │ │ ├── constants.js │ │ │ ├── extractor.js │ │ │ └── extractor.test.js │ │ ├── index.js │ │ ├── index.test.js │ │ ├── lead-image-url │ │ │ ├── constants.js │ │ │ ├── extractor.js │ │ │ ├── extractor.test.js │ │ │ ├── score-image.js │ │ │ └── score-image.test.js │ │ ├── next-page-url │ │ │ ├── extractor.js │ │ │ ├── extractor.test.js │ │ │ └── scoring │ │ │ │ ├── constants.js │ │ │ │ ├── score-links.js │ │ │ │ ├── score-links.test.js │ │ │ │ └── utils │ │ │ │ ├── index.js │ │ │ │ ├── score-base-url.js │ │ │ │ ├── score-base-url.test.js │ │ │ │ ├── score-by-parents.js │ │ │ │ 
├── score-by-parents.test.js │ │ │ │ ├── score-cap-links.js │ │ │ │ ├── score-cap-links.test.js │ │ │ │ ├── score-extraneous-links.js │ │ │ │ ├── score-extraneous-links.test.js │ │ │ │ ├── score-link-text.js │ │ │ │ ├── score-link-text.test.js │ │ │ │ ├── score-next-link-text.js │ │ │ │ ├── score-next-link-text.test.js │ │ │ │ ├── score-page-in-link.js │ │ │ │ ├── score-page-in-link.test.js │ │ │ │ ├── score-prev-link.js │ │ │ │ ├── score-prev-link.test.js │ │ │ │ ├── score-similarity.js │ │ │ │ ├── score-similarity.test.js │ │ │ │ ├── should-score.js │ │ │ │ └── should-score.test.js │ │ ├── title │ │ │ ├── constants.js │ │ │ ├── extractor.js │ │ │ └── extractor.test.js │ │ ├── url │ │ │ ├── constants.js │ │ │ ├── extractor.js │ │ │ └── extractor.test.js │ │ └── word-count │ │ │ ├── extractor.js │ │ │ └── extractor.test.js │ ├── get-extractor.js │ ├── get-extractor.test.js │ ├── index.js │ ├── root-extractor.js │ └── root-extractor.test.js ├── mercury.js ├── mercury.test.js ├── resource │ ├── index.js │ ├── index.test.js │ └── utils │ │ ├── constants.js │ │ ├── dom │ │ ├── clean.js │ │ ├── clean.test.js │ │ ├── constants.js │ │ ├── convert-lazy-loaded-images.js │ │ ├── convert-lazy-loaded-images.test.js │ │ ├── index.js │ │ ├── normalize-meta-tags.js │ │ └── normalize-meta-tags.test.js │ │ ├── fetch-resource.js │ │ ├── fetch-resource.test.js │ │ └── index.js ├── shims │ ├── cheerio-query.js │ └── iconv-lite.js ├── test-helpers.js └── utils │ ├── dom │ ├── brs-to-ps.js │ ├── brs-to-ps.test.js │ ├── clean-attributes.js │ ├── clean-attributes.test.js │ ├── clean-h-ones.js │ ├── clean-h-ones.test.js │ ├── clean-headers.js │ ├── clean-headers.test.js │ ├── clean-images.js │ ├── clean-images.test.js │ ├── clean-tags.js │ ├── clean-tags.test.js │ ├── constants.js │ ├── convert-node-to.js │ ├── convert-node-to.test.js │ ├── convert-to-paragraphs.js │ ├── convert-to-paragraphs.test.js │ ├── extract-from-meta.js │ ├── extract-from-meta.test.js │ ├── extract-from-selectors.js 
│ ├── extract-from-selectors.test.js │ ├── get-attrs.js │ ├── get-attrs.test.js │ ├── index.js │ ├── is-wordpress.js │ ├── is-wordpress.test.js │ ├── link-density.js │ ├── link-density.test.js │ ├── make-links-absolute.js │ ├── make-links-absolute.test.js │ ├── mark-to-keep.js │ ├── mark-to-keep.test.js │ ├── node-is-sufficient.js │ ├── node-is-sufficient.test.js │ ├── paragraphize.js │ ├── paragraphize.test.js │ ├── remove-empty.js │ ├── remove-empty.test.js │ ├── rewrite-top-level.js │ ├── rewrite-top-level.test.js │ ├── set-attr.js │ ├── set-attr.test.js │ ├── set-attrs.js │ ├── set-attrs.test.js │ ├── strip-junk-tags.js │ ├── strip-junk-tags.test.js │ ├── strip-tags.js │ ├── strip-tags.test.js │ ├── strip-unlikely-candidates.js │ ├── strip-unlikely-candidates.test.js │ ├── within-comment.js │ └── within-comment.test.js │ ├── index.js │ ├── merge-supported-domains.js │ ├── merge-supported-domains.test.js │ ├── range.js │ ├── text │ ├── article-base-url.js │ ├── article-base-url.test.js │ ├── constants.js │ ├── excerpt-content.js │ ├── excerpt.test.js │ ├── extract-from-url.js │ ├── extract-from-url.test.js │ ├── get-encoding.js │ ├── get-encoding.test.js │ ├── has-sentence-end.js │ ├── index.js │ ├── normalize-spaces.js │ ├── normalize-spaces.test.js │ ├── page-num-from-url.js │ ├── page-num-from-url.test.js │ ├── remove-anchor.js │ └── remove-anchor.test.js │ ├── validate-url.js │ └── validate-url.test.js └── yarn.lock /.agignore: -------------------------------------------------------------------------------- 1 | dist 2 | -------------------------------------------------------------------------------- /.babelrc: -------------------------------------------------------------------------------- 1 | { 2 | "presets": ["@babel/preset-env"], 3 | "plugins": [ 4 | [ 5 | "module-resolver", 6 | { 7 | "root": ["./src"], 8 | "alias": { 9 | "./utils": "utils", 10 | "./cleaners": "cleaners", 11 | "./resource": "resource", 12 | "./extractors": "extractors", 13 | 
"./test-helpers.js": "test-helpers", 14 | "./mercury.js": "mercury" 15 | } 16 | } 17 | ] 18 | ], 19 | "env": { 20 | "development": { 21 | "plugins": [ 22 | [ 23 | "@babel/plugin-transform-runtime", 24 | { 25 | "corejs": 2, 26 | "regenerator": true 27 | } 28 | ] 29 | ] 30 | } 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /.eslintignore: -------------------------------------------------------------------------------- 1 | **/fixtures/* 2 | dist/* 3 | coverage/* 4 | karma.conf.js 5 | -------------------------------------------------------------------------------- /.eslintrc: -------------------------------------------------------------------------------- 1 | { 2 | "parser": "babel-eslint", 3 | "extends": ["airbnb", "prettier"], 4 | "plugins": ["babel"], 5 | "globals": { 6 | "describe": true, 7 | "it": true, 8 | "fit": true, 9 | "jasmine": true, 10 | "beforeEach": true, 11 | "beforeAll": true, 12 | "afterAll": true 13 | }, 14 | "rules": { 15 | "no-param-reassign": 0, 16 | "no-control-regex": 0, 17 | "import/prefer-default-export": 0, 18 | "generator-star-spacing": 0, 19 | "babel/generator-star-spacing": 0, 20 | "func-names": 0, 21 | "no-confusing-arrow": 0, 22 | "camelcase": 0, 23 | "no-multiple-empty-lines": [ 24 | "error", 25 | { "max": 1, "maxEOF": 0, "maxBOF": 0 } 26 | ], 27 | "import/no-unresolved": false, 28 | "import/no-extraneous-dependencies": [ 29 | "error", 30 | { 31 | "devDependencies": [ 32 | "**/*.test.js", 33 | "scripts/proxy-browser-test.js", 34 | "rollup.config*js" 35 | ] 36 | } 37 | ] 38 | }, 39 | "settings": { 40 | "import/resolver": { 41 | "babel-module": {} 42 | } 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # All html files are fixtures, so marking as vendored 2 | # so Linguist (https://github.com/github/linguist) 3 | # ignores them for the 
purpose of language detection 4 | *.html linguist-vendored 5 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | 9 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | build 3 | npm-debug.log 4 | TODO.md 5 | read 6 | preview.html 7 | preview.json 8 | coverage 9 | dist/mercury_test.js 10 | dist/mercury_test.js.map 11 | dist/mercury_test.web.js 12 | tmp/artifacts 13 | test-output.json 14 | .tool-versions 15 | .yarnrc.yml 16 | **/.DS_Store 17 | -------------------------------------------------------------------------------- /.nvmrc: -------------------------------------------------------------------------------- 1 | 12.8.1 2 | -------------------------------------------------------------------------------- /.prettierignore: -------------------------------------------------------------------------------- 1 | dist 2 | -------------------------------------------------------------------------------- /.prettierrc: -------------------------------------------------------------------------------- 1 | { 2 | "trailingComma": "es5", 3 | "semi": true, 4 | "singleQuote": true, 5 | "printWidth": 80, 6 | "tabWidth": 2, 7 | "useTabs": false, 8 | "bracketSpacing": true, 9 | "arrowParens": "avoid" 10 | } 11 | -------------------------------------------------------------------------------- /.remarkrc: -------------------------------------------------------------------------------- 1 | { 2 | "plugins": [ 3 | "remark-preset-lint-recommended", 4 | ["remark-lint-list-item-indent", false] 5 | ] 6 | } 7 | -------------------------------------------------------------------------------- /LICENSE-MIT: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | 
Copyright (c) 2019 Postlight 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /RELEASE.md: -------------------------------------------------------------------------------- 1 | # How to cut a new release 2 | 3 | 1. Create a release branch. 4 | 5 | ```bash 6 | git checkout -b release-1.x.x # (where 1.x.x reflects the release) 7 | ``` 8 | 9 | 2. Update package.json with the version number 10 | 3. Build the release 11 | 12 | ```bash 13 | yarn release 14 | ``` 15 | 16 | 4. Update the changelog 17 | 18 | ```bash 19 | # Copy the output of the command below and paste it into CHANGELOG.md 20 | # following the conventions of that file 21 | yarn changelog-maker postlight parser 22 | ``` 23 | 24 | 5. Submit a PR 25 | 6. Merge once the PR's tests pass 26 | 7. 
[Create a release](https://github.com/postlight/parser/releases), linking to this release's entry in the changelog. (See other releases for context.) 27 | -------------------------------------------------------------------------------- /assets/parser-basic-usage.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/postlight/parser/e8ba7ece291efa4d915d50dd4deeec17d54359f2/assets/parser-basic-usage.gif -------------------------------------------------------------------------------- /karma.conf.js: -------------------------------------------------------------------------------- 1 | module.exports = function (config) { 2 | config.set({ 3 | 4 | basePath: '', 5 | 6 | frameworks: ['jasmine', 'browserify'], 7 | files: [ 8 | { pattern: 'src/**/*.test.js', included: true }, 9 | ], 10 | 11 | exclude: [], 12 | 13 | preprocessors: { 14 | 'src/**/*.js': ['browserify'], 15 | }, 16 | 17 | browserify: { 18 | debug: true, 19 | transform: ['babelify', 'brfs'], 20 | }, 21 | 22 | reporters: ['progress'], 23 | port: 9876, 24 | colors: true, 25 | logLevel: config.LOG_INFO, 26 | autoWatch: false, 27 | browsers: ['Chrome'], 28 | singleRun: true, 29 | concurrency: Infinity, 30 | }); 31 | }; 32 | -------------------------------------------------------------------------------- /preview: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | var fs = require('fs') 3 | var execSync = require('child_process').execSync 4 | 5 | var optRe = /^--/ 6 | var args = process.argv.slice(2).reduce((acc, arg) => { 7 | if (optRe.test(arg)) { 8 | acc.opts.push(arg) 9 | } else { 10 | acc.urls.push(arg) 11 | } 12 | 13 | return acc 14 | }, { opts: [], urls: [] }) 15 | 16 | var urls = args.urls 17 | 18 | if (!args.opts.find(arg => arg === '--no-rebuild')) { 19 | console.log('Rebuilding Mercury') 20 | execSync('MERCURY_TEST_BUILD=true npm run build') 21 | } 22 | 23 | var Mercury = 
require('./dist/mercury_test') 24 | 25 | console.log(`Fetching link(s)`) 26 | 27 | urls.map(url => { 28 | Mercury.parse(url, { fallback: false }).then(function(result) { 29 | var htmlFile = './preview.html' 30 | var jsonFile = './preview.json' 31 | 32 | var html = `
Foo
'); 10 | const $node = $('p').first(); 11 | addScore($node, $, 25); 12 | assert.equal(getScore($node), 50); 13 | }); 14 | 15 | it('adds score if score not yet set (assumes score is 0)', () => { 16 | const $ = cheerio.load('Foo
// Credits a node's parent with a quarter of the given score.
//
// node  - wrapped node whose parent receives the points
// $     - handed through untouched to addScore
// score - the child's score; the parent is given score * 0.25
//
// Always returns the node that was passed in.
export default function addToParent(node, $, score) {
  const $parent = node.parent();

  // Nothing to credit when no parent value comes back.
  if (!$parent) {
    return node;
  }

  addScore($parent, $, score * 0.25);
  return node;
}
Foo
'); 10 | assert.equal(getScore($('p').first()), null); 11 | }); 12 | 13 | it('returns 25 if the node has a score attr of 25', () => { 14 | const $ = cheerio.load('Foo
// Awards one point per comma found in `text`.
// Text without any commas scores 0.
export default function scoreCommas(text) {
  const commas = text.match(/,/g);

  // match() yields nothing when the text contains no commas
  if (!commas) {
    return 0;
  }

  return commas.length;
}
// Tags that receive the larger deduction below (matched case-insensitively).
const PARAGRAPH_LIKE_TAG = /^(p|pre)$/i;

/**
 * Turn a text length into a length bonus between 0 and 3.
 *
 * The length is measured in 50-character chunks. A fixed deduction is taken
 * from the chunk count: 2 for `p`/`pre` tags, 1.25 for any other tag. The
 * result is clamped to the range [0, 3].
 *
 * NOTE: the original author recorded that they did not know why `p`/`pre`
 * are tamped down harder, nor why `tagName` is a parameter at all given that
 * the only caller is scoreParagraph; the behavior simply follows the source
 * it was ported from.
 *
 * @param {number} textLength - Number of characters in the text.
 * @param {string} [tagName='p'] - Tag name of the node being scored.
 * @returns {number} Bonus in [0, 3]; 0 when the chunk count is not positive.
 */
export default function scoreLength(textLength, tagName = 'p') {
  const chunks = textLength / 50;

  // Nothing to reward for empty (or otherwise non-positive) lengths.
  if (!(chunks > 0)) {
    return 0;
  }

  const deduction = PARAGRAPH_LIKE_TAG.test(tagName) ? 2 : 1.25;
  const lengthBonus = chunks - deduction;

  return Math.min(Math.max(lengthBonus, 0), 3);
}
/**
 * Score a single node. Paragraph-like tags are delegated to scoreParagraph;
 * every other node gets a fixed score chosen by its tag name.
 *
 * Checks run in this order and the first match wins:
 *   PARAGRAPH_SCORE_TAGS -> scoreParagraph($node)
 *   div                  -> 5
 *   CHILD_CONTENT_TAGS   -> 3
 *   BAD_TAGS             -> -3
 *   th                   -> -5
 *   anything else        -> 0
 *
 * @param {object} $node - Wrapped node; `$node.get(0).tagName` is read.
 * @returns {number} The node's score.
 */
export default function scoreNode($node) {
  const { tagName } = $node.get(0);
  const hasTag = name => tagName.toLowerCase() === name;

  // TODO: Consider ordering by most likely.
  // E.g., if divs are a more common tag on a page,
  // could save doing that regex test on every node – AP
  if (PARAGRAPH_SCORE_TAGS.test(tagName)) return scoreParagraph($node);
  if (hasTag('div')) return 5;
  if (CHILD_CONTENT_TAGS.test(tagName)) return 3;
  if (BAD_TAGS.test(tagName)) return -3;

  return hasTag('th') ? -5 : 0;
}
// Name of the attribute the score is stored under on the node.
const SCORE_ATTRIBUTE = 'score';

/**
 * Store `score` on the node by writing it to the node's `score` attribute.
 *
 * @param {object} $node - Wrapped node exposing an `attr(name, value)` method.
 * @param {*} $ - Not used by this function; kept in the signature for callers.
 * @param {number} score - Value to store.
 * @returns {object} The same `$node` that was passed in.
 */
export default function setScore($node, $, score) {
  $node.attr(SCORE_ATTRIBUTE, score);

  return $node;
}
// Generic dek extraction is intentionally a stub: there is currently only
// one selector for deks, so until a more robust generic strategy exists
// this extractor always reports that no dek was found.
const GenericDekExtractor = {
  /**
   * @returns {null} Always null; any arguments passed are ignored.
   */
  extract: () => null,
};

export default GenericDekExtractor;
/**
 * Normalize excerpt text: collapse every run of whitespace into a single
 * space, trim the ends, then shorten the result with `ellipsize`.
 *
 * @param {string} content - Raw excerpt text.
 * @param {*} $ - Not used by this function; kept in the signature for callers.
 * @param {number} [maxLength=200] - Length limit handed to `ellipsize`.
 * @returns {string} The cleaned excerpt as returned by `ellipsize`.
 */
export function clean(content, $, maxLength = 200) {
  const collapsed = content.replace(/[\s\n]+/g, ' ').trim();

  return ellipsize(collapsed, maxLength, { ellipse: '…' });
}

const GenericExcerptExtractor = {
  /**
   * Produce an excerpt, preferring the page's description meta tags and
   * falling back to the start of the already-extracted content.
   *
   * @param {object} args
   * @param {Function} args.$ - DOM query function for the page.
   * @param {string} args.content - Extracted article content (markup).
   * @param {Array} args.metaCache - Passed through to `extractFromMeta`.
   * @returns {string} The cleaned excerpt.
   */
  extract({ $, content, metaCache }) {
    const metaExcerpt = extractFromMeta($, EXCERPT_META_SELECTORS, metaCache);
    if (metaExcerpt) {
      return clean(stripTags(metaExcerpt, $));
    }

    // No usable meta description: excerpt from the extracted content.
    // Only the first maxLength * 5 characters of markup are parsed.
    const maxLength = 200;
    const leadingMarkup = content.slice(0, maxLength * 5);
    const leadingText = $(leadingMarkup).text();

    return clean(leadingText, $, maxLength);
  },
};

export default GenericExcerptExtractor;
One two three.
11 |Four five six.
12 |Seven eight nine.
13 |Ten eleven twelve.
// Filter predicate: true when the node is a comment node.
const isComment = (index, node) => node.type === 'comment';

/**
 * Remove comment nodes that are children of any element under the root.
 *
 * @param {Function} $ - Document query function exposing `root()`.
 * @returns {Function} The same `$`, after removal.
 */
function cleanComments($) {
  const allElements = $.root().find('*');
  const commentNodes = allElements.contents().filter(isComment);
  commentNodes.remove();

  return $;
}

/**
 * Strip unwanted markup from the document: first every element matched by
 * TAGS_TO_REMOVE, then comment nodes (see cleanComments).
 *
 * @param {Function} $ - Document query function.
 * @returns {Function} The same `$`, after cleaning.
 */
export default function clean($) {
  $(TAGS_TO_REMOVE).remove();

  return cleanComments($);
}
-------------------------------------------------------------------------------- 1 | import assert from 'assert'; 2 | import cheerio from 'cheerio'; 3 | 4 | import clean from './clean'; 5 | 6 | describe('clean($)', () => { 7 | it('removes script elements', () => { 8 | const html = ""; 9 | const $ = cheerio.load(html); 10 | 11 | assert.equal(clean($).html(), ''); 12 | }); 13 | 14 | it('removes style elements', () => { 15 | const html = ''; 16 | const $ = cheerio.load(html); 17 | 18 | assert.equal(clean($).html(), ''); 19 | }); 20 | 21 | it('removes comments', () => { 22 | const html = 'What do you think?
12 |What do you think?
21 |What do you think?
30 |What do you think?
39 |Some text!
What do you think?
11 |What do you think?
26 |What happens to spaces?27 |
What do you think?
What happens to spaces?