├── .agignore ├── .babelrc ├── .circleci └── config.yml ├── .eslintignore ├── .eslintrc ├── .gitattributes ├── .github ├── ISSUE_TEMPLATE.md └── PULL_REQUEST_TEMPLATE.md ├── .gitignore ├── .nvmrc ├── .prettierignore ├── .prettierrc ├── .remarkrc ├── CHANGELOG.md ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE-APACHE ├── LICENSE-MIT ├── README.md ├── RELEASE.md ├── assets └── parser-basic-usage.gif ├── cli.js ├── dist ├── generate-custom-parser.js ├── generate-custom-parser.js.map ├── mercury.esm.js ├── mercury.esm.js.map ├── mercury.js ├── mercury.js.map ├── mercury.web.js └── mercury.web.js.map ├── fixtures ├── 247sports.com.html ├── abcnews.go.com.html ├── arstechnica.com.html ├── biorxiv.org.html ├── blisterreview.com.html ├── bookwalker.jp.html ├── buzzap.jp.html ├── clinicaltrials.gov.html ├── deadline.com.html ├── deadspin.com--video.html ├── deadspin.com.html ├── epaper.zeit.de.html ├── fandom.wikia.com.html ├── fortune.com.html ├── forward.com.html ├── genius.com.html ├── getnews.jp.html ├── github.com.html ├── gothamist.com.html ├── hellogiggles.com.html ├── ici.radio-canada.ca.html ├── japan.cnet.com.html ├── japan.zdnet.com.html ├── jvndb.jvn.jp.html ├── ma.ttias.be.html ├── mashable.com.html ├── medium.com--another.html ├── medium.com.html ├── money.cnn.com.html ├── newrepublic.com--minutes.html ├── newrepublic.com.html ├── news.mynavi.jp.html ├── news.nationalgeographic.com.html ├── nock │ ├── fetch-resource-test.js │ ├── mercury-test.js │ └── resource-test.js ├── nymag.com.html ├── obamawhitehouse.archives.gov--blog.html ├── obamawhitehouse.archives.gov--empty.html ├── obamawhitehouse.archives.gov--speeches.html ├── obamawhitehouse.archives.gov.html ├── observer.com.html ├── otrs.com.html ├── pagesix.com.html ├── pastebin.com.html ├── people.com.html ├── phpspot.org.html ├── pitchfork.com.html ├── postlight.com.html ├── qz.com.html ├── sandiegouniontribune.com.html ├── scan.netsecurity.ne.jp.html ├── sciencefly.com.html ├── sect.iij.ad.jp.html 
├── takagi-hiromitsu.jp.html ├── techlog.iij.ad.jp.html ├── thefederalistpapers.org.html ├── thoughtcatalog.com.html ├── timesofindia.indiatimes.com.html ├── twitter.com.html ├── uproxx.com.html ├── weekly.ascii.jp.html ├── wired.jp.html ├── www.abendblatt.de.html ├── www.al.com.html ├── www.americanow.com.html ├── www.androidcentral.com.html ├── www.aol.com.html ├── www.apartmenttherapy.com.html ├── www.asahi.com.html ├── www.bloomberg.com--graphics.html ├── www.bloomberg.com--news.html ├── www.bloomberg.com.html ├── www.broadwayworld.com.html ├── www.bustle.com.html ├── www.buzzfeed.com--splash.html ├── www.buzzfeed.com.html ├── www.cbc.ca.html ├── www.cbssports.com.html ├── www.chicagotribune.com.html ├── www.cnbc.com--redesign.html ├── www.cnbc.com.html ├── www.cnet.com.html ├── www.cnn.com.html ├── www.dmagazine.com.html ├── www.elecom.co.jp.html ├── www.engadget.com.html ├── www.eonline.com.html ├── www.fastcompany.com.html ├── www.fool.com.html ├── www.fortinet.com.html ├── www.gizmodo.jp.html ├── www.gruene.de.html ├── www.huffingtonpost.com.html ├── www.infoq.com.html ├── www.inquisitr.com.html ├── www.investmentexecutive.com.html ├── www.ipa.go.jp.html ├── www.itmedia.co.jp.html ├── www.jnsa.org.html ├── www.ladbible.com.html ├── www.latimes.com--old.html ├── www.latimes.com.html ├── www.lemonde.fr.html ├── www.lifehacker.jp.html ├── www.linkedin.com.html ├── www.littlethings.com.html ├── www.macrumors.com.html ├── www.mentalfloss.com.html ├── www.miamiherald.com.html ├── www.moongift.jp.html ├── www.msn.com.html ├── www.msnbc.com.html ├── www.nationalgeographic.com.html ├── www.nbcnews.com.html ├── www.ndtv.com.html ├── www.newyorker.com--magazine.html ├── www.newyorker.com--multiple-authors.html ├── www.newyorker.com.html ├── www.npr.org.html ├── www.nydailynews.com.html ├── www.nytimes.com--feature.html ├── www.nytimes.com--recent.html ├── www.nytimes.com.html ├── www.opposingviews.com.html ├── www.oreilly.co.jp.html ├── www.ossnews.jp.html ├── 
www.phoronix.com.html ├── www.politico.com--test-case-2.html ├── www.politico.com--test-case-3.html ├── www.politico.com.html ├── www.popsugar.com.html ├── www.prospectmagazine.co.uk.html ├── www.publickey1.jp.html ├── www.qdaily.com.html ├── www.rawstory.com.html ├── www.rbbtoday.com.html ├── www.recode.net.html ├── www.reddit.com--embedded.html ├── www.reddit.com--external-image.html ├── www.reddit.com--external-link.html ├── www.reddit.com--image.html ├── www.reddit.com--title-only.html ├── www.reddit.com--video.html ├── www.reddit.com.html ├── www.refinery29.com.html ├── www.reuters.com.html ├── www.rollingstone.com.html ├── www.sanwa.co.jp.html ├── www.sbnation.com.html ├── www.si.com.html ├── www.slate.com.html ├── www.spektrum.de.html ├── www.theatlantic.com.html ├── www.theguardian.com.html ├── www.thepennyhoarder.com.html ├── www.thepoliticalinsider.com.html ├── www.theverge.com--feature.html ├── www.theverge.com.html ├── www.tmz.com.html ├── www.today.com.html ├── www.usmagazine.com.html ├── www.vox.com.html ├── www.vulture.com--content-test.html ├── www.vulture.com.html ├── www.washingtonpost.com.html ├── www.westernjournalism.com.html ├── www.wired.com--content-test.html ├── www.wired.com--other.html ├── www.wired.com.html ├── www.yahoo.com.html ├── www.yomiuri.co.jp.html └── www.youtube.com.html ├── karma.conf.js ├── package.json ├── preview ├── rollup.config.esm.js ├── rollup.config.js ├── rollup.config.web.js ├── score-move ├── scripts ├── check-build.test.js ├── comment-for-pr.js ├── find-and-replace.sh ├── generate-custom-parser.js ├── generate-fixture-preview.js ├── karma.conf.js ├── pr-parser-preview.sh ├── proxy-browser-test.js ├── rollup.config.js ├── templates │ ├── custom-extractor-test.js │ ├── custom-extractor.js │ ├── index.js │ └── insert-values.js ├── update-fixtures.js └── write-test-report.js ├── src ├── cleaners │ ├── author.js │ ├── author.test.js │ ├── constants.js │ ├── content.js │ ├── content.test.js │ ├── date-published.js │ ├── 
date-published.test.js │ ├── dek.js │ ├── dek.test.js │ ├── index.js │ ├── lead-image-url.js │ ├── lead-image-url.test.js │ ├── resolve-split-title.js │ ├── resolve-split-title.test.js │ ├── title.js │ └── title.test.js ├── extractors │ ├── add-extractor.js │ ├── add-extractor.test.js │ ├── all.js │ ├── collect-all-pages.js │ ├── constants.js │ ├── custom │ │ ├── 247sports.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── README.md │ │ ├── abcnews.go.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── arstechnica.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── biorxiv.org │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── blisterreview.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── blogspot.com │ │ │ └── index.js │ │ ├── bookwalker.jp │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── buzzap.jp │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── clinicaltrials.gov │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── deadline.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── deadspin.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── epaper.zeit.de │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── fandom.wikia.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── fortune.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── forward.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── genius.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── getnews.jp │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── github.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── gothamist.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── hellogiggles.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── ici.radio-canada.ca │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── index.js │ │ ├── japan.cnet.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── japan.zdnet.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── jvndb.jvn.jp │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── ma.ttias.be │ │ │ ├── index.js │ │ │ └── index.test.js │ 
│ ├── mashable.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── medium.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── money.cnn.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── newrepublic.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── news.mynavi.jp │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── news.nationalgeographic.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── nymag.com │ │ │ ├── fixtures │ │ │ │ └── test.html │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── obamawhitehouse.archives.gov │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── observer.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── otrs.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── pagesix.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── pastebin.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── people.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── phpspot.org │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── pitchfork.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── postlight.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── qz.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── scan.netsecurity.ne.jp │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── sciencefly.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── sect.iij.ad.jp │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── takagi-hiromitsu.jp │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── techlog.iij.ad.jp │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── thefederalistpapers.org │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── thoughtcatalog.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── timesofindia.indiatimes.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── twitter.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── uproxx.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── weekly.ascii.jp │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── wikipedia.org │ │ │ └── index.js │ │ ├── wired.jp │ │ │ ├── index.js │ │ │ └── 
index.test.js │ │ ├── www.abendblatt.de │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.al.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.americanow.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.androidcentral.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.aol.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.apartmenttherapy.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.asahi.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.bloomberg.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.broadwayworld.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.bustle.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.buzzfeed.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.cbc.ca │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.cbssports.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.chicagotribune.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.cnbc.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.cnet.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.cnn.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.dmagazine.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.elecom.co.jp │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.engadget.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.eonline.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.fastcompany.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.fool.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.fortinet.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.gizmodo.jp │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.gruene.de │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.huffingtonpost.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.infoq.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.inquisitr.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── 
www.investmentexecutive.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.ipa.go.jp │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.itmedia.co.jp │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.jnsa.org │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.ladbible.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.latimes.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.lemonde.fr │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.lifehacker.jp │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.linkedin.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.littlethings.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.macrumors.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.mentalfloss.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.miamiherald.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.moongift.jp │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.msn.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.msnbc.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.nationalgeographic.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.nbcnews.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.ndtv.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.newyorker.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.npr.org │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.nydailynews.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.nytimes.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.opposingviews.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.oreilly.co.jp │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.ossnews.jp │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.phoronix.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.politico.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.popsugar.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── 
www.prospectmagazine.co.uk │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.publickey1.jp │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.qdaily.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.rawstory.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.rbbtoday.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.recode.net │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.reddit.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.refinery29.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.reuters.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.rollingstone.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.sanwa.co.jp │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.sbnation.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.si.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.slate.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.spektrum.de │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.theatlantic.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.theguardian.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.thepennyhoarder.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.thepoliticalinsider.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.theverge.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.tmz.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.today.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.usmagazine.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.vox.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.washingtonpost.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.westernjournalism.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.wired.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.yahoo.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.yomiuri.co.jp │ │ │ ├── index.js │ │ │ └── index.test.js │ │ └── 
www.youtube.com │ │ │ ├── index.js │ │ │ └── index.test.js │ ├── detect-by-html.js │ ├── detect-by-html.test.js │ ├── fixtures │ │ └── postlight.com │ │ │ └── index.js │ ├── generic │ │ ├── author │ │ │ ├── constants.js │ │ │ ├── extractor.js │ │ │ └── extractor.test.js │ │ ├── content │ │ │ ├── extract-best-node.js │ │ │ ├── extract-best-node.test.js │ │ │ ├── extractor.js │ │ │ ├── extractor.test.js │ │ │ └── scoring │ │ │ │ ├── add-score.js │ │ │ │ ├── add-score.test.js │ │ │ │ ├── add-to-parent.js │ │ │ │ ├── add-to-parent.test.js │ │ │ │ ├── constants.js │ │ │ │ ├── find-top-candidate.js │ │ │ │ ├── find-top-candidate.test.js │ │ │ │ ├── get-or-init-score.js │ │ │ │ ├── get-or-init-score.test.js │ │ │ │ ├── get-score.js │ │ │ │ ├── get-score.test.js │ │ │ │ ├── get-weight.js │ │ │ │ ├── get-weight.test.js │ │ │ │ ├── index.js │ │ │ │ ├── merge-siblings.js │ │ │ │ ├── score-commas.js │ │ │ │ ├── score-commas.test.js │ │ │ │ ├── score-content.js │ │ │ │ ├── score-content.test.js │ │ │ │ ├── score-length.js │ │ │ │ ├── score-length.test.js │ │ │ │ ├── score-node.js │ │ │ │ ├── score-node.test.js │ │ │ │ ├── score-paragraph.js │ │ │ │ ├── score-paragraph.test.js │ │ │ │ ├── set-score.js │ │ │ │ └── set-score.test.js │ │ ├── date-published │ │ │ ├── constants.js │ │ │ ├── extractor.js │ │ │ └── extractor.test.js │ │ ├── dek │ │ │ ├── extractor.js │ │ │ └── extractor.test.js │ │ ├── excerpt │ │ │ ├── constants.js │ │ │ ├── extractor.js │ │ │ └── extractor.test.js │ │ ├── index.js │ │ ├── index.test.js │ │ ├── lead-image-url │ │ │ ├── constants.js │ │ │ ├── extractor.js │ │ │ ├── extractor.test.js │ │ │ ├── score-image.js │ │ │ └── score-image.test.js │ │ ├── next-page-url │ │ │ ├── extractor.js │ │ │ ├── extractor.test.js │ │ │ └── scoring │ │ │ │ ├── constants.js │ │ │ │ ├── score-links.js │ │ │ │ ├── score-links.test.js │ │ │ │ └── utils │ │ │ │ ├── index.js │ │ │ │ ├── score-base-url.js │ │ │ │ ├── score-base-url.test.js │ │ │ │ ├── score-by-parents.js │ │ │ │ 
├── score-by-parents.test.js │ │ │ │ ├── score-cap-links.js │ │ │ │ ├── score-cap-links.test.js │ │ │ │ ├── score-extraneous-links.js │ │ │ │ ├── score-extraneous-links.test.js │ │ │ │ ├── score-link-text.js │ │ │ │ ├── score-link-text.test.js │ │ │ │ ├── score-next-link-text.js │ │ │ │ ├── score-next-link-text.test.js │ │ │ │ ├── score-page-in-link.js │ │ │ │ ├── score-page-in-link.test.js │ │ │ │ ├── score-prev-link.js │ │ │ │ ├── score-prev-link.test.js │ │ │ │ ├── score-similarity.js │ │ │ │ ├── score-similarity.test.js │ │ │ │ ├── should-score.js │ │ │ │ └── should-score.test.js │ │ ├── title │ │ │ ├── constants.js │ │ │ ├── extractor.js │ │ │ └── extractor.test.js │ │ ├── url │ │ │ ├── constants.js │ │ │ ├── extractor.js │ │ │ └── extractor.test.js │ │ └── word-count │ │ │ ├── extractor.js │ │ │ └── extractor.test.js │ ├── get-extractor.js │ ├── get-extractor.test.js │ ├── index.js │ ├── root-extractor.js │ └── root-extractor.test.js ├── mercury.js ├── mercury.test.js ├── resource │ ├── index.js │ ├── index.test.js │ └── utils │ │ ├── constants.js │ │ ├── dom │ │ ├── clean.js │ │ ├── clean.test.js │ │ ├── constants.js │ │ ├── convert-lazy-loaded-images.js │ │ ├── convert-lazy-loaded-images.test.js │ │ ├── index.js │ │ ├── normalize-meta-tags.js │ │ └── normalize-meta-tags.test.js │ │ ├── fetch-resource.js │ │ ├── fetch-resource.test.js │ │ └── index.js ├── shims │ ├── cheerio-query.js │ └── iconv-lite.js ├── test-helpers.js └── utils │ ├── dom │ ├── brs-to-ps.js │ ├── brs-to-ps.test.js │ ├── clean-attributes.js │ ├── clean-attributes.test.js │ ├── clean-h-ones.js │ ├── clean-h-ones.test.js │ ├── clean-headers.js │ ├── clean-headers.test.js │ ├── clean-images.js │ ├── clean-images.test.js │ ├── clean-tags.js │ ├── clean-tags.test.js │ ├── constants.js │ ├── convert-node-to.js │ ├── convert-node-to.test.js │ ├── convert-to-paragraphs.js │ ├── convert-to-paragraphs.test.js │ ├── extract-from-meta.js │ ├── extract-from-meta.test.js │ ├── extract-from-selectors.js 
│ ├── extract-from-selectors.test.js │ ├── get-attrs.js │ ├── get-attrs.test.js │ ├── index.js │ ├── is-wordpress.js │ ├── is-wordpress.test.js │ ├── link-density.js │ ├── link-density.test.js │ ├── make-links-absolute.js │ ├── make-links-absolute.test.js │ ├── mark-to-keep.js │ ├── mark-to-keep.test.js │ ├── node-is-sufficient.js │ ├── node-is-sufficient.test.js │ ├── paragraphize.js │ ├── paragraphize.test.js │ ├── remove-empty.js │ ├── remove-empty.test.js │ ├── rewrite-top-level.js │ ├── rewrite-top-level.test.js │ ├── set-attr.js │ ├── set-attr.test.js │ ├── set-attrs.js │ ├── set-attrs.test.js │ ├── strip-junk-tags.js │ ├── strip-junk-tags.test.js │ ├── strip-tags.js │ ├── strip-tags.test.js │ ├── strip-unlikely-candidates.js │ ├── strip-unlikely-candidates.test.js │ ├── within-comment.js │ └── within-comment.test.js │ ├── index.js │ ├── merge-supported-domains.js │ ├── merge-supported-domains.test.js │ ├── range.js │ ├── text │ ├── article-base-url.js │ ├── article-base-url.test.js │ ├── constants.js │ ├── excerpt-content.js │ ├── excerpt.test.js │ ├── extract-from-url.js │ ├── extract-from-url.test.js │ ├── get-encoding.js │ ├── get-encoding.test.js │ ├── has-sentence-end.js │ ├── index.js │ ├── normalize-spaces.js │ ├── normalize-spaces.test.js │ ├── page-num-from-url.js │ ├── page-num-from-url.test.js │ ├── remove-anchor.js │ └── remove-anchor.test.js │ ├── validate-url.js │ └── validate-url.test.js └── yarn.lock /.agignore: -------------------------------------------------------------------------------- 1 | dist 2 | -------------------------------------------------------------------------------- /.babelrc: -------------------------------------------------------------------------------- 1 | { 2 | "presets": ["@babel/preset-env"], 3 | "plugins": [ 4 | [ 5 | "module-resolver", 6 | { 7 | "root": ["./src"], 8 | "alias": { 9 | "./utils": "utils", 10 | "./cleaners": "cleaners", 11 | "./resource": "resource", 12 | "./extractors": "extractors", 13 | 
"./test-helpers.js": "test-helpers", 14 | "./mercury.js": "mercury" 15 | } 16 | } 17 | ] 18 | ], 19 | "env": { 20 | "development": { 21 | "plugins": [ 22 | [ 23 | "@babel/plugin-transform-runtime", 24 | { 25 | "corejs": 2, 26 | "regenerator": true 27 | } 28 | ] 29 | ] 30 | } 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /.eslintignore: -------------------------------------------------------------------------------- 1 | **/fixtures/* 2 | dist/* 3 | coverage/* 4 | karma.conf.js 5 | -------------------------------------------------------------------------------- /.eslintrc: -------------------------------------------------------------------------------- 1 | { 2 | "parser": "babel-eslint", 3 | "extends": ["airbnb", "prettier"], 4 | "plugins": ["babel"], 5 | "globals": { 6 | "describe": true, 7 | "it": true, 8 | "fit": true, 9 | "jasmine": true, 10 | "beforeEach": true, 11 | "beforeAll": true, 12 | "afterAll": true 13 | }, 14 | "rules": { 15 | "no-param-reassign": 0, 16 | "no-control-regex": 0, 17 | "import/prefer-default-export": 0, 18 | "generator-star-spacing": 0, 19 | "babel/generator-star-spacing": 0, 20 | "func-names": 0, 21 | "no-confusing-arrow": 0, 22 | "camelcase": 0, 23 | "no-multiple-empty-lines": [ 24 | "error", 25 | { "max": 1, "maxEOF": 0, "maxBOF": 0 } 26 | ], 27 | "import/no-unresolved": false, 28 | "import/no-extraneous-dependencies": [ 29 | "error", 30 | { 31 | "devDependencies": [ 32 | "**/*.test.js", 33 | "scripts/proxy-browser-test.js", 34 | "rollup.config*js" 35 | ] 36 | } 37 | ] 38 | }, 39 | "settings": { 40 | "import/resolver": { 41 | "babel-module": {} 42 | } 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # All html files are fixtures, so marking as vendored 2 | # so Linguist (https://github.com/github/linguist) 3 | # ignores them for the 
purpose of language detection 4 | *.html linguist-vendored 5 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | 9 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | build 3 | npm-debug.log 4 | TODO.md 5 | read 6 | preview.html 7 | preview.json 8 | coverage 9 | dist/mercury_test.js 10 | dist/mercury_test.js.map 11 | dist/mercury_test.web.js 12 | tmp/artifacts 13 | test-output.json 14 | .tool-versions 15 | .yarnrc.yml 16 | **/.DS_Store 17 | -------------------------------------------------------------------------------- /.nvmrc: -------------------------------------------------------------------------------- 1 | 12.8.1 2 | -------------------------------------------------------------------------------- /.prettierignore: -------------------------------------------------------------------------------- 1 | dist 2 | -------------------------------------------------------------------------------- /.prettierrc: -------------------------------------------------------------------------------- 1 | { 2 | "trailingComma": "es5", 3 | "semi": true, 4 | "singleQuote": true, 5 | "printWidth": 80, 6 | "tabWidth": 2, 7 | "useTabs": false, 8 | "bracketSpacing": true, 9 | "arrowParens": "avoid" 10 | } 11 | -------------------------------------------------------------------------------- /.remarkrc: -------------------------------------------------------------------------------- 1 | { 2 | "plugins": [ 3 | "remark-preset-lint-recommended", 4 | ["remark-lint-list-item-indent", false] 5 | ] 6 | } 7 | -------------------------------------------------------------------------------- /LICENSE-MIT: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | 
Copyright (c) 2019 Postlight 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /RELEASE.md: -------------------------------------------------------------------------------- 1 | # How to cut a new release 2 | 3 | 1. Create a release branch. 4 | 5 | ```bash 6 | git checkout -b release-1.x.x # (where 1.x.x reflects the release) 7 | ``` 8 | 9 | 2. Update package.json with the version number 10 | 3. Build the release 11 | 12 | ```bash 13 | yarn release 14 | ``` 15 | 16 | 4. Update the changelog 17 | 18 | ```bash 19 | # Copy the output of the command below and paste it into CHANGELOG.md 20 | # following the conventions of that file 21 | yarn changelog-maker postlight parser 22 | ``` 23 | 24 | 5. Submit a PR 25 | 6. Merge once the PR's tests pass 26 | 7. 
[Create a release](https://github.com/postlight/parser/releases), linking to this release's entry in the changelog. (See other releases for context.) 27 | -------------------------------------------------------------------------------- /assets/parser-basic-usage.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/postlight/parser/e8ba7ece291efa4d915d50dd4deeec17d54359f2/assets/parser-basic-usage.gif -------------------------------------------------------------------------------- /karma.conf.js: -------------------------------------------------------------------------------- 1 | module.exports = function (config) { 2 | config.set({ 3 | 4 | basePath: '', 5 | 6 | frameworks: ['jasmine', 'browserify'], 7 | files: [ 8 | { pattern: 'src/**/*.test.js', included: true }, 9 | ], 10 | 11 | exclude: [], 12 | 13 | preprocessors: { 14 | 'src/**/*.js': ['browserify'], 15 | }, 16 | 17 | browserify: { 18 | debug: true, 19 | transform: ['babelify', 'brfs'], 20 | }, 21 | 22 | reporters: ['progress'], 23 | port: 9876, 24 | colors: true, 25 | logLevel: config.LOG_INFO, 26 | autoWatch: false, 27 | browsers: ['Chrome'], 28 | singleRun: true, 29 | concurrency: Infinity, 30 | }); 31 | }; 32 | -------------------------------------------------------------------------------- /preview: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | var fs = require('fs') 3 | var execSync = require('child_process').execSync 4 | 5 | var optRe = /^--/ 6 | var args = process.argv.slice(2).reduce((acc, arg) => { 7 | if (optRe.test(arg)) { 8 | acc.opts.push(arg) 9 | } else { 10 | acc.urls.push(arg) 11 | } 12 | 13 | return acc 14 | }, { opts: [], urls: [] }) 15 | 16 | var urls = args.urls 17 | 18 | if (!args.opts.find(arg => arg === '--no-rebuild')) { 19 | console.log('Rebuilding Mercury') 20 | execSync('MERCURY_TEST_BUILD=true npm run build') 21 | } 22 | 23 | var Mercury = 
require('./dist/mercury_test') 24 | 25 | console.log(`Fetching link(s)`) 26 | 27 | urls.map(url => { 28 | Mercury.parse(url, { fallback: false }).then(function(result) { 29 | var htmlFile = './preview.html' 30 | var jsonFile = './preview.json' 31 | 32 | var html = `

${result.title}

${result.content}` 33 | 34 | fs.writeFileSync(htmlFile, html) 35 | fs.writeFileSync(jsonFile, JSON.stringify(result)) 36 | execSync(`open ${jsonFile}`) 37 | execSync(`open ${htmlFile}`) 38 | }) 39 | }) 40 | -------------------------------------------------------------------------------- /rollup.config.esm.js: -------------------------------------------------------------------------------- 1 | import nodeResolve from 'rollup-plugin-node-resolve'; 2 | import globals from 'rollup-plugin-node-globals'; 3 | import { terser } from 'rollup-plugin-terser'; // eslint-disable-line import/extensions 4 | import babel from 'rollup-plugin-babel'; 5 | import commonjs from 'rollup-plugin-commonjs'; 6 | 7 | export default { 8 | input: 'src/mercury.js', 9 | plugins: [ 10 | babel({ 11 | runtimeHelpers: true, 12 | exclude: './node_modules#<{(|*', 13 | }), 14 | commonjs({ 15 | ignoreGlobal: true, 16 | }), 17 | globals(), 18 | nodeResolve({ 19 | browser: true, 20 | preferBuiltins: false, 21 | }), 22 | terser(), 23 | ], 24 | treeshake: true, 25 | output: { 26 | file: process.env.MERCURY_TEST_BUILD 27 | ? 'dist/mercury_test.esm.js' 28 | : 'dist/mercury.esm.js', 29 | format: 'es', 30 | sourcemap: true, 31 | }, 32 | }; 33 | -------------------------------------------------------------------------------- /rollup.config.js: -------------------------------------------------------------------------------- 1 | /* eslint-disable import/no-extraneous-dependencies */ 2 | import babel from 'rollup-plugin-babel'; 3 | import commonjs from 'rollup-plugin-commonjs'; 4 | 5 | export default { 6 | input: 'src/mercury.js', 7 | plugins: [ 8 | commonjs(), 9 | babel({ 10 | externalHelpers: false, 11 | runtimeHelpers: true, 12 | }), 13 | ], 14 | treeshake: true, 15 | output: { 16 | file: process.env.MERCURY_TEST_BUILD 17 | ? 
'dist/mercury_test.js' 18 | : 'dist/mercury.js', 19 | format: 'cjs', 20 | sourcemap: true, 21 | }, 22 | }; 23 | -------------------------------------------------------------------------------- /rollup.config.web.js: -------------------------------------------------------------------------------- 1 | import nodeResolve from 'rollup-plugin-node-resolve'; 2 | import globals from 'rollup-plugin-node-globals'; 3 | import { uglify } from 'rollup-plugin-uglify'; // eslint-disable-line import/extensions 4 | import babel from 'rollup-plugin-babel'; 5 | import commonjs from 'rollup-plugin-commonjs'; 6 | 7 | export default { 8 | input: 'src/mercury.js', 9 | plugins: [ 10 | babel({ 11 | runtimeHelpers: true, 12 | exclude: './node_modules#<{(|*', 13 | }), 14 | commonjs({ 15 | ignoreGlobal: true, 16 | }), 17 | globals(), 18 | nodeResolve({ 19 | browser: true, 20 | preferBuiltins: false, 21 | }), 22 | uglify(), 23 | ], 24 | treeshake: true, 25 | output: { 26 | file: process.env.MERCURY_TEST_BUILD 27 | ? 
'dist/mercury_test.web.js' 28 | : 'dist/mercury.web.js', 29 | format: 'iife', 30 | name: 'Mercury', 31 | sourcemap: true, 32 | }, 33 | }; 34 | -------------------------------------------------------------------------------- /score-move: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/fish 2 | 3 | set file $argv[1] 4 | set function $argv[2] 5 | 6 | touch src/extractors/generic/next-page-url/scoring/utils/index.js 7 | touch src/extractors/generic/next-page-url/scoring/utils/$file.js 8 | touch src/extractors/generic/next-page-url/scoring/utils/$file.test.js 9 | 10 | echo "import assert from 'assert';" > src/extractors/generic/next-page-url/scoring/utils/$file.test.js 11 | echo "" >> src/extractors/generic/next-page-url/scoring/utils/$file.test.js 12 | echo "import $function from './$file';" >> src/extractors/generic/next-page-url/scoring/utils/$file.test.js 13 | echo "" >> src/extractors/generic/next-page-url/scoring/utils/$file.test.js 14 | echo "export { default as $function } from './$file'" >> src/extractors/generic/next-page-url/scoring/utils/index.js 15 | 16 | echo "Now make it a default export" 17 | echo "Move it to its file" 18 | echo "Move its tests to its test file" 19 | echo "import in score-links" 20 | echo "Test it." 
21 | 22 | -------------------------------------------------------------------------------- /scripts/find-and-replace.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo $3 4 | 5 | find $3 -exec sed -i '' "s%$1%$2%g" '{}' \; -------------------------------------------------------------------------------- /scripts/pr-parser-preview.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | changes=( `git diff origin/master --name-only` ) 4 | 5 | for fixture in "${changes[@]}" 6 | do 7 | # If one of the changed files is a fixture, hold onto it 8 | if [[ $fixture == "fixtures/"* ]]; then 9 | fixtures=$fixture,$fixtures 10 | fi 11 | done 12 | 13 | if [[ $fixtures ]]; then 14 | # Take a screenshot of the fixture 15 | yarn phantomjs scripts/generate-fixture-preview.js $fixtures 16 | 17 | screenshots=( `find tmp/artifacts -type f | grep ".html.png"` ) 18 | 19 | for screenshot in "${screenshots[@]}" 20 | do 21 | # Create a comment with a link to the screenshot 22 | # and json output for the fixture 23 | node scripts/comment-for-pr.js $screenshot 24 | done 25 | else 26 | echo "No fixtures added in this PR, so no preview needed" 27 | node scripts/write-test-report.js 28 | fi 29 | -------------------------------------------------------------------------------- /scripts/rollup.config.js: -------------------------------------------------------------------------------- 1 | /* eslint-disable import/no-extraneous-dependencies */ 2 | import babel from 'rollup-plugin-babel'; 3 | import commonjs from 'rollup-plugin-commonjs'; 4 | 5 | export default { 6 | input: 'scripts/generate-custom-parser.js', 7 | plugins: [ 8 | commonjs(), 9 | babel({ 10 | externalHelpers: false, 11 | runtimeHelpers: true, 12 | }), 13 | ], 14 | treeshake: true, 15 | output: { 16 | file: 'dist/generate-custom-parser.js', 17 | format: 'cjs', 18 | sourcemap: true, 19 | }, 20 | }; 21 | 
-------------------------------------------------------------------------------- /scripts/templates/index.js: -------------------------------------------------------------------------------- 1 | import insertValues from './insert-values'; 2 | 3 | const bodyPattern = /^\n([\s\S]+)\s{2}$/gm; 4 | const trailingWhitespace = /\s+$/; 5 | 6 | export default function template(strings, ...values) { 7 | const compiled = insertValues(strings, ...values); 8 | let [body] = compiled.match(bodyPattern) || []; 9 | let indentLevel = /^\s{0,4}(.+)$/g; 10 | 11 | if (!body) { 12 | body = compiled; 13 | indentLevel = /^\s{0,2}(.+)$/g; 14 | } 15 | 16 | return body 17 | .split('\n') 18 | .slice(1) 19 | .map(line => { 20 | line = line.replace(indentLevel, '$1'); 21 | 22 | if (trailingWhitespace.test(line)) { 23 | line = line.replace(trailingWhitespace, ''); 24 | } 25 | 26 | return line; 27 | }) 28 | .join('\n'); 29 | } 30 | -------------------------------------------------------------------------------- /scripts/templates/insert-values.js: -------------------------------------------------------------------------------- 1 | export default function insertValues(strings, ...values) { 2 | if (values.length) { 3 | return strings.reduce((result, part, idx) => { 4 | let value = values[idx]; 5 | 6 | if (value && typeof value.toString === 'function') { 7 | value = value.toString(); 8 | } else { 9 | value = ''; 10 | } 11 | 12 | return result + part + value; 13 | }, ''); 14 | } 15 | 16 | return strings.join(''); 17 | } 18 | -------------------------------------------------------------------------------- /scripts/write-test-report.js: -------------------------------------------------------------------------------- 1 | const path = require('path'); 2 | const fs = require('fs'); 3 | 4 | const { getReport } = require('@postlight/ci-failed-test-reporter'); 5 | 6 | const report = getReport(path.join(__dirname, '../', '/test-output.json')); 7 | if (report) { 8 | const commentPath = 
'tmp/artifacts/comment.json'; 9 | fs.mkdirSync('tmp'); 10 | fs.mkdirSync('tmp/artifacts'); 11 | fs.writeFileSync( 12 | commentPath, 13 | JSON.stringify({ 14 | body: report, 15 | issue: process.env.CIRCLE_PULL_REQUEST, 16 | }) 17 | ); 18 | } 19 | -------------------------------------------------------------------------------- /src/cleaners/author.js: -------------------------------------------------------------------------------- 1 | import { normalizeSpaces } from 'utils/text'; 2 | import { CLEAN_AUTHOR_RE } from './constants'; 3 | 4 | // Take an author string (like 'By David Smith ') and clean it to 5 | // just the name(s): 'David Smith'. 6 | export default function cleanAuthor(author) { 7 | return normalizeSpaces(author.replace(CLEAN_AUTHOR_RE, '$2').trim()); 8 | } 9 | -------------------------------------------------------------------------------- /src/cleaners/author.test.js: -------------------------------------------------------------------------------- 1 | import assert from 'assert'; 2 | 3 | import cleanAuthor from './author'; 4 | 5 | describe('cleanAuthor(author)', () => { 6 | it('removes the By from an author string', () => { 7 | const author = cleanAuthor('By Bob Dylan'); 8 | 9 | assert.equal(author, 'Bob Dylan'); 10 | }); 11 | 12 | it('trims trailing whitespace and line breaks', () => { 13 | const text = ` 14 | written by 15 | Bob Dylan 16 | `; 17 | const author = cleanAuthor(text); 18 | 19 | assert.equal(author, 'Bob Dylan'); 20 | }); 21 | }); 22 | -------------------------------------------------------------------------------- /src/cleaners/content.test.js: -------------------------------------------------------------------------------- 1 | import assert from 'assert'; 2 | import cheerio from 'cheerio'; 3 | 4 | import extractBestNode from 'extractors/generic/content/extract-best-node'; 5 | import extractCleanNode from './content'; 6 | 7 | const fs = require('fs'); 8 | 9 | describe('extractCleanNode(article, { $, cleanConditionally, title } })', () => 
{ 10 | it('cleans cruft out of a DOM node', () => { 11 | const html = fs.readFileSync( 12 | './fixtures/www.wired.com--content-test.html', 13 | 'utf-8' 14 | ); 15 | const $ = cheerio.load(html); 16 | 17 | const opts = { 18 | stripUnlikelyCandidates: true, 19 | weightNodes: true, 20 | cleanConditionally: true, 21 | }; 22 | 23 | const bestNode = extractBestNode($, opts); 24 | 25 | const cleanNode = extractCleanNode(bestNode, { $, opts }); 26 | 27 | const text = $(cleanNode) 28 | .text() 29 | .replace(/\n/g, '') 30 | .replace(/\s+/g, ' ') 31 | .trim(); 32 | assert.equal(text.length === 2656 || text.length === 2657, true); 33 | }); 34 | }); 35 | -------------------------------------------------------------------------------- /src/cleaners/dek.js: -------------------------------------------------------------------------------- 1 | import { stripTags } from 'utils/dom'; 2 | import { excerptContent, normalizeSpaces } from 'utils/text'; 3 | 4 | import { TEXT_LINK_RE } from './constants'; 5 | 6 | // Take a dek HTML fragment, and return the cleaned version of it. 7 | // Return None if the dek wasn't good enough. 8 | export default function cleanDek(dek, { $, excerpt }) { 9 | // Sanity check that we didn't get too short or long of a dek. 10 | if (dek.length > 1000 || dek.length < 5) return null; 11 | 12 | // Check that dek isn't the same as excerpt 13 | if (excerpt && excerptContent(excerpt, 10) === excerptContent(dek, 10)) 14 | return null; 15 | 16 | const dekText = stripTags(dek, $); 17 | 18 | // Plain text links shouldn't exist in the dek. If we have some, it's 19 | // not a good dek - bail. 
20 | if (TEXT_LINK_RE.test(dekText)) return null; 21 | 22 | return normalizeSpaces(dekText.trim()); 23 | } 24 | -------------------------------------------------------------------------------- /src/cleaners/index.js: -------------------------------------------------------------------------------- 1 | import cleanAuthor from './author'; 2 | import cleanImage from './lead-image-url'; 3 | import cleanDek from './dek'; 4 | import cleanDatePublished from './date-published'; 5 | import cleanContent from './content'; 6 | import cleanTitle from './title'; 7 | 8 | const Cleaners = { 9 | author: cleanAuthor, 10 | lead_image_url: cleanImage, 11 | dek: cleanDek, 12 | date_published: cleanDatePublished, 13 | content: cleanContent, 14 | title: cleanTitle, 15 | }; 16 | 17 | export default Cleaners; 18 | 19 | export { cleanAuthor }; 20 | export { cleanImage }; 21 | export { cleanDek }; 22 | export { cleanDatePublished }; 23 | export { cleanContent }; 24 | export { cleanTitle }; 25 | export { default as resolveSplitTitle } from './resolve-split-title'; 26 | -------------------------------------------------------------------------------- /src/cleaners/lead-image-url.js: -------------------------------------------------------------------------------- 1 | import validUrl from 'valid-url'; 2 | 3 | export default function clean(leadImageUrl) { 4 | leadImageUrl = leadImageUrl.trim(); 5 | if (validUrl.isWebUri(leadImageUrl)) { 6 | return leadImageUrl; 7 | } 8 | 9 | return null; 10 | } 11 | -------------------------------------------------------------------------------- /src/cleaners/lead-image-url.test.js: -------------------------------------------------------------------------------- 1 | import assert from 'assert'; 2 | 3 | import clean from './lead-image-url'; 4 | 5 | describe('clean(leadImageUrl)', () => { 6 | it('returns the url if valid', () => { 7 | const url = 'https://example.com'; 8 | assert.equal(clean(url), url); 9 | }); 10 | 11 | it('returns null if the url is not valid', () 
=> { 12 | assert.equal(clean('this is not a valid url'), null); 13 | }); 14 | 15 | it('trims whitespace', () => { 16 | const url = ' https://example.com/foo/bar.jpg'; 17 | assert.equal(clean(url), url.trim()); 18 | }); 19 | }); 20 | -------------------------------------------------------------------------------- /src/cleaners/resolve-split-title.test.js: -------------------------------------------------------------------------------- 1 | import assert from 'assert'; 2 | 3 | import { resolveSplitTitle } from './index'; 4 | 5 | describe('resolveSplitTitle(text)', () => { 6 | it('does nothing if title not splittable', () => { 7 | const title = 'This Is a Normal Title'; 8 | 9 | assert.equal(resolveSplitTitle(title), title); 10 | }); 11 | 12 | it('extracts titles from breadcrumb-like titles', () => { 13 | const title = 'The Best Gadgets on Earth : Bits : Blogs : NYTimes.com'; 14 | 15 | assert.equal(resolveSplitTitle(title), 'The Best Gadgets on Earth '); 16 | }); 17 | 18 | it('cleans domains from titles at the front', () => { 19 | const title = 'NYTimes - The Best Gadgets on Earth'; 20 | const url = 'https://www.nytimes.com/bits/blog/etc/'; 21 | 22 | assert.equal(resolveSplitTitle(title, url), 'The Best Gadgets on Earth'); 23 | }); 24 | 25 | it('cleans domains from titles at the back', () => { 26 | const title = 'The Best Gadgets on Earth | NYTimes'; 27 | const url = 'https://www.nytimes.com/bits/blog/etc/'; 28 | 29 | assert.equal(resolveSplitTitle(title, url), 'The Best Gadgets on Earth'); 30 | }); 31 | }); 32 | -------------------------------------------------------------------------------- /src/cleaners/title.js: -------------------------------------------------------------------------------- 1 | import { stripTags } from 'utils/dom'; 2 | import { normalizeSpaces } from 'utils/text'; 3 | 4 | import { TITLE_SPLITTERS_RE } from './constants'; 5 | import { resolveSplitTitle } from './index'; 6 | 7 | export default function cleanTitle(title, { url, $ }) { 8 | // If title 
has |, :, or - in it, see if 9 | // we can clean it up. 10 | if (TITLE_SPLITTERS_RE.test(title)) { 11 | title = resolveSplitTitle(title, url); 12 | } 13 | 14 | // Final sanity check that we didn't get a crazy title. 15 | // if (title.length > 150 || title.length < 15) { 16 | if (title.length > 150) { 17 | // If we did, return h1 from the document if it exists 18 | const h1 = $('h1'); 19 | if (h1.length === 1) { 20 | title = h1.text(); 21 | } 22 | } 23 | 24 | // strip any html tags in the title text 25 | return normalizeSpaces(stripTags(title, $).trim()); 26 | } 27 | -------------------------------------------------------------------------------- /src/cleaners/title.test.js: -------------------------------------------------------------------------------- 1 | import assert from 'assert'; 2 | import cheerio from 'cheerio'; 3 | 4 | import { cleanTitle } from './index'; 5 | 6 | describe('cleanTitle(title, { url, $ })', () => { 7 | it('only uses h1 if there is only one on the page', () => { 8 | const title = 'Too Short'; 9 | const $ = cheerio.load(` 10 |
11 |

This Is the Real Title

12 |

This Is the Real Title

13 |
14 | `); 15 | 16 | assert.equal(cleanTitle(title, { url: '', $ }), title); 17 | }); 18 | 19 | it('removes HTML tags from titles', () => { 20 | const $ = cheerio.load( 21 | '

This Is the Real Title

' 22 | ); 23 | const title = $('h1').html(); 24 | 25 | assert.equal(cleanTitle(title, { url: '', $ }), 'This Is the Real Title'); 26 | }); 27 | 28 | it('trims extraneous spaces', () => { 29 | const title = " This Is a Great Title That You'll Love "; 30 | const $ = cheerio.load( 31 | '

This Is the Real Title

' 32 | ); 33 | 34 | assert.equal(cleanTitle(title, { url: '', $ }), title.trim()); 35 | }); 36 | }); 37 | -------------------------------------------------------------------------------- /src/extractors/add-extractor.js: -------------------------------------------------------------------------------- 1 | import mergeSupportedDomains from '../utils/merge-supported-domains'; 2 | 3 | export const apiExtractors = {}; 4 | 5 | export default function addExtractor(extractor) { 6 | if (!extractor || !extractor.domain) { 7 | return { 8 | error: true, 9 | message: 'Unable to add custom extractor. Invalid parameters.', 10 | }; 11 | } 12 | 13 | Object.assign(apiExtractors, mergeSupportedDomains(extractor)); 14 | 15 | return apiExtractors; 16 | } 17 | -------------------------------------------------------------------------------- /src/extractors/add-extractor.test.js: -------------------------------------------------------------------------------- 1 | import assert from 'assert'; 2 | 3 | import addExtractor from './add-extractor'; 4 | 5 | describe('addExtractor(extractor)', () => { 6 | it('can add multiple custom extractors', () => { 7 | addExtractor({ domain: 'www.site1.com' }); 8 | addExtractor({ domain: 'www.site2.com' }); 9 | const result = addExtractor({ domain: 'www.site3.com' }); 10 | assert.equal(Object.keys(result).length, 3); 11 | }); 12 | 13 | it('returns error if an extractor is not provided', () => { 14 | const result = addExtractor(); 15 | assert.equal(result.error, true); 16 | }); 17 | 18 | it('returns error if a domain key is not included within the custom extractor', () => { 19 | const result = addExtractor({ test: 'abc' }); 20 | assert.equal(result.error, true); 21 | }); 22 | }); 23 | -------------------------------------------------------------------------------- /src/extractors/all.js: -------------------------------------------------------------------------------- 1 | import mergeSupportedDomains from 'utils/merge-supported-domains'; 2 | import * as 
CustomExtractors from './custom/index'; 3 | 4 | export default Object.keys(CustomExtractors).reduce((acc, key) => { 5 | const extractor = CustomExtractors[key]; 6 | return { 7 | ...acc, 8 | ...mergeSupportedDomains(extractor), 9 | }; 10 | }, {}); 11 | -------------------------------------------------------------------------------- /src/extractors/constants.js: -------------------------------------------------------------------------------- 1 | export const ATTR_RE = /\[([\w-]+)\]/; // eslint-disable-line no-useless-escape 2 | -------------------------------------------------------------------------------- /src/extractors/custom/247sports.com/index.js: -------------------------------------------------------------------------------- 1 | export const twofortysevensportsComExtractor = { 2 | domain: '247sports.com', 3 | 4 | title: { 5 | selectors: ['title', 'article header h1'], 6 | }, 7 | 8 | author: { 9 | selectors: ['.article-cnt__author', '.author'], 10 | }, 11 | 12 | date_published: { 13 | selectors: [['time[data-published]', 'data-published']], 14 | }, 15 | 16 | lead_image_url: { 17 | selectors: [['meta[name="og:image"]', 'value']], 18 | }, 19 | 20 | content: { 21 | selectors: ['.article-body', 'section.body.article'], 22 | 23 | // Is there anything in the content you selected that needs transformed 24 | // before it's consumable content? E.g., unusual lazy loaded images 25 | transforms: {}, 26 | 27 | // Is there anything that is in the result that shouldn't be? 
28 | // The clean selectors will remove anything that matches from 29 | // the result 30 | clean: [], 31 | }, 32 | }; 33 | -------------------------------------------------------------------------------- /src/extractors/custom/abcnews.go.com/index.js: -------------------------------------------------------------------------------- 1 | export const AbcnewsGoComExtractor = { 2 | domain: 'abcnews.go.com', 3 | 4 | title: { 5 | selectors: ['div[class*="Article_main__body"] h1', '.article-header h1'], 6 | }, 7 | 8 | author: { 9 | selectors: ['.ShareByline span:nth-child(2)', '.authors'], 10 | clean: ['.author-overlay', '.by-text'], 11 | }, 12 | 13 | date_published: { 14 | selectors: ['.ShareByline', '.timestamp'], 15 | format: 'MMMM D, YYYY h:mm a', 16 | timezone: 'America/New_York', 17 | }, 18 | 19 | lead_image_url: { 20 | selectors: [['meta[name="og:image"]', 'value']], 21 | }, 22 | 23 | content: { 24 | selectors: ['article', '.article-copy'], 25 | 26 | // Is there anything in the content you selected that needs transformed 27 | // before it's consumable content? E.g., unusual lazy loaded images 28 | transforms: {}, 29 | 30 | // Is there anything that is in the result that shouldn't be? 31 | // The clean selectors will remove anything that matches from 32 | // the result 33 | clean: [], 34 | }, 35 | }; 36 | -------------------------------------------------------------------------------- /src/extractors/custom/biorxiv.org/index.js: -------------------------------------------------------------------------------- 1 | export const BiorxivOrgExtractor = { 2 | domain: 'biorxiv.org', 3 | 4 | title: { 5 | selectors: ['h1#page-title'], 6 | }, 7 | 8 | author: { 9 | selectors: [ 10 | 'div.highwire-citation-biorxiv-article-top > div.highwire-cite-authors', 11 | ], 12 | }, 13 | 14 | content: { 15 | selectors: ['div#abstract-1'], 16 | 17 | // Is there anything in the content you selected that needs transformed 18 | // before it's consumable content? 
E.g., unusual lazy loaded images 19 | transforms: {}, 20 | 21 | // Is there anything that is in the result that shouldn't be? 22 | // The clean selectors will remove anything that matches from 23 | // the result 24 | clean: [], 25 | }, 26 | }; 27 | -------------------------------------------------------------------------------- /src/extractors/custom/blogspot.com/index.js: -------------------------------------------------------------------------------- 1 | export const BloggerExtractor = { 2 | domain: 'blogspot.com', 3 | content: { 4 | // Blogger is insane and does not load its content 5 | // initially in the page, but it's all there 6 | // in noscript 7 | selectors: ['.post-content noscript'], 8 | 9 | // Selectors to remove from the extracted content 10 | clean: [], 11 | 12 | // Convert the noscript tag to a div 13 | transforms: { 14 | noscript: 'div', 15 | }, 16 | }, 17 | 18 | author: { 19 | selectors: ['.post-author-name'], 20 | }, 21 | 22 | title: { 23 | selectors: ['.post h2.title'], 24 | }, 25 | 26 | date_published: { 27 | selectors: ['span.publishdate'], 28 | }, 29 | }; 30 | -------------------------------------------------------------------------------- /src/extractors/custom/bookwalker.jp/index.js: -------------------------------------------------------------------------------- 1 | export const BookwalkerJpExtractor = { 2 | domain: 'bookwalker.jp', 3 | 4 | title: { 5 | selectors: ['h1.p-main__title', 'h1.main-heading'], 6 | }, 7 | 8 | author: { 9 | selectors: ['div.p-author__list', 'div.authors'], 10 | }, 11 | 12 | date_published: { 13 | selectors: [ 14 | 'dl.p-information__data dd:nth-of-type(7)', 15 | '.work-info .work-detail:first-of-type .work-detail-contents:last-of-type', 16 | ], 17 | timezone: 'Asia/Tokyo', 18 | }, 19 | 20 | dek: null, 21 | 22 | lead_image_url: { 23 | selectors: [['meta[name="og:image"]', 'value']], 24 | }, 25 | 26 | content: { 27 | selectors: [ 28 | 'div.p-main__information', 29 | ['div.main-info', 'div.main-cover-inner'], 30 | ], 
31 | 32 | defaultCleaner: false, 33 | 34 | transforms: {}, 35 | 36 | clean: [ 37 | 'span.label.label--trial', 38 | 'dt.info-head.info-head--coin', 39 | 'dd.info-contents.info-contents--coin', 40 | 'div.info-notice.fn-toggleClass', 41 | ], 42 | }, 43 | }; 44 | -------------------------------------------------------------------------------- /src/extractors/custom/buzzap.jp/index.js: -------------------------------------------------------------------------------- 1 | export const BuzzapJpExtractor = { 2 | domain: 'buzzap.jp', 3 | 4 | title: { 5 | selectors: ['h1.entry-title'], 6 | }, 7 | 8 | author: null, 9 | 10 | date_published: { 11 | selectors: [['time.entry-date', 'datetime']], 12 | }, 13 | 14 | dek: null, 15 | 16 | lead_image_url: { 17 | selectors: [['meta[name="og:image"]', 'value']], 18 | }, 19 | 20 | content: { 21 | selectors: ['div.ctiframe'], 22 | 23 | defaultCleaner: false, 24 | 25 | transforms: {}, 26 | 27 | clean: [], 28 | }, 29 | }; 30 | -------------------------------------------------------------------------------- /src/extractors/custom/clinicaltrials.gov/index.js: -------------------------------------------------------------------------------- 1 | export const ClinicaltrialsGovExtractor = { 2 | domain: 'clinicaltrials.gov', 3 | 4 | title: { 5 | selectors: ['h1.tr-solo_record'], 6 | }, 7 | 8 | author: { 9 | selectors: ['div#sponsor.tr-info-text'], 10 | }, 11 | 12 | date_published: { 13 | // selectors: ['span.term[data-term="Last Update Posted"]'], 14 | selectors: ['div:has(> span.term[data-term="Last Update Posted"])'], 15 | }, 16 | 17 | content: { 18 | selectors: ['div#tab-body'], 19 | 20 | // Is there anything in the content you selected that needs transformed 21 | // before it's consumable content? E.g., unusual lazy loaded images 22 | transforms: {}, 23 | 24 | // Is there anything that is in the result that shouldn't be? 
25 | // The clean selectors will remove anything that matches from 26 | // the result 27 | clean: ['.usa-alert> img'], 28 | }, 29 | }; 30 | -------------------------------------------------------------------------------- /src/extractors/custom/deadline.com/index.js: -------------------------------------------------------------------------------- 1 | export const DeadlineComExtractor = { 2 | domain: 'deadline.com', 3 | 4 | title: { 5 | selectors: ['h1'], 6 | }, 7 | 8 | author: { 9 | selectors: ['section.author h2'], 10 | }, 11 | 12 | date_published: { 13 | selectors: [['meta[name="article:published_time"]', 'value']], 14 | }, 15 | 16 | dek: null, 17 | 18 | lead_image_url: { 19 | selectors: [['meta[name="og:image"]', 'value']], 20 | }, 21 | 22 | content: { 23 | selectors: ['div.a-article-grid__main.pmc-a-grid article.pmc-a-grid-item'], 24 | 25 | transforms: { 26 | '.embed-twitter': $node => { 27 | const innerHtml = $node.html(); 28 | $node.replaceWith(innerHtml); 29 | }, 30 | }, 31 | 32 | clean: ['figcaption'], 33 | }, 34 | }; 35 | -------------------------------------------------------------------------------- /src/extractors/custom/epaper.zeit.de/index.js: -------------------------------------------------------------------------------- 1 | export const EpaperZeitDeExtractor = { 2 | domain: 'epaper.zeit.de', 3 | 4 | title: { 5 | selectors: ['p.title'], 6 | }, 7 | 8 | author: { 9 | selectors: ['.article__author'], 10 | }, 11 | 12 | date_published: null, 13 | 14 | excerpt: { 15 | selectors: ['subtitle'], 16 | }, 17 | 18 | lead_image_url: null, 19 | 20 | content: { 21 | selectors: ['.article'], 22 | 23 | // Is there anything in the content you selected that needs transformed 24 | // before it's consumable content? E.g., unusual lazy loaded images 25 | transforms: { 26 | 'p.title': 'h1', 27 | '.article__author': 'p', 28 | byline: 'p', 29 | linkbox: 'p', 30 | }, 31 | 32 | // Is there anything that is in the result that shouldn't be? 
33 | // The clean selectors will remove anything that matches from 34 | // the result 35 | clean: ['image-credits', 'box[type=citation]'], 36 | }, 37 | }; 38 | -------------------------------------------------------------------------------- /src/extractors/custom/fortune.com/index.js: -------------------------------------------------------------------------------- 1 | export const FortuneComExtractor = { 2 | domain: 'fortune.com', 3 | 4 | title: { 5 | selectors: ['h1'], 6 | }, 7 | 8 | author: { 9 | selectors: [['meta[name="author"]', 'value']], 10 | }, 11 | 12 | date_published: { 13 | selectors: ['.MblGHNMJ'], 14 | 15 | timezone: 'UTC', 16 | }, 17 | 18 | lead_image_url: { 19 | selectors: [['meta[name="og:image"]', 'value']], 20 | }, 21 | 22 | content: { 23 | selectors: [['picture', 'article.row'], 'article.row'], 24 | 25 | // Is there anything in the content you selected that needs transformed 26 | // before it's consumable content? E.g., unusual lazy loaded images 27 | transforms: {}, 28 | 29 | // Is there anything that is in the result that shouldn't be? 
30 | // The clean selectors will remove anything that matches from 31 | // the result 32 | clean: [], 33 | }, 34 | }; 35 | -------------------------------------------------------------------------------- /src/extractors/custom/getnews.jp/index.js: -------------------------------------------------------------------------------- 1 | export const GetnewsJpExtractor = { 2 | domain: 'getnews.jp', 3 | 4 | title: { 5 | selectors: ['article h1'], 6 | }, 7 | 8 | author: { 9 | selectors: [['meta[name="article:author"]', 'value'], 'span.prof'], 10 | }, 11 | 12 | date_published: { 13 | selectors: [ 14 | ['meta[name="article:published_time"]', 'value'], 15 | ['ul.cattag-top time', 'datetime'], 16 | ], 17 | }, 18 | 19 | dek: null, 20 | 21 | lead_image_url: { 22 | selectors: [['meta[name="og:image"]', 'value']], 23 | }, 24 | 25 | content: { 26 | selectors: ['div.post-bodycopy'], 27 | 28 | transforms: {}, 29 | 30 | clean: [], 31 | }, 32 | }; 33 | -------------------------------------------------------------------------------- /src/extractors/custom/github.com/index.js: -------------------------------------------------------------------------------- 1 | export const GithubComExtractor = { 2 | domain: 'github.com', 3 | 4 | title: { 5 | selectors: [['meta[name="og:title"]', 'value']], 6 | }, 7 | 8 | author: { 9 | selectors: [ 10 | // enter author selectors 11 | ], 12 | }, 13 | 14 | date_published: { 15 | selectors: [ 16 | ['relative-time[datetime]', 'datetime'], 17 | ['span[itemprop="dateModified"] relative-time', 'datetime'], 18 | ], 19 | }, 20 | 21 | dek: { 22 | selectors: [ 23 | ['meta[name="description"]', 'value'], 24 | 'span[itemprop="about"]', 25 | ], 26 | }, 27 | 28 | lead_image_url: { 29 | selectors: [['meta[name="og:image"]', 'value']], 30 | }, 31 | 32 | content: { 33 | selectors: [['#readme article']], 34 | 35 | // Is there anything in the content you selected that needs transformed 36 | // before it's consumable content? 
E.g., unusual lazy loaded images 37 | transforms: {}, 38 | 39 | // Is there anything that is in the result that shouldn't be? 40 | // The clean selectors will remove anything that matches from 41 | // the result 42 | clean: [], 43 | }, 44 | }; 45 | -------------------------------------------------------------------------------- /src/extractors/custom/hellogiggles.com/index.js: -------------------------------------------------------------------------------- 1 | export const HellogigglesComExtractor = { 2 | domain: 'hellogiggles.com', 3 | 4 | title: { 5 | selectors: [['meta[name="og:title"]', 'value'], '.title'], 6 | }, 7 | 8 | author: { 9 | selectors: ['.byline-wrapper span.author_name', '.author-link'], 10 | }, 11 | 12 | date_published: { 13 | selectors: [ 14 | ['meta[property="article:published_time"]', 'content'], 15 | ['meta[name="article:published_time"]', 'value'], 16 | ], 17 | }, 18 | 19 | lead_image_url: { 20 | selectors: [['meta[name="og:image"]', 'value']], 21 | }, 22 | 23 | content: { 24 | selectors: ['.main-content', '.entry-content'], 25 | 26 | // Is there anything in the content you selected that needs transformed 27 | // before it's consumable content? E.g., unusual lazy loaded images 28 | transforms: {}, 29 | 30 | // Is there anything that is in the result that shouldn't be? 
// --- src/extractors/custom/ici.radio-canada.ca/index.js ---
// Custom extractor for Radio-Canada articles. Date strings look like
// "2018-01-01|18h30", hence the literal "[h]" token in the moment format.
export const IciRadioCanadaCaExtractor = {
  domain: 'ici.radio-canada.ca',

  title: {
    selectors: ['h1'],
  },

  author: {
    selectors: [['meta[name="dc.creator"]', 'value']],
  },

  date_published: {
    selectors: [['meta[name="dc.date.created"]', 'value']],
    format: 'YYYY-MM-DD|HH[h]mm',
    timezone: 'America/New_York',
  },

  dek: {
    selectors: ['div.lead-container', '.bunker-component.lead'],
  },

  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },

  content: {
    selectors: [
      'section.document-content-style',
      ['.main-multimedia-item', '.news-story-content'],
    ],

    // Nothing needs transforming before the content is consumable.
    transforms: {},

    // Nothing extra to strip from the extracted content.
    clean: [],
  },
};

// --- src/extractors/custom/japan.cnet.com/index.js ---
// Custom extractor for CNET Japan; dates are Japanese-formatted local time.
export const JapanCnetComExtractor = {
  domain: 'japan.cnet.com',

  title: {
    selectors: ['.leaf-headline-ttl'],
  },

  author: {
    selectors: ['.writer'],
  },

  date_published: {
    selectors: ['.date'],
    format: 'YYYY年MM月DD日 HH時mm分',
    timezone: 'Asia/Tokyo',
  },

  dek: null,

  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },

  content: {
    selectors: ['div.article_body'],

    transforms: {},

    clean: [],
  },
};

// --- src/extractors/custom/japan.zdnet.com/index.js ---
// Custom extractor for ZDNet Japan.
export const JapanZdnetComExtractor = {
  domain: 'japan.zdnet.com',

  title: {
    selectors: ['h1'],
  },

  author: {
    selectors: [['meta[name="cXenseParse:author"]', 'value']],
  },

  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']],
  },

  dek: null,

  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },

  content: {
    selectors: ['div.article_body'],

    transforms: {},

    clean: [],
  },
};

// --- src/extractors/custom/jvndb.jvn.jp/index.js ---
// Custom extractor for the JVN vulnerability database. The default cleaner
// is disabled so the list markup in #news-list survives extraction.
export const JvndbJvnJpExtractor = {
  domain: 'jvndb.jvn.jp',

  title: {
    selectors: ['title'],
  },

  author: null,

  date_published: {
    selectors: ['div.modifytxt:nth-child(2)'],
    format: 'YYYY/MM/DD',
    timezone: 'Asia/Tokyo',
  },

  dek: null,

  lead_image_url: null,

  content: {
    selectors: ['#news-list'],

    defaultCleaner: false,

    transforms: {},

    clean: [],
  },
};

// --- src/extractors/custom/mashable.com/index.js ---
// Custom extractor for Mashable; image credits become figcaptions.
export const MashableComExtractor = {
  domain: 'mashable.com',

  title: {
    selectors: ['header h1', 'h1.title'],
  },

  author: {
    selectors: [['meta[name="article:author"]', 'value'], 'span.author_name a'],
  },

  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']],
  },

  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },

  content: {
    selectors: ['#article', 'section.article-content.blueprint'],

    // Convert credit blocks into semantic figcaptions.
    transforms: {
      '.image-credit': 'figcaption',
    },

    clean: [],
  },
};

// --- src/extractors/custom/money.cnn.com/index.js ---
// Custom extractor for CNN Money.
export const MoneyCnnComExtractor = {
  domain: 'money.cnn.com',

  title: {
    selectors: ['.article-title'],
  },

  author: {
    selectors: [['meta[name="author"]', 'value'], '.byline a'],
  },

  date_published: {
    selectors: [['meta[name="date"]', 'value']],

    timezone: 'GMT',
  },

  dek: {
    selectors: ['#storytext h2'],
  },

  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },

  content: {
    selectors: ['#storytext'],

    transforms: {},

    // In-story headings are navigation chrome, not article content.
    clean: ['.inStoryHeading'],
  },
};

// --- src/extractors/custom/newrepublic.com/index.js ---
// Custom extractor for The New Republic.
export const NewrepublicComExtractor = {
  domain: 'newrepublic.com',

  title: {
    selectors: ['h1.article-headline'],
  },

  author: {
    selectors: ['span.AuthorList'],
  },

  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']],

    timezone: 'America/New_York',
  },

  dek: {
    selectors: ['h2.article-subhead'],
  },

  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },

  content: {
    selectors: [['div.article-body']],

    transforms: {},

    // Strip asides (related-content boxes) from the article body.
    clean: ['aside'],
  },
};

// --- src/extractors/custom/nymag.com/index.test.js ---
import assert from 'assert';

import Mercury from 'mercury';

const fs = require('fs');

describe('NYMagExtractor', () => {
  it('works with a feature story', async () => {
    const html = fs.readFileSync('./fixtures/nymag.com.html');
    const uri =
      'http://nymag.com/daily/intelligencer/2016/09/how-fox-news-women-took-down-roger-ailes.html';

    // Pass the pre-fetched HTML via the options object, matching the
    // Mercury.parse(url, { html }) call used by the twitter.com test.
    const { dek, title, author } = await Mercury.parse(uri, { html });
    const actualDek =
      'How Fox News women took down the most powerful, and predatory, man in media.';

    assert.equal(dek, actualDek);
    assert.equal(title, 'The Revenge of Roger’s Angels');
    assert.equal(author, 'Gabriel Sherman');
  });
});

// --- src/extractors/custom/obamawhitehouse.archives.gov/index.js ---
// Custom extractor for the archived Obama White House site; also handles
// whitehouse.gov. Default cleaning is disabled to preserve speech markup.
export const ObamawhitehouseArchivesGovExtractor = {
  domain: 'obamawhitehouse.archives.gov',

  supportedDomains: ['whitehouse.gov'],

  title: {
    selectors: ['h1', '.pane-node-title'],
  },

  author: {
    selectors: ['.blog-author-link', '.node-person-name-link'],
  },

  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']],
  },

  dek: {
    selectors: ['.field-name-field-forall-summary'],
  },

  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },

  content: {
    defaultCleaner: false,

    selectors: ['div#content-start', '.pane-node-field-forall-body'],

    transforms: {},

    clean: ['.pane-node-title', '.pane-custom.pane-1'],
  },
};

// --- src/extractors/custom/observer.com/index.js ---
// Custom extractor for the New York Observer.
export const ObserverComExtractor = {
  domain: 'observer.com',

  title: {
    selectors: ['h1.entry-title'],
  },

  author: {
    selectors: ['.author', '.vcard'],
  },

  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']],
  },

  dek: {
    selectors: ['h2.dek'],
  },

  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },

  content: {
    selectors: ['div.entry-content'],

    transforms: {},

    clean: [],
  },
};

// --- src/extractors/custom/otrs.com/index.js ---
// Custom extractor for OTRS; default cleaner disabled, with manual
// removal of byline/share/category chrome instead.
export const OtrsComExtractor = {
  domain: 'otrs.com',

  title: {
    selectors: ['#main article h1'],
  },

  author: {
    selectors: ['div.dateplusauthor a'],
  },

  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']],
  },

  dek: {
    selectors: [['meta[name="og:description"]', 'value']],
  },

  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },

  content: {
    selectors: ['#main article'],

    defaultCleaner: false,

    transforms: {},

    clean: [
      'div.dateplusauthor',
      'div.gr-12.push-6.footershare',
      '#atftbx',
      'div.category-modul',
    ],
  },
};

// --- src/extractors/custom/pagesix.com/index.js ---
// Custom extractor for Page Six; also handles nypost.com.
export const PagesixComExtractor = {
  domain: 'pagesix.com',

  supportedDomains: ['nypost.com'],

  title: {
    selectors: [['meta[name="og:title"]', 'value']],
  },

  author: {
    selectors: ['.byline'],
  },

  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']],
  },

  dek: {
    selectors: [['meta[name="description"]', 'value']],
  },

  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },

  content: {
    selectors: [
      ['#featured-image-wrapper', '.entry-content'],
      '.entry-content',
    ],

    // Promote the featured image wrapper and WP captions to semantic markup.
    transforms: {
      '#featured-image-wrapper': 'figure',
      '.wp-caption-text': 'figcaption',
    },

    clean: ['.modal-trigger'],
  },
};

// --- src/extractors/custom/pastebin.com/index.js ---
// Custom extractor for Pastebin. Pastes render as ordered lists of lines,
// so the list is flattened into div/p elements.
export const PastebinComExtractor = {
  domain: 'pastebin.com',

  title: {
    selectors: ['h1'],
  },

  author: {
    selectors: ['.username', '.paste_box_line2 .t_us + a'],
  },

  date_published: {
    selectors: ['.date', '.paste_box_line2 .t_da + span'],
    timezone: 'America/New_York',
    format: 'MMMM D, YYYY',
  },

  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },

  content: {
    selectors: ['.source', '#selectable .text'],

    transforms: {
      ol: 'div',
      li: 'p',
    },

    clean: [],
  },
};

// --- src/extractors/custom/people.com/index.js ---
// Custom extractor for People magazine.
export const PeopleComExtractor = {
  domain: 'people.com',

  title: {
    selectors: ['.article-header h1', ['meta[name="og:title"]', 'value']],
  },

  author: {
    selectors: [['meta[name="sailthru.author"]', 'value'], 'a.author.url.fn'],
  },

  date_published: {
    selectors: [
      '.mntl-attribution__item-date',
      ['meta[name="article:published_time"]', 'value'],
    ],
  },

  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },

  dek: {
    selectors: ['.article-header h2'],
  },

  content: {
    selectors: ['div[class^="loc article-content"]', 'div.article-body__inner'],

    transforms: {},

    clean: [],
  },
};

// --- src/extractors/custom/phpspot.org/index.js ---
// Custom extractor for phpspot; default cleaner disabled to keep the
// entry body intact.
export const PhpspotOrgExtractor = {
  domain: 'phpspot.org',

  title: {
    selectors: ['h3.hl'],
  },

  author: null,

  date_published: {
    selectors: ['h4.hl'],
    format: 'YYYY年MM月DD日',
    timezone: 'Asia/Tokyo',
  },

  dek: null,

  lead_image_url: null,

  content: {
    selectors: ['div.entrybody'],

    defaultCleaner: false,

    transforms: {},

    clean: [],
  },
};

// --- src/extractors/custom/pitchfork.com/index.js ---
// Custom extractor for Pitchfork; extends the standard fields with the
// review score.
export const PitchforkComExtractor = {
  domain: 'pitchfork.com',

  title: {
    selectors: [['meta[name="og:title"]', 'value'], 'title'],
  },

  author: {
    selectors: [
      ['meta[name="article:author"]', 'value'],
      '.authors-detail__display-name',
    ],
  },

  date_published: {
    selectors: ['div[class^="InfoSliceWrapper-"]', ['.pub-date', 'datetime']],
  },

  dek: {
    selectors: [
      ['meta[name="og:description"]', 'value'],
      '.review-detail__abstract',
    ],
  },

  lead_image_url: {
    selectors: [
      ['meta[name="og:image"]', 'value'],
      ['.single-album-tombstone__art img', 'src'],
    ],
  },

  content: {
    selectors: ['div.body__inner-container', '.review-detail__text'],
  },

  extend: {
    score: {
      selectors: ['p[class*="Rating"]', '.score'],
    },
  },
};
// --- src/extractors/custom/postlight.com/index.js ---
// Extractor configuration for the Postlight blog.
export const PostlightComExtractor = {
  domain: 'postlight.com',

  title: {
    selectors: [['meta[name="og:title"]', 'value']],
  },

  author: {
    selectors: [['meta[name="parsely-author"]', 'value']],
  },

  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']],
  },

  dek: {
    selectors: ['h2.single-hero__abstract'],
  },

  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },

  content: {
    selectors: ['main.post'],

    // No lazy-load or markup fixes required for this site.
    transforms: {},

    // Drop cross-promotion modules that live inside the post body.
    clean: [
      'section.pl-post-link',
      'aside',
      'section.insights_featured_case_studies',
    ],
  },
};

// --- src/extractors/custom/qz.com/index.js ---
// Extractor configuration for Quartz. Lead image falls back through
// several meta-tag conventions.
export const QzComExtractor = {
  domain: 'qz.com',

  title: {
    selectors: ['article header h1'],
  },

  author: {
    selectors: [['meta[name="author"]', 'value']],
  },

  date_published: {
    selectors: [
      ['meta[name="article:published_time"]', 'value'],
      ['time[datetime]', 'datetime'],
    ],
  },

  lead_image_url: {
    selectors: [
      ['meta[name="og:image"]', 'value'],
      ['meta[property="og:image"]', 'content'],
      ['meta[name="twitter:image"]', 'content'],
    ],
  },

  content: {
    selectors: ['#article-content'],

    transforms: {},

    clean: [],
  },
};

// --- src/extractors/custom/scan.netsecurity.ne.jp/index.js ---
// Extractor configuration for ScanNetSecurity; the default cleaner is
// turned off and an explicit ad-aside removal is used instead.
export const ScanNetsecurityNeJpExtractor = {
  domain: 'scan.netsecurity.ne.jp',

  title: {
    selectors: ['header.arti-header h1.head'],
  },

  author: null,

  date_published: {
    selectors: [['meta[name="article:modified_time"]', 'value']],
  },

  dek: {
    selectors: ['header.arti-header p.arti-summary'],
  },

  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },

  content: {
    selectors: ['div.arti-content.arti-content--thumbnail'],

    defaultCleaner: false,

    transforms: {},

    clean: ['aside.arti-giga'],
  },
};

// --- src/extractors/custom/sciencefly.com/index.js ---
// Extractor configuration for ScienceFly.
export const ScienceflyComExtractor = {
  domain: 'sciencefly.com',

  title: {
    selectors: ['.entry-title', '.cb-entry-title', '.cb-single-title'],
  },

  author: {
    selectors: ['div.cb-author', 'div.cb-author-title'],
  },

  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']],
  },

  dek: {
    // No dek selectors identified for this site yet.
    selectors: [],
  },

  lead_image_url: {
    selectors: [['div.theiaPostSlider_slides img', 'src']],
  },

  content: {
    selectors: ['div.theiaPostSlider_slides'],

    transforms: {},

    clean: [],
  },
};

// --- src/extractors/custom/sect.iij.ad.jp/index.js ---
// Extractor configuration for the IIJ SECT blog (Japanese dates).
export const SectIijAdJpExtractor = {
  domain: 'sect.iij.ad.jp',

  title: {
    selectors: ['div.title-box-inner h1', 'h3'],
  },

  author: {
    selectors: ['p.post-author a', 'dl.entrydate dd'],
  },

  date_published: {
    selectors: ['time'],
    format: 'YYYY年MM月DD日',
    timezone: 'Asia/Tokyo',
  },

  dek: null,

  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },

  content: {
    selectors: ['.entry-inner', '#article'],

    transforms: {},

    // The date list is already captured via author/date selectors.
    clean: ['dl.entrydate'],
  },
};

// --- src/extractors/custom/takagi-hiromitsu.jp/index.js ---
// Extractor configuration for Hiromitsu Takagi's diary; relies on the
// Last-Modified meta tag for the publication date.
export const TakagihiromitsuJpExtractor = {
  domain: 'takagi-hiromitsu.jp',

  title: {
    selectors: ['h3'],
  },

  author: {
    selectors: [['meta[name="author"]', 'value']],
  },

  date_published: {
    selectors: [['meta[http-equiv="Last-Modified"]', 'value']],
  },

  dek: null,

  lead_image_url: null,

  content: {
    selectors: ['div.body'],

    defaultCleaner: false,

    transforms: {},

    clean: [],
  },
};
// --- src/extractors/custom/techlog.iij.ad.jp/index.js ---
// Extractor configuration for the IIJ engineering blog.
export const TechlogIijAdJpExtractor = {
  domain: 'techlog.iij.ad.jp',

  title: {
    selectors: ['h1.entry-title'],
  },

  author: {
    selectors: ['a[rel="author"]'],
  },

  date_published: {
    selectors: [['time.entry-date', 'datetime']],
  },

  dek: null,

  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },

  content: {
    selectors: ['div.entry-content'],

    defaultCleaner: false,

    transforms: {},

    clean: ['.wp_social_bookmarking_light'],
  },
};

// --- src/extractors/custom/thefederalistpapers.org/index.js ---
// Extractor configuration for The Federalist Papers.
export const ThefederalistpapersOrgExtractor = {
  domain: 'thefederalistpapers.org',

  title: {
    selectors: ['h1.entry-title'],
  },

  author: {
    selectors: ['.author-meta-title', 'main span.entry-author-name'],
  },

  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']],
  },

  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },

  content: {
    selectors: ['.content'],

    transforms: {},

    // Strip sharing widgets, comment sections, and inline-styled cruft.
    clean: [
      'header',
      '.article-sharing',
      '.after-article',
      '.type-commenting',
      '.more-posts',
      ['p[style]'],
    ],
  },
};

// --- src/extractors/custom/thoughtcatalog.com/index.js ---
// Extractor configuration for Thought Catalog.
export const ThoughtcatalogComExtractor = {
  domain: 'thoughtcatalog.com',

  title: {
    selectors: ['h1.title', ['meta[name="og:title"]', 'value']],
  },

  author: {
    selectors: [
      'cite a',
      'div.col-xs-12.article_header div.writer-container.writer-container-inline.writer-no-avatar h4.writer-name',
      'h1.writer-name',
    ],
  },

  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']],
  },

  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },

  content: {
    selectors: ['.entry.post'],

    transforms: {},

    clean: ['.tc_mark', 'figcaption'],
  },
};

// --- src/extractors/custom/timesofindia.indiatimes.com/index.js ---
// Extractor configuration for The Times of India; the byline is exposed
// through the "reporter" extended field rather than a standard author.
export const TimesofindiaIndiatimesComExtractor = {
  domain: 'timesofindia.indiatimes.com',

  title: {
    selectors: ['h1'],
  },

  extend: {
    reporter: {
      selectors: ['div.byline'],
      transforms: {},
    },
  },

  date_published: {
    selectors: ['.byline'],
    format: 'MMM D, YYYY, HH:mm z',
    timezone: 'Asia/Kolkata',
  },

  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },

  content: {
    selectors: ['div.contentwrapper:has(section)'],
    defaultCleaner: false,

    clean: [
      'section',
      'h1',
      '.byline',
      '.img_cptn',
      '.icon_share_wrap',
      'ul[itemtype="https://schema.org/BreadcrumbList"]',
    ],
  },
};

// --- src/extractors/custom/twitter.com/index.js ---
export const TwitterExtractor = {
  domain: 'twitter.com',

  content: {
    transforms: {
      // We're transforming essentially the whole page here.
      // Twitter doesn't have nice selectors, so our initial
      // selector grabs the whole page, then we're re-writing
      // it to fit our needs before we clean it up.
      '.permalink[role=main]': ($node, $) => {
        const tweets = $node.find('.tweet');
        // NOTE(review): the container markup was stripped in the dump this
        // was recovered from; "<div id=\"TWEETS_GO_HERE\"></div>" matches the
        // upstream mercury-parser source — confirm against the repository.
        const $tweetContainer = $('<div id="TWEETS_GO_HERE"></div>');
        $tweetContainer.append(tweets);
        $node.replaceWith($tweetContainer);
      },

      // Twitter wraps @ with s, which
      // renders as a strikethrough
      s: 'span',
    },

    selectors: ['.permalink[role=main]'],

    defaultCleaner: false,

    clean: ['.stream-item-footer', 'button', '.tweet-details-fixer'],
  },

  author: {
    selectors: ['.tweet.permalink-tweet .username'],
  },

  date_published: {
    selectors: [['.permalink-tweet ._timestamp[data-time-ms]', 'data-time-ms']],
  },
};

// --- src/extractors/custom/twitter.com/index.test.js ---
import assert from 'assert';

import Mercury from 'mercury';

const fs = require('fs');

describe('TwitterExtractor', () => {
  it('works with a feature story', async () => {
    const html = fs.readFileSync('./fixtures/twitter.com.html');
    const uri = 'https://twitter.com/KingBeyonceStan/status/745276948213968896';

    const { title, author, date_published } = await Mercury.parse(uri, {
      html,
    });

    assert.equal(title, 'Lina Morgana on Twitter');
    assert.equal(author, '@KingBeyonceStan');
    assert.equal(date_published, '2016-06-21T15:27:25.000Z');
  });
});

// --- src/extractors/custom/uproxx.com/index.js ---
// Extractor configuration for Uproxx; image blocks are rewritten into
// figure/figcaption pairs.
export const UproxxComExtractor = {
  domain: 'uproxx.com',

  title: {
    selectors: ['div.entry-header h1'],
  },

  author: {
    selectors: [['meta[name="qc:author"]', 'value']],
  },

  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']],
  },

  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },

  content: {
    selectors: ['.entry-content'],

    transforms: {
      'div.image': 'figure',
      'div.image .wp-media-credit': 'figcaption',
    },

    clean: [],
  },
};

// --- src/extractors/custom/weekly.ascii.jp/index.js ---
// Extractor configuration for Weekly ASCII (Japanese dates).
export const WeeklyAsciiJpExtractor = {
  domain: 'weekly.ascii.jp',

  title: {
    selectors: ['article h1', 'h1[itemprop="headline"]'],
  },

  author: {
    selectors: ['p.author'],
  },

  date_published: {
    selectors: ['p.date', ['meta[name="odate"]', 'value']],

    format: 'YYYY年MM月DD日 HH:mm',

    timezone: 'Asia/Tokyo',
  },

  dek: null,

  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },

  content: {
    selectors: ['div#contents_detail', 'div.article'],

    transforms: {},

    clean: [],
  },
};

// --- src/extractors/custom/wikipedia.org/index.js ---
// Extractor configuration for Wikipedia. The infobox is converted into a
// figure with a caption; editing chrome and navboxes are stripped.
export const WikipediaExtractor = {
  domain: 'wikipedia.org',
  content: {
    selectors: ['#mw-content-text'],

    defaultCleaner: false,

    // transform top infobox to an image with caption
    transforms: {
      '.infobox img': $node => {
        const $parent = $node.parents('.infobox');
        // Only prepend the first image in .infobox
        if ($parent.children('img').length === 0) {
          $parent.prepend($node);
        }
      },
      '.infobox caption': 'figcaption',
      '.infobox': 'figure',
    },

    // Selectors to remove from the extracted content
    clean: [
      '.mw-editsection',
      'figure tr, figure td, figure tbody',
      '#toc',
      '.navbox',
    ],
  },

  author: 'Wikipedia Contributors',

  title: {
    selectors: ['h2.title'],
  },

  date_published: {
    selectors: ['#footer-info-lastmod'],
  },
};

// --- src/extractors/custom/wired.jp/index.js ---
import URL from 'url';

// Extractor configuration for WIRED Japan. Lazy-loaded images carry their
// real source in data-original, resolved against the placeholder src.
export const WiredJpExtractor = {
  domain: 'wired.jp',

  title: {
    selectors: ['h1[data-testid="ContentHeaderHed"]', 'h1.post-title'],
  },

  author: {
    selectors: [
      ['meta[name="article:author"]', 'value'],
      'p[itemprop="author"]',
    ],
  },

  date_published: {
    selectors: [
      ['meta[name="article:published_time"]', 'value'],
      ['time', 'datetime'],
    ],
  },

  dek: {
    selectors: ['div[class^="ContentHeaderDek"]', '.post-intro'],
  },

  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },

  content: {
    selectors: [
      'div[data-attribute-verso-pattern="article-body"]',
      'article.article-detail',
    ],

    transforms: {
      'img[data-original]': $node => {
        const dataOriginal = $node.attr('data-original');
        const src = $node.attr('src');
        const url = URL.resolve(src, dataOriginal);
        $node.attr('src', url);
      },
    },

    clean: ['.post-category', 'time', 'h1.post-title', '.social-area-syncer'],
  },
};

// --- src/extractors/custom/www.al.com/index.js ---
// Extractor configuration for AL.com.
export const WwwAlComExtractor = {
  domain: 'www.al.com',

  title: {
    selectors: [['meta[name="title"]', 'value']],
  },

  author: {
    selectors: [['meta[name="article_author"]', 'value']],
  },

  date_published: {
    selectors: [['meta[name="article_date_original"]', 'value']],
    timezone: 'EST',
  },

  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },

  content: {
    selectors: ['.entry-content'],

    transforms: {},

    clean: [],
  },
};

// --- src/extractors/custom/www.americanow.com/index.js ---
// Extractor configuration for America Now.
export const WwwAmericanowComExtractor = {
  domain: 'www.americanow.com',

  title: {
    selectors: ['.title', ['meta[name="title"]', 'value']],
  },

  author: {
    selectors: ['.byline'],
  },

  date_published: {
    selectors: [['meta[name="publish_date"]', 'value']],
  },

  dek: {
    // No dek selectors identified for this site yet.
    selectors: [],
  },

  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },

  content: {
    selectors: [['.article-content', '.image', '.body'], '.body'],

    transforms: {},

    clean: ['.article-video-wrapper', '.show-for-small-only'],
  },
};

// --- src/extractors/custom/www.androidcentral.com/index.js ---
// Extractor configuration for Android Central.
export const WwwAndroidcentralComExtractor = {
  domain: 'www.androidcentral.com',

  title: {
    selectors: ['h1', 'h1.main-title'],
  },

  author: {
    selectors: [['meta[name="parsely-author"]', 'value']],
  },

  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']],
  },

  dek: {
    selectors: [['meta[name="description"]', 'value']],
  },

  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },

  content: {
    selectors: ['#article-body'],

    transforms: {},

    clean: ['.intro', 'blockquote'],
  },
};
36 | // The clean selectors will remove anything that matches from 37 | // the result 38 | clean: [], 39 | }, 40 | }; 41 | -------------------------------------------------------------------------------- /src/extractors/custom/www.asahi.com/index.js: -------------------------------------------------------------------------------- 1 | export const WwwAsahiComExtractor = { 2 | domain: 'www.asahi.com', 3 | 4 | title: { 5 | selectors: ['main h1', '.ArticleTitle h1'], 6 | }, 7 | 8 | author: { 9 | selectors: [['meta[name="article:author"]', 'value']], 10 | }, 11 | 12 | date_published: { 13 | selectors: [['meta[name="pubdate"]', 'value']], 14 | }, 15 | 16 | dek: null, 17 | 18 | excerpt: { 19 | selectors: [['meta[name="og:description"]', 'value']], 20 | }, 21 | 22 | lead_image_url: { 23 | selectors: [['meta[name="og:image"]', 'value']], 24 | }, 25 | 26 | content: { 27 | selectors: ['main'], 28 | 29 | defaultCleaner: false, 30 | 31 | transforms: {}, 32 | 33 | clean: ['div.AdMod', 'div.LoginSelectArea', 'time', 'div.notPrint'], 34 | }, 35 | }; 36 | -------------------------------------------------------------------------------- /src/extractors/custom/www.broadwayworld.com/index.js: -------------------------------------------------------------------------------- 1 | // Rename CustomExtractor 2 | // to fit your publication 3 | // (e.g., NYTimesExtractor) 4 | export const BroadwayWorldExtractor = { 5 | domain: 'www.broadwayworld.com', 6 | title: { 7 | selectors: ['h1[itemprop=headline]', 'h1.article-title'], 8 | }, 9 | 10 | author: { 11 | selectors: ['span[itemprop=author]'], 12 | }, 13 | 14 | content: { 15 | selectors: ['div[itemprop=articlebody]'], 16 | 17 | // Is there anything in the content you selected that needs transformed 18 | // before it's consumable content? E.g., unusual lazy loaded images 19 | transforms: {}, 20 | 21 | // Is there anything that is in the result that shouldn't be? 
22 | // The clean selectors will remove anything that matches from 23 | // the result 24 | clean: [], 25 | }, 26 | 27 | date_published: { 28 | selectors: [['meta[itemprop=datePublished]', 'value']], 29 | }, 30 | 31 | lead_image_url: { 32 | selectors: [['meta[name="og:image"]', 'value']], 33 | }, 34 | 35 | dek: { 36 | selectors: [], 37 | }, 38 | 39 | next_page_url: { 40 | selectors: [ 41 | // enter selectors 42 | ], 43 | }, 44 | 45 | excerpt: { 46 | selectors: [ 47 | // enter selectors 48 | ], 49 | }, 50 | }; 51 | -------------------------------------------------------------------------------- /src/extractors/custom/www.bustle.com/index.js: -------------------------------------------------------------------------------- 1 | export const WwwBustleComExtractor = { 2 | domain: 'www.bustle.com', 3 | 4 | title: { 5 | selectors: ['h1', 'h1.post-page__title'], 6 | }, 7 | 8 | author: { 9 | selectors: ['a[href*="profile"]', 'div.content-meta__author'], 10 | }, 11 | 12 | date_published: { 13 | selectors: [['time', 'datetime']], 14 | }, 15 | 16 | lead_image_url: { 17 | selectors: [['meta[name="og:image"]', 'value']], 18 | }, 19 | 20 | content: { 21 | selectors: ['article', '.post-page__body'], 22 | 23 | // Is there anything in the content you selected that needs transformed 24 | // before it's consumable content? E.g., unusual lazy loaded images 25 | transforms: {}, 26 | 27 | // Is there anything that is in the result that shouldn't be? 
28 | // The clean selectors will remove anything that matches from 29 | // the result 30 | clean: [], 31 | }, 32 | }; 33 | -------------------------------------------------------------------------------- /src/extractors/custom/www.cbc.ca/index.js: -------------------------------------------------------------------------------- 1 | export const WwwCbcCaExtractor = { 2 | domain: 'www.cbc.ca', 3 | 4 | title: { 5 | selectors: ['h1'], 6 | }, 7 | 8 | author: { 9 | selectors: ['.authorText', '.bylineDetails'], 10 | }, 11 | 12 | date_published: { 13 | selectors: [['.timeStamp[datetime]', 'datetime']], 14 | }, 15 | 16 | dek: { 17 | selectors: ['.deck'], 18 | }, 19 | 20 | lead_image_url: { 21 | selectors: [['meta[name="og:image"]', 'value']], 22 | }, 23 | 24 | content: { 25 | selectors: ['.story'], 26 | 27 | // Is there anything in the content you selected that needs transformed 28 | // before it's consumable content? E.g., unusual lazy loaded images 29 | transforms: {}, 30 | 31 | // Is there anything that is in the result that shouldn't be? 
32 | // The clean selectors will remove anything that matches from 33 | // the result 34 | clean: [], 35 | }, 36 | }; 37 | -------------------------------------------------------------------------------- /src/extractors/custom/www.cbssports.com/index.js: -------------------------------------------------------------------------------- 1 | export const WwwCbssportsComExtractor = { 2 | domain: 'www.cbssports.com', 3 | 4 | title: { 5 | selectors: ['.Article-headline', '.article-headline'], 6 | }, 7 | 8 | author: { 9 | selectors: ['.ArticleAuthor-nameText', '.author-name'], 10 | }, 11 | 12 | date_published: { 13 | selectors: [['meta[itemprop="datePublished"]', 'value']], 14 | timezone: 'UTC', 15 | }, 16 | 17 | dek: { 18 | selectors: ['.Article-subline', '.article-subline'], 19 | }, 20 | 21 | lead_image_url: { 22 | selectors: [['meta[name="og:image"]', 'value']], 23 | }, 24 | 25 | content: { 26 | selectors: ['.article'], 27 | 28 | // Is there anything in the content you selected that needs transformed 29 | // before it's consumable content? E.g., unusual lazy loaded images 30 | transforms: {}, 31 | 32 | // Is there anything that is in the result that shouldn't be? 
33 | // The clean selectors will remove anything that matches from 34 | // the result 35 | clean: [], 36 | }, 37 | }; 38 | -------------------------------------------------------------------------------- /src/extractors/custom/www.chicagotribune.com/index.js: -------------------------------------------------------------------------------- 1 | export const WwwChicagotribuneComExtractor = { 2 | domain: 'www.chicagotribune.com', 3 | 4 | title: { 5 | selectors: [['meta[name="og:title"]', 'value']], 6 | }, 7 | 8 | author: { 9 | selectors: ['div.article_byline span:first-of-type'], 10 | }, 11 | 12 | date_published: { 13 | selectors: ['time'], 14 | }, 15 | 16 | lead_image_url: { 17 | selectors: [['meta[name="og:image"]', 'value']], 18 | }, 19 | 20 | content: { 21 | selectors: ['article'], 22 | 23 | // Is there anything in the content you selected that needs transformed 24 | // before it's consumable content? E.g., unusual lazy loaded images 25 | transforms: {}, 26 | 27 | // Is there anything that is in the result that shouldn't be? 
28 | // The clean selectors will remove anything that matches from 29 | // the result 30 | clean: [], 31 | }, 32 | }; 33 | -------------------------------------------------------------------------------- /src/extractors/custom/www.cnbc.com/index.js: -------------------------------------------------------------------------------- 1 | export const WwwCnbcComExtractor = { 2 | domain: 'www.cnbc.com', 3 | 4 | title: { 5 | selectors: ['h1.title', 'h1.ArticleHeader-headline'], 6 | }, 7 | 8 | author: { 9 | selectors: [['meta[name="author"]', 'value']], 10 | }, 11 | 12 | date_published: { 13 | selectors: [['meta[name="article:published_time"]', 'value']], 14 | }, 15 | 16 | lead_image_url: { 17 | selectors: [['meta[name="og:image"]', 'value']], 18 | }, 19 | 20 | content: { 21 | selectors: [ 22 | 'div#article_body.content', 23 | 'div.story', 24 | 'div.ArticleBody-articleBody', 25 | ], 26 | 27 | // Is there anything in the content you selected that needs transformed 28 | // before it's consumable content? E.g., unusual lazy loaded images 29 | transforms: {}, 30 | 31 | // Is there anything that is in the result that shouldn't be? 
32 | // The clean selectors will remove anything that matches from 33 | // the result 34 | clean: [], 35 | }, 36 | }; 37 | -------------------------------------------------------------------------------- /src/extractors/custom/www.dmagazine.com/index.js: -------------------------------------------------------------------------------- 1 | export const WwwDmagazineComExtractor = { 2 | domain: 'www.dmagazine.com', 3 | 4 | title: { 5 | selectors: ['h1.story__title'], 6 | }, 7 | 8 | author: { 9 | selectors: ['.story__info .story__info__item:first-child'], 10 | }, 11 | 12 | date_published: { 13 | selectors: [ 14 | // enter selectors 15 | '.story__info', 16 | ], 17 | 18 | timezone: 'America/Chicago', 19 | format: 'MMMM D, YYYY h:mm a', 20 | }, 21 | 22 | dek: { 23 | selectors: ['.story__subhead'], 24 | }, 25 | 26 | lead_image_url: { 27 | selectors: [['article figure a:first-child', 'href']], 28 | }, 29 | 30 | content: { 31 | selectors: ['.story__content'], 32 | 33 | // Is there anything in the content you selected that needs transformed 34 | // before it's consumable content? E.g., unusual lazy loaded images 35 | transforms: {}, 36 | 37 | // Is there anything that is in the result that shouldn't be? 
38 | // The clean selectors will remove anything that matches from 39 | // the result 40 | clean: [], 41 | }, 42 | }; 43 | -------------------------------------------------------------------------------- /src/extractors/custom/www.elecom.co.jp/index.js: -------------------------------------------------------------------------------- 1 | export const WwwElecomCoJpExtractor = { 2 | domain: 'www.elecom.co.jp', 3 | 4 | title: { 5 | selectors: ['title'], 6 | }, 7 | 8 | author: null, 9 | 10 | date_published: { 11 | selectors: ['p.section-last'], 12 | format: 'YYYY.MM.DD', 13 | timezone: 'Asia/Tokyo', 14 | }, 15 | 16 | dek: null, 17 | 18 | lead_image_url: null, 19 | 20 | content: { 21 | selectors: ['td.TableMain2'], 22 | 23 | defaultCleaner: false, 24 | 25 | transforms: { 26 | table: $node => { 27 | $node.attr('width', 'auto'); 28 | }, 29 | }, 30 | 31 | clean: [], 32 | }, 33 | }; 34 | -------------------------------------------------------------------------------- /src/extractors/custom/www.eonline.com/index.js: -------------------------------------------------------------------------------- 1 | export const WwwEonlineComExtractor = { 2 | domain: 'www.eonline.com', 3 | 4 | title: { 5 | selectors: ['h1.article-detail__title', 'h1.article__title'], 6 | }, 7 | 8 | author: { 9 | selectors: ['.article-detail__meta__author', '.entry-meta__author a'], 10 | }, 11 | 12 | date_published: { 13 | selectors: [ 14 | ['meta[name="article:published_time"]', 'value'], 15 | ['meta[itemprop="datePublished"]', 'value'], 16 | ], 17 | }, 18 | 19 | lead_image_url: { 20 | selectors: [['meta[name="og:image"]', 'value']], 21 | }, 22 | 23 | content: { 24 | selectors: [ 25 | ['.article-detail__main-content section'], 26 | ['.post-content section, .post-content div.post-content__image'], 27 | ], 28 | 29 | // Is there anything in the content you selected that needs transformed 30 | // before it's consumable content? 
E.g., unusual lazy loaded images 31 | transforms: { 32 | 'div.post-content__image': 'figure', 33 | 'div.post-content__image .image__credits': 'figcaption', 34 | }, 35 | 36 | // Is there anything that is in the result that shouldn't be? 37 | // The clean selectors will remove anything that matches from 38 | // the result 39 | clean: [], 40 | }, 41 | }; 42 | -------------------------------------------------------------------------------- /src/extractors/custom/www.fastcompany.com/index.js: -------------------------------------------------------------------------------- 1 | export const WwwFastcompanyComExtractor = { 2 | domain: 'www.fastcompany.com', 3 | 4 | title: { 5 | selectors: ['h1'], 6 | }, 7 | 8 | author: { 9 | selectors: [['meta[name="author"]', 'value']], 10 | }, 11 | 12 | date_published: { 13 | selectors: [['meta[name="article:published_time"]', 'value']], 14 | }, 15 | 16 | dek: { 17 | selectors: ['.post__deck'], 18 | }, 19 | 20 | lead_image_url: { 21 | selectors: [['meta[name="og:image"]', 'value']], 22 | }, 23 | 24 | content: { 25 | selectors: ['.post__article'], 26 | }, 27 | }; 28 | -------------------------------------------------------------------------------- /src/extractors/custom/www.fortinet.com/index.js: -------------------------------------------------------------------------------- 1 | export const WwwFortinetComExtractor = { 2 | domain: 'www.fortinet.com', 3 | 4 | title: { 5 | selectors: ['h1'], 6 | }, 7 | 8 | author: { 9 | selectors: ['.b15-blog-meta__author'], 10 | }, 11 | 12 | date_published: { 13 | selectors: [['meta[name="article:published_time"]', 'value']], 14 | }, 15 | 16 | lead_image_url: { 17 | selectors: [['meta[name="og:image"]', 'value']], 18 | }, 19 | 20 | content: { 21 | selectors: [ 22 | 'div.responsivegrid.aem-GridColumn.aem-GridColumn--default--12', 23 | ], 24 | 25 | transforms: { 26 | noscript: $node => { 27 | const $children = $node.children(); 28 | if ($children.length === 1 && $children.get(0).tagName === 'img') { 29 | 
return 'figure'; 30 | } 31 | return null; 32 | }, 33 | }, 34 | }, 35 | }; 36 | -------------------------------------------------------------------------------- /src/extractors/custom/www.gizmodo.jp/index.js: -------------------------------------------------------------------------------- 1 | export const WwwGizmodoJpExtractor = { 2 | domain: 'www.gizmodo.jp', 3 | 4 | title: { 5 | selectors: ['h1.p-post-title'], 6 | }, 7 | 8 | author: { 9 | selectors: ['li.p-post-AssistAuthor'], 10 | }, 11 | 12 | date_published: { 13 | selectors: [['li.p-post-AssistTime time', 'datetime']], 14 | }, 15 | 16 | dek: null, 17 | 18 | lead_image_url: { 19 | selectors: [['meta[name="og:image"]', 'value']], 20 | }, 21 | 22 | content: { 23 | selectors: ['article.p-post'], 24 | 25 | transforms: { 26 | 'img.p-post-thumbnailImage': $node => { 27 | const src = $node.attr('src'); 28 | $node.attr('src', src.replace(/^.*=%27/, '').replace(/%27;$/, '')); 29 | }, 30 | }, 31 | 32 | clean: ['h1.p-post-title', 'ul.p-post-Assist'], 33 | }, 34 | }; 35 | -------------------------------------------------------------------------------- /src/extractors/custom/www.gruene.de/index.js: -------------------------------------------------------------------------------- 1 | export const WwwGrueneDeExtractor = { 2 | domain: 'www.gruene.de', 3 | 4 | title: { 5 | selectors: ['header h1'], 6 | }, 7 | 8 | author: null, 9 | 10 | date_published: null, 11 | 12 | dek: null, 13 | 14 | lead_image_url: { 15 | selectors: [['meta[property="og:image"]', 'content']], 16 | }, 17 | 18 | content: { 19 | // selectors: ['section'], 20 | selectors: [['section header', 'section h2', 'section p', 'section ol']], 21 | 22 | // Is there anything in the content you selected that needs transformed 23 | // before it's consumable content? E.g., unusual lazy loaded images 24 | transforms: {}, 25 | 26 | // Is there anything that is in the result that shouldn't be? 
27 | // The clean selectors will remove anything that matches from 28 | // the result 29 | clean: ['figcaption', 'p[class]'], 30 | }, 31 | }; 32 | -------------------------------------------------------------------------------- /src/extractors/custom/www.infoq.com/index.js: -------------------------------------------------------------------------------- 1 | export const WwwInfoqComExtractor = { 2 | domain: 'www.infoq.com', 3 | 4 | title: { 5 | selectors: ['h1.heading'], 6 | }, 7 | 8 | author: { 9 | selectors: ['div.widget.article__authors'], 10 | }, 11 | 12 | date_published: { 13 | selectors: ['.article__readTime.date'], 14 | format: 'YYYY年MM月DD日', 15 | timezone: 'Asia/Tokyo', 16 | }, 17 | 18 | dek: { 19 | selectors: [['meta[name="og:description"]', 'value']], 20 | }, 21 | 22 | lead_image_url: { 23 | selectors: [['meta[name="og:image"]', 'value']], 24 | }, 25 | 26 | content: { 27 | selectors: ['div.article__data'], 28 | 29 | defaultCleaner: false, 30 | 31 | transforms: {}, 32 | 33 | clean: [], 34 | }, 35 | }; 36 | -------------------------------------------------------------------------------- /src/extractors/custom/www.inquisitr.com/index.js: -------------------------------------------------------------------------------- 1 | export const WwwInquisitrComExtractor = { 2 | domain: 'www.inquisitr.com', 3 | 4 | title: { 5 | selectors: ['h1.entry-title.story--header--title'], 6 | }, 7 | 8 | author: { 9 | selectors: ['div.story--header--author'], 10 | }, 11 | 12 | date_published: { 13 | selectors: [['meta[name="datePublished"]', 'value']], 14 | }, 15 | 16 | lead_image_url: { 17 | selectors: [['meta[name="og:image"]', 'value']], 18 | }, 19 | 20 | content: { 21 | selectors: ['article.story', '.entry-content.'], 22 | 23 | // Is there anything in the content you selected that needs transformed 24 | // before it's consumable content? E.g., unusual lazy loaded images 25 | transforms: {}, 26 | 27 | // Is there anything that is in the result that shouldn't be? 
28 | // The clean selectors will remove anything that matches from 29 | // the result 30 | clean: [ 31 | '.post-category', 32 | '.story--header--socials', 33 | '.story--header--content', 34 | ], 35 | }, 36 | }; 37 | -------------------------------------------------------------------------------- /src/extractors/custom/www.investmentexecutive.com/index.js: -------------------------------------------------------------------------------- 1 | export const WwwInvestmentexecutiveComExtractor = { 2 | domain: 'www.investmentexecutive.com', 3 | 4 | title: { 5 | selectors: ['h1'], 6 | }, 7 | 8 | author: { 9 | selectors: ['div[itemprop="author"]'], 10 | }, 11 | 12 | date_published: { 13 | selectors: [['meta[itemprop="datePublished"]', 'value']], 14 | }, 15 | 16 | dek: { 17 | selectors: [['meta[name="og:description"]', 'value']], 18 | }, 19 | 20 | lead_image_url: { 21 | selectors: [['meta[name="og:image"]', 'value']], 22 | }, 23 | 24 | content: { 25 | selectors: ['section.article-body'], 26 | 27 | clean: ['.hidden'], 28 | }, 29 | }; 30 | -------------------------------------------------------------------------------- /src/extractors/custom/www.ipa.go.jp/index.js: -------------------------------------------------------------------------------- 1 | export const WwwIpaGoJpExtractor = { 2 | domain: 'www.ipa.go.jp', 3 | 4 | title: { 5 | selectors: ['h1'], 6 | }, 7 | 8 | author: null, 9 | 10 | date_published: { 11 | selectors: ['p.ipar_text_right'], 12 | format: 'YYYY年M月D日', 13 | timezone: 'Asia/Tokyo', 14 | }, 15 | 16 | dek: null, 17 | 18 | lead_image_url: null, 19 | 20 | content: { 21 | selectors: ['#ipar_main'], 22 | 23 | defaultCleaner: false, 24 | 25 | transforms: {}, 26 | 27 | clean: ['p.ipar_text_right'], 28 | }, 29 | }; 30 | -------------------------------------------------------------------------------- /src/extractors/custom/www.itmedia.co.jp/index.js: -------------------------------------------------------------------------------- 1 | export const WwwItmediaCoJpExtractor 
= { 2 | domain: 'www.itmedia.co.jp', 3 | 4 | supportedDomains: [ 5 | 'www.atmarkit.co.jp', 6 | 'techtarget.itmedia.co.jp', 7 | 'nlab.itmedia.co.jp', 8 | ], 9 | 10 | title: { 11 | selectors: ['#cmsTitle h1'], 12 | }, 13 | 14 | author: { 15 | selectors: ['#byline'], 16 | }, 17 | 18 | date_published: { 19 | selectors: [['meta[name="article:modified_time"]', 'value']], 20 | }, 21 | 22 | dek: { 23 | selectors: ['#cmsAbstract h2'], 24 | }, 25 | 26 | lead_image_url: { 27 | selectors: [['meta[name="og:image"]', 'value']], 28 | }, 29 | 30 | content: { 31 | selectors: ['#cmsBody'], 32 | 33 | defaultCleaner: false, 34 | 35 | transforms: {}, 36 | 37 | clean: ['#snsSharebox'], 38 | }, 39 | }; 40 | -------------------------------------------------------------------------------- /src/extractors/custom/www.jnsa.org/index.js: -------------------------------------------------------------------------------- 1 | export const WwwJnsaOrgExtractor = { 2 | domain: 'www.jnsa.org', 3 | 4 | title: { 5 | selectors: ['#wgtitle h2'], 6 | }, 7 | 8 | author: null, 9 | 10 | date_published: null, 11 | 12 | dek: null, 13 | 14 | excerpt: { 15 | selectors: [['meta[name="og:description"]', 'value']], 16 | }, 17 | 18 | lead_image_url: { 19 | selectors: [['meta[name="og:image"]', 'value']], 20 | }, 21 | 22 | content: { 23 | selectors: ['#main_area'], 24 | 25 | transforms: {}, 26 | 27 | clean: ['#pankuzu', '#side'], 28 | }, 29 | }; 30 | -------------------------------------------------------------------------------- /src/extractors/custom/www.ladbible.com/index.js: -------------------------------------------------------------------------------- 1 | export const WwwLadbibleComExtractor = { 2 | domain: 'www.ladbible.com', 3 | 4 | title: { 5 | selectors: ['h1'], 6 | }, 7 | 8 | author: { 9 | selectors: ['[class*=Byline]'], 10 | }, 11 | 12 | date_published: { 13 | selectors: ['time'], 14 | timezone: 'Europe/London', 15 | }, 16 | 17 | lead_image_url: { 18 | selectors: [['meta[name="og:image"]', 'value']], 19 | 
}, 20 | 21 | content: { 22 | selectors: ['[class*=ArticleContainer]'], 23 | clean: [ 24 | 'time', 25 | 'source', 26 | 'a[href^="https://www.ladbible.com/"]', 27 | 'picture', 28 | '[class*=StyledCardBlock]', 29 | ], 30 | }, 31 | }; 32 | -------------------------------------------------------------------------------- /src/extractors/custom/www.latimes.com/index.js: -------------------------------------------------------------------------------- 1 | export const WwwLatimesComExtractor = { 2 | domain: 'www.latimes.com', 3 | 4 | title: { 5 | selectors: ['h1.headline', '.trb_ar_hl'], 6 | }, 7 | 8 | author: { 9 | selectors: [ 10 | 'a[data-click="standardBylineAuthorName"]', 11 | ['meta[name="author"]', 'value'], 12 | ], 13 | }, 14 | 15 | date_published: { 16 | selectors: [ 17 | ['meta[name="article:published_time"]', 'value'], 18 | ['meta[itemprop="datePublished"]', 'value'], 19 | ], 20 | }, 21 | 22 | lead_image_url: { 23 | selectors: [['meta[name="og:image"]', 'value']], 24 | }, 25 | 26 | content: { 27 | selectors: ['.page-article-body', '.trb_ar_main'], 28 | 29 | // Is there anything in the content you selected that needs transformed 30 | // before it's consumable content? E.g., unusual lazy loaded images 31 | transforms: { 32 | '.trb_ar_la': $node => { 33 | const $figure = $node.find('figure'); 34 | $node.replaceWith($figure); 35 | }, 36 | }, 37 | 38 | // Is there anything that is in the result that shouldn't be? 
39 | // The clean selectors will remove anything that matches from 40 | // the result 41 | clean: ['.trb_ar_by', '.trb_ar_cr'], 42 | }, 43 | }; 44 | -------------------------------------------------------------------------------- /src/extractors/custom/www.lemonde.fr/index.js: -------------------------------------------------------------------------------- 1 | export const WwwLemondeFrExtractor = { 2 | domain: 'www.lemonde.fr', 3 | 4 | title: { 5 | selectors: ['h1.article__title'], 6 | }, 7 | 8 | author: { 9 | selectors: ['.author__name'], 10 | }, 11 | 12 | date_published: { 13 | selectors: [['meta[name="og:article:published_time"]', 'value']], 14 | }, 15 | 16 | dek: { 17 | selectors: ['.article__desc'], 18 | }, 19 | 20 | lead_image_url: { 21 | selectors: [['meta[name="og:image"]', 'value']], 22 | }, 23 | 24 | content: { 25 | selectors: ['.article__content'], 26 | 27 | transforms: {}, 28 | 29 | clean: ['figcaption'], 30 | }, 31 | }; 32 | -------------------------------------------------------------------------------- /src/extractors/custom/www.lifehacker.jp/index.js: -------------------------------------------------------------------------------- 1 | export const WwwLifehackerJpExtractor = { 2 | domain: 'www.lifehacker.jp', 3 | 4 | title: { 5 | selectors: ['h1[class^="article_pArticle_Title"]', 'h1.lh-summary-title'], 6 | }, 7 | 8 | author: { 9 | selectors: [ 10 | ['meta[name="author"]', 'value'], 11 | 'p.lh-entryDetailInner--credit', 12 | ], 13 | }, 14 | 15 | date_published: { 16 | selectors: [ 17 | ['meta[name="article:published_time"]', 'value'], 18 | ['div.lh-entryDetail-header time', 'datetime'], 19 | ], 20 | }, 21 | 22 | dek: null, 23 | 24 | lead_image_url: { 25 | selectors: [['meta[name="og:image"]', 'value']], 26 | }, 27 | 28 | content: { 29 | selectors: [ 30 | 'div[class^="article_pArticle_Body__"]', 31 | 'div.lh-entryDetail-body', 32 | ], 33 | 34 | transforms: { 35 | 'img.lazyload': $node => { 36 | const src = $node.attr('src'); 37 | $node.attr('src', 
src.replace(/^.*=%27/, '').replace(/%27;$/, '')); 38 | }, 39 | }, 40 | 41 | clean: ['p.lh-entryDetailInner--credit'], 42 | }, 43 | }; 44 | -------------------------------------------------------------------------------- /src/extractors/custom/www.macrumors.com/index.js: -------------------------------------------------------------------------------- 1 | export const WwwMacrumorsComExtractor = { 2 | domain: 'www.macrumors.com', 3 | 4 | title: { 5 | selectors: ['h1', 'h1.title'], 6 | }, 7 | 8 | author: { 9 | selectors: ['article a[rel="author"]', '.author-url'], 10 | }, 11 | 12 | date_published: { 13 | selectors: [['time', 'datetime']], 14 | 15 | timezone: 'America/Los_Angeles', 16 | }, 17 | 18 | dek: { 19 | selectors: [['meta[name="description"]', 'value']], 20 | }, 21 | 22 | lead_image_url: { 23 | selectors: [['meta[name="og:image"]', 'value']], 24 | }, 25 | 26 | content: { 27 | selectors: ['article', '.article'], 28 | 29 | // Is there anything in the content you selected that needs transformed 30 | // before it's consumable content? E.g., unusual lazy loaded images 31 | transforms: {}, 32 | 33 | // Is there anything that is in the result that shouldn't be? 
34 | // The clean selectors will remove anything that matches from 35 | // the result 36 | clean: [], 37 | }, 38 | }; 39 | -------------------------------------------------------------------------------- /src/extractors/custom/www.mentalfloss.com/index.js: -------------------------------------------------------------------------------- 1 | export const WwwMentalflossComExtractor = { 2 | domain: 'www.mentalfloss.com', 3 | 4 | title: { 5 | selectors: [ 6 | ['meta[name="og:title"]', 'value'], 7 | 'h1.title', 8 | '.title-group', 9 | '.inner', 10 | ], 11 | }, 12 | 13 | author: { 14 | selectors: [ 15 | 'a[data-vars-label*="authors"]', 16 | '.field-name-field-enhanced-authors', 17 | ], 18 | }, 19 | 20 | date_published: { 21 | selectors: [ 22 | ['meta[name="article:published_time"]', 'value'], 23 | '.date-display-single', 24 | ], 25 | timezone: 'America/New_York', 26 | }, 27 | 28 | lead_image_url: { 29 | selectors: [['meta[name="og:image"]', 'value']], 30 | }, 31 | 32 | content: { 33 | selectors: ['article main', 'div.field.field-name-body'], 34 | 35 | // Is there anything in the content you selected that needs transformed 36 | // before it's consumable content? E.g., unusual lazy loaded images 37 | transforms: {}, 38 | 39 | // Is there anything that is in the result that shouldn't be? 
40 | // The clean selectors will remove anything that matches from 41 | // the result 42 | clean: ['small'], 43 | }, 44 | }; 45 | -------------------------------------------------------------------------------- /src/extractors/custom/www.miamiherald.com/index.js: -------------------------------------------------------------------------------- 1 | export const WwwMiamiheraldComExtractor = { 2 | domain: 'www.miamiherald.com', 3 | 4 | title: { 5 | selectors: ['h1.title'], 6 | }, 7 | 8 | date_published: { 9 | selectors: ['p.published-date'], 10 | 11 | timezone: 'America/New_York', 12 | }, 13 | 14 | lead_image_url: { 15 | selectors: [['meta[name="og:image"]', 'value']], 16 | }, 17 | 18 | content: { 19 | selectors: ['div.dateline-storybody'], 20 | 21 | // Is there anything in the content you selected that needs transformed 22 | // before it's consumable content? E.g., unusual lazy loaded images 23 | transforms: {}, 24 | 25 | // Is there anything that is in the result that shouldn't be? 26 | // The clean selectors will remove anything that matches from 27 | // the result 28 | clean: [], 29 | }, 30 | }; 31 | -------------------------------------------------------------------------------- /src/extractors/custom/www.moongift.jp/index.js: -------------------------------------------------------------------------------- 1 | export const WwwMoongiftJpExtractor = { 2 | domain: 'www.moongift.jp', 3 | 4 | title: { 5 | selectors: ['h1.title a'], 6 | }, 7 | 8 | author: null, 9 | 10 | date_published: { 11 | selectors: ['ul.meta li:not(.social):first-of-type'], 12 | timezone: 'Asia/Tokyo', 13 | }, 14 | 15 | dek: { 16 | selectors: [['meta[name="og:description"]', 'value']], 17 | }, 18 | 19 | lead_image_url: { 20 | selectors: [['meta[name="og:image"]', 'value']], 21 | }, 22 | 23 | content: { 24 | selectors: ['#main'], 25 | 26 | transforms: {}, 27 | 28 | clean: ['ul.mg_service.cf'], 29 | }, 30 | }; 31 | -------------------------------------------------------------------------------- 
/src/extractors/custom/www.msn.com/index.js: -------------------------------------------------------------------------------- 1 | // Rename CustomExtractor 2 | // to fit your publication 3 | // (e.g., NYTimesExtractor) 4 | export const MSNExtractor = { 5 | domain: 'www.msn.com', 6 | title: { 7 | selectors: [ 8 | 'h1', 9 | // enter title selectors 10 | ], 11 | }, 12 | 13 | author: { 14 | selectors: [ 15 | 'span.authorname-txt', 16 | // enter author selectors 17 | ], 18 | }, 19 | 20 | content: { 21 | selectors: [ 22 | 'div.richtext', 23 | // enter content selectors 24 | ], 25 | 26 | // Is there anything in the content you selected that needs transformed 27 | // before it's consumable content? E.g., unusual lazy loaded images 28 | transforms: [], 29 | 30 | // Is there anything that is in the result that shouldn't be? 31 | // The clean selectors will remove anything that matches from 32 | // the result 33 | clean: ['span.caption'], 34 | }, 35 | 36 | date_published: { 37 | selectors: ['span.time'], 38 | }, 39 | 40 | lead_image_url: { 41 | selectors: [], 42 | }, 43 | 44 | dek: { 45 | selectors: [], 46 | }, 47 | 48 | next_page_url: null, 49 | 50 | excerpt: null, 51 | }; 52 | -------------------------------------------------------------------------------- /src/extractors/custom/www.nbcnews.com/index.js: -------------------------------------------------------------------------------- 1 | export const WwwNbcnewsComExtractor = { 2 | domain: 'www.nbcnews.com', 3 | 4 | title: { 5 | selectors: ['div.article-hero-headline h1', 'div.article-hed h1'], 6 | }, 7 | 8 | author: { 9 | selectors: [ 10 | 'div.article-inline-byline span.byline-name', 11 | 'span.byline_author', 12 | ], 13 | }, 14 | 15 | date_published: { 16 | selectors: [ 17 | ['meta[name="article:published"]', 'value'], 18 | ['.flag_article-wrapper time.timestamp_article[datetime]', 'datetime'], 19 | '.flag_article-wrapper time', 20 | ], 21 | 22 | timezone: 'America/New_York', 23 | }, 24 | 25 | lead_image_url: { 26 | 
selectors: [['meta[name="og:image"]', 'value']], 27 | }, 28 | 29 | content: { 30 | selectors: ['div.article-body__content', 'div.article-body'], 31 | 32 | // Is there anything in the content you selected that needs transformed 33 | // before it's consumable content? E.g., unusual lazy loaded images 34 | transforms: {}, 35 | 36 | // Is there anything that is in the result that shouldn't be? 37 | // The clean selectors will remove anything that matches from 38 | // the result 39 | clean: [], 40 | }, 41 | }; 42 | -------------------------------------------------------------------------------- /src/extractors/custom/www.npr.org/index.js: -------------------------------------------------------------------------------- 1 | export const WwwNprOrgExtractor = { 2 | domain: 'www.npr.org', 3 | 4 | title: { 5 | selectors: ['h1', '.storytitle'], 6 | }, 7 | 8 | author: { 9 | selectors: ['p.byline__name.byline__name--block'], 10 | }, 11 | 12 | date_published: { 13 | selectors: [ 14 | ['.dateblock time[datetime]', 'datetime'], 15 | ['meta[name="date"]', 'value'], 16 | ], 17 | }, 18 | 19 | lead_image_url: { 20 | selectors: [ 21 | ['meta[name="og:image"]', 'value'], 22 | ['meta[name="twitter:image:src"]', 'value'], 23 | ], 24 | }, 25 | 26 | content: { 27 | selectors: ['.storytext'], 28 | 29 | // Is there anything in the content you selected that needs transformed 30 | // before it's consumable content? E.g., unusual lazy loaded images 31 | transforms: { 32 | '.bucketwrap.image': 'figure', 33 | '.bucketwrap.image .credit-caption': 'figcaption', 34 | }, 35 | 36 | // Is there anything that is in the result that shouldn't be? 
37 | // The clean selectors will remove anything that matches from 38 | // the result 39 | clean: ['div.enlarge_measure'], 40 | }, 41 | }; 42 | -------------------------------------------------------------------------------- /src/extractors/custom/www.nydailynews.com/index.js: -------------------------------------------------------------------------------- 1 | export const WwwNydailynewsComExtractor = { 2 | domain: 'www.nydailynews.com', 3 | 4 | title: { 5 | selectors: ['h1.headline', 'h1#ra-headline'], 6 | }, 7 | 8 | author: { 9 | selectors: [ 10 | '.article_byline span', 11 | ['meta[name="parsely-author"]', 'value'], 12 | ], 13 | }, 14 | 15 | date_published: { 16 | selectors: ['time', ['meta[name="sailthru.date"]', 'value']], 17 | }, 18 | 19 | lead_image_url: { 20 | selectors: [['meta[name="og:image"]', 'value']], 21 | }, 22 | 23 | content: { 24 | selectors: ['article', 'article#ra-body'], 25 | 26 | // Is there anything in the content you selected that needs transformed 27 | // before it's consumable content? E.g., unusual lazy loaded images 28 | transforms: {}, 29 | 30 | // Is there anything that is in the result that shouldn't be? 
31 | // The clean selectors will remove anything that matches from 32 | // the result 33 | clean: ['dl#ra-tags', '.ra-related', 'a.ra-editor', 'dl#ra-share-bottom'], 34 | }, 35 | }; 36 | -------------------------------------------------------------------------------- /src/extractors/custom/www.opposingviews.com/index.js: -------------------------------------------------------------------------------- 1 | export const WwwOpposingviewsComExtractor = { 2 | domain: 'www.opposingviews.com', 3 | 4 | title: { 5 | selectors: ['h1.m-detail-header--title', 'h1.title'], 6 | }, 7 | 8 | author: { 9 | selectors: [['meta[name="author"]', 'value'], 'div.date span span a'], 10 | }, 11 | 12 | date_published: { 13 | selectors: [ 14 | ['meta[name="published"]', 'value'], 15 | ['meta[name="publish_date"]', 'value'], 16 | ], 17 | }, 18 | 19 | dek: { 20 | selectors: [ 21 | // enter selectors 22 | ], 23 | }, 24 | 25 | lead_image_url: { 26 | selectors: [['meta[name="og:image"]', 'value']], 27 | }, 28 | 29 | content: { 30 | selectors: ['.m-detail--body', '.article-content'], 31 | 32 | // Is there anything in the content you selected that needs transformed 33 | // before it's consumable content? E.g., unusual lazy loaded images 34 | transforms: {}, 35 | 36 | // Is there anything that is in the result that shouldn't be? 
37 | // The clean selectors will remove anything that matches from 38 | // the result 39 | clean: ['.show-for-small-only'], 40 | }, 41 | }; 42 | -------------------------------------------------------------------------------- /src/extractors/custom/www.oreilly.co.jp/index.js: -------------------------------------------------------------------------------- 1 | export const WwwOreillyCoJpExtractor = { 2 | domain: 'www.oreilly.co.jp', 3 | 4 | title: { 5 | selectors: [['meta[name="og:title"]', 'value'], 'h3'], 6 | }, 7 | 8 | author: { 9 | selectors: ['span[itemprop="author"]', 'li[itemprop="author"]'], 10 | }, 11 | 12 | date_published: { 13 | selectors: [ 14 | ['dd[itemprop="datePublished"]', 'content'], 15 | ['meta[itemprop="datePublished"]', 'value'], 16 | ], 17 | timezone: 'Asia/Tokyo', 18 | }, 19 | 20 | dek: null, 21 | 22 | lead_image_url: { 23 | selectors: [ 24 | ['meta[name="og:image:secure_url"]', 'value'], 25 | ['meta[name="og:image"]', 'value'], 26 | ], 27 | }, 28 | 29 | content: { 30 | selectors: ['section.detail', '#content'], 31 | 32 | defaultCleaner: false, 33 | 34 | transforms: {}, 35 | 36 | clean: ['.social-tools'], 37 | }, 38 | }; 39 | -------------------------------------------------------------------------------- /src/extractors/custom/www.ossnews.jp/index.js: -------------------------------------------------------------------------------- 1 | export const WwwOssnewsJpExtractor = { 2 | domain: 'www.ossnews.jp', 3 | 4 | title: { 5 | selectors: ['#alpha-block h1.hxnewstitle'], 6 | }, 7 | 8 | author: null, 9 | 10 | date_published: { 11 | selectors: ['p.fs12'], 12 | format: 'YYYY年MM月DD日 HH:mm', 13 | timezone: 'Asia/Tokyo', 14 | }, 15 | 16 | dek: null, 17 | 18 | lead_image_url: { 19 | selectors: [['meta[name="og:image"]', 'value']], 20 | }, 21 | 22 | content: { 23 | selectors: ['#alpha-block .section:has(h1.hxnewstitle)'], 24 | 25 | defaultCleaner: false, 26 | 27 | transforms: {}, 28 | 29 | clean: [], 30 | }, 31 | }; 32 | 
-------------------------------------------------------------------------------- /src/extractors/custom/www.phoronix.com/index.js: -------------------------------------------------------------------------------- 1 | export const WwwPhoronixComExtractor = { 2 | domain: 'www.phoronix.com', 3 | 4 | title: { 5 | selectors: ['article h1', 'article header'], 6 | }, 7 | 8 | author: { 9 | selectors: ['.author a:first-child'], 10 | }, 11 | 12 | date_published: { 13 | selectors: ['.author'], 14 | // 1 June 2019 at 08:34 PM EDT 15 | format: 'D MMMM YYYY at hh:mm', 16 | timezone: 'America/New_York', 17 | }, 18 | 19 | dek: null, 20 | 21 | lead_image_url: null, 22 | 23 | content: { 24 | selectors: ['.content'], 25 | 26 | // Is there anything in the content you selected that needs transformed 27 | // before it's consumable content? E.g., unusual lazy loaded images 28 | transforms: {}, 29 | 30 | // Is there anything that is in the result that shouldn't be? 31 | // The clean selectors will remove anything that matches from 32 | // the result 33 | clean: [], 34 | }, 35 | }; 36 | -------------------------------------------------------------------------------- /src/extractors/custom/www.politico.com/index.js: -------------------------------------------------------------------------------- 1 | export const PoliticoExtractor = { 2 | domain: 'www.politico.com', 3 | title: { 4 | selectors: [['meta[name="og:title"]', 'value']], 5 | }, 6 | 7 | author: { 8 | selectors: [ 9 | ['div[itemprop="author"] meta[itemprop="name"]', 'value'], 10 | '.story-meta__authors .vcard', 11 | '.story-main-content .byline .vcard', 12 | ], 13 | }, 14 | 15 | content: { 16 | selectors: [['.story-text'], '.story-main-content', '.story-core'], 17 | 18 | transforms: [], 19 | 20 | clean: ['figcaption', '.story-meta', '.ad'], 21 | }, 22 | 23 | date_published: { 24 | selectors: [ 25 | ['time[itemprop="datePublished"]', 'datetime'], 26 | ['.story-meta__details time[datetime]', 'datetime'], 27 | ['.story-main-content 
.timestamp time[datetime]', 'datetime'], 28 | ], 29 | timezone: 'America/New_York', 30 | }, 31 | 32 | lead_image_url: { 33 | selectors: [['meta[name="og:image"]', 'value']], 34 | }, 35 | 36 | dek: { 37 | selectors: [['meta[name="og:description"]', 'value']], 38 | }, 39 | }; 40 | -------------------------------------------------------------------------------- /src/extractors/custom/www.popsugar.com/index.js: -------------------------------------------------------------------------------- 1 | export const WwwPopsugarComExtractor = { 2 | domain: 'www.popsugar.com', 3 | 4 | title: { 5 | selectors: ['h2.post-title', 'title-text'], 6 | }, 7 | 8 | author: { 9 | selectors: [['meta[name="article:author"]', 'value']], 10 | }, 11 | 12 | date_published: { 13 | selectors: [['meta[name="article:published_time"]', 'value']], 14 | }, 15 | 16 | lead_image_url: { 17 | selectors: [['meta[name="og:image"]', 'value']], 18 | }, 19 | 20 | content: { 21 | selectors: ['#content'], 22 | 23 | // Is there anything in the content you selected that needs transformed 24 | // before it's consumable content? E.g., unusual lazy loaded images 25 | transforms: {}, 26 | 27 | // Is there anything that is in the result that shouldn't be? 
28 | // The clean selectors will remove anything that matches from 29 | // the result 30 | clean: ['.share-copy-title', '.post-tags', '.reactions'], 31 | }, 32 | }; 33 | -------------------------------------------------------------------------------- /src/extractors/custom/www.prospectmagazine.co.uk/index.js: -------------------------------------------------------------------------------- 1 | export const WwwProspectmagazineCoUkExtractor = { 2 | domain: 'www.prospectmagazine.co.uk', 3 | 4 | title: { 5 | selectors: ['.blog-header__title', '.page-title'], 6 | }, 7 | 8 | author: { 9 | selectors: ['.blog-header__author-link', '.aside_author .title'], 10 | }, 11 | 12 | date_published: { 13 | selectors: [['meta[name="article:published_time"]', 'value'], '.post-info'], 14 | 15 | timezone: 'Europe/London', 16 | }, 17 | 18 | dek: { 19 | selectors: ['.blog-header__description', '.page-subtitle'], 20 | }, 21 | 22 | lead_image_url: { 23 | selectors: [['meta[name="og:image"]', 'value']], 24 | }, 25 | 26 | content: { 27 | selectors: ['.blog__container', 'article .post_content'], 28 | 29 | // Is there anything in the content you selected that needs transformed 30 | // before it's consumable content? E.g., unusual lazy loaded images 31 | transforms: {}, 32 | 33 | // Is there anything that is in the result that shouldn't be? 
34 | // The clean selectors will remove anything that matches from 35 | // the result 36 | clean: [], 37 | }, 38 | }; 39 | -------------------------------------------------------------------------------- /src/extractors/custom/www.publickey1.jp/index.js: -------------------------------------------------------------------------------- 1 | export const WwwPublickey1JpExtractor = { 2 | domain: 'www.publickey1.jp', 3 | 4 | title: { 5 | selectors: ['h1'], 6 | }, 7 | 8 | author: { 9 | selectors: ['.bloggerinchief p:first-of-type', '#subcol p:has(img)'], 10 | }, 11 | 12 | date_published: { 13 | selectors: ['div.pubdate'], 14 | format: 'YYYY年MM月DD日', 15 | timezone: 'Asia/Tokyo', 16 | }, 17 | 18 | dek: null, 19 | 20 | lead_image_url: { 21 | selectors: [['meta[name="og:image"]', 'value']], 22 | }, 23 | 24 | content: { 25 | selectors: ['#maincol'], 26 | 27 | defaultCleaner: false, 28 | 29 | transforms: {}, 30 | 31 | clean: ['#breadcrumbs', 'div.sbm', 'div.ad_footer'], 32 | }, 33 | }; 34 | -------------------------------------------------------------------------------- /src/extractors/custom/www.qdaily.com/index.js: -------------------------------------------------------------------------------- 1 | export const WwwQdailyComExtractor = { 2 | domain: 'www.qdaily.com', 3 | 4 | title: { 5 | selectors: ['h2', 'h2.title'], 6 | }, 7 | 8 | author: { 9 | selectors: ['.name'], 10 | }, 11 | 12 | date_published: { 13 | selectors: [['.date.smart-date', 'data-origindate']], 14 | }, 15 | 16 | dek: { 17 | selectors: ['.excerpt'], 18 | }, 19 | 20 | lead_image_url: { 21 | selectors: [['.article-detail-hd img', 'src']], 22 | }, 23 | 24 | content: { 25 | selectors: ['.detail'], 26 | 27 | // Is there anything in the content you selected that needs transformed 28 | // before it's consumable content? E.g., unusual lazy loaded images 29 | transforms: {}, 30 | 31 | // Is there anything that is in the result that shouldn't be? 
32 | // The clean selectors will remove anything that matches from 33 | // the result 34 | clean: ['.lazyload', '.lazylad', '.lazylood'], 35 | }, 36 | }; 37 | -------------------------------------------------------------------------------- /src/extractors/custom/www.rawstory.com/index.js: -------------------------------------------------------------------------------- 1 | export const WwwRawstoryComExtractor = { 2 | domain: 'www.rawstory.com', 3 | 4 | title: { 5 | selectors: [['meta[name="og:title"]', 'value'], '.blog-title'], 6 | }, 7 | 8 | author: { 9 | selectors: [ 10 | 'div.main-post-head .social-author__name', 11 | '.blog-author a:first-of-type', 12 | ], 13 | }, 14 | 15 | date_published: { 16 | selectors: [ 17 | ['meta[name="article:published_time"]', 'value'], 18 | '.blog-author a:last-of-type', 19 | ], 20 | 21 | timezone: 'EST', 22 | }, 23 | 24 | lead_image_url: { 25 | selectors: [['meta[name="og:image"]', 'value']], 26 | }, 27 | 28 | content: { 29 | selectors: ['.post-body', '.blog-content'], 30 | 31 | // Is there anything in the content you selected that needs transformed 32 | // before it's consumable content? E.g., unusual lazy loaded images 33 | transforms: {}, 34 | 35 | // Is there anything that is in the result that shouldn't be? 
36 | // The clean selectors will remove anything that matches from 37 | // the result 38 | clean: [], 39 | }, 40 | }; 41 | -------------------------------------------------------------------------------- /src/extractors/custom/www.rbbtoday.com/index.js: -------------------------------------------------------------------------------- 1 | export const WwwRbbtodayComExtractor = { 2 | domain: 'www.rbbtoday.com', 3 | 4 | title: { 5 | selectors: ['h1'], 6 | }, 7 | 8 | author: { 9 | selectors: ['.writer.writer-name'], 10 | }, 11 | 12 | date_published: { 13 | selectors: [['header time', 'datetime']], 14 | }, 15 | 16 | dek: { 17 | selectors: [['meta[name="description"]', 'value'], '.arti-summary'], 18 | }, 19 | 20 | lead_image_url: { 21 | selectors: [['meta[name="og:image"]', 'value']], 22 | }, 23 | 24 | content: { 25 | selectors: ['.arti-content'], 26 | 27 | transforms: {}, 28 | 29 | clean: ['.arti-giga'], 30 | }, 31 | }; 32 | -------------------------------------------------------------------------------- /src/extractors/custom/www.recode.net/index.js: -------------------------------------------------------------------------------- 1 | export const WwwRecodeNetExtractor = { 2 | domain: 'www.recode.net', 3 | 4 | title: { 5 | selectors: ['h1.c-page-title'], 6 | }, 7 | 8 | author: { 9 | selectors: [['meta[name="author"]', 'value']], 10 | }, 11 | 12 | date_published: { 13 | selectors: [['meta[name="article:published_time"]', 'value']], 14 | }, 15 | 16 | dek: { 17 | selectors: ['h2.c-entry-summary.p-dek'], 18 | }, 19 | 20 | lead_image_url: { 21 | selectors: [['meta[name="og:image"]', 'value']], 22 | }, 23 | 24 | content: { 25 | selectors: [ 26 | ['figure.e-image--hero', '.c-entry-content'], 27 | '.c-entry-content', 28 | ], 29 | 30 | // Is there anything in the content you selected that needs transformed 31 | // before it's consumable content? E.g., unusual lazy loaded images 32 | transforms: {}, 33 | 34 | // Is there anything that is in the result that shouldn't be? 
35 | // The clean selectors will remove anything that matches from 36 | // the result 37 | clean: [], 38 | }, 39 | }; 40 | -------------------------------------------------------------------------------- /src/extractors/custom/www.reuters.com/index.js: -------------------------------------------------------------------------------- 1 | export const WwwReutersComExtractor = { 2 | domain: 'www.reuters.com', 3 | 4 | title: { 5 | selectors: ['h1[class*="ArticleHeader-headline-"]', 'h1.article-headline'], 6 | }, 7 | 8 | author: { 9 | selectors: [['meta[name="og:article:author"]', 'value'], '.author'], 10 | }, 11 | 12 | date_published: { 13 | selectors: [['meta[name="og:article:published_time"]', 'value']], 14 | }, 15 | 16 | lead_image_url: { 17 | selectors: [['meta[name="og:image"]', 'value']], 18 | }, 19 | 20 | content: { 21 | selectors: ['div.ArticleBodyWrapper', '#article-text'], 22 | 23 | // Is there anything in the content you selected that needs transformed 24 | // before it's consumable content? E.g., unusual lazy loaded images 25 | transforms: { 26 | '.article-subtitle': 'h4', 27 | }, 28 | 29 | // Is there anything that is in the result that shouldn't be? 
30 | // The clean selectors will remove anything that matches from 31 | // the result 32 | clean: [ 33 | 'div[class^="ArticleBody-byline-container-"]', 34 | '#article-byline .author', 35 | ], 36 | }, 37 | }; 38 | -------------------------------------------------------------------------------- /src/extractors/custom/www.sanwa.co.jp/index.js: -------------------------------------------------------------------------------- 1 | export const WwwSanwaCoJpExtractor = { 2 | domain: 'www.sanwa.co.jp', 3 | 4 | title: { 5 | selectors: ['#newsContent h1'], 6 | }, 7 | 8 | author: null, 9 | 10 | date_published: { 11 | selectors: ['dl.date'], 12 | format: 'YYYY.MM.DD', 13 | timezone: 'Asia/Tokyo', 14 | }, 15 | 16 | dek: { 17 | selectors: [['meta[name="og:description"]', 'value']], 18 | }, 19 | 20 | lead_image_url: { 21 | selectors: [['meta[name="og:image"]', 'value']], 22 | }, 23 | 24 | content: { 25 | selectors: ['#newsContent'], 26 | 27 | defaultCleaner: false, 28 | 29 | transforms: {}, 30 | 31 | clean: ['#smartphone', 'div.sns_box', 'div.contentFoot'], 32 | }, 33 | }; 34 | -------------------------------------------------------------------------------- /src/extractors/custom/www.sbnation.com/index.js: -------------------------------------------------------------------------------- 1 | export const WwwSbnationComExtractor = { 2 | domain: 'www.sbnation.com', 3 | 4 | title: { 5 | selectors: ['h1.c-page-title'], 6 | }, 7 | 8 | author: { 9 | selectors: [['meta[name="author"]', 'value']], 10 | }, 11 | 12 | date_published: { 13 | selectors: [['meta[name="article:published_time"]', 'value']], 14 | }, 15 | 16 | dek: { 17 | selectors: ['p.c-entry-summary.p-dek', 'h2.c-entry-summary.p-dek'], 18 | }, 19 | 20 | lead_image_url: { 21 | selectors: [['meta[name="og:image"]', 'value']], 22 | }, 23 | 24 | content: { 25 | selectors: ['div.c-entry-content'], 26 | 27 | // Is there anything in the content you selected that needs transformed 28 | // before it's consumable content? 
E.g., unusual lazy loaded images 29 | transforms: {}, 30 | 31 | // Is there anything that is in the result that shouldn't be? 32 | // The clean selectors will remove anything that matches from 33 | // the result 34 | clean: [], 35 | }, 36 | }; 37 | -------------------------------------------------------------------------------- /src/extractors/custom/www.slate.com/index.js: -------------------------------------------------------------------------------- 1 | export const WwwSlateComExtractor = { 2 | domain: 'www.slate.com', 3 | 4 | title: { 5 | selectors: ['.hed', 'h1'], 6 | }, 7 | 8 | author: { 9 | selectors: ['a[rel=author]'], 10 | }, 11 | 12 | date_published: { 13 | selectors: ['.pub-date'], 14 | 15 | timezone: 'America/New_York', 16 | }, 17 | 18 | dek: { 19 | selectors: ['.dek'], 20 | }, 21 | 22 | lead_image_url: { 23 | selectors: [['meta[name="og:image"]', 'value']], 24 | }, 25 | 26 | content: { 27 | selectors: ['.body'], 28 | 29 | // Is there anything in the content you selected that needs transformed 30 | // before it's consumable content? E.g., unusual lazy loaded images 31 | transforms: {}, 32 | 33 | // Is there anything that is in the result that shouldn't be? 
34 | // The clean selectors will remove anything that matches from 35 | // the result 36 | clean: [ 37 | '.about-the-author', 38 | '.pullquote', 39 | '.newsletter-signup-component', 40 | '.top-comment', 41 | ], 42 | }, 43 | }; 44 | -------------------------------------------------------------------------------- /src/extractors/custom/www.theguardian.com/index.js: -------------------------------------------------------------------------------- 1 | export const WwwTheguardianComExtractor = { 2 | domain: 'www.theguardian.com', 3 | 4 | title: { 5 | selectors: ['h1', '.content__headline'], 6 | }, 7 | 8 | author: { 9 | selectors: ['address[data-link-name="byline"]', 'p.byline'], 10 | }, 11 | 12 | date_published: { 13 | selectors: [['meta[name="article:published_time"]', 'value']], 14 | }, 15 | 16 | dek: { 17 | selectors: ['div[data-gu-name="standfirst"]', '.content__standfirst'], 18 | }, 19 | 20 | lead_image_url: { 21 | selectors: [['meta[name="og:image"]', 'value']], 22 | }, 23 | 24 | content: { 25 | selectors: ['#maincontent', '.content__article-body'], 26 | 27 | // Is there anything in the content you selected that needs transformed 28 | // before it's consumable content? E.g., unusual lazy loaded images 29 | transforms: {}, 30 | 31 | // Is there anything that is in the result that shouldn't be? 
32 | // The clean selectors will remove anything that matches from 33 | // the result 34 | clean: ['.hide-on-mobile', '.inline-icon'], 35 | }, 36 | }; 37 | -------------------------------------------------------------------------------- /src/extractors/custom/www.thepennyhoarder.com/index.js: -------------------------------------------------------------------------------- 1 | export const WwwThepennyhoarderComExtractor = { 2 | domain: 'www.thepennyhoarder.com', 3 | 4 | title: { 5 | selectors: [['meta[name="dcterms.title"]', 'value']], 6 | }, 7 | 8 | author: { 9 | selectors: [['link[rel="author"]', 'title']], 10 | }, 11 | 12 | date_published: { 13 | selectors: [['meta[name="article:published_time"]', 'value']], 14 | }, 15 | 16 | lead_image_url: { 17 | selectors: [['meta[name="og:image"]', 'value']], 18 | }, 19 | 20 | content: { 21 | selectors: [ 22 | ['.post-img', '.post-text'], 23 | '.post-text', 24 | '.single-post-content-inner', 25 | ], 26 | 27 | // Is there anything in the content you selected that needs transformed 28 | // before it's consumable content? E.g., unusual lazy loaded images 29 | transforms: {}, 30 | 31 | // Is there anything that is in the result that shouldn't be? 
32 | // The clean selectors will remove anything that matches from 33 | // the result 34 | clean: [], 35 | }, 36 | }; 37 | -------------------------------------------------------------------------------- /src/extractors/custom/www.thepoliticalinsider.com/index.js: -------------------------------------------------------------------------------- 1 | export const WwwThepoliticalinsiderComExtractor = { 2 | domain: 'www.thepoliticalinsider.com', 3 | 4 | title: { 5 | selectors: [['meta[name="sailthru.title"]', 'value']], 6 | }, 7 | 8 | author: { 9 | selectors: [['meta[name="sailthru.author"]', 'value']], 10 | }, 11 | 12 | date_published: { 13 | selectors: [['meta[name="sailthru.date"]', 'value']], 14 | timezone: 'America/New_York', 15 | }, 16 | 17 | dek: { 18 | selectors: [ 19 | // enter selectors 20 | ], 21 | }, 22 | 23 | lead_image_url: { 24 | selectors: [ 25 | ['meta[name="og:image"]', 'value'], // enter selectors 26 | ], 27 | }, 28 | 29 | content: { 30 | selectors: ['div#article-body'], 31 | 32 | // Is there anything in the content you selected that needs transformed 33 | // before it's consumable content? E.g., unusual lazy loaded images 34 | transforms: {}, 35 | 36 | // Is there anything that is in the result that shouldn't be? 
37 | // The clean selectors will remove anything that matches from 38 | // the result 39 | clean: [], 40 | }, 41 | }; 42 | -------------------------------------------------------------------------------- /src/extractors/custom/www.tmz.com/index.js: -------------------------------------------------------------------------------- 1 | export const WwwTmzComExtractor = { 2 | domain: 'www.tmz.com', 3 | 4 | title: { 5 | selectors: ['.post-title-breadcrumb', 'h1', '.headline'], 6 | }, 7 | 8 | author: 'TMZ STAFF', 9 | 10 | date_published: { 11 | selectors: ['.article__published-at', '.article-posted-date'], 12 | 13 | timezone: 'America/Los_Angeles', 14 | }, 15 | 16 | dek: { 17 | selectors: [ 18 | // enter selectors 19 | ], 20 | }, 21 | 22 | lead_image_url: { 23 | selectors: [['meta[name="og:image"]', 'value']], 24 | }, 25 | 26 | content: { 27 | selectors: ['.article__blocks', '.article-content', '.all-post-body'], 28 | 29 | // Is there anything in the content you selected that needs transformed 30 | // before it's consumable content? E.g., unusual lazy loaded images 31 | transforms: {}, 32 | 33 | // Is there anything that is in the result that shouldn't be? 
34 | // The clean selectors will remove anything that matches from 35 | // the result 36 | clean: ['.lightbox-link'], 37 | }, 38 | }; 39 | -------------------------------------------------------------------------------- /src/extractors/custom/www.today.com/index.js: -------------------------------------------------------------------------------- 1 | export const WwwTodayComExtractor = { 2 | domain: 'www.today.com', 3 | 4 | title: { 5 | selectors: ['h1.article-hero-headline__htag', 'h1.entry-headline'], 6 | }, 7 | 8 | author: { 9 | selectors: ['span.byline-name', ['meta[name="author"]', 'value']], 10 | }, 11 | 12 | date_published: { 13 | selectors: ['time[datetime]', ['meta[name="DC.date.issued"]', 'value']], 14 | }, 15 | 16 | lead_image_url: { 17 | selectors: [['meta[name="og:image"]', 'value']], 18 | }, 19 | 20 | content: { 21 | selectors: ['div.article-body__content', '.entry-container'], 22 | 23 | // Is there anything in the content you selected that needs transformed 24 | // before it's consumable content? E.g., unusual lazy loaded images 25 | transforms: {}, 26 | 27 | // Is there anything that is in the result that shouldn't be? 
28 | // The clean selectors will remove anything that matches from 29 | // the result 30 | clean: ['.label-comment'], 31 | }, 32 | }; 33 | -------------------------------------------------------------------------------- /src/extractors/custom/www.usmagazine.com/index.js: -------------------------------------------------------------------------------- 1 | export const WwwUsmagazineComExtractor = { 2 | domain: 'www.usmagazine.com', 3 | 4 | title: { 5 | selectors: ['header h1'], 6 | }, 7 | 8 | author: { 9 | selectors: ['a.author', 'a.article-byline.tracked-offpage'], 10 | }, 11 | 12 | date_published: { 13 | timezone: 'America/New_York', 14 | 15 | selectors: [['meta[name="article:published_time"]', 'value']], 16 | }, 17 | 18 | lead_image_url: { 19 | selectors: [['meta[name="og:image"]', 'value']], 20 | }, 21 | 22 | content: { 23 | selectors: ['div.article-content'], 24 | 25 | // Is there anything in the content you selected that needs transformed 26 | // before it's consumable content? E.g., unusual lazy loaded images 27 | transforms: {}, 28 | 29 | // Is there anything that is in the result that shouldn't be? 
30 | // The clean selectors will remove anything that matches from 31 | // the result 32 | clean: ['.module-related'], 33 | }, 34 | }; 35 | -------------------------------------------------------------------------------- /src/extractors/custom/www.westernjournalism.com/index.js: -------------------------------------------------------------------------------- 1 | export const WwwWesternjournalismComExtractor = { 2 | domain: 'www.westernjournalism.com', 3 | 4 | title: { 5 | selectors: ['title', 'h1.entry-title'], 6 | }, 7 | 8 | author: { 9 | selectors: [['meta[name="author"]', 'value']], 10 | }, 11 | 12 | date_published: { 13 | selectors: [['meta[name="DC.date.issued"]', 'value']], 14 | }, 15 | 16 | dek: { 17 | selectors: ['.subtitle'], 18 | }, 19 | 20 | lead_image_url: { 21 | selectors: [['meta[name="og:image"]', 'value']], 22 | }, 23 | 24 | content: { 25 | selectors: ['div.article-sharing.top + div'], 26 | 27 | // Is there anything in the content you selected that needs transformed 28 | // before it's consumable content? E.g., unusual lazy loaded images 29 | transforms: {}, 30 | 31 | // Is there anything that is in the result that shouldn't be? 
32 | // The clean selectors will remove anything that matches from 33 | // the result 34 | clean: ['.ad-notice-small'], 35 | }, 36 | }; 37 | -------------------------------------------------------------------------------- /src/extractors/custom/www.yomiuri.co.jp/index.js: -------------------------------------------------------------------------------- 1 | export const WwwYomiuriCoJpExtractor = { 2 | domain: 'www.yomiuri.co.jp', 3 | 4 | title: { 5 | selectors: ['h1.title-article.c-article-title'], 6 | }, 7 | 8 | author: null, 9 | 10 | date_published: { 11 | selectors: [['meta[name="article:published_time"]', 'value']], 12 | }, 13 | 14 | dek: null, 15 | 16 | lead_image_url: { 17 | selectors: [['meta[name="og:image"]', 'value']], 18 | }, 19 | 20 | content: { 21 | selectors: ['div.p-main-contents'], 22 | 23 | transforms: {}, 24 | 25 | clean: [], 26 | }, 27 | }; 28 | -------------------------------------------------------------------------------- /src/extractors/detect-by-html.js: -------------------------------------------------------------------------------- 1 | import { MediumExtractor, BloggerExtractor } from './custom'; 2 | 3 | const Detectors = { 4 | 'meta[name="al:ios:app_name"][value="Medium"]': MediumExtractor, 5 | 'meta[name="generator"][value="blogger"]': BloggerExtractor, 6 | }; 7 | 8 | export default function detectByHtml($) { 9 | const selector = Reflect.ownKeys(Detectors).find(s => $(s).length > 0); 10 | 11 | return Detectors[selector]; 12 | } 13 | -------------------------------------------------------------------------------- /src/extractors/detect-by-html.test.js: -------------------------------------------------------------------------------- 1 | import assert from 'assert'; 2 | import cheerio from 'cheerio'; 3 | 4 | import detectByHtml from './detect-by-html'; 5 | 6 | describe('detectByHtml', () => { 7 | it('detects a medium post from the html', () => { 8 | const $ = cheerio.load( 9 | '' 10 | ); 11 | 12 | assert.equal(detectByHtml($).domain, 
'medium.com'); 13 | }); 14 | 15 | it('returns nothing if no match is found', () => { 16 | const $ = cheerio.load('
'); 17 | 18 | assert.equal(detectByHtml($), null); 19 | }); 20 | }); 21 | -------------------------------------------------------------------------------- /src/extractors/fixtures/postlight.com/index.js: -------------------------------------------------------------------------------- 1 | var customExtractor = { 2 | domain: 'postlight.com', 3 | title: { 4 | selectors: ['h1'], 5 | }, 6 | author: { 7 | selectors: ['.byline-name'], 8 | }, 9 | content: { 10 | selectors: ['article'], 11 | }, 12 | extend: { 13 | uniqueKeyFromFixture: { 14 | selectors: ['.single__hero-category'], 15 | }, 16 | }, 17 | }; 18 | 19 | module.exports = customExtractor; 20 | -------------------------------------------------------------------------------- /src/extractors/generic/content/extract-best-node.js: -------------------------------------------------------------------------------- 1 | import { stripUnlikelyCandidates, convertToParagraphs } from 'utils/dom'; 2 | 3 | import { scoreContent, findTopCandidate } from './scoring'; 4 | 5 | // Using a variety of scoring techniques, extract the content most 6 | // likely to be article text. 7 | // 8 | // If strip_unlikely_candidates is True, remove any elements that 9 | // match certain criteria first. (Like, does this element have a 10 | // classname of "comment") 11 | // 12 | // If weight_nodes is True, use classNames and IDs to determine the 13 | // worthiness of nodes. 
import { stripUnlikelyCandidates, convertToParagraphs } from 'utils/dom';

import { scoreContent, findTopCandidate } from './scoring';

// Using a variety of scoring techniques, extract the content most
// likely to be article text.
//
// opts.stripUnlikelyCandidates: when true, first drop elements that
// match junk criteria (e.g. a classname of "comment").
//
// opts.weightNodes: when true, use classNames and IDs to weight the
// worthiness of nodes during scoring.
//
// Returns a cheerio selection for the top-scoring candidate node.
export default function extractBestNode($, opts) {
  const { stripUnlikelyCandidates: shouldStrip, weightNodes } = opts;

  let $doc = shouldStrip ? stripUnlikelyCandidates($) : $;
  $doc = convertToParagraphs($doc);
  $doc = scoreContent($doc, weightNodes);

  return findTopCandidate($doc);
}
import { getOrInitScore, setScore } from './index';

// Add `amount` to the node's current score (initializing the score from
// the node's tag type if it has none) and persist the new total.
// Returns the node unchanged for chaining.
export default function addScore($node, $, amount) {
  try {
    const updated = getOrInitScore($node, $) + amount;
    setScore($node, $, updated);
  } catch (err) {
    // Deliberate best-effort: scoreNode can throw on odd nodes; a node
    // that cannot be scored is simply left unscored.
  }

  return $node;
}

Foo

'); 10 | const $node = $('p').first(); 11 | addScore($node, $, 25); 12 | assert.equal(getScore($node), 50); 13 | }); 14 | 15 | it('adds score if score not yet set (assumes score is 0)', () => { 16 | const $ = cheerio.load('

Foo

import { addScore } from './index';

// Propagate a quarter of a child's score up to its parent node, so
// containers of good content accumulate credit. Returns the child node.
export default function addToParent(node, $, score) {
  const $parent = node.parent();
  if ($parent) {
    addScore($parent, $, score / 4);
  }

  return node;
}

Foo

import { NON_TOP_CANDIDATE_TAGS_RE } from './constants';
import { getScore } from './index';
import mergeSiblings from './merge-siblings';

// After we've calculated scores, loop through all of the possible
// candidate nodes we found and find the one with the highest score.
export default function findTopCandidate($) {
  let $candidate;
  let topScore = 0;

  $('[score]').each((index, node) => {
    // Ignore tags like BR, HR, etc
    if (NON_TOP_CANDIDATE_TAGS_RE.test(node.tagName)) {
      return;
    }

    const $node = $(node);
    const score = getScore($node);

    if (score > topScore) {
      topScore = score;
      $candidate = $node;
    }
  });

  // If we don't have a candidate, return the body or, when there is no
  // body element, whatever the first element is.
  // FIX: the previous `$('body') || $('*').first()` never reached the
  // second operand — a cheerio selection object is always truthy even
  // when empty — so body-less documents returned an empty selection.
  if (!$candidate) {
    const $body = $('body');
    return $body.length > 0 ? $body : $('*').first();
  }

  $candidate = mergeSiblings($candidate, topScore, $);

  return $candidate;
}
// Read a node's score from its `score` attribute.
// Returns null when no score has been set (or when the stored score
// parses to 0/NaN, matching the original falsy check).
export default function getScore($node) {
  const parsed = Number.parseFloat($node.attr('score'));
  if (!parsed) {
    return null;
  }
  return parsed;
}

Foo

'); 10 | assert.equal(getScore($('p').first()), null); 11 | }); 12 | 13 | it('returns 25 if the node has a score attr of 25', () => { 14 | const $ = cheerio.load('

Foo

// Award one point per comma in the text (commas correlate with
// sentence-like article prose).
export default function scoreCommas(text) {
  let commas = 0;
  for (const ch of text) {
    if (ch === ',') {
      commas += 1;
    }
  }
  return commas;
}
// Matches p/pre tags, which get a stronger tamp-down (legacy behavior
// carried over from the original readability source).
const P_OR_PRE_RE = /^(p|pre)$/i;

// Award up to 3 bonus points based on text length: roughly one point
// per 50 characters, minus a tag-dependent offset, clamped to [0, 3].
export default function scoreLength(textLength, tagName = 'p') {
  const chunks = textLength / 50;

  if (chunks <= 0) {
    return 0;
  }

  // p/pre are penalized harder than other tags, per the upstream source.
  const offset = P_OR_PRE_RE.test(tagName) ? 2 : 1.25;
  const bonus = chunks - offset;

  return Math.min(Math.max(bonus, 0), 3);
}
import { scoreParagraph } from './index';
import {
  PARAGRAPH_SCORE_TAGS,
  CHILD_CONTENT_TAGS,
  BAD_TAGS,
} from './constants';

// Score an individual node. Paragraph-like tags get a full content
// analysis; every other tag gets a flat score by category.
// (Check order matters and is preserved from the original.)
export default function scoreNode($node) {
  const { tagName } = $node.get(0);

  if (PARAGRAPH_SCORE_TAGS.test(tagName)) {
    return scoreParagraph($node);
  }

  const lowered = tagName.toLowerCase();
  if (lowered === 'div') {
    return 5;
  }
  if (CHILD_CONTENT_TAGS.test(tagName)) {
    return 3;
  }
  if (BAD_TAGS.test(tagName)) {
    return -3;
  }
  if (lowered === 'th') {
    return -5;
  }

  return 0;
}
// Persist `score` on the node as a `score` attribute so later scoring
// passes (getScore/addScore) can read it back. Returns the node for
// chaining. The `$` argument is unused but kept for signature parity
// with the other scoring helpers.
export default function setScore($node, $, score) {
  $node.attr('score', score);
  return $node;
}

Foo

// There is currently no reliable generic selector for deks, so the
// generic extractor always reports "no dek found" (null) until a more
// robust option exists. (The original selector-based source is kept in
// project history for reference.)
const GenericDekExtractor = {
  extract: () => null,
};

export default GenericDekExtractor;
import ellipsize from 'ellipsize';

import { extractFromMeta, stripTags } from 'utils/dom';

import { EXCERPT_META_SELECTORS } from './constants';

// Collapse runs of whitespace to single spaces, trim, and truncate to
// `maxLength` characters with a trailing ellipsis.
export function clean(content, $, maxLength = 200) {
  const collapsed = content.replace(/[\s\n]+/g, ' ').trim();
  return ellipsize(collapsed, maxLength, { ellipse: '…' });
}

// Extract a short excerpt: prefer description meta tags, otherwise
// derive one from the beginning of the extracted content.
const GenericExcerptExtractor = {
  extract({ $, content, metaCache }) {
    const excerpt = extractFromMeta($, EXCERPT_META_SELECTORS, metaCache);
    if (excerpt) {
      return clean(stripTags(excerpt, $));
    }

    // Fall back to excerpting from the extracted content; slicing 5x
    // the max length keeps the text() call cheap on long articles.
    const maxLength = 200;
    const shortContent = content.slice(0, maxLength * 5);
    return clean($(shortContent).text(), $, maxLength);
  },
};

export default GenericExcerptExtractor;
page url', () => { 10 | const html = fs.readFileSync('./fixtures/arstechnica.com.html', 'utf8'); 11 | const $ = cheerio.load(html); 12 | const url = 13 | 'https://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/'; 14 | const next = 15 | 'https://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/2'; 16 | 17 | const nextPage = GenericNextPageUrlExtractor.extract({ 18 | $, 19 | url, 20 | }); 21 | 22 | assert.equal(nextPage, next); 23 | }); 24 | 25 | it('returns null if there is no likely next page', () => { 26 | const html = '

HI

// Penalize links whose href does not contain the article's base URL.
// Such a link could still be the next page, but the odds are lower.
// Example:
// http://www.actionscript.org/resources/articles/745/1/JavaScript-and-VBScript-Injection-in-ActionScript-3/Page1.html
export default function scoreBaseUrl(href, baseRegex) {
  return baseRegex.test(href) ? 0 : -25;
}
10 |
11 | Next page 12 |
13 |
14 | `); 15 | 16 | assert.equal(scoreByParents($('a').first()), 25); 17 | }); 18 | 19 | it('returns -25 if parent sig looks like a comment', () => { 20 | const $ = cheerio.load(` 21 |
22 |
23 | Next page 24 |
25 |
import { NEXT_LINK_TEXT_RE, CAP_LINK_TEXT_RE } from '../constants';

// "Cap" links are terminal links like "last". As implemented, the -65
// penalty fires only when the link text matches BOTH the cap pattern
// and the next pattern; cap-only text scores 0.
// NOTE(review): the upstream comment describes the opposite intent
// ("also next" should be fine), but the unit tests pin this exact
// behavior, so it is preserved as-is.
export default function scoreCapLinks(linkData) {
  const looksLikeCap = CAP_LINK_TEXT_RE.test(linkData);
  const looksLikeNext = NEXT_LINK_TEXT_RE.test(linkData);

  return looksLikeCap && looksLikeNext ? -65 : 0;
}
import { IS_DIGIT_RE } from 'utils/text/constants';

// If the link text parses as a number, give it a minor bonus with a
// bias toward lower-numbered pages, so purely numeric pagination links
// still sort sensibly by score. Non-numeric text scores 0.
export default function scoreLinkText(linkText, pageNum) {
  if (!IS_DIGIT_RE.test(linkText.trim())) {
    return 0;
  }

  const linkPage = parseInt(linkText, 10);

  // Page 1 was already fetched on the first call: strongly penalize it.
  // Otherwise award a small bonus that shrinks up to page 10.
  let score = linkPage < 2 ? -30 : Math.max(0, 10 - linkPage);

  // A link pointing at the current page or earlier is a very bad sign.
  if (pageNum && pageNum >= linkPage) {
    score -= 50;
  }

  return score;
}
// Bonus when the link itself contains a page number. WordPress is
// intentionally excluded: its `?p=123` style matches this check even
// though those are entirely separate documents.
export default function scorePageInLink(pageNum, isWp) {
  return pageNum && !isWp ? 50 : 0;
}
import difflib from 'difflib';

// Only when we already have a real candidate (score > 0) — because the
// diff is computationally expensive — compare the candidate href to the
// article URL and adjust the score by their % similarity.
// Subtracting .1 from the diff when computing the modifier means URLs
// less than 10% different earn a bonus instead of a penalty:
//   3% different  = +17.5 points
//   10% different = 0 points
//   20% different = -25 points
export default function scoreSimilarity(score, articleUrl, href) {
  if (score <= 0) {
    return 0;
  }

  const matcher = new difflib.SequenceMatcher(null, articleUrl, href);
  const diffPercent = 1.0 - matcher.ratio();
  const diffModifier = -(250 * (diffPercent - 0.2));

  return score + diffModifier;
}
// Count words by parsing the HTML with cheerio: grab the first <div>,
// normalize whitespace, and count whitespace-separated tokens.
const getWordCount = content => {
  const $ = cheerio.load(content);
  const text = normalizeSpaces(
    $('div')
      .first()
      .text()
  );
  return text.split(/\s/).length;
};

// Fallback counter: strip tags with regexes instead of a full HTML
// parse, collapse whitespace, then count space-separated tokens.
const getWordCountAlt = content => {
  const stripped = content
    .replace(/<[^>]*>/g, ' ')
    .replace(/\s+/g, ' ')
    .trim();
  return stripped.split(' ').length;
};

const GenericWordCountExtractor = {
  extract({ content }) {
    // A count of exactly 1 usually means the DOM-based counter found no
    // wrapping <div> to read; fall back to the regex-based counter.
    const count = getWordCount(content);
    return count === 1 ? getWordCountAlt(content) : count;
  },
};

export default GenericWordCountExtractor;
10 |

One two three.

11 |

Four five six.

12 |

Seven eight nine.

13 |

Ten eleven twelve.

// Pick the best extractor for a page.
//
// Lookup order: custom extractors registered through the API (exact
// hostname, then base domain), bundled site-specific extractors (same
// order), extractors detected from the page markup, and finally the
// generic extractor as the catch-all.
export default function getExtractor(url, parsedUrl, $) {
  const { hostname } = parsedUrl || URL.parse(url);

  // e.g. "www.example.com" -> "example.com"
  const baseDomain = hostname
    .split('.')
    .slice(-2)
    .join('.');

  const candidates = [
    apiExtractors[hostname],
    apiExtractors[baseDomain],
    Extractors[hostname],
    Extractors[baseDomain],
  ];

  return candidates.find(Boolean) || detectByHtml($) || GenericExtractor;
}
-------------------------------------------------------------------------------- 1 | import assert from 'assert'; 2 | import cheerio from 'cheerio'; 3 | 4 | import clean from './clean'; 5 | 6 | describe('clean($)', () => { 7 | it('removes script elements', () => { 8 | const html = "
"; 9 | const $ = cheerio.load(html); 10 | 11 | assert.equal(clean($).html(), '
'); 12 | }); 13 | 14 | it('removes style elements', () => { 15 | const html = '
'; 16 | const $ = cheerio.load(html); 17 | 18 | assert.equal(clean($).html(), '
'); 19 | }); 20 | 21 | it('removes comments', () => { 22 | const html = '
HI
'; 23 | const $ = cheerio.load(html); 24 | 25 | assert.equal(clean($).html(), '
HI
// Rewrite each `meta[from]` attribute to `to`, preserving its value.
// Returns the same cheerio instance for chaining.
function convertMetaProp($, from, to) {
  $(`meta[${from}]`).each((_, node) => {
    const $node = $(node);

    $node.attr(to, $node.attr(from));
    $node.removeAttr(from);
  });

  return $;
}

// For ease of use in extracting from meta tags, replace the "content"
// attribute on meta tags with the "value" attribute.
//
// In addition, normalize 'property' attributes to 'name' for ease of
// querying later. See, e.g., og or twitter meta tags.
export default function normalizeMetaTags($) {
  return convertMetaProp(
    convertMetaProp($, 'content', 'value'),
    'property',
    'name'
  );
}
'' 22 | : ''; 23 | 24 | const $ = cheerio.load(''); 25 | const result = normalizeMetaTags($).html(); 26 | 27 | assert.equal(result, test); 28 | }); 29 | }); 30 | -------------------------------------------------------------------------------- /src/resource/utils/index.js: -------------------------------------------------------------------------------- 1 | export { default as fetchResource } from './fetch-resource'; 2 | -------------------------------------------------------------------------------- /src/shims/iconv-lite.js: -------------------------------------------------------------------------------- 1 | // this is a shim for the browser build; 2 | // iconv-lite doubles build size, and we 3 | // don't need it for already rendered text 4 | const iconv = { 5 | encodingExists: () => false, 6 | decode: s => s, 7 | }; 8 | 9 | export default iconv; 10 | -------------------------------------------------------------------------------- /src/utils/dom/brs-to-ps.js: -------------------------------------------------------------------------------- 1 | import { paragraphize } from './index'; 2 | 3 | // ## NOTES: 4 | // Another good candidate for refactoring/optimizing. 5 | // Very imperative code, I don't love it. - AP 6 | 7 | // Given cheerio object, convert consecutive
// ## NOTES:
// Another good candidate for refactoring/optimizing.
// Very imperative code, I don't love it. - AP

// Given a cheerio object, collapse runs of consecutive <br> tags into
// <p> tags instead: every <br> in a run except the last is removed,
// and the final one is handed to paragraphize() to become a paragraph.
//
// :param $: A cheerio object
export default function brsToPs($) {
  let inBrRun = false;

  $('br').each((_, br) => {
    const $br = $(br);
    const following = $br.next().get(0);
    const followedByBr =
      following && following.tagName.toLowerCase() === 'br';

    if (followedByBr) {
      // Middle of a run: drop this <br> and keep collapsing.
      inBrRun = true;
      $br.remove();
    } else if (inBrRun) {
      // Last <br> of a run: convert it into a paragraph.
      inBrRun = false;
      paragraphize(br, $, true);
    }
  });

  return $;
}
$article.parent() : $article, 34 | $ 35 | ); 36 | } 37 | -------------------------------------------------------------------------------- /src/utils/dom/clean-attributes.test.js: -------------------------------------------------------------------------------- 1 | import cheerio from 'cheerio'; 2 | 3 | import { assertClean } from 'test-helpers'; 4 | 5 | import { cleanAttributes } from './index'; 6 | 7 | describe('cleanAttributes($)', () => { 8 | it('removes style attributes from nodes', () => { 9 | const $ = cheerio.load(` 10 |

11 |

What do you think?

12 |
13 | `); 14 | 15 | const result = cleanAttributes($('*').first(), $); 16 | assertClean( 17 | $.html(result), 18 | ` 19 |
20 |

What do you think?

21 |
22 | ` 23 | ); 24 | }); 25 | 26 | it('removes align attributes from nodes', () => { 27 | const $ = cheerio.load(` 28 |
29 |

What do you think?

30 |
31 | `); 32 | 33 | const result = cleanAttributes($('*').first(), $); 34 | assertClean( 35 | $.html(result), 36 | ` 37 |
38 |

What do you think?

39 |
40 | ` 41 | ); 42 | }); 43 | }); 44 | -------------------------------------------------------------------------------- /src/utils/dom/clean-h-ones.js: -------------------------------------------------------------------------------- 1 | import { convertNodeTo } from 'utils/dom'; 2 | 3 | // H1 tags are typically the article title, which should be extracted 4 | // by the title extractor instead. If there's less than 3 of them (<3), 5 | // strip them. Otherwise, turn 'em into H2s. 6 | export default function cleanHOnes(article, $) { 7 | const $hOnes = $('h1', article); 8 | 9 | if ($hOnes.length < 3) { 10 | $hOnes.each((index, node) => $(node).remove()); 11 | } else { 12 | $hOnes.each((index, node) => { 13 | convertNodeTo($(node), $, 'h2'); 14 | }); 15 | } 16 | 17 | return $; 18 | } 19 | -------------------------------------------------------------------------------- /src/utils/dom/clean-headers.js: -------------------------------------------------------------------------------- 1 | import { getWeight } from 'extractors/generic/content/scoring'; 2 | 3 | import { HEADER_TAG_LIST } from './constants'; 4 | import { normalizeSpaces } from '../text'; 5 | 6 | export default function cleanHeaders($article, $, title = '') { 7 | $(HEADER_TAG_LIST, $article).each((index, header) => { 8 | const $header = $(header); 9 | // Remove any headers that appear before all other p tags in the 10 | // document. This probably means that it was part of the title, a 11 | // subtitle or something else extraneous like a datestamp or byline, 12 | // all of which should be handled by other metadata handling. 13 | if ($($header, $article).prevAll('p').length === 0) { 14 | return $header.remove(); 15 | } 16 | 17 | // Remove any headers that match the title exactly. 18 | if (normalizeSpaces($(header).text()) === title) { 19 | return $header.remove(); 20 | } 21 | 22 | // If this header has a negative weight, it's probably junk. 23 | // Get rid of it. 
// Strip images that explicitly declare very small dimensions — they
// are most likely shims or icons, which aren't useful for reading.
// For kept images, drop the explicit height so they can scale with
// width without breaking the aspect ratio.
function cleanForHeight($img, $) {
  // Missing/unparseable dimensions fall back to 20 so they're treated
  // as "big enough to keep".
  const rawHeight = parseInt($img.attr('height'), 10);
  const width = parseInt($img.attr('width'), 10) || 20;
  const height = rawHeight || 20;

  if (height < 10 || width < 10) {
    $img.remove();
  } else if (rawHeight) {
    // Never pin a height; let the image scale with its width.
    $img.removeAttr('height');
  }

  return $;
}

// Cleans out images where the source string matches transparent/spacer/etc
// TODO This seems very aggressive - AP
function removeSpacers($img, $) {
  if (SPACER_RE.test($img.attr('src'))) {
    $img.remove();
  }

  return $;
}

export default function cleanImages($article, $) {
  $article.find('img').each((_, img) => {
    const $img = $(img);

    cleanForHeight($img, $);
    removeSpacers($img, $);
  });

  return $;
}
// Return a node's attributes as a plain { name: value } object,
// normalizing across cheerio nodes (which expose a plain `attribs`
// object) and raw DOM/jquery nodes (which expose an array-like
// `attributes` list of { name, value } entries).
export default function getAttrs(node) {
  const { attribs, attributes } = node;

  if (!attribs && attributes) {
    const attrs = Reflect.ownKeys(attributes).reduce((acc, index) => {
      const attr = attributes[index];

      // Skip array-like bookkeeping entries (e.g. `length`, methods on a
      // NamedNodeMap) and anything without a real name/value pair.
      // Note: an empty-string value is a legitimate attribute value
      // (e.g. alt=""), so only null/undefined values are dropped here —
      // the previous `!attr.value` falsy check silently discarded
      // empty-valued attributes, diverging from the cheerio path.
      if (!attr || !attr.name || attr.value == null) return acc;

      acc[attr.name] = attr.value;
      return acc;
    }, {});
    return attrs;
  }

  return attribs;
}
// Length of the text once trimmed and with whitespace runs collapsed
// to single spaces.
export function textLength(text) {
  return text.trim().replace(/\s+/g, ' ').length;
}

// Determines what percentage of the text in a node is link text.
// Takes a node, returns a float in [0, 1].
export function linkDensity($node) {
  const totalLength = textLength($node.text());
  const linkLength = textLength($node.find('a').text());

  if (totalLength > 0) {
    return linkLength / totalLength;
  }

  // No text at all but some link content: treat as fully link.
  return totalLength === 0 && linkLength > 0 ? 1 : 0;
}

Some text!

Some text!

10 | `); 11 | 12 | const density = linkDensity($('div').first(), $); 13 | 14 | assert.equal(density, 0.5); 15 | }); 16 | 17 | it('returns 1 if all of the text is a link', () => { 18 | const $ = cheerio.load(` 19 |

Some text!

20 | `); 21 | 22 | const density = linkDensity($('div').first(), $); 23 | 24 | assert.equal(density, 1); 25 | }); 26 | 27 | it("returns 0 if there's no text", () => { 28 | const $ = cheerio.load(` 29 |

// Given a node, determine if it's article-like enough to return.
// A node qualifies when its trimmed text is at least 100 characters.
// param: $node (a cheerio node)
// return: boolean
export default function nodeIsSufficient($node) {
  const textSize = $node.text().trim().length;
  return textSize >= 100;
}
11 |

This is too short

12 |
13 | `); 14 | 15 | assert.equal(nodeIsSufficient($.root()), false); 16 | }); 17 | 18 | it('returns true if node text length > 100 chars', () => { 19 | const $ = cheerio.load(` 20 |
21 |

22 | Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean m 23 |

24 |
// Given a node, turn it into a P if it is not already a P, and
// make sure it conforms to the constraints of a P tag (i.e. does
// not contain any other block tags).
//
// If the node is a <br>, it treats the following inline siblings
// as if they were its children: they are swept into a fresh <p>
// that replaces the <br> itself.
//
// :param node: The node to paragraphize; this is a raw node
// :param $: The cheerio object to handle dom manipulation
// :param br: Whether or not the passed node is a br
export default function paragraphize(node, $, br = false) {
  const $node = $(node);

  if (!br) {
    return $;
  }

  const $p = $('<p></p>');
  let sibling = node.nextSibling;

  // Sweep following siblings (text or inline elements) into the new
  // paragraph until we hit a block-level element or run out. Capture
  // nextSibling before appendTo, since moving the node detaches it.
  while (
    sibling &&
    !(sibling.tagName && BLOCK_LEVEL_TAGS_RE.test(sibling.tagName))
  ) {
    const { nextSibling } = sibling;
    $(sibling).appendTo($p);
    sibling = nextSibling;
  }

  $node.replaceWith($p);
  $node.remove();
  return $;
}
// Rewrite the tag name to div if it's a top level node like body or
// html to avoid later complications with multiple body tags.
export default function rewriteTopLevel(article, $) {
  // Not using `article` as context here because it's problematic when
  // converting the top-level/root node - AP
  return ['html', 'body'].reduce(
    (doc, tag) => convertNodeTo(doc(tag), doc, 'div'),
    $
  );
}

Wow how about that

12 | `); 13 | const result = rewriteTopLevel($('html').first(), $); 14 | 15 | assert.equal(result('html').length, 0); 16 | assert.equal(result('body').length, 0); 17 | 18 | if (!cheerio.browser) { 19 | assertClean( 20 | result.html(), 21 | ` 22 |

Wow how about that

// Replace every attribute on `node` with the given { name: value }
// map. Handles both cheerio nodes (plain `attribs` object) and raw
// DOM nodes (live `attributes` list with set/removeAttribute).
export default function setAttrs(node, attrs) {
  if (node.attribs) {
    // Cheerio: attribs is a plain object we can swap wholesale.
    node.attribs = attrs;
    return node;
  }

  if (node.attributes) {
    // DOM: clear the live attribute list one entry at a time (it
    // reindexes as entries are removed), then set the new values.
    while (node.attributes.length > 0) {
      node.removeAttribute(node.attributes[0].name);
    }

    Reflect.ownKeys(attrs).forEach(key => node.setAttribute(key, attrs[key]));
  }

  return node;
}
// Remove unwanted elements (scripts, styles, etc.) from the article,
// sparing any element explicitly flagged with the keep class
// (mercury-parser-keep).
//
// :param article: context to search within
// :param $: cheerio instance
// :param tags: optional selector list; defaults to STRIP_OUTPUT_TAGS
export default function stripJunkTags(article, $, tags = []) {
  const selectors = tags.length === 0 ? STRIP_OUTPUT_TAGS : tags;

  $(selectors.join(','), article)
    .not(`.${KEEP_CLASS}`)
    .remove();

  return $;
}
$(`${text}`).text(); 6 | return cleanText === '' ? text : cleanText; 7 | } 8 | -------------------------------------------------------------------------------- /src/utils/dom/strip-tags.test.js: -------------------------------------------------------------------------------- 1 | import assert from 'assert'; 2 | import cheerio from 'cheerio'; 3 | 4 | import stripTags from './strip-tags'; 5 | 6 | describe('stripTags(title, $)', () => { 7 | it('strips tags from a string of text', () => { 8 | const $ = cheerio.load('
'); 9 | 10 | const result = stripTags('What a Wonderful Day', $); 11 | 12 | assert.equal(result, 'What a Wonderful Day'); 13 | }); 14 | 15 | it('returns the original text if no tags found', () => { 16 | const $ = cheerio.load('
import { getAttrs } from 'utils/dom';

// Return true when $node sits inside an ancestor element whose class or
// id contains the substring "comment" (i.e. the node looks like it lives
// in a comments section rather than article content).
export default function withinComment($node) {
  // Missing class/id stringify to "undefined", which never contains
  // "comment", so absent attributes are handled implicitly.
  const looksLikeComment = parent => {
    const { class: nodeClass, id } = getAttrs(parent);
    return `${nodeClass} ${id}`.includes('comment');
  };

  return $node
    .parents()
    .toArray()
    .some(looksLikeComment);
}
-------------------------------------------------------------------------------- /src/utils/dom/within-comment.test.js: -------------------------------------------------------------------------------- 1 | import cheerio from 'cheerio'; 2 | import assert from 'assert'; 3 | 4 | import withinComment from './within-comment'; 5 | 6 | describe('withinComment(node)', () => { 7 | it('returns false if its parent is not a comment', () => { 8 | const $ = cheerio.load(` 9 |
10 |
11 |
Adam
12 |
13 |
14 | `); 15 | assert.equal(withinComment($('.author').first()), false); 16 | }); 17 | 18 | it('returns true if its parent has a class of comment', () => { 19 | const $ = cheerio.load(` 20 |
21 |
22 |
Adam
23 |
24 |
25 | `); 26 | assert.equal(withinComment($('.author').first()), true); 27 | }); 28 | 29 | it('returns true if its parent has an id of comment', () => { 30 | const $ = cheerio.load(` 31 |
32 |
33 |
Adam
34 |
35 |
36 | `); 37 | assert.equal(withinComment($('.author').first()), true); 38 | }); 39 | }); 40 | -------------------------------------------------------------------------------- /src/utils/index.js: -------------------------------------------------------------------------------- 1 | export { default as range } from './range'; 2 | export { default as validateUrl } from './validate-url'; 3 | -------------------------------------------------------------------------------- /src/utils/merge-supported-domains.js: -------------------------------------------------------------------------------- 1 | const merge = (extractor, domains) => 2 | domains.reduce((acc, domain) => { 3 | acc[domain] = extractor; 4 | return acc; 5 | }, {}); 6 | 7 | export default function mergeSupportedDomains(extractor) { 8 | return extractor.supportedDomains 9 | ? merge(extractor, [extractor.domain, ...extractor.supportedDomains]) 10 | : merge(extractor, [extractor.domain]); 11 | } 12 | -------------------------------------------------------------------------------- /src/utils/merge-supported-domains.test.js: -------------------------------------------------------------------------------- 1 | import assert from 'assert'; 2 | import mergeSupportedDomains from './merge-supported-domains'; 3 | 4 | describe('mergeSupportedDomains(extractor, domains)', () => { 5 | it('returns an object w/domains as keys and extractor as value', () => { 6 | const extractor = { 7 | domain: 'foo.com', 8 | supportedDomains: ['example.com'], 9 | }; 10 | 11 | const expected = { 12 | 'foo.com': extractor, 13 | 'example.com': extractor, 14 | }; 15 | 16 | const result = mergeSupportedDomains(extractor); 17 | assert.deepEqual(result, expected); 18 | }); 19 | 20 | it('returns an object w/single domain if no supportedDomains', () => { 21 | const extractor = { 22 | domain: 'foo.com', 23 | }; 24 | 25 | const expected = { 26 | 'foo.com': extractor, 27 | }; 28 | 29 | const result = mergeSupportedDomains(extractor); 30 | 
// Generator yielding each integer from `start` through `end`, inclusive.
//
// Fix: the previous body was `yield (start += 1)`, which increments
// BEFORE yielding — range(1, 3) produced 2, 3, 4 instead of 1, 2, 3
// (it skipped `start` and emitted `end + 1`).
//
// @param {number} start - first value yielded (default 1)
// @param {number} end - last value yielded (default 1)
export default function* range(start = 1, end = 1) {
  while (start <= end) {
    yield start;
    start += 1;
  }
}
// Return an excerpt of `content`: its first `words` whitespace-separated
// tokens, joined by single spaces (leading/trailing whitespace dropped,
// internal runs of whitespace collapsed).
export default function excerptContent(content, words = 10) {
  const tokens = content.trim().split(/\s+/);
  return tokens.slice(0, words).join(' ');
}
// Given a URL and a list of regular expressions, return capture group 1
// of the first expression that matches the URL, or null when none match.
// Each regex in `regexList` is expected to capture the desired string as
// group 1. Only used for date_published currently.
//
// Fix: the old test-then-exec pattern evaluated every matching regex
// twice, and broke for regexes carrying the /g or /y flag — `test()`
// advances `lastIndex`, so the follow-up `exec()` could return null and
// the `[1]` access threw a TypeError. Run each regex exactly once.
export default function extractFromUrl(url, regexList) {
  for (const re of regexList) {
    const match = re.exec(url);
    if (match) {
      return match[1];
    }
  }

  return null;
}
encoding; 17 | } 18 | -------------------------------------------------------------------------------- /src/utils/text/get-encoding.test.js: -------------------------------------------------------------------------------- 1 | import assert from 'assert'; 2 | import cheerio from 'cheerio'; 3 | 4 | import getEncoding from './get-encoding'; 5 | 6 | // Tests are bypassed in the browser because it has an encoding 7 | // A shim is used /src/shims/iconv-lite.js to decrease load size 8 | 9 | describe('getEncoding(str)', () => { 10 | if (cheerio.browser) return; 11 | 12 | it('returns the encoding as a string', () => { 13 | const contentType = 'text/html; charset=iso-8859-15'; 14 | assert.equal(getEncoding(contentType), 'iso-8859-15'); 15 | }); 16 | 17 | it('returns utf-8 as a default if no encoding found', () => { 18 | const contentType = 'text/html'; 19 | assert.equal(getEncoding(contentType), 'utf-8'); 20 | }); 21 | 22 | it('returns utf-8 if there is an invalid encoding', () => { 23 | const contentType = 'text/html; charset=fake-charset'; 24 | assert.equal(getEncoding(contentType), 'utf-8'); 25 | }); 26 | }); 27 | -------------------------------------------------------------------------------- /src/utils/text/has-sentence-end.js: -------------------------------------------------------------------------------- 1 | // Given a string, return True if it appears to have an ending sentence 2 | // within it, false otherwise. 
// Given a string, return true if it appears to contain the end of a
// sentence — a period followed by a space or by the end of the string —
// false otherwise.
//
// Fix: the pattern was built from the string '.( |$)', in which the
// unescaped '.' is a wildcard matching ANY character, so nearly every
// non-empty string was reported as containing a sentence end. The dot
// is now escaped to match a literal period, as the comment intends.
const SENTENCE_END_RE = /\.( |$)/;

export default function hasSentenceEnd(text) {
  return SENTENCE_END_RE.test(text);
}
10 |

What do you think?

11 |
12 | `); 13 | 14 | const result = normalizeSpaces( 15 | $('*') 16 | .first() 17 | .text() 18 | ); 19 | assert.equal(result, 'What do you think?'); 20 | }); 21 | 22 | it('preserves spaces in preformatted text blocks', () => { 23 | const $ = cheerio.load(` 24 |
25 |

What do you think?

26 |
  What     happens to        spaces?    
27 |
28 | `); 29 | 30 | const result = normalizeSpaces($.html()); 31 | assert.equal( 32 | result, 33 | '

What do you think?

  What     happens to        spaces?    
import { PAGE_IN_HREF_RE } from './constants';

// Pull a page number out of a URL (e.g. ?page=2, /paging/3).
// Returns the number when it is a plausible page (< 100), else null.
export default function pageNumFromUrl(url) {
  const match = url.match(PAGE_IN_HREF_RE);
  if (match === null) {
    return null;
  }

  // Capture group 6 of PAGE_IN_HREF_RE holds the digits.
  const pageNum = parseInt(match[6], 10);

  // Treat 100+ as a false positive rather than a real page number.
  return pageNum < 100 ? pageNum : null;
}
// Return `url` with its #fragment (anchor) removed, and without any
// trailing slash.
export default function removeAnchor(url) {
  const [withoutAnchor] = url.split('#');
  return withoutAnchor.replace(/\/$/, '');
}
validateUrl(url); 10 | 11 | assert.equal(valid, false); 12 | }); 13 | 14 | it('returns true if url is valid', () => { 15 | const url = URL.parse('http://example.com'); 16 | const valid = validateUrl(url); 17 | 18 | assert.equal(valid, true); 19 | }); 20 | }); 21 | --------------------------------------------------------------------------------