├── .agignore ├── .babelrc ├── .circleci └── config.yml ├── .eslintignore ├── .eslintrc ├── .gitattributes ├── .github ├── ISSUE_TEMPLATE.md └── PULL_REQUEST_TEMPLATE.md ├── .gitignore ├── .nvmrc ├── .prettierignore ├── .prettierrc ├── .remarkrc ├── CHANGELOG.md ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE-APACHE ├── LICENSE-MIT ├── README.md ├── RELEASE.md ├── assets └── parser-basic-usage.gif ├── cli.js ├── dist ├── generate-custom-parser.js ├── generate-custom-parser.js.map ├── mercury.esm.js ├── mercury.esm.js.map ├── mercury.js ├── mercury.js.map ├── mercury.web.js └── mercury.web.js.map ├── fixtures ├── 247sports.com.html ├── abcnews.go.com.html ├── arstechnica.com.html ├── biorxiv.org.html ├── blisterreview.com.html ├── bookwalker.jp.html ├── buzzap.jp.html ├── clinicaltrials.gov.html ├── deadline.com.html ├── deadspin.com--video.html ├── deadspin.com.html ├── epaper.zeit.de.html ├── fandom.wikia.com.html ├── fortune.com.html ├── forward.com.html ├── genius.com.html ├── getnews.jp.html ├── github.com.html ├── gothamist.com.html ├── hellogiggles.com.html ├── ici.radio-canada.ca.html ├── japan.cnet.com.html ├── japan.zdnet.com.html ├── jvndb.jvn.jp.html ├── ma.ttias.be.html ├── mashable.com.html ├── medium.com--another.html ├── medium.com.html ├── money.cnn.com.html ├── newrepublic.com--minutes.html ├── newrepublic.com.html ├── news.mynavi.jp.html ├── news.nationalgeographic.com.html ├── nock │ ├── fetch-resource-test.js │ ├── mercury-test.js │ └── resource-test.js ├── nymag.com.html ├── obamawhitehouse.archives.gov--blog.html ├── obamawhitehouse.archives.gov--empty.html ├── obamawhitehouse.archives.gov--speeches.html ├── obamawhitehouse.archives.gov.html ├── observer.com.html ├── otrs.com.html ├── pagesix.com.html ├── pastebin.com.html ├── people.com.html ├── phpspot.org.html ├── pitchfork.com.html ├── postlight.com.html ├── qz.com.html ├── sandiegouniontribune.com.html ├── scan.netsecurity.ne.jp.html ├── sciencefly.com.html ├── sect.iij.ad.jp.html 
├── takagi-hiromitsu.jp.html ├── techlog.iij.ad.jp.html ├── thefederalistpapers.org.html ├── thoughtcatalog.com.html ├── timesofindia.indiatimes.com.html ├── twitter.com.html ├── uproxx.com.html ├── weekly.ascii.jp.html ├── wired.jp.html ├── www.abendblatt.de.html ├── www.al.com.html ├── www.americanow.com.html ├── www.androidcentral.com.html ├── www.aol.com.html ├── www.apartmenttherapy.com.html ├── www.asahi.com.html ├── www.bloomberg.com--graphics.html ├── www.bloomberg.com--news.html ├── www.bloomberg.com.html ├── www.broadwayworld.com.html ├── www.bustle.com.html ├── www.buzzfeed.com--splash.html ├── www.buzzfeed.com.html ├── www.cbc.ca.html ├── www.cbssports.com.html ├── www.chicagotribune.com.html ├── www.cnbc.com--redesign.html ├── www.cnbc.com.html ├── www.cnet.com.html ├── www.cnn.com.html ├── www.dmagazine.com.html ├── www.elecom.co.jp.html ├── www.engadget.com.html ├── www.eonline.com.html ├── www.fastcompany.com.html ├── www.fool.com.html ├── www.fortinet.com.html ├── www.gizmodo.jp.html ├── www.gruene.de.html ├── www.huffingtonpost.com.html ├── www.infoq.com.html ├── www.inquisitr.com.html ├── www.investmentexecutive.com.html ├── www.ipa.go.jp.html ├── www.itmedia.co.jp.html ├── www.jnsa.org.html ├── www.ladbible.com.html ├── www.latimes.com--old.html ├── www.latimes.com.html ├── www.lemonde.fr.html ├── www.lifehacker.jp.html ├── www.linkedin.com.html ├── www.littlethings.com.html ├── www.macrumors.com.html ├── www.mentalfloss.com.html ├── www.miamiherald.com.html ├── www.moongift.jp.html ├── www.msn.com.html ├── www.msnbc.com.html ├── www.nationalgeographic.com.html ├── www.nbcnews.com.html ├── www.ndtv.com.html ├── www.newyorker.com--magazine.html ├── www.newyorker.com--multiple-authors.html ├── www.newyorker.com.html ├── www.npr.org.html ├── www.nydailynews.com.html ├── www.nytimes.com--feature.html ├── www.nytimes.com--recent.html ├── www.nytimes.com.html ├── www.opposingviews.com.html ├── www.oreilly.co.jp.html ├── www.ossnews.jp.html ├── 
www.phoronix.com.html ├── www.politico.com--test-case-2.html ├── www.politico.com--test-case-3.html ├── www.politico.com.html ├── www.popsugar.com.html ├── www.prospectmagazine.co.uk.html ├── www.publickey1.jp.html ├── www.qdaily.com.html ├── www.rawstory.com.html ├── www.rbbtoday.com.html ├── www.recode.net.html ├── www.reddit.com--embedded.html ├── www.reddit.com--external-image.html ├── www.reddit.com--external-link.html ├── www.reddit.com--image.html ├── www.reddit.com--title-only.html ├── www.reddit.com--video.html ├── www.reddit.com.html ├── www.refinery29.com.html ├── www.reuters.com.html ├── www.rollingstone.com.html ├── www.sanwa.co.jp.html ├── www.sbnation.com.html ├── www.si.com.html ├── www.slate.com.html ├── www.spektrum.de.html ├── www.theatlantic.com.html ├── www.theguardian.com.html ├── www.thepennyhoarder.com.html ├── www.thepoliticalinsider.com.html ├── www.theverge.com--feature.html ├── www.theverge.com.html ├── www.tmz.com.html ├── www.today.com.html ├── www.usmagazine.com.html ├── www.vox.com.html ├── www.vulture.com--content-test.html ├── www.vulture.com.html ├── www.washingtonpost.com.html ├── www.westernjournalism.com.html ├── www.wired.com--content-test.html ├── www.wired.com--other.html ├── www.wired.com.html ├── www.yahoo.com.html ├── www.yomiuri.co.jp.html └── www.youtube.com.html ├── karma.conf.js ├── package.json ├── preview ├── rollup.config.esm.js ├── rollup.config.js ├── rollup.config.web.js ├── score-move ├── scripts ├── check-build.test.js ├── comment-for-pr.js ├── find-and-replace.sh ├── generate-custom-parser.js ├── generate-fixture-preview.js ├── karma.conf.js ├── pr-parser-preview.sh ├── proxy-browser-test.js ├── rollup.config.js ├── templates │ ├── custom-extractor-test.js │ ├── custom-extractor.js │ ├── index.js │ └── insert-values.js ├── update-fixtures.js └── write-test-report.js ├── src ├── cleaners │ ├── author.js │ ├── author.test.js │ ├── constants.js │ ├── content.js │ ├── content.test.js │ ├── date-published.js │ ├── 
date-published.test.js │ ├── dek.js │ ├── dek.test.js │ ├── index.js │ ├── lead-image-url.js │ ├── lead-image-url.test.js │ ├── resolve-split-title.js │ ├── resolve-split-title.test.js │ ├── title.js │ └── title.test.js ├── extractors │ ├── add-extractor.js │ ├── add-extractor.test.js │ ├── all.js │ ├── collect-all-pages.js │ ├── constants.js │ ├── custom │ │ ├── 247sports.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── README.md │ │ ├── abcnews.go.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── arstechnica.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── biorxiv.org │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── blisterreview.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── blogspot.com │ │ │ └── index.js │ │ ├── bookwalker.jp │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── buzzap.jp │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── clinicaltrials.gov │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── deadline.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── deadspin.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── epaper.zeit.de │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── fandom.wikia.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── fortune.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── forward.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── genius.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── getnews.jp │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── github.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── gothamist.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── hellogiggles.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── ici.radio-canada.ca │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── index.js │ │ ├── japan.cnet.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── japan.zdnet.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── jvndb.jvn.jp │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── ma.ttias.be │ │ │ ├── index.js │ │ │ └── index.test.js │ 
│ ├── mashable.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── medium.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── money.cnn.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── newrepublic.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── news.mynavi.jp │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── news.nationalgeographic.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── nymag.com │ │ │ ├── fixtures │ │ │ │ └── test.html │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── obamawhitehouse.archives.gov │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── observer.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── otrs.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── pagesix.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── pastebin.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── people.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── phpspot.org │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── pitchfork.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── postlight.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── qz.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── scan.netsecurity.ne.jp │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── sciencefly.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── sect.iij.ad.jp │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── takagi-hiromitsu.jp │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── techlog.iij.ad.jp │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── thefederalistpapers.org │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── thoughtcatalog.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── timesofindia.indiatimes.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── twitter.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── uproxx.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── weekly.ascii.jp │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── wikipedia.org │ │ │ └── index.js │ │ ├── wired.jp │ │ │ ├── index.js │ │ │ └── 
index.test.js │ │ ├── www.abendblatt.de │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.al.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.americanow.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.androidcentral.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.aol.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.apartmenttherapy.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.asahi.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.bloomberg.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.broadwayworld.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.bustle.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.buzzfeed.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.cbc.ca │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.cbssports.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.chicagotribune.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.cnbc.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.cnet.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.cnn.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.dmagazine.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.elecom.co.jp │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.engadget.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.eonline.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.fastcompany.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.fool.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.fortinet.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.gizmodo.jp │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.gruene.de │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.huffingtonpost.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.infoq.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.inquisitr.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── 
www.investmentexecutive.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.ipa.go.jp │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.itmedia.co.jp │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.jnsa.org │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.ladbible.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.latimes.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.lemonde.fr │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.lifehacker.jp │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.linkedin.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.littlethings.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.macrumors.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.mentalfloss.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.miamiherald.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.moongift.jp │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.msn.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.msnbc.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.nationalgeographic.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.nbcnews.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.ndtv.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.newyorker.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.npr.org │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.nydailynews.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.nytimes.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.opposingviews.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.oreilly.co.jp │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.ossnews.jp │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.phoronix.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.politico.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.popsugar.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── 
www.prospectmagazine.co.uk │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.publickey1.jp │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.qdaily.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.rawstory.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.rbbtoday.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.recode.net │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.reddit.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.refinery29.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.reuters.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.rollingstone.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.sanwa.co.jp │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.sbnation.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.si.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.slate.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.spektrum.de │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.theatlantic.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.theguardian.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.thepennyhoarder.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.thepoliticalinsider.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.theverge.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.tmz.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.today.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.usmagazine.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.vox.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.washingtonpost.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.westernjournalism.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.wired.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.yahoo.com │ │ │ ├── index.js │ │ │ └── index.test.js │ │ ├── www.yomiuri.co.jp │ │ │ ├── index.js │ │ │ └── index.test.js │ │ └── 
www.youtube.com │ │ │ ├── index.js │ │ │ └── index.test.js │ ├── detect-by-html.js │ ├── detect-by-html.test.js │ ├── fixtures │ │ └── postlight.com │ │ │ └── index.js │ ├── generic │ │ ├── author │ │ │ ├── constants.js │ │ │ ├── extractor.js │ │ │ └── extractor.test.js │ │ ├── content │ │ │ ├── extract-best-node.js │ │ │ ├── extract-best-node.test.js │ │ │ ├── extractor.js │ │ │ ├── extractor.test.js │ │ │ └── scoring │ │ │ │ ├── add-score.js │ │ │ │ ├── add-score.test.js │ │ │ │ ├── add-to-parent.js │ │ │ │ ├── add-to-parent.test.js │ │ │ │ ├── constants.js │ │ │ │ ├── find-top-candidate.js │ │ │ │ ├── find-top-candidate.test.js │ │ │ │ ├── get-or-init-score.js │ │ │ │ ├── get-or-init-score.test.js │ │ │ │ ├── get-score.js │ │ │ │ ├── get-score.test.js │ │ │ │ ├── get-weight.js │ │ │ │ ├── get-weight.test.js │ │ │ │ ├── index.js │ │ │ │ ├── merge-siblings.js │ │ │ │ ├── score-commas.js │ │ │ │ ├── score-commas.test.js │ │ │ │ ├── score-content.js │ │ │ │ ├── score-content.test.js │ │ │ │ ├── score-length.js │ │ │ │ ├── score-length.test.js │ │ │ │ ├── score-node.js │ │ │ │ ├── score-node.test.js │ │ │ │ ├── score-paragraph.js │ │ │ │ ├── score-paragraph.test.js │ │ │ │ ├── set-score.js │ │ │ │ └── set-score.test.js │ │ ├── date-published │ │ │ ├── constants.js │ │ │ ├── extractor.js │ │ │ └── extractor.test.js │ │ ├── dek │ │ │ ├── extractor.js │ │ │ └── extractor.test.js │ │ ├── excerpt │ │ │ ├── constants.js │ │ │ ├── extractor.js │ │ │ └── extractor.test.js │ │ ├── index.js │ │ ├── index.test.js │ │ ├── lead-image-url │ │ │ ├── constants.js │ │ │ ├── extractor.js │ │ │ ├── extractor.test.js │ │ │ ├── score-image.js │ │ │ └── score-image.test.js │ │ ├── next-page-url │ │ │ ├── extractor.js │ │ │ ├── extractor.test.js │ │ │ └── scoring │ │ │ │ ├── constants.js │ │ │ │ ├── score-links.js │ │ │ │ ├── score-links.test.js │ │ │ │ └── utils │ │ │ │ ├── index.js │ │ │ │ ├── score-base-url.js │ │ │ │ ├── score-base-url.test.js │ │ │ │ ├── score-by-parents.js │ │ │ │ 
├── score-by-parents.test.js │ │ │ │ ├── score-cap-links.js │ │ │ │ ├── score-cap-links.test.js │ │ │ │ ├── score-extraneous-links.js │ │ │ │ ├── score-extraneous-links.test.js │ │ │ │ ├── score-link-text.js │ │ │ │ ├── score-link-text.test.js │ │ │ │ ├── score-next-link-text.js │ │ │ │ ├── score-next-link-text.test.js │ │ │ │ ├── score-page-in-link.js │ │ │ │ ├── score-page-in-link.test.js │ │ │ │ ├── score-prev-link.js │ │ │ │ ├── score-prev-link.test.js │ │ │ │ ├── score-similarity.js │ │ │ │ ├── score-similarity.test.js │ │ │ │ ├── should-score.js │ │ │ │ └── should-score.test.js │ │ ├── title │ │ │ ├── constants.js │ │ │ ├── extractor.js │ │ │ └── extractor.test.js │ │ ├── url │ │ │ ├── constants.js │ │ │ ├── extractor.js │ │ │ └── extractor.test.js │ │ └── word-count │ │ │ ├── extractor.js │ │ │ └── extractor.test.js │ ├── get-extractor.js │ ├── get-extractor.test.js │ ├── index.js │ ├── root-extractor.js │ └── root-extractor.test.js ├── mercury.js ├── mercury.test.js ├── resource │ ├── index.js │ ├── index.test.js │ └── utils │ │ ├── constants.js │ │ ├── dom │ │ ├── clean.js │ │ ├── clean.test.js │ │ ├── constants.js │ │ ├── convert-lazy-loaded-images.js │ │ ├── convert-lazy-loaded-images.test.js │ │ ├── index.js │ │ ├── normalize-meta-tags.js │ │ └── normalize-meta-tags.test.js │ │ ├── fetch-resource.js │ │ ├── fetch-resource.test.js │ │ └── index.js ├── shims │ ├── cheerio-query.js │ └── iconv-lite.js ├── test-helpers.js └── utils │ ├── dom │ ├── brs-to-ps.js │ ├── brs-to-ps.test.js │ ├── clean-attributes.js │ ├── clean-attributes.test.js │ ├── clean-h-ones.js │ ├── clean-h-ones.test.js │ ├── clean-headers.js │ ├── clean-headers.test.js │ ├── clean-images.js │ ├── clean-images.test.js │ ├── clean-tags.js │ ├── clean-tags.test.js │ ├── constants.js │ ├── convert-node-to.js │ ├── convert-node-to.test.js │ ├── convert-to-paragraphs.js │ ├── convert-to-paragraphs.test.js │ ├── extract-from-meta.js │ ├── extract-from-meta.test.js │ ├── extract-from-selectors.js 
│ ├── extract-from-selectors.test.js │ ├── get-attrs.js │ ├── get-attrs.test.js │ ├── index.js │ ├── is-wordpress.js │ ├── is-wordpress.test.js │ ├── link-density.js │ ├── link-density.test.js │ ├── make-links-absolute.js │ ├── make-links-absolute.test.js │ ├── mark-to-keep.js │ ├── mark-to-keep.test.js │ ├── node-is-sufficient.js │ ├── node-is-sufficient.test.js │ ├── paragraphize.js │ ├── paragraphize.test.js │ ├── remove-empty.js │ ├── remove-empty.test.js │ ├── rewrite-top-level.js │ ├── rewrite-top-level.test.js │ ├── set-attr.js │ ├── set-attr.test.js │ ├── set-attrs.js │ ├── set-attrs.test.js │ ├── strip-junk-tags.js │ ├── strip-junk-tags.test.js │ ├── strip-tags.js │ ├── strip-tags.test.js │ ├── strip-unlikely-candidates.js │ ├── strip-unlikely-candidates.test.js │ ├── within-comment.js │ └── within-comment.test.js │ ├── index.js │ ├── merge-supported-domains.js │ ├── merge-supported-domains.test.js │ ├── range.js │ ├── text │ ├── article-base-url.js │ ├── article-base-url.test.js │ ├── constants.js │ ├── excerpt-content.js │ ├── excerpt.test.js │ ├── extract-from-url.js │ ├── extract-from-url.test.js │ ├── get-encoding.js │ ├── get-encoding.test.js │ ├── has-sentence-end.js │ ├── index.js │ ├── normalize-spaces.js │ ├── normalize-spaces.test.js │ ├── page-num-from-url.js │ ├── page-num-from-url.test.js │ ├── remove-anchor.js │ └── remove-anchor.test.js │ ├── validate-url.js │ └── validate-url.test.js └── yarn.lock /.agignore: -------------------------------------------------------------------------------- 1 | dist 2 | -------------------------------------------------------------------------------- /.babelrc: -------------------------------------------------------------------------------- 1 | { 2 | "presets": ["@babel/preset-env"], 3 | "plugins": [ 4 | [ 5 | "module-resolver", 6 | { 7 | "root": ["./src"], 8 | "alias": { 9 | "./utils": "utils", 10 | "./cleaners": "cleaners", 11 | "./resource": "resource", 12 | "./extractors": "extractors", 13 | 
"./test-helpers.js": "test-helpers", 14 | "./mercury.js": "mercury" 15 | } 16 | } 17 | ] 18 | ], 19 | "env": { 20 | "development": { 21 | "plugins": [ 22 | [ 23 | "@babel/plugin-transform-runtime", 24 | { 25 | "corejs": 2, 26 | "regenerator": true 27 | } 28 | ] 29 | ] 30 | } 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /.eslintignore: -------------------------------------------------------------------------------- 1 | **/fixtures/* 2 | dist/* 3 | coverage/* 4 | karma.conf.js 5 | -------------------------------------------------------------------------------- /.eslintrc: -------------------------------------------------------------------------------- 1 | { 2 | "parser": "babel-eslint", 3 | "extends": ["airbnb", "prettier"], 4 | "plugins": ["babel"], 5 | "globals": { 6 | "describe": true, 7 | "it": true, 8 | "fit": true, 9 | "jasmine": true, 10 | "beforeEach": true, 11 | "beforeAll": true, 12 | "afterAll": true 13 | }, 14 | "rules": { 15 | "no-param-reassign": 0, 16 | "no-control-regex": 0, 17 | "import/prefer-default-export": 0, 18 | "generator-star-spacing": 0, 19 | "babel/generator-star-spacing": 0, 20 | "func-names": 0, 21 | "no-confusing-arrow": 0, 22 | "camelcase": 0, 23 | "no-multiple-empty-lines": [ 24 | "error", 25 | { "max": 1, "maxEOF": 0, "maxBOF": 0 } 26 | ], 27 | "import/no-unresolved": false, 28 | "import/no-extraneous-dependencies": [ 29 | "error", 30 | { 31 | "devDependencies": [ 32 | "**/*.test.js", 33 | "scripts/proxy-browser-test.js", 34 | "rollup.config*js" 35 | ] 36 | } 37 | ] 38 | }, 39 | "settings": { 40 | "import/resolver": { 41 | "babel-module": {} 42 | } 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # All html files are fixtures, so marking as vendored 2 | # so Linguist (https://github.com/github/linguist) 3 | # ignores them for the 
purpose of language detection 4 | *.html linguist-vendored 5 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | 9 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | build 3 | npm-debug.log 4 | TODO.md 5 | read 6 | preview.html 7 | preview.json 8 | coverage 9 | dist/mercury_test.js 10 | dist/mercury_test.js.map 11 | dist/mercury_test.web.js 12 | tmp/artifacts 13 | test-output.json 14 | .tool-versions 15 | .yarnrc.yml 16 | **/.DS_Store 17 | -------------------------------------------------------------------------------- /.nvmrc: -------------------------------------------------------------------------------- 1 | 12.8.1 2 | -------------------------------------------------------------------------------- /.prettierignore: -------------------------------------------------------------------------------- 1 | dist 2 | -------------------------------------------------------------------------------- /.prettierrc: -------------------------------------------------------------------------------- 1 | { 2 | "trailingComma": "es5", 3 | "semi": true, 4 | "singleQuote": true, 5 | "printWidth": 80, 6 | "tabWidth": 2, 7 | "useTabs": false, 8 | "bracketSpacing": true, 9 | "arrowParens": "avoid" 10 | } 11 | -------------------------------------------------------------------------------- /.remarkrc: -------------------------------------------------------------------------------- 1 | { 2 | "plugins": [ 3 | "remark-preset-lint-recommended", 4 | ["remark-lint-list-item-indent", false] 5 | ] 6 | } 7 | -------------------------------------------------------------------------------- /LICENSE-MIT: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | 
Copyright (c) 2019 Postlight 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /RELEASE.md: -------------------------------------------------------------------------------- 1 | # How to cut a new release 2 | 3 | 1. Create a release branch. 4 | 5 | ```bash 6 | git checkout -b release-1.x.x # (where 1.x.x reflects the release) 7 | ``` 8 | 9 | 2. Update package.json with the version number 10 | 3. Build the release 11 | 12 | ```bash 13 | yarn release 14 | ``` 15 | 16 | 4. Update the changelog 17 | 18 | ```bash 19 | # Copy the output of the command below and paste it into CHANGELOG.md 20 | # following the conventions of that file 21 | yarn changelog-maker postlight parser 22 | ``` 23 | 24 | 5. Submit a PR 25 | 6. Merge once the PR's tests pass 26 | 7. 
[Create a release](https://github.com/postlight/parser/releases), linking to this release's entry in the changelog. (See other releases for context.) 27 | -------------------------------------------------------------------------------- /assets/parser-basic-usage.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/postlight/parser/e8ba7ece291efa4d915d50dd4deeec17d54359f2/assets/parser-basic-usage.gif -------------------------------------------------------------------------------- /karma.conf.js: -------------------------------------------------------------------------------- 1 | module.exports = function (config) { 2 | config.set({ 3 | 4 | basePath: '', 5 | 6 | frameworks: ['jasmine', 'browserify'], 7 | files: [ 8 | { pattern: 'src/**/*.test.js', included: true }, 9 | ], 10 | 11 | exclude: [], 12 | 13 | preprocessors: { 14 | 'src/**/*.js': ['browserify'], 15 | }, 16 | 17 | browserify: { 18 | debug: true, 19 | transform: ['babelify', 'brfs'], 20 | }, 21 | 22 | reporters: ['progress'], 23 | port: 9876, 24 | colors: true, 25 | logLevel: config.LOG_INFO, 26 | autoWatch: false, 27 | browsers: ['Chrome'], 28 | singleRun: true, 29 | concurrency: Infinity, 30 | }); 31 | }; 32 | -------------------------------------------------------------------------------- /preview: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | var fs = require('fs') 3 | var execSync = require('child_process').execSync 4 | 5 | var optRe = /^--/ 6 | var args = process.argv.slice(2).reduce((acc, arg) => { 7 | if (optRe.test(arg)) { 8 | acc.opts.push(arg) 9 | } else { 10 | acc.urls.push(arg) 11 | } 12 | 13 | return acc 14 | }, { opts: [], urls: [] }) 15 | 16 | var urls = args.urls 17 | 18 | if (!args.opts.find(arg => arg === '--no-rebuild')) { 19 | console.log('Rebuilding Mercury') 20 | execSync('MERCURY_TEST_BUILD=true npm run build') 21 | } 22 | 23 | var Mercury = 
require('./dist/mercury_test') 24 | 25 | console.log(`Fetching link(s)`) 26 | 27 | urls.map(url => { 28 | Mercury.parse(url, { fallback: false }).then(function(result) { 29 | var htmlFile = './preview.html' 30 | var jsonFile = './preview.json' 31 | 32 | var html = `

${result.title}

${result.content}` 33 | 34 | fs.writeFileSync(htmlFile, html) 35 | fs.writeFileSync(jsonFile, JSON.stringify(result)) 36 | execSync(`open ${jsonFile}`) 37 | execSync(`open ${htmlFile}`) 38 | }) 39 | }) 40 | -------------------------------------------------------------------------------- /rollup.config.esm.js: -------------------------------------------------------------------------------- 1 | import nodeResolve from 'rollup-plugin-node-resolve'; 2 | import globals from 'rollup-plugin-node-globals'; 3 | import { terser } from 'rollup-plugin-terser'; // eslint-disable-line import/extensions 4 | import babel from 'rollup-plugin-babel'; 5 | import commonjs from 'rollup-plugin-commonjs'; 6 | 7 | export default { 8 | input: 'src/mercury.js', 9 | plugins: [ 10 | babel({ 11 | runtimeHelpers: true, 12 | exclude: './node_modules#<{(|*', 13 | }), 14 | commonjs({ 15 | ignoreGlobal: true, 16 | }), 17 | globals(), 18 | nodeResolve({ 19 | browser: true, 20 | preferBuiltins: false, 21 | }), 22 | terser(), 23 | ], 24 | treeshake: true, 25 | output: { 26 | file: process.env.MERCURY_TEST_BUILD 27 | ? 'dist/mercury_test.esm.js' 28 | : 'dist/mercury.esm.js', 29 | format: 'es', 30 | sourcemap: true, 31 | }, 32 | }; 33 | -------------------------------------------------------------------------------- /rollup.config.js: -------------------------------------------------------------------------------- 1 | /* eslint-disable import/no-extraneous-dependencies */ 2 | import babel from 'rollup-plugin-babel'; 3 | import commonjs from 'rollup-plugin-commonjs'; 4 | 5 | export default { 6 | input: 'src/mercury.js', 7 | plugins: [ 8 | commonjs(), 9 | babel({ 10 | externalHelpers: false, 11 | runtimeHelpers: true, 12 | }), 13 | ], 14 | treeshake: true, 15 | output: { 16 | file: process.env.MERCURY_TEST_BUILD 17 | ? 
'dist/mercury_test.js' 18 | : 'dist/mercury.js', 19 | format: 'cjs', 20 | sourcemap: true, 21 | }, 22 | }; 23 | -------------------------------------------------------------------------------- /rollup.config.web.js: -------------------------------------------------------------------------------- 1 | import nodeResolve from 'rollup-plugin-node-resolve'; 2 | import globals from 'rollup-plugin-node-globals'; 3 | import { uglify } from 'rollup-plugin-uglify'; // eslint-disable-line import/extensions 4 | import babel from 'rollup-plugin-babel'; 5 | import commonjs from 'rollup-plugin-commonjs'; 6 | 7 | export default { 8 | input: 'src/mercury.js', 9 | plugins: [ 10 | babel({ 11 | runtimeHelpers: true, 12 | exclude: './node_modules#<{(|*', 13 | }), 14 | commonjs({ 15 | ignoreGlobal: true, 16 | }), 17 | globals(), 18 | nodeResolve({ 19 | browser: true, 20 | preferBuiltins: false, 21 | }), 22 | uglify(), 23 | ], 24 | treeshake: true, 25 | output: { 26 | file: process.env.MERCURY_TEST_BUILD 27 | ? 
'dist/mercury_test.web.js' 28 | : 'dist/mercury.web.js', 29 | format: 'iife', 30 | name: 'Mercury', 31 | sourcemap: true, 32 | }, 33 | }; 34 | -------------------------------------------------------------------------------- /score-move: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/fish 2 | 3 | set file $argv[1] 4 | set function $argv[2] 5 | 6 | touch src/extractors/generic/next-page-url/scoring/utils/index.js 7 | touch src/extractors/generic/next-page-url/scoring/utils/$file.js 8 | touch src/extractors/generic/next-page-url/scoring/utils/$file.test.js 9 | 10 | echo "import assert from 'assert';" > src/extractors/generic/next-page-url/scoring/utils/$file.test.js 11 | echo "" >> src/extractors/generic/next-page-url/scoring/utils/$file.test.js 12 | echo "import $function from './$file';" >> src/extractors/generic/next-page-url/scoring/utils/$file.test.js 13 | echo "" >> src/extractors/generic/next-page-url/scoring/utils/$file.test.js 14 | echo "export { default as $function } from './$file'" >> src/extractors/generic/next-page-url/scoring/utils/index.js 15 | 16 | echo "Now make it a default export" 17 | echo "Move it to its file" 18 | echo "Move its tests to its test file" 19 | echo "import in score-links" 20 | echo "Test it." 
21 | 22 | -------------------------------------------------------------------------------- /scripts/find-and-replace.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo $3 4 | 5 | find $3 -exec sed -i '' "s%$1%$2%g" '{}' \; -------------------------------------------------------------------------------- /scripts/pr-parser-preview.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | changes=( `git diff origin/master --name-only` ) 4 | 5 | for fixture in "${changes[@]}" 6 | do 7 | # If one of the changed files is a fixture, hold onto it 8 | if [[ $fixture == "fixtures/"* ]]; then 9 | fixtures=$fixture,$fixtures 10 | fi 11 | done 12 | 13 | if [[ $fixtures ]]; then 14 | # Take a screenshot of the fixture 15 | yarn phantomjs scripts/generate-fixture-preview.js $fixtures 16 | 17 | screenshots=( `find tmp/artifacts -type f | grep ".html.png"` ) 18 | 19 | for screenshot in "${screenshots[@]}" 20 | do 21 | # Create a comment with a link to the screenshot 22 | # and json output for the fixture 23 | node scripts/comment-for-pr.js $screenshot 24 | done 25 | else 26 | echo "No fixtures added in this PR, so no preview needed" 27 | node scripts/write-test-report.js 28 | fi 29 | -------------------------------------------------------------------------------- /scripts/rollup.config.js: -------------------------------------------------------------------------------- 1 | /* eslint-disable import/no-extraneous-dependencies */ 2 | import babel from 'rollup-plugin-babel'; 3 | import commonjs from 'rollup-plugin-commonjs'; 4 | 5 | export default { 6 | input: 'scripts/generate-custom-parser.js', 7 | plugins: [ 8 | commonjs(), 9 | babel({ 10 | externalHelpers: false, 11 | runtimeHelpers: true, 12 | }), 13 | ], 14 | treeshake: true, 15 | output: { 16 | file: 'dist/generate-custom-parser.js', 17 | format: 'cjs', 18 | sourcemap: true, 19 | }, 20 | }; 21 | 
-------------------------------------------------------------------------------- /scripts/templates/index.js: -------------------------------------------------------------------------------- 1 | import insertValues from './insert-values'; 2 | 3 | const bodyPattern = /^\n([\s\S]+)\s{2}$/gm; 4 | const trailingWhitespace = /\s+$/; 5 | 6 | export default function template(strings, ...values) { 7 | const compiled = insertValues(strings, ...values); 8 | let [body] = compiled.match(bodyPattern) || []; 9 | let indentLevel = /^\s{0,4}(.+)$/g; 10 | 11 | if (!body) { 12 | body = compiled; 13 | indentLevel = /^\s{0,2}(.+)$/g; 14 | } 15 | 16 | return body 17 | .split('\n') 18 | .slice(1) 19 | .map(line => { 20 | line = line.replace(indentLevel, '$1'); 21 | 22 | if (trailingWhitespace.test(line)) { 23 | line = line.replace(trailingWhitespace, ''); 24 | } 25 | 26 | return line; 27 | }) 28 | .join('\n'); 29 | } 30 | -------------------------------------------------------------------------------- /scripts/templates/insert-values.js: -------------------------------------------------------------------------------- 1 | export default function insertValues(strings, ...values) { 2 | if (values.length) { 3 | return strings.reduce((result, part, idx) => { 4 | let value = values[idx]; 5 | 6 | if (value && typeof value.toString === 'function') { 7 | value = value.toString(); 8 | } else { 9 | value = ''; 10 | } 11 | 12 | return result + part + value; 13 | }, ''); 14 | } 15 | 16 | return strings.join(''); 17 | } 18 | -------------------------------------------------------------------------------- /scripts/write-test-report.js: -------------------------------------------------------------------------------- 1 | const path = require('path'); 2 | const fs = require('fs'); 3 | 4 | const { getReport } = require('@postlight/ci-failed-test-reporter'); 5 | 6 | const report = getReport(path.join(__dirname, '../', '/test-output.json')); 7 | if (report) { 8 | const commentPath = 
'tmp/artifacts/comment.json'; 9 | fs.mkdirSync('tmp'); 10 | fs.mkdirSync('tmp/artifacts'); 11 | fs.writeFileSync( 12 | commentPath, 13 | JSON.stringify({ 14 | body: report, 15 | issue: process.env.CIRCLE_PULL_REQUEST, 16 | }) 17 | ); 18 | } 19 | -------------------------------------------------------------------------------- /src/cleaners/author.js: -------------------------------------------------------------------------------- 1 | import { normalizeSpaces } from 'utils/text'; 2 | import { CLEAN_AUTHOR_RE } from './constants'; 3 | 4 | // Take an author string (like 'By David Smith ') and clean it to 5 | // just the name(s): 'David Smith'. 6 | export default function cleanAuthor(author) { 7 | return normalizeSpaces(author.replace(CLEAN_AUTHOR_RE, '$2').trim()); 8 | } 9 | -------------------------------------------------------------------------------- /src/cleaners/author.test.js: -------------------------------------------------------------------------------- 1 | import assert from 'assert'; 2 | 3 | import cleanAuthor from './author'; 4 | 5 | describe('cleanAuthor(author)', () => { 6 | it('removes the By from an author string', () => { 7 | const author = cleanAuthor('By Bob Dylan'); 8 | 9 | assert.equal(author, 'Bob Dylan'); 10 | }); 11 | 12 | it('trims trailing whitespace and line breaks', () => { 13 | const text = ` 14 | written by 15 | Bob Dylan 16 | `; 17 | const author = cleanAuthor(text); 18 | 19 | assert.equal(author, 'Bob Dylan'); 20 | }); 21 | }); 22 | -------------------------------------------------------------------------------- /src/cleaners/content.test.js: -------------------------------------------------------------------------------- 1 | import assert from 'assert'; 2 | import cheerio from 'cheerio'; 3 | 4 | import extractBestNode from 'extractors/generic/content/extract-best-node'; 5 | import extractCleanNode from './content'; 6 | 7 | const fs = require('fs'); 8 | 9 | describe('extractCleanNode(article, { $, cleanConditionally, title } })', () => 
{ 10 | it('cleans cruft out of a DOM node', () => { 11 | const html = fs.readFileSync( 12 | './fixtures/www.wired.com--content-test.html', 13 | 'utf-8' 14 | ); 15 | const $ = cheerio.load(html); 16 | 17 | const opts = { 18 | stripUnlikelyCandidates: true, 19 | weightNodes: true, 20 | cleanConditionally: true, 21 | }; 22 | 23 | const bestNode = extractBestNode($, opts); 24 | 25 | const cleanNode = extractCleanNode(bestNode, { $, opts }); 26 | 27 | const text = $(cleanNode) 28 | .text() 29 | .replace(/\n/g, '') 30 | .replace(/\s+/g, ' ') 31 | .trim(); 32 | assert.equal(text.length === 2656 || text.length === 2657, true); 33 | }); 34 | }); 35 | -------------------------------------------------------------------------------- /src/cleaners/dek.js: -------------------------------------------------------------------------------- 1 | import { stripTags } from 'utils/dom'; 2 | import { excerptContent, normalizeSpaces } from 'utils/text'; 3 | 4 | import { TEXT_LINK_RE } from './constants'; 5 | 6 | // Take a dek HTML fragment, and return the cleaned version of it. 7 | // Return None if the dek wasn't good enough. 8 | export default function cleanDek(dek, { $, excerpt }) { 9 | // Sanity check that we didn't get too short or long of a dek. 10 | if (dek.length > 1000 || dek.length < 5) return null; 11 | 12 | // Check that dek isn't the same as excerpt 13 | if (excerpt && excerptContent(excerpt, 10) === excerptContent(dek, 10)) 14 | return null; 15 | 16 | const dekText = stripTags(dek, $); 17 | 18 | // Plain text links shouldn't exist in the dek. If we have some, it's 19 | // not a good dek - bail. 
20 | if (TEXT_LINK_RE.test(dekText)) return null; 21 | 22 | return normalizeSpaces(dekText.trim()); 23 | } 24 | -------------------------------------------------------------------------------- /src/cleaners/index.js: -------------------------------------------------------------------------------- 1 | import cleanAuthor from './author'; 2 | import cleanImage from './lead-image-url'; 3 | import cleanDek from './dek'; 4 | import cleanDatePublished from './date-published'; 5 | import cleanContent from './content'; 6 | import cleanTitle from './title'; 7 | 8 | const Cleaners = { 9 | author: cleanAuthor, 10 | lead_image_url: cleanImage, 11 | dek: cleanDek, 12 | date_published: cleanDatePublished, 13 | content: cleanContent, 14 | title: cleanTitle, 15 | }; 16 | 17 | export default Cleaners; 18 | 19 | export { cleanAuthor }; 20 | export { cleanImage }; 21 | export { cleanDek }; 22 | export { cleanDatePublished }; 23 | export { cleanContent }; 24 | export { cleanTitle }; 25 | export { default as resolveSplitTitle } from './resolve-split-title'; 26 | -------------------------------------------------------------------------------- /src/cleaners/lead-image-url.js: -------------------------------------------------------------------------------- 1 | import validUrl from 'valid-url'; 2 | 3 | export default function clean(leadImageUrl) { 4 | leadImageUrl = leadImageUrl.trim(); 5 | if (validUrl.isWebUri(leadImageUrl)) { 6 | return leadImageUrl; 7 | } 8 | 9 | return null; 10 | } 11 | -------------------------------------------------------------------------------- /src/cleaners/lead-image-url.test.js: -------------------------------------------------------------------------------- 1 | import assert from 'assert'; 2 | 3 | import clean from './lead-image-url'; 4 | 5 | describe('clean(leadImageUrl)', () => { 6 | it('returns the url if valid', () => { 7 | const url = 'https://example.com'; 8 | assert.equal(clean(url), url); 9 | }); 10 | 11 | it('returns null if the url is not valid', () 
=> { 12 | assert.equal(clean('this is not a valid url'), null); 13 | }); 14 | 15 | it('trims whitespace', () => { 16 | const url = ' https://example.com/foo/bar.jpg'; 17 | assert.equal(clean(url), url.trim()); 18 | }); 19 | }); 20 | -------------------------------------------------------------------------------- /src/cleaners/resolve-split-title.test.js: -------------------------------------------------------------------------------- 1 | import assert from 'assert'; 2 | 3 | import { resolveSplitTitle } from './index'; 4 | 5 | describe('resolveSplitTitle(text)', () => { 6 | it('does nothing if title not splittable', () => { 7 | const title = 'This Is a Normal Title'; 8 | 9 | assert.equal(resolveSplitTitle(title), title); 10 | }); 11 | 12 | it('extracts titles from breadcrumb-like titles', () => { 13 | const title = 'The Best Gadgets on Earth : Bits : Blogs : NYTimes.com'; 14 | 15 | assert.equal(resolveSplitTitle(title), 'The Best Gadgets on Earth '); 16 | }); 17 | 18 | it('cleans domains from titles at the front', () => { 19 | const title = 'NYTimes - The Best Gadgets on Earth'; 20 | const url = 'https://www.nytimes.com/bits/blog/etc/'; 21 | 22 | assert.equal(resolveSplitTitle(title, url), 'The Best Gadgets on Earth'); 23 | }); 24 | 25 | it('cleans domains from titles at the back', () => { 26 | const title = 'The Best Gadgets on Earth | NYTimes'; 27 | const url = 'https://www.nytimes.com/bits/blog/etc/'; 28 | 29 | assert.equal(resolveSplitTitle(title, url), 'The Best Gadgets on Earth'); 30 | }); 31 | }); 32 | -------------------------------------------------------------------------------- /src/cleaners/title.js: -------------------------------------------------------------------------------- 1 | import { stripTags } from 'utils/dom'; 2 | import { normalizeSpaces } from 'utils/text'; 3 | 4 | import { TITLE_SPLITTERS_RE } from './constants'; 5 | import { resolveSplitTitle } from './index'; 6 | 7 | export default function cleanTitle(title, { url, $ }) { 8 | // If title 
has |, :, or - in it, see if 9 | // we can clean it up. 10 | if (TITLE_SPLITTERS_RE.test(title)) { 11 | title = resolveSplitTitle(title, url); 12 | } 13 | 14 | // Final sanity check that we didn't get a crazy title. 15 | // if (title.length > 150 || title.length < 15) { 16 | if (title.length > 150) { 17 | // If we did, return h1 from the document if it exists 18 | const h1 = $('h1'); 19 | if (h1.length === 1) { 20 | title = h1.text(); 21 | } 22 | } 23 | 24 | // strip any html tags in the title text 25 | return normalizeSpaces(stripTags(title, $).trim()); 26 | } 27 | -------------------------------------------------------------------------------- /src/cleaners/title.test.js: -------------------------------------------------------------------------------- 1 | import assert from 'assert'; 2 | import cheerio from 'cheerio'; 3 | 4 | import { cleanTitle } from './index'; 5 | 6 | describe('cleanTitle(title, { url, $ })', () => { 7 | it('only uses h1 if there is only one on the page', () => { 8 | const title = 'Too Short'; 9 | const $ = cheerio.load(` 10 |
11 |

This Is the Real Title

12 |

This Is the Real Title

13 |
14 | `); 15 | 16 | assert.equal(cleanTitle(title, { url: '', $ }), title); 17 | }); 18 | 19 | it('removes HTML tags from titles', () => { 20 | const $ = cheerio.load( 21 | '

This Is the Real Title

' 22 | ); 23 | const title = $('h1').html(); 24 | 25 | assert.equal(cleanTitle(title, { url: '', $ }), 'This Is the Real Title'); 26 | }); 27 | 28 | it('trims extraneous spaces', () => { 29 | const title = " This Is a Great Title That You'll Love "; 30 | const $ = cheerio.load( 31 | '

This Is the Real Title

' 32 | ); 33 | 34 | assert.equal(cleanTitle(title, { url: '', $ }), title.trim()); 35 | }); 36 | }); 37 | -------------------------------------------------------------------------------- /src/extractors/add-extractor.js: -------------------------------------------------------------------------------- 1 | import mergeSupportedDomains from '../utils/merge-supported-domains'; 2 | 3 | export const apiExtractors = {}; 4 | 5 | export default function addExtractor(extractor) { 6 | if (!extractor || !extractor.domain) { 7 | return { 8 | error: true, 9 | message: 'Unable to add custom extractor. Invalid parameters.', 10 | }; 11 | } 12 | 13 | Object.assign(apiExtractors, mergeSupportedDomains(extractor)); 14 | 15 | return apiExtractors; 16 | } 17 | -------------------------------------------------------------------------------- /src/extractors/add-extractor.test.js: -------------------------------------------------------------------------------- 1 | import assert from 'assert'; 2 | 3 | import addExtractor from './add-extractor'; 4 | 5 | describe('addExtractor(extractor)', () => { 6 | it('can add multiple custom extractors', () => { 7 | addExtractor({ domain: 'www.site1.com' }); 8 | addExtractor({ domain: 'www.site2.com' }); 9 | const result = addExtractor({ domain: 'www.site3.com' }); 10 | assert.equal(Object.keys(result).length, 3); 11 | }); 12 | 13 | it('returns error if an extractor is not provided', () => { 14 | const result = addExtractor(); 15 | assert.equal(result.error, true); 16 | }); 17 | 18 | it('returns error if a domain key is not included within the custom extractor', () => { 19 | const result = addExtractor({ test: 'abc' }); 20 | assert.equal(result.error, true); 21 | }); 22 | }); 23 | -------------------------------------------------------------------------------- /src/extractors/all.js: -------------------------------------------------------------------------------- 1 | import mergeSupportedDomains from 'utils/merge-supported-domains'; 2 | import * as 
CustomExtractors from './custom/index'; 3 | 4 | export default Object.keys(CustomExtractors).reduce((acc, key) => { 5 | const extractor = CustomExtractors[key]; 6 | return { 7 | ...acc, 8 | ...mergeSupportedDomains(extractor), 9 | }; 10 | }, {}); 11 | -------------------------------------------------------------------------------- /src/extractors/constants.js: -------------------------------------------------------------------------------- 1 | export const ATTR_RE = /\[([\w-]+)\]/; // eslint-disable-line no-useless-escape 2 | -------------------------------------------------------------------------------- /src/extractors/custom/247sports.com/index.js: -------------------------------------------------------------------------------- 1 | export const twofortysevensportsComExtractor = { 2 | domain: '247sports.com', 3 | 4 | title: { 5 | selectors: ['title', 'article header h1'], 6 | }, 7 | 8 | author: { 9 | selectors: ['.article-cnt__author', '.author'], 10 | }, 11 | 12 | date_published: { 13 | selectors: [['time[data-published]', 'data-published']], 14 | }, 15 | 16 | lead_image_url: { 17 | selectors: [['meta[name="og:image"]', 'value']], 18 | }, 19 | 20 | content: { 21 | selectors: ['.article-body', 'section.body.article'], 22 | 23 | // Is there anything in the content you selected that needs transformed 24 | // before it's consumable content? E.g., unusual lazy loaded images 25 | transforms: {}, 26 | 27 | // Is there anything that is in the result that shouldn't be? 
28 | // The clean selectors will remove anything that matches from 29 | // the result 30 | clean: [], 31 | }, 32 | }; 33 | -------------------------------------------------------------------------------- /src/extractors/custom/abcnews.go.com/index.js: -------------------------------------------------------------------------------- 1 | export const AbcnewsGoComExtractor = { 2 | domain: 'abcnews.go.com', 3 | 4 | title: { 5 | selectors: ['div[class*="Article_main__body"] h1', '.article-header h1'], 6 | }, 7 | 8 | author: { 9 | selectors: ['.ShareByline span:nth-child(2)', '.authors'], 10 | clean: ['.author-overlay', '.by-text'], 11 | }, 12 | 13 | date_published: { 14 | selectors: ['.ShareByline', '.timestamp'], 15 | format: 'MMMM D, YYYY h:mm a', 16 | timezone: 'America/New_York', 17 | }, 18 | 19 | lead_image_url: { 20 | selectors: [['meta[name="og:image"]', 'value']], 21 | }, 22 | 23 | content: { 24 | selectors: ['article', '.article-copy'], 25 | 26 | // Is there anything in the content you selected that needs transformed 27 | // before it's consumable content? E.g., unusual lazy loaded images 28 | transforms: {}, 29 | 30 | // Is there anything that is in the result that shouldn't be? 31 | // The clean selectors will remove anything that matches from 32 | // the result 33 | clean: [], 34 | }, 35 | }; 36 | -------------------------------------------------------------------------------- /src/extractors/custom/biorxiv.org/index.js: -------------------------------------------------------------------------------- 1 | export const BiorxivOrgExtractor = { 2 | domain: 'biorxiv.org', 3 | 4 | title: { 5 | selectors: ['h1#page-title'], 6 | }, 7 | 8 | author: { 9 | selectors: [ 10 | 'div.highwire-citation-biorxiv-article-top > div.highwire-cite-authors', 11 | ], 12 | }, 13 | 14 | content: { 15 | selectors: ['div#abstract-1'], 16 | 17 | // Is there anything in the content you selected that needs transformed 18 | // before it's consumable content? 
E.g., unusual lazy loaded images 19 | transforms: {}, 20 | 21 | // Is there anything that is in the result that shouldn't be? 22 | // The clean selectors will remove anything that matches from 23 | // the result 24 | clean: [], 25 | }, 26 | }; 27 | -------------------------------------------------------------------------------- /src/extractors/custom/blogspot.com/index.js: -------------------------------------------------------------------------------- 1 | export const BloggerExtractor = { 2 | domain: 'blogspot.com', 3 | content: { 4 | // Blogger is insane and does not load its content 5 | // initially in the page, but it's all there 6 | // in noscript 7 | selectors: ['.post-content noscript'], 8 | 9 | // Selectors to remove from the extracted content 10 | clean: [], 11 | 12 | // Convert the noscript tag to a div 13 | transforms: { 14 | noscript: 'div', 15 | }, 16 | }, 17 | 18 | author: { 19 | selectors: ['.post-author-name'], 20 | }, 21 | 22 | title: { 23 | selectors: ['.post h2.title'], 24 | }, 25 | 26 | date_published: { 27 | selectors: ['span.publishdate'], 28 | }, 29 | }; 30 | -------------------------------------------------------------------------------- /src/extractors/custom/bookwalker.jp/index.js: -------------------------------------------------------------------------------- 1 | export const BookwalkerJpExtractor = { 2 | domain: 'bookwalker.jp', 3 | 4 | title: { 5 | selectors: ['h1.p-main__title', 'h1.main-heading'], 6 | }, 7 | 8 | author: { 9 | selectors: ['div.p-author__list', 'div.authors'], 10 | }, 11 | 12 | date_published: { 13 | selectors: [ 14 | 'dl.p-information__data dd:nth-of-type(7)', 15 | '.work-info .work-detail:first-of-type .work-detail-contents:last-of-type', 16 | ], 17 | timezone: 'Asia/Tokyo', 18 | }, 19 | 20 | dek: null, 21 | 22 | lead_image_url: { 23 | selectors: [['meta[name="og:image"]', 'value']], 24 | }, 25 | 26 | content: { 27 | selectors: [ 28 | 'div.p-main__information', 29 | ['div.main-info', 'div.main-cover-inner'], 30 | ], 
31 | 32 | defaultCleaner: false, 33 | 34 | transforms: {}, 35 | 36 | clean: [ 37 | 'span.label.label--trial', 38 | 'dt.info-head.info-head--coin', 39 | 'dd.info-contents.info-contents--coin', 40 | 'div.info-notice.fn-toggleClass', 41 | ], 42 | }, 43 | }; 44 | -------------------------------------------------------------------------------- /src/extractors/custom/buzzap.jp/index.js: -------------------------------------------------------------------------------- 1 | export const BuzzapJpExtractor = { 2 | domain: 'buzzap.jp', 3 | 4 | title: { 5 | selectors: ['h1.entry-title'], 6 | }, 7 | 8 | author: null, 9 | 10 | date_published: { 11 | selectors: [['time.entry-date', 'datetime']], 12 | }, 13 | 14 | dek: null, 15 | 16 | lead_image_url: { 17 | selectors: [['meta[name="og:image"]', 'value']], 18 | }, 19 | 20 | content: { 21 | selectors: ['div.ctiframe'], 22 | 23 | defaultCleaner: false, 24 | 25 | transforms: {}, 26 | 27 | clean: [], 28 | }, 29 | }; 30 | -------------------------------------------------------------------------------- /src/extractors/custom/clinicaltrials.gov/index.js: -------------------------------------------------------------------------------- 1 | export const ClinicaltrialsGovExtractor = { 2 | domain: 'clinicaltrials.gov', 3 | 4 | title: { 5 | selectors: ['h1.tr-solo_record'], 6 | }, 7 | 8 | author: { 9 | selectors: ['div#sponsor.tr-info-text'], 10 | }, 11 | 12 | date_published: { 13 | // selectors: ['span.term[data-term="Last Update Posted"]'], 14 | selectors: ['div:has(> span.term[data-term="Last Update Posted"])'], 15 | }, 16 | 17 | content: { 18 | selectors: ['div#tab-body'], 19 | 20 | // Is there anything in the content you selected that needs transformed 21 | // before it's consumable content? E.g., unusual lazy loaded images 22 | transforms: {}, 23 | 24 | // Is there anything that is in the result that shouldn't be? 
25 | // The clean selectors will remove anything that matches from 26 | // the result 27 | clean: ['.usa-alert> img'], 28 | }, 29 | }; 30 | -------------------------------------------------------------------------------- /src/extractors/custom/deadline.com/index.js: -------------------------------------------------------------------------------- 1 | export const DeadlineComExtractor = { 2 | domain: 'deadline.com', 3 | 4 | title: { 5 | selectors: ['h1'], 6 | }, 7 | 8 | author: { 9 | selectors: ['section.author h2'], 10 | }, 11 | 12 | date_published: { 13 | selectors: [['meta[name="article:published_time"]', 'value']], 14 | }, 15 | 16 | dek: null, 17 | 18 | lead_image_url: { 19 | selectors: [['meta[name="og:image"]', 'value']], 20 | }, 21 | 22 | content: { 23 | selectors: ['div.a-article-grid__main.pmc-a-grid article.pmc-a-grid-item'], 24 | 25 | transforms: { 26 | '.embed-twitter': $node => { 27 | const innerHtml = $node.html(); 28 | $node.replaceWith(innerHtml); 29 | }, 30 | }, 31 | 32 | clean: ['figcaption'], 33 | }, 34 | }; 35 | -------------------------------------------------------------------------------- /src/extractors/custom/epaper.zeit.de/index.js: -------------------------------------------------------------------------------- 1 | export const EpaperZeitDeExtractor = { 2 | domain: 'epaper.zeit.de', 3 | 4 | title: { 5 | selectors: ['p.title'], 6 | }, 7 | 8 | author: { 9 | selectors: ['.article__author'], 10 | }, 11 | 12 | date_published: null, 13 | 14 | excerpt: { 15 | selectors: ['subtitle'], 16 | }, 17 | 18 | lead_image_url: null, 19 | 20 | content: { 21 | selectors: ['.article'], 22 | 23 | // Is there anything in the content you selected that needs transformed 24 | // before it's consumable content? E.g., unusual lazy loaded images 25 | transforms: { 26 | 'p.title': 'h1', 27 | '.article__author': 'p', 28 | byline: 'p', 29 | linkbox: 'p', 30 | }, 31 | 32 | // Is there anything that is in the result that shouldn't be? 
33 | // The clean selectors will remove anything that matches from 34 | // the result 35 | clean: ['image-credits', 'box[type=citation]'], 36 | }, 37 | }; 38 | -------------------------------------------------------------------------------- /src/extractors/custom/fortune.com/index.js: -------------------------------------------------------------------------------- 1 | export const FortuneComExtractor = { 2 | domain: 'fortune.com', 3 | 4 | title: { 5 | selectors: ['h1'], 6 | }, 7 | 8 | author: { 9 | selectors: [['meta[name="author"]', 'value']], 10 | }, 11 | 12 | date_published: { 13 | selectors: ['.MblGHNMJ'], 14 | 15 | timezone: 'UTC', 16 | }, 17 | 18 | lead_image_url: { 19 | selectors: [['meta[name="og:image"]', 'value']], 20 | }, 21 | 22 | content: { 23 | selectors: [['picture', 'article.row'], 'article.row'], 24 | 25 | // Is there anything in the content you selected that needs transformed 26 | // before it's consumable content? E.g., unusual lazy loaded images 27 | transforms: {}, 28 | 29 | // Is there anything that is in the result that shouldn't be? 
30 | // The clean selectors will remove anything that matches from 31 | // the result 32 | clean: [], 33 | }, 34 | }; 35 | -------------------------------------------------------------------------------- /src/extractors/custom/getnews.jp/index.js: -------------------------------------------------------------------------------- 1 | export const GetnewsJpExtractor = { 2 | domain: 'getnews.jp', 3 | 4 | title: { 5 | selectors: ['article h1'], 6 | }, 7 | 8 | author: { 9 | selectors: [['meta[name="article:author"]', 'value'], 'span.prof'], 10 | }, 11 | 12 | date_published: { 13 | selectors: [ 14 | ['meta[name="article:published_time"]', 'value'], 15 | ['ul.cattag-top time', 'datetime'], 16 | ], 17 | }, 18 | 19 | dek: null, 20 | 21 | lead_image_url: { 22 | selectors: [['meta[name="og:image"]', 'value']], 23 | }, 24 | 25 | content: { 26 | selectors: ['div.post-bodycopy'], 27 | 28 | transforms: {}, 29 | 30 | clean: [], 31 | }, 32 | }; 33 | -------------------------------------------------------------------------------- /src/extractors/custom/github.com/index.js: -------------------------------------------------------------------------------- 1 | export const GithubComExtractor = { 2 | domain: 'github.com', 3 | 4 | title: { 5 | selectors: [['meta[name="og:title"]', 'value']], 6 | }, 7 | 8 | author: { 9 | selectors: [ 10 | // enter author selectors 11 | ], 12 | }, 13 | 14 | date_published: { 15 | selectors: [ 16 | ['relative-time[datetime]', 'datetime'], 17 | ['span[itemprop="dateModified"] relative-time', 'datetime'], 18 | ], 19 | }, 20 | 21 | dek: { 22 | selectors: [ 23 | ['meta[name="description"]', 'value'], 24 | 'span[itemprop="about"]', 25 | ], 26 | }, 27 | 28 | lead_image_url: { 29 | selectors: [['meta[name="og:image"]', 'value']], 30 | }, 31 | 32 | content: { 33 | selectors: [['#readme article']], 34 | 35 | // Is there anything in the content you selected that needs transformed 36 | // before it's consumable content? 
E.g., unusual lazy loaded images 37 | transforms: {}, 38 | 39 | // Is there anything that is in the result that shouldn't be? 40 | // The clean selectors will remove anything that matches from 41 | // the result 42 | clean: [], 43 | }, 44 | }; 45 | -------------------------------------------------------------------------------- /src/extractors/custom/hellogiggles.com/index.js: -------------------------------------------------------------------------------- 1 | export const HellogigglesComExtractor = { 2 | domain: 'hellogiggles.com', 3 | 4 | title: { 5 | selectors: [['meta[name="og:title"]', 'value'], '.title'], 6 | }, 7 | 8 | author: { 9 | selectors: ['.byline-wrapper span.author_name', '.author-link'], 10 | }, 11 | 12 | date_published: { 13 | selectors: [ 14 | ['meta[property="article:published_time"]', 'content'], 15 | ['meta[name="article:published_time"]', 'value'], 16 | ], 17 | }, 18 | 19 | lead_image_url: { 20 | selectors: [['meta[name="og:image"]', 'value']], 21 | }, 22 | 23 | content: { 24 | selectors: ['.main-content', '.entry-content'], 25 | 26 | // Is there anything in the content you selected that needs transformed 27 | // before it's consumable content? E.g., unusual lazy loaded images 28 | transforms: {}, 29 | 30 | // Is there anything that is in the result that shouldn't be? 
// --- src/extractors/custom/ici.radio-canada.ca/index.js ---
// Custom extractor for Radio-Canada articles. Date strings look like
// "2018-01-01|18h30", hence the literal "[h]" token in the moment format.
export const IciRadioCanadaCaExtractor = {
  domain: 'ici.radio-canada.ca',

  title: {
    selectors: ['h1'],
  },

  author: {
    selectors: [['meta[name="dc.creator"]', 'value']],
  },

  date_published: {
    selectors: [['meta[name="dc.date.created"]', 'value']],
    format: 'YYYY-MM-DD|HH[h]mm',
    timezone: 'America/New_York',
  },

  dek: {
    selectors: ['div.lead-container', '.bunker-component.lead'],
  },

  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },

  content: {
    selectors: [
      'section.document-content-style',
      ['.main-multimedia-item', '.news-story-content'],
    ],

    // Nothing needs transforming before the content is consumable.
    transforms: {},

    // Nothing extra to strip from the extracted content.
    clean: [],
  },
};

// --- src/extractors/custom/japan.cnet.com/index.js ---
// Custom extractor for CNET Japan; dates are Japanese-formatted local time.
export const JapanCnetComExtractor = {
  domain: 'japan.cnet.com',

  title: {
    selectors: ['.leaf-headline-ttl'],
  },

  author: {
    selectors: ['.writer'],
  },

  date_published: {
    selectors: ['.date'],
    format: 'YYYY年MM月DD日 HH時mm分',
    timezone: 'Asia/Tokyo',
  },

  dek: null,

  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },

  content: {
    selectors: ['div.article_body'],

    transforms: {},

    clean: [],
  },
};

// --- src/extractors/custom/japan.zdnet.com/index.js ---
// Custom extractor for ZDNet Japan.
export const JapanZdnetComExtractor = {
  domain: 'japan.zdnet.com',

  title: {
    selectors: ['h1'],
  },

  author: {
    selectors: [['meta[name="cXenseParse:author"]', 'value']],
  },

  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']],
  },

  dek: null,

  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },

  content: {
    selectors: ['div.article_body'],

    transforms: {},

    clean: [],
  },
};

// --- src/extractors/custom/jvndb.jvn.jp/index.js ---
// Custom extractor for the JVN vulnerability database. The default cleaner
// is disabled so the list markup in #news-list survives extraction.
export const JvndbJvnJpExtractor = {
  domain: 'jvndb.jvn.jp',

  title: {
    selectors: ['title'],
  },

  author: null,

  date_published: {
    selectors: ['div.modifytxt:nth-child(2)'],
    format: 'YYYY/MM/DD',
    timezone: 'Asia/Tokyo',
  },

  dek: null,

  lead_image_url: null,

  content: {
    selectors: ['#news-list'],

    defaultCleaner: false,

    transforms: {},

    clean: [],
  },
};

// --- src/extractors/custom/mashable.com/index.js ---
// Custom extractor for Mashable; image credits become figcaptions.
export const MashableComExtractor = {
  domain: 'mashable.com',

  title: {
    selectors: ['header h1', 'h1.title'],
  },

  author: {
    selectors: [['meta[name="article:author"]', 'value'], 'span.author_name a'],
  },

  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']],
  },

  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },

  content: {
    selectors: ['#article', 'section.article-content.blueprint'],

    // Convert credit blocks into semantic figcaptions.
    transforms: {
      '.image-credit': 'figcaption',
    },

    clean: [],
  },
};

// --- src/extractors/custom/money.cnn.com/index.js ---
// Custom extractor for CNN Money.
export const MoneyCnnComExtractor = {
  domain: 'money.cnn.com',

  title: {
    selectors: ['.article-title'],
  },

  author: {
    selectors: [['meta[name="author"]', 'value'], '.byline a'],
  },

  date_published: {
    selectors: [['meta[name="date"]', 'value']],

    timezone: 'GMT',
  },

  dek: {
    selectors: ['#storytext h2'],
  },

  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },

  content: {
    selectors: ['#storytext'],

    transforms: {},

    // In-story headings are navigation chrome, not article content.
    clean: ['.inStoryHeading'],
  },
};

// --- src/extractors/custom/newrepublic.com/index.js ---
// Custom extractor for The New Republic.
export const NewrepublicComExtractor = {
  domain: 'newrepublic.com',

  title: {
    selectors: ['h1.article-headline'],
  },

  author: {
    selectors: ['span.AuthorList'],
  },

  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']],

    timezone: 'America/New_York',
  },

  dek: {
    selectors: ['h2.article-subhead'],
  },

  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },

  content: {
    selectors: [['div.article-body']],

    transforms: {},

    // Strip asides (related-content boxes) from the article body.
    clean: ['aside'],
  },
};

// --- src/extractors/custom/nymag.com/index.test.js ---
import assert from 'assert';

import Mercury from 'mercury';

const fs = require('fs');

describe('NYMagExtractor', () => {
  it('works with a feature story', async () => {
    const html = fs.readFileSync('./fixtures/nymag.com.html');
    const uri =
      'http://nymag.com/daily/intelligencer/2016/09/how-fox-news-women-took-down-roger-ailes.html';

    // Pass the pre-fetched HTML via the options object, matching the
    // Mercury.parse(url, { html }) call used by the twitter.com test.
    const { dek, title, author } = await Mercury.parse(uri, { html });
    const actualDek =
      'How Fox News women took down the most powerful, and predatory, man in media.';

    assert.equal(dek, actualDek);
    assert.equal(title, 'The Revenge of Roger’s Angels');
    assert.equal(author, 'Gabriel Sherman');
  });
});

// --- src/extractors/custom/obamawhitehouse.archives.gov/index.js ---
// Custom extractor for the archived Obama White House site; also handles
// whitehouse.gov. Default cleaning is disabled to preserve speech markup.
export const ObamawhitehouseArchivesGovExtractor = {
  domain: 'obamawhitehouse.archives.gov',

  supportedDomains: ['whitehouse.gov'],

  title: {
    selectors: ['h1', '.pane-node-title'],
  },

  author: {
    selectors: ['.blog-author-link', '.node-person-name-link'],
  },

  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']],
  },

  dek: {
    selectors: ['.field-name-field-forall-summary'],
  },

  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },

  content: {
    defaultCleaner: false,

    selectors: ['div#content-start', '.pane-node-field-forall-body'],

    transforms: {},

    clean: ['.pane-node-title', '.pane-custom.pane-1'],
  },
};

// --- src/extractors/custom/observer.com/index.js ---
// Custom extractor for the New York Observer.
export const ObserverComExtractor = {
  domain: 'observer.com',

  title: {
    selectors: ['h1.entry-title'],
  },

  author: {
    selectors: ['.author', '.vcard'],
  },

  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']],
  },

  dek: {
    selectors: ['h2.dek'],
  },

  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },

  content: {
    selectors: ['div.entry-content'],

    transforms: {},

    clean: [],
  },
};

// --- src/extractors/custom/otrs.com/index.js ---
// Custom extractor for OTRS; default cleaner disabled, with manual
// removal of byline/share/category chrome instead.
export const OtrsComExtractor = {
  domain: 'otrs.com',

  title: {
    selectors: ['#main article h1'],
  },

  author: {
    selectors: ['div.dateplusauthor a'],
  },

  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']],
  },

  dek: {
    selectors: [['meta[name="og:description"]', 'value']],
  },

  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },

  content: {
    selectors: ['#main article'],

    defaultCleaner: false,

    transforms: {},

    clean: [
      'div.dateplusauthor',
      'div.gr-12.push-6.footershare',
      '#atftbx',
      'div.category-modul',
    ],
  },
};

// --- src/extractors/custom/pagesix.com/index.js ---
// Custom extractor for Page Six; also handles nypost.com.
export const PagesixComExtractor = {
  domain: 'pagesix.com',

  supportedDomains: ['nypost.com'],

  title: {
    selectors: [['meta[name="og:title"]', 'value']],
  },

  author: {
    selectors: ['.byline'],
  },

  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']],
  },

  dek: {
    selectors: [['meta[name="description"]', 'value']],
  },

  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },

  content: {
    selectors: [
      ['#featured-image-wrapper', '.entry-content'],
      '.entry-content',
    ],

    // Promote the featured image wrapper and WP captions to semantic markup.
    transforms: {
      '#featured-image-wrapper': 'figure',
      '.wp-caption-text': 'figcaption',
    },

    clean: ['.modal-trigger'],
  },
};

// --- src/extractors/custom/pastebin.com/index.js ---
// Custom extractor for Pastebin. Pastes render as ordered lists of lines,
// so the list is flattened into div/p elements.
export const PastebinComExtractor = {
  domain: 'pastebin.com',

  title: {
    selectors: ['h1'],
  },

  author: {
    selectors: ['.username', '.paste_box_line2 .t_us + a'],
  },

  date_published: {
    selectors: ['.date', '.paste_box_line2 .t_da + span'],
    timezone: 'America/New_York',
    format: 'MMMM D, YYYY',
  },

  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },

  content: {
    selectors: ['.source', '#selectable .text'],

    transforms: {
      ol: 'div',
      li: 'p',
    },

    clean: [],
  },
};

// --- src/extractors/custom/people.com/index.js ---
// Custom extractor for People magazine.
export const PeopleComExtractor = {
  domain: 'people.com',

  title: {
    selectors: ['.article-header h1', ['meta[name="og:title"]', 'value']],
  },

  author: {
    selectors: [['meta[name="sailthru.author"]', 'value'], 'a.author.url.fn'],
  },

  date_published: {
    selectors: [
      '.mntl-attribution__item-date',
      ['meta[name="article:published_time"]', 'value'],
    ],
  },

  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },

  dek: {
    selectors: ['.article-header h2'],
  },

  content: {
    selectors: ['div[class^="loc article-content"]', 'div.article-body__inner'],

    transforms: {},

    clean: [],
  },
};

// --- src/extractors/custom/phpspot.org/index.js ---
// Custom extractor for phpspot; default cleaner disabled to keep the
// entry body intact.
export const PhpspotOrgExtractor = {
  domain: 'phpspot.org',

  title: {
    selectors: ['h3.hl'],
  },

  author: null,

  date_published: {
    selectors: ['h4.hl'],
    format: 'YYYY年MM月DD日',
    timezone: 'Asia/Tokyo',
  },

  dek: null,

  lead_image_url: null,

  content: {
    selectors: ['div.entrybody'],

    defaultCleaner: false,

    transforms: {},

    clean: [],
  },
};

// --- src/extractors/custom/pitchfork.com/index.js ---
// Custom extractor for Pitchfork; extends the standard fields with the
// review score.
export const PitchforkComExtractor = {
  domain: 'pitchfork.com',

  title: {
    selectors: [['meta[name="og:title"]', 'value'], 'title'],
  },

  author: {
    selectors: [
      ['meta[name="article:author"]', 'value'],
      '.authors-detail__display-name',
    ],
  },

  date_published: {
    selectors: ['div[class^="InfoSliceWrapper-"]', ['.pub-date', 'datetime']],
  },

  dek: {
    selectors: [
      ['meta[name="og:description"]', 'value'],
      '.review-detail__abstract',
    ],
  },

  lead_image_url: {
    selectors: [
      ['meta[name="og:image"]', 'value'],
      ['.single-album-tombstone__art img', 'src'],
    ],
  },

  content: {
    selectors: ['div.body__inner-container', '.review-detail__text'],
  },

  extend: {
    score: {
      selectors: ['p[class*="Rating"]', '.score'],
    },
  },
};
// --- src/extractors/custom/postlight.com/index.js ---
// Extractor configuration for the Postlight blog.
export const PostlightComExtractor = {
  domain: 'postlight.com',

  title: {
    selectors: [['meta[name="og:title"]', 'value']],
  },

  author: {
    selectors: [['meta[name="parsely-author"]', 'value']],
  },

  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']],
  },

  dek: {
    selectors: ['h2.single-hero__abstract'],
  },

  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },

  content: {
    selectors: ['main.post'],

    // No lazy-load or markup fixes required for this site.
    transforms: {},

    // Drop cross-promotion modules that live inside the post body.
    clean: [
      'section.pl-post-link',
      'aside',
      'section.insights_featured_case_studies',
    ],
  },
};

// --- src/extractors/custom/qz.com/index.js ---
// Extractor configuration for Quartz. Lead image falls back through
// several meta-tag conventions.
export const QzComExtractor = {
  domain: 'qz.com',

  title: {
    selectors: ['article header h1'],
  },

  author: {
    selectors: [['meta[name="author"]', 'value']],
  },

  date_published: {
    selectors: [
      ['meta[name="article:published_time"]', 'value'],
      ['time[datetime]', 'datetime'],
    ],
  },

  lead_image_url: {
    selectors: [
      ['meta[name="og:image"]', 'value'],
      ['meta[property="og:image"]', 'content'],
      ['meta[name="twitter:image"]', 'content'],
    ],
  },

  content: {
    selectors: ['#article-content'],

    transforms: {},

    clean: [],
  },
};

// --- src/extractors/custom/scan.netsecurity.ne.jp/index.js ---
// Extractor configuration for ScanNetSecurity; the default cleaner is
// turned off and an explicit ad-aside removal is used instead.
export const ScanNetsecurityNeJpExtractor = {
  domain: 'scan.netsecurity.ne.jp',

  title: {
    selectors: ['header.arti-header h1.head'],
  },

  author: null,

  date_published: {
    selectors: [['meta[name="article:modified_time"]', 'value']],
  },

  dek: {
    selectors: ['header.arti-header p.arti-summary'],
  },

  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },

  content: {
    selectors: ['div.arti-content.arti-content--thumbnail'],

    defaultCleaner: false,

    transforms: {},

    clean: ['aside.arti-giga'],
  },
};

// --- src/extractors/custom/sciencefly.com/index.js ---
// Extractor configuration for ScienceFly.
export const ScienceflyComExtractor = {
  domain: 'sciencefly.com',

  title: {
    selectors: ['.entry-title', '.cb-entry-title', '.cb-single-title'],
  },

  author: {
    selectors: ['div.cb-author', 'div.cb-author-title'],
  },

  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']],
  },

  dek: {
    // No dek selectors identified for this site yet.
    selectors: [],
  },

  lead_image_url: {
    selectors: [['div.theiaPostSlider_slides img', 'src']],
  },

  content: {
    selectors: ['div.theiaPostSlider_slides'],

    transforms: {},

    clean: [],
  },
};

// --- src/extractors/custom/sect.iij.ad.jp/index.js ---
// Extractor configuration for the IIJ SECT blog (Japanese dates).
export const SectIijAdJpExtractor = {
  domain: 'sect.iij.ad.jp',

  title: {
    selectors: ['div.title-box-inner h1', 'h3'],
  },

  author: {
    selectors: ['p.post-author a', 'dl.entrydate dd'],
  },

  date_published: {
    selectors: ['time'],
    format: 'YYYY年MM月DD日',
    timezone: 'Asia/Tokyo',
  },

  dek: null,

  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },

  content: {
    selectors: ['.entry-inner', '#article'],

    transforms: {},

    // The date list is already captured via author/date selectors.
    clean: ['dl.entrydate'],
  },
};

// --- src/extractors/custom/takagi-hiromitsu.jp/index.js ---
// Extractor configuration for Hiromitsu Takagi's diary; relies on the
// Last-Modified meta tag for the publication date.
export const TakagihiromitsuJpExtractor = {
  domain: 'takagi-hiromitsu.jp',

  title: {
    selectors: ['h3'],
  },

  author: {
    selectors: [['meta[name="author"]', 'value']],
  },

  date_published: {
    selectors: [['meta[http-equiv="Last-Modified"]', 'value']],
  },

  dek: null,

  lead_image_url: null,

  content: {
    selectors: ['div.body'],

    defaultCleaner: false,

    transforms: {},

    clean: [],
  },
};
// --- src/extractors/custom/techlog.iij.ad.jp/index.js ---
// Extractor configuration for the IIJ engineering blog.
export const TechlogIijAdJpExtractor = {
  domain: 'techlog.iij.ad.jp',

  title: {
    selectors: ['h1.entry-title'],
  },

  author: {
    selectors: ['a[rel="author"]'],
  },

  date_published: {
    selectors: [['time.entry-date', 'datetime']],
  },

  dek: null,

  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },

  content: {
    selectors: ['div.entry-content'],

    defaultCleaner: false,

    transforms: {},

    clean: ['.wp_social_bookmarking_light'],
  },
};

// --- src/extractors/custom/thefederalistpapers.org/index.js ---
// Extractor configuration for The Federalist Papers.
export const ThefederalistpapersOrgExtractor = {
  domain: 'thefederalistpapers.org',

  title: {
    selectors: ['h1.entry-title'],
  },

  author: {
    selectors: ['.author-meta-title', 'main span.entry-author-name'],
  },

  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']],
  },

  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },

  content: {
    selectors: ['.content'],

    transforms: {},

    // Strip sharing widgets, comment sections, and inline-styled cruft.
    clean: [
      'header',
      '.article-sharing',
      '.after-article',
      '.type-commenting',
      '.more-posts',
      ['p[style]'],
    ],
  },
};

// --- src/extractors/custom/thoughtcatalog.com/index.js ---
// Extractor configuration for Thought Catalog.
export const ThoughtcatalogComExtractor = {
  domain: 'thoughtcatalog.com',

  title: {
    selectors: ['h1.title', ['meta[name="og:title"]', 'value']],
  },

  author: {
    selectors: [
      'cite a',
      'div.col-xs-12.article_header div.writer-container.writer-container-inline.writer-no-avatar h4.writer-name',
      'h1.writer-name',
    ],
  },

  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']],
  },

  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },

  content: {
    selectors: ['.entry.post'],

    transforms: {},

    clean: ['.tc_mark', 'figcaption'],
  },
};

// --- src/extractors/custom/timesofindia.indiatimes.com/index.js ---
// Extractor configuration for The Times of India; the byline is exposed
// through the "reporter" extended field rather than a standard author.
export const TimesofindiaIndiatimesComExtractor = {
  domain: 'timesofindia.indiatimes.com',

  title: {
    selectors: ['h1'],
  },

  extend: {
    reporter: {
      selectors: ['div.byline'],
      transforms: {},
    },
  },

  date_published: {
    selectors: ['.byline'],
    format: 'MMM D, YYYY, HH:mm z',
    timezone: 'Asia/Kolkata',
  },

  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },

  content: {
    selectors: ['div.contentwrapper:has(section)'],
    defaultCleaner: false,

    clean: [
      'section',
      'h1',
      '.byline',
      '.img_cptn',
      '.icon_share_wrap',
      'ul[itemtype="https://schema.org/BreadcrumbList"]',
    ],
  },
};

// --- src/extractors/custom/twitter.com/index.js ---
export const TwitterExtractor = {
  domain: 'twitter.com',

  content: {
    transforms: {
      // We're transforming essentially the whole page here.
      // Twitter doesn't have nice selectors, so our initial
      // selector grabs the whole page, then we're re-writing
      // it to fit our needs before we clean it up.
      '.permalink[role=main]': ($node, $) => {
        const tweets = $node.find('.tweet');
        // NOTE(review): the container markup was stripped in the dump this
        // was recovered from; "<div id=\"TWEETS_GO_HERE\"></div>" matches the
        // upstream mercury-parser source — confirm against the repository.
        const $tweetContainer = $('<div id="TWEETS_GO_HERE"></div>');
        $tweetContainer.append(tweets);
        $node.replaceWith($tweetContainer);
      },

      // Twitter wraps @ with s, which
      // renders as a strikethrough
      s: 'span',
    },

    selectors: ['.permalink[role=main]'],

    defaultCleaner: false,

    clean: ['.stream-item-footer', 'button', '.tweet-details-fixer'],
  },

  author: {
    selectors: ['.tweet.permalink-tweet .username'],
  },

  date_published: {
    selectors: [['.permalink-tweet ._timestamp[data-time-ms]', 'data-time-ms']],
  },
};

// --- src/extractors/custom/twitter.com/index.test.js ---
import assert from 'assert';

import Mercury from 'mercury';

const fs = require('fs');

describe('TwitterExtractor', () => {
  it('works with a feature story', async () => {
    const html = fs.readFileSync('./fixtures/twitter.com.html');
    const uri = 'https://twitter.com/KingBeyonceStan/status/745276948213968896';

    const { title, author, date_published } = await Mercury.parse(uri, {
      html,
    });

    assert.equal(title, 'Lina Morgana on Twitter');
    assert.equal(author, '@KingBeyonceStan');
    assert.equal(date_published, '2016-06-21T15:27:25.000Z');
  });
});

// --- src/extractors/custom/uproxx.com/index.js ---
// Extractor configuration for Uproxx; image blocks are rewritten into
// figure/figcaption pairs.
export const UproxxComExtractor = {
  domain: 'uproxx.com',

  title: {
    selectors: ['div.entry-header h1'],
  },

  author: {
    selectors: [['meta[name="qc:author"]', 'value']],
  },

  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']],
  },

  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },

  content: {
    selectors: ['.entry-content'],

    transforms: {
      'div.image': 'figure',
      'div.image .wp-media-credit': 'figcaption',
    },

    clean: [],
  },
};

// --- src/extractors/custom/weekly.ascii.jp/index.js ---
// Extractor configuration for Weekly ASCII (Japanese dates).
export const WeeklyAsciiJpExtractor = {
  domain: 'weekly.ascii.jp',

  title: {
    selectors: ['article h1', 'h1[itemprop="headline"]'],
  },

  author: {
    selectors: ['p.author'],
  },

  date_published: {
    selectors: ['p.date', ['meta[name="odate"]', 'value']],

    format: 'YYYY年MM月DD日 HH:mm',

    timezone: 'Asia/Tokyo',
  },

  dek: null,

  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },

  content: {
    selectors: ['div#contents_detail', 'div.article'],

    transforms: {},

    clean: [],
  },
};

// --- src/extractors/custom/wikipedia.org/index.js ---
// Extractor configuration for Wikipedia. The infobox is converted into a
// figure with a caption; editing chrome and navboxes are stripped.
export const WikipediaExtractor = {
  domain: 'wikipedia.org',
  content: {
    selectors: ['#mw-content-text'],

    defaultCleaner: false,

    // transform top infobox to an image with caption
    transforms: {
      '.infobox img': $node => {
        const $parent = $node.parents('.infobox');
        // Only prepend the first image in .infobox
        if ($parent.children('img').length === 0) {
          $parent.prepend($node);
        }
      },
      '.infobox caption': 'figcaption',
      '.infobox': 'figure',
    },

    // Selectors to remove from the extracted content
    clean: [
      '.mw-editsection',
      'figure tr, figure td, figure tbody',
      '#toc',
      '.navbox',
    ],
  },

  author: 'Wikipedia Contributors',

  title: {
    selectors: ['h2.title'],
  },

  date_published: {
    selectors: ['#footer-info-lastmod'],
  },
};

// --- src/extractors/custom/wired.jp/index.js ---
import URL from 'url';

// Extractor configuration for WIRED Japan. Lazy-loaded images carry their
// real source in data-original, resolved against the placeholder src.
export const WiredJpExtractor = {
  domain: 'wired.jp',

  title: {
    selectors: ['h1[data-testid="ContentHeaderHed"]', 'h1.post-title'],
  },

  author: {
    selectors: [
      ['meta[name="article:author"]', 'value'],
      'p[itemprop="author"]',
    ],
  },

  date_published: {
    selectors: [
      ['meta[name="article:published_time"]', 'value'],
      ['time', 'datetime'],
    ],
  },

  dek: {
    selectors: ['div[class^="ContentHeaderDek"]', '.post-intro'],
  },

  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },

  content: {
    selectors: [
      'div[data-attribute-verso-pattern="article-body"]',
      'article.article-detail',
    ],

    transforms: {
      'img[data-original]': $node => {
        const dataOriginal = $node.attr('data-original');
        const src = $node.attr('src');
        const url = URL.resolve(src, dataOriginal);
        $node.attr('src', url);
      },
    },

    clean: ['.post-category', 'time', 'h1.post-title', '.social-area-syncer'],
  },
};

// --- src/extractors/custom/www.al.com/index.js ---
// Extractor configuration for AL.com.
export const WwwAlComExtractor = {
  domain: 'www.al.com',

  title: {
    selectors: [['meta[name="title"]', 'value']],
  },

  author: {
    selectors: [['meta[name="article_author"]', 'value']],
  },

  date_published: {
    selectors: [['meta[name="article_date_original"]', 'value']],
    timezone: 'EST',
  },

  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },

  content: {
    selectors: ['.entry-content'],

    transforms: {},

    clean: [],
  },
};

// --- src/extractors/custom/www.americanow.com/index.js ---
// Extractor configuration for America Now.
export const WwwAmericanowComExtractor = {
  domain: 'www.americanow.com',

  title: {
    selectors: ['.title', ['meta[name="title"]', 'value']],
  },

  author: {
    selectors: ['.byline'],
  },

  date_published: {
    selectors: [['meta[name="publish_date"]', 'value']],
  },

  dek: {
    // No dek selectors identified for this site yet.
    selectors: [],
  },

  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },

  content: {
    selectors: [['.article-content', '.image', '.body'], '.body'],

    transforms: {},

    clean: ['.article-video-wrapper', '.show-for-small-only'],
  },
};

// --- src/extractors/custom/www.androidcentral.com/index.js ---
// Extractor configuration for Android Central.
export const WwwAndroidcentralComExtractor = {
  domain: 'www.androidcentral.com',

  title: {
    selectors: ['h1', 'h1.main-title'],
  },

  author: {
    selectors: [['meta[name="parsely-author"]', 'value']],
  },

  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']],
  },

  dek: {
    selectors: [['meta[name="description"]', 'value']],
  },

  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },

  content: {
    selectors: ['#article-body'],

    transforms: {},

    clean: ['.intro', 'blockquote'],
  },
};
36 | // The clean selectors will remove anything that matches from 37 | // the result 38 | clean: [], 39 | }, 40 | }; 41 | -------------------------------------------------------------------------------- /src/extractors/custom/www.asahi.com/index.js: -------------------------------------------------------------------------------- 1 | export const WwwAsahiComExtractor = { 2 | domain: 'www.asahi.com', 3 | 4 | title: { 5 | selectors: ['main h1', '.ArticleTitle h1'], 6 | }, 7 | 8 | author: { 9 | selectors: [['meta[name="article:author"]', 'value']], 10 | }, 11 | 12 | date_published: { 13 | selectors: [['meta[name="pubdate"]', 'value']], 14 | }, 15 | 16 | dek: null, 17 | 18 | excerpt: { 19 | selectors: [['meta[name="og:description"]', 'value']], 20 | }, 21 | 22 | lead_image_url: { 23 | selectors: [['meta[name="og:image"]', 'value']], 24 | }, 25 | 26 | content: { 27 | selectors: ['main'], 28 | 29 | defaultCleaner: false, 30 | 31 | transforms: {}, 32 | 33 | clean: ['div.AdMod', 'div.LoginSelectArea', 'time', 'div.notPrint'], 34 | }, 35 | }; 36 | -------------------------------------------------------------------------------- /src/extractors/custom/www.broadwayworld.com/index.js: -------------------------------------------------------------------------------- 1 | // Rename CustomExtractor 2 | // to fit your publication 3 | // (e.g., NYTimesExtractor) 4 | export const BroadwayWorldExtractor = { 5 | domain: 'www.broadwayworld.com', 6 | title: { 7 | selectors: ['h1[itemprop=headline]', 'h1.article-title'], 8 | }, 9 | 10 | author: { 11 | selectors: ['span[itemprop=author]'], 12 | }, 13 | 14 | content: { 15 | selectors: ['div[itemprop=articlebody]'], 16 | 17 | // Is there anything in the content you selected that needs transformed 18 | // before it's consumable content? E.g., unusual lazy loaded images 19 | transforms: {}, 20 | 21 | // Is there anything that is in the result that shouldn't be? 
22 | // The clean selectors will remove anything that matches from 23 | // the result 24 | clean: [], 25 | }, 26 | 27 | date_published: { 28 | selectors: [['meta[itemprop=datePublished]', 'value']], 29 | }, 30 | 31 | lead_image_url: { 32 | selectors: [['meta[name="og:image"]', 'value']], 33 | }, 34 | 35 | dek: { 36 | selectors: [], 37 | }, 38 | 39 | next_page_url: { 40 | selectors: [ 41 | // enter selectors 42 | ], 43 | }, 44 | 45 | excerpt: { 46 | selectors: [ 47 | // enter selectors 48 | ], 49 | }, 50 | }; 51 | -------------------------------------------------------------------------------- /src/extractors/custom/www.bustle.com/index.js: -------------------------------------------------------------------------------- 1 | export const WwwBustleComExtractor = { 2 | domain: 'www.bustle.com', 3 | 4 | title: { 5 | selectors: ['h1', 'h1.post-page__title'], 6 | }, 7 | 8 | author: { 9 | selectors: ['a[href*="profile"]', 'div.content-meta__author'], 10 | }, 11 | 12 | date_published: { 13 | selectors: [['time', 'datetime']], 14 | }, 15 | 16 | lead_image_url: { 17 | selectors: [['meta[name="og:image"]', 'value']], 18 | }, 19 | 20 | content: { 21 | selectors: ['article', '.post-page__body'], 22 | 23 | // Is there anything in the content you selected that needs transformed 24 | // before it's consumable content? E.g., unusual lazy loaded images 25 | transforms: {}, 26 | 27 | // Is there anything that is in the result that shouldn't be? 
28 | // The clean selectors will remove anything that matches from 29 | // the result 30 | clean: [], 31 | }, 32 | }; 33 | -------------------------------------------------------------------------------- /src/extractors/custom/www.cbc.ca/index.js: -------------------------------------------------------------------------------- 1 | export const WwwCbcCaExtractor = { 2 | domain: 'www.cbc.ca', 3 | 4 | title: { 5 | selectors: ['h1'], 6 | }, 7 | 8 | author: { 9 | selectors: ['.authorText', '.bylineDetails'], 10 | }, 11 | 12 | date_published: { 13 | selectors: [['.timeStamp[datetime]', 'datetime']], 14 | }, 15 | 16 | dek: { 17 | selectors: ['.deck'], 18 | }, 19 | 20 | lead_image_url: { 21 | selectors: [['meta[name="og:image"]', 'value']], 22 | }, 23 | 24 | content: { 25 | selectors: ['.story'], 26 | 27 | // Is there anything in the content you selected that needs transformed 28 | // before it's consumable content? E.g., unusual lazy loaded images 29 | transforms: {}, 30 | 31 | // Is there anything that is in the result that shouldn't be? 
32 | // The clean selectors will remove anything that matches from 33 | // the result 34 | clean: [], 35 | }, 36 | }; 37 | -------------------------------------------------------------------------------- /src/extractors/custom/www.cbssports.com/index.js: -------------------------------------------------------------------------------- 1 | export const WwwCbssportsComExtractor = { 2 | domain: 'www.cbssports.com', 3 | 4 | title: { 5 | selectors: ['.Article-headline', '.article-headline'], 6 | }, 7 | 8 | author: { 9 | selectors: ['.ArticleAuthor-nameText', '.author-name'], 10 | }, 11 | 12 | date_published: { 13 | selectors: [['meta[itemprop="datePublished"]', 'value']], 14 | timezone: 'UTC', 15 | }, 16 | 17 | dek: { 18 | selectors: ['.Article-subline', '.article-subline'], 19 | }, 20 | 21 | lead_image_url: { 22 | selectors: [['meta[name="og:image"]', 'value']], 23 | }, 24 | 25 | content: { 26 | selectors: ['.article'], 27 | 28 | // Is there anything in the content you selected that needs transformed 29 | // before it's consumable content? E.g., unusual lazy loaded images 30 | transforms: {}, 31 | 32 | // Is there anything that is in the result that shouldn't be? 
33 | // The clean selectors will remove anything that matches from 34 | // the result 35 | clean: [], 36 | }, 37 | }; 38 | -------------------------------------------------------------------------------- /src/extractors/custom/www.chicagotribune.com/index.js: -------------------------------------------------------------------------------- 1 | export const WwwChicagotribuneComExtractor = { 2 | domain: 'www.chicagotribune.com', 3 | 4 | title: { 5 | selectors: [['meta[name="og:title"]', 'value']], 6 | }, 7 | 8 | author: { 9 | selectors: ['div.article_byline span:first-of-type'], 10 | }, 11 | 12 | date_published: { 13 | selectors: ['time'], 14 | }, 15 | 16 | lead_image_url: { 17 | selectors: [['meta[name="og:image"]', 'value']], 18 | }, 19 | 20 | content: { 21 | selectors: ['article'], 22 | 23 | // Is there anything in the content you selected that needs transformed 24 | // before it's consumable content? E.g., unusual lazy loaded images 25 | transforms: {}, 26 | 27 | // Is there anything that is in the result that shouldn't be? 
28 | // The clean selectors will remove anything that matches from 29 | // the result 30 | clean: [], 31 | }, 32 | }; 33 | -------------------------------------------------------------------------------- /src/extractors/custom/www.cnbc.com/index.js: -------------------------------------------------------------------------------- 1 | export const WwwCnbcComExtractor = { 2 | domain: 'www.cnbc.com', 3 | 4 | title: { 5 | selectors: ['h1.title', 'h1.ArticleHeader-headline'], 6 | }, 7 | 8 | author: { 9 | selectors: [['meta[name="author"]', 'value']], 10 | }, 11 | 12 | date_published: { 13 | selectors: [['meta[name="article:published_time"]', 'value']], 14 | }, 15 | 16 | lead_image_url: { 17 | selectors: [['meta[name="og:image"]', 'value']], 18 | }, 19 | 20 | content: { 21 | selectors: [ 22 | 'div#article_body.content', 23 | 'div.story', 24 | 'div.ArticleBody-articleBody', 25 | ], 26 | 27 | // Is there anything in the content you selected that needs transformed 28 | // before it's consumable content? E.g., unusual lazy loaded images 29 | transforms: {}, 30 | 31 | // Is there anything that is in the result that shouldn't be? 
32 | // The clean selectors will remove anything that matches from 33 | // the result 34 | clean: [], 35 | }, 36 | }; 37 | -------------------------------------------------------------------------------- /src/extractors/custom/www.dmagazine.com/index.js: -------------------------------------------------------------------------------- 1 | export const WwwDmagazineComExtractor = { 2 | domain: 'www.dmagazine.com', 3 | 4 | title: { 5 | selectors: ['h1.story__title'], 6 | }, 7 | 8 | author: { 9 | selectors: ['.story__info .story__info__item:first-child'], 10 | }, 11 | 12 | date_published: { 13 | selectors: [ 14 | // enter selectors 15 | '.story__info', 16 | ], 17 | 18 | timezone: 'America/Chicago', 19 | format: 'MMMM D, YYYY h:mm a', 20 | }, 21 | 22 | dek: { 23 | selectors: ['.story__subhead'], 24 | }, 25 | 26 | lead_image_url: { 27 | selectors: [['article figure a:first-child', 'href']], 28 | }, 29 | 30 | content: { 31 | selectors: ['.story__content'], 32 | 33 | // Is there anything in the content you selected that needs transformed 34 | // before it's consumable content? E.g., unusual lazy loaded images 35 | transforms: {}, 36 | 37 | // Is there anything that is in the result that shouldn't be? 
38 | // The clean selectors will remove anything that matches from 39 | // the result 40 | clean: [], 41 | }, 42 | }; 43 | -------------------------------------------------------------------------------- /src/extractors/custom/www.elecom.co.jp/index.js: -------------------------------------------------------------------------------- 1 | export const WwwElecomCoJpExtractor = { 2 | domain: 'www.elecom.co.jp', 3 | 4 | title: { 5 | selectors: ['title'], 6 | }, 7 | 8 | author: null, 9 | 10 | date_published: { 11 | selectors: ['p.section-last'], 12 | format: 'YYYY.MM.DD', 13 | timezone: 'Asia/Tokyo', 14 | }, 15 | 16 | dek: null, 17 | 18 | lead_image_url: null, 19 | 20 | content: { 21 | selectors: ['td.TableMain2'], 22 | 23 | defaultCleaner: false, 24 | 25 | transforms: { 26 | table: $node => { 27 | $node.attr('width', 'auto'); 28 | }, 29 | }, 30 | 31 | clean: [], 32 | }, 33 | }; 34 | -------------------------------------------------------------------------------- /src/extractors/custom/www.eonline.com/index.js: -------------------------------------------------------------------------------- 1 | export const WwwEonlineComExtractor = { 2 | domain: 'www.eonline.com', 3 | 4 | title: { 5 | selectors: ['h1.article-detail__title', 'h1.article__title'], 6 | }, 7 | 8 | author: { 9 | selectors: ['.article-detail__meta__author', '.entry-meta__author a'], 10 | }, 11 | 12 | date_published: { 13 | selectors: [ 14 | ['meta[name="article:published_time"]', 'value'], 15 | ['meta[itemprop="datePublished"]', 'value'], 16 | ], 17 | }, 18 | 19 | lead_image_url: { 20 | selectors: [['meta[name="og:image"]', 'value']], 21 | }, 22 | 23 | content: { 24 | selectors: [ 25 | ['.article-detail__main-content section'], 26 | ['.post-content section, .post-content div.post-content__image'], 27 | ], 28 | 29 | // Is there anything in the content you selected that needs transformed 30 | // before it's consumable content? 
E.g., unusual lazy loaded images 31 | transforms: { 32 | 'div.post-content__image': 'figure', 33 | 'div.post-content__image .image__credits': 'figcaption', 34 | }, 35 | 36 | // Is there anything that is in the result that shouldn't be? 37 | // The clean selectors will remove anything that matches from 38 | // the result 39 | clean: [], 40 | }, 41 | }; 42 | -------------------------------------------------------------------------------- /src/extractors/custom/www.fastcompany.com/index.js: -------------------------------------------------------------------------------- 1 | export const WwwFastcompanyComExtractor = { 2 | domain: 'www.fastcompany.com', 3 | 4 | title: { 5 | selectors: ['h1'], 6 | }, 7 | 8 | author: { 9 | selectors: [['meta[name="author"]', 'value']], 10 | }, 11 | 12 | date_published: { 13 | selectors: [['meta[name="article:published_time"]', 'value']], 14 | }, 15 | 16 | dek: { 17 | selectors: ['.post__deck'], 18 | }, 19 | 20 | lead_image_url: { 21 | selectors: [['meta[name="og:image"]', 'value']], 22 | }, 23 | 24 | content: { 25 | selectors: ['.post__article'], 26 | }, 27 | }; 28 | -------------------------------------------------------------------------------- /src/extractors/custom/www.fortinet.com/index.js: -------------------------------------------------------------------------------- 1 | export const WwwFortinetComExtractor = { 2 | domain: 'www.fortinet.com', 3 | 4 | title: { 5 | selectors: ['h1'], 6 | }, 7 | 8 | author: { 9 | selectors: ['.b15-blog-meta__author'], 10 | }, 11 | 12 | date_published: { 13 | selectors: [['meta[name="article:published_time"]', 'value']], 14 | }, 15 | 16 | lead_image_url: { 17 | selectors: [['meta[name="og:image"]', 'value']], 18 | }, 19 | 20 | content: { 21 | selectors: [ 22 | 'div.responsivegrid.aem-GridColumn.aem-GridColumn--default--12', 23 | ], 24 | 25 | transforms: { 26 | noscript: $node => { 27 | const $children = $node.children(); 28 | if ($children.length === 1 && $children.get(0).tagName === 'img') { 29 | 
return 'figure'; 30 | } 31 | return null; 32 | }, 33 | }, 34 | }, 35 | }; 36 | -------------------------------------------------------------------------------- /src/extractors/custom/www.gizmodo.jp/index.js: -------------------------------------------------------------------------------- 1 | export const WwwGizmodoJpExtractor = { 2 | domain: 'www.gizmodo.jp', 3 | 4 | title: { 5 | selectors: ['h1.p-post-title'], 6 | }, 7 | 8 | author: { 9 | selectors: ['li.p-post-AssistAuthor'], 10 | }, 11 | 12 | date_published: { 13 | selectors: [['li.p-post-AssistTime time', 'datetime']], 14 | }, 15 | 16 | dek: null, 17 | 18 | lead_image_url: { 19 | selectors: [['meta[name="og:image"]', 'value']], 20 | }, 21 | 22 | content: { 23 | selectors: ['article.p-post'], 24 | 25 | transforms: { 26 | 'img.p-post-thumbnailImage': $node => { 27 | const src = $node.attr('src'); 28 | $node.attr('src', src.replace(/^.*=%27/, '').replace(/%27;$/, '')); 29 | }, 30 | }, 31 | 32 | clean: ['h1.p-post-title', 'ul.p-post-Assist'], 33 | }, 34 | }; 35 | -------------------------------------------------------------------------------- /src/extractors/custom/www.gruene.de/index.js: -------------------------------------------------------------------------------- 1 | export const WwwGrueneDeExtractor = { 2 | domain: 'www.gruene.de', 3 | 4 | title: { 5 | selectors: ['header h1'], 6 | }, 7 | 8 | author: null, 9 | 10 | date_published: null, 11 | 12 | dek: null, 13 | 14 | lead_image_url: { 15 | selectors: [['meta[property="og:image"]', 'content']], 16 | }, 17 | 18 | content: { 19 | // selectors: ['section'], 20 | selectors: [['section header', 'section h2', 'section p', 'section ol']], 21 | 22 | // Is there anything in the content you selected that needs transformed 23 | // before it's consumable content? E.g., unusual lazy loaded images 24 | transforms: {}, 25 | 26 | // Is there anything that is in the result that shouldn't be? 
27 | // The clean selectors will remove anything that matches from 28 | // the result 29 | clean: ['figcaption', 'p[class]'], 30 | }, 31 | }; 32 | -------------------------------------------------------------------------------- /src/extractors/custom/www.infoq.com/index.js: -------------------------------------------------------------------------------- 1 | export const WwwInfoqComExtractor = { 2 | domain: 'www.infoq.com', 3 | 4 | title: { 5 | selectors: ['h1.heading'], 6 | }, 7 | 8 | author: { 9 | selectors: ['div.widget.article__authors'], 10 | }, 11 | 12 | date_published: { 13 | selectors: ['.article__readTime.date'], 14 | format: 'YYYY年MM月DD日', 15 | timezone: 'Asia/Tokyo', 16 | }, 17 | 18 | dek: { 19 | selectors: [['meta[name="og:description"]', 'value']], 20 | }, 21 | 22 | lead_image_url: { 23 | selectors: [['meta[name="og:image"]', 'value']], 24 | }, 25 | 26 | content: { 27 | selectors: ['div.article__data'], 28 | 29 | defaultCleaner: false, 30 | 31 | transforms: {}, 32 | 33 | clean: [], 34 | }, 35 | }; 36 | -------------------------------------------------------------------------------- /src/extractors/custom/www.inquisitr.com/index.js: -------------------------------------------------------------------------------- 1 | export const WwwInquisitrComExtractor = { 2 | domain: 'www.inquisitr.com', 3 | 4 | title: { 5 | selectors: ['h1.entry-title.story--header--title'], 6 | }, 7 | 8 | author: { 9 | selectors: ['div.story--header--author'], 10 | }, 11 | 12 | date_published: { 13 | selectors: [['meta[name="datePublished"]', 'value']], 14 | }, 15 | 16 | lead_image_url: { 17 | selectors: [['meta[name="og:image"]', 'value']], 18 | }, 19 | 20 | content: { 21 | selectors: ['article.story', '.entry-content.'], 22 | 23 | // Is there anything in the content you selected that needs transformed 24 | // before it's consumable content? E.g., unusual lazy loaded images 25 | transforms: {}, 26 | 27 | // Is there anything that is in the result that shouldn't be? 
28 | // The clean selectors will remove anything that matches from 29 | // the result 30 | clean: [ 31 | '.post-category', 32 | '.story--header--socials', 33 | '.story--header--content', 34 | ], 35 | }, 36 | }; 37 | -------------------------------------------------------------------------------- /src/extractors/custom/www.investmentexecutive.com/index.js: -------------------------------------------------------------------------------- 1 | export const WwwInvestmentexecutiveComExtractor = { 2 | domain: 'www.investmentexecutive.com', 3 | 4 | title: { 5 | selectors: ['h1'], 6 | }, 7 | 8 | author: { 9 | selectors: ['div[itemprop="author"]'], 10 | }, 11 | 12 | date_published: { 13 | selectors: [['meta[itemprop="datePublished"]', 'value']], 14 | }, 15 | 16 | dek: { 17 | selectors: [['meta[name="og:description"]', 'value']], 18 | }, 19 | 20 | lead_image_url: { 21 | selectors: [['meta[name="og:image"]', 'value']], 22 | }, 23 | 24 | content: { 25 | selectors: ['section.article-body'], 26 | 27 | clean: ['.hidden'], 28 | }, 29 | }; 30 | -------------------------------------------------------------------------------- /src/extractors/custom/www.ipa.go.jp/index.js: -------------------------------------------------------------------------------- 1 | export const WwwIpaGoJpExtractor = { 2 | domain: 'www.ipa.go.jp', 3 | 4 | title: { 5 | selectors: ['h1'], 6 | }, 7 | 8 | author: null, 9 | 10 | date_published: { 11 | selectors: ['p.ipar_text_right'], 12 | format: 'YYYY年M月D日', 13 | timezone: 'Asia/Tokyo', 14 | }, 15 | 16 | dek: null, 17 | 18 | lead_image_url: null, 19 | 20 | content: { 21 | selectors: ['#ipar_main'], 22 | 23 | defaultCleaner: false, 24 | 25 | transforms: {}, 26 | 27 | clean: ['p.ipar_text_right'], 28 | }, 29 | }; 30 | -------------------------------------------------------------------------------- /src/extractors/custom/www.itmedia.co.jp/index.js: -------------------------------------------------------------------------------- 1 | export const WwwItmediaCoJpExtractor 
= { 2 | domain: 'www.itmedia.co.jp', 3 | 4 | supportedDomains: [ 5 | 'www.atmarkit.co.jp', 6 | 'techtarget.itmedia.co.jp', 7 | 'nlab.itmedia.co.jp', 8 | ], 9 | 10 | title: { 11 | selectors: ['#cmsTitle h1'], 12 | }, 13 | 14 | author: { 15 | selectors: ['#byline'], 16 | }, 17 | 18 | date_published: { 19 | selectors: [['meta[name="article:modified_time"]', 'value']], 20 | }, 21 | 22 | dek: { 23 | selectors: ['#cmsAbstract h2'], 24 | }, 25 | 26 | lead_image_url: { 27 | selectors: [['meta[name="og:image"]', 'value']], 28 | }, 29 | 30 | content: { 31 | selectors: ['#cmsBody'], 32 | 33 | defaultCleaner: false, 34 | 35 | transforms: {}, 36 | 37 | clean: ['#snsSharebox'], 38 | }, 39 | }; 40 | -------------------------------------------------------------------------------- /src/extractors/custom/www.jnsa.org/index.js: -------------------------------------------------------------------------------- 1 | export const WwwJnsaOrgExtractor = { 2 | domain: 'www.jnsa.org', 3 | 4 | title: { 5 | selectors: ['#wgtitle h2'], 6 | }, 7 | 8 | author: null, 9 | 10 | date_published: null, 11 | 12 | dek: null, 13 | 14 | excerpt: { 15 | selectors: [['meta[name="og:description"]', 'value']], 16 | }, 17 | 18 | lead_image_url: { 19 | selectors: [['meta[name="og:image"]', 'value']], 20 | }, 21 | 22 | content: { 23 | selectors: ['#main_area'], 24 | 25 | transforms: {}, 26 | 27 | clean: ['#pankuzu', '#side'], 28 | }, 29 | }; 30 | -------------------------------------------------------------------------------- /src/extractors/custom/www.ladbible.com/index.js: -------------------------------------------------------------------------------- 1 | export const WwwLadbibleComExtractor = { 2 | domain: 'www.ladbible.com', 3 | 4 | title: { 5 | selectors: ['h1'], 6 | }, 7 | 8 | author: { 9 | selectors: ['[class*=Byline]'], 10 | }, 11 | 12 | date_published: { 13 | selectors: ['time'], 14 | timezone: 'Europe/London', 15 | }, 16 | 17 | lead_image_url: { 18 | selectors: [['meta[name="og:image"]', 'value']], 19 | 
}, 20 | 21 | content: { 22 | selectors: ['[class*=ArticleContainer]'], 23 | clean: [ 24 | 'time', 25 | 'source', 26 | 'a[href^="https://www.ladbible.com/"]', 27 | 'picture', 28 | '[class*=StyledCardBlock]', 29 | ], 30 | }, 31 | }; 32 | -------------------------------------------------------------------------------- /src/extractors/custom/www.latimes.com/index.js: -------------------------------------------------------------------------------- 1 | export const WwwLatimesComExtractor = { 2 | domain: 'www.latimes.com', 3 | 4 | title: { 5 | selectors: ['h1.headline', '.trb_ar_hl'], 6 | }, 7 | 8 | author: { 9 | selectors: [ 10 | 'a[data-click="standardBylineAuthorName"]', 11 | ['meta[name="author"]', 'value'], 12 | ], 13 | }, 14 | 15 | date_published: { 16 | selectors: [ 17 | ['meta[name="article:published_time"]', 'value'], 18 | ['meta[itemprop="datePublished"]', 'value'], 19 | ], 20 | }, 21 | 22 | lead_image_url: { 23 | selectors: [['meta[name="og:image"]', 'value']], 24 | }, 25 | 26 | content: { 27 | selectors: ['.page-article-body', '.trb_ar_main'], 28 | 29 | // Is there anything in the content you selected that needs transformed 30 | // before it's consumable content? E.g., unusual lazy loaded images 31 | transforms: { 32 | '.trb_ar_la': $node => { 33 | const $figure = $node.find('figure'); 34 | $node.replaceWith($figure); 35 | }, 36 | }, 37 | 38 | // Is there anything that is in the result that shouldn't be? 
39 | // The clean selectors will remove anything that matches from 40 | // the result 41 | clean: ['.trb_ar_by', '.trb_ar_cr'], 42 | }, 43 | }; 44 | -------------------------------------------------------------------------------- /src/extractors/custom/www.lemonde.fr/index.js: -------------------------------------------------------------------------------- 1 | export const WwwLemondeFrExtractor = { 2 | domain: 'www.lemonde.fr', 3 | 4 | title: { 5 | selectors: ['h1.article__title'], 6 | }, 7 | 8 | author: { 9 | selectors: ['.author__name'], 10 | }, 11 | 12 | date_published: { 13 | selectors: [['meta[name="og:article:published_time"]', 'value']], 14 | }, 15 | 16 | dek: { 17 | selectors: ['.article__desc'], 18 | }, 19 | 20 | lead_image_url: { 21 | selectors: [['meta[name="og:image"]', 'value']], 22 | }, 23 | 24 | content: { 25 | selectors: ['.article__content'], 26 | 27 | transforms: {}, 28 | 29 | clean: ['figcaption'], 30 | }, 31 | }; 32 | -------------------------------------------------------------------------------- /src/extractors/custom/www.lifehacker.jp/index.js: -------------------------------------------------------------------------------- 1 | export const WwwLifehackerJpExtractor = { 2 | domain: 'www.lifehacker.jp', 3 | 4 | title: { 5 | selectors: ['h1[class^="article_pArticle_Title"]', 'h1.lh-summary-title'], 6 | }, 7 | 8 | author: { 9 | selectors: [ 10 | ['meta[name="author"]', 'value'], 11 | 'p.lh-entryDetailInner--credit', 12 | ], 13 | }, 14 | 15 | date_published: { 16 | selectors: [ 17 | ['meta[name="article:published_time"]', 'value'], 18 | ['div.lh-entryDetail-header time', 'datetime'], 19 | ], 20 | }, 21 | 22 | dek: null, 23 | 24 | lead_image_url: { 25 | selectors: [['meta[name="og:image"]', 'value']], 26 | }, 27 | 28 | content: { 29 | selectors: [ 30 | 'div[class^="article_pArticle_Body__"]', 31 | 'div.lh-entryDetail-body', 32 | ], 33 | 34 | transforms: { 35 | 'img.lazyload': $node => { 36 | const src = $node.attr('src'); 37 | $node.attr('src', 
src.replace(/^.*=%27/, '').replace(/%27;$/, '')); 38 | }, 39 | }, 40 | 41 | clean: ['p.lh-entryDetailInner--credit'], 42 | }, 43 | }; 44 | -------------------------------------------------------------------------------- /src/extractors/custom/www.macrumors.com/index.js: -------------------------------------------------------------------------------- 1 | export const WwwMacrumorsComExtractor = { 2 | domain: 'www.macrumors.com', 3 | 4 | title: { 5 | selectors: ['h1', 'h1.title'], 6 | }, 7 | 8 | author: { 9 | selectors: ['article a[rel="author"]', '.author-url'], 10 | }, 11 | 12 | date_published: { 13 | selectors: [['time', 'datetime']], 14 | 15 | timezone: 'America/Los_Angeles', 16 | }, 17 | 18 | dek: { 19 | selectors: [['meta[name="description"]', 'value']], 20 | }, 21 | 22 | lead_image_url: { 23 | selectors: [['meta[name="og:image"]', 'value']], 24 | }, 25 | 26 | content: { 27 | selectors: ['article', '.article'], 28 | 29 | // Is there anything in the content you selected that needs transformed 30 | // before it's consumable content? E.g., unusual lazy loaded images 31 | transforms: {}, 32 | 33 | // Is there anything that is in the result that shouldn't be? 
34 | // The clean selectors will remove anything that matches from 35 | // the result 36 | clean: [], 37 | }, 38 | }; 39 | -------------------------------------------------------------------------------- /src/extractors/custom/www.mentalfloss.com/index.js: -------------------------------------------------------------------------------- 1 | export const WwwMentalflossComExtractor = { 2 | domain: 'www.mentalfloss.com', 3 | 4 | title: { 5 | selectors: [ 6 | ['meta[name="og:title"]', 'value'], 7 | 'h1.title', 8 | '.title-group', 9 | '.inner', 10 | ], 11 | }, 12 | 13 | author: { 14 | selectors: [ 15 | 'a[data-vars-label*="authors"]', 16 | '.field-name-field-enhanced-authors', 17 | ], 18 | }, 19 | 20 | date_published: { 21 | selectors: [ 22 | ['meta[name="article:published_time"]', 'value'], 23 | '.date-display-single', 24 | ], 25 | timezone: 'America/New_York', 26 | }, 27 | 28 | lead_image_url: { 29 | selectors: [['meta[name="og:image"]', 'value']], 30 | }, 31 | 32 | content: { 33 | selectors: ['article main', 'div.field.field-name-body'], 34 | 35 | // Is there anything in the content you selected that needs transformed 36 | // before it's consumable content? E.g., unusual lazy loaded images 37 | transforms: {}, 38 | 39 | // Is there anything that is in the result that shouldn't be? 
40 | // The clean selectors will remove anything that matches from 41 | // the result 42 | clean: ['small'], 43 | }, 44 | }; 45 | -------------------------------------------------------------------------------- /src/extractors/custom/www.miamiherald.com/index.js: -------------------------------------------------------------------------------- 1 | export const WwwMiamiheraldComExtractor = { 2 | domain: 'www.miamiherald.com', 3 | 4 | title: { 5 | selectors: ['h1.title'], 6 | }, 7 | 8 | date_published: { 9 | selectors: ['p.published-date'], 10 | 11 | timezone: 'America/New_York', 12 | }, 13 | 14 | lead_image_url: { 15 | selectors: [['meta[name="og:image"]', 'value']], 16 | }, 17 | 18 | content: { 19 | selectors: ['div.dateline-storybody'], 20 | 21 | // Is there anything in the content you selected that needs transformed 22 | // before it's consumable content? E.g., unusual lazy loaded images 23 | transforms: {}, 24 | 25 | // Is there anything that is in the result that shouldn't be? 26 | // The clean selectors will remove anything that matches from 27 | // the result 28 | clean: [], 29 | }, 30 | }; 31 | -------------------------------------------------------------------------------- /src/extractors/custom/www.moongift.jp/index.js: -------------------------------------------------------------------------------- 1 | export const WwwMoongiftJpExtractor = { 2 | domain: 'www.moongift.jp', 3 | 4 | title: { 5 | selectors: ['h1.title a'], 6 | }, 7 | 8 | author: null, 9 | 10 | date_published: { 11 | selectors: ['ul.meta li:not(.social):first-of-type'], 12 | timezone: 'Asia/Tokyo', 13 | }, 14 | 15 | dek: { 16 | selectors: [['meta[name="og:description"]', 'value']], 17 | }, 18 | 19 | lead_image_url: { 20 | selectors: [['meta[name="og:image"]', 'value']], 21 | }, 22 | 23 | content: { 24 | selectors: ['#main'], 25 | 26 | transforms: {}, 27 | 28 | clean: ['ul.mg_service.cf'], 29 | }, 30 | }; 31 | -------------------------------------------------------------------------------- 
/src/extractors/custom/www.msn.com/index.js: -------------------------------------------------------------------------------- 1 | // Rename CustomExtractor 2 | // to fit your publication 3 | // (e.g., NYTimesExtractor) 4 | export const MSNExtractor = { 5 | domain: 'www.msn.com', 6 | title: { 7 | selectors: [ 8 | 'h1', 9 | // enter title selectors 10 | ], 11 | }, 12 | 13 | author: { 14 | selectors: [ 15 | 'span.authorname-txt', 16 | // enter author selectors 17 | ], 18 | }, 19 | 20 | content: { 21 | selectors: [ 22 | 'div.richtext', 23 | // enter content selectors 24 | ], 25 | 26 | // Is there anything in the content you selected that needs transformed 27 | // before it's consumable content? E.g., unusual lazy loaded images 28 | transforms: [], 29 | 30 | // Is there anything that is in the result that shouldn't be? 31 | // The clean selectors will remove anything that matches from 32 | // the result 33 | clean: ['span.caption'], 34 | }, 35 | 36 | date_published: { 37 | selectors: ['span.time'], 38 | }, 39 | 40 | lead_image_url: { 41 | selectors: [], 42 | }, 43 | 44 | dek: { 45 | selectors: [], 46 | }, 47 | 48 | next_page_url: null, 49 | 50 | excerpt: null, 51 | }; 52 | -------------------------------------------------------------------------------- /src/extractors/custom/www.nbcnews.com/index.js: -------------------------------------------------------------------------------- 1 | export const WwwNbcnewsComExtractor = { 2 | domain: 'www.nbcnews.com', 3 | 4 | title: { 5 | selectors: ['div.article-hero-headline h1', 'div.article-hed h1'], 6 | }, 7 | 8 | author: { 9 | selectors: [ 10 | 'div.article-inline-byline span.byline-name', 11 | 'span.byline_author', 12 | ], 13 | }, 14 | 15 | date_published: { 16 | selectors: [ 17 | ['meta[name="article:published"]', 'value'], 18 | ['.flag_article-wrapper time.timestamp_article[datetime]', 'datetime'], 19 | '.flag_article-wrapper time', 20 | ], 21 | 22 | timezone: 'America/New_York', 23 | }, 24 | 25 | lead_image_url: { 26 | 
selectors: [['meta[name="og:image"]', 'value']], 27 | }, 28 | 29 | content: { 30 | selectors: ['div.article-body__content', 'div.article-body'], 31 | 32 | // Is there anything in the content you selected that needs transformed 33 | // before it's consumable content? E.g., unusual lazy loaded images 34 | transforms: {}, 35 | 36 | // Is there anything that is in the result that shouldn't be? 37 | // The clean selectors will remove anything that matches from 38 | // the result 39 | clean: [], 40 | }, 41 | }; 42 | -------------------------------------------------------------------------------- /src/extractors/custom/www.npr.org/index.js: -------------------------------------------------------------------------------- 1 | export const WwwNprOrgExtractor = { 2 | domain: 'www.npr.org', 3 | 4 | title: { 5 | selectors: ['h1', '.storytitle'], 6 | }, 7 | 8 | author: { 9 | selectors: ['p.byline__name.byline__name--block'], 10 | }, 11 | 12 | date_published: { 13 | selectors: [ 14 | ['.dateblock time[datetime]', 'datetime'], 15 | ['meta[name="date"]', 'value'], 16 | ], 17 | }, 18 | 19 | lead_image_url: { 20 | selectors: [ 21 | ['meta[name="og:image"]', 'value'], 22 | ['meta[name="twitter:image:src"]', 'value'], 23 | ], 24 | }, 25 | 26 | content: { 27 | selectors: ['.storytext'], 28 | 29 | // Is there anything in the content you selected that needs transformed 30 | // before it's consumable content? E.g., unusual lazy loaded images 31 | transforms: { 32 | '.bucketwrap.image': 'figure', 33 | '.bucketwrap.image .credit-caption': 'figcaption', 34 | }, 35 | 36 | // Is there anything that is in the result that shouldn't be? 
37 | // The clean selectors will remove anything that matches from 38 | // the result 39 | clean: ['div.enlarge_measure'], 40 | }, 41 | }; 42 | -------------------------------------------------------------------------------- /src/extractors/custom/www.nydailynews.com/index.js: -------------------------------------------------------------------------------- 1 | export const WwwNydailynewsComExtractor = { 2 | domain: 'www.nydailynews.com', 3 | 4 | title: { 5 | selectors: ['h1.headline', 'h1#ra-headline'], 6 | }, 7 | 8 | author: { 9 | selectors: [ 10 | '.article_byline span', 11 | ['meta[name="parsely-author"]', 'value'], 12 | ], 13 | }, 14 | 15 | date_published: { 16 | selectors: ['time', ['meta[name="sailthru.date"]', 'value']], 17 | }, 18 | 19 | lead_image_url: { 20 | selectors: [['meta[name="og:image"]', 'value']], 21 | }, 22 | 23 | content: { 24 | selectors: ['article', 'article#ra-body'], 25 | 26 | // Is there anything in the content you selected that needs transformed 27 | // before it's consumable content? E.g., unusual lazy loaded images 28 | transforms: {}, 29 | 30 | // Is there anything that is in the result that shouldn't be? 
31 | // The clean selectors will remove anything that matches from 32 | // the result 33 | clean: ['dl#ra-tags', '.ra-related', 'a.ra-editor', 'dl#ra-share-bottom'], 34 | }, 35 | }; 36 | -------------------------------------------------------------------------------- /src/extractors/custom/www.opposingviews.com/index.js: -------------------------------------------------------------------------------- 1 | export const WwwOpposingviewsComExtractor = { 2 | domain: 'www.opposingviews.com', 3 | 4 | title: { 5 | selectors: ['h1.m-detail-header--title', 'h1.title'], 6 | }, 7 | 8 | author: { 9 | selectors: [['meta[name="author"]', 'value'], 'div.date span span a'], 10 | }, 11 | 12 | date_published: { 13 | selectors: [ 14 | ['meta[name="published"]', 'value'], 15 | ['meta[name="publish_date"]', 'value'], 16 | ], 17 | }, 18 | 19 | dek: { 20 | selectors: [ 21 | // enter selectors 22 | ], 23 | }, 24 | 25 | lead_image_url: { 26 | selectors: [['meta[name="og:image"]', 'value']], 27 | }, 28 | 29 | content: { 30 | selectors: ['.m-detail--body', '.article-content'], 31 | 32 | // Is there anything in the content you selected that needs transformed 33 | // before it's consumable content? E.g., unusual lazy loaded images 34 | transforms: {}, 35 | 36 | // Is there anything that is in the result that shouldn't be? 
37 | // The clean selectors will remove anything that matches from 38 | // the result 39 | clean: ['.show-for-small-only'], 40 | }, 41 | }; 42 | -------------------------------------------------------------------------------- /src/extractors/custom/www.oreilly.co.jp/index.js: -------------------------------------------------------------------------------- 1 | export const WwwOreillyCoJpExtractor = { 2 | domain: 'www.oreilly.co.jp', 3 | 4 | title: { 5 | selectors: [['meta[name="og:title"]', 'value'], 'h3'], 6 | }, 7 | 8 | author: { 9 | selectors: ['span[itemprop="author"]', 'li[itemprop="author"]'], 10 | }, 11 | 12 | date_published: { 13 | selectors: [ 14 | ['dd[itemprop="datePublished"]', 'content'], 15 | ['meta[itemprop="datePublished"]', 'value'], 16 | ], 17 | timezone: 'Asia/Tokyo', 18 | }, 19 | 20 | dek: null, 21 | 22 | lead_image_url: { 23 | selectors: [ 24 | ['meta[name="og:image:secure_url"]', 'value'], 25 | ['meta[name="og:image"]', 'value'], 26 | ], 27 | }, 28 | 29 | content: { 30 | selectors: ['section.detail', '#content'], 31 | 32 | defaultCleaner: false, 33 | 34 | transforms: {}, 35 | 36 | clean: ['.social-tools'], 37 | }, 38 | }; 39 | -------------------------------------------------------------------------------- /src/extractors/custom/www.ossnews.jp/index.js: -------------------------------------------------------------------------------- 1 | export const WwwOssnewsJpExtractor = { 2 | domain: 'www.ossnews.jp', 3 | 4 | title: { 5 | selectors: ['#alpha-block h1.hxnewstitle'], 6 | }, 7 | 8 | author: null, 9 | 10 | date_published: { 11 | selectors: ['p.fs12'], 12 | format: 'YYYY年MM月DD日 HH:mm', 13 | timezone: 'Asia/Tokyo', 14 | }, 15 | 16 | dek: null, 17 | 18 | lead_image_url: { 19 | selectors: [['meta[name="og:image"]', 'value']], 20 | }, 21 | 22 | content: { 23 | selectors: ['#alpha-block .section:has(h1.hxnewstitle)'], 24 | 25 | defaultCleaner: false, 26 | 27 | transforms: {}, 28 | 29 | clean: [], 30 | }, 31 | }; 32 | 
-------------------------------------------------------------------------------- /src/extractors/custom/www.phoronix.com/index.js: -------------------------------------------------------------------------------- 1 | export const WwwPhoronixComExtractor = { 2 | domain: 'www.phoronix.com', 3 | 4 | title: { 5 | selectors: ['article h1', 'article header'], 6 | }, 7 | 8 | author: { 9 | selectors: ['.author a:first-child'], 10 | }, 11 | 12 | date_published: { 13 | selectors: ['.author'], 14 | // 1 June 2019 at 08:34 PM EDT 15 | format: 'D MMMM YYYY at hh:mm', 16 | timezone: 'America/New_York', 17 | }, 18 | 19 | dek: null, 20 | 21 | lead_image_url: null, 22 | 23 | content: { 24 | selectors: ['.content'], 25 | 26 | // Is there anything in the content you selected that needs transformed 27 | // before it's consumable content? E.g., unusual lazy loaded images 28 | transforms: {}, 29 | 30 | // Is there anything that is in the result that shouldn't be? 31 | // The clean selectors will remove anything that matches from 32 | // the result 33 | clean: [], 34 | }, 35 | }; 36 | -------------------------------------------------------------------------------- /src/extractors/custom/www.politico.com/index.js: -------------------------------------------------------------------------------- 1 | export const PoliticoExtractor = { 2 | domain: 'www.politico.com', 3 | title: { 4 | selectors: [['meta[name="og:title"]', 'value']], 5 | }, 6 | 7 | author: { 8 | selectors: [ 9 | ['div[itemprop="author"] meta[itemprop="name"]', 'value'], 10 | '.story-meta__authors .vcard', 11 | '.story-main-content .byline .vcard', 12 | ], 13 | }, 14 | 15 | content: { 16 | selectors: [['.story-text'], '.story-main-content', '.story-core'], 17 | 18 | transforms: [], 19 | 20 | clean: ['figcaption', '.story-meta', '.ad'], 21 | }, 22 | 23 | date_published: { 24 | selectors: [ 25 | ['time[itemprop="datePublished"]', 'datetime'], 26 | ['.story-meta__details time[datetime]', 'datetime'], 27 | ['.story-main-content 
.timestamp time[datetime]', 'datetime'], 28 | ], 29 | timezone: 'America/New_York', 30 | }, 31 | 32 | lead_image_url: { 33 | selectors: [['meta[name="og:image"]', 'value']], 34 | }, 35 | 36 | dek: { 37 | selectors: [['meta[name="og:description"]', 'value']], 38 | }, 39 | }; 40 | -------------------------------------------------------------------------------- /src/extractors/custom/www.popsugar.com/index.js: -------------------------------------------------------------------------------- 1 | export const WwwPopsugarComExtractor = { 2 | domain: 'www.popsugar.com', 3 | 4 | title: { 5 | selectors: ['h2.post-title', 'title-text'], 6 | }, 7 | 8 | author: { 9 | selectors: [['meta[name="article:author"]', 'value']], 10 | }, 11 | 12 | date_published: { 13 | selectors: [['meta[name="article:published_time"]', 'value']], 14 | }, 15 | 16 | lead_image_url: { 17 | selectors: [['meta[name="og:image"]', 'value']], 18 | }, 19 | 20 | content: { 21 | selectors: ['#content'], 22 | 23 | // Is there anything in the content you selected that needs transformed 24 | // before it's consumable content? E.g., unusual lazy loaded images 25 | transforms: {}, 26 | 27 | // Is there anything that is in the result that shouldn't be? 
28 | // The clean selectors will remove anything that matches from 29 | // the result 30 | clean: ['.share-copy-title', '.post-tags', '.reactions'], 31 | }, 32 | }; 33 | -------------------------------------------------------------------------------- /src/extractors/custom/www.prospectmagazine.co.uk/index.js: -------------------------------------------------------------------------------- 1 | export const WwwProspectmagazineCoUkExtractor = { 2 | domain: 'www.prospectmagazine.co.uk', 3 | 4 | title: { 5 | selectors: ['.blog-header__title', '.page-title'], 6 | }, 7 | 8 | author: { 9 | selectors: ['.blog-header__author-link', '.aside_author .title'], 10 | }, 11 | 12 | date_published: { 13 | selectors: [['meta[name="article:published_time"]', 'value'], '.post-info'], 14 | 15 | timezone: 'Europe/London', 16 | }, 17 | 18 | dek: { 19 | selectors: ['.blog-header__description', '.page-subtitle'], 20 | }, 21 | 22 | lead_image_url: { 23 | selectors: [['meta[name="og:image"]', 'value']], 24 | }, 25 | 26 | content: { 27 | selectors: ['.blog__container', 'article .post_content'], 28 | 29 | // Is there anything in the content you selected that needs transformed 30 | // before it's consumable content? E.g., unusual lazy loaded images 31 | transforms: {}, 32 | 33 | // Is there anything that is in the result that shouldn't be? 
34 | // The clean selectors will remove anything that matches from 35 | // the result 36 | clean: [], 37 | }, 38 | }; 39 | -------------------------------------------------------------------------------- /src/extractors/custom/www.publickey1.jp/index.js: -------------------------------------------------------------------------------- 1 | export const WwwPublickey1JpExtractor = { 2 | domain: 'www.publickey1.jp', 3 | 4 | title: { 5 | selectors: ['h1'], 6 | }, 7 | 8 | author: { 9 | selectors: ['.bloggerinchief p:first-of-type', '#subcol p:has(img)'], 10 | }, 11 | 12 | date_published: { 13 | selectors: ['div.pubdate'], 14 | format: 'YYYY年MM月DD日', 15 | timezone: 'Asia/Tokyo', 16 | }, 17 | 18 | dek: null, 19 | 20 | lead_image_url: { 21 | selectors: [['meta[name="og:image"]', 'value']], 22 | }, 23 | 24 | content: { 25 | selectors: ['#maincol'], 26 | 27 | defaultCleaner: false, 28 | 29 | transforms: {}, 30 | 31 | clean: ['#breadcrumbs', 'div.sbm', 'div.ad_footer'], 32 | }, 33 | }; 34 | -------------------------------------------------------------------------------- /src/extractors/custom/www.qdaily.com/index.js: -------------------------------------------------------------------------------- 1 | export const WwwQdailyComExtractor = { 2 | domain: 'www.qdaily.com', 3 | 4 | title: { 5 | selectors: ['h2', 'h2.title'], 6 | }, 7 | 8 | author: { 9 | selectors: ['.name'], 10 | }, 11 | 12 | date_published: { 13 | selectors: [['.date.smart-date', 'data-origindate']], 14 | }, 15 | 16 | dek: { 17 | selectors: ['.excerpt'], 18 | }, 19 | 20 | lead_image_url: { 21 | selectors: [['.article-detail-hd img', 'src']], 22 | }, 23 | 24 | content: { 25 | selectors: ['.detail'], 26 | 27 | // Is there anything in the content you selected that needs transformed 28 | // before it's consumable content? E.g., unusual lazy loaded images 29 | transforms: {}, 30 | 31 | // Is there anything that is in the result that shouldn't be? 
32 | // The clean selectors will remove anything that matches from 33 | // the result 34 | clean: ['.lazyload', '.lazylad', '.lazylood'], 35 | }, 36 | }; 37 | -------------------------------------------------------------------------------- /src/extractors/custom/www.rawstory.com/index.js: -------------------------------------------------------------------------------- 1 | export const WwwRawstoryComExtractor = { 2 | domain: 'www.rawstory.com', 3 | 4 | title: { 5 | selectors: [['meta[name="og:title"]', 'value'], '.blog-title'], 6 | }, 7 | 8 | author: { 9 | selectors: [ 10 | 'div.main-post-head .social-author__name', 11 | '.blog-author a:first-of-type', 12 | ], 13 | }, 14 | 15 | date_published: { 16 | selectors: [ 17 | ['meta[name="article:published_time"]', 'value'], 18 | '.blog-author a:last-of-type', 19 | ], 20 | 21 | timezone: 'EST', 22 | }, 23 | 24 | lead_image_url: { 25 | selectors: [['meta[name="og:image"]', 'value']], 26 | }, 27 | 28 | content: { 29 | selectors: ['.post-body', '.blog-content'], 30 | 31 | // Is there anything in the content you selected that needs transformed 32 | // before it's consumable content? E.g., unusual lazy loaded images 33 | transforms: {}, 34 | 35 | // Is there anything that is in the result that shouldn't be? 
36 | // The clean selectors will remove anything that matches from 37 | // the result 38 | clean: [], 39 | }, 40 | }; 41 | -------------------------------------------------------------------------------- /src/extractors/custom/www.rbbtoday.com/index.js: -------------------------------------------------------------------------------- 1 | export const WwwRbbtodayComExtractor = { 2 | domain: 'www.rbbtoday.com', 3 | 4 | title: { 5 | selectors: ['h1'], 6 | }, 7 | 8 | author: { 9 | selectors: ['.writer.writer-name'], 10 | }, 11 | 12 | date_published: { 13 | selectors: [['header time', 'datetime']], 14 | }, 15 | 16 | dek: { 17 | selectors: [['meta[name="description"]', 'value'], '.arti-summary'], 18 | }, 19 | 20 | lead_image_url: { 21 | selectors: [['meta[name="og:image"]', 'value']], 22 | }, 23 | 24 | content: { 25 | selectors: ['.arti-content'], 26 | 27 | transforms: {}, 28 | 29 | clean: ['.arti-giga'], 30 | }, 31 | }; 32 | -------------------------------------------------------------------------------- /src/extractors/custom/www.recode.net/index.js: -------------------------------------------------------------------------------- 1 | export const WwwRecodeNetExtractor = { 2 | domain: 'www.recode.net', 3 | 4 | title: { 5 | selectors: ['h1.c-page-title'], 6 | }, 7 | 8 | author: { 9 | selectors: [['meta[name="author"]', 'value']], 10 | }, 11 | 12 | date_published: { 13 | selectors: [['meta[name="article:published_time"]', 'value']], 14 | }, 15 | 16 | dek: { 17 | selectors: ['h2.c-entry-summary.p-dek'], 18 | }, 19 | 20 | lead_image_url: { 21 | selectors: [['meta[name="og:image"]', 'value']], 22 | }, 23 | 24 | content: { 25 | selectors: [ 26 | ['figure.e-image--hero', '.c-entry-content'], 27 | '.c-entry-content', 28 | ], 29 | 30 | // Is there anything in the content you selected that needs transformed 31 | // before it's consumable content? E.g., unusual lazy loaded images 32 | transforms: {}, 33 | 34 | // Is there anything that is in the result that shouldn't be? 
35 | // The clean selectors will remove anything that matches from 36 | // the result 37 | clean: [], 38 | }, 39 | }; 40 | -------------------------------------------------------------------------------- /src/extractors/custom/www.reuters.com/index.js: -------------------------------------------------------------------------------- 1 | export const WwwReutersComExtractor = { 2 | domain: 'www.reuters.com', 3 | 4 | title: { 5 | selectors: ['h1[class*="ArticleHeader-headline-"]', 'h1.article-headline'], 6 | }, 7 | 8 | author: { 9 | selectors: [['meta[name="og:article:author"]', 'value'], '.author'], 10 | }, 11 | 12 | date_published: { 13 | selectors: [['meta[name="og:article:published_time"]', 'value']], 14 | }, 15 | 16 | lead_image_url: { 17 | selectors: [['meta[name="og:image"]', 'value']], 18 | }, 19 | 20 | content: { 21 | selectors: ['div.ArticleBodyWrapper', '#article-text'], 22 | 23 | // Is there anything in the content you selected that needs transformed 24 | // before it's consumable content? E.g., unusual lazy loaded images 25 | transforms: { 26 | '.article-subtitle': 'h4', 27 | }, 28 | 29 | // Is there anything that is in the result that shouldn't be? 
30 | // The clean selectors will remove anything that matches from 31 | // the result 32 | clean: [ 33 | 'div[class^="ArticleBody-byline-container-"]', 34 | '#article-byline .author', 35 | ], 36 | }, 37 | }; 38 | -------------------------------------------------------------------------------- /src/extractors/custom/www.sanwa.co.jp/index.js: -------------------------------------------------------------------------------- 1 | export const WwwSanwaCoJpExtractor = { 2 | domain: 'www.sanwa.co.jp', 3 | 4 | title: { 5 | selectors: ['#newsContent h1'], 6 | }, 7 | 8 | author: null, 9 | 10 | date_published: { 11 | selectors: ['dl.date'], 12 | format: 'YYYY.MM.DD', 13 | timezone: 'Asia/Tokyo', 14 | }, 15 | 16 | dek: { 17 | selectors: [['meta[name="og:description"]', 'value']], 18 | }, 19 | 20 | lead_image_url: { 21 | selectors: [['meta[name="og:image"]', 'value']], 22 | }, 23 | 24 | content: { 25 | selectors: ['#newsContent'], 26 | 27 | defaultCleaner: false, 28 | 29 | transforms: {}, 30 | 31 | clean: ['#smartphone', 'div.sns_box', 'div.contentFoot'], 32 | }, 33 | }; 34 | -------------------------------------------------------------------------------- /src/extractors/custom/www.sbnation.com/index.js: -------------------------------------------------------------------------------- 1 | export const WwwSbnationComExtractor = { 2 | domain: 'www.sbnation.com', 3 | 4 | title: { 5 | selectors: ['h1.c-page-title'], 6 | }, 7 | 8 | author: { 9 | selectors: [['meta[name="author"]', 'value']], 10 | }, 11 | 12 | date_published: { 13 | selectors: [['meta[name="article:published_time"]', 'value']], 14 | }, 15 | 16 | dek: { 17 | selectors: ['p.c-entry-summary.p-dek', 'h2.c-entry-summary.p-dek'], 18 | }, 19 | 20 | lead_image_url: { 21 | selectors: [['meta[name="og:image"]', 'value']], 22 | }, 23 | 24 | content: { 25 | selectors: ['div.c-entry-content'], 26 | 27 | // Is there anything in the content you selected that needs transformed 28 | // before it's consumable content? 
E.g., unusual lazy loaded images 29 | transforms: {}, 30 | 31 | // Is there anything that is in the result that shouldn't be? 32 | // The clean selectors will remove anything that matches from 33 | // the result 34 | clean: [], 35 | }, 36 | }; 37 | -------------------------------------------------------------------------------- /src/extractors/custom/www.slate.com/index.js: -------------------------------------------------------------------------------- 1 | export const WwwSlateComExtractor = { 2 | domain: 'www.slate.com', 3 | 4 | title: { 5 | selectors: ['.hed', 'h1'], 6 | }, 7 | 8 | author: { 9 | selectors: ['a[rel=author]'], 10 | }, 11 | 12 | date_published: { 13 | selectors: ['.pub-date'], 14 | 15 | timezone: 'America/New_York', 16 | }, 17 | 18 | dek: { 19 | selectors: ['.dek'], 20 | }, 21 | 22 | lead_image_url: { 23 | selectors: [['meta[name="og:image"]', 'value']], 24 | }, 25 | 26 | content: { 27 | selectors: ['.body'], 28 | 29 | // Is there anything in the content you selected that needs transformed 30 | // before it's consumable content? E.g., unusual lazy loaded images 31 | transforms: {}, 32 | 33 | // Is there anything that is in the result that shouldn't be? 
34 | // The clean selectors will remove anything that matches from 35 | // the result 36 | clean: [ 37 | '.about-the-author', 38 | '.pullquote', 39 | '.newsletter-signup-component', 40 | '.top-comment', 41 | ], 42 | }, 43 | }; 44 | -------------------------------------------------------------------------------- /src/extractors/custom/www.theguardian.com/index.js: -------------------------------------------------------------------------------- 1 | export const WwwTheguardianComExtractor = { 2 | domain: 'www.theguardian.com', 3 | 4 | title: { 5 | selectors: ['h1', '.content__headline'], 6 | }, 7 | 8 | author: { 9 | selectors: ['address[data-link-name="byline"]', 'p.byline'], 10 | }, 11 | 12 | date_published: { 13 | selectors: [['meta[name="article:published_time"]', 'value']], 14 | }, 15 | 16 | dek: { 17 | selectors: ['div[data-gu-name="standfirst"]', '.content__standfirst'], 18 | }, 19 | 20 | lead_image_url: { 21 | selectors: [['meta[name="og:image"]', 'value']], 22 | }, 23 | 24 | content: { 25 | selectors: ['#maincontent', '.content__article-body'], 26 | 27 | // Is there anything in the content you selected that needs transformed 28 | // before it's consumable content? E.g., unusual lazy loaded images 29 | transforms: {}, 30 | 31 | // Is there anything that is in the result that shouldn't be? 
32 | // The clean selectors will remove anything that matches from 33 | // the result 34 | clean: ['.hide-on-mobile', '.inline-icon'], 35 | }, 36 | }; 37 | -------------------------------------------------------------------------------- /src/extractors/custom/www.thepennyhoarder.com/index.js: -------------------------------------------------------------------------------- 1 | export const WwwThepennyhoarderComExtractor = { 2 | domain: 'www.thepennyhoarder.com', 3 | 4 | title: { 5 | selectors: [['meta[name="dcterms.title"]', 'value']], 6 | }, 7 | 8 | author: { 9 | selectors: [['link[rel="author"]', 'title']], 10 | }, 11 | 12 | date_published: { 13 | selectors: [['meta[name="article:published_time"]', 'value']], 14 | }, 15 | 16 | lead_image_url: { 17 | selectors: [['meta[name="og:image"]', 'value']], 18 | }, 19 | 20 | content: { 21 | selectors: [ 22 | ['.post-img', '.post-text'], 23 | '.post-text', 24 | '.single-post-content-inner', 25 | ], 26 | 27 | // Is there anything in the content you selected that needs transformed 28 | // before it's consumable content? E.g., unusual lazy loaded images 29 | transforms: {}, 30 | 31 | // Is there anything that is in the result that shouldn't be? 
32 | // The clean selectors will remove anything that matches from 33 | // the result 34 | clean: [], 35 | }, 36 | }; 37 | -------------------------------------------------------------------------------- /src/extractors/custom/www.thepoliticalinsider.com/index.js: -------------------------------------------------------------------------------- 1 | export const WwwThepoliticalinsiderComExtractor = { 2 | domain: 'www.thepoliticalinsider.com', 3 | 4 | title: { 5 | selectors: [['meta[name="sailthru.title"]', 'value']], 6 | }, 7 | 8 | author: { 9 | selectors: [['meta[name="sailthru.author"]', 'value']], 10 | }, 11 | 12 | date_published: { 13 | selectors: [['meta[name="sailthru.date"]', 'value']], 14 | timezone: 'America/New_York', 15 | }, 16 | 17 | dek: { 18 | selectors: [ 19 | // enter selectors 20 | ], 21 | }, 22 | 23 | lead_image_url: { 24 | selectors: [ 25 | ['meta[name="og:image"]', 'value'], // enter selectors 26 | ], 27 | }, 28 | 29 | content: { 30 | selectors: ['div#article-body'], 31 | 32 | // Is there anything in the content you selected that needs transformed 33 | // before it's consumable content? E.g., unusual lazy loaded images 34 | transforms: {}, 35 | 36 | // Is there anything that is in the result that shouldn't be? 
37 | // The clean selectors will remove anything that matches from 38 | // the result 39 | clean: [], 40 | }, 41 | }; 42 | -------------------------------------------------------------------------------- /src/extractors/custom/www.tmz.com/index.js: -------------------------------------------------------------------------------- 1 | export const WwwTmzComExtractor = { 2 | domain: 'www.tmz.com', 3 | 4 | title: { 5 | selectors: ['.post-title-breadcrumb', 'h1', '.headline'], 6 | }, 7 | 8 | author: 'TMZ STAFF', 9 | 10 | date_published: { 11 | selectors: ['.article__published-at', '.article-posted-date'], 12 | 13 | timezone: 'America/Los_Angeles', 14 | }, 15 | 16 | dek: { 17 | selectors: [ 18 | // enter selectors 19 | ], 20 | }, 21 | 22 | lead_image_url: { 23 | selectors: [['meta[name="og:image"]', 'value']], 24 | }, 25 | 26 | content: { 27 | selectors: ['.article__blocks', '.article-content', '.all-post-body'], 28 | 29 | // Is there anything in the content you selected that needs transformed 30 | // before it's consumable content? E.g., unusual lazy loaded images 31 | transforms: {}, 32 | 33 | // Is there anything that is in the result that shouldn't be? 
34 | // The clean selectors will remove anything that matches from 35 | // the result 36 | clean: ['.lightbox-link'], 37 | }, 38 | }; 39 | -------------------------------------------------------------------------------- /src/extractors/custom/www.today.com/index.js: -------------------------------------------------------------------------------- 1 | export const WwwTodayComExtractor = { 2 | domain: 'www.today.com', 3 | 4 | title: { 5 | selectors: ['h1.article-hero-headline__htag', 'h1.entry-headline'], 6 | }, 7 | 8 | author: { 9 | selectors: ['span.byline-name', ['meta[name="author"]', 'value']], 10 | }, 11 | 12 | date_published: { 13 | selectors: ['time[datetime]', ['meta[name="DC.date.issued"]', 'value']], 14 | }, 15 | 16 | lead_image_url: { 17 | selectors: [['meta[name="og:image"]', 'value']], 18 | }, 19 | 20 | content: { 21 | selectors: ['div.article-body__content', '.entry-container'], 22 | 23 | // Is there anything in the content you selected that needs transformed 24 | // before it's consumable content? E.g., unusual lazy loaded images 25 | transforms: {}, 26 | 27 | // Is there anything that is in the result that shouldn't be? 
28 | // The clean selectors will remove anything that matches from 29 | // the result 30 | clean: ['.label-comment'], 31 | }, 32 | }; 33 | -------------------------------------------------------------------------------- /src/extractors/custom/www.usmagazine.com/index.js: -------------------------------------------------------------------------------- 1 | export const WwwUsmagazineComExtractor = { 2 | domain: 'www.usmagazine.com', 3 | 4 | title: { 5 | selectors: ['header h1'], 6 | }, 7 | 8 | author: { 9 | selectors: ['a.author', 'a.article-byline.tracked-offpage'], 10 | }, 11 | 12 | date_published: { 13 | timezone: 'America/New_York', 14 | 15 | selectors: [['meta[name="article:published_time"]', 'value']], 16 | }, 17 | 18 | lead_image_url: { 19 | selectors: [['meta[name="og:image"]', 'value']], 20 | }, 21 | 22 | content: { 23 | selectors: ['div.article-content'], 24 | 25 | // Is there anything in the content you selected that needs transformed 26 | // before it's consumable content? E.g., unusual lazy loaded images 27 | transforms: {}, 28 | 29 | // Is there anything that is in the result that shouldn't be? 
30 | // The clean selectors will remove anything that matches from 31 | // the result 32 | clean: ['.module-related'], 33 | }, 34 | }; 35 | -------------------------------------------------------------------------------- /src/extractors/custom/www.westernjournalism.com/index.js: -------------------------------------------------------------------------------- 1 | export const WwwWesternjournalismComExtractor = { 2 | domain: 'www.westernjournalism.com', 3 | 4 | title: { 5 | selectors: ['title', 'h1.entry-title'], 6 | }, 7 | 8 | author: { 9 | selectors: [['meta[name="author"]', 'value']], 10 | }, 11 | 12 | date_published: { 13 | selectors: [['meta[name="DC.date.issued"]', 'value']], 14 | }, 15 | 16 | dek: { 17 | selectors: ['.subtitle'], 18 | }, 19 | 20 | lead_image_url: { 21 | selectors: [['meta[name="og:image"]', 'value']], 22 | }, 23 | 24 | content: { 25 | selectors: ['div.article-sharing.top + div'], 26 | 27 | // Is there anything in the content you selected that needs transformed 28 | // before it's consumable content? E.g., unusual lazy loaded images 29 | transforms: {}, 30 | 31 | // Is there anything that is in the result that shouldn't be? 
32 | // The clean selectors will remove anything that matches from 33 | // the result 34 | clean: ['.ad-notice-small'], 35 | }, 36 | }; 37 | -------------------------------------------------------------------------------- /src/extractors/custom/www.yomiuri.co.jp/index.js: -------------------------------------------------------------------------------- 1 | export const WwwYomiuriCoJpExtractor = { 2 | domain: 'www.yomiuri.co.jp', 3 | 4 | title: { 5 | selectors: ['h1.title-article.c-article-title'], 6 | }, 7 | 8 | author: null, 9 | 10 | date_published: { 11 | selectors: [['meta[name="article:published_time"]', 'value']], 12 | }, 13 | 14 | dek: null, 15 | 16 | lead_image_url: { 17 | selectors: [['meta[name="og:image"]', 'value']], 18 | }, 19 | 20 | content: { 21 | selectors: ['div.p-main-contents'], 22 | 23 | transforms: {}, 24 | 25 | clean: [], 26 | }, 27 | }; 28 | -------------------------------------------------------------------------------- /src/extractors/detect-by-html.js: -------------------------------------------------------------------------------- 1 | import { MediumExtractor, BloggerExtractor } from './custom'; 2 | 3 | const Detectors = { 4 | 'meta[name="al:ios:app_name"][value="Medium"]': MediumExtractor, 5 | 'meta[name="generator"][value="blogger"]': BloggerExtractor, 6 | }; 7 | 8 | export default function detectByHtml($) { 9 | const selector = Reflect.ownKeys(Detectors).find(s => $(s).length > 0); 10 | 11 | return Detectors[selector]; 12 | } 13 | -------------------------------------------------------------------------------- /src/extractors/detect-by-html.test.js: -------------------------------------------------------------------------------- 1 | import assert from 'assert'; 2 | import cheerio from 'cheerio'; 3 | 4 | import detectByHtml from './detect-by-html'; 5 | 6 | describe('detectByHtml', () => { 7 | it('detects a medium post from the html', () => { 8 | const $ = cheerio.load( 9 | '' 10 | ); 11 | 12 | assert.equal(detectByHtml($).domain, 
'medium.com'); 13 | }); 14 | 15 | it('returns nothing if no match is found', () => { 16 | const $ = cheerio.load('
'); 17 | 18 | assert.equal(detectByHtml($), null); 19 | }); 20 | }); 21 | -------------------------------------------------------------------------------- /src/extractors/fixtures/postlight.com/index.js: -------------------------------------------------------------------------------- 1 | var customExtractor = { 2 | domain: 'postlight.com', 3 | title: { 4 | selectors: ['h1'], 5 | }, 6 | author: { 7 | selectors: ['.byline-name'], 8 | }, 9 | content: { 10 | selectors: ['article'], 11 | }, 12 | extend: { 13 | uniqueKeyFromFixture: { 14 | selectors: ['.single__hero-category'], 15 | }, 16 | }, 17 | }; 18 | 19 | module.exports = customExtractor; 20 | -------------------------------------------------------------------------------- /src/extractors/generic/content/extract-best-node.js: -------------------------------------------------------------------------------- 1 | import { stripUnlikelyCandidates, convertToParagraphs } from 'utils/dom'; 2 | 3 | import { scoreContent, findTopCandidate } from './scoring'; 4 | 5 | // Using a variety of scoring techniques, extract the content most 6 | // likely to be article text. 7 | // 8 | // If strip_unlikely_candidates is True, remove any elements that 9 | // match certain criteria first. (Like, does this element have a 10 | // classname of "comment") 11 | // 12 | // If weight_nodes is True, use classNames and IDs to determine the 13 | // worthiness of nodes. 
import { stripUnlikelyCandidates, convertToParagraphs } from 'utils/dom';

import { scoreContent, findTopCandidate } from './scoring';

// Using a variety of scoring techniques, extract the content most
// likely to be article text.
//
// opts.stripUnlikelyCandidates: when true, first drop elements that
// match junk criteria (e.g. a classname of "comment").
//
// opts.weightNodes: when true, use classNames and IDs to weight the
// worthiness of nodes during scoring.
//
// Returns a cheerio selection for the top-scoring candidate node.
export default function extractBestNode($, opts) {
  const { stripUnlikelyCandidates: shouldStrip, weightNodes } = opts;

  let $doc = shouldStrip ? stripUnlikelyCandidates($) : $;
  $doc = convertToParagraphs($doc);
  $doc = scoreContent($doc, weightNodes);

  return findTopCandidate($doc);
}
import { getOrInitScore, setScore } from './index';

// Add `amount` to the node's current score (initializing the score from
// the node's tag type if it has none) and persist the new total.
// Returns the node unchanged for chaining.
export default function addScore($node, $, amount) {
  try {
    const updated = getOrInitScore($node, $) + amount;
    setScore($node, $, updated);
  } catch (err) {
    // Deliberate best-effort: scoreNode can throw on odd nodes; a node
    // that cannot be scored is simply left unscored.
  }

  return $node;
}

Foo

'); 10 | const $node = $('p').first(); 11 | addScore($node, $, 25); 12 | assert.equal(getScore($node), 50); 13 | }); 14 | 15 | it('adds score if score not yet set (assumes score is 0)', () => { 16 | const $ = cheerio.load('

Foo

import { addScore } from './index';

// Propagate a quarter of a child's score up to its parent node, so
// containers of good content accumulate credit. Returns the child node.
export default function addToParent(node, $, score) {
  const $parent = node.parent();
  if ($parent) {
    addScore($parent, $, score / 4);
  }

  return node;
}

Foo

import { NON_TOP_CANDIDATE_TAGS_RE } from './constants';
import { getScore } from './index';
import mergeSiblings from './merge-siblings';

// After we've calculated scores, loop through all of the possible
// candidate nodes we found and find the one with the highest score.
export default function findTopCandidate($) {
  let $candidate;
  let topScore = 0;

  $('[score]').each((index, node) => {
    // Ignore tags like BR, HR, etc
    if (NON_TOP_CANDIDATE_TAGS_RE.test(node.tagName)) {
      return;
    }

    const $node = $(node);
    const score = getScore($node);

    if (score > topScore) {
      topScore = score;
      $candidate = $node;
    }
  });

  // If we don't have a candidate, return the body or, when there is no
  // body element, whatever the first element is.
  // FIX: the previous `$('body') || $('*').first()` never reached the
  // second operand — a cheerio selection object is always truthy even
  // when empty — so body-less documents returned an empty selection.
  if (!$candidate) {
    const $body = $('body');
    return $body.length > 0 ? $body : $('*').first();
  }

  $candidate = mergeSiblings($candidate, topScore, $);

  return $candidate;
}
// Read a node's score from its `score` attribute.
// Returns null when no score has been set (or when the stored score
// parses to 0/NaN, matching the original falsy check).
export default function getScore($node) {
  const parsed = Number.parseFloat($node.attr('score'));
  if (!parsed) {
    return null;
  }
  return parsed;
}

Foo

'); 10 | assert.equal(getScore($('p').first()), null); 11 | }); 12 | 13 | it('returns 25 if the node has a score attr of 25', () => { 14 | const $ = cheerio.load('

Foo

// Award one point per comma in the text (commas correlate with
// sentence-like article prose).
export default function scoreCommas(text) {
  let commas = 0;
  for (const ch of text) {
    if (ch === ',') {
      commas += 1;
    }
  }
  return commas;
}
// Matches p/pre tags, which get a stronger tamp-down (legacy behavior
// carried over from the original readability source).
const P_OR_PRE_RE = /^(p|pre)$/i;

// Award up to 3 bonus points based on text length: roughly one point
// per 50 characters, minus a tag-dependent offset, clamped to [0, 3].
export default function scoreLength(textLength, tagName = 'p') {
  const chunks = textLength / 50;

  if (chunks <= 0) {
    return 0;
  }

  // p/pre are penalized harder than other tags, per the upstream source.
  const offset = P_OR_PRE_RE.test(tagName) ? 2 : 1.25;
  const bonus = chunks - offset;

  return Math.min(Math.max(bonus, 0), 3);
}
import { scoreParagraph } from './index';
import {
  PARAGRAPH_SCORE_TAGS,
  CHILD_CONTENT_TAGS,
  BAD_TAGS,
} from './constants';

// Score an individual node. Paragraph-like tags get a full content
// analysis; every other tag gets a flat score by category.
// (Check order matters and is preserved from the original.)
export default function scoreNode($node) {
  const { tagName } = $node.get(0);

  if (PARAGRAPH_SCORE_TAGS.test(tagName)) {
    return scoreParagraph($node);
  }

  const lowered = tagName.toLowerCase();
  if (lowered === 'div') {
    return 5;
  }
  if (CHILD_CONTENT_TAGS.test(tagName)) {
    return 3;
  }
  if (BAD_TAGS.test(tagName)) {
    return -3;
  }
  if (lowered === 'th') {
    return -5;
  }

  return 0;
}
// Persist `score` on the node as a `score` attribute so later scoring
// passes (getScore/addScore) can read it back. Returns the node for
// chaining. The `$` argument is unused but kept for signature parity
// with the other scoring helpers.
export default function setScore($node, $, score) {
  $node.attr('score', score);
  return $node;
}

Foo

// There is currently no reliable generic selector for deks, so the
// generic extractor always reports "no dek found" (null) until a more
// robust option exists. (The original selector-based source is kept in
// project history for reference.)
const GenericDekExtractor = {
  extract: () => null,
};

export default GenericDekExtractor;
import ellipsize from 'ellipsize';

import { extractFromMeta, stripTags } from 'utils/dom';

import { EXCERPT_META_SELECTORS } from './constants';

// Collapse runs of whitespace to single spaces, trim, and truncate to
// `maxLength` characters with a trailing ellipsis.
export function clean(content, $, maxLength = 200) {
  const collapsed = content.replace(/[\s\n]+/g, ' ').trim();
  return ellipsize(collapsed, maxLength, { ellipse: '…' });
}

// Extract a short excerpt: prefer description meta tags, otherwise
// derive one from the beginning of the extracted content.
const GenericExcerptExtractor = {
  extract({ $, content, metaCache }) {
    const excerpt = extractFromMeta($, EXCERPT_META_SELECTORS, metaCache);
    if (excerpt) {
      return clean(stripTags(excerpt, $));
    }

    // Fall back to excerpting from the extracted content; slicing 5x
    // the max length keeps the text() call cheap on long articles.
    const maxLength = 200;
    const shortContent = content.slice(0, maxLength * 5);
    return clean($(shortContent).text(), $, maxLength);
  },
};

export default GenericExcerptExtractor;
page url', () => { 10 | const html = fs.readFileSync('./fixtures/arstechnica.com.html', 'utf8'); 11 | const $ = cheerio.load(html); 12 | const url = 13 | 'https://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/'; 14 | const next = 15 | 'https://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/2'; 16 | 17 | const nextPage = GenericNextPageUrlExtractor.extract({ 18 | $, 19 | url, 20 | }); 21 | 22 | assert.equal(nextPage, next); 23 | }); 24 | 25 | it('returns null if there is no likely next page', () => { 26 | const html = '

HI

// Penalize links whose href does not contain the article's base URL.
// Such a link could still be the next page, but the odds are lower.
// Example:
// http://www.actionscript.org/resources/articles/745/1/JavaScript-and-VBScript-Injection-in-ActionScript-3/Page1.html
export default function scoreBaseUrl(href, baseRegex) {
  return baseRegex.test(href) ? 0 : -25;
}
10 |
11 | Next page 12 |
13 |
14 | `); 15 | 16 | assert.equal(scoreByParents($('a').first()), 25); 17 | }); 18 | 19 | it('returns -25 if parent sig looks like a comment', () => { 20 | const $ = cheerio.load(` 21 |
22 |
23 | Next page 24 |
25 |
import { NEXT_LINK_TEXT_RE, CAP_LINK_TEXT_RE } from '../constants';

// "Cap" links are terminal links like "last". As implemented, the -65
// penalty fires only when the link text matches BOTH the cap pattern
// and the next pattern; cap-only text scores 0.
// NOTE(review): the upstream comment describes the opposite intent
// ("also next" should be fine), but the unit tests pin this exact
// behavior, so it is preserved as-is.
export default function scoreCapLinks(linkData) {
  const looksLikeCap = CAP_LINK_TEXT_RE.test(linkData);
  const looksLikeNext = NEXT_LINK_TEXT_RE.test(linkData);

  return looksLikeCap && looksLikeNext ? -65 : 0;
}
import { IS_DIGIT_RE } from 'utils/text/constants';

// If the link text parses as a number, give it a minor bonus with a
// bias toward lower-numbered pages, so purely numeric pagination links
// still sort sensibly by score. Non-numeric text scores 0.
export default function scoreLinkText(linkText, pageNum) {
  if (!IS_DIGIT_RE.test(linkText.trim())) {
    return 0;
  }

  const linkPage = parseInt(linkText, 10);

  // Page 1 was already fetched on the first call: strongly penalize it.
  // Otherwise award a small bonus that shrinks up to page 10.
  let score = linkPage < 2 ? -30 : Math.max(0, 10 - linkPage);

  // A link pointing at the current page or earlier is a very bad sign.
  if (pageNum && pageNum >= linkPage) {
    score -= 50;
  }

  return score;
}
// Bonus when the link itself contains a page number. WordPress is
// intentionally excluded: its `?p=123` style matches this check even
// though those are entirely separate documents.
export default function scorePageInLink(pageNum, isWp) {
  return pageNum && !isWp ? 50 : 0;
}
import difflib from 'difflib';

// Only when we already have a real candidate (score > 0) — because the
// diff is computationally expensive — compare the candidate href to the
// article URL and adjust the score by their % similarity.
// Subtracting .1 from the diff when computing the modifier means URLs
// less than 10% different earn a bonus instead of a penalty:
//   3% different  = +17.5 points
//   10% different = 0 points
//   20% different = -25 points
export default function scoreSimilarity(score, articleUrl, href) {
  if (score <= 0) {
    return 0;
  }

  const matcher = new difflib.SequenceMatcher(null, articleUrl, href);
  const diffPercent = 1.0 - matcher.ratio();
  const diffModifier = -(250 * (diffPercent - 0.2));

  return score + diffModifier;
}
// Count words by parsing the HTML with cheerio: grab the first <div>,
// normalize whitespace, and count whitespace-separated tokens.
const getWordCount = content => {
  const $ = cheerio.load(content);
  const text = normalizeSpaces(
    $('div')
      .first()
      .text()
  );
  return text.split(/\s/).length;
};

// Fallback counter: strip tags with regexes instead of a full HTML
// parse, collapse whitespace, then count space-separated tokens.
const getWordCountAlt = content => {
  const stripped = content
    .replace(/<[^>]*>/g, ' ')
    .replace(/\s+/g, ' ')
    .trim();
  return stripped.split(' ').length;
};

const GenericWordCountExtractor = {
  extract({ content }) {
    // A count of exactly 1 usually means the DOM-based counter found no
    // wrapping <div> to read; fall back to the regex-based counter.
    const count = getWordCount(content);
    return count === 1 ? getWordCountAlt(content) : count;
  },
};

export default GenericWordCountExtractor;
10 |

One two three.

11 |

Four five six.

12 |

Seven eight nine.

13 |

Ten eleven twelve.

// Pick the best extractor for a page.
//
// Lookup order: custom extractors registered through the API (exact
// hostname, then base domain), bundled site-specific extractors (same
// order), extractors detected from the page markup, and finally the
// generic extractor as the catch-all.
export default function getExtractor(url, parsedUrl, $) {
  const { hostname } = parsedUrl || URL.parse(url);

  // e.g. "www.example.com" -> "example.com"
  const baseDomain = hostname
    .split('.')
    .slice(-2)
    .join('.');

  const candidates = [
    apiExtractors[hostname],
    apiExtractors[baseDomain],
    Extractors[hostname],
    Extractors[baseDomain],
  ];

  return candidates.find(Boolean) || detectByHtml($) || GenericExtractor;
}
-------------------------------------------------------------------------------- 1 | import assert from 'assert'; 2 | import cheerio from 'cheerio'; 3 | 4 | import clean from './clean'; 5 | 6 | describe('clean($)', () => { 7 | it('removes script elements', () => { 8 | const html = "
"; 9 | const $ = cheerio.load(html); 10 | 11 | assert.equal(clean($).html(), '
'); 12 | }); 13 | 14 | it('removes style elements', () => { 15 | const html = '
'; 16 | const $ = cheerio.load(html); 17 | 18 | assert.equal(clean($).html(), '
'); 19 | }); 20 | 21 | it('removes comments', () => { 22 | const html = '
HI
'; 23 | const $ = cheerio.load(html); 24 | 25 | assert.equal(clean($).html(), '
HI
// Rewrite each `meta[from]` attribute to `to`, preserving its value.
// Returns the same cheerio instance for chaining.
function convertMetaProp($, from, to) {
  $(`meta[${from}]`).each((_, node) => {
    const $node = $(node);

    $node.attr(to, $node.attr(from));
    $node.removeAttr(from);
  });

  return $;
}

// For ease of use in extracting from meta tags, replace the "content"
// attribute on meta tags with the "value" attribute.
//
// In addition, normalize 'property' attributes to 'name' for ease of
// querying later. See, e.g., og or twitter meta tags.
export default function normalizeMetaTags($) {
  return convertMetaProp(
    convertMetaProp($, 'content', 'value'),
    'property',
    'name'
  );
}
'' 22 | : ''; 23 | 24 | const $ = cheerio.load(''); 25 | const result = normalizeMetaTags($).html(); 26 | 27 | assert.equal(result, test); 28 | }); 29 | }); 30 | -------------------------------------------------------------------------------- /src/resource/utils/index.js: -------------------------------------------------------------------------------- 1 | export { default as fetchResource } from './fetch-resource'; 2 | -------------------------------------------------------------------------------- /src/shims/iconv-lite.js: -------------------------------------------------------------------------------- 1 | // this is a shim for the browser build; 2 | // iconv-lite doubles build size, and we 3 | // don't need it for already rendered text 4 | const iconv = { 5 | encodingExists: () => false, 6 | decode: s => s, 7 | }; 8 | 9 | export default iconv; 10 | -------------------------------------------------------------------------------- /src/utils/dom/brs-to-ps.js: -------------------------------------------------------------------------------- 1 | import { paragraphize } from './index'; 2 | 3 | // ## NOTES: 4 | // Another good candidate for refactoring/optimizing. 5 | // Very imperative code, I don't love it. - AP 6 | 7 | // Given cheerio object, convert consecutive
// ## NOTES:
// Another good candidate for refactoring/optimizing.
// Very imperative code, I don't love it. - AP

// Given a cheerio object, collapse runs of consecutive <br> tags into
// <p> tags instead: every <br> in a run except the last is removed,
// and the final one is handed to paragraphize() to become a paragraph.
//
// :param $: A cheerio object
export default function brsToPs($) {
  let inBrRun = false;

  $('br').each((_, br) => {
    const $br = $(br);
    const following = $br.next().get(0);
    const followedByBr =
      following && following.tagName.toLowerCase() === 'br';

    if (followedByBr) {
      // Middle of a run: drop this <br> and keep collapsing.
      inBrRun = true;
      $br.remove();
    } else if (inBrRun) {
      // Last <br> of a run: convert it into a paragraph.
      inBrRun = false;
      paragraphize(br, $, true);
    }
  });

  return $;
}
$article.parent() : $article, 34 | $ 35 | ); 36 | } 37 | -------------------------------------------------------------------------------- /src/utils/dom/clean-attributes.test.js: -------------------------------------------------------------------------------- 1 | import cheerio from 'cheerio'; 2 | 3 | import { assertClean } from 'test-helpers'; 4 | 5 | import { cleanAttributes } from './index'; 6 | 7 | describe('cleanAttributes($)', () => { 8 | it('removes style attributes from nodes', () => { 9 | const $ = cheerio.load(` 10 |

11 |

What do you think?

12 |
13 | `); 14 | 15 | const result = cleanAttributes($('*').first(), $); 16 | assertClean( 17 | $.html(result), 18 | ` 19 |
20 |

What do you think?

21 |
22 | ` 23 | ); 24 | }); 25 | 26 | it('removes align attributes from nodes', () => { 27 | const $ = cheerio.load(` 28 |
29 |

What do you think?

30 |
31 | `); 32 | 33 | const result = cleanAttributes($('*').first(), $); 34 | assertClean( 35 | $.html(result), 36 | ` 37 |
38 |

What do you think?

39 |
40 | ` 41 | ); 42 | }); 43 | }); 44 | -------------------------------------------------------------------------------- /src/utils/dom/clean-h-ones.js: -------------------------------------------------------------------------------- 1 | import { convertNodeTo } from 'utils/dom'; 2 | 3 | // H1 tags are typically the article title, which should be extracted 4 | // by the title extractor instead. If there's less than 3 of them (<3), 5 | // strip them. Otherwise, turn 'em into H2s. 6 | export default function cleanHOnes(article, $) { 7 | const $hOnes = $('h1', article); 8 | 9 | if ($hOnes.length < 3) { 10 | $hOnes.each((index, node) => $(node).remove()); 11 | } else { 12 | $hOnes.each((index, node) => { 13 | convertNodeTo($(node), $, 'h2'); 14 | }); 15 | } 16 | 17 | return $; 18 | } 19 | -------------------------------------------------------------------------------- /src/utils/dom/clean-headers.js: -------------------------------------------------------------------------------- 1 | import { getWeight } from 'extractors/generic/content/scoring'; 2 | 3 | import { HEADER_TAG_LIST } from './constants'; 4 | import { normalizeSpaces } from '../text'; 5 | 6 | export default function cleanHeaders($article, $, title = '') { 7 | $(HEADER_TAG_LIST, $article).each((index, header) => { 8 | const $header = $(header); 9 | // Remove any headers that appear before all other p tags in the 10 | // document. This probably means that it was part of the title, a 11 | // subtitle or something else extraneous like a datestamp or byline, 12 | // all of which should be handled by other metadata handling. 13 | if ($($header, $article).prevAll('p').length === 0) { 14 | return $header.remove(); 15 | } 16 | 17 | // Remove any headers that match the title exactly. 18 | if (normalizeSpaces($(header).text()) === title) { 19 | return $header.remove(); 20 | } 21 | 22 | // If this header has a negative weight, it's probably junk. 23 | // Get rid of it. 
// Strip images that explicitly declare very small dimensions — they
// are most likely shims or icons, which aren't useful for reading.
// For kept images, drop the explicit height so they can scale with
// width without breaking the aspect ratio.
function cleanForHeight($img, $) {
  // Missing/unparseable dimensions fall back to 20 so they're treated
  // as "big enough to keep".
  const rawHeight = parseInt($img.attr('height'), 10);
  const width = parseInt($img.attr('width'), 10) || 20;
  const height = rawHeight || 20;

  if (height < 10 || width < 10) {
    $img.remove();
  } else if (rawHeight) {
    // Never pin a height; let the image scale with its width.
    $img.removeAttr('height');
  }

  return $;
}

// Cleans out images where the source string matches transparent/spacer/etc
// TODO This seems very aggressive - AP
function removeSpacers($img, $) {
  if (SPACER_RE.test($img.attr('src'))) {
    $img.remove();
  }

  return $;
}

export default function cleanImages($article, $) {
  $article.find('img').each((_, img) => {
    const $img = $(img);

    cleanForHeight($img, $);
    removeSpacers($img, $);
  });

  return $;
}
// Return a node's attributes as a plain { name: value } object,
// normalizing across cheerio nodes (which expose a plain `attribs`
// object) and raw DOM/jquery nodes (which expose an array-like
// `attributes` list of { name, value } entries).
export default function getAttrs(node) {
  const { attribs, attributes } = node;

  if (!attribs && attributes) {
    const attrs = Reflect.ownKeys(attributes).reduce((acc, index) => {
      const attr = attributes[index];

      // Skip array-like bookkeeping entries (e.g. `length`, methods on a
      // NamedNodeMap) and anything without a real name/value pair.
      // Note: an empty-string value is a legitimate attribute value
      // (e.g. alt=""), so only null/undefined values are dropped here —
      // the previous `!attr.value` falsy check silently discarded
      // empty-valued attributes, diverging from the cheerio path.
      if (!attr || !attr.name || attr.value == null) return acc;

      acc[attr.name] = attr.value;
      return acc;
    }, {});
    return attrs;
  }

  return attribs;
}
// Length of the text once trimmed and with whitespace runs collapsed
// to single spaces.
export function textLength(text) {
  return text.trim().replace(/\s+/g, ' ').length;
}

// Determines what percentage of the text in a node is link text.
// Takes a node, returns a float in [0, 1].
export function linkDensity($node) {
  const totalLength = textLength($node.text());
  const linkLength = textLength($node.find('a').text());

  if (totalLength > 0) {
    return linkLength / totalLength;
  }

  // No text at all but some link content: treat as fully link.
  return totalLength === 0 && linkLength > 0 ? 1 : 0;
}

Some text!

Some text!

10 | `); 11 | 12 | const density = linkDensity($('div').first(), $); 13 | 14 | assert.equal(density, 0.5); 15 | }); 16 | 17 | it('returns 1 if all of the text is a link', () => { 18 | const $ = cheerio.load(` 19 |

Some text!

20 | `); 21 | 22 | const density = linkDensity($('div').first(), $); 23 | 24 | assert.equal(density, 1); 25 | }); 26 | 27 | it("returns 0 if there's no text", () => { 28 | const $ = cheerio.load(` 29 |

// Given a node, determine if it's article-like enough to return.
// A node qualifies when its trimmed text is at least 100 characters.
// param: $node (a cheerio node)
// return: boolean
export default function nodeIsSufficient($node) {
  const textSize = $node.text().trim().length;
  return textSize >= 100;
}
11 |

This is too short

12 |
13 | `); 14 | 15 | assert.equal(nodeIsSufficient($.root()), false); 16 | }); 17 | 18 | it('returns true if node text length > 100 chars', () => { 19 | const $ = cheerio.load(` 20 |
21 |

22 | Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean m 23 |

24 |
// Given a node, turn it into a P if it is not already a P, and
// make sure it conforms to the constraints of a P tag (i.e. does
// not contain any other block tags).
//
// If the node is a <br>, it treats the following inline siblings
// as if they were its children: they are swept into a fresh <p>
// that replaces the <br> itself.
//
// :param node: The node to paragraphize; this is a raw node
// :param $: The cheerio object to handle dom manipulation
// :param br: Whether or not the passed node is a br
export default function paragraphize(node, $, br = false) {
  const $node = $(node);

  if (!br) {
    return $;
  }

  const $p = $('<p></p>');
  let sibling = node.nextSibling;

  // Sweep following siblings (text or inline elements) into the new
  // paragraph until we hit a block-level element or run out. Capture
  // nextSibling before appendTo, since moving the node detaches it.
  while (
    sibling &&
    !(sibling.tagName && BLOCK_LEVEL_TAGS_RE.test(sibling.tagName))
  ) {
    const { nextSibling } = sibling;
    $(sibling).appendTo($p);
    sibling = nextSibling;
  }

  $node.replaceWith($p);
  $node.remove();
  return $;
}
// Rewrite the tag name to div if it's a top level node like body or
// html to avoid later complications with multiple body tags.
export default function rewriteTopLevel(article, $) {
  // Not using `article` as context here because it's problematic when
  // converting the top-level/root node - AP
  return ['html', 'body'].reduce(
    (doc, tag) => convertNodeTo(doc(tag), doc, 'div'),
    $
  );
}

Wow how about that

12 | `); 13 | const result = rewriteTopLevel($('html').first(), $); 14 | 15 | assert.equal(result('html').length, 0); 16 | assert.equal(result('body').length, 0); 17 | 18 | if (!cheerio.browser) { 19 | assertClean( 20 | result.html(), 21 | ` 22 |

Wow how about that

// Replace every attribute on `node` with the given { name: value }
// map. Handles both cheerio nodes (plain `attribs` object) and raw
// DOM nodes (live `attributes` list with set/removeAttribute).
export default function setAttrs(node, attrs) {
  if (node.attribs) {
    // Cheerio: attribs is a plain object we can swap wholesale.
    node.attribs = attrs;
    return node;
  }

  if (node.attributes) {
    // DOM: clear the live attribute list one entry at a time (it
    // reindexes as entries are removed), then set the new values.
    while (node.attributes.length > 0) {
      node.removeAttribute(node.attributes[0].name);
    }

    Reflect.ownKeys(attrs).forEach(key => node.setAttribute(key, attrs[key]));
  }

  return node;
}
// Remove unwanted elements (scripts, styles, etc.) from the article,
// sparing any element explicitly flagged with the keep class
// (mercury-parser-keep).
//
// :param article: context to search within
// :param $: cheerio instance
// :param tags: optional selector list; defaults to STRIP_OUTPUT_TAGS
export default function stripJunkTags(article, $, tags = []) {
  const selectors = tags.length === 0 ? STRIP_OUTPUT_TAGS : tags;

  $(selectors.join(','), article)
    .not(`.${KEEP_CLASS}`)
    .remove();

  return $;
}
$(`${text}`).text(); 6 | return cleanText === '' ? text : cleanText; 7 | } 8 | -------------------------------------------------------------------------------- /src/utils/dom/strip-tags.test.js: -------------------------------------------------------------------------------- 1 | import assert from 'assert'; 2 | import cheerio from 'cheerio'; 3 | 4 | import stripTags from './strip-tags'; 5 | 6 | describe('stripTags(title, $)', () => { 7 | it('strips tags from a string of text', () => { 8 | const $ = cheerio.load('
'); 9 | 10 | const result = stripTags('What a Wonderful Day', $); 11 | 12 | assert.equal(result, 'What a Wonderful Day'); 13 | }); 14 | 15 | it('returns the original text if no tags found', () => { 16 | const $ = cheerio.load('
import { getAttrs } from 'utils/dom';

// Return true when $node sits inside an ancestor element whose class or
// id contains the substring "comment" (i.e. the node looks like it lives
// in a comments section rather than article content).
export default function withinComment($node) {
  // Missing class/id stringify to "undefined", which never contains
  // "comment", so absent attributes are handled implicitly.
  const looksLikeComment = parent => {
    const { class: nodeClass, id } = getAttrs(parent);
    return `${nodeClass} ${id}`.includes('comment');
  };

  return $node
    .parents()
    .toArray()
    .some(looksLikeComment);
}
-------------------------------------------------------------------------------- /src/utils/dom/within-comment.test.js: -------------------------------------------------------------------------------- 1 | import cheerio from 'cheerio'; 2 | import assert from 'assert'; 3 | 4 | import withinComment from './within-comment'; 5 | 6 | describe('withinComment(node)', () => { 7 | it('returns false if its parent is not a comment', () => { 8 | const $ = cheerio.load(` 9 |
10 |
11 |
Adam
12 |
13 |
14 | `); 15 | assert.equal(withinComment($('.author').first()), false); 16 | }); 17 | 18 | it('returns true if its parent has a class of comment', () => { 19 | const $ = cheerio.load(` 20 |
21 |
22 |
Adam
23 |
24 |
25 | `); 26 | assert.equal(withinComment($('.author').first()), true); 27 | }); 28 | 29 | it('returns true if its parent has an id of comment', () => { 30 | const $ = cheerio.load(` 31 |
32 |
33 |
Adam
34 |
35 |
36 | `); 37 | assert.equal(withinComment($('.author').first()), true); 38 | }); 39 | }); 40 | -------------------------------------------------------------------------------- /src/utils/index.js: -------------------------------------------------------------------------------- 1 | export { default as range } from './range'; 2 | export { default as validateUrl } from './validate-url'; 3 | -------------------------------------------------------------------------------- /src/utils/merge-supported-domains.js: -------------------------------------------------------------------------------- 1 | const merge = (extractor, domains) => 2 | domains.reduce((acc, domain) => { 3 | acc[domain] = extractor; 4 | return acc; 5 | }, {}); 6 | 7 | export default function mergeSupportedDomains(extractor) { 8 | return extractor.supportedDomains 9 | ? merge(extractor, [extractor.domain, ...extractor.supportedDomains]) 10 | : merge(extractor, [extractor.domain]); 11 | } 12 | -------------------------------------------------------------------------------- /src/utils/merge-supported-domains.test.js: -------------------------------------------------------------------------------- 1 | import assert from 'assert'; 2 | import mergeSupportedDomains from './merge-supported-domains'; 3 | 4 | describe('mergeSupportedDomains(extractor, domains)', () => { 5 | it('returns an object w/domains as keys and extractor as value', () => { 6 | const extractor = { 7 | domain: 'foo.com', 8 | supportedDomains: ['example.com'], 9 | }; 10 | 11 | const expected = { 12 | 'foo.com': extractor, 13 | 'example.com': extractor, 14 | }; 15 | 16 | const result = mergeSupportedDomains(extractor); 17 | assert.deepEqual(result, expected); 18 | }); 19 | 20 | it('returns an object w/single domain if no supportedDomains', () => { 21 | const extractor = { 22 | domain: 'foo.com', 23 | }; 24 | 25 | const expected = { 26 | 'foo.com': extractor, 27 | }; 28 | 29 | const result = mergeSupportedDomains(extractor); 30 | 
// Generator yielding each integer from `start` through `end`, inclusive.
//
// Fix: the previous body was `yield (start += 1)`, which increments
// BEFORE yielding — range(1, 3) produced 2, 3, 4 instead of 1, 2, 3
// (it skipped `start` and emitted `end + 1`).
//
// @param {number} start - first value yielded (default 1)
// @param {number} end - last value yielded (default 1)
export default function* range(start = 1, end = 1) {
  while (start <= end) {
    yield start;
    start += 1;
  }
}
// Return an excerpt of `content`: its first `words` whitespace-separated
// tokens, joined by single spaces (leading/trailing whitespace dropped,
// internal runs of whitespace collapsed).
export default function excerptContent(content, words = 10) {
  const tokens = content.trim().split(/\s+/);
  return tokens.slice(0, words).join(' ');
}
// Given a URL and a list of regular expressions, return capture group 1
// of the first expression that matches the URL, or null when none match.
// Each regex in `regexList` is expected to capture the desired string as
// group 1. Only used for date_published currently.
//
// Fix: the old test-then-exec pattern evaluated every matching regex
// twice, and broke for regexes carrying the /g or /y flag — `test()`
// advances `lastIndex`, so the follow-up `exec()` could return null and
// the `[1]` access threw a TypeError. Run each regex exactly once.
export default function extractFromUrl(url, regexList) {
  for (const re of regexList) {
    const match = re.exec(url);
    if (match) {
      return match[1];
    }
  }

  return null;
}
encoding; 17 | } 18 | -------------------------------------------------------------------------------- /src/utils/text/get-encoding.test.js: -------------------------------------------------------------------------------- 1 | import assert from 'assert'; 2 | import cheerio from 'cheerio'; 3 | 4 | import getEncoding from './get-encoding'; 5 | 6 | // Tests are bypassed in the browser because it has an encoding 7 | // A shim is used /src/shims/iconv-lite.js to decrease load size 8 | 9 | describe('getEncoding(str)', () => { 10 | if (cheerio.browser) return; 11 | 12 | it('returns the encoding as a string', () => { 13 | const contentType = 'text/html; charset=iso-8859-15'; 14 | assert.equal(getEncoding(contentType), 'iso-8859-15'); 15 | }); 16 | 17 | it('returns utf-8 as a default if no encoding found', () => { 18 | const contentType = 'text/html'; 19 | assert.equal(getEncoding(contentType), 'utf-8'); 20 | }); 21 | 22 | it('returns utf-8 if there is an invalid encoding', () => { 23 | const contentType = 'text/html; charset=fake-charset'; 24 | assert.equal(getEncoding(contentType), 'utf-8'); 25 | }); 26 | }); 27 | -------------------------------------------------------------------------------- /src/utils/text/has-sentence-end.js: -------------------------------------------------------------------------------- 1 | // Given a string, return True if it appears to have an ending sentence 2 | // within it, false otherwise. 
// Given a string, return true if it appears to contain the end of a
// sentence — a period followed by a space or by the end of the string —
// false otherwise.
//
// Fix: the pattern was built from the string '.( |$)', in which the
// unescaped '.' is a wildcard matching ANY character, so nearly every
// non-empty string was reported as containing a sentence end. The dot
// is now escaped to match a literal period, as the comment intends.
const SENTENCE_END_RE = /\.( |$)/;

export default function hasSentenceEnd(text) {
  return SENTENCE_END_RE.test(text);
}
10 |

What do you think?

11 |
12 | `); 13 | 14 | const result = normalizeSpaces( 15 | $('*') 16 | .first() 17 | .text() 18 | ); 19 | assert.equal(result, 'What do you think?'); 20 | }); 21 | 22 | it('preserves spaces in preformatted text blocks', () => { 23 | const $ = cheerio.load(` 24 |
25 |

What do you think?

26 |
  What     happens to        spaces?    
27 |
28 | `); 29 | 30 | const result = normalizeSpaces($.html()); 31 | assert.equal( 32 | result, 33 | '

What do you think?

  What     happens to        spaces?    
import { PAGE_IN_HREF_RE } from './constants';

// Pull a page number out of a URL (e.g. ?page=2, /paging/3).
// Returns the number when it is a plausible page (< 100), else null.
export default function pageNumFromUrl(url) {
  const match = url.match(PAGE_IN_HREF_RE);
  if (match === null) {
    return null;
  }

  // Capture group 6 of PAGE_IN_HREF_RE holds the digits.
  const pageNum = parseInt(match[6], 10);

  // Treat 100+ as a false positive rather than a real page number.
  return pageNum < 100 ? pageNum : null;
}
// Return `url` with its #fragment (anchor) removed, and without any
// trailing slash.
export default function removeAnchor(url) {
  const [withoutAnchor] = url.split('#');
  return withoutAnchor.replace(/\/$/, '');
}
validateUrl(url); 10 | 11 | assert.equal(valid, false); 12 | }); 13 | 14 | it('returns true if url is valid', () => { 15 | const url = URL.parse('http://example.com'); 16 | const valid = validateUrl(url); 17 | 18 | assert.equal(valid, true); 19 | }); 20 | }); 21 | --------------------------------------------------------------------------------