├── .clang-format
├── .github
└── workflows
│ └── ant.yml
├── .gitignore
├── LICENSE
├── LICENSE.txt.boilerpipe
├── NOTICE.txt.boilerpipe
├── OWNERS
├── PRESUBMIT.py
├── Proto.gwt.xml
├── README.md
├── TestProto.gwt.xml
├── Vagrantfile
├── build.xml
├── codereview.settings
├── create-hook-symlinks
├── create_standalone_js.py
├── create_wrapped_standalone_js.py
├── extension
├── background.js
├── devtools.html
├── devtools.js
├── devtools_panel.html
├── devtools_panel.js
├── extract.js
├── icon19.png
├── icon24.png
├── icon38.png
├── icon48.png
├── icon96.png
├── manifest.json
└── preview.js
├── heuristics
└── distillable
│ ├── README.md
│ ├── calculate_derived_features.py
│ ├── check_derived_features.py
│ ├── check_distilled_mhtml.py
│ ├── extract_features.js
│ ├── gen_mhtml_corpus.py
│ ├── get_screenshots.py
│ ├── index.html
│ ├── index.js
│ ├── server.py
│ └── write_features_csv.py
├── hooks
└── pre-commit
├── install-build-deps.sh
├── java
├── DomDistiller.gwt.xml
└── org
│ └── chromium
│ └── distiller
│ ├── BoilerpipeFilter.java
│ ├── ContentExtractor.java
│ ├── DocumentTitleGetter.java
│ ├── DomDistiller.java
│ ├── DomDistillerEntry.java
│ ├── DomUtil.java
│ ├── DomWalker.java
│ ├── IEReadingViewParser.java
│ ├── JavaScript.java
│ ├── LogUtil.java
│ ├── MarkupParser.java
│ ├── MonotonicPageInfosGroups.java
│ ├── NodeListExpander.java
│ ├── NodeTree.java
│ ├── OpenGraphProtocolParser.java
│ ├── OpenGraphProtocolParserAccessor.java
│ ├── OrderedNodeMatcher.java
│ ├── PageLinkInfo.java
│ ├── PageParamInfo.java
│ ├── PageParameterDetector.java
│ ├── PageParameterParser.java
│ ├── PagingLinksFinder.java
│ ├── ParsedUrl.java
│ ├── PathComponentPagePattern.java
│ ├── QueryParamPagePattern.java
│ ├── SchemaOrgParser.java
│ ├── SchemaOrgParserAccessor.java
│ ├── StringUtil.java
│ ├── TableClassifier.java
│ ├── TreeCloneBuilder.java
│ ├── document
│ ├── TextBlock.java
│ ├── TextDocument.java
│ └── TextDocumentStatistics.java
│ ├── extractors
│ ├── ArticleExtractor.java
│ ├── KeepEverythingExtractor.java
│ ├── KeepEverythingWithMinKWordsExtractor.java
│ └── embeds
│ │ ├── EmbedExtractor.java
│ │ ├── ImageExtractor.java
│ │ ├── TwitterExtractor.java
│ │ ├── VimeoExtractor.java
│ │ └── YouTubeExtractor.java
│ ├── filters
│ ├── debug
│ │ └── PrintDebugFilter.java
│ ├── english
│ │ ├── NumWordsRulesClassifier.java
│ │ └── TerminatingBlocksFinder.java
│ ├── heuristics
│ │ ├── BlockProximityFusion.java
│ │ ├── DocumentTitleMatchClassifier.java
│ │ ├── ExpandTitleToContentFilter.java
│ │ ├── HeadingFusion.java
│ │ ├── KeepLargestBlockFilter.java
│ │ ├── LargeBlockSameTagLevelToContentFilter.java
│ │ ├── ListAtEndFilter.java
│ │ └── SimilarSiblingContentExpansion.java
│ └── simple
│ │ ├── BoilerplateBlockFilter.java
│ │ ├── LabelToBoilerplateFilter.java
│ │ ├── MarkEverythingBoilerplateFilter.java
│ │ ├── MarkEverythingContentFilter.java
│ │ └── MinWordsFilter.java
│ ├── labels
│ ├── DefaultLabels.java
│ └── LabelAction.java
│ └── webdocument
│ ├── DomConverter.java
│ ├── ElementAction.java
│ ├── WebDocument.java
│ ├── WebDocumentBuilder.java
│ ├── WebDocumentBuilderInterface.java
│ ├── WebElement.java
│ ├── WebEmbed.java
│ ├── WebFigure.java
│ ├── WebImage.java
│ ├── WebTable.java
│ ├── WebTag.java
│ ├── WebText.java
│ ├── WebTextBuilder.java
│ ├── WebVideo.java
│ └── filters
│ ├── LeadImageFinder.java
│ ├── NestedElementRetainer.java
│ ├── RelevantElements.java
│ └── images
│ ├── AreaScorer.java
│ ├── BaseImageScorer.java
│ ├── DimensionsRatioScorer.java
│ ├── DomDistanceScorer.java
│ ├── HasFigureScorer.java
│ └── ImageScorer.java
├── javatests
├── DomDistillerJsTest.gwt.xml
└── org
│ └── chromium
│ └── distiller
│ ├── Assert.java
│ ├── AssertTest.java
│ ├── BlockProximityFusionTest.java
│ ├── ContentExtractorTest.java
│ ├── DocumentTitleGetterTest.java
│ ├── DocumentTitleMatchClassifierTest.java
│ ├── DomDistillerJsTestCase.java
│ ├── DomUtilTest.java
│ ├── DomWalkerTest.java
│ ├── EmbedExtractorTest.java
│ ├── GwtOverlayProtoTest.java
│ ├── HeadingFusionTest.java
│ ├── IEReadingViewParserTest.java
│ ├── ImageHeuristicsTest.java
│ ├── JavaScriptTest.java
│ ├── JsTestCase.java
│ ├── JsTestEntry.java
│ ├── JsTestSuiteBase.java
│ ├── JsTestSuiteBaseTest.java
│ ├── JsTestSuiteBuilder.java
│ ├── MarkupParserProtoTest.java
│ ├── MarkupParserTest.java
│ ├── MonotonicPageInfosGroupsTest.java
│ ├── NodeDirectionalityTest.java
│ ├── NodeListExpanderTest.java
│ ├── OpenGraphProtocolParserAccessorTest.java
│ ├── OrderedNodeMatcherTest.java
│ ├── PageParamContentInfo.java
│ ├── PageParamInfoTest.java
│ ├── PageParameterDetectorTest.java
│ ├── PageParameterParserTest.java
│ ├── PagingLinksFinderTest.java
│ ├── ParsedUrlTest.java
│ ├── PathComponentPagePatternTest.java
│ ├── QueryParamPagePatternTest.java
│ ├── SchemaOrgParserAccessorTest.java
│ ├── SimilarSiblingContentExpansionTest.java
│ ├── SimpleTest.java
│ ├── StringUtilTest.java
│ ├── TableClassifierTest.java
│ ├── TerminatingBlocksFinderTest.java
│ ├── TestLogger.java
│ ├── TestTextBlockBuilder.java
│ ├── TestTextDocumentBuilder.java
│ ├── TestUtil.java
│ ├── TestUtilTest.java
│ ├── TextDocumentConstructionTest.java
│ ├── TextDocumentStatisticsTest.java
│ ├── TreeCloneBuilderTest.java
│ ├── document
│ └── TextDocumentTestUtil.java
│ ├── rebind
│ └── JsTestEntryGenerator.java
│ └── webdocument
│ ├── DomConverterTest.java
│ ├── ElementActionTest.java
│ ├── FakeWebDocumentBuilder.java
│ ├── TestWebDocumentBuilder.java
│ ├── TestWebTextBuilder.java
│ ├── WebDocumentBuilderTest.java
│ ├── WebImageTest.java
│ ├── WebTableTest.java
│ ├── WebTagTest.java
│ ├── WebTextBuilderTest.java
│ ├── WebTextTest.java
│ ├── WebVideoTest.java
│ └── filters
│ ├── LeadImageFinderTest.java
│ ├── NestedElementRetainerTest.java
│ └── RelevantElementsTest.java
├── land-external-contributor-cl.sh
├── proto
└── dom_distiller.proto
├── protoc_plugins
├── README
├── gwt_overlay.py
├── json_values_converter.py
├── json_values_converter_tests.py
└── util
│ ├── __init__.py
│ ├── plugin.py
│ ├── plugin_protos.py
│ ├── types.py
│ └── writer.py
├── run_jstests.py
├── test
└── proto
│ └── test.proto
├── third_party
├── gwt-2.7.0
│ ├── COPYING
│ ├── COPYING.html
│ ├── about.html
│ ├── about.txt
│ ├── gwt-api-checker.jar
│ ├── gwt-codeserver.jar
│ ├── gwt-dev.jar
│ ├── gwt-elemental.jar
│ ├── gwt-ll.dll
│ ├── gwt-module.dtd
│ ├── gwt-servlet-deps.jar
│ ├── gwt-servlet.jar
│ ├── gwt-user.jar
│ ├── i18nCreator
│ ├── i18nCreator.cmd
│ ├── release_notes.html
│ ├── requestfactory-apt-src.jar
│ ├── requestfactory-apt.jar
│ ├── requestfactory-client+src.jar
│ ├── requestfactory-client-src.jar
│ ├── requestfactory-client.jar
│ ├── requestfactory-server+src.jar
│ ├── requestfactory-server-src.jar
│ ├── requestfactory-server.jar
│ ├── validation-api-1.0.0.GA-sources.jar
│ ├── validation-api-1.0.0.GA.jar
│ ├── webAppCreator
│ └── webAppCreator.cmd
├── junit
│ └── junit-4.11.jar
└── protobuf
│ ├── COPYING.txt
│ └── google
│ └── protobuf
│ ├── compiler
│ └── plugin.proto
│ └── descriptor.proto
├── tools
└── UnicodePatternGenerator.java
└── war
├── DomDistiller.html
├── favicon.ico
├── test.html
└── wrapped_domdistiller_template.js
/.clang-format:
--------------------------------------------------------------------------------
1 | # Defines the DomDistiller style for automatic reformatting.
2 | # http://clang.llvm.org/docs/ClangFormatStyleOptions.html
3 | # To use `git cl format`, several things are expected to be in the buildtools/
4 | # directory. You can create a symlink to the relevant directories in a normal
5 | # Chromium checkout to make git cl format work.
6 | # See README.md for instructions.
7 | BasedOnStyle: Chromium
8 |
9 | Language: Java
10 | IndentWidth: 4
11 |
--------------------------------------------------------------------------------
/.github/workflows/ant.yml:
--------------------------------------------------------------------------------
1 | # This workflow will build a Java project with Ant
2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/building-and-testing-java-with-ant
3 |
4 | name: Java CI
5 |
6 | on:
7 | push:
8 | branches: [ master ]
9 | pull_request:
10 | branches: [ master ]
11 | schedule:
12 | - cron: '30 5 * * 2'
13 |
14 | jobs:
15 | build:
16 | runs-on: ubuntu-latest
17 | strategy:
18 | matrix:
19 | include:
20 | - chrome-min-version: 49
21 | - chrome-min-version: 999
22 | env:
23 | CHROME_MIN_VERSION: ${{ matrix.chrome-min-version }}
24 | steps:
25 | - uses: actions/checkout@v2
26 | - name: Set up JDK 8
27 | uses: actions/setup-java@v2
28 | with:
29 | java-version: '8'
30 | distribution: 'adopt'
31 | - name: Show history
32 | run: git log --oneline -n 5
33 | - name: Install deps
34 | run: |
35 | google-chrome --version
36 | sudo -E ./install-build-deps.sh
37 | - name: Check environment
38 | run: |
39 | which java
40 | java -Xmx32m -version
41 | javac -J-Xmx32m -version
42 | google-chrome --version
43 | - name: Run pre-commit tests
44 | run: hooks/pre-commit
45 | - name: Run tests
46 | run: ant test -Dtest.shuffle=1 -Dtest.repeat=10
47 | - name: Package
48 | run: ant package
49 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | DomDistiller.war
2 | /buildtools
3 | /gwt-unitCache/
4 | /out/
5 | /war/WEB-INF/
6 | /war/domdistiller
7 | /war/domdistillerjstest
8 | /war/*.JUnit/
9 | .*.swp
10 | *.pyc
11 | junitvmwatcher*.properties
12 |
--------------------------------------------------------------------------------
/LICENSE.txt.boilerpipe:
--------------------------------------------------------------------------------
1 |
2 | boilerpipe
3 |
4 | Copyright (c) 2009-2011 Christian Kohlschütter
5 |
6 | The author licenses this file to You under the Apache License, Version 2.0
7 | (the "License"); you may not use this file except in compliance with
8 | the License. You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 |
18 |
--------------------------------------------------------------------------------
/NOTICE.txt.boilerpipe:
--------------------------------------------------------------------------------
1 |
2 | boilerpipe
3 |
4 | Copyright (c) 2009-2011 Christian Kohlschütter
5 |
6 | The author licenses this file to You under the Apache License, Version 2.0
7 | (the "License"); you may not use this file except in compliance with
8 | the License. You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 |
18 |
19 | This software contains the following parts which are also provided
20 | under the Apache License 2.0 (http://apache.org/licenses/LICENSE-2.0.txt):
21 |
22 | - NekoHTML
23 | - Xerces
24 |
25 |
--------------------------------------------------------------------------------
/OWNERS:
--------------------------------------------------------------------------------
1 | *
2 |
--------------------------------------------------------------------------------
/PRESUBMIT.py:
--------------------------------------------------------------------------------
1 | # Copyright 2014 The Chromium Authors
2 | # Use of this source code is governed by a BSD-style license that can be
3 | # found in the LICENSE file.
4 |
5 | """Top-level presubmit script for DOM Distiller.
6 |
7 | See http://dev.chromium.org/developers/how-tos/depottools/presubmit-scripts
8 | for more details about the presubmit API built into git cl.
9 | """
10 |
11 | import subprocess
12 | import sys
13 |
14 | def _Git(args):
15 | """Runs the requested git command and returns the first line of output."""
16 | output = subprocess.check_output(['git'] + args)
17 | return output.split('\n')[0]
18 |
19 |
20 | def _CheckUpstream(input_api, output_api):
21 | """Checks that the upstream branch is remote.
22 |
23 | git cl push will push the issue's change to the branch's upstream branch. This
24 | should be origin/master (or maybe origin/some_branch) to work as expected.
25 | Otherwise, git cl push will push the change to some local branch and close the
26 | issue.
27 | """
28 | branch = _Git(['symbolic-ref', 'HEAD'])
29 | shortbranch = branch.replace('refs/heads/', '')
30 | remote = _Git(['config', '--local', 'branch.%s.remote' % shortbranch])
31 | if remote != 'origin':
32 | upstream = _Git(['config', '--local', 'branch.%s.merge' % shortbranch])
33 | shortupstream = upstream.replace('refs/heads/', '')
34 | return [output_api.PresubmitError(
35 | 'Changes should be pushed to origin/master.\n'
36 | 'Try this:\n'
37 | ' git branch -u origin/master\n'
38 | ' git cl push\n'
39 | ' git branch -u %s' % shortupstream)]
40 |
41 | return []
42 |
43 | def CheckChangeOnCommit(input_api, output_api):
44 | results = []
45 | results.extend(input_api.canned_checks.CheckOwners(input_api, output_api))
46 | results.extend(_CheckUpstream(input_api, output_api))
47 | return results
48 |
--------------------------------------------------------------------------------
/Proto.gwt.xml:
--------------------------------------------------------------------------------
1 |
2 |
7 |
9 |
10 |
11 |
12 |
--------------------------------------------------------------------------------
/TestProto.gwt.xml:
--------------------------------------------------------------------------------
1 |
2 |
7 |
9 |
10 |
11 |
12 |
--------------------------------------------------------------------------------
/Vagrantfile:
--------------------------------------------------------------------------------
1 | # -*- mode: ruby -*-
2 | # vi: set ft=ruby :
3 |
4 | # All Vagrant configuration is done below. The "2" in Vagrant.configure
5 | # configures the configuration version (we support older styles for
6 | # backwards compatibility). Please don't change it unless you know what
7 | # you're doing.
8 | Vagrant.configure(2) do |config|
9 | # The most common configuration options are documented and commented below.
10 | # For a complete reference, please see the online documentation at
11 | # https://docs.vagrantup.com.
12 |
13 | # Every Vagrant development environment requires a box. You can search for
14 | # boxes at https://atlas.hashicorp.com/search.
15 | config.vm.box = "ubuntu/trusty64"
16 |
17 | # Disable automatic box update checking. If you disable this, then
18 | # boxes will only be checked for updates when the user runs
19 | # `vagrant box outdated`. This is not recommended.
20 | # config.vm.box_check_update = false
21 |
22 | # Create a forwarded port mapping which allows access to a specific port
23 | # within the machine from a port on the host machine. In the example below,
24 | # accessing "localhost:8080" will access port 80 on the guest machine.
25 | # config.vm.network "forwarded_port", guest: 80, host: 8080
26 |
27 | # Create a private network, which allows host-only access to the machine
28 | # using a specific IP.
29 | # config.vm.network "private_network", ip: "192.168.33.10"
30 |
31 | # Create a public network, which generally maps to a bridged network.
32 | # Bridged networks make the machine appear as another physical device on
33 | # your network.
34 | # config.vm.network "public_network"
35 |
36 | # Share an additional folder to the guest VM. The first argument is
37 | # the path on the host to the actual folder. The second argument is
38 | # the path on the guest to mount the folder. And the optional third
39 | # argument is a set of non-required options.
40 | # config.vm.synced_folder "../data", "/vagrant_data"
41 |
42 | # Provider-specific configuration so you can fine-tune various
43 | # backing providers for Vagrant. These expose provider-specific options.
44 | # Example for VirtualBox:
45 | #
46 | # config.vm.provider "virtualbox" do |vb|
47 | # # Display the VirtualBox GUI when booting the machine
48 | # vb.gui = true
49 | #
50 | # # Customize the amount of memory on the VM:
51 | # vb.memory = "1024"
52 | # end
53 | #
54 | # View the documentation for the provider you are using for more
55 | # information on available options.
56 |
57 | # Define a Vagrant Push strategy for pushing to Atlas. Other push strategies
58 | # such as FTP and Heroku are also available. See the documentation at
59 | # https://docs.vagrantup.com/v2/push/atlas.html for more information.
60 | # config.push.define "atlas" do |push|
61 | # push.app = "YOUR_ATLAS_USERNAME/YOUR_APPLICATION_NAME"
62 | # end
63 |
64 | # Enable provisioning with a shell script. Additional provisioners such as
65 | # Puppet, Chef, Ansible, Salt, and Docker are also available. Please see the
66 | # documentation for more information about their specific syntax and use.
67 | config.vm.provision "shell", inline: <<-SHELL
68 | yes | /vagrant/install-build-deps.sh
69 | SHELL
70 | end
71 |
--------------------------------------------------------------------------------
/codereview.settings:
--------------------------------------------------------------------------------
1 | # This file is used by git-cl to get repository specific information.
2 | GERRIT_HOST: True
3 | PROJECT: dom_distiller
4 |
--------------------------------------------------------------------------------
/create-hook-symlinks:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Install hook scripts by symlinking each file in hooks/ into $GIT_ROOT/.git/hooks.
4 | # Must be run from the repository root: all paths below are relative to it.
5 | HOOK_DIR=.git/hooks
6 | shopt -s nullglob  # An empty hooks/ directory must yield zero iterations.
7 | for hook in hooks/*; do
8 |   ln -s -f "../../$hook" "$HOOK_DIR/${hook##*/}"
9 | done
10 |
--------------------------------------------------------------------------------
/create_standalone_js.py:
--------------------------------------------------------------------------------
1 | # Copyright 2014 The Chromium Authors
2 | # Use of this source code is governed by a BSD-style license that can be
3 | # found in the LICENSE file.
4 |
5 | """Converts gwt-compiled javascript to standalone javascript
6 |
7 | gwt-compiled javascript is in the form of a js file that is expected to be
8 | loaded into its own script tag. This reads such a compiled file and converts it
9 | to standalone javascript that can be loaded as Chrome does.
10 | """
11 |
12 | # TODO(cjhopman): The proper way to do this is to write a gwt Linker
13 | # (gwt.core.ext.Linker) and use that for compilation. See
14 | # http://crbug.com/437113
15 |
16 | import glob
17 | import optparse
18 | import os
19 | import re
20 | import shutil
21 | import sys
22 |
23 | def ExtractJavascript(content, module):
24 | """ Extracts javascript from within
4 |