├── .github
└── tokenmill-logo.svg
├── .gitignore
├── .gitlab-ci.yml
├── Dockerfile
├── Dockerfile.deps
├── LICENSE
├── Makefile
├── README.md
├── deps.edn
├── dev
└── fast_url_check
│ └── benchmark.clj
├── src
└── fast_url_check
│ ├── core.clj
│ └── java.clj
└── test
├── fast_url_check
└── core_test.clj
└── resources
├── bulk-test.txt
└── logback.xml
/.github/tokenmill-logo.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .cpcache/
2 | .idea/
3 | **/*.iml
4 | libsunec.so
5 | pom.xml
6 | target/*
7 | url-checker
8 |
--------------------------------------------------------------------------------
/.gitlab-ci.yml:
--------------------------------------------------------------------------------
1 | stages:
2 | - deps
3 | - test
4 |
5 | update-deps:
6 | stage: deps
7 | image: docker:stable
8 | only:
9 | changes:
10 | - deps.edn
11 | - Dockerfile.deps
12 | before_script:
13 | - docker login -u gitlab-ci-token -p $CI_JOB_TOKEN registry.gitlab.com
14 | script:
15 | - docker build -f Dockerfile.deps -t registry.gitlab.com/tokenmill/crawl/fast-url-access-checker:deps .
16 | - docker push registry.gitlab.com/tokenmill/crawl/fast-url-access-checker:deps
17 | - docker rmi registry.gitlab.com/tokenmill/crawl/fast-url-access-checker:deps
18 |
19 | lint-and-unit-test:
20 | stage: test
21 | when: always
22 | image: registry.gitlab.com/tokenmill/crawl/fast-url-access-checker:deps
23 | script:
24 | - clojure -A:kibit
25 | - clojure -A:eastwood
26 | - clojure -A:test -e integration
27 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM tokenmill/clojure:graalvm-ce-19.0.0-tools-deps-1.10.0.442 as builder
2 |
3 | RUN mkdir -p /usr/src/app
4 | WORKDIR /usr/src/app
5 |
6 | COPY deps.edn /usr/src/app/
7 | RUN clojure -R:native-image
8 | COPY . /usr/src/app
9 | RUN clojure -A:native-image
10 | RUN cp $JAVA_HOME/jre/lib/amd64/libsunec.so .
11 | RUN cp target/app url-checker
12 |
--------------------------------------------------------------------------------
/Dockerfile.deps:
--------------------------------------------------------------------------------
1 | FROM registry.gitlab.com/tokenmill/clojure:1 as builder
2 |
3 | RUN mkdir -p /usr/src/app
4 | WORKDIR /usr/src/app
5 | COPY deps.edn /usr/src/app/
6 | RUN clojure -R:test:dev:kibit:eastwood
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright 2019 Tokenmill, UAB
2 |
3 | Licensed under the Apache License, Version 2.0 (the "License");
4 | you may not use this file except in compliance with the License.
5 | You may obtain a copy of the License at
6 |
7 | http://www.apache.org/licenses/LICENSE-2.0
8 |
9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | unit-test:
2 | clojure -A:test -e integration
3 |
4 | integration-test:
5 | clojure -A:test -i integration
6 |
7 | lint:
8 | clojure -A:kibit
9 | clojure -A:eastwood
10 |
11 | uberjar:
12 | clojure -A:uberjar
13 |
14 | check-urls:
15 | clojure -m fast-url-check.core $(file-name)
16 |
17 | benchmark:
18 | clojure -A:dev -m fast-url-check.benchmark $(file-name)
19 |
20 | build-graal-url-checker:
21 | docker build --target builder -f Dockerfile -t fast-url-checker .
22 | docker rm build || true
23 | docker create --name build fast-url-checker
24 | docker cp build:/usr/src/app/url-checker url-checker
25 | docker cp build:/usr/src/app/libsunec.so libsunec.so
26 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | # URL Access Checker
6 |
7 | This tool takes a list of site URLs, identifies the correct form of each URL, and runs an HTTP GET request against it to check its HTTP status. In cases where the address is not completely specified - a protocol is missing, or the 'www' part is not included when it is needed - the correct form of the URL will be identified. The library will also validate the correctness of the URL and, in cases of redirection, will return the target URL.
8 |
9 | It is a Clojure library. Additionally, an interface to call it from Java is provided, as well as a native binary distribution to be used as a command line tool.
10 |
11 | # Features
12 |
13 | * Provides the interface for a single URL check.
14 | * Provides the interface for bulk URL checks.
15 | * In the case of a bulk URL check, the library parallelizes the checks to ensure maximum speed of the entire process.
16 | * For incompletely formed URLs, the correct protocol (http or https) will be detected. Access with the 'www' part, if it is missing, will also be tested.
17 | * Redirection will be detected and target URL returned.
18 | * URL check returns the following data: HTTP status, target URL, response time.
19 |
20 | # How to Use
21 |
22 | ## Command Line
23 |
24 | The URL checker can be started from the command line with the following command:
25 | ```
26 | ./url-checker test/resources/bulk-test.txt
27 | ```
28 |
29 | See below for the output sample.
30 |
31 |
32 | The URL checker can also be executed via the command line using installed [Clojure](https://clojure.org/) tools. Execution example with the project's test URL set:
33 |
34 | ```
35 | clojure -m fast-url-check.core test/resources/bulk-test.txt
36 | ```
37 |
38 | Or via project's Makefile
39 |
40 | ```
41 | make check-urls file-name=test/resources/bulk-test.txt
42 | ```
43 |
44 | This will result in CSV-formatted output of the URL checking results:
45 |
46 | ```
47 | timestamp,seed,url,status,status-type,response-time,exception
48 | 2019-05-30T10:43:47.674Z,cameron.slb.com,https://www.products.slb.com,302,redirect,431,
49 | 2019-05-30T10:43:47.691Z,co.williams.com,https://co.williams.com/,200,accessible,622,
50 | 2019-05-30T10:43:47.691Z,company.ingersollrand.com,https://www.company.ingersollrand.com/,200,accessible,645,
51 | ...
52 | 2019-05-30T10:43:51.950Z,http://aes.com,https://aes.com/,200,accessible,3632,
53 | ```
54 |
55 |
56 | ## Clojure
57 |
58 | Single URL check example.
59 |
60 | ```
61 | (require '[fast-url-check.core :refer :all])
62 |
63 | (check-access "tokenmill.lt")
64 | =>
65 | {:url "http://www.tokenmill.lt/",
66 | :seed "tokenmill.lt",
67 | :status 200,
68 | :response-time 7,
69 | :status-type :accessible}
70 | ```
71 |
72 | Bulk URL check example
73 |
74 | ```
75 |
76 | (check-access-bulk ["tokenmill.lt" "15min.lt" "https://news.ycombinator.com"])
77 | =>
78 | ({:url "http://www.tokenmill.lt/",
79 | :seed "tokenmill.lt",
80 | :status 200,
81 | :response-time 10,
82 | :status-type :accessible}
83 | {:url "https://www.15min.lt/",
84 | :seed "15min.lt",
85 | :status 200,
86 | :response-time 46,
87 | :status-type :accessible}
88 | {:url "https://news.ycombinator.com/",
89 | :seed "https://news.ycombinator.com",
90 | :status 301,
91 | :response-time 379,
92 | :status-type :redirect})
93 |
94 | ```
95 |
96 | ## Java
97 |
98 | Java code example:
99 |
100 | ```
101 | import crawl.tools.URLCheck;
102 |
103 | import java.util.Map;
104 | import java.util.Arrays;
105 | import java.util.Collection;
106 |
107 | public class MyClass {
108 |
109 | public static void main(String[] args) {
110 | System.out.println(URLCheck.checkAccess("tokenmill.lt"));
111 |
112 | String[] urls = {"15min.lt", "https://news.ycombinator.com"};
113 | Collection