├── .gitignore
├── META-INF
└── MANIFEST.MF
├── README.md
├── assets
└── logo.png
├── build.gradle
├── checked_proxies.txt
├── gradle
└── wrapper
│ ├── gradle-wrapper.jar
│ └── gradle-wrapper.properties
├── gradlew
├── gradlew.bat
├── proxy_list.xml
├── proxyhandler-v1.md
├── proxyhandler-v2.md
├── src
├── main
│ └── java
│ │ └── com
│ │ └── scrapium
│ │ ├── CustomSignalHandler.java
│ │ ├── DatabaseConnection.java
│ │ ├── Main.java
│ │ ├── Scraper.java
│ │ ├── ThreadBase.java
│ │ ├── TweetThreadTaskProcessor.java
│ │ ├── handler.java
│ │ ├── proxium
│ │ ├── Proxy.java
│ │ └── ProxyService.java
│ │ ├── tests
│ │ └── Benchmark.java
│ │ ├── threads
│ │ ├── LoggingThread.java
│ │ ├── ProducerThread.java
│ │ ├── ProxyThread.java
│ │ └── TweetThread.java
│ │ ├── tweetium
│ │ ├── TaskService.java
│ │ └── TweetTask.java
│ │ └── utils
│ │ ├── DebugLogger.java
│ │ └── TimeUtils.java
└── schema.sql
└── unchecked_proxies.txt
/.gitignore:
--------------------------------------------------------------------------------
1 | ##############################
2 | ## Java
3 | ##############################
4 | .mtj.tmp/
5 | *.class
6 | *.jar
7 | *.war
8 | *.ear
9 | *.nar
10 | hs_err_pid*
11 | replay_pid*
12 |
13 | ##############################
14 | ## Maven
15 | ##############################
16 | target/
17 | pom.xml.tag
18 | pom.xml.releaseBackup
19 | pom.xml.versionsBackup
20 | pom.xml.next
21 | pom.xml.bak
22 | release.properties
23 | dependency-reduced-pom.xml
24 | buildNumber.properties
25 | .mvn/timing.properties
26 | .mvn/wrapper/maven-wrapper.jar
27 |
28 | ##############################
29 | ## Gradle
30 | ##############################
31 | bin/
32 | build/
33 | .gradle
34 | .gradletasknamecache
35 | gradle-app.setting
36 | !gradle-wrapper.jar
37 |
38 | ##############################
39 | ## IntelliJ
40 | ##############################
41 | out/
42 | .idea/
43 | .idea_modules/
44 | *.iml
45 | *.ipr
46 | *.iws
47 |
48 | ##############################
49 | ## Eclipse
50 | ##############################
51 | .settings/
52 | bin/
53 | tmp/
54 | .metadata
55 | .classpath
56 | .project
57 | *.tmp
58 | *.bak
59 | *.swp
60 | *~.nib
61 | local.properties
62 | .loadpath
63 | .factorypath
64 |
65 | ##############################
66 | ## NetBeans
67 | ##############################
68 | nbproject/private/
69 | build/
70 | nbbuild/
71 | dist/
72 | nbdist/
73 | nbactions.xml
74 | nb-configuration.xml
75 |
76 | ##############################
77 | ## Visual Studio Code
78 | ##############################
79 | .vscode/
80 | .code-workspace
81 |
82 | ##############################
83 | ## OS X
84 | ##############################
85 | .DS_Store
86 | proxy_checker/proxyChecker.js
87 | proxy_checker/proxies.txt
88 | proxy_checker/package.json
89 | proxy_checker/package-lock.json
90 | proxy_checker/node_modules/ms/readme.md
91 | proxy_checker/node_modules/ms/package.json
92 | proxy_checker/node_modules/ms/license.md
93 | proxy_checker/node_modules/ms/index.js
94 | proxy_checker/node_modules/mass-proxy-validator/yarn.lock
95 | proxy_checker/node_modules/mass-proxy-validator/README.md
96 | proxy_checker/node_modules/mass-proxy-validator/package.json
97 | proxy_checker/node_modules/mass-proxy-validator/LICENSE
98 | proxy_checker/node_modules/mass-proxy-validator/index.js
99 | proxy_checker/node_modules/mass-proxy-validator/.gitattributes
100 | proxy_checker/node_modules/is-buffer/README.md
101 | proxy_checker/node_modules/is-buffer/package.json
102 | proxy_checker/node_modules/is-buffer/LICENSE
103 | proxy_checker/node_modules/is-buffer/index.js
104 | proxy_checker/node_modules/is-buffer/index.d.ts
105 | proxy_checker/node_modules/follow-redirects/README.md
106 | proxy_checker/node_modules/follow-redirects/package.json
107 | proxy_checker/node_modules/follow-redirects/LICENSE
108 | proxy_checker/node_modules/follow-redirects/index.js
109 | proxy_checker/node_modules/follow-redirects/https.js
110 | proxy_checker/node_modules/follow-redirects/http.js
111 | proxy_checker/node_modules/debug/src/node.js
112 | proxy_checker/node_modules/debug/src/index.js
113 | proxy_checker/node_modules/debug/src/debug.js
114 | proxy_checker/node_modules/debug/src/browser.js
115 | proxy_checker/node_modules/debug/README.md
116 | proxy_checker/node_modules/debug/package.json
117 | proxy_checker/node_modules/debug/node.js
118 | proxy_checker/node_modules/debug/Makefile
119 | proxy_checker/node_modules/debug/LICENSE
120 | proxy_checker/node_modules/debug/karma.conf.js
121 | proxy_checker/node_modules/debug/CHANGELOG.md
122 | proxy_checker/node_modules/debug/.travis.yml
123 | proxy_checker/node_modules/debug/.npmignore
124 | proxy_checker/node_modules/debug/.eslintrc
125 | proxy_checker/node_modules/debug/.coveralls.yml
126 | proxy_checker/node_modules/axios/UPGRADE_GUIDE.md
127 | proxy_checker/node_modules/axios/README.md
128 | proxy_checker/node_modules/axios/package.json
129 | proxy_checker/node_modules/axios/LICENSE
130 | proxy_checker/node_modules/axios/lib/utils.js
131 | proxy_checker/node_modules/axios/lib/helpers/spread.js
132 | proxy_checker/node_modules/axios/lib/helpers/README.md
133 | proxy_checker/node_modules/axios/lib/helpers/parseHeaders.js
134 | proxy_checker/node_modules/axios/lib/helpers/normalizeHeaderName.js
135 | proxy_checker/node_modules/axios/lib/helpers/isURLSameOrigin.js
136 | proxy_checker/node_modules/axios/lib/helpers/isAbsoluteURL.js
137 | proxy_checker/node_modules/axios/lib/helpers/deprecatedMethod.js
138 | proxy_checker/node_modules/axios/lib/helpers/cookies.js
139 | proxy_checker/node_modules/axios/lib/helpers/combineURLs.js
140 | proxy_checker/node_modules/axios/lib/helpers/buildURL.js
141 | proxy_checker/node_modules/axios/lib/helpers/bind.js
142 | proxy_checker/node_modules/axios/lib/defaults.js
143 | proxy_checker/node_modules/axios/lib/core/transformData.js
144 | proxy_checker/node_modules/axios/lib/core/settle.js
145 | proxy_checker/node_modules/axios/lib/core/README.md
146 | proxy_checker/node_modules/axios/lib/core/InterceptorManager.js
147 | proxy_checker/node_modules/axios/lib/core/enhanceError.js
148 | proxy_checker/node_modules/axios/lib/core/dispatchRequest.js
149 | proxy_checker/node_modules/axios/lib/core/createError.js
150 | proxy_checker/node_modules/axios/lib/core/Axios.js
151 | proxy_checker/node_modules/axios/lib/cancel/isCancel.js
152 | proxy_checker/node_modules/axios/lib/cancel/CancelToken.js
153 | proxy_checker/node_modules/axios/lib/cancel/Cancel.js
154 | proxy_checker/node_modules/axios/lib/axios.js
155 | proxy_checker/node_modules/axios/lib/adapters/xhr.js
156 | proxy_checker/node_modules/axios/lib/adapters/README.md
157 | proxy_checker/node_modules/axios/lib/adapters/http.js
158 | proxy_checker/node_modules/axios/index.js
159 | proxy_checker/node_modules/axios/index.d.ts
160 | proxy_checker/node_modules/axios/CHANGELOG.md
161 | proxy_checker/node_modules/.package-lock.json
162 |
--------------------------------------------------------------------------------
/META-INF/MANIFEST.MF:
--------------------------------------------------------------------------------
1 | Manifest-Version: 1.0
2 | Main-Class: com.scrapium.Main
3 |
4 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |

3 |
4 | Proxied asynchronous multi-threaded scraper via concurrent queues, written in Java — making requests as fast as possible.
5 |
6 | A template for fast web scrapers
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
38 |
39 |
40 |
47 |
48 |
49 |
50 | ## WIP: Information coming soon
51 |
52 | Original Repo: Debug development repo for scraping tweets and market data via RDS. Uses optimisation techniques such as threading, asynchronous I/O, non-blocking I/O (ConcurrentLinkedQueues), and runnable tasks for making requests and saving tweets to the database. (Rewritten in Java)
53 |
54 | Adjust speed:
55 |
56 | Scraper scraper = new Scraper(2, 2000, 10);
57 |
58 | To adjust scraper rate, ensure your internet is fast enough, the scraper will increase in speed over time, as optimum proxies are found.
59 |
60 | Run the program as is, the debug version does not download from Twitter. Achieved RPS will be shown in the console.
61 |
62 | PLEASE NOTE THE FOLLOWING:
63 | - This code only runs properly on Linux, preferably cloud hosted.
64 | - The program initially takes time to work out which proxies are most successful.
65 | - If settings in Main.java are too high, the program will fail all requests.
66 | - The bot uses public proxies, enter proxies in checked_proxies.txt
67 | - !!! The bot must have excessive internet bandwidth.
68 | - On Linux, max open files should be set to a high number
69 |
70 | Troubleshooting
71 | - Requests drop to 0
72 | - Maybe you're using too many resources - there's a perfect balance.
73 | - Having too many worker threads leads to blocks and switching between threads - slowing the system down.
74 | - Using too much memory and the program can't allocate memory
75 | - There are no proxies available
76 | - There are no tasks left
77 | - You have reached the maximum concurrency
78 | - Something may be causing a hang — e.g. in the request handler, such as not updating the coroutine count properly
79 |
80 |
--------------------------------------------------------------------------------
/assets/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/couldbejake/fast/a3932892ef78602637d20d679d8bcd7cead1abef/assets/logo.png
--------------------------------------------------------------------------------
/build.gradle:
--------------------------------------------------------------------------------
plugins {
    id 'java'
}

repositories {
    // JCenter was sunset in 2021 and is now read-only/unreliable; every
    // dependency declared below is hosted on Maven Central, so jcenter()
    // has been removed.
    mavenCentral()
}

dependencies {
    implementation 'com.google.code.gson:gson:2.9.1'
    implementation 'net.java.dev.jna:jna:5.10.0'
    implementation 'org.apache.httpcomponents.client5:httpclient5:5.2.1'
    implementation 'org.slf4j:slf4j-api:2.0.7'
    implementation 'org.slf4j:slf4j-simple:2.0.7'
    implementation 'org.postgresql:postgresql:42.3.1'
    implementation 'org.apache.commons:commons-dbcp2:2.9.0'
    implementation 'org.apache.httpcomponents:httpcore-nio:4.4.14'
    implementation group: 'org.asynchttpclient', name: 'async-http-client', version: '2.12.3'
    implementation 'oauth.signpost:signpost-core:1.2.1.2'
    implementation 'com.squareup.okhttp3:okhttp:4.9.1'

    implementation 'org.apache.httpcomponents:httpclient:4.5.13'
    implementation 'commons-codec:commons-codec:1.15'
    implementation 'org.glassfish:javax.json:1.1.4'
}
--------------------------------------------------------------------------------
/gradle/wrapper/gradle-wrapper.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/couldbejake/fast/a3932892ef78602637d20d679d8bcd7cead1abef/gradle/wrapper/gradle-wrapper.jar
--------------------------------------------------------------------------------
/gradle/wrapper/gradle-wrapper.properties:
--------------------------------------------------------------------------------
1 | distributionBase=GRADLE_USER_HOME
2 | distributionPath=wrapper/dists
3 | distributionUrl=https\://services.gradle.org/distributions/gradle-8.0-bin.zip
4 | networkTimeout=10000
5 | zipStoreBase=GRADLE_USER_HOME
6 | zipStorePath=wrapper/dists
7 |
--------------------------------------------------------------------------------
/gradlew:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | #
4 | # Copyright © 2015-2021 the original authors.
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # https://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 |
19 | ##############################################################################
20 | #
21 | # Gradle start up script for POSIX generated by Gradle.
22 | #
23 | # Important for running:
24 | #
25 | # (1) You need a POSIX-compliant shell to run this script. If your /bin/sh is
26 | # noncompliant, but you have some other compliant shell such as ksh or
27 | # bash, then to run this script, type that shell name before the whole
28 | # command line, like:
29 | #
30 | # ksh Gradle
31 | #
32 | # Busybox and similar reduced shells will NOT work, because this script
33 | # requires all of these POSIX shell features:
34 | # * functions;
35 | # * expansions «$var», «${var}», «${var:-default}», «${var+SET}»,
36 | # «${var#prefix}», «${var%suffix}», and «$( cmd )»;
37 | # * compound commands having a testable exit status, especially «case»;
38 | # * various built-in commands including «command», «set», and «ulimit».
39 | #
40 | # Important for patching:
41 | #
42 | # (2) This script targets any POSIX shell, so it avoids extensions provided
43 | # by Bash, Ksh, etc; in particular arrays are avoided.
44 | #
45 | # The "traditional" practice of packing multiple parameters into a
46 | # space-separated string is a well documented source of bugs and security
47 | # problems, so this is (mostly) avoided, by progressively accumulating
48 | # options in "$@", and eventually passing that to Java.
49 | #
50 | # Where the inherited environment variables (DEFAULT_JVM_OPTS, JAVA_OPTS,
51 | # and GRADLE_OPTS) rely on word-splitting, this is performed explicitly;
52 | # see the in-line comments for details.
53 | #
54 | # There are tweaks for specific operating systems such as AIX, CygWin,
55 | # Darwin, MinGW, and NonStop.
56 | #
57 | # (3) This script is generated from the Groovy template
58 | # https://github.com/gradle/gradle/blob/HEAD/subprojects/plugins/src/main/resources/org/gradle/api/internal/plugins/unixStartScript.txt
59 | # within the Gradle project.
60 | #
61 | # You can find Gradle at https://github.com/gradle/gradle/.
62 | #
63 | ##############################################################################
64 |
65 | # Attempt to set APP_HOME
66 |
67 | # Resolve links: $0 may be a link
68 | app_path=$0
69 |
70 | # Need this for daisy-chained symlinks.
71 | while
72 | APP_HOME=${app_path%"${app_path##*/}"} # leaves a trailing /; empty if no leading path
73 | [ -h "$app_path" ]
74 | do
75 | ls=$( ls -ld "$app_path" )
76 | link=${ls#*' -> '}
77 | case $link in #(
78 | /*) app_path=$link ;; #(
79 | *) app_path=$APP_HOME$link ;;
80 | esac
81 | done
82 |
83 | # This is normally unused
84 | # shellcheck disable=SC2034
85 | APP_BASE_NAME=${0##*/}
86 | APP_HOME=$( cd "${APP_HOME:-./}" && pwd -P ) || exit
87 |
88 | # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
89 | DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"'
90 |
91 | # Use the maximum available, or set MAX_FD != -1 to use that value.
92 | MAX_FD=maximum
93 |
94 | warn () {
95 | echo "$*"
96 | } >&2
97 |
98 | die () {
99 | echo
100 | echo "$*"
101 | echo
102 | exit 1
103 | } >&2
104 |
105 | # OS specific support (must be 'true' or 'false').
106 | cygwin=false
107 | msys=false
108 | darwin=false
109 | nonstop=false
110 | case "$( uname )" in #(
111 | CYGWIN* ) cygwin=true ;; #(
112 | Darwin* ) darwin=true ;; #(
113 | MSYS* | MINGW* ) msys=true ;; #(
114 | NONSTOP* ) nonstop=true ;;
115 | esac
116 |
117 | CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar
118 |
119 |
120 | # Determine the Java command to use to start the JVM.
121 | if [ -n "$JAVA_HOME" ] ; then
122 | if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
123 | # IBM's JDK on AIX uses strange locations for the executables
124 | JAVACMD=$JAVA_HOME/jre/sh/java
125 | else
126 | JAVACMD=$JAVA_HOME/bin/java
127 | fi
128 | if [ ! -x "$JAVACMD" ] ; then
129 | die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME
130 |
131 | Please set the JAVA_HOME variable in your environment to match the
132 | location of your Java installation."
133 | fi
134 | else
135 | JAVACMD=java
136 | which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
137 |
138 | Please set the JAVA_HOME variable in your environment to match the
139 | location of your Java installation."
140 | fi
141 |
142 | # Increase the maximum file descriptors if we can.
143 | if ! "$cygwin" && ! "$darwin" && ! "$nonstop" ; then
144 | case $MAX_FD in #(
145 | max*)
146 | # In POSIX sh, ulimit -H is undefined. That's why the result is checked to see if it worked.
147 | # shellcheck disable=SC3045
148 | MAX_FD=$( ulimit -H -n ) ||
149 | warn "Could not query maximum file descriptor limit"
150 | esac
151 | case $MAX_FD in #(
152 | '' | soft) :;; #(
153 | *)
154 | # In POSIX sh, ulimit -n is undefined. That's why the result is checked to see if it worked.
155 | # shellcheck disable=SC3045
156 | ulimit -n "$MAX_FD" ||
157 | warn "Could not set maximum file descriptor limit to $MAX_FD"
158 | esac
159 | fi
160 |
161 | # Collect all arguments for the java command, stacking in reverse order:
162 | # * args from the command line
163 | # * the main class name
164 | # * -classpath
165 | # * -D...appname settings
166 | # * --module-path (only if needed)
167 | # * DEFAULT_JVM_OPTS, JAVA_OPTS, and GRADLE_OPTS environment variables.
168 |
169 | # For Cygwin or MSYS, switch paths to Windows format before running java
170 | if "$cygwin" || "$msys" ; then
171 | APP_HOME=$( cygpath --path --mixed "$APP_HOME" )
172 | CLASSPATH=$( cygpath --path --mixed "$CLASSPATH" )
173 |
174 | JAVACMD=$( cygpath --unix "$JAVACMD" )
175 |
176 | # Now convert the arguments - kludge to limit ourselves to /bin/sh
177 | for arg do
178 | if
179 | case $arg in #(
180 | -*) false ;; # don't mess with options #(
181 | /?*) t=${arg#/} t=/${t%%/*} # looks like a POSIX filepath
182 | [ -e "$t" ] ;; #(
183 | *) false ;;
184 | esac
185 | then
186 | arg=$( cygpath --path --ignore --mixed "$arg" )
187 | fi
188 | # Roll the args list around exactly as many times as the number of
189 | # args, so each arg winds up back in the position where it started, but
190 | # possibly modified.
191 | #
192 | # NB: a `for` loop captures its iteration list before it begins, so
193 | # changing the positional parameters here affects neither the number of
194 | # iterations, nor the values presented in `arg`.
195 | shift # remove old arg
196 | set -- "$@" "$arg" # push replacement arg
197 | done
198 | fi
199 |
200 | # Collect all arguments for the java command;
201 | # * $DEFAULT_JVM_OPTS, $JAVA_OPTS, and $GRADLE_OPTS can contain fragments of
202 | # shell script including quotes and variable substitutions, so put them in
203 | # double quotes to make sure that they get re-expanded; and
204 | # * put everything else in single quotes, so that it's not re-expanded.
205 |
206 | set -- \
207 | "-Dorg.gradle.appname=$APP_BASE_NAME" \
208 | -classpath "$CLASSPATH" \
209 | org.gradle.wrapper.GradleWrapperMain \
210 | "$@"
211 |
212 | # Stop when "xargs" is not available.
213 | if ! command -v xargs >/dev/null 2>&1
214 | then
215 | die "xargs is not available"
216 | fi
217 |
218 | # Use "xargs" to parse quoted args.
219 | #
220 | # With -n1 it outputs one arg per line, with the quotes and backslashes removed.
221 | #
222 | # In Bash we could simply go:
223 | #
224 | # readarray ARGS < <( xargs -n1 <<<"$var" ) &&
225 | # set -- "${ARGS[@]}" "$@"
226 | #
227 | # but POSIX shell has neither arrays nor command substitution, so instead we
228 | # post-process each arg (as a line of input to sed) to backslash-escape any
229 | # character that might be a shell metacharacter, then use eval to reverse
230 | # that process (while maintaining the separation between arguments), and wrap
231 | # the whole thing up as a single "set" statement.
232 | #
233 | # This will of course break if any of these variables contains a newline or
234 | # an unmatched quote.
235 | #
236 |
237 | eval "set -- $(
238 | printf '%s\n' "$DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS" |
239 | xargs -n1 |
240 | sed ' s~[^-[:alnum:]+,./:=@_]~\\&~g; ' |
241 | tr '\n' ' '
242 | )" '"$@"'
243 |
244 | exec "$JAVACMD" "$@"
245 |
--------------------------------------------------------------------------------
/gradlew.bat:
--------------------------------------------------------------------------------
1 | @rem
2 | @rem Copyright 2015 the original author or authors.
3 | @rem
4 | @rem Licensed under the Apache License, Version 2.0 (the "License");
5 | @rem you may not use this file except in compliance with the License.
6 | @rem You may obtain a copy of the License at
7 | @rem
8 | @rem https://www.apache.org/licenses/LICENSE-2.0
9 | @rem
10 | @rem Unless required by applicable law or agreed to in writing, software
11 | @rem distributed under the License is distributed on an "AS IS" BASIS,
12 | @rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | @rem See the License for the specific language governing permissions and
14 | @rem limitations under the License.
15 | @rem
16 |
17 | @if "%DEBUG%"=="" @echo off
18 | @rem ##########################################################################
19 | @rem
20 | @rem Gradle startup script for Windows
21 | @rem
22 | @rem ##########################################################################
23 |
24 | @rem Set local scope for the variables with windows NT shell
25 | if "%OS%"=="Windows_NT" setlocal
26 |
27 | set DIRNAME=%~dp0
28 | if "%DIRNAME%"=="" set DIRNAME=.
29 | @rem This is normally unused
30 | set APP_BASE_NAME=%~n0
31 | set APP_HOME=%DIRNAME%
32 |
33 | @rem Resolve any "." and ".." in APP_HOME to make it shorter.
34 | for %%i in ("%APP_HOME%") do set APP_HOME=%%~fi
35 |
36 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
37 | set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m"
38 |
39 | @rem Find java.exe
40 | if defined JAVA_HOME goto findJavaFromJavaHome
41 |
42 | set JAVA_EXE=java.exe
43 | %JAVA_EXE% -version >NUL 2>&1
44 | if %ERRORLEVEL% equ 0 goto execute
45 |
46 | echo.
47 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
48 | echo.
49 | echo Please set the JAVA_HOME variable in your environment to match the
50 | echo location of your Java installation.
51 |
52 | goto fail
53 |
54 | :findJavaFromJavaHome
55 | set JAVA_HOME=%JAVA_HOME:"=%
56 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe
57 |
58 | if exist "%JAVA_EXE%" goto execute
59 |
60 | echo.
61 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
62 | echo.
63 | echo Please set the JAVA_HOME variable in your environment to match the
64 | echo location of your Java installation.
65 |
66 | goto fail
67 |
68 | :execute
69 | @rem Setup the command line
70 |
71 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
72 |
73 |
74 | @rem Execute Gradle
75 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %*
76 |
77 | :end
78 | @rem End local scope for the variables with windows NT shell
79 | if %ERRORLEVEL% equ 0 goto mainEnd
80 |
81 | :fail
82 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
83 | rem the _cmd.exe /c_ return code!
84 | set EXIT_CODE=%ERRORLEVEL%
85 | if %EXIT_CODE% equ 0 set EXIT_CODE=1
86 | if not ""=="%GRADLE_EXIT_CONSOLE%" exit %EXIT_CODE%
87 | exit /b %EXIT_CODE%
88 |
89 | :mainEnd
90 | if "%OS%"=="Windows_NT" endlocal
91 |
92 | :omega
93 |
--------------------------------------------------------------------------------
/proxy_list.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 | https://raw.githubusercontent.com/zevtyardt/proxy-list/main/http.txt
4 | https://raw.githubusercontent.com/caliphdev/Proxy-List/master/http.txt
5 | https://raw.githubusercontent.com/mmpx12/proxy-list/master/https.txt
6 | https://raw.githubusercontent.com/roosterkid/openproxylist/main/HTTPS_RAW.txt
7 | https://raw.githubusercontent.com/Bardiafa/Proxy-Leecher/main/proxies.txt
8 | https://raw.githubusercontent.com/monosans/proxy-list/main/proxies/http.txt
9 | https://raw.githubusercontent.com/yemixzy/proxy-list/main/proxies/http.txt
10 | https://raw.githubusercontent.com/HyperBeats/proxy-list/main/https.txt
11 | https://raw.githubusercontent.com/Zaeem20/FREE_PROXIES_LIST/master/https.txt
12 | https://raw.githubusercontent.com/ALIILAPRO/Proxy/main/http.txt
13 | https://raw.githubusercontent.com/ShiftyTR/Proxy-List/master/https.txt
14 | https://raw.githubusercontent.com/prxchk/proxy-list/main/http.txt
15 | https://raw.githubusercontent.com/ErcinDedeoglu/proxies/main/proxies/https.txt
16 | https://raw.githubusercontent.com/mertguvencli/http-proxy-list/main/proxy-list/data.txt
17 | https://www.proxyscan.io/download?type=https
18 | https://raw.githubusercontent.com/aslisk/proxyhttps/main/https.txt
19 | https://raw.githubusercontent.com/zevtyardt/proxy-list/main/socks4.txt
20 | https://raw.githubusercontent.com/mmpx12/proxy-list/master/socks4.txt
21 | https://raw.githubusercontent.com/iptotal/free-proxy-list/master/socks4.txt
22 | https://raw.githubusercontent.com/roosterkid/openproxylist/main/SOCKS4_RAW.txt
23 | https://raw.githubusercontent.com/monosans/proxy-list/main/proxies/socks4.txt
24 | https://raw.githubusercontent.com/HyperBeats/proxy-list/main/socks4.txt
25 | https://raw.githubusercontent.com/Zaeem20/FREE_PROXIES_LIST/master/socks4.txt
26 | https://raw.githubusercontent.com/ALIILAPRO/Proxy/main/socks4.txt
27 | https://raw.githubusercontent.com/ShiftyTR/Proxy-List/master/socks4.txt
28 | https://raw.githubusercontent.com/prxchk/proxy-list/main/socks4.txt
29 | https://raw.githubusercontent.com/ErcinDedeoglu/proxies/main/proxies/socks4.txt
30 | https://api.openproxylist.xyz/socks4.txt
31 | https://www.proxy-list.download/api/v1/get?type=socks4
32 | https://www.proxyscan.io/download?type=socks4
33 | https://raw.githubusercontent.com/TheSpeedX/PROXY-List/master/socks4.txt
34 | https://raw.githubusercontent.com/jetkai/proxy-list/main/online-proxies/txt/proxies-socks4.txt
35 | https://raw.githubusercontent.com/zevtyardt/proxy-list/main/socks5.txt
36 | https://raw.githubusercontent.com/caliphdev/Proxy-List/master/socks5.txt
37 | https://raw.githubusercontent.com/roosterkid/openproxylist/main/SOCKS5_RAW.txt
38 | https://raw.githubusercontent.com/iptotal/free-proxy-list/master/socks5.txt
39 | https://raw.githubusercontent.com/monosans/proxy-list/main/proxies/socks5.txt
40 | https://raw.githubusercontent.com/HyperBeats/proxy-list/main/socks5.txt
41 | https://raw.githubusercontent.com/Zaeem20/FREE_PROXIES_LIST/master/socks5.txt
42 | https://raw.githubusercontent.com/ALIILAPRO/Proxy/main/socks5.txt
43 | https://raw.githubusercontent.com/ShiftyTR/Proxy-List/master/socks5.txt
44 | https://raw.githubusercontent.com/prxchk/proxy-list/main/socks5.txt
45 | https://raw.githubusercontent.com/ErcinDedeoglu/proxies/main/proxies/socks5.txt
46 | https://api.openproxylist.xyz/socks5.txt
47 | https://www.proxy-list.download/api/v1/get?type=socks5
48 | https://www.proxyscan.io/download?type=socks5
49 | https://raw.githubusercontent.com/hookzof/socks5_list/master/proxy.txt
50 | https://raw.githubusercontent.com/mmpx12/proxy-list/master/socks5.txt
51 | https://raw.githubusercontent.com/TheSpeedX/PROXY-List/master/socks5.txt
52 | https://raw.githubusercontent.com/jetkai/proxy-list/main/online-proxies/txt/proxies-socks5.txt
53 |
54 |
55 |
56 |
--------------------------------------------------------------------------------
/proxyhandler-v1.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | We have outlined a plan for a proxy picker, to pick proxies that haven't been used.
4 |
5 | The following restrictions are required:
6 |
7 | - Proxies must timeout after 500 successful/failed requests in total are made.
8 | - If a proxy's invalid attempts exceed 30 within a 5-minute window, the proxy's next_avaliable should be set 10 minutes in the future
9 | - If a proxy is successful, delta should increment. If it fails it should decrement.
10 | - If a GT (Twitter Guest Token) has been recently updated, skip the update process, as it's possible a concurrent/parallel thread has attempted this recently.
11 | - As threading is used, a single proxy should not be used too many times.
12 | - Proxies should be added to a queue by a proxy gen thread that updates a FILO queue.
13 |
14 |
15 | ```
16 | Proxy:
17 | usage_count -> 1 -> 500 # holds information so that one proxy isn't used more than 500 times in quick succession
18 | next_avaliable -> TIME # holds the next avaliable time a proxy can be used
19 |
20 | gt_last_updated -> TIME # holds a guest token (a token used when making a request. It can be invalid.)
21 | success_delta -> 0 LIMIT ( 5, 000 -> 10, 000 ) # holds information on how many times a proxy was successful
22 | failed_count -> 0 (0 -> 100) # holds information on the number of consecutive failed attempts
23 | ```
24 |
25 | ```
26 |
27 | add_proxy_to_queue(): # proxies are added into a queue, and then used at a later point
28 |
29 | # makes a query to the database to get a free proxy using parameters.
30 | proxy = get_proxy(get a proxy where the next_avaliable time is now, sort by success_delta)
31 |
32 | if(there are proxies avaliable):
33 |
34 | if(usage_count > 500): # checks if the proxy has been used more than 500 times recently
35 |
36 | # we are close to the rate limit of the proxy
37 |
38 | usage_count = 0 # reset the usage count
39 | failed_count = 0 # reset the failed count (should this be done here?)
40 |
41 | next_avaliable = time + 15 minutes # up the next_avaliable time to 15 minutes into the future.
42 |
43 | # get a new proxy, since this one can not be used
44 | (END)
45 |
46 | if(failed_count > 100): # check if there have been over 100 sequential failed requests
47 |
48 | usage_count = 0 # reset the usage count (should this be done here?)
49 | failed_count = 0 # reset the failed count
50 | next_avaliable = time + 10 minutes # up the next_avaliable time to 10 minutes into the future.
51 |
52 | # get a new proxy, since this one can not be used
53 | (END)
54 |
55 | +1 usage count # add 1 to the usage count
56 |
57 | make_regular_request()
58 |
59 | if(HTTP request success && Proxy is working): # if the HTTP request was a success using proxy
60 | + 1 success_delta # include the success delta
61 | failed_count = 0 # reset the consecutive fail count {p1}
62 | if(guest_token_invalid): # if the Guest token (GT) was invalid
63 | if(gt_last_updated is longer than 2 minutes ago): # and hasn't been updated recently
64 | make_gt_request() # make a separate request to get a new guest token
65 | update_gt() # update the guest token
66 | next_avaliable = 0 # allow this proxy to be used again
67 | else:
68 | save_request() # if the guest token was correct, and the HTTP request didn't fail due to a proxy
69 | else:
70 | -1 success_delta # decrease the success delta
71 | +1 failed_count # increase the consecutive fail count
72 | next_avaliable = NOW() + 5 minutes # make the proxy usable again in 5 minutes
73 |
74 | ```
75 |
76 | Additional suggestions:
77 | - Use a ML model to choose which proxy to use.
78 | - Backoff strategy 2^n seconds timeout. n = consecutive failed attempts -> max n + $r (randomness).
79 | - Use a rolling time window instead
80 |
81 |
82 |
83 |
84 |
85 |
86 |
87 |
88 |
89 |
90 |
91 |
92 |
93 |
94 |
95 |
96 |
97 |
98 |
99 |
100 |
101 |
102 |
103 |
104 |
105 |
106 |
107 |
--------------------------------------------------------------------------------
/proxyhandler-v2.md:
--------------------------------------------------------------------------------
1 | get_new_proxy():
2 |
3 | proxy = get_proxy_query() # where next_avaliable is in the past (cooldown expired), sort by success_delta
4 |
5 | if(proxy != null):
6 |
7 | if(proxy.usage_count > 500):
8 | update_proxy(proxy, {
9 | usage_count: 0,
10 | failed_count: 0,
11 | next_avaliable: NOW() + 15 minutes
12 | })
13 | return get_new_proxy()
14 |
15 | if(proxy.failed_count > 100):
16 | update_proxy(proxy, {
17 | usage_count: 0,
18 | failed_count: 0,
19 | next_avaliable: NOW() + 10 minutes
20 | })
21 |
22 | return get_new_proxy()
23 |
24 | proxy.usage_count += 1
25 |
26 | response = make_request("make search request", proxy)
27 |
28 | if(response.success is True):
29 | proxy.success_delta += 1
30 | proxy.failed_count = 0
31 | if(response.guest_token_invalid == true):
32 | if(proxy.guest_token_last_updated more than 2 minutes ago):
33 | response2 = make_request("make guest token request", proxy)
34 | update_proxy({
35 | guest_token: response2.guest_token
36 | })
37 | proxy.next_avaliable = 0
38 | else:
39 | save_response_to_db(response)
40 | else:
41 | proxy.success_delta -= 1
42 | proxy.failed_count += 1
43 |
--------------------------------------------------------------------------------
/src/main/java/com/scrapium/CustomSignalHandler.java:
--------------------------------------------------------------------------------
package com.scrapium;

import sun.misc.Signal;
import sun.misc.SignalHandler;

/**
 * Installs a handler for the TSTP signal (CTRL+Z on POSIX terminals) that
 * executes a caller-supplied action each time the signal is delivered.
 */
public class CustomSignalHandler {

    /**
     * Registers {@code onStop} to run whenever the process receives SIGTSTP.
     *
     * @param onStop action executed on each TSTP signal delivery
     */
    public static void handleTSTPSignal(final Runnable onStop) {
        // SignalHandler has a single abstract method, so a lambda replaces
        // the anonymous inner class used previously.
        Signal.handle(new Signal("TSTP"), signal -> onStop.run());
    }
}
--------------------------------------------------------------------------------
/src/main/java/com/scrapium/DatabaseConnection.java:
--------------------------------------------------------------------------------
package com.scrapium;

//import org.apache.commons.dbcp2.BasicDataSource;

import javax.sql.DataSource;
import java.sql.Connection;
import java.sql.SQLException;

/**
 * Static holder for the application's JDBC connection pool.
 *
 * NOTE(review): the DBCP2 initializer below is commented out, so
 * {@link #dataSource} is never assigned; callers previously crashed with a
 * bare NullPointerException the moment they asked for a connection.
 */
public class DatabaseConnection {
    private static DataSource dataSource;

    /*
    static {
        BasicDataSource ds = new BasicDataSource();
        ds.setUrl("jdbc:postgresql://localhost:5432/scrapium_proxies");
        ds.setUsername("scrapium_user");
        ds.setPassword("6F3dNfvz3eL3Vb3ol");
        ds.setInitialSize(5); // Set the initial number of connections in the pool
        ds.setMaxTotal(10); // Set the maximum number of connections in the pool
        dataSource = ds;
    } */

    /**
     * Hands out a pooled JDBC connection.
     *
     * @return a live {@link Connection} from the configured pool
     * @throws SQLException if the pool is unavailable or was never configured
     */
    public static Connection getConnection() throws SQLException {
        // Fail with a checked, descriptive error instead of a NullPointerException
        // when the data source was never initialized (see commented block above).
        if (dataSource == null) {
            throw new SQLException("DatabaseConnection: dataSource is not configured");
        }
        return dataSource.getConnection();
    }

}
--------------------------------------------------------------------------------
/src/main/java/com/scrapium/Main.java:
--------------------------------------------------------------------------------
1 | package com.scrapium;
2 |
3 | import com.scrapium.tests.Benchmark;
4 |
5 | import java.util.ArrayList;
6 | // makes sense to use PostgreSQL for data, and Redis for caching & analytics
7 |
8 | /*
9 | Troubleshooting
10 | - Requests drop to 0
11 | - Maybe you're using too many resources - there's a perfect balance.
12 | - Having too many worker threads leads to blocks and switching between threads - slowing the system down.
13 | - Using too much memory and the program can't allocate memory
14 | - There are no proxies available
15 | - There are no tasks left
16 | - You have reached the maximum co-currency
17 | - Something may be causing a hang - ie. in the request handler, ie. not updating the coroutine count properly
18 |
19 | */
20 |
21 | public class Main {
22 |
23 | public static void main(String[] args) {
24 | runService();
25 | }
26 |
27 | public static void runTest(){
28 | Benchmark.runTest();
29 | }
30 |
31 | public static void runService(){
32 |
33 | // Scraper(consumerCount, maxCoroutineCount, conSocketTimeout)
34 |
35 | // consumerCount - The number of threads running scraper tasks
36 | // maxCoroutineCount - The max amount of asynchronous calls that should be made for each thread
37 | // conSocketTimeout - The amount of time before connectionSocketTimeout will occur.
38 |
39 | // calls
40 |
41 | // scraper.logger.successRequestCount.get() - Will get the amount of total successful requests since .scrape() is called.
42 | // scraper.logger.failedRequestCount.get() - Will get the amount of total failed requests since .scrape() is called.
43 |
44 | // note: The last parameter of Scrape() is not currently used.
45 |
46 | // 6, 5000 - AWS 800 requests per second
47 |
48 | Scraper scraper = new Scraper(2, 2000, 10);
49 |
50 | scraper.scrape();
51 |
52 | }
53 |
54 |
55 | }
--------------------------------------------------------------------------------
/src/main/java/com/scrapium/Scraper.java:
--------------------------------------------------------------------------------
package com.scrapium;

import com.scrapium.proxium.ProxyService;
import com.scrapium.threads.LoggingThread;
import com.scrapium.threads.ProducerThread;
import com.scrapium.threads.ProxyThread;
import com.scrapium.threads.TweetThread;
import com.scrapium.tweetium.TaskService;
import com.scrapium.utils.DebugLogger;

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.*;

/**
 * Orchestrates the scraping service: spins up the logging, proxy, producer
 * and consumer threads on a fixed-size pool and shuts them down cooperatively.
 */
public class Scraper {

    public ProxyService proxyService;
    public long conSocketTimeout;   // socket timeout passed in by the caller (not read downstream in this class)
    private int consumerCount;      // number of consumer (TweetThread) workers
    public int maxCoroutineCount;   // max in-flight async requests per consumer

    private final ExecutorService threadPool;
    private TaskService taskService;

    public LoggingThread logger;
    public ProxyThread proxyThread;

    private ProducerThread producer;
    // Every worker submitted to the pool is registered here so stop() can flip
    // its 'running' flag. Previously nothing was ever added to this list (all
    // the add() calls were commented out), so stop() silently signalled no one
    // and relied entirely on the 400s awaitTermination timeout.
    private final List<ThreadBase> threads;

    /**
     * @param consumerCount     number of consumer threads running scraper tasks
     * @param maxCoroutineCount max asynchronous calls per thread
     * @param conSocketTimeout  socket timeout before a connection attempt is abandoned
     */
    public Scraper(int consumerCount, int maxCoroutineCount, int conSocketTimeout) {

        this.proxyService = new ProxyService();

        this.consumerCount = consumerCount;
        this.maxCoroutineCount = maxCoroutineCount;
        this.conSocketTimeout = conSocketTimeout;

        // +3 slots for the logging, proxy and producer threads.
        this.threadPool = Executors.newFixedThreadPool(consumerCount + 3);
        this.taskService = new TaskService();
        this.threads = new ArrayList<>();
    }

    /** Starts all service threads; returns immediately. */
    public void scrape() {

        this.logger = new LoggingThread(this, taskService);
        threads.add(this.logger);
        threadPool.submit(this.logger);

        this.proxyThread = new ProxyThread(this, this.proxyService);
        threads.add(this.proxyThread);
        threadPool.submit(this.proxyThread);

        this.producer = new ProducerThread(this, taskService);
        threads.add(this.producer);
        threadPool.submit(this.producer);

        for (int i = 0; i < consumerCount; i++) {
            DebugLogger.log("Scraper: Created consumer thread.");
            TweetThread tweetThread = new TweetThread(i + 1, this, taskService);
            threads.add(tweetThread);
            threadPool.submit(tweetThread);
        }
    }

    /**
     * Signals every registered worker to stop, then shuts the pool down,
     * escalating to shutdownNow() if graceful termination times out.
     */
    public void stop() {
        // Cooperative stop: each ThreadBase loop checks this volatile flag.
        for (ThreadBase item : threads) {
            item.running = false;
        }

        try {
            System.out.println("Attempting to shutdown thread pool...");
            threadPool.shutdown();
            threadPool.awaitTermination(400, TimeUnit.SECONDS);
        } catch (InterruptedException e) {
            e.printStackTrace();
            System.err.println("Thread pool termination interrupted.");
        } finally {
            if (!threadPool.isTerminated()) {
                System.err.println("Forcing thread pool shutdown...");
                threadPool.shutdownNow();
                try {
                    threadPool.awaitTermination(60, TimeUnit.SECONDS);
                } catch (InterruptedException e) {
                    e.printStackTrace();
                    throw new RuntimeException(e);
                }
            }
            System.out.println("Thread pool shutdown complete.");
        }
    }
}
119 |
--------------------------------------------------------------------------------
/src/main/java/com/scrapium/ThreadBase.java:
--------------------------------------------------------------------------------
package com.scrapium;

/**
 * Minimal base class for the scraper's long-running worker threads.
 * Workers loop while {@code running} is true; Scraper.stop() flips it to false.
 */
public class ThreadBase {
    // volatile so a stop request from another thread is immediately visible
    // to the worker's loop condition.
    public volatile boolean running = true;

}
7 |
--------------------------------------------------------------------------------
/src/main/java/com/scrapium/TweetThreadTaskProcessor.java:
--------------------------------------------------------------------------------
package com.scrapium;

import com.scrapium.proxium.Proxy;
import com.scrapium.threads.LoggingThread;
import com.scrapium.tweetium.TaskService;
import com.scrapium.tweetium.TweetTask;
import com.scrapium.utils.DebugLogger;
import io.netty.handler.ssl.SslContext;
import io.netty.handler.ssl.SslContextBuilder;
import org.apache.hc.client5.http.auth.AuthScope;
import org.apache.hc.client5.http.auth.CredentialsProvider;
import org.apache.hc.client5.http.auth.UsernamePasswordCredentials;
import org.apache.hc.client5.http.impl.auth.BasicCredentialsProvider;
import org.asynchttpclient.*;
import org.asynchttpclient.proxy.ProxyServer;
import org.asynchttpclient.proxy.ProxyType;

import javax.net.ssl.*;

import static org.asynchttpclient.Dsl.*;


import java.io.IOException;
import java.net.Socket;
import java.security.KeyManagementException;
import java.security.NoSuchAlgorithmException;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
import java.time.Duration;
import java.time.Instant;
import java.util.concurrent.*;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.Base64;

/**
 * Per-consumer-thread request engine: pulls the next tweet task, picks a
 * proxy, and fires an asynchronous HTTP request whose lifecycle is tracked
 * through a shared coroutine counter.
 */
public class TweetThreadTaskProcessor {
    private AsyncHttpClient c;
    private final DefaultAsyncHttpClientConfig clientConfig;

    /*
        Notes TODO:
        - AtomicReference isn't efficient (create a new object instead)
    */

    private Scraper scraper;
    private TaskService taskService;
    private final int threadID;
    private volatile boolean tweetThreadRunning;
    // Number of requests currently in flight for this processor.
    private AtomicInteger coroutineCount;

    private int requestCount;
    private Instant lastCleanup;

    // When true, the HTTP client is periodically torn down and rebuilt
    // (see doClientCleanupTick); currently disabled.
    private final boolean DO_CLEANUP = false;

    /**
     * Builds a TLS context that accepts every server certificate.
     * WARNING: certificate validation is intentionally disabled here —
     * acceptable only because traffic is deliberately routed through
     * untrusted scraping proxies.
     */
    private SSLContext createSslContext() throws Exception {
        X509TrustManager tm = new X509TrustManager() {

            public void checkClientTrusted(X509Certificate[] xcs,
                                           String string) throws CertificateException {
                // trust all clients
            }

            public void checkServerTrusted(X509Certificate[] xcs,
                                           String string) throws CertificateException {
                // trust all servers
            }

            public X509Certificate[] getAcceptedIssuers() {
                // Fixed: the X509TrustManager contract requires a non-null
                // (possibly empty) array; returning null can trigger NPEs in
                // TLS code that iterates the accepted issuers.
                return new X509Certificate[0];
            }
        };

        SSLContext ctx = SSLContext.getInstance("TLS");
        ctx.init(null, new TrustManager[] { tm }, null);
        return ctx;
    }

    /**
     * @param threadID       1-based id of the owning consumer thread
     * @param running        initial running flag for this processor
     * @param scraper        owning scraper (supplies proxy service and logger)
     * @param taskService    queue of tweet tasks to execute
     * @param coroutineCount shared in-flight request counter
     */
    public TweetThreadTaskProcessor(int threadID, boolean running, Scraper scraper, TaskService taskService, AtomicInteger coroutineCount) {
        this.threadID = threadID;
        this.scraper = scraper;
        this.taskService = taskService;
        this.coroutineCount = coroutineCount;
        this.tweetThreadRunning = running;

        this.clientConfig = new DefaultAsyncHttpClientConfig.Builder()
                .setConnectTimeout(8000)
                .setRequestTimeout(8000)
                .setReadTimeout(5000)
                .setMaxConnections(5000)
                .setMaxRequestRetry(1)
                .build();

        this.c = asyncHttpClient(this.clientConfig);

        this.lastCleanup = Instant.now();
    }

    /** If cleanup is enabled and 180s have elapsed, closes and recreates the HTTP client. */
    public void doClientCleanupTick(){
        if(DO_CLEANUP){
            if(this.lastCleanup.isBefore(Instant.now().minusSeconds(180))){
                System.out.println("[!] Doing client clean up.");
                this.lastCleanup = Instant.now();
                try {
                    this.c.close();
                } catch (IOException e) {
                    throw new RuntimeException(e);
                }
                // In-flight requests died with the old client; reset the counter.
                this.coroutineCount.set(0);
                this.c = asyncHttpClient(this.clientConfig);
            }
        }
    }

    /*
        Run Continuously
    */
    /** Pulls the next task (if any) and fires it through a freshly selected proxy. */
    public void processNextTask(){

        if(!DO_CLEANUP || this.lastCleanup.isBefore(Instant.now().minusSeconds(10))){
            DebugLogger.log("TweetThreadTask: Before attempting to increase request count.");

            if(this.taskService.hasNextTask()){
                Proxy proxy = this.scraper.proxyService.getNewProxy();
                TweetTask task = this.taskService.getNextTask();

                if(proxy != null){

                    // Debugging version only makes debug requests!
                    Request request1 = new RequestBuilder("GET")
                            .setUrl("http://httpforever.com")
                            .setProxyServer(new ProxyServer.Builder(proxy.getIP(), proxy.getPort()).build())
                            .build();

                    // The handler increments the coroutine count in its
                    // constructor and decrements it on completion/error.
                    c.executeRequest(request1, new handler(c, proxy, task, this));
                } else {
                    System.out.println("No proxies are available!");
                }
            }
        }
    }

    public Scraper getScraper(){
        return this.scraper;
    }

    public LoggingThread getLogger(){
        return this.scraper.logger;
    }

    public int getCoroutineCount() { return this.coroutineCount.get(); }

    public void incrementCoroutineCount() { this.coroutineCount.incrementAndGet(); }
    public void decrementCoroutineCount() { this.coroutineCount.decrementAndGet(); }

    public TaskService getTaskService(){
        return this.taskService;
    }
}
--------------------------------------------------------------------------------
/src/main/java/com/scrapium/handler.java:
--------------------------------------------------------------------------------
package com.scrapium;

import com.scrapium.proxium.Proxy;
import com.scrapium.tweetium.TweetTask;
import io.netty.handler.codec.http.HttpHeaders;
import org.asynchttpclient.*;

import java.io.IOException;

// Async HTTP response handler for one proxied request: updates the logger's
// counters, the proxy's success/failure statistics and the task's state as
// the response streams in.
// NOTE(review): the class name violates Java naming conventions (should be
// capitalized), but renaming would break existing callers.
public class handler implements AsyncHandler {
    private final AsyncHttpClient client;
    private final TweetThreadTaskProcessor processor;
    private final Proxy proxy;
    private final TweetTask task;
    // HTTP status code recorded in onStatusReceived.
    private Integer status;

    public handler(AsyncHttpClient client, Proxy proxy, TweetTask task, TweetThreadTaskProcessor tweetThreadTaskProcessor) {
        this.client = client;
        this.proxy = proxy;
        this.task = task;
        this.processor = tweetThreadTaskProcessor;
        // One coroutine slot is held for the lifetime of this request;
        // it is released in onCompleted() or onThrowable().
        this.processor.incrementCoroutineCount();
    }
    // Called when the status line arrives; classifies the request as a
    // success (2xx) or failure and updates all counters accordingly.
    @Override
    public AsyncHandler.State onStatusReceived(HttpResponseStatus responseStatus) throws Exception {
        status = responseStatus.getStatusCode();
        if(status >= 200 && status < 300){
            this.processor.getScraper().logger.increaseSuccessRequestCount();
            proxy.onSuccess();
            System.out.print("V"); // console progress marker: success
            processor.getTaskService().successfulTask(task);
        } else {
            this.processor.getScraper().logger.increaseFailedRequestCount();
            proxy.onFailure();
            System.out.print("X"); // console progress marker: HTTP-level failure
        }
        //try { c.close(); } catch (IOException e) { throw new RuntimeException(e); }
        return State.CONTINUE;
    }

    @Override
    public State onHeadersReceived(HttpHeaders headers) throws Exception {
        //try { c.close(); } catch (IOException e) { throw new RuntimeException(e); }
        return State.CONTINUE;
    }

    @Override
    public AsyncHandler.State onBodyPartReceived(HttpResponseBodyPart bodyPart) throws Exception {

        //try { c.close(); } catch (IOException e) { throw new RuntimeException(e); }
        return State.CONTINUE;

    }
    // Releases the coroutine slot once the response has been fully received.
    @Override
    public Integer onCompleted() throws Exception {
        this.processor.decrementCoroutineCount();
        return 200;
    }

    // Invoked on any transport/timeout error: marks the proxy and the task as
    // failed and releases the coroutine slot.
    @Override
    public void onThrowable(Throwable t) {
        proxy.onFailure();
        //System.out.print("E");
        // Handle exceptions here
        this.processor.getScraper().logger.increaseFailedRequestCount();
        this.processor.decrementCoroutineCount();
        processor.getTaskService().failTask(task);
        //System.err.println("An error occurred: " + t.getMessage());
    }


}
--------------------------------------------------------------------------------
/src/main/java/com/scrapium/proxium/Proxy.java:
--------------------------------------------------------------------------------
package com.scrapium.proxium;

import java.sql.Timestamp;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Thread-safe record of a single proxy's identity and health statistics:
 * usage/success/failure counters, the current consecutive-failure streak,
 * and a cooldown deadline after which the proxy may be used again.
 */
public class Proxy {

    private int id;
    private String connectionString;   // e.g. "1.2.3.4:8080"
    private AtomicInteger usageCount;  // total requests attempted via this proxy
    private AtomicInteger successCount;
    private AtomicInteger failedCount;
    private AtomicInteger failStreak;  // consecutive failures since the last success
    private AtomicLong cooldownUntil;  // epoch millis; proxy is benched until then

    /**
     * @param _cooldownUntil may be null, which is treated as "no cooldown".
     */
    public Proxy(int id, String connectionString, int _usageCount, int _successCount, int _failedCount, int _failStreak, Timestamp _cooldownUntil) {
        this.id = id;
        this.connectionString = connectionString;
        this.usageCount = new AtomicInteger(_usageCount);
        this.successCount = new AtomicInteger(_successCount);
        this.failedCount = new AtomicInteger(_failedCount);
        this.failStreak = new AtomicInteger(_failStreak);

        long coolUntil = (_cooldownUntil == null) ? 0 : _cooldownUntil.getTime();
        this.cooldownUntil = new AtomicLong(coolUntil);
    }

    /** Records a successful request: clears the fail streak and the cooldown. */
    public void onSuccess(){
        this.usageCount.incrementAndGet();
        this.successCount.incrementAndGet();
        this.failStreak.set(0);
        this.cooldownUntil.set(System.currentTimeMillis());
    }

    /**
     * Records a failed request. After 50 consecutive failures the proxy enters
     * an exponentially growing cooldown, capped at 2 minutes.
     */
    public void onFailure() {
        this.usageCount.incrementAndGet();
        this.failedCount.incrementAndGet();
        int streak = this.failStreak.incrementAndGet();

        if (streak > 50) {
            long baseCooldownTime = 1000;    // 1 second base penalty
            long maxCooldownTime = 120000;   // 2 minute cap
            double exponentialFactor = 1.2;  // must be > 1 so the penalty grows

            // Fixed: the old code used a factor of 0.5 and cast Math.pow(...)
            // to long *before* multiplying, which truncated to 0 — the cooldown
            // shrank to nothing as the fail streak grew. Compute in double
            // first, then clamp to the cap.
            double scaled = baseCooldownTime * Math.pow(exponentialFactor, streak - 50);
            long cooldownTime = (long) Math.min(scaled, (double) maxCooldownTime);

            long candidate = System.currentTimeMillis() + cooldownTime;

            // Only ever extend an existing cooldown, never shorten it.
            if (candidate > this.cooldownUntil.get()) {
                this.cooldownUntil.set(candidate);
            }
        }
    }

    public String getConnectionString(){
        return this.connectionString;
    }

    /** @return the dotted-quad IPv4 address parsed from the connection string ("" if absent). */
    public String getIP() {
        return extractWithPattern(this.connectionString, "(\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3})");
    }

    /**
     * @return the port parsed from the connection string
     * @throws NumberFormatException if the string contains no ":port" part
     */
    public int getPort() {
        return Integer.parseInt(extractWithPattern(this.connectionString, "(?<=:)(\\d+)"));
    }

    /** Returns the first match of {@code pattern} in {@code input}, or "" if none. */
    public static String extractWithPattern(String input, String pattern) {
        Pattern compiledPattern = Pattern.compile(pattern);
        Matcher matcher = compiledPattern.matcher(input);
        if (matcher.find()) {
            return matcher.group();
        }
        return "";
    }

    public int getUsageCount() {
        return this.usageCount.get();
    }

    public int getSuccessCount() {
        return this.successCount.get();
    }

    public int getFailedCount() {
        return this.failedCount.get();
    }

    public int getFailStreak() {
        return this.failStreak.get();
    }

    public Timestamp getCooldownUntil() {
        return new Timestamp(this.cooldownUntil.get());
    }

    public int getID() {
        return this.id;
    }

    public void debug_incrementUsageCount() {
        this.usageCount.incrementAndGet();
    }

    /** @return successes minus failures; used to rank proxies. */
    public int getSuccessDelta(){
        return this.getSuccessCount() - this.getFailedCount();
    }

    /** @return true while the cooldown deadline lies in the future. */
    public boolean inCoolDown(){
        if(this.cooldownUntil.get() < System.currentTimeMillis()){
            return false;
        }
        return true;
    }

    @Override
    public String toString() {
        return "Proxy{" +
                "id=" + id +
                ", connectionString='" + connectionString + '\'' +
                ", usageCount=" + usageCount +
                ", successCount=" + successCount +
                ", failedCount=" + failedCount +
                ", failStreak=" + failStreak +
                ", cooldownUntil=" + cooldownUntil +
                '}';
    }
}
--------------------------------------------------------------------------------
/src/main/java/com/scrapium/proxium/ProxyService.java:
--------------------------------------------------------------------------------
package com.scrapium.proxium;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.sql.*;
import java.util.*;
import java.util.concurrent.CopyOnWriteArrayList;

/**
 * Loads proxies from disk and hands the best-performing, non-cooled-down
 * proxies to the scraper threads.
 */
public class ProxyService {

    private final Random rand;
    // Full pool, loaded once from ./checked_proxies.txt.
    private ArrayList<Proxy> proxies;
    // Snapshot of proxies currently out of cooldown, sorted best-first.
    private ArrayList<Proxy> availableProxies;

    public ProxyService (){
        this.proxies = new ArrayList<>();
        this.availableProxies = new ArrayList<>();
        this.rand = new Random();
    }

    /** Reads ./checked_proxies.txt (one connection string per line) into the pool. */
    public void loadProxies() {
        synchronized (this.proxies){
            try (BufferedReader br = new BufferedReader(new FileReader("./checked_proxies.txt"))) {
                String _proxy_entry;
                int i = 0;

                while ((_proxy_entry = br.readLine()) != null) {
                    // Strip any stray CR/LF characters left by the file format.
                    String connString = _proxy_entry.replaceAll("[\\r\\n]+", "");

                    // Fresh proxies start with zeroed stats and no cooldown.
                    Proxy proxy = new Proxy(i++, connString, 0, 0, 0, 0, new Timestamp(0));
                    this.proxies.add(proxy);
                }

                System.out.println("Loaded (" + i + ") proxies!");
            } catch (IOException e) {
                System.err.format("IOException: %s%n", e);
            }
        }
    }

    /**
     * Rebuilds the available-proxy snapshot: everything not in cooldown,
     * sorted by (successDelta - failedCount) descending.
     */
    public void updateAvailableProxies(){
        // Build the new snapshot off to the side, then swap it in, so readers
        // never see a half-built list.
        ArrayList<Proxy> fresh = new ArrayList<>();
        for (Proxy p : this.proxies) {
            if (!p.inCoolDown()) {
                fresh.add(p);
            }
        }

        // Fixed comparator: the old version used p2.getFailedCount() on BOTH
        // sides of the comparison, making the sort order effectively arbitrary.
        fresh.sort((p1, p2) -> Integer.compare(
                p2.getSuccessDelta() - p2.getFailedCount(),
                p1.getSuccessDelta() - p1.getFailedCount()));

        synchronized (this.availableProxies) {
            this.availableProxies = fresh;
            if (this.availableProxies.size() < 50) {
                System.out.println("!! INCREDIBLY LOW AVAILABLE PROXY POOL SIZE (" + availableProxies.size() + ")");
            }
        }
    }

    /**
     * Returns a random proxy from roughly the top-30 ranked available proxies,
     * retrying up to 150 times to skip entries that entered cooldown since the
     * last snapshot. Returns null when no proxies are available at all.
     */
    public Proxy getNewProxy() {

        boolean proxyInCoolDown = true;
        int attempts = 0;

        Proxy randomProxy = null;

        while(proxyInCoolDown && attempts <= 150){
            synchronized (this.availableProxies) {

                if (availableProxies.isEmpty()) {
                    System.out.println("No available Proxies....");
                    return null;
                }
                // Pick among the first 30 (best-ranked) entries, clamped to the
                // list size. Fixed: the old bound check used '>' instead of '>=',
                // so an index equal to size() slipped through and
                // availableProxies.get(size()) threw IndexOutOfBoundsException.
                int bound = Math.min(30, availableProxies.size());
                int randInd = rand.nextInt(bound);
                randomProxy = availableProxies.get(randInd);
                proxyInCoolDown = randomProxy.inCoolDown();
                attempts++;
            }
        }

        // Fixed: the warning threshold now matches the actual loop limit (150);
        // the old code compared against 100 and warned even after a success.
        if (proxyInCoolDown && attempts > 150) {
            System.out.println("Warning: iterated over 150 random proxies and couldn't find a viable proxy NOT in cooldown.");

            // TODO: reset all proxies
        }

        return randomProxy;
    }

    /** @return current size of the available-proxy snapshot. */
    public int getAvailableProxyCount(){
        synchronized (this.availableProxies){
            return availableProxies.size();
        }
    }
}
--------------------------------------------------------------------------------
/src/main/java/com/scrapium/tests/Benchmark.java:
--------------------------------------------------------------------------------
package com.scrapium.tests;

import com.scrapium.Scraper;
import com.scrapium.utils.TimeUtils;

import java.util.HashMap;
import java.util.Map;

// Sweeps Scraper configurations (consumer threads x coroutine limit x socket
// timeout), runs each configuration for a fixed window, and reports the one
// with the most successful requests.
public class Benchmark {
    public static void runTest() {
        // configKey -> successful request count for that run
        Map configResults = new HashMap<>();
        String bestConfigKey = "";
        int highestSuccessfulRequests = 0;

        double timePerTest = 5 * 60 * 1000; // 5 minutes per configuration, in milliseconds

        // NOTE(review): integer division floors each factor, so this is an
        // approximation of the nested-loop iteration count below; the last
        // factor also uses 4 where the loop actually starts at 6.
        int totalTestCount = (((6-1)/2) * ((15000-100)/250) * ((28 - 4)/10));
        int totalTestTime = (int) (totalTestCount * timePerTest);

        int testIter = 0;

        System.out.println("\n== Test started ==\n");
        System.out.println("- Total Tests = " + (totalTestCount));
        System.out.println("- Test will be completed " + TimeUtils.timeToString((totalTestTime/1000)));

        for (int maxCoroutineCount = 100; maxCoroutineCount <= 15000; maxCoroutineCount += 250) { // 100 -> 15000 step 250
            for (int consumerCount = 1; consumerCount <= 6; consumerCount += 2) { // 1 -> 6 step 2
                for (int conSocketTimeout = 6; conSocketTimeout <= 28; conSocketTimeout += 10) { // 6 -> 28 step 10

                    Scraper scraper = new Scraper(consumerCount, maxCoroutineCount, conSocketTimeout);
                    scraper.scrape();

                    String configKey = String.format("c_%d_m_%d_t_%d", consumerCount, maxCoroutineCount, conSocketTimeout);

                    System.out.println("\n[" + testIter + "/" + totalTestCount + "] Starting test: "+ configKey + "\n");

                    int timeRemaining = (int) (totalTestTime - testIter * timePerTest);
                    System.out.println("( Test will be completed " + TimeUtils.timeToString(timeRemaining/1000) + " )\n");

                    // Let this configuration run for the full measurement window.
                    try {
                        Thread.sleep((long) timePerTest);
                    } catch (InterruptedException e) {
                        e.printStackTrace();
                        throw new RuntimeException(e);
                    }

                    int successfulRequests = scraper.logger.successRequestCount.get();
                    int failedRequests = scraper.logger.failedRequestCount.get();


                    configResults.put(configKey, successfulRequests);


                    scraper.stop();

                    System.out.printf("\n"+ "Test (" + testIter + "/" + totalTestCount + ") Finished Configuration: %s | Successful Requests: %d | Failed Requests: %d%n\n",
                            configKey, successfulRequests, failedRequests);


                    testIter++;

                    if (successfulRequests > highestSuccessfulRequests) {
                        highestSuccessfulRequests = successfulRequests;
                        bestConfigKey = configKey;
                    }

                    // Cool-down pause between configurations.
                    try {
                        Thread.sleep(10000);
                    } catch (InterruptedException e) {
                        e.printStackTrace();
                        throw new RuntimeException(e);
                    }
                }
            }
        }

        System.out.println("\n== All Configuration Results ==");
        System.out.println("\n== C=threads m=coroutines t=timeout");

        for (Map.Entry entry : configResults.entrySet()) {
            System.out.printf("Configuration: %s | Successful Requests: %d%n", entry.getKey(), entry.getValue());
        }

        System.out.printf("\nBest Configuration: %s | Highest Successful Requests: %d%n", bestConfigKey, highestSuccessfulRequests);
    }
}
87 |
--------------------------------------------------------------------------------
/src/main/java/com/scrapium/threads/LoggingThread.java:
--------------------------------------------------------------------------------
package com.scrapium.threads;

import com.scrapium.Scraper;
import com.scrapium.ThreadBase;
import com.scrapium.tweetium.TaskService;

import java.time.Duration;
import java.time.Instant;
import java.util.concurrent.atomic.AtomicInteger;

/**
 * Prints request-rate statistics for the scraper every 2 seconds until its
 * running flag is cleared.
 */
public class LoggingThread extends ThreadBase implements Runnable {

    private final Instant scraperStart;
    private Scraper scraper;
    private AtomicInteger coroutineCount;
    private int lastRequestCount = 0;
    public AtomicInteger successRequestCount;
    public AtomicInteger failedRequestCount;

    private long startEpoch;      // epoch seconds when logging began
    private int lastSuccessCount; // counter values at the previous tick, for deltas
    private int lastFailedCount;
    private long lastLogEpoch;    // epoch seconds of the previous tick

    private TaskService taskService;

    public LoggingThread(Scraper scraper, TaskService taskService) {
        this.scraper = scraper;
        this.taskService = taskService;
        this.successRequestCount = new AtomicInteger(0);
        this.failedRequestCount = new AtomicInteger(0);
        this.startEpoch = System.currentTimeMillis() / 1000;

        this.scraperStart = Instant.now();
    }

    /** Formats a duration as "N days,N hours,N minutes,N seconds," omitting zero units. */
    public static String format(Duration d) {
        long days = d.toDays();
        d = d.minusDays(days);
        long hours = d.toHours();
        d = d.minusHours(hours);
        long minutes = d.toMinutes();
        d = d.minusMinutes(minutes);
        long seconds = d.getSeconds() ;
        return
            (days == 0?"":days+" days,")+
            (hours == 0?"":hours+" hours,")+
            (minutes == 0?"":minutes+" minutes,")+
            (seconds == 0?"":seconds+" seconds,");
    }

    @Override
    public void run() {
        while (this.running) {

            long currentEpoch = System.currentTimeMillis() / 1000;

            int successDelta = this.successRequestCount.get() - this.lastSuccessCount;
            int failedDelta = this.failedRequestCount.get() - this.lastFailedCount;

            // Fixed: these rates were previously computed with integer division
            // before being stored in a double, so fractional requests/second
            // were always truncated. Also guard against a zero-length interval.
            double interval = Math.max(1, currentEpoch - this.lastLogEpoch);
            double successPS = successDelta / interval;
            double failedPS = failedDelta / interval;

            int secondSinceStart = (int) (currentEpoch - this.startEpoch);

            double successPSTotal = this.successRequestCount.get() / (double) (secondSinceStart == 0 ? 1 : secondSinceStart);

            String out = "\n\n=== Tweet Scraper ===\n";
            out += ("Requests : " + (this.successRequestCount.get() + this.failedRequestCount.get())) + "\n";
            out += ("Success/s: " + (successPS)) + "\n";
            out += ("Success Total/s: " + (successPSTotal)) + "\n";
            out += ("Failed/s: " + (failedPS)) + "\n";
            out += ("Available Proxies: " + (this.scraper.proxyService.getAvailableProxyCount())) + "\n";
            out += ("Running for: " + format(Duration.between(this.scraperStart, Instant.now())));

            System.out.println(out);

            // Snapshot counters for the next tick's deltas.
            this.lastSuccessCount = this.successRequestCount.get();
            this.lastFailedCount = this.failedRequestCount.get();
            this.lastLogEpoch = System.currentTimeMillis() / 1000;

            try {
                Thread.sleep(2000);
            } catch (InterruptedException e) {
                e.printStackTrace();
                throw new RuntimeException(e);
            }
        }
    }

    public void increaseSuccessRequestCount(){
        successRequestCount.incrementAndGet();
    }

    public void increaseFailedRequestCount(){
        failedRequestCount.incrementAndGet();
    }


}
107 |
--------------------------------------------------------------------------------
/src/main/java/com/scrapium/threads/ProducerThread.java:
--------------------------------------------------------------------------------
1 | package com.scrapium.threads;
2 |
3 | import com.scrapium.Scraper;
4 | import com.scrapium.ThreadBase;
5 | import com.scrapium.tweetium.TaskService;
6 | import com.scrapium.tweetium.TweetTask;
7 |
8 | public class ProducerThread extends ThreadBase implements Runnable {
9 |
10 |
11 | private final Scraper scraper;
12 | private TaskService taskService;
13 |
14 | private int debug_epoch = 1575072000;
15 | private String debug_search = "$BTC";
16 | public ProducerThread(Scraper scraper, TaskService taskService) {
17 | this.scraper = scraper;
18 | this.taskService = taskService;
19 | }
20 | @Override
21 | public void run() {
22 | while (this.running) {
23 |
24 | if(this.taskService.doesQueueHaveFreeSpace()){
25 | TweetTask newTask = new TweetTask(debug_search, debug_epoch, debug_epoch + 30);
26 | debug_epoch += 30;
27 | this.taskService.addNewTweetTaskEnd(newTask);
28 | }
29 | }
30 | }
31 | }
--------------------------------------------------------------------------------
/src/main/java/com/scrapium/threads/ProxyThread.java:
--------------------------------------------------------------------------------
1 | package com.scrapium.threads;
2 |
3 | import com.scrapium.Scraper;
4 | import com.scrapium.ThreadBase;
5 | import com.scrapium.proxium.ProxyService;
6 |
7 | public class ProxyThread extends ThreadBase implements Runnable {
8 |
9 |
10 | private final Scraper scraper;
11 | private final ProxyService proxyService;
12 |
13 | public ProxyThread(Scraper scraper, ProxyService proxyService) {
14 | this.scraper = scraper;
15 | this.proxyService = proxyService;
16 | this.proxyService.loadProxies();
17 | }
18 |
19 | @Override
20 | public void run() {
21 | while (this.running) {
22 |
23 | this.proxyService.updateAvailableProxies();
24 |
25 | try {
26 | Thread.sleep(500);
27 | } catch (InterruptedException e) {
28 | throw new RuntimeException(e);
29 | }
30 | }
31 | }
32 | }
--------------------------------------------------------------------------------
/src/main/java/com/scrapium/threads/TweetThread.java:
--------------------------------------------------------------------------------
1 | package com.scrapium.threads;
2 |
3 | import com.scrapium.Scraper;
4 | import com.scrapium.ThreadBase;
5 | import com.scrapium.TweetThreadTaskProcessor;
6 | import com.scrapium.tweetium.TaskService;
7 | import com.scrapium.utils.DebugLogger;
8 |
9 | import java.util.concurrent.atomic.AtomicInteger;
10 |
11 | public class TweetThread extends ThreadBase implements Runnable {
12 |
13 |
14 |
15 | private final Scraper scraper;
16 | private final int threadID;
17 | private TaskService taskService;
18 | private AtomicInteger coroutineCount;
19 |
20 | private final TweetThreadTaskProcessor taskProcessor;
21 |
22 | // possibly move maxCoroutineCount to scraper, so it doesn't need to be updated in each class - blocking.
23 |
24 | public TweetThread(int i, Scraper scraper, TaskService taskService) {
25 | this.threadID = i;
26 | this.scraper = scraper;
27 | this.taskService = taskService;
28 | this.coroutineCount = new AtomicInteger(0);
29 | this.taskProcessor = new TweetThreadTaskProcessor(this.threadID, this.running, this.scraper, taskService, this.coroutineCount);
30 | }
31 |
32 | @Override
33 | public void run() {
34 | while (this.running) {
35 | // move tick to different thread for code speed-up
36 | this.taskProcessor.doClientCleanupTick();
37 |
38 | if (this.coroutineCount.get() < scraper.maxCoroutineCount) {
39 | DebugLogger.log("TweetThread: Ran cycle");
40 | DebugLogger.log("TweetThread: Task Taken");
41 | this.taskProcessor.processNextTask();
42 | DebugLogger.log("Decrementing counter");
43 |
44 | } else {
45 | if(this.coroutineCount.get() >= scraper.maxCoroutineCount){
46 | //System.out.println("Skipping thread execution!");
47 | //System.out.println(" Reason: MAX CO-ROUTINES (" + this.coroutineCount.get() + "/" + scraper.maxCoroutineCount + ")");
48 | }
49 |
50 | try {
51 | Thread.sleep(150); // Sleep when the maximum number of tasks are being executed
52 | } catch (InterruptedException e) {
53 | throw new RuntimeException(e);
54 | }
55 | }
56 |
57 | /*
58 | try {
59 | //System.out.println("tweetThread coroutine count = " + this.coroutineCount.get());
60 | //System.out.println("tweetThread taskQueue = " + this.taskQueue.size());
61 |
62 | if (this.taskQueue.size() > 0 && this.coroutineCount.get() < scraper.maxCoroutineCount) {
63 | DebugLogger.log("TweetThread: Ran cycle");
64 | DebugLogger.log("TweetThread: Task Taken");
65 | this.taskProcessor.processNextTask();
66 | DebugLogger.log("Decrementing counter");
67 |
68 | } else {
69 |
70 | if(this.taskQueue.size() == 0){
71 | //System.out.println("Skipping thread execution!");
72 | //System.out.println(" Reason: QUEUE EMPTY");
73 | }
74 | if(this.coroutineCount.get() >= scraper.maxCoroutineCount){
75 | //System.out.println("Skipping thread execution!");
76 | //System.out.println(" Reason: MAX CO-ROUTINES (" + this.coroutineCount.get() + "/" + scraper.maxCoroutineCount + ")");
77 | }
78 |
79 | Thread.sleep(150); // Sleep when the maximum number of tasks are being executed
80 | }
81 | } catch (Exception e) {
82 | e.printStackTrace();
83 | DebugLogger.log("Interrupted Exception!");
84 | } */
85 | }
86 |
87 | // TODO: readd close request client;
88 | //System.out.println("closeRequestClient called");
89 | //this.taskProcessor.closeRequestClient();
90 | }
91 | }
92 |
--------------------------------------------------------------------------------
/src/main/java/com/scrapium/tweetium/TaskService.java:
--------------------------------------------------------------------------------
1 | package com.scrapium.tweetium;
2 |
3 | import java.util.concurrent.ConcurrentLinkedDeque;
4 | import java.util.concurrent.ConcurrentLinkedQueue;
5 |
6 | public class TaskService {
7 |
8 | private ConcurrentLinkedDeque backlogTweetQueue;
9 | private ConcurrentLinkedQueue inProcessingTweetQueue;
10 |
11 | public TaskService() {
12 | backlogTweetQueue = new ConcurrentLinkedDeque<>();
13 | inProcessingTweetQueue = new ConcurrentLinkedQueue<>();
14 | }
15 |
16 | public boolean doesQueueHaveFreeSpace(){
17 | return (backlogTweetQueue.size() < 5000);
18 | }
19 |
20 | // maybe separate this into two lists.
21 | public TweetTask getNextTask(){
22 | if(backlogTweetQueue.size() == 0){
23 | System.out.println("[X] No tasks in the queue!");
24 | return null;
25 | }
26 | TweetTask task = backlogTweetQueue.poll();
27 | //task.setState(TweetTask.TweetTaskState.PROCESSING);
28 | inProcessingTweetQueue.add(task);
29 | return task;
30 | }
31 |
32 | public boolean hasNextTask(){
33 | return (backlogTweetQueue.size() > 0);
34 | }
35 |
36 | public void successfulTask(TweetTask task){
37 | inProcessingTweetQueue.remove(task);
38 | if(task.hasContinuation()/* && task.getState() == TweetTask.TweetTaskState.COMPLETED */){
39 | // continue with next request
40 | TweetTask continuationTask = task.getConsecutiveRequest();
41 | //continuationTask.setState(TweetTask.TweetTaskState.PROCESSING);
42 | this.backlogTweetQueue.addFirst(continuationTask);
43 | } else {
44 | // no continuation
45 | }
46 | }
47 |
48 | public void failTask(TweetTask task){
49 | inProcessingTweetQueue.remove(task);
50 | this.backlogTweetQueue.addFirst(task);
51 | }
52 |
53 | // fix branch prediction failures
54 | public void cleanup(){
55 | // do cleanup, if items in visible for more than 4 minutes, push to front of tweet queue
56 | }
57 |
58 | public void addNewTweetTaskEnd(TweetTask tweetTask) {
59 | this.backlogTweetQueue.addLast(tweetTask);
60 | }
61 | }
62 |
--------------------------------------------------------------------------------
/src/main/java/com/scrapium/tweetium/TweetTask.java:
--------------------------------------------------------------------------------
1 | package com.scrapium.tweetium;
2 |
/**
 * Immutable description of one scraping request: a search term, an epoch
 * time window, and an optional pagination cursor.
 */
public class TweetTask {

    private final String searchTerm;
    private final int fromEpoch;
    private final int toEpoch;
    private final String cursor;

    /** Builds a task for the given search window with an empty cursor. */
    public TweetTask(String searchTerm, int fromEpoch, int toEpoch) {
        this(searchTerm, fromEpoch, toEpoch, "");
    }

    /** Builds a task for the given search window resuming at {@code cursor}. */
    public TweetTask(String searchTerm, int fromEpoch, int toEpoch, String cursor) {
        this.searchTerm = searchTerm;
        this.fromEpoch = fromEpoch;
        this.toEpoch = toEpoch;
        this.cursor = cursor;
    }

    /** Follow-up request for this task; currently unimplemented (always null). */
    public TweetTask getConsecutiveRequest() {
        return null;
    }

    /** Whether a follow-up request exists; currently unimplemented (always false). */
    public boolean hasContinuation() {
        return false;
    }

    @Override
    public String toString() {
        return "TweetTask{" +
                "searchTerm='" + searchTerm + '\'' +
                ", fromEpoch=" + fromEpoch +
                ", toEpoch=" + toEpoch +
                ", cursor='" + cursor + '\'' +
                '}';
    }
}
43 |
--------------------------------------------------------------------------------
/src/main/java/com/scrapium/utils/DebugLogger.java:
--------------------------------------------------------------------------------
1 | package com.scrapium.utils;
2 |
/** Debug log sink for the scraper; currently disabled (no-op). */
public class DebugLogger {
    /**
     * Logs a debug message. The println body is commented out, so calls are
     * no-ops; uncomment it to re-enable verbose tracing.
     *
     * @param message text to log when tracing is enabled
     */
    public static void log(String message) {
        //System.out.println(message);
    }
}
8 |
--------------------------------------------------------------------------------
/src/main/java/com/scrapium/utils/TimeUtils.java:
--------------------------------------------------------------------------------
1 | package com.scrapium.utils;
2 |
3 | import java.sql.Timestamp;
4 | import java.util.Calendar;
5 |
6 | public class TimeUtils {
7 |
8 | public static String timeToString(int seconds) {
9 | if (seconds < 60) {
10 | return "in " + seconds + " seconds";
11 | } else if (seconds < 3600) {
12 | int minutes = seconds / 60;
13 | return "in " + minutes + " minute" + (minutes == 1 ? "" : "s");
14 | } else if (seconds < 86400) {
15 | int hours = seconds / 3600;
16 | return "in " + hours + " hour" + (hours == 1 ? "" : "s");
17 | } else {
18 | int days = seconds / 86400;
19 | return "in " + days + " day" + (days == 1 ? "" : "s");
20 | }
21 | }
22 |
23 | public static Timestamp nowPlusMinutes(int minutes){
24 | Timestamp currentTimestamp = new Timestamp(System.currentTimeMillis());
25 |
26 | // Create a Calendar instance and set the time to the current timestamp
27 | Calendar calendar = Calendar.getInstance();
28 | calendar.setTime(currentTimestamp);
29 |
30 | // Add 15 minutes to the calendar
31 | calendar.add(Calendar.MINUTE, minutes);
32 |
33 | // Get the new timestamp with the updated time
34 | Timestamp newTimeStamp = new Timestamp(calendar.getTimeInMillis());
35 |
36 | return newTimeStamp;
37 | }
38 |
39 |
40 | }
--------------------------------------------------------------------------------
/src/schema.sql:
--------------------------------------------------------------------------------
-- Proxy pool: one row per upstream proxy, with usage/health counters and
-- cooldown / guest-token state read and written by the scraper's ProxyService.
CREATE TABLE proxies (
  id SERIAL PRIMARY KEY,
  conn_string VARCHAR(255) NOT NULL,
  ip_address VARCHAR(255) NOT NULL,
  port INTEGER NOT NULL,
  is_socks BOOLEAN NOT NULL,  -- SOCKS vs HTTP proxy
  usage_count INTEGER DEFAULT 0,
  retry_count INTEGER DEFAULT 0,
  next_available TIMESTAMP DEFAULT '1970-01-01 00:00:00'::TIMESTAMP,  -- epoch default = available immediately
  guest_token VARCHAR(255),
  guest_token_updated TIMESTAMP DEFAULT '1970-01-01 00:00:00'::TIMESTAMP,
  success_delta INTEGER DEFAULT 0,
  failed_count INTEGER DEFAULT 0,
  last_updated TIMESTAMP DEFAULT '1970-01-01 00:00:00'::TIMESTAMP
);

GRANT SELECT, INSERT, UPDATE, DELETE ON TABLE proxies TO scrapium_user;
-- NOTE(review): 'proxies_id_seq1' is not the default sequence name
-- ('proxies_id_seq'); this suggests the table was dropped/recreated once.
-- Verify the sequence name in the target database before running.
GRANT USAGE, SELECT ON SEQUENCE proxies_id_seq1 TO scrapium_user;
19 |
20 | #
21 |
-- Create the test_proxy table
-- NOTE(review): a revised test_proxy definition appears later in this file;
-- this earlier version is kept for reference and must not be run alongside it
-- (the duplicate CREATE TABLE would fail).
CREATE TABLE test_proxy (
    id SERIAL PRIMARY KEY,
    connection_string VARCHAR(255) UNIQUE,
    usage_count INTEGER DEFAULT 0,
    success_count INTEGER DEFAULT 0,
    failed_count INTEGER DEFAULT 0,
    fail_streak INTEGER DEFAULT 0,  -- consecutive failures; basis for cooldown
    cooldown_until TIMESTAMPTZ
);

-- Grant privileges to scrapium_user
GRANT ALL PRIVILEGES ON TABLE test_proxy TO scrapium_user;
GRANT USAGE, SELECT ON SEQUENCE test_proxy_id_seq TO scrapium_user;

37 |
38 | #
39 |
40 |
41 |
-- Revised test_proxy schema: adds NOT NULL defaults, a last_used timestamp,
-- a status column, and supporting indexes. Supersedes the earlier definition
-- above; run only one of the two.
CREATE TABLE test_proxy (
    id SERIAL PRIMARY KEY,
    connection_string VARCHAR(255),
    usage_count INTEGER NOT NULL DEFAULT 0,
    success_count INTEGER NOT NULL DEFAULT 0,
    failed_count INTEGER NOT NULL DEFAULT 0,
    fail_streak INTEGER NOT NULL DEFAULT 0,  -- consecutive failures; basis for cooldown
    cooldown_until TIMESTAMP WITH TIME ZONE,
    last_used TIMESTAMP WITH TIME ZONE,
    status VARCHAR(50) NOT NULL DEFAULT 'active'
);

-- Indexes for the proxy-selection queries (cooldown filter, least-used,
-- least-recently-used ordering).
CREATE INDEX idx_cooldown ON test_proxy (cooldown_until);
CREATE INDEX idx_usage_count ON test_proxy (usage_count);
CREATE INDEX idx_last_used ON test_proxy (last_used);
GRANT ALL PRIVILEGES ON TABLE test_proxy TO scrapium_user;
GRANT USAGE, SELECT ON SEQUENCE test_proxy_id_seq TO scrapium_user;
--------------------------------------------------------------------------------