├── .gitignore
├── LICENSE.md
├── README.md
├── args.go
├── args_test.go
├── docs
│   ├── sequence-diagram.png
│   └── web-sequence-diagram.txt
├── html.go
├── html_test.go
├── httpGet.go
├── imageCrawler.go
├── imageCrawler_test.go
├── main.go
├── urlCrawler.go
└── urlCrawler_test.go
/.gitignore:
--------------------------------------------------------------------------------
1 |
2 | #
3 | # Ignore the binary
4 | #
5 | cat-crawler
6 | main
7 |
8 | #
9 | # Vi/vim swapfiles
10 | #
11 | *.swp
12 |
13 | #
14 | # My test output
15 | #
16 | output.txt
17 |
18 | #
19 | # Ignore our default downloads directory
20 | #
21 | cat-crawler-downloads
22 |
23 | #
24 | # Et tu, OS/X?
25 | #
26 | .DS_Store
27 |
28 |
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | GNU GENERAL PUBLIC LICENSE
2 | ==========================
3 |
4 | Version 3, 29 June 2007
5 |
6 | Copyright © 2007 Free Software Foundation, Inc. <https://fsf.org/>
7 |
8 | Everyone is permitted to copy and distribute verbatim copies of this license
9 | document, but changing it is not allowed.
10 |
11 | ## Preamble
12 |
13 | The GNU General Public License is a free, copyleft license for software and other
14 | kinds of works.
15 |
16 | The licenses for most software and other practical works are designed to take away
17 | your freedom to share and change the works. By contrast, the GNU General Public
18 | License is intended to guarantee your freedom to share and change all versions of a
19 | program--to make sure it remains free software for all its users. We, the Free
20 | Software Foundation, use the GNU General Public License for most of our software; it
21 | applies also to any other work released this way by its authors. You can apply it to
22 | your programs, too.
23 |
24 | When we speak of free software, we are referring to freedom, not price. Our General
25 | Public Licenses are designed to make sure that you have the freedom to distribute
26 | copies of free software (and charge for them if you wish), that you receive source
27 | code or can get it if you want it, that you can change the software or use pieces of
28 | it in new free programs, and that you know you can do these things.
29 |
30 | To protect your rights, we need to prevent others from denying you these rights or
31 | asking you to surrender the rights. Therefore, you have certain responsibilities if
32 | you distribute copies of the software, or if you modify it: responsibilities to
33 | respect the freedom of others.
34 |
35 | For example, if you distribute copies of such a program, whether gratis or for a fee,
36 | you must pass on to the recipients the same freedoms that you received. You must make
37 | sure that they, too, receive or can get the source code. And you must show them these
38 | terms so they know their rights.
39 |
40 | Developers that use the GNU GPL protect your rights with two steps: (1) assert
41 | copyright on the software, and (2) offer you this License giving you legal permission
42 | to copy, distribute and/or modify it.
43 |
44 | For the developers' and authors' protection, the GPL clearly explains that there is
45 | no warranty for this free software. For both users' and authors' sake, the GPL
46 | requires that modified versions be marked as changed, so that their problems will not
47 | be attributed erroneously to authors of previous versions.
48 |
49 | Some devices are designed to deny users access to install or run modified versions of
50 | the software inside them, although the manufacturer can do so. This is fundamentally
51 | incompatible with the aim of protecting users' freedom to change the software. The
52 | systematic pattern of such abuse occurs in the area of products for individuals to
53 | use, which is precisely where it is most unacceptable. Therefore, we have designed
54 | this version of the GPL to prohibit the practice for those products. If such problems
55 | arise substantially in other domains, we stand ready to extend this provision to
56 | those domains in future versions of the GPL, as needed to protect the freedom of
57 | users.
58 |
59 | Finally, every program is threatened constantly by software patents. States should
60 | not allow patents to restrict development and use of software on general-purpose
61 | computers, but in those that do, we wish to avoid the special danger that patents
62 | applied to a free program could make it effectively proprietary. To prevent this, the
63 | GPL assures that patents cannot be used to render the program non-free.
64 |
65 | The precise terms and conditions for copying, distribution and modification follow.
66 |
67 | ## TERMS AND CONDITIONS
68 |
69 | ### 0. Definitions.
70 |
71 | “This License” refers to version 3 of the GNU General Public License.
72 |
73 | “Copyright” also means copyright-like laws that apply to other kinds of
74 | works, such as semiconductor masks.
75 |
76 | “The Program” refers to any copyrightable work licensed under this
77 | License. Each licensee is addressed as “you”. “Licensees” and
78 | “recipients” may be individuals or organizations.
79 |
80 | To “modify” a work means to copy from or adapt all or part of the work in
81 | a fashion requiring copyright permission, other than the making of an exact copy. The
82 | resulting work is called a “modified version” of the earlier work or a
83 | work “based on” the earlier work.
84 |
85 | A “covered work” means either the unmodified Program or a work based on
86 | the Program.
87 |
88 | To “propagate” a work means to do anything with it that, without
89 | permission, would make you directly or secondarily liable for infringement under
90 | applicable copyright law, except executing it on a computer or modifying a private
91 | copy. Propagation includes copying, distribution (with or without modification),
92 | making available to the public, and in some countries other activities as well.
93 |
94 | To “convey” a work means any kind of propagation that enables other
95 | parties to make or receive copies. Mere interaction with a user through a computer
96 | network, with no transfer of a copy, is not conveying.
97 |
98 | An interactive user interface displays “Appropriate Legal Notices” to the
99 | extent that it includes a convenient and prominently visible feature that (1)
100 | displays an appropriate copyright notice, and (2) tells the user that there is no
101 | warranty for the work (except to the extent that warranties are provided), that
102 | licensees may convey the work under this License, and how to view a copy of this
103 | License. If the interface presents a list of user commands or options, such as a
104 | menu, a prominent item in the list meets this criterion.
105 |
106 | ### 1. Source Code.
107 |
108 | The “source code” for a work means the preferred form of the work for
109 | making modifications to it. “Object code” means any non-source form of a
110 | work.
111 |
112 | A “Standard Interface” means an interface that either is an official
113 | standard defined by a recognized standards body, or, in the case of interfaces
114 | specified for a particular programming language, one that is widely used among
115 | developers working in that language.
116 |
117 | The “System Libraries” of an executable work include anything, other than
118 | the work as a whole, that (a) is included in the normal form of packaging a Major
119 | Component, but which is not part of that Major Component, and (b) serves only to
120 | enable use of the work with that Major Component, or to implement a Standard
121 | Interface for which an implementation is available to the public in source code form.
122 | A “Major Component”, in this context, means a major essential component
123 | (kernel, window system, and so on) of the specific operating system (if any) on which
124 | the executable work runs, or a compiler used to produce the work, or an object code
125 | interpreter used to run it.
126 |
127 | The “Corresponding Source” for a work in object code form means all the
128 | source code needed to generate, install, and (for an executable work) run the object
129 | code and to modify the work, including scripts to control those activities. However,
130 | it does not include the work's System Libraries, or general-purpose tools or
131 | generally available free programs which are used unmodified in performing those
132 | activities but which are not part of the work. For example, Corresponding Source
133 | includes interface definition files associated with source files for the work, and
134 | the source code for shared libraries and dynamically linked subprograms that the work
135 | is specifically designed to require, such as by intimate data communication or
136 | control flow between those subprograms and other parts of the work.
137 |
138 | The Corresponding Source need not include anything that users can regenerate
139 | automatically from other parts of the Corresponding Source.
140 |
141 | The Corresponding Source for a work in source code form is that same work.
142 |
143 | ### 2. Basic Permissions.
144 |
145 | All rights granted under this License are granted for the term of copyright on the
146 | Program, and are irrevocable provided the stated conditions are met. This License
147 | explicitly affirms your unlimited permission to run the unmodified Program. The
148 | output from running a covered work is covered by this License only if the output,
149 | given its content, constitutes a covered work. This License acknowledges your rights
150 | of fair use or other equivalent, as provided by copyright law.
151 |
152 | You may make, run and propagate covered works that you do not convey, without
153 | conditions so long as your license otherwise remains in force. You may convey covered
154 | works to others for the sole purpose of having them make modifications exclusively
155 | for you, or provide you with facilities for running those works, provided that you
156 | comply with the terms of this License in conveying all material for which you do not
157 | control copyright. Those thus making or running the covered works for you must do so
158 | exclusively on your behalf, under your direction and control, on terms that prohibit
159 | them from making any copies of your copyrighted material outside their relationship
160 | with you.
161 |
162 | Conveying under any other circumstances is permitted solely under the conditions
163 | stated below. Sublicensing is not allowed; section 10 makes it unnecessary.
164 |
165 | ### 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
166 |
167 | No covered work shall be deemed part of an effective technological measure under any
168 | applicable law fulfilling obligations under article 11 of the WIPO copyright treaty
169 | adopted on 20 December 1996, or similar laws prohibiting or restricting circumvention
170 | of such measures.
171 |
172 | When you convey a covered work, you waive any legal power to forbid circumvention of
173 | technological measures to the extent such circumvention is effected by exercising
174 | rights under this License with respect to the covered work, and you disclaim any
175 | intention to limit operation or modification of the work as a means of enforcing,
176 | against the work's users, your or third parties' legal rights to forbid circumvention
177 | of technological measures.
178 |
179 | ### 4. Conveying Verbatim Copies.
180 |
181 | You may convey verbatim copies of the Program's source code as you receive it, in any
182 | medium, provided that you conspicuously and appropriately publish on each copy an
183 | appropriate copyright notice; keep intact all notices stating that this License and
184 | any non-permissive terms added in accord with section 7 apply to the code; keep
185 | intact all notices of the absence of any warranty; and give all recipients a copy of
186 | this License along with the Program.
187 |
188 | You may charge any price or no price for each copy that you convey, and you may offer
189 | support or warranty protection for a fee.
190 |
191 | ### 5. Conveying Modified Source Versions.
192 |
193 | You may convey a work based on the Program, or the modifications to produce it from
194 | the Program, in the form of source code under the terms of section 4, provided that
195 | you also meet all of these conditions:
196 |
197 | * a) The work must carry prominent notices stating that you modified it, and giving a
198 | relevant date.
199 | * b) The work must carry prominent notices stating that it is released under this
200 | License and any conditions added under section 7. This requirement modifies the
201 | requirement in section 4 to “keep intact all notices”.
202 | * c) You must license the entire work, as a whole, under this License to anyone who
203 | comes into possession of a copy. This License will therefore apply, along with any
204 | applicable section 7 additional terms, to the whole of the work, and all its parts,
205 | regardless of how they are packaged. This License gives no permission to license the
206 | work in any other way, but it does not invalidate such permission if you have
207 | separately received it.
208 | * d) If the work has interactive user interfaces, each must display Appropriate Legal
209 | Notices; however, if the Program has interactive interfaces that do not display
210 | Appropriate Legal Notices, your work need not make them do so.
211 |
212 | A compilation of a covered work with other separate and independent works, which are
213 | not by their nature extensions of the covered work, and which are not combined with
214 | it such as to form a larger program, in or on a volume of a storage or distribution
215 | medium, is called an “aggregate” if the compilation and its resulting
216 | copyright are not used to limit the access or legal rights of the compilation's users
217 | beyond what the individual works permit. Inclusion of a covered work in an aggregate
218 | does not cause this License to apply to the other parts of the aggregate.
219 |
220 | ### 6. Conveying Non-Source Forms.
221 |
222 | You may convey a covered work in object code form under the terms of sections 4 and
223 | 5, provided that you also convey the machine-readable Corresponding Source under the
224 | terms of this License, in one of these ways:
225 |
226 | * a) Convey the object code in, or embodied in, a physical product (including a
227 | physical distribution medium), accompanied by the Corresponding Source fixed on a
228 | durable physical medium customarily used for software interchange.
229 | * b) Convey the object code in, or embodied in, a physical product (including a
230 | physical distribution medium), accompanied by a written offer, valid for at least
231 | three years and valid for as long as you offer spare parts or customer support for
232 | that product model, to give anyone who possesses the object code either (1) a copy of
233 | the Corresponding Source for all the software in the product that is covered by this
234 | License, on a durable physical medium customarily used for software interchange, for
235 | a price no more than your reasonable cost of physically performing this conveying of
236 | source, or (2) access to copy the Corresponding Source from a network server at no
237 | charge.
238 | * c) Convey individual copies of the object code with a copy of the written offer to
239 | provide the Corresponding Source. This alternative is allowed only occasionally and
240 | noncommercially, and only if you received the object code with such an offer, in
241 | accord with subsection 6b.
242 | * d) Convey the object code by offering access from a designated place (gratis or for
243 | a charge), and offer equivalent access to the Corresponding Source in the same way
244 | through the same place at no further charge. You need not require recipients to copy
245 | the Corresponding Source along with the object code. If the place to copy the object
246 | code is a network server, the Corresponding Source may be on a different server
247 | (operated by you or a third party) that supports equivalent copying facilities,
248 | provided you maintain clear directions next to the object code saying where to find
249 | the Corresponding Source. Regardless of what server hosts the Corresponding Source,
250 | you remain obligated to ensure that it is available for as long as needed to satisfy
251 | these requirements.
252 | * e) Convey the object code using peer-to-peer transmission, provided you inform
253 | other peers where the object code and Corresponding Source of the work are being
254 | offered to the general public at no charge under subsection 6d.
255 |
256 | A separable portion of the object code, whose source code is excluded from the
257 | Corresponding Source as a System Library, need not be included in conveying the
258 | object code work.
259 |
260 | A “User Product” is either (1) a “consumer product”, which
261 | means any tangible personal property which is normally used for personal, family, or
262 | household purposes, or (2) anything designed or sold for incorporation into a
263 | dwelling. In determining whether a product is a consumer product, doubtful cases
264 | shall be resolved in favor of coverage. For a particular product received by a
265 | particular user, “normally used” refers to a typical or common use of
266 | that class of product, regardless of the status of the particular user or of the way
267 | in which the particular user actually uses, or expects or is expected to use, the
268 | product. A product is a consumer product regardless of whether the product has
269 | substantial commercial, industrial or non-consumer uses, unless such uses represent
270 | the only significant mode of use of the product.
271 |
272 | “Installation Information” for a User Product means any methods,
273 | procedures, authorization keys, or other information required to install and execute
274 | modified versions of a covered work in that User Product from a modified version of
275 | its Corresponding Source. The information must suffice to ensure that the continued
276 | functioning of the modified object code is in no case prevented or interfered with
277 | solely because modification has been made.
278 |
279 | If you convey an object code work under this section in, or with, or specifically for
280 | use in, a User Product, and the conveying occurs as part of a transaction in which
281 | the right of possession and use of the User Product is transferred to the recipient
282 | in perpetuity or for a fixed term (regardless of how the transaction is
283 | characterized), the Corresponding Source conveyed under this section must be
284 | accompanied by the Installation Information. But this requirement does not apply if
285 | neither you nor any third party retains the ability to install modified object code
286 | on the User Product (for example, the work has been installed in ROM).
287 |
288 | The requirement to provide Installation Information does not include a requirement to
289 | continue to provide support service, warranty, or updates for a work that has been
290 | modified or installed by the recipient, or for the User Product in which it has been
291 | modified or installed. Access to a network may be denied when the modification itself
292 | materially and adversely affects the operation of the network or violates the rules
293 | and protocols for communication across the network.
294 |
295 | Corresponding Source conveyed, and Installation Information provided, in accord with
296 | this section must be in a format that is publicly documented (and with an
297 | implementation available to the public in source code form), and must require no
298 | special password or key for unpacking, reading or copying.
299 |
300 | ### 7. Additional Terms.
301 |
302 | “Additional permissions” are terms that supplement the terms of this
303 | License by making exceptions from one or more of its conditions. Additional
304 | permissions that are applicable to the entire Program shall be treated as though they
305 | were included in this License, to the extent that they are valid under applicable
306 | law. If additional permissions apply only to part of the Program, that part may be
307 | used separately under those permissions, but the entire Program remains governed by
308 | this License without regard to the additional permissions.
309 |
310 | When you convey a copy of a covered work, you may at your option remove any
311 | additional permissions from that copy, or from any part of it. (Additional
312 | permissions may be written to require their own removal in certain cases when you
313 | modify the work.) You may place additional permissions on material, added by you to a
314 | covered work, for which you have or can give appropriate copyright permission.
315 |
316 | Notwithstanding any other provision of this License, for material you add to a
317 | covered work, you may (if authorized by the copyright holders of that material)
318 | supplement the terms of this License with terms:
319 |
320 | * a) Disclaiming warranty or limiting liability differently from the terms of
321 | sections 15 and 16 of this License; or
322 | * b) Requiring preservation of specified reasonable legal notices or author
323 | attributions in that material or in the Appropriate Legal Notices displayed by works
324 | containing it; or
325 | * c) Prohibiting misrepresentation of the origin of that material, or requiring that
326 | modified versions of such material be marked in reasonable ways as different from the
327 | original version; or
328 | * d) Limiting the use for publicity purposes of names of licensors or authors of the
329 | material; or
330 | * e) Declining to grant rights under trademark law for use of some trade names,
331 | trademarks, or service marks; or
332 | * f) Requiring indemnification of licensors and authors of that material by anyone
333 | who conveys the material (or modified versions of it) with contractual assumptions of
334 | liability to the recipient, for any liability that these contractual assumptions
335 | directly impose on those licensors and authors.
336 |
337 | All other non-permissive additional terms are considered “further
338 | restrictions” within the meaning of section 10. If the Program as you received
339 | it, or any part of it, contains a notice stating that it is governed by this License
340 | along with a term that is a further restriction, you may remove that term. If a
341 | license document contains a further restriction but permits relicensing or conveying
342 | under this License, you may add to a covered work material governed by the terms of
343 | that license document, provided that the further restriction does not survive such
344 | relicensing or conveying.
345 |
346 | If you add terms to a covered work in accord with this section, you must place, in
347 | the relevant source files, a statement of the additional terms that apply to those
348 | files, or a notice indicating where to find the applicable terms.
349 |
350 | Additional terms, permissive or non-permissive, may be stated in the form of a
351 | separately written license, or stated as exceptions; the above requirements apply
352 | either way.
353 |
354 | ### 8. Termination.
355 |
356 | You may not propagate or modify a covered work except as expressly provided under
357 | this License. Any attempt otherwise to propagate or modify it is void, and will
358 | automatically terminate your rights under this License (including any patent licenses
359 | granted under the third paragraph of section 11).
360 |
361 | However, if you cease all violation of this License, then your license from a
362 | particular copyright holder is reinstated (a) provisionally, unless and until the
363 | copyright holder explicitly and finally terminates your license, and (b) permanently,
364 | if the copyright holder fails to notify you of the violation by some reasonable means
365 | prior to 60 days after the cessation.
366 |
367 | Moreover, your license from a particular copyright holder is reinstated permanently
368 | if the copyright holder notifies you of the violation by some reasonable means, this
369 | is the first time you have received notice of violation of this License (for any
370 | work) from that copyright holder, and you cure the violation prior to 30 days after
371 | your receipt of the notice.
372 |
373 | Termination of your rights under this section does not terminate the licenses of
374 | parties who have received copies or rights from you under this License. If your
375 | rights have been terminated and not permanently reinstated, you do not qualify to
376 | receive new licenses for the same material under section 10.
377 |
378 | ### 9. Acceptance Not Required for Having Copies.
379 |
380 | You are not required to accept this License in order to receive or run a copy of the
381 | Program. Ancillary propagation of a covered work occurring solely as a consequence of
382 | using peer-to-peer transmission to receive a copy likewise does not require
383 | acceptance. However, nothing other than this License grants you permission to
384 | propagate or modify any covered work. These actions infringe copyright if you do not
385 | accept this License. Therefore, by modifying or propagating a covered work, you
386 | indicate your acceptance of this License to do so.
387 |
388 | ### 10. Automatic Licensing of Downstream Recipients.
389 |
390 | Each time you convey a covered work, the recipient automatically receives a license
391 | from the original licensors, to run, modify and propagate that work, subject to this
392 | License. You are not responsible for enforcing compliance by third parties with this
393 | License.
394 |
395 | An “entity transaction” is a transaction transferring control of an
396 | organization, or substantially all assets of one, or subdividing an organization, or
397 | merging organizations. If propagation of a covered work results from an entity
398 | transaction, each party to that transaction who receives a copy of the work also
399 | receives whatever licenses to the work the party's predecessor in interest had or
400 | could give under the previous paragraph, plus a right to possession of the
401 | Corresponding Source of the work from the predecessor in interest, if the predecessor
402 | has it or can get it with reasonable efforts.
403 |
404 | You may not impose any further restrictions on the exercise of the rights granted or
405 | affirmed under this License. For example, you may not impose a license fee, royalty,
406 | or other charge for exercise of rights granted under this License, and you may not
407 | initiate litigation (including a cross-claim or counterclaim in a lawsuit) alleging
408 | that any patent claim is infringed by making, using, selling, offering for sale, or
409 | importing the Program or any portion of it.
410 |
411 | ### 11. Patents.
412 |
413 | A “contributor” is a copyright holder who authorizes use under this
414 | License of the Program or a work on which the Program is based. The work thus
415 | licensed is called the contributor's “contributor version”.
416 |
417 | A contributor's “essential patent claims” are all patent claims owned or
418 | controlled by the contributor, whether already acquired or hereafter acquired, that
419 | would be infringed by some manner, permitted by this License, of making, using, or
420 | selling its contributor version, but do not include claims that would be infringed
421 | only as a consequence of further modification of the contributor version. For
422 | purposes of this definition, “control” includes the right to grant patent
423 | sublicenses in a manner consistent with the requirements of this License.
424 |
425 | Each contributor grants you a non-exclusive, worldwide, royalty-free patent license
426 | under the contributor's essential patent claims, to make, use, sell, offer for sale,
427 | import and otherwise run, modify and propagate the contents of its contributor
428 | version.
429 |
430 | In the following three paragraphs, a “patent license” is any express
431 | agreement or commitment, however denominated, not to enforce a patent (such as an
432 | express permission to practice a patent or covenant not to sue for patent
433 | infringement). To “grant” such a patent license to a party means to make
434 | such an agreement or commitment not to enforce a patent against the party.
435 |
436 | If you convey a covered work, knowingly relying on a patent license, and the
437 | Corresponding Source of the work is not available for anyone to copy, free of charge
438 | and under the terms of this License, through a publicly available network server or
439 | other readily accessible means, then you must either (1) cause the Corresponding
440 | Source to be so available, or (2) arrange to deprive yourself of the benefit of the
441 | patent license for this particular work, or (3) arrange, in a manner consistent with
442 | the requirements of this License, to extend the patent license to downstream
443 | recipients. “Knowingly relying” means you have actual knowledge that, but
444 | for the patent license, your conveying the covered work in a country, or your
445 | recipient's use of the covered work in a country, would infringe one or more
446 | identifiable patents in that country that you have reason to believe are valid.
447 |
448 | If, pursuant to or in connection with a single transaction or arrangement, you
449 | convey, or propagate by procuring conveyance of, a covered work, and grant a patent
450 | license to some of the parties receiving the covered work authorizing them to use,
451 | propagate, modify or convey a specific copy of the covered work, then the patent
452 | license you grant is automatically extended to all recipients of the covered work and
453 | works based on it.
454 |
455 | A patent license is “discriminatory” if it does not include within the
456 | scope of its coverage, prohibits the exercise of, or is conditioned on the
457 | non-exercise of one or more of the rights that are specifically granted under this
458 | License. You may not convey a covered work if you are a party to an arrangement with
459 | a third party that is in the business of distributing software, under which you make
460 | payment to the third party based on the extent of your activity of conveying the
461 | work, and under which the third party grants, to any of the parties who would receive
462 | the covered work from you, a discriminatory patent license (a) in connection with
463 | copies of the covered work conveyed by you (or copies made from those copies), or (b)
464 | primarily for and in connection with specific products or compilations that contain
465 | the covered work, unless you entered into that arrangement, or that patent license
466 | was granted, prior to 28 March 2007.
467 |
468 | Nothing in this License shall be construed as excluding or limiting any implied
469 | license or other defenses to infringement that may otherwise be available to you
470 | under applicable patent law.
471 |
472 | ### 12. No Surrender of Others' Freedom.
473 |
474 | If conditions are imposed on you (whether by court order, agreement or otherwise)
475 | that contradict the conditions of this License, they do not excuse you from the
476 | conditions of this License. If you cannot convey a covered work so as to satisfy
477 | simultaneously your obligations under this License and any other pertinent
478 | obligations, then as a consequence you may not convey it at all. For example, if you
479 | agree to terms that obligate you to collect a royalty for further conveying from
480 | those to whom you convey the Program, the only way you could satisfy both those terms
481 | and this License would be to refrain entirely from conveying the Program.
482 |
483 | ### 13. Use with the GNU Affero General Public License.
484 |
485 | Notwithstanding any other provision of this License, you have permission to link or
486 | combine any covered work with a work licensed under version 3 of the GNU Affero
487 | General Public License into a single combined work, and to convey the resulting work.
488 | The terms of this License will continue to apply to the part which is the covered
489 | work, but the special requirements of the GNU Affero General Public License, section
490 | 13, concerning interaction through a network will apply to the combination as such.
491 |
492 | ### 14. Revised Versions of this License.
493 |
494 | The Free Software Foundation may publish revised and/or new versions of the GNU
495 | General Public License from time to time. Such new versions will be similar in spirit
496 | to the present version, but may differ in detail to address new problems or concerns.
497 |
498 | Each version is given a distinguishing version number. If the Program specifies that
499 | a certain numbered version of the GNU General Public License “or any later
500 | version” applies to it, you have the option of following the terms and
501 | conditions either of that numbered version or of any later version published by the
502 | Free Software Foundation. If the Program does not specify a version number of the GNU
503 | General Public License, you may choose any version ever published by the Free
504 | Software Foundation.
505 |
506 | If the Program specifies that a proxy can decide which future versions of the GNU
507 | General Public License can be used, that proxy's public statement of acceptance of a
508 | version permanently authorizes you to choose that version for the Program.
509 |
510 | Later license versions may give you additional or different permissions. However, no
511 | additional obligations are imposed on any author or copyright holder as a result of
512 | your choosing to follow a later version.
513 |
514 | ### 15. Disclaimer of Warranty.
515 |
516 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.
517 | EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
518 | PROVIDE THE PROGRAM “AS IS” WITHOUT WARRANTY OF ANY KIND, EITHER
519 | EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
520 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE
521 | QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE
522 | DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
523 |
524 | ### 16. Limitation of Liability.
525 |
526 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY
527 | COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS THE PROGRAM AS
528 | PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL,
529 | INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE
530 | PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE
531 | OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE
532 | WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
533 | POSSIBILITY OF SUCH DAMAGES.
534 |
535 | ### 17. Interpretation of Sections 15 and 16.
536 |
537 | If the disclaimer of warranty and limitation of liability provided above cannot be
538 | given local legal effect according to their terms, reviewing courts shall apply local
539 | law that most closely approximates an absolute waiver of all civil liability in
540 | connection with the Program, unless a warranty or assumption of liability accompanies
541 | a copy of the Program in return for a fee.
542 |
543 | END OF TERMS AND CONDITIONS
544 |
545 | ## How to Apply These Terms to Your New Programs
546 |
547 | If you develop a new program, and you want it to be of the greatest possible use to
548 | the public, the best way to achieve this is to make it free software which everyone
549 | can redistribute and change under these terms.
550 |
551 | To do so, attach the following notices to the program. It is safest to attach them
552 | to the start of each source file to most effectively state the exclusion of warranty;
553 | and each file should have at least the “copyright” line and a pointer to
554 | where the full notice is found.
555 |
556 |
557 | Copyright (C) <year> <name of author>
558 |
559 | This program is free software: you can redistribute it and/or modify
560 | it under the terms of the GNU General Public License as published by
561 | the Free Software Foundation, either version 3 of the License, or
562 | (at your option) any later version.
563 |
564 | This program is distributed in the hope that it will be useful,
565 | but WITHOUT ANY WARRANTY; without even the implied warranty of
566 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
567 | GNU General Public License for more details.
568 |
569 | You should have received a copy of the GNU General Public License
570 | along with this program. If not, see <http://www.gnu.org/licenses/>.
571 |
572 | Also add information on how to contact you by electronic and paper mail.
573 |
574 | If the program does terminal interaction, make it output a short notice like this
575 | when it starts in an interactive mode:
576 |
577 | <program> Copyright (C) <year> <name of author>
578 | This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
579 | This is free software, and you are welcome to redistribute it
580 | under certain conditions; type `show c' for details.
581 |
582 | The hypothetical commands `show w' and `show c' should show the appropriate parts of
583 | the General Public License. Of course, your program's commands might be different;
584 | for a GUI interface, you would use an “about box”.
585 |
586 | You should also get your employer (if you work as a programmer) or school, if any, to
587 | sign a “copyright disclaimer” for the program, if necessary. For more
588 | information on this, and how to apply and follow the GNU GPL, see
589 | <http://www.gnu.org/licenses/>.
590 |
591 | The GNU General Public License does not permit incorporating your program into
592 | proprietary programs. If your program is a subroutine library, you may consider it
593 | more useful to permit linking proprietary applications with the library. If this is
594 | what you want to do, use the GNU Lesser General Public License instead of this
595 | License. But first, please read
596 | <http://www.gnu.org/philosophy/why-not-lgpl.html>.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## Cat Crawler
2 |
3 | A webcrawler I'm writing in Golang that I can use to find and download cat pictures.
4 |
5 | ### Installation
6 |
7 | - Make sure your GOPATH environment variable is set up properly:
8 | `export GOPATH=$HOME/golib`
9 | - Make sure the bin directory is in your path:
10 | `PATH=$PATH:$GOPATH/bin`
11 | - Now install the package
12 | `go get -v github.com/dmuth/cat-crawler`
13 |
14 | ### Running the crawler
15 | cat-crawler [--seed-url url[,url[,url[...]]]] [ --num-connections n ] [--allow-urls [url,[url,[...]]]] [--search-string cat]
16 | --seed-url What URL to start at? More than one URL may be
17 | specified in comma-delimited format.
18 | --num-connections How many concurrent connections?
19 | --search-string A string we want to search for in ALT and TITLE attributes on images
20 | --allow-urls If specified, only URLs starting with the URLs listed here are crawled
21 | --stats Print out stats once a second using my stats package
22 |
23 | ### Examples
24 | cat-crawler --seed-url cnn.com --num-connections 1
25 | Get top stories. :-)
26 |
27 | cat-crawler --seed-url (any URL) --num-connections 1000
28 | This will saturate your download bandwidth. Seriously, don't do it.
29 |
30 | cat-crawler --seed-url cnn.com --num-connections 1 --allow-urls cnn.com
31 | Don't leave CNN's website
32 |
33 | cat-crawler --seed-url cnn.com --num-connections 1 --allow-urls foobar
34 | After crawling the first page, nothing will happen. Oops.
35 |
36 | ### Sequence diagram
37 |
38 | 
39 |
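A rough sketch of how the pieces in the diagram hand work to one another. `NewHtml()` is the real constructor from `html.go`; the `crawlUrls()` and `crawlImages()` names below are placeholders used only to show the channel wiring, not functions that exist under those names:

    // Hypothetical wiring, mirroring the sequence diagram above.
    cfg := ParseArgs()                        // args.go
    UrlCrawlerIn := make(chan string, 1000)   // URLs waiting to be crawled
    HtmlIn, ImageIn := NewHtml(UrlCrawlerIn)  // html.go: parser input + image channel
    go crawlUrls(UrlCrawlerIn, HtmlIn)        // placeholder: fetch pages, send [url, body] pairs to HtmlIn
    go crawlImages(ImageIn, cfg.SearchString) // placeholder: download images whose alt/title match
    for _, Url := range cfg.SeedUrls {
        UrlCrawlerIn <- Url                   // seed the crawl
    }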
40 |
41 | ### Development
42 |
43 | go get -v github.com/dmuth/cat-crawler && cat-crawler [options]
44 |
45 |
46 | ### Running the tests
47 |
48 | go get -v -a github.com/dmuth/procedural-webserver # Dependency
49 | go test -v github.com/dmuth/cat-crawler
50 |
51 | You should see results like this:
52 |
53 | === RUN TestSplitHostnames
54 | --- PASS: TestSplitHostnames (0.00 seconds)
55 | === RUN TestHtmlNew
56 | --- PASS: TestHtmlNew (0.00 seconds)
57 | === RUN TestHtmlBadImg
58 | --- PASS: TestHtmlBadImg (0.00 seconds)
59 | === RUN TestHtmlLinksAndImages
60 | --- PASS: TestHtmlLinksAndImages (0.00 seconds)
61 | === RUN TestHtmlNoLinks
62 | --- PASS: TestHtmlNoLinks (0.00 seconds)
63 | === RUN TestHtmlNoImages
64 | --- PASS: TestHtmlNoImages (0.00 seconds)
65 | === RUN TestHtmlNoLinksNorImages
66 | --- PASS: TestHtmlNoLinksNorImages (0.00 seconds)
67 | === RUN TestHtmlPortNumberInBaseUrl
68 | --- PASS: TestHtmlPortNumberInBaseUrl (0.00 seconds)
69 | === RUN TestGetFilenameFromUrl
70 | --- PASS: TestGetFilenameFromUrl (0.00 seconds)
71 | === RUN Test
72 | --- PASS: Test (0.00 seconds)
73 | === RUN TestFilterUrl
74 | --- PASS: TestFilterUrl (0.00 seconds)
75 | === RUN TestIsUrlAllowed
76 | --- PASS: TestIsUrlAllowed (0.00 seconds)
77 | PASS
78 | ok github.com/dmuth/cat-crawler 0.037s
79 |
80 |
81 | ### Dependencies
82 |
83 | This repo uses other packages I wrote:
84 | - [log4go](https://github.com/dmuth/google-go-log4go)
85 | - [golang-stats](https://github.com/dmuth/golang-stats)
86 |
87 |
88 | ### Bugs
89 |
90 | - The maps inside of an array are not being accessed safely (concurrent access is unsynchronized).
91 | - Fix: a separate source file with a single goroutine that services requests through a channel is a possibility (see the sketch below).
92 |
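A minimal sketch of that fix (every name here is hypothetical, nothing below exists in the repo yet): one goroutine owns the map and everything else talks to it over a channel, so access is serialized.

    // Hypothetical: serialize access to shared counts through a single goroutine.
    type countRequest struct {
        Key   string
        Reply chan uint
    }

    func NewCountKeeper() chan countRequest {
        requests := make(chan countRequest)
        go func() {
            counts := make(map[string]uint) // owned by this goroutine only
            for req := range requests {
                counts[req.Key]++
                req.Reply <- counts[req.Key]
            }
        }()
        return requests
    }

    // Usage:
    //   requests := NewCountKeeper()
    //   reply := make(chan uint)
    //   requests <- countRequest{"cnn.com", reply}
    //   count := <-reply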
93 |
94 | ### TODO
95 |
96 | - Rate limiting by domain in URL crawler
97 | - I could keep a map of key=domain, value=count and a goroutine
98 | that decrements each count regularly (see the sketch after this list)
99 | - Could get a bit crazy on the memory, though!
100 | - Write instrumentation to detect how many goroutines are active/idle
101 | - GoStatStart(key)
102 | - GoStatStop(key)
103 | - go GoStatDump(interval)
104 |
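A sketch of the rate-limiting idea above, with the same caveat (hypothetical names, not implemented): a single goroutine owns the per-domain counts and a ticker decays them regularly.

    // Sketch only -- not part of the repo.
    package ratelimit

    import "time"

    // Hypothetical per-domain rate limiter for the URL crawler. Send a domain on
    // `allow` and read `ok` to learn whether that domain may be crawled right now.
    func NewRateLimiter(max uint, interval time.Duration) (allow chan string, ok chan bool) {
        allow = make(chan string)
        ok = make(chan bool)
        go func() {
            counts := make(map[string]uint)
            tick := time.Tick(interval)
            for {
                select {
                case domain := <-allow:
                    if counts[domain] < max {
                        counts[domain]++
                        ok <- true
                    } else {
                        ok <- false // over the limit; caller should retry later
                    }
                case <-tick:
                    for domain := range counts {
                        if counts[domain] > 0 {
                            counts[domain]-- // decay counts regularly
                        }
                    }
                }
            }
        }()
        return allow, ok
    }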
105 |
106 | ### Contact
107 |
108 | Questions? Complaints? Here's my contact info: http://www.dmuth.org/contact
109 |
110 |
111 |
112 |
--------------------------------------------------------------------------------
/args.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import "flag"
4 |
5 | //import "fmt"
6 | import "regexp"
7 | import "strings"
8 | import "os"
9 |
10 | import log "github.com/dmuth/google-go-log4go"
11 |
12 | //
13 | // Configuration for what was passed in on the command line.
14 | //
15 | type Config struct {
16 | SeedUrls []string
17 | AllowUrls []string
18 | SearchString string
19 | NumConnections uint
20 | Stats bool
21 | }
22 |
23 | /**
24 | * Parse our command line arguments.
25 | * @return {Config} Our configuration info
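*
* Example invocation (from the README examples):
*   cat-crawler --seed-url cnn.com --num-connections 1 --allow-urls cnn.com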
26 | */
27 | func ParseArgs() (retval Config) {
28 |
29 | retval = Config{[]string{}, []string{}, "", 1, false}
30 |
31 | hostnames := flag.String("seed-url",
32 | "http://www.cnn.com/",
33 | "URL to start with.")
34 | allowUrls := flag.String("allow-urls",
35 | "", "Url base names to crawl. "+
36 | "If specified, this basically acts like a whitelist. "+
37 | "This may be a comma-delimited list. "+
38 | "Examples: http://cnn.com/, http://www.apple.com/store")
39 | flag.UintVar(&retval.NumConnections, "num-connections",
40 | 1, "How many concurrent outbound connections?")
41 | flag.StringVar(&retval.SearchString, "search-string",
42 | "cat", "String to search for in alt and title tags of graphics")
43 | flag.BoolVar(&retval.Stats, "stats", false, "To print out stats once per second")
44 |
45 | h := flag.Bool("h", false, "To get this help")
46 | help := flag.Bool("help", false, "To get this help")
47 | debug_level := flag.String("debug-level", "info", "Set the debug level")
48 |
49 | flag.Parse()
50 |
51 | log.SetLevelString(*debug_level)
52 | log.Error("Debug level: " + *debug_level)
53 |
54 | if *h || *help {
55 | flag.PrintDefaults()
56 | os.Exit(1)
57 | }
58 |
59 | retval.SeedUrls = SplitHostnames(*hostnames)
60 | retval.AllowUrls = SplitHostnames(*allowUrls)
61 |
62 | return (retval)
63 |
64 | } // End of ParseArgs()
65 |
66 | /**
67 | * Take a comma-delimited string of hostnames and turn it into an array of URLs.
68 | *
69 | * @param {string} Input The comma-delimited string
70 | *
71 | * @return {[]string} Array of URLs
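*
* Example (behavior covered in args_test.go):
*   SplitHostnames("test,https://test4/") returns ["http://test", "https://test4/"]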
72 | */
73 | func SplitHostnames(Input string) (retval []string) {
74 |
75 | Results := strings.Split(Input, ",")
76 |
77 | for _, value := range Results {
78 |
79 | if value != "" {
80 | pattern := "^http(s)?://"
81 | match, _ := regexp.MatchString(pattern, value)
82 | if !match {
83 | value = "http://" + value
84 | }
85 |
86 | }
87 |
88 | retval = append(retval, value)
89 |
90 | }
91 |
92 | return (retval)
93 |
94 | } // End of SplitHostnames()
95 |
--------------------------------------------------------------------------------
/args_test.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | //import "fmt"
4 | import "testing"
5 |
6 | //import log "github.com/dmuth/google-go-log4go"
7 |
8 | func TestSplitHostnames(t *testing.T) {
9 |
10 | //log.SetLevelString("info")
11 |
12 | Input := "test,test2,http://test3,https://test4/,test5:8080/,test6:8080/foobar,"
13 | Output := SplitHostnames(Input)
14 | Expected := []string{
15 | "http://test",
16 | "http://test2",
17 | "http://test3",
18 | "https://test4/",
19 | "http://test5:8080/",
20 | "http://test6:8080/foobar",
21 | "",
22 | }
23 |
24 | for key, value := range Output {
25 |
26 | if value != Expected[key] {
27 | t.Errorf("Value '%s' doesn't match expected '%s'!", value, Expected[key])
28 | }
29 |
30 | }
31 |
32 | Input = "test"
33 | Output = SplitHostnames(Input)
34 | Expected = []string{
35 | "http://test",
36 | }
37 |
38 | for key, value := range Output {
39 |
40 | if value != Expected[key] {
41 | t.Errorf("Value '%s' doesn't match expected '%s'!", value, Expected[key])
42 | }
43 |
44 | }
45 |
46 | } // End of TestSplitHostnames()
47 |
--------------------------------------------------------------------------------
/docs/sequence-diagram.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dmuth/cat-crawler/c40794fb8afb912734a41131438a28de2e8147b4/docs/sequence-diagram.png
--------------------------------------------------------------------------------
/docs/web-sequence-diagram.txt:
--------------------------------------------------------------------------------
1 |
2 |
3 | #
4 | # Paste this into https://www.websequencediagrams.com/ to create a sequence diagram
5 | #
6 | participant Main
7 | participant UrlCrawler
8 | participant HtmlParser
9 | participant ImageCrawler
10 | participant Filesystem
11 |
12 | Note over UrlCrawler, ImageCrawler: 1,000 Goroutines of each crawler, 1 of HtmlParser
13 | Main->UrlCrawler: Seed URL
14 | UrlCrawler->HtmlParser: Send HTML from crawled URLs
15 | HtmlParser->UrlCrawler: Send found URLs to be crawled
16 | HtmlParser->ImageCrawler: Send images to crawl
17 | ImageCrawler->Filesystem: Write matched images
18 |
19 |
20 |
--------------------------------------------------------------------------------
/html.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | //import "fmt"
4 | import "regexp"
5 |
6 | import log "github.com/dmuth/google-go-log4go"
7 | import stats "github.com/dmuth/golang-stats"
8 |
9 | /**
10 | * Representation of our parsed Html
11 | */
12 | type Image struct {
13 | html string
14 | src string
15 | alt string
16 | title string
17 | }
18 | type HtmlParsed struct {
19 | links []string
20 | images []Image
21 | }
22 |
23 | /**
24 | * Set up our parser to run in the background.
25 | *
26 | * @param {chan string} UrlCrawlerIn URLs written to this will be sent
27 | * off to the URL crawler.
28 | *
29 | * @return {chan []string, chan Image} A channel for ingesting [base URL, HTML body] pairs, and the channel that feeds parsed images to the image crawler.
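*
* Example (as wired up in html_test.go):
*
*   UrlCrawlerIn := make(chan string, 100)
*   HtmlBodyIn, ImageCrawlerOut := NewHtml(UrlCrawlerIn)
*   HtmlBodyIn <- []string{"http://www.cnn.com/", HtmlString}
*   Image := <-ImageCrawlerOut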
30 | */
31 | func NewHtml(UrlCrawlerIn chan string) (
32 | chan []string, chan Image) {
33 |
34 | HtmlCrawlerIn := make(chan []string)
35 |
36 | BufferSize := 1000
37 | //BufferSize = 1 // Debugging
38 | ImageCrawlerIn := make(chan Image, BufferSize)
39 |
40 | go HtmlParseWorker(HtmlCrawlerIn, UrlCrawlerIn, ImageCrawlerIn)
41 |
42 | return HtmlCrawlerIn, ImageCrawlerIn
43 |
44 | } // End of NewHtml()
45 |
46 | /**
47 | * This function is run as a goroutine and ingests Html to parse. It then
48 | * sends off URLs and image URLs.
49 | *
50 | * @param {chan []string} HtmlIn Incoming [base URL, HTML body] pairs
51 | * @param {chan string} UrlCrawlerIn URLs written to this will be sent
52 | * off to the URL crawler.
53 | * @param {chan Image} ImageCrawlerIn Images written to this will be sent
54 | * off to the image crawler.
55 | *
56 | */
57 | func HtmlParseWorker(HtmlIn chan []string, UrlCrawlerIn chan string,
58 | ImageCrawlerIn chan Image) {
59 |
60 | //
61 | // Loop through HTML and parse all the things.
62 | //
63 | for {
64 | in := <-HtmlIn
65 | BaseUrl := in[0]
66 | Html := in[1]
67 | Parsed := HtmlParseString(BaseUrl, Html)
68 |
69 | //
70 | // Put these into goroutines so that we can get back to parsing
71 | //
72 | go HtmlParseWorkerLinks(&Parsed, UrlCrawlerIn)
73 | go HtmlParseWorkerImages(&Parsed, ImageCrawlerIn)
74 |
75 | }
76 |
77 | } // End of HtmlParseWorker()
78 |
79 | /**
80 | * Another goroutine that loops through our links and sends them off to UrlCrawler
81 | *
82 | * @param {HtmlParsed} Our parsed HTML elements.
83 | * @param {chan string} The channel to send URLs to our URL crawler
84 | *
85 | */
86 | func HtmlParseWorkerLinks(Parsed *HtmlParsed, UrlCrawlerIn chan string) {
87 |
88 | for i := range Parsed.links {
89 | Row := Parsed.links[i]
90 | log.Debugf("Sending to UrlCrawler: %d: %s", i, Row)
91 | stats.IncrStat("urls_to_be_crawled")
92 | UrlCrawlerIn <- Row
93 | }
94 |
95 | } // End of HtmlParseWorkerLinks()
96 |
97 | /**
98 | * Another goroutine that loops through our images and sends them off to
99 | * the ImageCrawler.
100 | *
101 | * @param {HtmlParsed} Our parsed HTML elements.
102 | * @param {ImageCrawlerIn} The channel to send images to our image crawler
103 | *
104 | */
105 | func HtmlParseWorkerImages(Parsed *HtmlParsed, ImageCrawlerIn chan Image) {
106 |
107 | for i := range Parsed.images {
108 | Row := Parsed.images[i]
109 | log.Debugf("Sending to ImageCrawler: %d: %s", i, Row)
110 | stats.IncrStat("images_to_be_crawled")
111 | ImageCrawlerIn <- Row
112 | }
113 |
114 | } // End of HtmlParseWorkerImages()
115 |
116 | /**
117 | * Parse an HTML response and get links and images.
118 | *
119 | * @param {string} BaseUrl The URL of the page we got these links from
120 | * @param {string} Body The body of the page
121 | *
122 | * @return {HtmlParsed} Structure of links and images
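*
* Example (from html_test.go): HtmlParseString("http://www.cnn.com/world", Body),
* where Body contains <a href="foobar1">, yields links[0] ==
* "http://www.cnn.com/world/foobar1".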
123 | */
124 | func HtmlParseString(BaseUrl string, Body string) (retval HtmlParsed) {
125 |
126 | //
127 | // Break up our base URL into a host and URI
128 | //
129 | regex, _ := regexp.Compile("(https?://[^/]+)(.*)")
130 | results := regex.FindStringSubmatch(BaseUrl)
131 | BaseUrlHost := results[1]
132 | BaseUrlUri := results[2]
133 |
134 | retval.links = HtmlParseLinks(BaseUrlHost, BaseUrlUri, Body)
135 | retval.images = HtmlParseImages(BaseUrlHost, BaseUrlUri, Body)
136 |
137 | return (retval)
138 |
139 | } // End of HtmlParseString()
140 |
141 | /**
142 | * Grab our links out of the body, and fully qualify them.
143 | *
144 | * @param {string} BaseUrlHost The http:// and hostname part of our base URL
145 | * @param {string} BaseUrlUri Our base URI
146 | * @param {string} Body The body of the webpage
147 | *
148 | * @return {[]string} Array of links
149 | */
150 | func HtmlParseLinks(BaseUrlHost string, BaseUrlUri string, Body string) (retval []string) {
151 |
152 | //
153 | // Get all of our links
154 | //
155 | regex, _ := regexp.Compile("(?s)" +
156 | "href=\"" +
157 | "(" +
158 | "(https?://([^/]+))?" +
159 | "([^\"]+)" +
160 | ")\"")
161 | results := regex.FindAllStringSubmatch(Body, -1)
162 |
163 | for i := range results {
164 |
165 | result := results[i]
166 |
167 | HostAndMethod := result[2]
168 | Uri := result[4]
169 |
170 | //
171 | // If a host and method is specified, just glue them back together.
172 | //
173 | Url := ""
174 | if HostAndMethod != "" {
175 | Url = HostAndMethod + Uri
176 |
177 | } else {
178 | //
179 | // Otherwise, it's on the same host. Determine if
180 | // it's a relative or absolute link.
181 | //
182 | FirstChar := string(Uri[0])
183 | if FirstChar == "/" {
184 | Url = BaseUrlHost + Uri
185 | } else {
186 | Url = BaseUrlHost + BaseUrlUri + "/" + Uri
187 | }
188 |
189 | }
190 |
191 | //fmt.Println("FINAL URL", Url)
192 |
193 | retval = append(retval, Url)
194 |
195 | }
196 |
197 | return (retval)
198 |
199 | } // End of HtmlParseLinks()
200 |
201 | /**
202 | * Grab image links out of the body, and fully qualify them.
203 | *
204 | * @param {string} BaseUrlHost The http:// and hostname part of our base URL
205 | * @param {string} BaseUrlUri Our base URI
206 | * @param {string} Body The body of the webpage
207 | *
208 | * @return {[]Image} Array of images
209 | */
210 | func HtmlParseImages(BaseUrlHost string, BaseUrlUri string, Body string) (retval []Image) {
211 |
212 | retval = htmlParseImageTags(Body)
213 |
214 | for i := range retval {
215 | htmlParseSrc(BaseUrlHost, BaseUrlUri, &retval[i])
216 | if retval[i].src != "" {
217 | htmlParseAlt(&retval[i])
218 | htmlParseTitle(&retval[i])
219 | }
220 | }
221 |
222 | return (retval)
223 |
224 | } // End of HtmlParseImages()
225 |
226 | /**
227 | * Grab our image tags out of the body.
228 | *
229 | * @param {string} Body The HTML body
230 | *
231 | * @return {[]Image} Array of Image elements
232 | */
233 | func htmlParseImageTags(Body string) (retval []Image) {
234 |
235 | regex, _ := regexp.Compile("(?s)" +
236 | "
]+>")
237 | results := regex.FindAllStringSubmatch(Body, -1)
238 |
239 | for i := range results {
240 | image := Image{results[i][0], "", "", ""}
241 | retval = append(retval, image)
242 | }
243 |
244 | return (retval)
245 |
246 | } // End of htmlParseImageTags()
247 |
248 | /**
249 | * Parse the src tag out of our image.
250 | *
251 | * @param {string} BaseUrlHost The http:// and hostname part of our base URL
252 | * @param {string} BaseUrlUri Our base URI
253 | * @param {*Image} Pointer to our image structure
254 | */
255 | func htmlParseSrc(BaseUrlHost string, BaseUrlUri string, image *Image) {
256 |
257 | regex, _ := regexp.Compile("(?s)" +
258 | "
]+src=\"" +
259 | "(" +
260 | "(https?://([^/]+))?" +
261 | "([^\"]+)" +
262 | ")\"")
263 | result := regex.FindStringSubmatch(image.html)
264 |
265 | //
266 | // Bail out if we have no source
267 | //
268 | if len(result) == 0 {
269 | return
270 | }
271 |
272 | HostAndMethod := result[2]
273 | Uri := result[4]
274 |
275 | //
276 | // If a host and method is specified, just glue them back together.
277 | //
278 | Url := ""
279 | if HostAndMethod != "" {
280 | Url = HostAndMethod + Uri
281 |
282 | } else {
283 | //
284 | // Otherwise, it's on the same host. Determine if
285 | // it's a relative or absolute link.
286 | //
287 | FirstChar := string(Uri[0])
288 | if FirstChar == "/" {
289 | Url = BaseUrlHost + Uri
290 | } else {
291 | Url = BaseUrlHost + BaseUrlUri + "/" + Uri
292 | }
293 |
294 | }
295 |
296 | image.src = Url
297 |
298 | } // End of htmlParseSrc()
299 |
300 | /**
301 | * Parse the alt tag out of our image.
302 | */
303 | func htmlParseAlt(image *Image) {
304 |
305 | regex, _ := regexp.Compile("(?s)" +
306 | "
]+alt=\"([^\"]+)\"")
307 | result := regex.FindStringSubmatch(image.html)
308 | if len(result) > 1 {
309 | image.alt = result[1]
310 | }
311 |
312 | } // End of htmlParseAlt()
313 |
314 | /**
315 | * Parse the title tag out of our image.
316 | */
317 | func htmlParseTitle(image *Image) {
318 |
319 | regex, _ := regexp.Compile("(?s)" +
320 | "
]+title=\"([^\"]+)\"")
321 | result := regex.FindStringSubmatch(image.html)
322 | if len(result) > 1 {
323 | image.title = result[1]
324 | }
325 |
326 | } // End of htmlParseTitle()
327 |
--------------------------------------------------------------------------------
/html_test.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | //import "fmt"
4 | import "testing"
5 |
6 | //import log "github.com/dmuth/google-go-log4go"
7 |
8 | func TestHtmlNew(t *testing.T) {
9 |
10 | HtmlString := "<a href=\"foobar1\">foobar1 content</a>" +
11 | "<a href=\"/foobar2\">foobar2 content</a>" +
12 | "<a href=\"http://localhost/foobar3\">foobar3 content</a>" +
13 | "<a href=\"https://localhost/foobar4\">foobar4 content</a>\n" +
14 | "<a href=\"http://localhost:8080/foobar5\">foobar5 content</a>\n" +
15 | "<a href=\"https://localhost:8080/foobar6\">foobar6 content</a>\n" +
16 | "<img src=\"foobar1.png\" alt=\"foobar1 alt tag\">" +
17 | "<img src=\"/foobar2.png\" alt=\"foobar2 alt tag\">" +
18 | "<img src=\"http://localhost/foobar3.png\" alt=\"foobar3 alt tag\">" +
19 | "<img src=\"https://localhost/foobar4.png\" title=\"foobar4 title\">" +
20 | "<img src=\"http://localhost:8080/foobar5.png\" alt=\"foobar5 alt tag\">" +
21 | "<img src=\"https://localhost:8080/foobar6.png\" alt=\"foobar6 alt tag\">" +
22 | ""
23 |
24 | //
25 | // Jack the buffer sizes way up so we don't have blocking.
26 | //
27 | UrlCrawlerIn := make(chan string, 100)
28 |
29 | HtmlBodyIn, ImageCrawlerOut := NewHtml(UrlCrawlerIn)
30 | HtmlBodyIn <- []string{"http://www.cnn.com/", HtmlString}
31 |
32 | ExpectedUrl := "http://www.cnn.com//foobar1"
33 | Url := <-UrlCrawlerIn
34 |
35 | if Url != ExpectedUrl {
36 | t.Errorf("Result '%s' didn't match expected '%s'", Url, ExpectedUrl)
37 | }
38 |
39 | ExpectedImageUrl := "http://www.cnn.com//foobar1.png"
40 | Image := <-ImageCrawlerOut
41 | if Image.src != ExpectedImageUrl {
42 | t.Errorf("Result '%s' didn't match expected '%s'", Image.src, ExpectedImageUrl)
43 | }
44 |
45 | } // End of TestHtmlNew()
46 |
47 | /**
48 | * Throw in some bad image tags.
49 | */
50 | func TestHtmlBadImg(t *testing.T) {
51 |
52 | 	HtmlString := "<a href=\"foobar1\">foobar1 content</a>" +
53 | 		"<a href=\"/foobar2\">foobar2 content</a>" +
54 | 		//
55 | 		// Bad tags.
56 | 		//
57 | 		"<a>foobar2 content</a>" +
58 | 		"<img alt=\"foobar2 alt tag\">" +
59 | 		"<img title=\"foobar2 title\">" +
60 | ""
61 |
62 | //
63 | // Jack the buffer sizes way up so we don't have blocking.
64 | //
65 | UrlCrawlerIn := make(chan string, 100)
66 |
67 | HtmlBodyIn, ImageCrawlerOut := NewHtml(UrlCrawlerIn)
68 | HtmlBodyIn <- []string{"http://www.cnn.com/", HtmlString}
69 |
70 | ExpectedUrl := "http://www.cnn.com//foobar1"
71 | Url := <-UrlCrawlerIn
72 |
73 | if Url != ExpectedUrl {
74 | t.Errorf("Result '%s' didn't match expected '%s'", Url, ExpectedUrl)
75 | }
76 |
77 | ExpectedImageUrl := ""
78 | Image := <-ImageCrawlerOut
79 | if Image.src != ExpectedImageUrl {
80 | t.Errorf("Result '%s' didn't match expected '%s'", Image.src, ExpectedImageUrl)
81 | }
82 |
83 | } // End of TestHtmlBadImg()
84 |
85 | func TestHtmlLinksAndImages(t *testing.T) {
86 |
87 | //log.SetLevelString("info")
88 |
89 | 	HtmlString := "<a href=\"foobar1\">foobar1 content</a>" +
90 | 		"<a href=\"/foobar2\">foobar2 content</a>" +
91 | 		"<a href=\"http://localhost/foobar3\">foobar3 content</a>" +
92 | 		"<a href=\"https://localhost/foobar4\">foobar4 content</a>\n" +
93 | 		"<a href=\"http://localhost:8080/foobar5\">foobar5 content</a>\n" +
94 | 		"<a href=\"https://localhost:8080/foobar6\">foobar6 content</a>\n" +
95 | 		"<img src=\"foobar1.png\" alt=\"foobar1 alt tag\">" +
96 | 		"<img src=\"/foobar2.png\" alt=\"foobar2 alt tag\">" +
97 | 		"<img src=\"http://localhost/foobar3.png\" alt=\"foobar3 alt tag\">" +
98 | 		"<img src=\"https://localhost/foobar4.png\" title=\"foobar4 title\">" +
99 | 		"<img src=\"http://localhost:8080/foobar5.png\" alt=\"foobar5 alt tag\">" +
100 | 		"<img src=\"https://localhost:8080/foobar6.png\" alt=\"foobar6 alt tag\">" +
101 | ""
102 |
103 | Results := HtmlParseString("http://www.cnn.com/world", HtmlString)
104 |
105 | ExpectedLinks := []string{
106 | "http://www.cnn.com/world/foobar1",
107 | "http://www.cnn.com/foobar2",
108 | "http://localhost/foobar3",
109 | "https://localhost/foobar4",
110 | "http://localhost:8080/foobar5",
111 | "https://localhost:8080/foobar6",
112 | }
113 | ExpectedImages := []string{
114 | "http://www.cnn.com/world/foobar1.png",
115 | "http://www.cnn.com/foobar2.png",
116 | "http://localhost/foobar3.png",
117 | "https://localhost/foobar4.png",
118 | "http://localhost:8080/foobar5.png",
119 | "https://localhost:8080/foobar6.png",
120 | }
121 | ExpectedAlt := []string{
122 | "foobar1 alt tag",
123 | "foobar2 alt tag",
124 | "foobar3 alt tag",
125 | "",
126 | "foobar5 alt tag",
127 | "foobar6 alt tag",
128 | }
129 | ExpectedTitles := []string{
130 | "",
131 | "",
132 | "",
133 | "foobar4 title",
134 | "",
135 | "",
136 | }
137 |
138 | for i := range ExpectedLinks {
139 | if Results.links[i] != ExpectedLinks[i] {
140 | t.Errorf("Result '%s' didn't match expected '%s'", Results.links[i], ExpectedLinks[i])
141 | }
142 | }
143 |
144 | for i := range ExpectedImages {
145 | if Results.images[i].src != ExpectedImages[i] {
146 | t.Errorf("Images '%s' didn't match expected '%s'", Results.images[i].src, ExpectedImages[i])
147 | }
148 | if Results.images[i].alt != ExpectedAlt[i] {
149 | t.Errorf("Alt '%s' didn't match expected '%s'", Results.images[i].alt, ExpectedAlt[i])
150 | }
151 | if Results.images[i].title != ExpectedTitles[i] {
152 | t.Errorf("Title '%s' didn't match expected '%s'", Results.images[i].title, ExpectedTitles[i])
153 | }
154 | }
155 |
156 | } // End of TestHtmlLinksAndImages()
157 |
158 | func TestHtmlNoLinks(t *testing.T) {
159 |
160 | HtmlString := "" +
161 | 		"<img src=\"foobar1.png\" alt=\"foobar1 alt tag\">" +
162 | 		"<img src=\"/foobar2.png\" alt=\"foobar2 alt tag\">" +
163 | 		"<img src=\"http://localhost/foobar3.png\" alt=\"foobar3 alt tag\">" +
164 | 		"<img src=\"https://localhost/foobar4.png\" title=\"foobar4 title\">" +
165 | 		"<img src=\"http://localhost:8080/foobar5.png\" alt=\"foobar5 alt tag\">" +
166 | 		"<img src=\"https://localhost:8080/foobar6.png\" alt=\"foobar6 alt tag\">" +
167 | ""
168 |
169 | Results := HtmlParseString("http://www.cnn.com/world", HtmlString)
170 |
171 | ExpectedLinks := []string{}
172 | ExpectedImages := []string{
173 | "http://www.cnn.com/world/foobar1.png",
174 | "http://www.cnn.com/foobar2.png",
175 | "http://localhost/foobar3.png",
176 | "https://localhost/foobar4.png",
177 | "http://localhost:8080/foobar5.png",
178 | "https://localhost:8080/foobar6.png",
179 | }
180 | ExpectedAlt := []string{
181 | "foobar1 alt tag",
182 | "foobar2 alt tag",
183 | "foobar3 alt tag",
184 | "",
185 | "foobar5 alt tag",
186 | "foobar6 alt tag",
187 | }
188 | ExpectedTitles := []string{
189 | "",
190 | "",
191 | "",
192 | "foobar4 title",
193 | "",
194 | "",
195 | }
196 |
197 | for i := range ExpectedLinks {
198 | if Results.links[i] != ExpectedLinks[i] {
199 | t.Errorf("Result '%s' didn't match expected '%s'", Results.links[i], ExpectedLinks[i])
200 | }
201 | }
202 |
203 | for i := range ExpectedImages {
204 | if Results.images[i].src != ExpectedImages[i] {
205 | t.Errorf("Images '%s' didn't match expected '%s'", Results.images[i].src, ExpectedImages[i])
206 | }
207 | if Results.images[i].alt != ExpectedAlt[i] {
208 | t.Errorf("Alt '%s' didn't match expected '%s'", Results.images[i].alt, ExpectedAlt[i])
209 | }
210 | if Results.images[i].title != ExpectedTitles[i] {
211 | t.Errorf("Title '%s' didn't match expected '%s'", Results.images[i].title, ExpectedTitles[i])
212 | }
213 | }
214 |
215 | } // End of TestHtmlNoLinks()
216 |
217 | func TestHtmlNoImages(t *testing.T) {
218 |
219 | //log.SetLevelString("info")
220 |
221 | 	HtmlString := "<a href=\"foobar1\">foobar1 content</a>" +
222 | 		"<a href=\"/foobar2\">foobar2 content</a>" +
223 | 		"<a href=\"http://localhost/foobar3\">foobar3 content</a>" +
224 | 		"<a href=\"https://localhost/foobar4\">foobar4 content</a>\n" +
225 | 		"<a href=\"http://localhost:8080/foobar5\">foobar5 content</a>\n" +
226 | 		"<a href=\"https://localhost:8080/foobar6\">foobar6 content</a>\n" +
227 | ""
228 |
229 | Results := HtmlParseString("http://www.cnn.com/world", HtmlString)
230 |
231 | ExpectedLinks := []string{
232 | "http://www.cnn.com/world/foobar1",
233 | "http://www.cnn.com/foobar2",
234 | "http://localhost/foobar3",
235 | "https://localhost/foobar4",
236 | "http://localhost:8080/foobar5",
237 | "https://localhost:8080/foobar6",
238 | }
239 | ExpectedImages := []string{}
240 | ExpectedAlt := []string{}
241 | ExpectedTitles := []string{}
242 |
243 | for i := range ExpectedLinks {
244 | if Results.links[i] != ExpectedLinks[i] {
245 | t.Errorf("Result '%s' didn't match expected '%s'", Results.links[i], ExpectedLinks[i])
246 | }
247 | }
248 |
249 | for i := range ExpectedImages {
250 | if Results.images[i].src != ExpectedImages[i] {
251 | t.Errorf("Images '%s' didn't match expected '%s'", Results.images[i].src, ExpectedImages[i])
252 | }
253 | if Results.images[i].alt != ExpectedAlt[i] {
254 | t.Errorf("Alt '%s' didn't match expected '%s'", Results.images[i].alt, ExpectedAlt[i])
255 | }
256 | if Results.images[i].title != ExpectedTitles[i] {
257 | t.Errorf("Title '%s' didn't match expected '%s'", Results.images[i].title, ExpectedTitles[i])
258 | }
259 | }
260 |
261 | } // End of TestHtmlNoImages()
262 |
263 | func TestHtmlNoLinksNorImages(t *testing.T) {
264 |
265 | //log.SetLevelString("info")
266 |
267 | HtmlString := "blah blah blah"
268 |
269 | Results := HtmlParseString("http://www.cnn.com/world", HtmlString)
270 |
271 | ExpectedLinks := []string{}
272 | ExpectedImages := []string{}
273 | ExpectedAlt := []string{}
274 | ExpectedTitles := []string{}
275 |
276 | for i := range ExpectedLinks {
277 | if Results.links[i] != ExpectedLinks[i] {
278 | t.Errorf("Result '%s' didn't match expected '%s'", Results.links[i], ExpectedLinks[i])
279 | }
280 | }
281 |
282 | for i := range ExpectedImages {
283 | if Results.images[i].src != ExpectedImages[i] {
284 | t.Errorf("Images '%s' didn't match expected '%s'", Results.images[i].src, ExpectedImages[i])
285 | }
286 | if Results.images[i].alt != ExpectedAlt[i] {
287 | t.Errorf("Alt '%s' didn't match expected '%s'", Results.images[i].alt, ExpectedAlt[i])
288 | }
289 | if Results.images[i].title != ExpectedTitles[i] {
290 | t.Errorf("Title '%s' didn't match expected '%s'", Results.images[i].title, ExpectedTitles[i])
291 | }
292 | }
293 |
294 | } // End of TestHtmlNoLinksNorImages()
295 |
296 | func TestHtmlPortNumberInBaseUrl(t *testing.T) {
297 |
298 | //log.SetLevelString("info")
299 |
300 | 	HtmlString := "<a href=\"foobar1\">foobar1 content</a>" +
301 | 		"<a href=\"/foobar2\">foobar2 content</a>" +
302 | 		"<a href=\"http://localhost/foobar3\">foobar3 content</a>" +
303 | 		"<a href=\"https://localhost/foobar4\">foobar4 content</a>\n" +
304 | 		"<a href=\"http://localhost:8080/foobar5\">foobar5 content</a>\n" +
305 | 		"<a href=\"https://localhost:8080/foobar6\">foobar6 content</a>\n" +
306 | 		"<img src=\"foobar1.png\" alt=\"foobar1 alt tag\">" +
307 | 		"<img src=\"/foobar2.png\" alt=\"foobar2 alt tag\">" +
308 | 		"<img src=\"http://localhost/foobar3.png\" alt=\"foobar3 alt tag\">" +
309 | 		"<img src=\"https://localhost/foobar4.png\" title=\"foobar4 title\">" +
310 | 		"<img src=\"http://localhost:8080/foobar5.png\" alt=\"foobar5 alt tag\">" +
311 | 		"<img src=\"https://localhost:8080/foobar6.png\" alt=\"foobar6 alt tag\">" +
312 | ""
313 |
314 | Results := HtmlParseString("https://www.cnn.com:8433/world", HtmlString)
315 |
316 | ExpectedLinks := []string{
317 | "https://www.cnn.com:8433/world/foobar1",
318 | "https://www.cnn.com:8433/foobar2",
319 | "http://localhost/foobar3",
320 | "https://localhost/foobar4",
321 | "http://localhost:8080/foobar5",
322 | "https://localhost:8080/foobar6",
323 | }
324 | ExpectedImages := []string{
325 | "https://www.cnn.com:8433/world/foobar1.png",
326 | "https://www.cnn.com:8433/foobar2.png",
327 | "http://localhost/foobar3.png",
328 | "https://localhost/foobar4.png",
329 | "http://localhost:8080/foobar5.png",
330 | "https://localhost:8080/foobar6.png",
331 | }
332 | ExpectedAlt := []string{
333 | "foobar1 alt tag",
334 | "foobar2 alt tag",
335 | "foobar3 alt tag",
336 | "",
337 | "foobar5 alt tag",
338 | "foobar6 alt tag",
339 | }
340 | ExpectedTitles := []string{
341 | "",
342 | "",
343 | "",
344 | "foobar4 title",
345 | "",
346 | "",
347 | }
348 |
349 | for i := range ExpectedLinks {
350 | if Results.links[i] != ExpectedLinks[i] {
351 | t.Errorf("Result '%s' didn't match expected '%s'", Results.links[i], ExpectedLinks[i])
352 | }
353 | }
354 |
355 | for i := range ExpectedImages {
356 | if Results.images[i].src != ExpectedImages[i] {
357 | t.Errorf("Images '%s' didn't match expected '%s'", Results.images[i].src, ExpectedImages[i])
358 | }
359 | if Results.images[i].alt != ExpectedAlt[i] {
360 | t.Errorf("Alt '%s' didn't match expected '%s'", Results.images[i].alt, ExpectedAlt[i])
361 | }
362 | if Results.images[i].title != ExpectedTitles[i] {
363 | t.Errorf("Title '%s' didn't match expected '%s'", Results.images[i].title, ExpectedTitles[i])
364 | }
365 | }
366 |
367 | } // End of TestHtmlPortNumberInBaseUrl()
368 |
--------------------------------------------------------------------------------
/httpGet.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import "fmt"
4 | import "io/ioutil"
5 | import "net/http"
6 |
7 | import log "github.com/dmuth/google-go-log4go"
8 |
9 | /**
10 | * Our response object.
11 | */
12 | type Response struct {
13 | //
14 | // The URL we just crawled
15 | //
16 | Url string
17 | //
18 | // Our content-type
19 | //
20 | ContentType string
21 | //
22 | // HTTP code
23 | 	// HTTP status code (0 means the request failed before we got a response)
24 | Code int
25 | //
26 | // The actual page content.
27 | //
28 | Body string
29 | }
30 |
31 | /**
32 | * Retrieve a URL via HTTP GET.
33 | *
34 | * @param {string} url The URL to retrieve.
35 | * @return {Response} A response consisting of our code and body
36 | */
37 | func httpGet(url string) (retval Response) {
38 |
39 | retval.Url = url
40 |
41 | client := &http.Client{}
42 |
43 | req, err := http.NewRequest("GET", url, nil)
44 | if err != nil {
45 | log.Warnf("Error fetching %s: %s", url, err)
46 | retval.Body = fmt.Sprintf("%s", err)
47 | retval.Code = 0
48 | return (retval)
49 | }
50 |
51 | req.Header.Set("User-Agent",
52 | "Doug's cat picture crawler. https://github.com/dmuth/cat-crawler")
53 |
54 | resp, err := client.Do(req)
55 | if err != nil {
56 | log.Warnf("Error fetching %s: %s", url, err)
57 | retval.Body = fmt.Sprintf("%s", err)
58 | retval.Code = 0
59 | return (retval)
60 | }
61 |
62 | defer resp.Body.Close()
63 | body, err := ioutil.ReadAll(resp.Body)
64 | if err != nil {
65 | log.Warnf("Error fetching %s: %s", url, err)
66 | retval.Body = fmt.Sprintf("%s", err)
67 | retval.Code = 0
68 | return (retval)
69 | }
70 |
71 | if _, ok := resp.Header["Content-Type"]; ok {
72 |
73 | retval.ContentType = resp.Header["Content-Type"][0]
74 | }
75 |
76 | retval.Body = fmt.Sprintf("%s", body)
77 | retval.Code = resp.StatusCode
78 |
79 | return (retval)
80 |
81 | } // End of httpGet()
82 |
--------------------------------------------------------------------------------
/imageCrawler.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | //import "fmt"
4 | import "regexp"
5 | import "strings"
6 | import "os"
7 |
8 | import log "github.com/dmuth/google-go-log4go"
9 | import stats "github.com/dmuth/golang-stats"
10 |
11 | //
12 | // Keep track of which image URLs (per host) we have already crawled
13 | //
14 | var hostsCrawledImages map[string]map[string]bool
15 |
16 | /**
17 | * Fire up 1 or more crawlers to start grabbing images.
18 | *
19 | * @param {config} Our configuration
20 | * @param {chan Image} in Image data structures will be read from here.
21 | * @param {uint} NumConnections How many goroutines to fire up?
22 | *
23 | */
24 | func NewImageCrawler(config Config, in chan Image, NumConnections uint) {
25 |
26 | hostsCrawledImages = make(map[string]map[string]bool)
27 | for i := 0; i < int(NumConnections); i++ {
28 | go crawlImages(config, in)
29 | }
30 |
31 | } // End of NewImageCrawler()
32 |
33 | /**
34 | * Continuously read images and crawl them.
35 | *
36 | * @param {config} Our configuration
37 | * @param {chan Image} in Channel to read Image data structures from.
38 | */
39 | func crawlImages(config Config, in chan Image) {
40 |
41 | for {
42 | stats.IncrStat("go_image_crawler_waiting")
43 | image := <-in
44 | stats.DecrStat("go_image_crawler_waiting")
45 | stats.DecrStat("images_to_be_crawled")
46 |
47 | // src, alt, title
48 | Url := image.src
49 |
50 | if Url == "" {
51 | continue
52 | }
53 |
54 | //
55 | // If we've been here before, stop
56 | //
57 | if imageBeenHereUrl(Url) {
58 | log.Debugf("crawlImages(): We've already been to '%s', skipping!", Url)
59 | stats.IncrStat("images_skipped")
60 | continue
61 | }
62 | setImageBeenHereUrl(Url)
63 |
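    | 		//
    | 		// Look for our search string in the alt and title attributes.
    | 		// Both are lowercased first, so the search string is assumed
    | 		// to already be lowercase.
    | 		//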
64 | match := false
65 | if strings.Contains(strings.ToLower(image.alt), config.SearchString) {
66 | match = true
67 | log.Debugf("Match found on ALT tag for URL '%s'!", Url)
68 | }
69 | if strings.Contains(strings.ToLower(image.title), config.SearchString) {
70 | match = true
71 | log.Infof("Match found on TITLE tag for URL '%s'!", Url)
72 | }
73 |
74 | if !match {
75 | 			log.Debugf("No match for '%s' found in alt and title tags for URL '%s', skipping!", config.SearchString, Url)
76 | stats.IncrStat("images_not_matched")
77 | continue
78 | }
79 |
80 | log.Infof("Image: About to crawl '%s'...", Url)
81 | response := httpGet(Url)
82 | stats.IncrStat("images_crawled")
83 | log.Infof("Image: Response code %d on URL '%s'", response.Code, response.Url)
84 |
85 | //
86 | // If the content-type isn't an image, stop.
87 | //
88 | regex, _ := regexp.Compile("^image")
89 | results := regex.FindString(response.ContentType)
90 | if len(results) == 0 {
91 | log.Errorf("Skipping Content-Type of '%s', on URL '%s'",
92 | response.ContentType, response.Url)
93 | continue
94 | }
95 |
96 | filename := getFilenameFromUrl(Url)
97 |
98 | writeImage(filename, response.Body)
99 |
100 | }
101 |
102 | } // End of crawlImages()
103 |
104 | /**
105 | * Wrapper for imageBeenHere() which takes a URL
106 | */
107 | func imageBeenHereUrl(url string) bool {
108 |
109 | //
110 | // Grab our URL parts
111 | //
112 | results := getUrlParts(url)
113 | if len(results) < 5 {
114 | log.Warnf("imageBeenHereUrl(): Unable to parse URL: '%s'", url)
115 | return (true)
116 | }
117 | host := results[1]
118 | uri := results[4]
119 |
120 | if imageBeenHere(host, uri) {
121 | return (true)
122 | }
123 |
124 | //
125 | // Assume false
126 | //
127 | return (false)
128 |
129 | }
130 |
131 | /**
132 | * Wrapper for setImageBeenHere() which takes a URL.
133 | */
134 | func setImageBeenHereUrl(url string) {
135 |
136 | //
137 | // Grab our URL parts
138 | //
139 | results := getUrlParts(url)
140 | host := results[1]
141 | uri := results[4]
142 |
143 | setImageBeenHere(host, uri)
144 |
145 | }
146 |
147 | /**
148 | * Determine whether we've been to this image before.
149 | *
150 | * @param {string} host The hostname
151 | * @param {string} uri The URI
152 | *
153 | * @return {bool} True if we've crawled this image before, false otherwise.
154 | */
155 | func imageBeenHere(host string, uri string) bool {
156 |
157 | //
158 | // Create our host entry if we don't already have it.
159 | //
160 | if _, ok := hostsCrawledImages[host]; !ok {
161 | hostsCrawledImages[host] = make(map[string]bool)
162 | }
163 |
164 | //
165 | // See if we've been here before.
166 | //
167 | _, ok := hostsCrawledImages[host][uri]
168 | if ok {
169 | return (true)
170 | } else {
171 | return (false)
172 | }
173 |
174 | } // End of imageBeenHere()
175 |
176 | /**
177 | * We've been to this image before!
178 | */
179 | func setImageBeenHere(host string, uri string) {
180 |
181 | //
182 | // Create our host entry if we don't already have it.
183 | //
184 | if _, ok := hostsCrawledImages[host]; !ok {
185 | hostsCrawledImages[host] = make(map[string]bool)
186 | }
187 |
188 | hostsCrawledImages[host][uri] = true
189 |
190 | }
191 |
192 | /**
193 | * Convert our URL into a filename
194 | */
195 | func getFilenameFromUrl(Url string) (retval string) {
196 |
197 | retval = Url
198 |
199 | results := getUrlParts(Url)
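    | 	// results[3] is the bare hostname and results[4] is the URI.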
200 | Host := results[3]
201 | Uri := results[4]
202 |
203 | regex, _ := regexp.Compile("/$")
204 | Uri = regex.ReplaceAllLiteralString(Uri, "")
205 |
206 | retval = Host + Uri
207 |
208 | //
209 | // Trim the filename if it's too long.
210 | // This isn't a perfect fix, but it'll work for now.
211 | //
212 | MaxLen := 80
213 | if len(retval) > MaxLen {
214 | retval = retval[:(MaxLen - 1)]
215 | }
216 |
217 | return (retval)
218 |
219 | } // End of getFilenameFromUrl()
220 |
221 | /**
222 | * Write our image out to disk.
223 | *
224 | * @param {string} Filename The name of the file
225 | *
226 | * @param {string} Body The content of the image
227 | */
228 | func writeImage(Filename string, Body string) {
229 |
230 | cwd, _ := os.Getwd()
231 |
232 | //
233 | // Create our target and nuke the filename from the end
234 | //
235 | target := cwd + "/cat-crawler-downloads/" + Filename
236 | regex, _ := regexp.Compile("/[^/]+$")
237 | dir := regex.ReplaceAllLiteralString(target, "")
238 |
239 | //
240 | // Try making our directory
241 | // We want to panic if there are any issues since it could
242 | // mean awful things like a full disk!
243 | //
244 | result := os.MkdirAll(dir, 0750)
245 | if result != nil {
246 | log.Errorf("Error creating directory: %s", result)
247 | panic(result)
248 | }
249 |
250 | //
251 | // Now write the file.
252 | //
253 | file, err := os.Create(target)
254 | if err != nil {
255 | log.Warnf("Error opening file: %s", err)
256 |
257 | } else {
258 | n, err := file.Write([]byte(Body))
259 | if err != nil {
260 | log.Warnf("Error writing file: '%s': %s", target, err)
261 | }
262 | log.Infof("%d bytes written to file '%s'", n, target)
263 |
264 | err = file.Close()
265 | if err != nil {
266 | log.Errorf("Error closing file: %s", err)
267 | panic(err)
268 | }
269 |
270 | }
271 |
272 | } // End of writeImage()
273 |
--------------------------------------------------------------------------------
/imageCrawler_test.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | //import "fmt"
4 | import "testing"
5 |
6 | func TestGetFilenameFromUrl(t *testing.T) {
7 |
8 | Urls := []string{
9 | "http://www.apple.com/image.png",
10 | "http://www.apple.com/image",
11 | "http://www.apple.com/CSS/Resources/foobar.png",
12 | "http://www.apple.com/CSS/foobar",
13 | "http://logging.apache.org/log4j/1.2/css/print.css",
14 | "http://logging.apache.org/css/print.css",
15 | "http://www.flickr.com/photos/dmuth/6071648896",
16 | "http://www.flickr.com/photos/dmuth/6071648896/",
17 | "https://www.flickr.com/photos/dmuth/6071648896/",
18 | "https://www.flickr.com/photos/dmuth/6071648896/" +
19 | "1234567890" +
20 | "1234567890" +
21 | "1234567890" +
22 | "1234567890" +
23 | "1234567890" +
24 | "1234567890",
25 | }
26 |
27 | Expected := []string{
28 | "www.apple.com/image.png",
29 | "www.apple.com/image",
30 | "www.apple.com/CSS/Resources/foobar.png",
31 | "www.apple.com/CSS/foobar",
32 | "logging.apache.org/log4j/1.2/css/print.css",
33 | "logging.apache.org/css/print.css",
34 | "www.flickr.com/photos/dmuth/6071648896",
35 | "www.flickr.com/photos/dmuth/6071648896",
36 | "www.flickr.com/photos/dmuth/6071648896",
37 | "www.flickr.com/photos/dmuth/6071648896/1234567890123456789012345678901234567890",
38 | }
39 |
40 | for key, value := range Urls {
41 | Result := getFilenameFromUrl(value)
42 | if Result != Expected[key] {
43 | 			t.Errorf("Filename '%s' != expected value '%s'",
44 | Result, Expected[key])
45 | }
46 | }
47 |
48 | } // End of TestGetFilenameFromUrl()
49 |
50 | func TestImageBeenHereUrl(t *testing.T) {
51 |
52 | hostsCrawledImages = make(map[string]map[string]bool)
53 |
54 | url := "http://www.cnn.com/"
55 | result := imageBeenHereUrl(url)
56 | if result != false {
57 | 		t.Errorf("Expected result to be false for %s, got %t", url, result)
58 | }
59 |
60 | setImageBeenHereUrl(url)
61 | result = imageBeenHereUrl(url)
62 | if result != true {
63 | 		t.Errorf("Expected result to be true for %s, got %t", url, result)
64 | }
65 |
66 | url = "http://www.cnn.com/foobar"
67 | result = imageBeenHereUrl(url)
68 | if result != false {
69 | 		t.Errorf("Expected result to be false for %s, got %t", url, result)
70 | }
71 |
72 | setImageBeenHereUrl(url)
73 | result = imageBeenHereUrl(url)
74 | if result != true {
75 | 		t.Errorf("Expected result to be true for %s, got %t", url, result)
76 | }
77 |
78 | }
79 |
80 | func TestImageBeenHere(t *testing.T) {
81 |
82 | hostsCrawledImages = make(map[string]map[string]bool)
83 |
84 | result := imageBeenHere("test", "test2")
85 | if result != false {
86 | 		t.Errorf("Expected result to be false, got %t", result)
87 | }
88 |
89 | setImageBeenHere("test", "test2")
90 |
91 | result = imageBeenHere("test", "test2")
92 | if result != true {
93 | 		t.Errorf("Expected result to be true, got %t", result)
94 | }
95 |
96 | result = imageBeenHere("test", "test")
97 | if result != false {
98 | 		t.Errorf("Expected result to be false, got %t", result)
99 | }
100 |
101 | }
102 |
--------------------------------------------------------------------------------
/main.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | //import "fmt" // Debugging
4 | import "os"
5 | import "os/signal"
6 | import "syscall"
7 |
8 | import log "github.com/dmuth/google-go-log4go"
9 | import stats "github.com/dmuth/golang-stats"
10 |
11 | func main() {
12 |
13 | //
14 | // Parse our arguments and report them
15 | //
16 | config := ParseArgs()
17 | log.Infof("Config: %s", config)
18 | log.Infof("SeedURLs: %s", config.SeedUrls)
19 | if len(config.AllowUrls) > 0 {
20 | log.Infof("Only allowing URLs starting with: %s", config.AllowUrls)
21 | }
22 |
23 | //
24 | // Catch our interrupt signal
25 | //
26 | go sigInt()
27 |
28 | interval := 1.0
29 | //interval := .1 // Debugging
30 | if config.Stats {
31 | go stats.StatDump(interval)
32 | }
33 |
34 | NumConnections := config.NumConnections
35 |
36 | //
37 | // Start the crawler and seed it with our very first URL
38 | //
39 | UrlCrawlerIn, UrlCrawlerOut := NewUrlCrawler(uint(NumConnections), config.AllowUrls)
40 |
41 | //UrlCrawlerIn <- "http://localhost:8080/" // Debugging
42 | for _, value := range config.SeedUrls {
43 | stats.IncrStat("urls_to_be_crawled")
44 | UrlCrawlerIn <- value
45 | }
46 |
47 | //
48 | // Create our HTML parser
49 | //
50 | HtmlBodyIn, ImageCrawlerIn := NewHtml(UrlCrawlerIn)
51 |
52 | //
53 | // Start up our image crawler
54 | //
55 | NewImageCrawler(config, ImageCrawlerIn, NumConnections)
56 |
57 | for {
58 | //
59 | // Read a result from our crawler
60 | //
61 | Res := <-UrlCrawlerOut
62 |
63 | if Res.Code != 200 {
64 | 			log.Debugf("Skipping non-200 response of %d on URL '%s'",
65 | Res.Code, Res.Url)
66 | continue
67 | }
68 |
69 | //
70 | // Pass it into the HTML parser. It will in turn send any URLs
71 | // it finds into the URL Crawler and any images to the Image Crawler.
72 | //
73 | HtmlBodyIn <- []string{Res.Url, Res.Body, Res.ContentType}
74 |
75 | }
76 |
77 | } // End of main()
78 |
79 | /**
80 | * Wait for ctrl-c to happen, then exit!
81 | */
82 | func sigInt() {
83 | 	ch := make(chan os.Signal, 1) // Buffered, as signal.Notify requires
84 | signal.Notify(ch, syscall.SIGINT)
85 | <-ch
86 | log.Error("CTRL-C; exiting")
87 | os.Exit(0)
88 | }
89 |
--------------------------------------------------------------------------------
/urlCrawler.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | //import "fmt" // Debugging
4 | import "regexp"
5 | import "strings"
6 |
7 | import log "github.com/dmuth/google-go-log4go"
8 | import stats "github.com/dmuth/golang-stats"
9 |
10 | //
11 | // Keep track of if we crawled hosts with specific URLs
12 | //
13 | var hostsCrawled map[string]map[string]bool
14 |
15 | //
16 | // Our allowed URLs to crawl. If empty, all URLs are crawled.
17 | //
18 | var allowedUrls []string
19 |
20 | /**
21 | * Spin up 1 or more goroutines to do crawling.
22 | *
23 | * @param {uint} NumInstances Number of crawler goroutines to start.
24 | * @param {[]string} AllowedUrls URL prefixes we are allowed to crawl; empty means crawl everything.
25 | * @return {chan string, chan Response} A channel to write URLs into, and a channel to read responses from.
26 | */
27 | func NewUrlCrawler(NumInstances uint, AllowedUrls []string) (in chan string, out chan Response) {
28 |
29 | hostsCrawled = make(map[string]map[string]bool)
30 | allowedUrls = AllowedUrls
31 |
32 | //
33 | // I haven't yet decided if I want a buffer for this
34 | //
35 | //InBufferSize := 1000
36 | InBufferSize := 0
37 |
38 | //
39 | // If we don't have a large output buffer, using multiple seed URLs
40 | // will cause blocking to happen (ooops!)
41 | //
42 | OutBufferSize := 1000
43 | in = make(chan string, InBufferSize)
44 | out = make(chan Response, OutBufferSize)
45 |
46 | for i := uint(0); i < NumInstances; i++ {
47 | log.Infof("Spun up crawler instance #%d", (i + 1))
48 | go crawlUrls(in, out)
49 | }
50 |
51 | return in, out
52 |
53 | } // End of NewUrlCrawler()
54 |
55 | /**
56 | * This is run as a goroutine which is responsible for doing the crawling and
57 | * returning the results.
58 | *
59 | * @param {chan string} in Our channel to read URLs to crawl from
60 | * @param {chan Response} out Responses will be written on this channel
61 | *
62 | * Runs forever; responses are written to the out channel rather than returned.
63 | */
64 | func crawlUrls(in chan string, out chan Response) {
65 |
66 | for {
67 |
68 | log.Debug("About to ingest a URL...")
69 | stats.IncrStat("go_url_crawler_waiting")
70 | url := <-in
71 | stats.DecrStat("go_url_crawler_waiting")
72 | stats.DecrStat("urls_to_be_crawled")
73 |
74 | if !isUrlAllowed(url) {
75 | log.Debugf("URL '%s' is not allowed!", url)
76 | continue
77 | }
78 |
79 | url = filterUrl(url)
80 |
81 | if urlBeenHere(url) {
82 | log.Debugf("We've already been to '%s', skipping!", url)
83 | continue
84 | }
85 |
86 | if !sanityCheck(url) {
87 | //
88 | // In the future, I might make the in channel take a data
89 | // structure which includes the referrer so I can dig
90 | // into bad URLs. With a backhoe.
91 | //
92 | log.Warnf("URL '%s' fails sanity check, skipping!", url)
93 | continue
94 | }
95 |
96 | log.Infof("About to crawl '%s'...", url)
97 | out <- httpGet(url)
98 | log.Infof("Done crawling '%s'!", url)
99 |
100 | }
101 |
102 | } // End of crawlUrls()
103 |
104 | /**
105 | * Filter meaningless things out of URLs. Like hashmarks.
106 | *
107 | * @param {string} url The URL
108 | *
109 | * @return {string} The filtered URL
110 | */
111 | func filterUrl(url string) string {
112 |
113 | //
114 | // First, nuke hashmarks (thanks, Apple!)
115 | //
116 | regex, _ := regexp.Compile("([^#]+)#")
117 | results := regex.FindStringSubmatch(url)
118 | if len(results) >= 2 {
119 | url = results[1]
120 | }
121 |
122 | //
123 | // Replace groups of 2 or more slashes with a single slash (thanks, log4j!)
124 | //
125 | regex, _ = regexp.Compile("[^:](/[/]+)")
126 | for {
127 | results = regex.FindStringSubmatch(url)
128 | if len(results) < 2 {
129 | break
130 | }
131 |
132 | Dir := results[1]
133 | //url = regex.ReplaceAllString(url, "/")
134 | url = strings.Replace(url, Dir, "/", -1)
135 |
136 | }
137 |
138 | //
139 | // Fix broken methods (thanks, Flickr!)
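    | 	// e.g. "http:/www.flickr.com/..." becomes "http://www.flickr.com/..."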
140 | //
141 | regex, _ = regexp.Compile("^(http)(s)?(:/)[^/]")
142 | results = regex.FindStringSubmatch(url)
143 | if len(results) > 0 {
144 | BrokenMethod := results[1] + results[2] + results[3]
145 | url = strings.Replace(url, BrokenMethod, BrokenMethod+"/", 1)
146 | }
147 |
148 | //
149 | // Now, remove references to parent directories, because that's just
150 | // ASKING for path loops. (thanks, Apple!)
151 | //
152 | // Do this by looping as long as we have ".." present.
153 | //
154 | regex, _ = regexp.Compile("([^/]+/\\.\\./)")
155 | for {
156 | results = regex.FindStringSubmatch(url)
157 | if len(results) < 2 {
158 | break
159 | }
160 |
161 | Dir := results[1]
162 | url = strings.Replace(url, Dir, "", -1)
163 |
164 | }
165 |
166 | //
167 | // Replace paths of single dots
168 | //
169 | regex, _ = regexp.Compile("/\\./")
170 | url = regex.ReplaceAllString(url, "/")
171 |
172 | return (url)
173 |
174 | } // End of filterUrl()
175 |
176 | /**
177 | * Have we already been to this URL?
178 | *
179 | * @param {string} url The URL we want to crawl
180 | *
181 | * @return {bool} True if we've crawled this URL before, false if we have not.
182 | */
183 | func urlBeenHere(url string) (retval bool) {
184 |
185 | retval = true
186 |
187 | //
188 | // Grab our URL parts
189 | //
190 | results := getUrlParts(url)
191 | if len(results) < 5 {
192 | //
193 | // TODO: Use data structure and print referrer here!
194 | //
195 | log.Warnf("urlBeenHere(): Unable to parse URL: '%s'", url)
196 | return (true)
197 | }
198 | Host := results[1]
199 | Uri := results[4]
200 |
201 | //
202 | // Create our host entry if we don't already have it.
203 | //
204 | if _, ok := hostsCrawled[Host]; !ok {
205 | hostsCrawled[Host] = make(map[string]bool)
206 | }
207 |
208 | //
209 | // If this is our first time here, cool. Otherwise, skip.
210 | //
211 | if _, ok := hostsCrawled[Host][Uri]; !ok {
212 | hostsCrawled[Host][Uri] = true
213 | retval = false
214 | }
215 |
216 | return retval
217 |
218 | } // End of urlBeenHere()
219 |
220 | /**
221 | * Split up our URL into its component parts
222 | */
223 | func getUrlParts(url string) (retval []string) {
224 |
225 | regex, _ := regexp.Compile("((https?://)([^/]+))(.*)")
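    | 	// Submatches: [1] = scheme+host, [2] = scheme, [3] = bare host,
    | 	// [4] = the URI. Callers index into these directly.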
226 | retval = regex.FindStringSubmatch(url)
227 |
228 | if len(retval) < 5 {
229 | log.Warnf("getUrlParts(): Unable to parse URL: '%s'", url)
230 | }
231 |
232 | return (retval)
233 |
234 | } // End of getUrlParts()
235 |
236 | /**
237 | * Check to see if this URL is sane.
238 | *
239 | * @return {bool} True if the URL looks okay, false otherwise.
240 | */
241 | func sanityCheck(url string) (retval bool) {
242 |
243 | retval = true
244 |
245 | regex, _ := regexp.Compile(" ")
246 | result := regex.FindString(url)
247 |
248 | if result != "" {
249 | retval = false
250 | }
251 |
252 | return (retval)
253 |
254 | } // End of sanityCheck()
255 |
256 | /**
257 | * Is this URL on our allowed list?
258 | *
259 | * @param {string} The URL to check
260 | *
261 | * @return {bool} If allowed, true. Otherwise, false.
262 | */
263 | func isUrlAllowed(url string) (retval bool) {
264 |
265 | if len(allowedUrls) == 0 {
266 | return true
267 | }
268 |
269 | //
270 | // Loop through our URLs and return true on the first match
271 | //
272 | for _, value := range allowedUrls {
273 | pattern := "^" + value
274 | match, _ := regexp.MatchString(pattern, url)
275 | if match {
276 | return true
277 | }
278 | }
279 |
280 | //
281 | // If we got here, no match was found. Return false.
282 | //
283 | return false
284 |
285 | } // End of isUrlAllowed()
286 |
--------------------------------------------------------------------------------
/urlCrawler_test.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | //import "fmt"
4 | import "regexp"
5 | import "testing"
6 |
7 | //import log "github.com/dmuth/google-go-log4go"
8 | import server "github.com/dmuth/procedural-webserver"
9 |
10 | func Test(t *testing.T) {
11 |
12 | //log.SetLevelString("info")
13 |
14 | //
15 | // Start up our server
16 | //
17 | port := 8080
18 | server_obj := server.NewServer(port, 5, 20, 5, 20, "test_seed")
19 | go server_obj.Start()
20 |
21 | in, out := NewUrlCrawler(10, []string{})
22 |
23 | url := "http://localhost:8080/test2"
24 | in <- url
25 | result := <-out
26 |
27 | if result.Url != url {
28 | t.Errorf("URL '%s' does not match '%s'!", result.Url, url)
29 | }
30 |
31 | if result.Code != 200 {
32 | t.Errorf("Code %d does not match 200!", result.Code)
33 | }
34 |
35 | in <- "http://localhost:8080/test2?code=404"
36 | result = <-out
37 |
38 | if result.Code != 404 {
39 | t.Errorf("Code %d does not match 404!", result.Code)
40 | }
41 |
42 | //
43 | // Try a bad port
44 | //
45 | in <- "http://localhost:12345/test2?code=404"
46 | result = <-out
47 |
48 | if result.Code != 0 {
49 | t.Errorf("Code %d does not match 0!", result.Code)
50 | }
51 |
52 | pattern := "connection refused"
53 | match, _ := regexp.MatchString(pattern, result.Body)
54 | if !match {
55 | 		t.Errorf("Could not find pattern '%s' in result body '%s'", pattern, result.Body)
56 | }
57 |
58 | //in <- "http://www.cnn.com/robots.txt"
59 | //in <- "http://localhost:8080/test2?delay=1s"
60 | //in <- "http://httpbin.org/headers"
61 |
62 | server_obj.Stop()
63 |
64 | } // End of Test()
65 |
66 | func TestFilterUrl(t *testing.T) {
67 |
68 | Urls := []string{
69 | "http://www.apple.com/",
70 | "http://www.apple.com/#",
71 | "http://www.apple.com/#foobar",
72 | "http://www.apple.com/what#foobar",
73 | "http://www.apple.com/CSS/ie7.css/../Resources/foobar",
74 | "http://www.apple.com/CSS/ie7.css/../Resources/../foobar",
75 | "http://www.apple.com/CSS/ie7.css/../Resources/../foobar/baz",
76 | "http://logging.apache.org/log4j/1.2/./css/print.css",
77 | "http://logging.apache.org/log4j/1.2///////css/print.css",
78 | "http://logging.apache.org/log4j/1.2///..///..///./css/print.css",
79 | "http:/www.flickr.com/photos/dmuth/6071648896/",
80 | "https:/www.flickr.com/photos/dmuth/6071648896/",
81 | }
82 | Expected := []string{
83 | "http://www.apple.com/",
84 | "http://www.apple.com/",
85 | "http://www.apple.com/",
86 | "http://www.apple.com/what",
87 | "http://www.apple.com/CSS/Resources/foobar",
88 | "http://www.apple.com/CSS/foobar",
89 | "http://www.apple.com/CSS/foobar/baz",
90 | "http://logging.apache.org/log4j/1.2/css/print.css",
91 | "http://logging.apache.org/log4j/1.2/css/print.css",
92 | "http://logging.apache.org/css/print.css",
93 | "http://www.flickr.com/photos/dmuth/6071648896/",
94 | "https://www.flickr.com/photos/dmuth/6071648896/",
95 | }
96 |
97 | for i := range Urls {
98 | Url := filterUrl(Urls[i])
99 | if Url != Expected[i] {
100 | t.Errorf("Filtered URL '%s' does not match expected URL '%s'!",
101 | Url, Expected[i])
102 | }
103 | }
104 |
105 | } // End of TestFilterUrl()
106 |
107 | func TestIsUrlAllowed(t *testing.T) {
108 |
109 | _, _ = NewUrlCrawler(10, []string{
110 | "http://foo/",
111 | "https://bar/baz",
112 | })
113 | Urls := []string{
114 | "http://google.com/",
115 | "http://foo",
116 | "http://foo/",
117 | "http://foo/bar",
118 | "http://bar",
119 | "http://bar/baz",
120 | "https://bar/baz",
121 | "https://bar/baz/",
122 | }
123 | Expected := []bool{
124 | false,
125 | false,
126 | true,
127 | true,
128 | false,
129 | false,
130 | true,
131 | true,
132 | }
133 |
134 | for key, value := range Urls {
135 | result := isUrlAllowed(value)
136 | if result != Expected[key] {
137 | 			t.Errorf("For URL '%s', expected %t and got %t",
138 | value, Expected[key], result)
139 | }
140 | }
141 |
142 | _, _ = NewUrlCrawler(10, []string{})
143 | for key, value := range Urls {
144 | result := isUrlAllowed(value)
145 | if result != true {
146 | 			t.Errorf("For URL '%s', expected %t and got %t",
147 | value, Expected[key], result)
148 | }
149 | }
150 |
151 | } // End of TestIsUrlAllowed()
152 |
--------------------------------------------------------------------------------