├── .hgignore
├── README.md
├── slides
├── Makefile
├── slides.html
└── slides.md
├── src
├── Download.hs
├── Links.hs
├── Main.hs
├── RankPages.hs
└── Spider.hs
└── strange-loop-2011.cabal
/.hgignore:
--------------------------------------------------------------------------------
1 | ^(?:dist|cabal-dev)$
2 | \.(?:aux|eventlog|h[ip]|log|[oa]|orig|prof|ps|rej|swp)$
3 | ~$
4 | syntax: glob
5 | .\#*
6 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Slides from the Haskell workshop at Strange Loop 2011
2 |
3 | I gave a 3-hour "intro to Haskell" workshop at the Strange Loop
4 | conference, on September 18, 2011. These are the slides I used,
5 | with a couple of post-workshop bugs fixed.
6 |
7 | You can [read the slides online
8 | here](http://bos.github.com/strange-loop-2011/slides/slides.html).
9 |
10 | They're formatted from Markdown to HTML using John MacFarlane's
11 | fabulous [pandoc tool](http://johnmacfarlane.net/pandoc/).
12 |
13 | Feel free to use them yourself, but if you do, please give attribution
14 | to me. Thanks!
15 |
--------------------------------------------------------------------------------
/slides/Makefile:
--------------------------------------------------------------------------------
1 | all: slides.html
2 |
3 | slides.html: slides.md
4 | pandoc --offline -s -t slidy -o $@ $<
5 |
6 | clean:
7 | -rm -f slides.html
8 |
--------------------------------------------------------------------------------
/slides/slides.html:
--------------------------------------------------------------------------------
1 |
2 |
4 |
5 |
6 |
7 |
8 |
9 |
10 | Haskell: Functional Programming, Solid Code, Big Data
11 |
30 |
33 |
36 |
37 |
38 |
39 |
Haskell: Functional Programming, Solid Code, Big Data
40 |
41 | Bryan O'Sullivan
42 |
43 |
2011-09-18
44 |
45 |
46 |
47 |
Welcome!
48 |
main = putStrLn "hello!"
49 |
50 | My name is Bryan O'Sullivan
51 | I started using Haskell in 1993
52 | I wrote a book about it, "Real World Haskell"
53 |
56 | I write lots of open source code
57 |
60 | My company invests heavily and openly in Haskell
61 |
64 |
65 |
66 |
67 |
68 |
69 |
Copy these slides (if you want)
70 |
git clone https://github.com/bos/strange-loop-2011
71 |
72 |
73 |
74 |
75 |
76 |
My Haskell background
77 |
78 | Began using Haskell at university
79 | When I graduated in 1995, I switched to more mainstream languages
80 |
81 | - C, C++, Java, Python, Perl, etc.
82 |
83 | My interest reawakened around 2005
84 | I've found Haskell to be a great general-purpose language
85 | The community of people is amazing
86 |
87 |
88 |
89 |
90 |
91 |
What to expect 1
92 |
93 | Haskell is a fairly big language
94 | Since so much is unfamiliar to newcomers, expect to wander far from your comfort zone
95 | I'm going to teach you interesting things, but not everything
96 |
97 |
98 |
99 |
100 |
101 |
What to expect 2
102 |
103 | This is a hands-on workshop: you'll be writing code!
104 | There will be a short break every 45 minutes
105 | Don't be afraid to ask questions!
106 |
107 |
108 |
109 |
110 |
111 |
Your tools
112 |
125 |
126 |
127 |
128 |
129 |
What else is needed?
130 |
131 | A text editor
132 | A terminal window
133 |
134 |
135 |
136 |
137 |
138 |
Problem definition
139 |
Given a web site, we want to scrape it and find important web pages.
140 |
This involves a lot of figuring stuff out!
141 |
142 | Learn Haskell
143 | Download one web page
144 | Extract links from a page, so we can find more pages to download
145 | Once we're done, compute which ones are important
146 | Make it all fast?
147 |
148 |
149 |
150 |
151 |
152 |
Let's get started!
153 |
Create a file named Hello.hs
and give it the following contents:
154 |
main = putStrLn "hello, world!"
155 |
The suffix .hs
is the standard for Haskell source files.
156 |
File names start with a capital letter, and everyone uses CamelCase
.
157 |
158 |
159 |
160 |
161 |
Building it
162 |
This command will look for Hello.hs
in the current directory, and compile it:
163 |
ghc --make Hello
164 |
165 |
The generated executable will be named Hello
(Hello.exe
on Windows).
166 |
167 | - That
--make
option up there tells GHC to automatically deal with dependencies on source files and packages.
168 |
169 |
170 |
171 |
172 |
173 |
Checking in
174 |
Is everyone able to build and run their Hello
executable?
175 |
176 |
177 |
178 |
179 |
Something a little more convenient
180 |
It's nice to have fast, native code at our fingertips.
181 |
But when I'm working, I expect a few things:
182 |
186 |
For these circumstances, a full compiler is a bit slow.
187 |
Instead, I often use the interactive interpreter, ghci
.
188 |
189 |
190 |
191 |
192 |
Let's start GHCi
193 |
Easily done:
194 |
ghci
195 |
196 |
It will display a startup banner, followed by a prompt:
197 |
Prelude>
198 |
199 |
This default prompt tells us which modules are available to play with.
200 |
201 |
202 |
203 |
204 |
Play around
205 |
The ghci
interpreter evaluates expressions interactively.
206 |
Try it out:
207 |
2 + 2
208 |
123456781234567812345678 * 87654321876543
209 |
"foo" ++ "bar"
210 |
(That ++
is the "append" operator.)
211 |
212 |
213 |
214 |
215 |
Directives
216 |
All interpreter directives start with a ":
" character.
217 |
Let's load our source file into ghci
:
218 |
:load Hello.hs
219 |
220 |
Now the ghci
prompt changes:
221 |
*Main>
222 |
223 |
224 |
225 |
226 |
227 |
Running our code in ghci
228 |
We defined a function named main
, so let's invoke it:
229 |
main
230 |
231 |
Did that work for you?
232 |
What about this?
233 |
putStrLn "hi mom!"
234 |
235 |
236 |
237 |
238 |
A few more useful directives
239 |
Remember, all ghci
directives start with a ":
".
240 |
241 | :help
tells us what directives are available.
242 | :reload
reloads the file we last :load
ed.
243 | :edit
launches our text editor on the file you most recently :load
ed. (Does not automatically :reload
.)
244 | :quit
exits back to the shell.
245 |
246 |
247 |
248 |
249 |
250 |
Final ghci efficiency tips
251 |
We can abbreviate directives:
252 |
253 | :e
is treated as :edit
254 | :r
is :reload
255 |
256 |
We also have command line history and editing.
257 |
258 | On Unix, compatible with readline
.
259 | On Windows, the same as cmd.exe
.
260 |
261 |
262 |
263 |
264 |
265 |
Getting used to the cycle
266 |
Use :edit
or your text editor to change the "hello" text.
267 |
Use :reload
to reload the source file.
268 |
Test out your redefinition of main
.
269 |
270 | - For practice, hit the "up arrow" key to cycle through your command history until you get back to the last time you typed
main
.
271 |
272 |
273 |
274 |
275 |
276 |
Lists and strings
277 |
[1,2,3,4]
278 |
['h','e','l','l','o']
279 |
Double quotes are just syntactic sugar for the longer form:
280 |
"hello"
281 |
What does this print?
282 |
"foo" == ['f','o','o']
283 |
284 |
285 |
286 |
287 |
Calling functions: 1
288 |
We use white space to separate a function from its argument:
289 |
head "foo"
290 |
head [1,2,3]
291 |
tail [1,2,3]
292 |
293 |
294 |
295 |
296 |
Calling functions: 2
297 |
If a function takes multiple arguments, we separate them with white space:
298 |
min 3 4
299 |
If an argument is a compound expression, wrap it in parentheses:
300 |
compare (3+5) (2+7)
301 |
max (min 3 4) 5
302 |
303 |
304 |
305 |
306 |
Quick exercises: 1
307 |
Use ghci
as a calculator.
308 |
The **
operator performs exponentiation.
309 |
310 | - If I invest 5 quatloos at 3% compound interest per annum, how many quatloos will I have after 10 years?
311 |
312 |
313 |
314 |
315 |
316 |
Quick exercises: 2
317 |
The notation ['a'..'z']
generates a list from start to end, inclusive.
318 |
The sum
function adds the elements of a list.
319 |
320 | - What is the sum of the numbers between 9 and 250, inclusive, minus 2?
321 |
322 |
323 |
324 |
325 |
326 |
Quick exercises: 3
327 |
The show
function renders a value as a string. Try it!
328 |
show (1 == 2)
329 |
The length
function tells us how many elements are in a list.
330 |
length [1,2,3]
331 |
332 | - How many digits are in the product of all numbers between 0xBE and 0xEF, inclusive?
333 |
334 |
335 |
336 |
337 |
338 |
Defining a function
339 |
It is pretty simple to define a new function.
340 |
Open up your text editor, create a new file with a .hs
extension, and get writing!
341 |
isOdd x = (rem x 2) == 1
342 |
343 | We start with the name of the function.
344 | Next come the names we want to give its parameter(s), separated by white space.
345 | After those come a single =
character, with the body of the function following.
346 |
347 |
Load your source file into ghci
and give isOdd
a try.
348 |
349 |
350 |
351 |
352 |
Making life more interesting
353 |
Now we can define very simple functions, but we're missing some important building blocks for more fun.
354 |
So let's get to it!
355 |
356 |
357 |
358 |
359 |
Conditional execution
360 |
Q: What does the familiar if
look like in Haskell?
361 |
A: Familiar!
362 |
gcd a b = if b == 0
then a
else gcd b (rem a b)
363 |
We have the following elements:
364 |
369 |
370 |
371 |
372 |
373 |
Finally! A tiny bit about types
374 |
The two possible results of an if
expression must have the same type.
375 |
If then
evaluates to a String
, well else
must too!
376 |
For instance, this makes no sense:
377 |
if True
then 3.14
else "wombat"
378 |
We are forbidden from writing ill-typed expressions like this.
379 |
380 |
381 |
382 |
383 |
What about else?
384 |
In imperative languages, we can usually leave out the else
clause after an if
.
385 |
Not so in Haskell.
386 |
Why does this make sense for imperative languages, but not Haskell?
387 |
388 |
389 |
390 |
391 |
A nearly trivial exercise
392 |
Write a function that appends ", world"
to its argument if the argument is "hello"
, or just returns its argument unmodified otherwise.
393 |
394 | - Remember, the "append" function is an operator named
++
.
395 |
396 |
397 |
398 |
399 |
400 |
Lists in Haskell
401 |
We already know what a list looks like in Haskell:
402 |
[1,2,3]
403 |
And of course there's the syntactic sugar for strings:
404 |
"foo" == ['f','o','o']
405 |
But is this everything there is to know?
406 |
407 |
408 |
409 |
410 |
List constructors
411 |
Suppose we want to construct a list from first principles.
412 |
413 | We write the empty list as []
.
414 | Given an existing list, we can add another element to the front of the list using the :
operator.
415 |
416 |
417 |
418 |
419 |
420 |
Type this into ghci
421 |
Add an element to an empty list:
422 |
1 : []
423 |
424 |
425 |
426 |
427 |
From single-element lists onwards
428 |
What about extending that list?
429 |
2 : (1 : [])
430 |
You're probably guessing now that [2,1]
is syntactic sugar for 2:(1:[])
. And you're right!
431 |
What is the result of this expression?
432 |
5 : 8 : [] == [5,8]
433 |
434 |
435 |
436 |
437 |
Constructors
438 |
We refer to []
and :
as constructors, because we use them to construct lists.
439 |
When you create a list, the Haskell runtime has to remember which constructors you used, and where.
440 |
So the value [5,8]
is represented as:
441 |
442 | A :
constructor, with 5
as its first parameter, and as its second ...
443 | Another :
constructor, this time with 8
as its first parameter, and now as its second ...
444 | A []
constructor
445 |
446 |
447 |
448 |
449 |
450 |
What did we see?
451 |
Depending on your background, I bet you're thinking something like this:
452 |
456 |
Right on.
457 |
458 |
459 |
460 |
461 |
Why do we care about constructors?
462 |
Of course Haskell has to remember what a list is constructed of.
463 |
It also lets us inspect a list, to see which constructors were used.
464 |
How do we do this?
465 |
import Data.Char
isCapitalized name
= case name of
(first:rest) -> isUpper first
[] -> False
466 |
467 |
468 |
469 |
470 |
Welcome to the case expression
471 |
A case
expression allows us to inspect a structure to see how it was constructed.
472 |
isCapitalized name
= case name of
[] -> False
(first:rest) -> isUpper first
473 |
474 | In between case
and of
is the expression we are inspecting.
475 | If the constructor used was the empty-list constructor []
, then clearly the name
we're inspecting is empty, hence not capitalized.
476 |
477 |
If the constructor used was the "add to the front" :
operator, then things get more interesting.
478 |
479 | Whatever was the first parameter of the :
constructor is bound to the name first
.
480 | The second parameter of the :
constructor (i.e. everything in the list after the first element) is bound to the name rest
.
481 | The expression following the ->
is evaluated with these values.
482 |
483 |
484 |
485 |
486 |
487 |
Pattern matching
488 |
The case
expression performs what we call pattern matching.
489 |
490 | Patterns are checked from top to bottom.
491 | As soon as a match is found, its right hand side (after the ->
) is used as the result of the entire case
expression.
492 | If no match succeeds, an exception is thrown.
493 |
494 |
495 |
496 |
497 |
498 |
A worked example
499 |
Let's step through the machinery of what happens if we evaluate this expression.
500 |
isCapitalized "Ann"
501 |
502 |
503 |
504 |
505 |
Whew! A few exercises!
506 |
Finally! We can write slightly more complex functions.
507 |
Now that you can inspect the front of a list, you should be able to process an entire list recursively.
508 |
First, please write a function named myLength
that computes the number of elements in a list.
509 |
Next, write a function named countCaps
that calculates the number of capital letters in a string.
510 |
countCaps "Monkey Butter" == 2
511 |
512 |
513 |
514 |
515 |
Counting capital letters
516 |
Wow, that countCaps function was a pain, right?
517 |
Here's my definition that uses only the machinery we've learned so far:
518 |
countCaps string =
case string of
[] -> 0
(x:xs) -> if isUpper x
then 1 + countCaps xs
else countCaps xs
519 |
520 |
521 |
522 |
523 |
Huh.
524 |
I thought Haskell was all about concision!?
525 |
526 |
527 |
528 |
529 |
Conciseness 1: top-level pattern matching
530 |
countCaps [] = 0
countCaps (x:xs) =
if isUpper x
then 1 + countCaps xs
else countCaps xs
531 |
We can define a function as a series of equations, each containing a pattern match.
532 |
This is nice syntactic sugar for case
.
533 |
534 |
535 |
536 |
537 |
Conciseness 2: guards
538 |
countCaps [] = 0
countCaps (x:xs)
| isUpper x = 1 + countCaps xs
| otherwise = countCaps xs
539 |
After each |
is a guard.
540 |
541 | If a pattern matches, we evaluate each Boolean guard expression from top to bottom.
542 | When one succeeds, we evaluate the RHS as the body of the function.
543 |
544 |
(Yes, patterns in a case
can have guards too.)
545 |
546 |
547 |
548 |
549 |
Before
550 |
Like the original version, but with use of case
stripped out:
551 |
countCaps xs =
if null xs
then 0
else if isUpper (head xs)
then 1 + countCaps (tail xs)
else countCaps (tail xs)
552 |
553 |
554 |
555 |
556 |
After
557 |
Both shorter and easier to follow:
558 |
countCaps [] = 0
countCaps (x:xs)
| isUpper x = 1 + countCaps xs
| otherwise = countCaps xs
559 |
560 |
561 |
562 |
563 |
Another approach
564 |
Write a new version of countCaps
as follows:
565 |
566 | Write a function that goes through a list, and which generates a new list that contains only its capital letters.
567 | Use length
to count the number of elements.
568 |
569 |
This should give the same result as your first function. Right?
570 |
571 |
572 |
573 |
574 |
A change of specification
575 |
Suppose we want to count the number of lowercase letters in a string.
576 |
This seems almost the same as our function for counting uppercase letters.
577 |
What can we do with this observation?
578 |
579 |
580 |
581 |
582 |
Higher order functions
583 |
Higher order function: a function that accepts another function as a parameter.
584 |
filter pred [] = []
filter pred (x:xs)
| pred x = x : filter pred xs
| otherwise = filter pred xs
585 |
How can we use this to define countLowerCase
?
586 |
587 |
588 |
589 |
590 |
Data in, data out
591 |
By now, we've seen several definitions like this:
592 |
countLowerCase string =
length (filter isLower string)
593 |
This is a recurring pattern:
594 |
595 | A function of one argument
596 | It's being fed the result of ...
597 | ... another function of one argument
598 |
599 |
600 |
601 |
602 |
603 |
Function composition
604 |
Haskell doesn't limit us to giving functions alphanumeric names.
605 |
Here, we define a function named simply ".
", which we can use as an operator:
606 |
(f . g) x = f (g x)
607 |
How to use this?
608 |
countLowerCase = length . filter isLower
609 |
610 |
611 |
612 |
613 |
Understanding composition
614 |
If that seemed hard to follow, let's make it clearer.
615 |
We'll plug the arguments into the RHS of our function definition:
616 |
(f . g) x = f (g x)
617 |
We had length
as the first argument to ".
", and filter isLower
as the second:
618 |
(length . filter isLower) x
= length (filter isLower x)
619 |
620 |
621 |
622 |
623 |
Local variables
624 |
Inside an expression, we can introduce new variables using let
.
625 |
let x = 2
y = 4
in x + y
626 |
630 |
631 |
632 |
633 |
634 |
White space
635 |
Haskell is sensitive to white space!
636 |
637 | A top-level definition starts in the leftmost column.
638 | After the beginning of a definition, if the next non-empty line is indented further, it is treated as a continuation of that definition.
639 | Never use tabs in your source files.
640 |
641 |
642 |
643 |
644 |
645 |
White space and local variables
646 |
If you're defining local variables, they must start in the same column.
647 |
This is good:
648 |
let x = 2
y = 4
in x + y
649 |
But this will lead to a compiler error:
650 |
let x = 2
y = 4
in x + y
651 |
652 |
653 |
654 |
655 |
Composition exercise
656 |
Using function composition wherever you can, write a function that accepts a string and returns a new string containing only the words that begin with vowels.
657 |
658 | - You'll want to play with the
words
and unwords
functions before you start.
659 |
660 |
Example:
661 |
disemvowel "I think, therefore I am."
== "I I am."
662 |
663 |
664 |
665 |
666 |
My solution
667 |
Here's how I wrote disemvowel
:
668 |
disemvowel =
let isVowel c = toLower c `elem` "aeiou"
in unwords . filter (isVowel . head) . words
669 |
Does this remind you of a Unix shell pipeline, only right-to-left?
670 |
671 |
672 |
673 |
674 |
Problem definition, once again
675 |
Given a web site, we want to scrape it and find important web pages.
676 |
We're now Haskell experts, right?
677 |
678 | - Download one web page
679 |
680 |
681 |
682 |
683 |
684 |
Let's download a web page!
685 |
We'd really like to rely on a library to download a web page for us.
686 |
At times like this, there's a very handy central repository of open source Haskell software:
687 |
691 |
Go there now!
692 |
Click on the Packages link at the top of the page to browse packages.
693 |
Alas, the list is overwhelmingly long, but we can find libraries for all kinds of tasks if we're patient.
694 |
Are we patient?
695 |
696 |
697 |
698 |
699 |
Ugh!
700 |
Scrolling through thousands of libraries is hard - surely there's a better way?
701 |
Enter the cabal
command!
702 |
Run this command in a terminal window:
703 |
cabal update
704 |
705 |
This downloads the latest index of all software on Hackage.
706 |
With the index updated, we can search it:
707 |
cabal list http
708 |
709 |
That still gives us 20+ packages to comb through, but at least it's better than the 3,400 on the Packages web page.
710 |
711 |
712 |
713 |
714 |
Short-cutting the search
715 |
The best HTTP client library is named http-enumerator
.
716 |
We can read about it online:
717 |
720 |
That landing page for a package is intimidating, but look towards the bottom, at the section labeled "Modules".
721 |
What do you see?
722 |
723 |
724 |
725 |
726 |
Installing a package
727 |
Before we can use http-enumerator
, we must install it.
728 |
To install the http-enumerator
package, we just issue a single command:
729 |
cabal install http-enumerator
730 |
731 |
This command figures out all the other libraries that http-enumerator
depends on, and downloads, compiles, and installs the whole lot.
732 |
Expect it to take a few minutes and print a lot of output.
733 |
734 |
735 |
736 |
737 |
Reading docs: packages and modules
738 |
While we're waiting for the http-enumerator
package and all of its dependencies to install, let's try to figure out how we should use it.
739 |
Remember the link to API documentation at the end of the package's web page? Click through to the API docs.
740 |
An API page begins with a title that looks something like this:
741 |
Network.HTTP.Enumerator
742 |
743 |
This is the name of a module.
744 |
A module is a collection of related code.
745 |
A package is a collection of related modules.
746 |
(This will sound familiar if you know Python.)
747 |
748 |
749 |
750 |
751 |
Reading docs: the rest
752 |
After the initial blurb, a module's docs consist of type signatures and descriptions.
753 |
Here is a really simple type signature:
754 |
foo :: String
755 |
756 |
How the heck do we read this?
757 |
The name of the thing being defined comes before the ::
characters.
758 |
Its type follows after the ::
.
759 |
This means "the value named foo
has the type String
".
760 |
761 |
762 |
763 |
764 |
Haskell's type system
765 |
Up until now, we have not bothered talking about types or type signatures.
766 |
Every expression and value in Haskell has a single type.
767 |
Those types can almost always be inferred automatically by the compiler or interpreter.
768 |
769 |
770 |
771 |
772 |
The most common basic types
773 |
774 | Bool
775 | Int
776 | Char
777 | Double
778 |
779 |
780 |
781 |
782 |
783 |
A function signature
784 |
Here's another type signature:
785 |
words :: String -> [String]
786 |
787 |
Here we see a new symbol, ->
, which means "this is a function".
788 |
The type after the last ->
is the return type of the function.
789 |
All of its predecessors are argument types.
790 |
So this is a function that takes one String
argument, and returns... what?
791 |
792 |
793 |
794 |
795 |
List notation
796 |
The notation [a]
means "a list of values, all of some type a
".
797 |
So [String]
means "a list of values, all of type String
".
798 |
799 |
800 |
801 |
802 |
Type synonyms
803 |
What's a String
?
804 |
805 | - It's not special, just a synonym for
[Char]
, i.e. "a list of Char
".
806 |
807 |
We can introduce new synonyms of our own.
808 |
type Dollars = Int
809 |
A type synonym can be handy for documenting an intended use for an existing type.
810 |
811 |
812 |
813 |
814 |
Words
815 |
words :: String -> [String]
816 |
817 |
We can now read that this function accepts a string as argument, and returns a list of strings.
818 |
From reading its name and type signature, can you guess what words
might do?
819 |
820 |
821 |
822 |
823 |
Another signature
824 |
Tell me about this signature:
825 |
mystery :: [String] -> String
826 |
827 |
What are some reasonable possible behaviours for this function?
828 |
829 |
830 |
831 |
832 |
Reading real-world docs
833 |
Here is the very first signature from http-enumerator
:
834 |
simpleHttp
835 | :: (MonadIO m, Failure HttpException m) =>
836 | String -> m ByteString
837 |
838 |
This is more complex! How the heck do we read it?
839 |
The bits between ::
and '=>' are constraints on where we can use simpleHttp
- but let's ignore constraints for now.
840 |
841 | - Important: it's often safe to gloss over things we don't (yet) understand.
842 |
843 |
We'll also ignore that mysterious lowercase m
for a bit.
844 |
What can we tell about this function?
845 |
846 |
847 |
848 |
849 |
ByteString
850 |
A ByteString
is a blob of binary data.
851 |
Unlike String
, it is not represented as a list, but as a packed array.
852 |
However, it contains binary bytes, not text!
853 |
854 | - Don't use
ByteString
for working with data that you have to manipulate as text.
855 |
856 |
857 |
858 |
859 |
860 |
Let's play in ghci!
861 |
Does everyone have http-enumerator
installed now?
862 |
Fire up ghci
, and let's play with the module:
863 |
import Network.HTTP.Enumerator
864 |
865 |
Notice that after we type this, the prompt changes:
866 |
Prelude Network.HTTP.Enumerator>
867 |
868 |
This tells us that the module has loaded and is available.
869 |
870 |
871 |
872 |
873 |
Wait! Are you on Windows?
874 |
On Windows, we have to set up Winsock before any networking will work.
875 |
First, let's load the lowest-level networking module:
876 |
import Network.Socket
877 |
878 |
And here's how we initialize Winsock:
879 |
withSocketsDo (return ())
880 |
881 |
(It's harmless to do this on Unix.)
882 |
883 |
884 |
885 |
886 |
With that out of the way ...
887 |
Finally - let's load a web page!
888 |
simpleHttp "http://example.com/"
889 |
890 |
Did that just print a ton of HTML in the terminal window? All right!
891 |
892 |
893 |
894 |
895 |
From binary to text
896 |
Now we have a ByteString
, which we need to turn into text for manipulating.
897 |
Let's cheat, and assume that all web pages are encoded in UTF-8.
898 |
899 |
900 |
901 |
902 |
Pure code
903 |
So far, all of the code we have written has been "pure".
904 |
905 | The behaviour of all of our functions has depended only on their inputs.
906 | All of our data is immutable.
907 | There's thus no way to change a global variable and modify the behaviour of a function.
908 |
909 |
910 |
911 |
912 |
913 |
Impure code
914 |
And yet ... somehow we downloaded a web page!
915 |
916 | - Web pages clearly are not pure.
917 |
918 |
So can we write code like this?
919 |
length (simpleHttp "http://x.org/")
920 |
NO.
921 |
The type system segregates code that must be pure from code that may have side effects ("impure" code).
922 |
923 |
924 |
925 |
926 |
Are we stuck?
927 |
Well, let's look at a simpler example than simpleHttp
.
928 |
Type this in ghci
:
929 |
:type readFile
930 |
931 |
This will tell us what the type of readFile
is.
932 |
933 |
934 |
935 |
936 |
IO
937 |
The :type
directive should print something like this:
938 |
readFile :: FilePath -> IO String
939 |
Notice that IO
on the result type?
940 |
It means "this function may have side effects".
941 |
We often refer to impure functions, with IO
in the result type, as actions.
942 |
943 | - This helps to distinguish them from pure functions.
944 |
945 |
946 |
947 |
948 |
949 |
Mixing IO and other stuff
950 |
The type system keeps track of which functions have IO
in their types, and keeps us honest.
951 |
We can still mix pure and impure code in a natural way:
952 |
charCount fileName = do
contents <- readFile fileName
return (length contents)
953 |
954 |
955 |
956 |
957 |
"do" notation
958 |
Critical to what we just saw was the do
keyword at the beginning of the function definition.
959 |
This introduces a series of IO
actions, one per line.
960 |
961 |
962 |
963 |
964 |
Capturing the results of impure code
965 |
To capture the result of an IO
action, we use <-
instead of =
.
966 |
contents <- readFile fileName
967 |
The result (contents
) is pure - it does not have the IO
type.
968 |
This is how we supply pure code with data returned from impure code.
969 |
970 |
971 |
972 |
973 |
The "return" action
974 |
This is not the return
type you're used to!
975 |
It takes a pure value (without IO
in its type), and wraps it with the IO
type.
976 |
Pure code can't call impure code, but it can thread data back into the impure world using return
.
977 |
978 |
979 |
980 |
981 |
Haskell programs and IO
982 |
When you write a Haskell program, its entry point must be named main
.
983 |
The type of main
must be:
984 |
main :: IO ()
985 |
()
is named "unit", and means more or less the same thing as void
in C or Java.
986 |
What this means is that all Haskell programs are impure!
987 |
988 |
989 |
990 |
991 |
Binary to text
992 |
Remember we were planning to cheat earlier?
993 |
We had this:
994 |
simpleHttp :: String -> IO ByteString
995 |
We need something whose result is an IO String
instead.
996 |
How should that look?
997 |
998 |
999 |
1000 |
1001 |
UTF-8 conversion
1002 |
To do the conversion, let's grab a package named utf8-string
.
1003 |
cabal install utf8-string
1004 |
1005 |
That package contains a module named Data.ByteString.Lazy.UTF8
.
1006 |
import Data.ByteString.Lazy.UTF8
1007 |
It defines a function named toString
:
1008 |
toString :: ByteString -> String
1009 |
1010 |
1011 |
1012 |
1013 |
UTF-8 conversion exercise
1014 |
Write an action that downloads a URL and converts it from a ByteString
to a String
using toString
.
1015 |
Write a type signature for the action.
1016 |
1017 | Haskell definitions usually don't require type signatures.
1018 | Nevertheless, we write them for documentation on almost all top-level definitions.
1019 |
1020 |
1021 |
1022 |
1023 |
1024 |
Downloading and saving a web page
1025 |
Use your download
function (the action you just wrote) to save a local copy of a web page.
1026 |
saveAs :: String -> Int -> IO ()
1027 |
For simplicity, let's save the local files as names containing numbers:
1028 |
makeFileName :: Int -> FilePath
makeFileName k = "download-" ++ show k ++ ".html"
1029 |
To save a local copy of a file, you'll need the writeFile
action.
1030 |
1031 |
1032 |
1033 |
1034 |
Shoveling through HTML
1035 |
Two truisms:
1036 |
1040 |
So! Let's use another library.
1041 |
cabal install tagsoup
1042 |
1043 |
The tagsoup
package can parse arbitrarily messy HTML.
1044 |
It will feed us a list of events, like a SAX parser.
1045 |
1046 |
1047 |
1048 |
1049 |
Dealing with problems
1050 |
Try this:
1051 |
head [1]
1052 |
Now try this:
1053 |
head []
1054 |
1055 |
1056 |
1057 |
1058 |
Oops
1059 |
If we pass an empty list, the head
function throws an exception.
1060 |
Suppose we need a version of head
that will not throw an exception.
1061 |
safeHead :: [a] -> ????
1062 |
What should the ????
be?
1063 |
Let's invent something.
1064 |
safeHead (x:xs) = Some x
safeHead [] = None
1065 |
1066 |
1067 |
1068 |
1069 |
Some? None?
1070 |
1074 |
To bring these constructors into existence, we need to declare a new type.
1075 |
data Perhaps a = Some a
| None
1076 |
The |
character separates the constructors. We can read it as:
1077 |
1078 | The Perhaps
type has two constructors:
1079 | Some
followed by a single argument
1080 | or None
with no arguments
1081 |
1082 |
1083 |
1084 |
1085 |
1086 |
Maybe
1087 |
Actually, Haskell already has a built-in equivalent of our Perhaps
type, named Maybe.
1088 |
data Maybe a = Just a
| Nothing
1089 |
The a
is a type parameter, meaning that when we write this type, we have to supply another type as a parameter:
1090 |
1091 | Maybe Int
1092 | Maybe String
1093 |
1094 |
1095 |
1096 |
1097 |
1098 |
Using constructors
1099 |
If we want to construct a Maybe Int
using the Just
constructor, we must pass it an Int
.
1100 |
Just 1 :: Maybe Int
Nothing :: Maybe Int
1101 |
This will not work, because the types don't match:
1102 |
Just [1] :: Maybe String
1103 |
1104 |
1105 |
1106 |
1107 |
Pattern matching over constructors
1108 |
We can pattern match over the constructors for Maybe
just as we did for lists.
1109 |
case foo of
Just x -> x
Nothing -> bar
1110 |
1111 |
1112 |
1113 |
1114 |
Tags
1115 |
The tagsoup
package defines the following type:
1116 |
data Tag = TagOpen String [Attribute]
| TagClose String
| TagText String
| TagComment String
| TagWarning String
| TagPosition Row Column
1117 |
What do you think the constructors mean?
1118 |
1119 |
1120 |
1121 |
1122 |
Pattern matching on a Tag
1123 |
Suppose we want to write a predicate that will tell us if a Tag
is an opening tag.
1124 |
1128 |
1129 |
1130 |
1131 |
1132 |
Don't care!
1133 |
Our first body looked like this:
1134 |
isOpenTag (TagOpen x y) = True
isOpenTag (TagClose x) = False
isOpenTag (TagText x) = False
isOpenTag (TagComment x) = False
isOpenTag (TagWarning x) = False
isOpenTag (TagPosition x y) = False
1135 |
Concise, but ugly.
1136 |
1140 |
1141 |
1142 |
1143 |
1144 |
The wild card pattern
1145 |
We can write "I don't care what this pattern or variable is" using the "_
" character.
1146 |
isOpenTag (TagOpen _ _) = True
isOpenTag _ = False
1147 |
The wild card pattern always matches.
1148 |
1149 | Since we don't care about x
or y
, we can state that explicitly using _
.
1150 | Since we don't care about any constructor except TagOpen
, we can match all the others using _
.
1151 |
1152 |
1153 |
1154 |
1155 |
1156 |
Just a quick question
1157 |
Why don't we write the function like this?
1158 |
isOpenTag _ = False
isOpenTag (TagOpen _ _) = True
1159 |
1160 |
1161 |
1162 |
1163 |
Extracting links from a web page
1164 |
Suppose we have a page in memory already.
1165 |
1166 | Browse the tagsoup
docs, in the Text.HTML.TagSoup
module.
1167 | Find a function that will parse a web page into a series of tags.
1168 |
1169 |
1170 |
1171 |
1172 |
1173 |
Let's use it!
1174 |
processPage url = do
page <- download url
return (parseTags page)
1175 |
1176 |
1177 |
1178 |
1179 |
Tidying tags up
1180 |
Parsed tags can contain a mixture of tag names.
1181 |
<A HREF="...">
1182 |
1183 |
<a hrEF="...">
1184 |
1185 |
1186 | - Find a
tagsoup
function that will turn tag names and attributes to lower case.
1187 |
1188 |
1189 |
1190 |
1191 |
1192 |
Canonical tags
1193 |
Let's use our function to clean up the result of parseTags
.
1194 |
processPage url = do
page <- download url
return
(canonicalizeTags
(parseTags page))
1195 |
1196 |
1197 |
1198 |
1199 |
Extracting links
1200 |
We only care about open tags that are links, so <a>
tags.
1201 |
1202 | How would we write the type of a function that will indicate whether a Tag
is an open tag with the correct name?
1203 | How would we use this function to extract only the open tags from a list of parsed tags?
1204 |
1205 |
1206 |
1207 |
1208 |
1209 |
Whee!
1210 |
This cascade is getting a bit ridiculous.
1211 |
processPage url = do
page <- download url
return
(filter (isTagOpenName "a")
(canonicalizeTags
(parseTags page)))
1212 |
Two observations:
1213 |
1217 |
1218 |
1219 |
1220 |
1221 |
A rewriting exercise
1222 |
Take this function and split it into pure and impure parts.
1223 |
Write the pure part using function composition.
1224 |
processPage url = do
page <- download url
return
(filter (isTagOpenName "a")
(canonicalizeTags
(parseTags page)))
1225 |
1226 |
1227 |
1228 |
1229 |
My solution
1230 |
processPage url = do
page <- download url
return (process page)
process =
filter (isTagOpenName "a") .
canonicalizeTags .
parseTags
1231 |
1232 |
1233 |
1234 |
1235 |
More stuff to filter out
1236 |
Let's skip nofollow
links.
1237 |
We want to get the "rel"
attribute of a tag.
1238 |
1239 | - Find a function that extracts an attribute from a tag.
1240 |
1241 |
1242 |
1243 |
1244 |
1245 |
No following
1246 |
nofollow tag = fromAttrib "rel" tag == "nofollow"
1247 |
process =
filter (not . nofollow) .
filter (isTagOpenName "a") .
canonicalizeTags .
parseTags
1248 |
1249 |
1250 |
1251 |
1252 |
We have a list of <a> tags
1253 |
How would we extract the "href"
attribute from every element of the list?
1254 |
1255 |
1256 |
1257 |
1258 |
Only non-empty <a href> tags
1259 |
process =
filter (not . null) .
map (fromAttrib "href") .
filter (not . nofollow) .
filter (isTagOpenName "a") .
canonicalizeTags .
parseTags
1260 |
1261 |
1262 |
1263 |
1264 |
Canonical URLs
1265 |
Links can be absolute, relative, or invalid garbage, and we only want valid-looking absolute links.
1266 |
To properly create an absolute link, we need to know the absolute URL of the page we're looking at.
1267 |
canonicalizeLink :: String -> String -> Maybe String
1268 |
1269 |
1270 |
1271 |
1272 |
Working with URIs
1273 |
The Network.URI
module contains some functions we might find handy.
1274 |
parseURI :: String -> Maybe URI
parseURIReference :: String -> Maybe URI
uriToString id "" :: URI -> String
nonStrictRelativeTo :: URI -> URI -> Maybe URI
1275 |
1276 |
1277 |
1278 |
1279 |
A monster of indentation
1280 |
This is really hard to read!
1281 |
import Network.URI
canon :: String -> String -> Maybe String
canon referer path =
case parseURI referer of
Nothing -> Nothing
Just r ->
case parseURIReference path of
Nothing -> Nothing
Just p ->
case nonStrictRelativeTo p r of
Nothing -> Nothing
Just u ->
Just (uriToString id u "")
1282 |
Surely there's a better way.
1283 |
1284 |
1285 |
1286 |
1287 |
Stair stepping
1288 |
Notice that that function was a series of case
inspections of Maybe
values?
1289 |
Suppose we had a function that accepted a normal value, and returned a Maybe
value.
1290 |
a -> Maybe b
1291 |
And suppose we had a concise syntax for writing an anonymous function.
1292 |
\a -> "hi mom! " ++ a
1293 |
The \
is pronounced "lambda".
1294 |
1295 |
1296 |
1297 |
1298 |
Observation
1299 |
The case
analysis is quite verbose. Suppose we had a function that performed it, and called another function if our value was Just
.
1300 |
bind :: Maybe a -> (a -> Maybe b) -> Maybe b
bind Nothing _ = Nothing
bind (Just value) action = action value
1301 |
1302 |
1303 |
1304 |
1305 |
Using bind
1306 |
How could we use this?
1307 |
canon1 referer path =
parseURI referer `bind`
\r -> parseURIReference path `bind`
\p -> nonStrictRelativeTo p r `bind`
\u -> Just (uriToString id u "")
1308 |
If we enclose a function name in backticks, we can use the function as an infix operator.
1309 |
1310 |
1311 |
1312 |
1313 |
Reformatting the code
1314 |
canon referer path =
parseURI referer `bind` \r ->
parseURIReference path `bind` \p ->
nonStrictRelativeTo p r `bind` \u ->
Just (uriToString id u "")
1315 |
1316 |
1317 |
1318 |
1319 |
A built-in name for bind
1320 |
The >>=
operator is a more general version of our bind
function.
1321 |
canon referer path =
parseURI referer >>= \r ->
parseURIReference path >>= \p ->
nonStrictRelativeTo p r >>= \u ->
Just (uriToString id u "")
1322 |
1323 |
1324 |
1325 |
1326 |
Using syntactic sugar
1327 |
Here's some tidier syntax that should look familiar.
1328 |
canonicalize :: String -> String -> Maybe String
canonicalize referer path = do
r <- parseURI referer
p <- parseURIReference path
u <- nonStrictRelativeTo p r
return (uriToString id u "")
1329 |
1330 |
1331 |
1332 |
1333 |
Nearly there
1334 |
process url =
map (canonicalize url) .
filter (not . null) .
map (fromAttrib "href") .
filter (\t -> fromAttrib "rel" t /= "nofollow") .
filter (isTagOpenName "a") .
canonicalizeTags .
parseTags
1335 |
One awkward thing: what is the type of this function?
1336 |
1337 |
1338 |
1339 |
1340 |
From [Maybe a] to [a]
1341 |
Go to this web site:
1342 |
1345 |
Type this into the search box:
1346 |
[Maybe a] -> [a]
1347 |
What does the first result say?
1348 |
1349 |
1350 |
1351 |
1352 |
We're there!
1353 |
import Data.Maybe
import Network.URI
links url =
catMaybes .
map (canonicalize url) .
filter (not . null) .
map (fromAttrib "href") .
filter (\t -> fromAttrib "rel" t /= "nofollow") .
filter (isTagOpenName "a") .
canonicalizeTags .
parseTags
1354 |
1355 |
1356 |
1357 |
1358 |
From links to spidering
1359 |
If we can download the links from one page, we can easily write a spider to follow those links.
1360 |
To keep things simple, let's set a limit on the number of pages we'll download.
1361 |
What information do we want to generate?
1362 |
What do we need to track along the way?
1363 |
1364 |
1365 |
1366 |
1367 |
What we need to track
1368 |
Here's the state we need to maintain:
1369 |
1370 | The number of pages we have downloaded
1371 | A collection of pages we have seen links to, but haven't downloaded
1372 | A collection of pages and their outbound links
1373 |
1374 |
1375 |
1376 |
1377 |
1378 |
Tracking what we've seen
1379 |
For any given page, we need to remember both it and all the pages it links to.
1380 |
One possibility for associating the two is a tuple:
1381 |
("http://x.org/", ["http://microsoft.com/"])
1382 |
Tuples are useful any time we want mixed-type data without the hassle of creating a new type.
1383 |
Speaking of a new type, here's how we'd define one:
1384 |
data Link = Link String [String]
-- Let's define some accessors, too.
linkFrom (Link url _) = url
linkTo (Link _ links) = links
1385 |
1386 |
1387 |
1388 |
1389 |
Avoiding duplication
1390 |
We don't want to visit any URL twice.
1391 |
How do we avoid this?
1392 |
visited url = elem url . map linkTo
1393 |
This function has a problem - what is that problem?
1394 |
1395 |
1396 |
1397 |
1398 |
Better performance
1399 |
We really want a structure with a fast lookup operation.
1400 |
What would you use in your language?
1401 |
1402 |
1403 |
1404 |
1405 |
Maps and importing
1406 |
In Haskell, we have mutable hash tables, but we don't use them.
1407 |
Instead, we use immutable key-value maps.
1408 |
We must perform fancy module importing tricks because the Data.Map
module defines a lot of names that would otherwise overlap with built-in names.
1409 |
This means "only import the name Map
from Data.Map
":
1410 |
import Data.Map (Map)
1411 |
And this means "import everything from Data.Map
, but all those names must be prefixed with Map.
":
1412 |
import qualified Data.Map as Map
1413 |
1414 |
1415 |
1416 |
1417 |
What use is an immutable data structure?
1418 |
Everyone knows how to add a key and value to a hash table, right?
1419 |
And that seems like a fundamental operation.
1420 |
What do we do with maps?
1421 |
1422 | - Create a new map that is identical to the one we supply, with the requested element added.
1423 |
1424 |
How can this possibly work? Is it efficient?
1425 |
1426 |
1427 |
1428 |
1429 |
A fistful of dollars
1430 |
Here's a surprisingly handy built-in operator:
1431 |
f $ x = f x
1432 |
Why is this useful? Because it lets us eliminate parentheses.
1433 |
Before:
1434 |
explode k = error ("failed on " ++ show k)
1435 |
After:
1436 |
explode k = error $ "failed on " ++ show k
1437 |
1438 |
1439 |
1440 |
1441 |
Partial application
1442 |
This is annoying to write:
1443 |
increment k = 1 + k
1444 |
Almost as bad:
1445 |
\k -> 1 + k
1446 |
Much handier, and identical:
1447 |
(1+)
1448 |
In fact, this is valid:
1449 |
increment = (1+)
1450 |
1451 |
1452 |
1453 |
1454 |
Spidering, in all its glory
1455 |
spider :: Int -> URL -> IO (Map URL [URL])
spider count url0 = go 0 Map.empty (Set.singleton url0)
where
go k seen queue0
| k >= count = return seen
| otherwise =
case Set.minView queue0 of
Nothing -> return seen
Just (url, queue) -> do
page <- download url
let ls = links url page
newSeen = Map.insert url ls seen
notSeen = Set.fromList .
filter (`Map.notMember` newSeen) $ ls
newQueue = queue `Set.union` notSeen
go (k+1) newSeen newQueue
1456 |
1457 |
1458 |
1459 |
1460 |
Where do we stand?
1461 |
We can now:
1462 |
1467 |
What remains?
1468 |
1472 |
1473 |
1474 |
1475 |
1476 |
Fin
1477 |
At this point, if we have miraculously not run out of time, we're going on a choose-your-own-adventure session in Emacs.
1478 |
Thanks for sticking with the slideshow so far!
1479 |
1480 |
1481 |
1482 |
--------------------------------------------------------------------------------
/slides/slides.md:
--------------------------------------------------------------------------------
1 | % Haskell: Functional Programming, Solid Code, Big Data
2 | % Bryan O'Sullivan
3 | % 2011-09-18
4 |
5 | # Welcome!
6 |
7 | ~~~~ {.haskell}
8 | main = putStrLn "hello!"
9 | ~~~~
10 |
11 | * My name is Bryan O'Sullivan
12 |
13 | * I started using Haskell in 1993
14 |
15 | * I wrote a book about it, "Real World Haskell"
16 |
17 | * [realworldhaskell.org](http://book.realworldhaskell.org/)
18 |
19 | * I write lots of open source code
20 |
21 | * [github.com/bos](https://github.com/bos)
22 |
23 | * My company invests heavily and openly in Haskell
24 |
25 | * [github.com/mailrank](https://github.com/mailrank)
26 |
27 |
28 | # Copy these slides (if you want)
29 |
30 | ~~~~
31 | git clone https://github.com/bos/strange-loop-2011
32 | ~~~~
33 |
34 |
35 | # My Haskell background
36 |
37 | * Began using Haskell at university
38 |
39 | * When I graduated in 1995, I switched to more mainstream languages
40 |
41 | * C, C++, Java, Python, Perl, etc.
42 |
43 | * My interest reawakened around 2005
44 |
45 | * I've found Haskell to be a great general-purpose language
46 |
47 | * The community of people is amazing
48 |
49 |
50 | # What to expect 1
51 |
52 | * Haskell is a fairly big language
53 |
54 | * Since so much is unfamiliar to newcomers, expect to wander far from
55 | your comfort zone
56 |
57 | * I'm going to teach you *interesting* things, but not *everything*
58 |
59 |
60 | # What to expect 2
61 |
62 | * This is a *hands-on* workshop: you'll be writing code!
63 |
64 | * There will be a short break every 45 minutes
65 |
66 | * Don't be afraid to ask questions!
67 |
68 |
69 | # Your tools
70 |
71 | * You've already installed the Haskell Platform, right?
72 |
73 | * [hackage.haskell.org/platform](http://hackage.haskell.org/platform/)
74 |
75 | * This gives us a great toolchain
76 |
77 | * The GHC compiler (`ghc`)
78 |
79 | * The GHCi interpreter (`ghci`)
80 |
81 | * The Cabal package manager (`cabal`)
82 |
83 | * Some handy libraries and tools
84 |
85 |
86 | # What else is needed?
87 |
88 | * A text editor
89 |
90 | * A terminal window
91 |
92 |
93 | # Problem definition
94 |
95 | Given a web site, we want to scrape it and find important web pages.
96 |
97 | This involves a lot of figuring stuff out!
98 |
99 | 1. Learn Haskell
100 |
101 | 1. Download one web page
102 |
103 | 1. Extract links from a page, so we can find more pages to download
104 |
105 | 1. Once we're done, compute which ones are important
106 |
107 | 1. Make it all fast?
108 |
109 |
110 | # Let's get started!
111 |
112 | Create a file named `Hello.hs` and give it the following contents:
113 |
114 | ~~~~ {.haskell}
115 | main = putStrLn "hello, world!"
116 | ~~~~
117 |
118 | The suffix `.hs` is the standard for Haskell source files.
119 |
120 | File names start with a capital letter, and everyone uses `CamelCase`.
121 |
122 |
123 | # Building it
124 |
125 | This command will look for `Hello.hs` in the current directory, and
126 | compile it:
127 |
128 | ~~~~
129 | ghc --make Hello
130 | ~~~~
131 |
132 | The generated executable will be named `Hello` (`Hello.exe` on
133 | Windows).
134 |
135 | * That `--make` option up there tells GHC to automatically deal with
136 | dependencies on source files and packages.
137 |
138 |
139 | # Checking in
140 |
141 | Is everyone able to build and run their `Hello` executable?
142 |
143 |
144 | # Something a little more convenient
145 |
146 | It's nice to have fast, native code at our fingertips.
147 |
148 | But when *I'm* working, I expect a few things:
149 |
150 | * I do lots of exploration.
151 |
152 | * I make tons of mistakes.
153 |
154 | For these circumstances, a full compiler is a bit slow.
155 |
156 | Instead, I often use the interactive interpreter, `ghci`.
157 |
158 |
159 | # Let's start GHCi
160 |
161 | Easily done:
162 |
163 | ~~~~
164 | ghci
165 | ~~~~
166 |
167 | It will display a startup banner, followed by a prompt:
168 |
169 | ~~~~
170 | Prelude>
171 | ~~~~
172 |
173 | This default prompt tells us which modules are available to play with.
174 |
175 |
176 | # Play around
177 |
178 | The `ghci` interpreter evaluates expressions interactively.
179 |
180 | Try it out:
181 |
182 | ~~~~ {.haskell}
183 | 2 + 2
184 | ~~~~
185 |
186 | ~~~~ {.haskell}
187 | 123456781234567812345678 * 87654321876543
188 | ~~~~
189 |
190 | ~~~~ {.haskell}
191 | "foo" ++ "bar"
192 | ~~~~
193 |
194 | (That `++` is the "append" operator.)
195 |
196 |
197 | # Directives
198 |
199 | All interpreter directives start with a "`:`" character.
200 |
201 | Let's load our source file into `ghci`:
202 |
203 | ~~~~
204 | :load Hello.hs
205 | ~~~~
206 |
207 | Now the `ghci` prompt changes:
208 |
209 | ~~~~
210 | *Main>
211 | ~~~~
212 |
213 |
214 | # Running our code in ghci
215 |
216 | We defined a function named `main`, so let's invoke it:
217 |
218 | ~~~~
219 | main
220 | ~~~~
221 |
222 | Did that work for you?
223 |
224 | What about this?
225 |
226 | ~~~~ {.haskell}
227 | putStrLn "hi mom!"
228 | ~~~~
229 |
230 |
231 | # A few more useful directives
232 |
233 | Remember, all `ghci` directives start with a "`:`".
234 |
235 | * `:help` tells us what directives are available.
236 |
237 | * `:reload` reloads the file we last `:load`ed.
238 |
239 | * `:edit` launches our text editor on the file we most recently
240 | `:load`ed. (Does *not* automatically `:reload`.)
241 |
242 | * `:quit` exits back to the shell.
243 |
244 |
245 | # Final ghci efficiency tips
246 |
247 | We can abbreviate directives:
248 |
249 | * `:e` is treated as `:edit`
250 |
251 | * `:r` is `:reload`
252 |
253 | We also have command line history and editing.
254 |
255 | * On Unix, compatible with `readline`.
256 |
257 | * On Windows, the same as `cmd.exe`.
258 |
259 |
260 | # Getting used to the cycle
261 |
262 | Use `:edit` or your text editor to change the "hello" text.
263 |
264 | Use `:reload` to reload the source file.
265 |
266 | Test out your redefinition of `main`.
267 |
268 | * For practice, hit the "up arrow" key to cycle through your command
269 | history until you get back to the last time you typed `main`.
270 |
271 |
272 | # Lists and strings
273 |
274 | ~~~~ {.haskell}
275 | [1,2,3,4]
276 | ~~~~
277 |
278 | ~~~~ {.haskell}
279 | ['h','e','l','l','o']
280 | ~~~~
281 |
282 | Double quotes are just syntactic sugar for the longer form:
283 |
284 | ~~~~ {.haskell}
285 | "hello"
286 | ~~~~
287 |
288 | What does this print?
289 |
290 | ~~~~ {.haskell}
291 | "foo" == ['f','o','o']
292 | ~~~~
293 |
294 |
295 | # Calling functions: 1
296 |
297 | We use white space to separate a function from its argument:
298 |
299 | ~~~~ {.haskell}
300 | head "foo"
301 | ~~~~
302 |
303 | ~~~~ {.haskell}
304 | head [1,2,3]
305 | ~~~~
306 |
307 | ~~~~ {.haskell}
308 | tail [1,2,3]
309 | ~~~~
310 |
311 |
312 | # Calling functions: 2
313 |
314 | If a function takes multiple arguments, we separate them with white
315 | space:
316 |
317 | ~~~~ {.haskell}
318 | min 3 4
319 | ~~~~
320 |
321 | If an argument is a compound expression, wrap it in parentheses:
322 |
323 | ~~~~ {.haskell}
324 | compare (3+5) (2+7)
325 | ~~~~
326 |
327 | ~~~~ {.haskell}
328 | max (min 3 4) 5
329 | ~~~~
330 |
331 |
332 | # Quick exercises: 1
333 |
334 | Use `ghci` as a calculator.
335 |
336 | The `**` operator performs exponentiation.
337 |
338 | * If I invest 5 quatloos at 3% compound interest per annum, how many
339 | quatloos will I have after 10 years?
340 |
341 |
342 | # Quick exercises: 2
343 |
344 | The notation `['a'..'z']` generates a list from start to end,
345 | inclusive.
346 |
347 | The `sum` function adds the elements of a list.
348 |
349 | * What is the sum of the numbers between 9 and 250, inclusive, *minus* 2?
350 |
351 |
352 | # Quick exercises: 3
353 |
354 | The `show` function renders a value as a string. Try it!
355 |
356 | ~~~~ {.haskell}
357 | show (1 == 2)
358 | ~~~~
359 |
360 | The `length` function tells us how many elements are in a list.
361 |
362 | ~~~~ {.haskell}
363 | length [1,2,3]
364 | ~~~~
365 |
366 | * How many digits are in the product of all numbers between 0xBE and
367 | 0xEF, inclusive?
368 |
369 |
370 | # Defining a function
371 |
372 | It is pretty simple to define a new function.
373 |
374 | Open up your text editor, create a new file with a `.hs` extension,
375 | and get writing!
376 |
377 | ~~~~ {.haskell}
378 | isOdd x = (rem x 2) == 1
379 | ~~~~
380 |
381 | * We start with the name of the function.
382 |
383 | * Next come the names we want to give its parameter(s), separated by
384 | white space.
385 |
386 | * After those come a single `=` character, with the *body* of the
387 | function following.
388 |
389 | Load your source file into `ghci` and give `isOdd` a try.
390 |
391 |
392 | # Making life more interesting
393 |
394 | Now we can define very simple functions, but we're missing some
395 | important building blocks for more fun.
396 |
397 | So let's get to it!
398 |
399 |
400 | # Conditional execution
401 |
402 | Q: What does the familiar `if` look like in Haskell?
403 |
404 | A: Familiar!
405 |
406 | ~~~~ {.haskell}
407 | gcd a b = if b == 0
408 | then a
409 | else gcd b (rem a b)
410 | ~~~~
411 |
412 | We have the following elements:
413 |
414 | * A Boolean expression
415 |
416 | * `then` an expression that will be the result if the Boolean is
417 | `True`
418 |
419 | * `else` an expression that will be the result if the Boolean is
420 | `False`
421 |
422 |
423 | # Finally! A tiny bit about types
424 |
425 | The two possible results of an `if` expression must have the same
426 | type.
427 |
428 | If `then` evaluates to a `String`, well `else` must too!
429 |
430 | For instance, this makes no sense:
431 |
432 | ~~~~ {.haskell}
433 | if True
434 | then 3.14
435 | else "wombat"
436 | ~~~~
437 |
438 | We are forbidden from writing ill-typed expressions like this.
439 |
440 |
441 | # What about else?
442 |
443 | In imperative languages, we can usually leave out the `else` clause
444 | after an `if`.
445 |
446 | Not so in Haskell.
447 |
448 | Why does this make sense for imperative languages, but not Haskell?
449 |
450 |
451 | # A nearly trivial exercise
452 |
453 | Write a function that appends `", world"` to its argument if the
454 | argument is `"hello"`, or just returns its argument unmodified
455 | otherwise.
456 |
457 | * Remember, the "append" function is an operator named `++`.
458 |
459 |
460 | # Lists in Haskell
461 |
462 | We already know what a list looks like in Haskell:
463 |
464 | ~~~~ {.haskell}
465 | [1,2,3]
466 | ~~~~
467 |
468 | And of course there's the syntactic sugar for strings:
469 |
470 | ~~~~ {.haskell}
471 | "foo" == ['f','o','o']
472 | ~~~~
473 |
474 | But is this everything there is to know?
475 |
476 |
477 | # List constructors
478 |
479 | Suppose we want to construct a list from first principles.
480 |
481 | * We write the *empty list* as `[]`.
482 |
483 | * Given an existing list, we can add another element to the *front* of
484 | the list using the `:` operator.
485 |
486 |
487 | # Type this into ghci
488 |
489 | Add an element to an empty list:
490 |
491 | ~~~~ {.haskell}
492 | 1 : []
493 | ~~~~
494 |
495 |
496 | # From single-element lists onwards
497 |
498 | What about extending that list?
499 |
500 | ~~~~ {.haskell}
501 | 2 : (1 : [])
502 | ~~~~
503 |
504 | You're probably guessing now that `[2,1]` is syntactic sugar for
505 | `2:(1:[])`. And you're right!
506 |
507 | What is the result of this expression?
508 |
509 | ~~~~ {.haskell}
510 | 5 : 8 : [] == [5,8]
511 | ~~~~
512 |
513 |
514 | # Constructors
515 |
516 | We refer to `[]` and `:` as *constructors*, because we use them to
517 | construct lists.
518 |
519 | When you create a list, the Haskell runtime has to remember which
520 | constructors you used, and where.
521 |
522 | So the value `[5,8]` is represented as:
523 |
524 | * A `:` constructor, with `5` as its first parameter, and as its
525 | second ...
526 |
527 | * Another `:` constructor, this time with `8` as its first parameter,
528 | and now as its second ...
529 |
530 | * A `[]` constructor
531 |
532 |
533 | # What did we see?
534 |
535 | Depending on your background, I bet you're thinking something like
536 | this:
537 |
538 | * "Hey! Haskell lists look like singly linked lists!"
539 |
540 | * "Hey! That looks just like lists built out of `cons` cells in Lisp!"
541 |
542 | Right on.
543 |
544 |
545 | # Why do we care about constructors?
546 |
547 | Of course Haskell has to remember what a list is constructed of.
548 |
549 | It also lets *us* inspect a list, to see which constructors were used.
550 |
551 | How do we do this?
552 |
553 | ~~~~ {.haskell}
554 | import Data.Char
555 |
556 | isCapitalized name
557 | = case name of
558 | (first:rest) -> isUpper first
559 | [] -> False
560 | ~~~~
561 |
562 |
563 | # Welcome to the case expression
564 |
565 | A `case` expression allows us to *inspect* a structure to see how it
566 | was constructed.
567 |
568 | ~~~~ {.haskell}
569 | isCapitalized name
570 | = case name of
571 | [] -> False
572 | (first:rest) -> isUpper first
573 | ~~~~
574 |
575 | * In between `case` and `of` is the expression we are inspecting.
576 |
577 | * If the constructor used was the empty-list constructor `[]`, then
578 | clearly the `name` we're inspecting is empty, hence not capitalized.
579 |
580 | If the constructor used was the "add to the front" `:` operator,
581 | then things get more interesting.
582 |
583 | * Whatever was the first parameter of the `:` constructor is bound
584 | to the name `first`.
585 |
586 | * The second parameter of the `:` constructor (i.e. everything in the
587 | list after the first element) is bound to the name `rest`.
588 |
589 | * The expression following the `->` is evaluated with these values.
590 |
591 |
592 | # Pattern matching
593 |
594 | The `case` expression performs what we call *pattern matching*.
595 |
596 | * Patterns are checked from top to bottom.
597 |
598 | * As soon as a match is found, its right hand side (after the `->`) is
599 | used as the result of the entire `case` expression.
600 |
601 | * If no match succeeds, an exception is thrown.
602 |
603 |
604 | # A worked example
605 |
606 | Let's step through the machinery of what happens if we evaluate this
607 | expression.
608 |
609 | ~~~~ {.haskell}
610 | isCapitalized "Ann"
611 | ~~~~
612 |
613 |
614 | # Whew! A few exercises!
615 |
616 | Finally! We can write slightly more complex functions.
617 |
618 | Now that you can inspect the front of a list, you should be able to
619 | process an *entire* list recursively.
620 |
621 | First, please write a function named `myLength` that computes the
622 | number of elements in a list.
623 |
624 | Next, write a function named `countCaps` that calculates the number of
625 | capital letters in a string.
626 |
627 | ~~~~ {.haskell}
628 | countCaps "Monkey Butter" == 2
629 | ~~~~
630 |
631 |
632 | # Counting capital letters
633 |
634 | Wow, that `countCaps` function was a pain, right?
635 |
636 | Here's my definition that uses only the machinery we've learned so
637 | far:
638 |
639 | ~~~~ {.haskell}
640 | countCaps string =
641 | case string of
642 | [] -> 0
643 | (x:xs) -> if isUpper x
644 | then 1 + countCaps xs
645 | else countCaps xs
646 | ~~~~
647 |
648 |
649 | # Huh.
650 |
651 | I thought Haskell was all about concision!?
652 |
653 |
654 | # Conciseness 1: top-level pattern matching
655 |
656 | ~~~~ {.haskell}
657 | countCaps [] = 0
658 | countCaps (x:xs) =
659 | if isUpper x
660 | then 1 + countCaps xs
661 | else countCaps xs
662 | ~~~~
663 |
664 | We can define a function as a series of equations, each containing a
665 | pattern match.
666 |
667 | This is nice syntactic sugar for `case`.
668 |
669 |
670 | # Conciseness 2: guards
671 |
672 | ~~~~ {.haskell}
673 | countCaps [] = 0
674 | countCaps (x:xs)
675 | | isUpper x = 1 + countCaps xs
676 | | otherwise = countCaps xs
677 | ~~~~
678 |
679 | After each `|` is a *guard*.
680 |
681 | * If a pattern matches, we evaluate each Boolean guard expression from
682 | top to bottom.
683 |
684 | * When one succeeds, we evaluate the RHS as the body of the function.
685 |
686 | (Yes, patterns in a `case` can have guards too.)
687 |
688 |
689 | # Before
690 |
691 | Like the original version, but with use of `case` stripped out:
692 |
693 | ~~~~ {.haskell}
694 | countCaps xs =
695 | if null xs
696 | then 0
697 | else if isUpper (head xs)
698 | then 1 + countCaps (tail xs)
699 | else countCaps (tail xs)
700 | ~~~~
701 |
702 | # After
703 |
704 | Both shorter and easier to follow:
705 |
706 | ~~~~ {.haskell}
707 | countCaps [] = 0
708 | countCaps (x:xs)
709 | | isUpper x = 1 + countCaps xs
710 | | otherwise = countCaps xs
711 | ~~~~
712 |
713 |
714 | # Another approach
715 |
716 | Write a new version of `countCaps` as follows:
717 |
718 | * Write a function that goes through a list, and which generates a new
719 | list that contains only its capital letters.
720 |
721 | * Use `length` to count the number of elements.
722 |
723 | This should give the same result as your first function. Right?
724 |
725 |
726 | # A change of specification
727 |
728 | Suppose we want to count the number of lowercase letters in a string.
729 |
730 | This seems almost the same as our function for counting uppercase
731 | letters.
732 |
733 | What can we do with this observation?
734 |
735 |
736 | # Higher order functions
737 |
738 | *Higher order function*: a function that accepts another function as a
739 | parameter.
740 |
741 | ~~~~ {.haskell}
742 | filter pred [] = []
743 | filter pred (x:xs)
744 | | pred x = x : filter pred xs
745 | | otherwise = filter pred xs
746 | ~~~~
747 |
748 | How can we use this to define `countLowerCase`?
749 |
750 |
751 | # Data in, data out
752 |
753 | By now, we've seen several definitions like this:
754 |
755 | ~~~~ {.haskell}
756 | countLowerCase string =
757 | length (filter isLower string)
758 | ~~~~
759 |
760 | This is a recurring pattern:
761 |
762 | * A function of one argument
763 |
764 | * It's being fed the result of ...
765 |
766 | * ... another function of one argument
767 |
768 |
769 | # Function composition
770 |
771 | Haskell doesn't limit us to giving functions alphanumeric names.
772 |
773 | Here, we define a function named simply "`.`", which we can use as an
774 | operator:
775 |
776 | ~~~~ {.haskell}
777 | (f . g) x = f (g x)
778 | ~~~~
779 |
780 | How to use this?
781 |
782 | ~~~~ {.haskell}
783 | countLowerCase = length . filter isLower
784 | ~~~~
785 |
786 |
787 | # Understanding composition
788 |
789 | If that seemed hard to follow, let's make it clearer.
790 |
791 | We'll plug the arguments into the RHS of our function definition:
792 |
793 | ~~~~ {.haskell}
794 | (f . g) x = f (g x)
795 | ~~~~
796 |
797 | We had `length` as the first argument to "`.`", and `filter isLower`
798 | as the second:
799 |
800 | ~~~~ {.haskell}
801 | (length . filter isLower) x
802 | = length (filter isLower x)
803 | ~~~~
804 |
805 |
806 | # Local variables
807 |
808 | Inside an expression, we can introduce new variables using `let`.
809 |
810 | ~~~~ {.haskell}
811 | let x = 2
812 | y = 4
813 | in x + y
814 | ~~~~
815 |
816 | * Local definitions come after the `let`.
817 |
818 | * The expression where we use them comes after the `in`.
819 |
820 |
821 | # White space
822 |
823 | Haskell is sensitive to white space!
824 |
825 | * A top-level definition starts in the leftmost column.
826 |
827 | * After the beginning of a definition, if the next non-empty line is
828 | indented further, it is treated as a continuation of that
829 | definition.
830 |
831 | * Never use tabs in your source files.
832 |
833 |
834 | # White space and local variables
835 |
836 | If you're defining local variables, they must start in the same
837 | column.
838 |
839 | This is good:
840 |
841 | ~~~~ {.haskell}
842 | let x = 2
843 | y = 4
844 | in x + y
845 | ~~~~
846 |
847 | But this will lead to a compiler error:
848 |
849 | ~~~~ {.haskell}
850 | let x = 2
851 | y = 4
852 | in x + y
853 | ~~~~
854 |
855 |
856 | # Composition exercise
857 |
858 | Using function composition wherever you can, write a function that
859 | accepts a string and returns a new string containing only the words
860 | that begin with vowels.
861 |
862 | * You'll want to play with the `words` and `unwords` functions before
863 | you start.
864 |
865 | Example:
866 |
867 | ~~~~ {.haskell}
868 | disemvowel "I think, therefore I am."
869 | == "I I am."
870 | ~~~~
871 |
872 |
873 | # My solution
874 |
875 | Here's how I wrote `disemvowel`:
876 |
877 | ~~~~ {.haskell}
878 | disemvowel =
879 | let isVowel c = toLower c `elem` "aeiou"
880 | in unwords . filter (isVowel . head) . words
881 | ~~~~
882 |
883 | Does this remind you of a Unix shell pipeline, only right-to-left?
884 |
885 |
886 | # Problem definition, once again
887 |
888 | Given a web site, we want to scrape it and find important web pages.
889 |
890 | We're now Haskell experts, right?
891 |
892 | * Download one web page
893 |
894 |
895 | # Let's download a web page!
896 |
897 | We'd really like to rely on a library to download a web page for
898 | us.
899 |
900 | At times like this, there's a very handy central repository of open
901 | source Haskell software:
902 |
903 | * [http://hackage.haskell.org](http://hackage.haskell.org/)
904 |
905 | * (Everyone just calls it "Hackage")
906 |
907 | Go there now!
908 |
909 | Click on the
910 | [Packages](http://hackage.haskell.org/packages/archive/pkg-list.html)
911 | link at the top of the page to browse packages.
912 |
913 | Alas, the list is overwhelmingly long, but we can find libraries for
914 | all kinds of tasks if we're patient.
915 |
916 | Are we patient?
917 |
918 |
919 | # Ugh!
920 |
921 | Scrolling through thousands of libraries is hard - surely there's a
922 | better way?
923 |
924 | Enter the `cabal` command!
925 |
926 | Run this command in a terminal window:
927 |
928 | ~~~~
929 | cabal update
930 | ~~~~
931 |
932 | This downloads the latest index of all software on Hackage.
933 |
934 | With the index updated, we can search it:
935 |
936 | ~~~~
937 | cabal list http
938 | ~~~~
939 |
940 | That still gives us 20+ packages to comb through, but at least it's
941 | better than the 3,400 on the Packages web page.
942 |
943 |
944 | # Short-cutting the search
945 |
946 | The best HTTP client library is named `http-enumerator`.
947 |
948 | We can read about it online:
949 |
950 | * [hackage.haskell.org/package/http-enumerator](http://hackage.haskell.org/package/http-enumerator)
951 |
952 | That landing page for a package is intimidating, but look towards the
953 | bottom, at the section labeled "Modules".
954 |
955 | What do you see?
956 |
957 |
958 | # Installing a package
959 |
960 | Before we can use `http-enumerator`, we must install it.
961 |
962 | To install the `http-enumerator` package, we just issue a single
963 | command:
964 |
965 | ~~~~
966 | cabal install http-enumerator
967 | ~~~~
968 |
969 | This command figures out all the other libraries that
970 | `http-enumerator` depends on, and downloads, compiles, and installs
971 | the whole lot.
972 |
973 | Expect it to take a few minutes and print a lot of output.
974 |
975 |
976 | # Reading docs: packages and modules
977 |
978 | While we're waiting for the `http-enumerator` package and all of its
979 | dependencies to install, let's try to figure out how we should use it.
980 |
981 | Remember the link to API documentation at the end of the package's web
982 | page? Click through to the API docs.
983 |
984 | An API page begins with a title that looks something like this:
985 |
986 | ~~~~
987 | Network.HTTP.Enumerator
988 | ~~~~
989 |
990 | This is the name of a *module*.
991 |
992 | A module is a collection of related code.
993 |
994 | A *package* is a collection of related modules.
995 |
996 | (This will sound familiar if you know Python.)
997 |
998 |
999 | # Reading docs: the rest
1000 |
1001 | After the initial blurb, a module's docs consist of type signatures
1002 | and descriptions.
1003 |
1004 | Here is a really simple type signature:
1005 |
1006 | ~~~~
1007 | foo :: String
1008 | ~~~~
1009 |
1010 | How the heck do we read this?
1011 |
1012 | The *name* of the thing being defined comes before the `::`
1013 | characters.
1014 |
1015 | Its *type* follows after the `::`.
1016 |
1017 | This means "the value named `foo` has the type `String`".
1018 |
1019 |
1020 | # Haskell's type system
1021 |
1022 | Up until now, we have not bothered talking about types or type
1023 | signatures.
1024 |
1025 | Every expression and value in Haskell has a single type.
1026 |
1027 | Those types can almost always be *inferred* automatically by the
1028 | compiler or interpreter.
1029 |
1030 |
1031 | # The most common basic types
1032 |
1033 | * `Bool`
1034 |
1035 | * `Int`
1036 |
1037 | * `Char`
1038 |
1039 | * `Double`
1040 |
1041 |
1042 | # A function signature
1043 |
1044 | Here's another type signature:
1045 |
1046 | ~~~~
1047 | words :: String -> [String]
1048 | ~~~~
1049 |
1050 | Here we see a new symbol, `->`, which means "this is a function".
1051 |
1052 | The type after the last `->` is the return type of the function.
1053 |
1054 | All of its predecessors are argument types.
1055 |
1056 | So this is a function that takes one `String` argument, and
1057 | returns... what?
1058 |
1059 |
1060 | # List notation
1061 |
1062 | The notation `[a]` means "a list of values, all of some type `a`".
1063 |
1064 | So `[String]` means "a list of values, all of type `String`".
1065 |
1066 |
1067 | # Type synonyms
1068 |
1069 | What's a `String`?
1070 |
1071 | * It's not special, just a *synonym* for `[Char]`, i.e. "a list of
1072 | `Char`".
1073 |
1074 | We can introduce new synonyms of our own.
1075 |
1076 | ~~~~ {.haskell}
1077 | type Dollars = Int
1078 | ~~~~
1079 |
1080 | A type synonym can be handy for documenting an intended use for an
1081 | existing type.
1082 |
1083 |
1084 | # Words
1085 |
1086 | ~~~~
1087 | words :: String -> [String]
1088 | ~~~~
1089 |
1090 | We can now read that this function accepts a string as argument, and
1091 | returns a list of strings.
1092 |
1093 | From reading its name and type signature, can you guess what `words`
1094 | might do?
1095 |
1096 |
1097 | # Another signature
1098 |
1099 | Tell me about this signature:
1100 |
1101 | ~~~~
1102 | mystery :: [String] -> String
1103 | ~~~~
1104 |
1105 | What are some reasonable possible behaviours for this function?
1106 |
1107 |
1108 | # Reading real-world docs
1109 |
1110 | Here is the very first signature from `http-enumerator`:
1111 |
1112 | ~~~~
1113 | simpleHttp
1114 | :: (MonadIO m, Failure HttpException m) =>
1115 | String -> m ByteString
1116 | ~~~~
1117 |
1118 | This is more complex! How the heck do we read it?
1119 |
1120 | The bits between `::` and `=>` are *constraints* on where we can use
1121 | `simpleHttp` - but let's ignore constraints for now.
1122 |
1123 | * *Important*: it's often safe to gloss over things we don't (yet)
1124 | understand.
1125 |
1126 | We'll also ignore that mysterious lowercase `m` for a bit.
1127 |
1128 | What can we tell about this function?
1129 |
1130 |
1131 | # ByteString
1132 |
1133 | A `ByteString` is a blob of binary data.
1134 |
1135 | Unlike `String`, it is not represented as a list, but as a packed
1136 | array.
1137 |
1138 | However, it contains binary *bytes*, not text!
1139 |
1140 | * Don't use `ByteString` for working with data that you have to
1141 | manipulate as text.
1142 |
1143 |
1144 | # Let's play in ghci!
1145 |
1146 | Does everyone have `http-enumerator` installed now?
1147 |
1148 | Fire up `ghci`, and let's play with the module:
1149 |
1150 | ~~~~
1151 | import Network.HTTP.Enumerator
1152 | ~~~~
1153 |
1154 | Notice that after we type this, the prompt changes:
1155 |
1156 | ~~~~
1157 | Prelude Network.HTTP.Enumerator>
1158 | ~~~~
1159 |
1160 | This tells us that the module has loaded and is available.
1161 |
1162 |
1163 | # Wait! Are you on Windows?
1164 |
1165 | On Windows, we have to set up Winsock before any networking will work.
1166 |
1167 | First, let's load the lowest-level networking module:
1168 |
1169 | ~~~~
1170 | import Network.Socket
1171 | ~~~~
1172 |
1173 | And here's how we initialize Winsock:
1174 |
1175 | ~~~~
1176 | withSocketsDo (return ())
1177 | ~~~~
1178 |
1179 | (It's harmless to do this on Unix.)
1180 |
1181 |
1182 | # With that out of the way ...
1183 |
1184 | Finally - let's load a web page!
1185 |
1186 | ~~~~
1187 | simpleHttp "http://example.com/"
1188 | ~~~~
1189 |
1190 | Did that just print a ton of HTML in the terminal window? All right!
1191 |
1192 |
1193 | # From binary to text
1194 |
1195 | Now we have a `ByteString`, which we need to turn into text for
1196 | manipulating.
1197 |
1198 | Let's cheat, and assume that all web pages are encoded in UTF-8.
1199 |
1200 |
1201 | # Pure code
1202 |
1203 | So far, all of the code we have written has been "pure".
1204 |
1205 | * The behaviour of all of our functions has depended only on their
1206 | inputs.
1207 |
1208 | * All of our data is immutable.
1209 |
1210 | * There's thus no way to change a global variable and modify the
1211 | behaviour of a function.
1212 |
1213 |
1214 | # Impure code
1215 |
1216 | And yet ... somehow we downloaded a web page!
1217 |
1218 | * Web pages clearly are *not* pure.
1219 |
1220 | So can we write code like this?
1221 |
1222 | ~~~~ {.haskell}
1223 | length (simpleHttp "http://x.org/")
1224 | ~~~~
1225 |
1226 | NO.
1227 |
1228 | The type system segregates code that must be pure from code that may
1229 | have side effects ("impure" code).
1230 |
1231 |
1232 | # Are we stuck?
1233 |
1234 | Well, let's look at a simpler example than `simpleHttp`.
1235 |
1236 | Type this in `ghci`:
1237 |
1238 | ~~~~
1239 | :type readFile
1240 | ~~~~
1241 |
1242 | This will tell us what the type of `readFile` is.
1243 |
1244 |
1245 | # IO
1246 |
1247 | The `:type` directive should print something like this:
1248 |
1249 | ~~~~ {.haskell}
1250 | readFile :: FilePath -> IO String
1251 | ~~~~
1252 |
1253 | Notice that `IO` on the result type?
1254 |
1255 | It means "this function may have side effects".
1256 |
1257 | We often refer to impure functions, with `IO` in the result type, as
1258 | *actions*.
1259 |
1260 | * This helps to distinguish them from pure functions.
1261 |
1262 |
1263 | # Mixing IO and other stuff
1264 |
1265 | The type system keeps track of which functions have `IO` in their
1266 | types, and keeps us honest.
1267 |
1268 | We can still mix pure and impure code in a natural way:
1269 |
1270 | ~~~~ {.haskell}
1271 | charCount fileName = do
1272 | contents <- readFile fileName
1273 | return (length contents)
1274 | ~~~~
1275 |
1276 |
1277 | # "do" notation
1278 |
1279 | Critical to what we just saw was the `do` keyword at the beginning of
1280 | the function definition.
1281 |
1282 | This introduces a series of `IO` actions, one per line.
1283 |
1284 |
1285 | # Capturing the results of impure code
1286 |
1287 |
1288 | To capture the result of an `IO` action, we use `<-` instead of `=`.
1289 |
1290 | ~~~~ {.haskell}
1291 | contents <- readFile fileName
1292 | ~~~~
1293 |
1294 | The result (`contents`) is pure - it *does not have* the `IO` type.
1295 |
1296 | This is how we supply pure code with data returned from impure code.
1297 |
1298 |
1299 | # The "return" action
1300 |
1301 | This is *not* the `return` statement you're used to from other languages!
1302 |
1303 | It takes a *pure* value (without `IO` in its type), and *wraps* it
1304 | with the `IO` type.
1305 |
1306 | Pure code can't call impure code, but it can thread data back into the
1307 | impure world using `return`.
1308 |
1309 |
1310 | # Haskell programs and IO
1311 |
1312 | When you write a Haskell program, its entry point must be named
1313 | `main`.
1314 |
1315 | The type of `main` must be:
1316 |
1317 | ~~~~ {.haskell}
1318 | main :: IO ()
1319 | ~~~~
1320 |
1321 | `()` is named "unit", and means more or less the same thing as `void`
1322 | in C or Java.
1323 |
1324 | What this means is that *all* Haskell programs are impure!
1325 |
1326 |
1327 | # Binary to text
1328 |
1329 | Remember we were planning to cheat earlier?
1330 |
1331 | We had this:
1332 |
1333 | ~~~~ {.haskell}
1334 | simpleHttp :: String -> IO ByteString
1335 | ~~~~
1336 |
1337 | We need something whose result is an `IO String` instead.
1338 |
1339 | How should that look?
1340 |
1341 |
1342 | # UTF-8 conversion
1343 |
1344 | To do the conversion, let's grab a package named `utf8-string`.
1345 |
1346 | ~~~~
1347 | cabal install utf8-string
1348 | ~~~~
1349 |
1350 | That contains a module named `Data.ByteString.Lazy.UTF8`.
1351 |
1352 | ~~~~ {.haskell}
1353 | import Data.ByteString.Lazy.UTF8
1354 | ~~~~
1355 |
1356 | It defines a function named `toString`:
1357 |
1358 | ~~~~ {.haskell}
1359 | toString :: ByteString -> String
1360 | ~~~~
1361 |
1362 |
1363 | # UTF-8 conversion exercise
1364 |
1365 | Write an action that downloads a URL and converts it from a
1366 | `ByteString` to a `String` using `toString`.
1367 |
1368 | Write a type signature for the action.
1369 |
1370 | * Haskell definitions usually don't require type signatures.
1371 |
1372 | * Nevertheless, we write them for *documentation* on almost all
1373 | top-level definitions.
1374 |
1375 |
1376 | # Downloading and saving a web page
1377 |
1378 | Use your `download` function (the action you just wrote) to save a
1379 | local copy of a web page.
1380 |
1381 | ~~~~ {.haskell}
1382 | saveAs :: String -> Int -> IO ()
1383 | ~~~~
1384 |
1385 | For simplicity, let's save the local files as names containing
1386 | numbers:
1387 |
1388 | ~~~~ {.haskell}
1389 | makeFileName :: Int -> FilePath
1390 | makeFileName k = "download-" ++ show k ++ ".html"
1391 | ~~~~
1392 |
1393 | To save a local copy of a file, you'll need the `writeFile` action.
1394 |
1395 |
1396 | # Shoveling through HTML
1397 |
1398 | Two truisms:
1399 |
1400 | * Most HTML in the wild is a mess.
1401 |
1402 | * Even parsing well formed HTML is complicated.
1403 |
1404 | So! Let's use another library.
1405 |
1406 | ~~~~
1407 | cabal install tagsoup
1408 | ~~~~
1409 |
1410 | The `tagsoup` package can parse arbitrarily messy HTML.
1411 |
1412 | It will feed us a list of events, like a SAX parser.
1413 |
1414 |
1415 | # Dealing with problems
1416 |
1417 | Try this:
1418 |
1419 | ~~~~ {.haskell}
1420 | head [1]
1421 | ~~~~
1422 |
1423 | Now try this:
1424 |
1425 | ~~~~ {.haskell}
1426 | head []
1427 | ~~~~
1428 |
1429 |
1430 | # Oops
1431 |
1432 | If we pass an empty list, the `head` function throws an exception.
1433 |
1434 | Suppose we need a version of `head` that will *not* throw an
1435 | exception.
1436 |
1437 | ~~~~ {.haskell}
1438 | safeHead :: [a] -> ????
1439 | ~~~~
1440 |
1441 | What should the `????` be?
1442 |
1443 | Let's invent something.
1444 |
1445 | ~~~~ {.haskell}
1446 | safeHead (x:xs) = Some x
1447 | safeHead [] = None
1448 | ~~~~
1449 |
1450 |
1451 | # Some? None?
1452 |
1453 | * We're using a constructor named `Some` to capture the idea "we have
1454 | a result".
1455 |
1456 | * The constructor `None` indicates "we don't have a result here".
1457 |
1458 | To bring these constructors into existence, we need to declare a new
1459 | type.
1460 |
1461 | ~~~~ {.haskell}
1462 | data Perhaps a = Some a
1463 | | None
1464 | ~~~~
1465 |
1466 | The `|` character separates the constructors. We can read it as:
1467 |
1468 | * The `Perhaps` type has two constructors:
1469 |
1470 | * `Some` followed by a single argument
1471 |
1472 | * or `None` with no arguments
1473 |
1474 |
1475 | # Maybe
1476 |
1477 | Actually, Haskell already has a `Perhaps` type.
1478 |
1479 | ~~~~ {.haskell}
1480 | data Maybe a = Just a
1481 | | Nothing
1482 | ~~~~
1483 |
1484 | The `a` is a *type parameter*, meaning that when we write this type,
1485 | we have to supply another type as a parameter:
1486 |
1487 | * `Maybe Int`
1488 |
1489 | * `Maybe String`
1490 |
1491 |
1492 | # Using constructors
1493 |
1494 | If we want to construct a `Maybe Int` using the `Just` constructor, we
1495 | must pass it an `Int`.
1496 |
1497 | ~~~~ {.haskell}
1498 | Just 1 :: Maybe Int
1499 | Nothing :: Maybe Int
1500 | ~~~~
1501 |
1502 | This will not work, because the types don't match:
1503 |
1504 | ~~~~ {.haskell}
1505 | Just [1] :: Maybe String
1506 | ~~~~
1507 |
1508 |
1509 | # Pattern matching over constructors
1510 |
1511 | We can pattern match over the constructors for `Maybe` just as we did
1512 | for lists.
1513 |
1514 | ~~~~ {.haskell}
1515 | case foo of
1516 | Just x -> x
1517 | Nothing -> bar
1518 | ~~~~
1519 |
1520 |
1521 | # Tags
1522 |
1523 | The `tagsoup` package defines the following type:
1524 |
1525 | ~~~~ {.haskell}
1526 | data Tag = TagOpen String [Attribute]
1527 | | TagClose String
1528 | | TagText String
1529 | | TagComment String
1530 | | TagWarning String
1531 | | TagPosition Row Column
1532 | ~~~~
1533 |
1534 | What do you think the constructors mean?
1535 |
1536 |
1537 | # Pattern matching on a Tag
1538 |
1539 | Suppose we want to write a predicate that will tell us if a `Tag` is
1540 | an opening tag.
1541 |
1542 | * What should the type of this function be?
1543 |
1544 | * What should its body look like?
1545 |
1546 |
1547 | # Don't care!
1548 |
1549 | Our first body looked like this:
1550 |
1551 | ~~~~ {.haskell}
1552 | isOpenTag (TagOpen x y) = True
1553 | isOpenTag (TagClose x) = False
1554 | isOpenTag (TagText x) = False
1555 | isOpenTag (TagComment x) = False
1556 | isOpenTag (TagWarning x) = False
1557 | isOpenTag (TagPosition x y) = False
1558 | ~~~~
1559 |
1560 | Concise, but ugly.
1561 |
1562 | * We really only care about one constructor.
1563 |
1564 | * We never use the variables `x` or `y` that we declare.
1565 |
1566 |
1567 | # The wild card pattern
1568 |
1569 | We can write "I don't care what this pattern or variable is" using the
1570 | "`_`" character.
1571 |
1572 | ~~~~ {.haskell}
1573 | isOpenTag (TagOpen _ _) = True
1574 | isOpenTag _ = False
1575 | ~~~~
1576 |
1577 | The wild card pattern always matches.
1578 |
1579 | * Since we don't care about `x` or `y`, we can state that explicitly
1580 | using `_`.
1581 |
1582 | * Since we don't care about any constructor except `TagOpen`, we can
1583 | match all the others using `_`.
1584 |
1585 |
1586 | # Just a quick question
1587 |
1588 | Why don't we write the function like this?
1589 |
1590 | ~~~~ {.haskell}
1591 | isOpenTag _ = False
1592 | isOpenTag (TagOpen _ _) = True
1593 | ~~~~
1594 |
1595 |
1596 | # Extracting links from a web page
1597 |
1598 | Suppose we have a page in memory already.
1599 |
1600 | * Browse the `tagsoup` docs, in the `Text.HTML.TagSoup` module.
1601 |
1602 | * Find a function that will parse a web page into a series of tags.
1603 |
1604 |
1605 | # Let's use it!
1606 |
1607 | ~~~~ {.haskell}
1608 | processPage url = do
1609 | page <- download url
1610 | return (parseTags page)
1611 | ~~~~
1612 |
1613 |
1614 | # Tidying tags up
1615 |
1616 | Parsed tags can contain a mixture of tag names.
1617 |
1618 | ~~~~
1619 | <A HREF="http://example.com/">
1620 | ~~~~
1621 |
1622 | ~~~~
1623 | <a hrEF="http://example.com/">
1624 | ~~~~
1625 |
1626 | * Find a `tagsoup` function that will turn tag names and attributes to
1627 | lower case.
1628 |
1629 |
1630 | # Canonical tags
1631 |
1632 | Let's use our function to clean up the result of `parseTags`.
1633 |
1634 | ~~~~ {.haskell}
1635 | processPage url = do
1636 | page <- download url
1637 | return
1638 | (canonicalizeTags
1639 | (parseTags page))
1640 | ~~~~
1641 |
1642 |
1643 | # Extracting links
1644 |
1645 | We only care about open tags that are links, so `<a>` tags.
1646 |
1647 | * How would we write the type of a function that will indicate whether
1648 | a `Tag` is an open tag with the correct name?
1649 |
1650 | * How would we use this function to extract only the open tags from a
1651 | list of parsed tags?
1652 |
1653 |
1654 | # Whee!
1655 |
1656 | This cascade is getting a bit ridiculous.
1657 |
1658 | ~~~~ {.haskell}
1659 | processPage url = do
1660 | page <- download url
1661 | return
1662 | (filter (isTagOpenName "a")
1663 | (canonicalizeTags
1664 | (parseTags page)))
1665 | ~~~~
1666 |
1667 | Two observations:
1668 |
1669 | * Our action is now mostly pure code.
1670 |
1671 | * It sure looks like a pipeline.
1672 |
1673 |
1674 | # A rewriting exercise
1675 |
1676 | Take this function and split it into pure and impure parts.
1677 |
1678 | Write the pure part using function composition.
1679 |
1680 | ~~~~ {.haskell}
1681 | processPage url = do
1682 | page <- download url
1683 | return
1684 | (filter (isTagOpenName "a")
1685 | (canonicalizeTags
1686 | (parseTags page)))
1687 | ~~~~
1688 |
1689 |
1690 | # My solution
1691 |
1692 | ~~~~ {.haskell}
1693 | processPage url = do
1694 | page <- download url
1695 | return (process page)
1696 |
1697 | process =
1698 | filter (isTagOpenName "a") .
1699 | canonicalizeTags .
1700 | parseTags
1701 | ~~~~
1702 |
1703 |
1704 | # More stuff to filter out
1705 |
1706 | Let's skip `nofollow` links.
1707 |
1708 | We want to get the `"rel"` attribute of a tag.
1709 |
1710 | * Find a function that extracts an attribute from a tag.
1711 |
1712 |
1713 | # No following
1714 |
1715 | ~~~~ {.haskell}
1716 | nofollow tag = fromAttrib "rel" tag == "nofollow"
1717 | ~~~~
1718 |
1719 | ~~~ {.haskell}
1720 | process =
1721 | filter (not . nofollow) .
1722 | filter (isTagOpenName "a") .
1723 | canonicalizeTags .
1724 | parseTags
1725 | ~~~~
1726 |
1727 |
1728 | # We have a list of \<a\> tags
1729 |
1730 | How would we extract the `"href"` attribute from every element of the
1731 | list?
1732 |
1733 |
1734 | # Only non-empty \<a href\> tags
1735 |
1736 | ~~~~ {.haskell}
1737 | process =
1738 | filter (not . null) .
1739 | map (fromAttrib "href") .
1740 | filter (not . nofollow) .
1741 | filter (isTagOpenName "a") .
1742 | canonicalizeTags .
1743 | parseTags
1744 | ~~~~
1745 |
1746 |
1747 | # Canonical URLs
1748 |
1749 | Links can be absolute, relative, or invalid garbage, and we only want
1750 | valid-looking absolute links.
1751 |
1752 | To properly create an absolute link, we need to know the absolute URL
1753 | of the page we're looking at.
1754 |
1755 | ~~~~ {.haskell}
1756 | canonicalizeLink :: String -> String -> Maybe String
1757 | ~~~~
1758 |
1759 |
1760 | # Working with URIs
1761 |
1762 | The `Network.URI` module contains some functions we might find handy.
1763 |
1764 | ~~~~ {.haskell}
1765 | parseURI :: String -> Maybe URI
1766 | parseURIReference :: String -> Maybe URI
1767 | flip (uriToString id) "" :: URI -> String
1768 | nonStrictRelativeTo :: URI -> URI -> Maybe URI
1769 | ~~~~
1770 |
1771 |
1772 | # A monster of indentation
1773 |
1774 | This is really hard to read!
1775 |
1776 | ~~~~ {.haskell}
1777 | import Network.URI
1778 |
1779 | canon :: String -> String -> Maybe String
1780 | canon referer path =
1781 | case parseURI referer of
1782 | Nothing -> Nothing
1783 | Just r ->
1784 | case parseURIReference path of
1785 | Nothing -> Nothing
1786 | Just p ->
1787 | case nonStrictRelativeTo p r of
1788 | Nothing -> Nothing
1789 | Just u ->
1790 | Just (uriToString id u "")
1791 | ~~~~
1792 |
1793 | Surely there's a better way.
1794 |
1795 |
1796 | # Stair stepping
1797 |
1798 | Notice that that function was a series of `case` inspections of
1799 | `Maybe` values?
1800 |
1801 | Suppose we had a function that accepted a normal value, and returned a
1802 | `Maybe` value.
1803 |
1804 | ~~~~ {.haskell}
1805 | a -> Maybe b
1806 | ~~~~
1807 |
1808 | And suppose we had a concise syntax for writing an anonymous function.
1809 |
1810 | ~~~~ {.haskell}
1811 | \a -> "hi mom! " ++ a
1812 | ~~~~
1813 |
1814 | The `\` is pronounced "lambda".
1815 |
1816 |
1817 | # Observation
1818 |
1819 | The `case` analysis is quite verbose. Suppose we had a function that
1820 | performed it, and called another function if our value was `Just`.
1821 |
1822 | ~~~~ {.haskell}
1823 | bind :: Maybe a -> (a -> Maybe b) -> Maybe b
1824 | bind Nothing _ = Nothing
1825 | bind (Just value) action = action value
1826 | ~~~~
1827 |
1828 |
1829 | # Using bind
1830 |
1831 | How could we use this?
1832 |
1833 | ~~~~ {.haskell}
1834 | canon1 referer path =
1835 | parseURI referer `bind`
1836 | \r -> parseURIReference path `bind`
1837 | \p -> nonStrictRelativeTo p r `bind`
1838 | \u -> Just (uriToString id u "")
1839 | ~~~~
1840 |
1841 | If we enclose a function name in backticks, we can use the function as
1842 | an infix operator.
1843 |
1844 |
1845 | # Reformatting the code
1846 |
1847 | ~~~~ {.haskell}
1848 | canon referer path =
1849 | parseURI referer `bind` \r ->
1850 | parseURIReference path `bind` \p ->
1851 | nonStrictRelativeTo p r `bind` \u ->
1852 | Just (uriToString id u "")
1853 | ~~~~
1854 |
1855 |
1856 | # A built-in name for bind
1857 |
1858 | The `>>=` operator is a more general version of our `bind` function.
1859 |
1860 | ~~~~ {.haskell}
1861 | canon referer path =
1862 | parseURI referer >>= \r ->
1863 | parseURIReference path >>= \p ->
1864 | nonStrictRelativeTo p r >>= \u ->
1865 | Just (uriToString id u "")
1866 | ~~~~
1867 |
1868 |
1869 | # Using syntactic sugar
1870 |
1871 | Here's some tidier syntax that should look familiar.
1872 |
1873 | ~~~~ {.haskell}
1874 | canonicalize :: String -> String -> Maybe String
1875 |
1876 | canonicalize referer path = do
1877 | r <- parseURI referer
1878 | p <- parseURIReference path
1879 | u <- nonStrictRelativeTo p r
1880 | return (uriToString id u "")
1881 | ~~~~
1882 |
1883 |
1884 | # Nearly there
1885 |
1886 | ~~~~ {.haskell}
1887 | process url =
1888 | map (canonicalize url) .
1889 | filter (not . null) .
1890 | map (fromAttrib "href") .
1891 | filter (\t -> fromAttrib "rel" t /= "nofollow") .
1892 | filter (isTagOpenName "a") .
1893 | canonicalizeTags .
1894 | parseTags
1895 | ~~~~
1896 |
1897 | One awkward thing: what is the type of this function?
1898 |
1899 |
1900 | # From [Maybe a] to [a]
1901 |
1902 | Go to this web site:
1903 |
1904 | * [haskell.org/hoogle](http://haskell.org/hoogle)
1905 |
1906 | Type this into the search box:
1907 |
1908 | ~~~~ {.haskell}
1909 | [Maybe a] -> [a]
1910 | ~~~~
1911 |
1912 | What does the first result say?
1913 |
1914 |
1915 | # We're there!
1916 |
1917 | ~~~~ {.haskell}
1918 | import Data.Maybe
1919 | import Network.URI
1920 |
1921 | links url =
1922 | catMaybes .
1923 | map (canonicalize url) .
1924 | filter (not . null) .
1925 | map (fromAttrib "href") .
1926 | filter (\t -> fromAttrib "rel" t /= "nofollow") .
1927 | filter (isTagOpenName "a") .
1928 | canonicalizeTags .
1929 | parseTags
1930 | ~~~~
1931 |
1932 |
1933 | # From links to spidering
1934 |
1935 | If we can download the links from one page, we can easily write a
1936 | spider to follow those links.
1937 |
1938 | To keep things simple, let's set a limit on the number of pages we'll
1939 | download.
1940 |
1941 | What information do we want to generate?
1942 |
1943 | What do we need to track along the way?
1944 |
1945 |
1946 | # What we need to track
1947 |
1948 | Here's the state we need to maintain:
1949 |
1950 | * The number of pages we have downloaded
1951 |
1952 | * A collection of pages we have seen links to, but haven't downloaded
1953 |
1954 | * A collection of pages and their outbound links
1955 |
1956 |
1957 | # Tracking what we've seen
1958 |
1959 | For any given page, we need to remember both it and all the pages it
1960 | links to.
1961 |
1962 | One possibility for associating the two is a *tuple*:
1963 |
1964 | ~~~~ {.haskell}
1965 | ("http://x.org/", ["http://microsoft.com/"])
1966 | ~~~~
1967 |
1968 | Tuples are useful any time we want mixed-type data without the hassle
1969 | of creating a new type.
1970 |
1971 | Speaking of a new type, here's how we'd define one:
1972 |
1973 | ~~~~ {.haskell}
1974 | data Link = Link String [String]
1975 |
1976 | -- Let's define some accessors, too.
1977 | linkFrom (Link url _) = url
1978 | linkTo (Link _ links) = links
1979 | ~~~~
1980 |
1981 |
1982 | # Avoiding duplication
1983 |
1984 | We don't want to visit any URL twice.
1985 |
1986 | How do we avoid this?
1987 |
1988 | ~~~~ {.haskell}
1989 | visited url = elem url . map linkFrom
1990 | ~~~~
1991 |
1992 | This function has a problem - what is that problem?
1993 |
1994 |
1995 | # Better performance
1996 |
1997 | We really want a structure with a fast lookup operation.
1998 |
1999 | What would you use in your language?
2000 |
2001 |
2002 | # Maps and importing
2003 |
2004 | In Haskell, we have mutable hash tables, but we don't use them.
2005 |
2006 | Instead, we use *immutable* key-value maps.
2007 |
2008 | We must perform fancy module importing tricks because the `Data.Map`
2009 | module defines a lot of names that would otherwise overlap with
2010 | built-in names.
2011 |
2012 | This means "only import the name `Map` from `Data.Map`":
2013 |
2014 | ~~~~ {.haskell}
2015 | import Data.Map (Map)
2016 | ~~~~
2017 |
2018 | And this means "import everything from `Data.Map`, but all those names
2019 | must be prefixed with `Map.`":
2020 |
2021 | ~~~~ {.haskell}
2022 | import qualified Data.Map as Map
2023 | ~~~~
2024 |
2025 |
2026 | # What use is an immutable data structure?
2027 |
2028 | Everyone knows how to add a key and value to a hash table, right?
2029 |
2030 | And that seems like a fundamental operation.
2031 |
2032 | What do we do with maps?
2033 |
2034 | * Create a *new* map that is identical to the one we supply, with the
2035 | requested element added.
2036 |
2037 | How can this possibly work? Is it efficient?
2038 |
2039 |
2040 | # A fistful of dollars
2041 |
2042 | Here's a surprisingly handy built-in operator:
2043 |
2044 | ~~~~ {.haskell}
2045 | f $ x = f x
2046 | ~~~~
2047 |
2048 | Why is this useful? Because it lets us eliminate parentheses.
2049 |
2050 | Before:
2051 |
2052 | ~~~~ {.haskell}
2053 | explode k = error ("failed on " ++ show k)
2054 | ~~~~
2055 |
2056 | After:
2057 |
2058 | ~~~~ {.haskell}
2059 | explode k = error $ "failed on " ++ show k
2060 | ~~~~
2061 |
2062 |
2063 | # Partial application
2064 |
2065 | This is annoying to write:
2066 |
2067 | ~~~~ {.haskell}
2068 | increment k = 1 + k
2069 | ~~~~
2070 |
2071 | Almost as bad:
2072 |
2073 | ~~~~ {.haskell}
2074 | \k -> 1 + k
2075 | ~~~~
2076 |
2077 | Much handier, and identical:
2078 |
2079 | ~~~~ {.haskell}
2080 | (1+)
2081 | ~~~~
2082 |
2083 | In fact, this is valid:
2084 |
2085 | ~~~~ {.haskell}
2086 | increment = (1+)
2087 | ~~~~
2088 |
2089 | # Spidering, in all its glory
2090 |
2091 | ~~~~ {.haskell}
2092 | spider :: Int -> URL -> IO (Map URL [URL])
2093 | spider count url0 = go 0 Map.empty (Set.singleton url0)
2094 | where
2095 | go k seen queue0
2096 | | k >= count = return seen
2097 | | otherwise =
2098 | case Set.minView queue0 of
2099 | Nothing -> return seen
2100 | Just (url, queue) -> do
2101 | page <- download url
2102 | let ls = links url page
2103 | newSeen = Map.insert url ls seen
2104 | notSeen = Set.fromList .
2105 | filter (`Map.notMember` newSeen) $ ls
2106 | newQueue = queue `Set.union` notSeen
2107 | go (k+1) newSeen newQueue
2108 | ~~~~
2109 |
2110 |
2111 | # Where do we stand?
2112 |
2113 | We can now:
2114 |
2115 | * Download a web page
2116 |
2117 | * Extract its links
2118 |
2119 | * Spider out from there, without repeat visits
2120 |
2121 | What remains?
2122 |
2123 | * We could spider multiple pages concurrently
2124 |
2125 | * Or we could compute which pages are "important"
2126 |
2127 |
2128 | # Fin
2129 |
2130 | At this point, if we have miraculously not run out of time, we're
2131 | going on a choose-your-own-adventure session in Emacs.
2132 |
2133 | Thanks for sticking with the slideshow so far!
2134 |
--------------------------------------------------------------------------------
/src/Download.hs:
--------------------------------------------------------------------------------
1 | module Download where
2 |
3 | import Data.ByteString.Lazy.UTF8
4 | import Network.HTTP.Enumerator
5 |
6 | download :: String -> IO String  -- fetch a URL and return its body as a String
7 | download url =
8 |   simpleHttp url >>= \body ->  -- body arrives as a ByteString
9 |     return (toString body)     -- converted with Data.ByteString.Lazy.UTF8.toString
10 |
--------------------------------------------------------------------------------
/src/Links.hs:
--------------------------------------------------------------------------------
1 | module Links where
2 |
3 | import Data.Maybe
4 | import Network.URI
5 | import Text.HTML.TagSoup
6 |
-- | Extract the outgoing links of a page.
--
-- The first argument is the URL the page was fetched from; it is the
-- base against which every @href@ is resolved by 'canonicalizeLink'.
-- The second argument is the page's HTML text.
--
-- Only opening @a@ tags are considered.  Tags whose @rel@ attribute is
-- exactly @"nofollow"@ are dropped, as are tags whose @href@ is empty
-- and links that 'canonicalizeLink' cannot resolve.
links :: String -> String -> [String]
links url page = catMaybes
    [ canonicalizeLink url href
    | tag <- canonicalizeTags (parseTags page)
    , isTagOpenName "a" tag
    , fromAttrib "rel" tag /= "nofollow"
    , let href = fromAttrib "href" tag
    , not (null href)
    ]
16 |
-- | Resolve a (possibly relative) link against the URL of the page it
-- was found on, and strip everything from the first @#@ onwards.
--
-- Yields 'Nothing' when the referer does not parse as a URI, when the
-- link does not parse as a URI reference, or when the two cannot be
-- combined by 'nonStrictRelativeTo'.
canonicalizeLink :: String -> String -> Maybe String
canonicalizeLink referer path =
    parseURI referer >>= \base ->
    parseURIReference path >>= \ref ->
    ref `nonStrictRelativeTo` base >>= \absolute ->
    Just (takeWhile (/= '#') (uriToString id absolute ""))
24 |
--------------------------------------------------------------------------------
/src/Main.hs:
--------------------------------------------------------------------------------
1 | import System.Environment
2 | import Download
3 | import Links
4 |
-- | Entry point: echo the command-line arguments, download the page
-- named by the first argument, and print the links found on it.
--
-- 'links' needs the page's own URL as well as its text (the URL is the
-- base for resolving relative links), so both are passed.  When no
-- argument is supplied the program fails with a usage message instead
-- of crashing inside 'head'.
main :: IO ()
main = do
    args <- getArgs
    putStrLn ("So! Your args are " ++ show args)
    case args of
        (url:_) -> do
            page <- download url
            print (links url page)
        [] -> ioError (userError "usage: strange-loop URL")
10 |
--------------------------------------------------------------------------------
/src/RankPages.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE BangPatterns, OverloadedStrings, RecordWildCards #-}
2 |
3 | import Data.Bits ((.&.))
4 | import Data.Function (on)
5 | import MailRank.Functions (every)
6 | import Data.Hashable (Hashable(..))
7 | import qualified Data.HashMap.Strict as H
8 | import Data.List (foldl')
9 |
-- | A directed pair of integer IDs.  Both fields are strict and
-- unpacked.
data Link = Link
    { sender    :: {-# UNPACK #-} !Int  -- ^ ID at the originating end.
    , recipient :: {-# UNPACK #-} !Int  -- ^ ID at the receiving end.
    } deriving (Eq, Show)
14 |
-- | Hash a 'Link' by folding its two fields together, sender first and
-- recipient second.
instance Hashable Link where
    hash (Link from to) = hashWithSalt (hash from) to
    {-# INLINE hash #-}
    hashWithSalt salt (Link from to) =
        (salt `hashWithSalt` from) `hashWithSalt` to
    {-# INLINE hashWithSalt #-}
21 |
-- | This matrix maps pages to the pages they've linked to.  The outer
-- list is indexed by page ID, and each inner list contains the ID of
-- every page that page links to.
type OutgoingLinks = [[Int]]

-- | This matrix maps pages to the pages they've been linked from.
-- The outer list is indexed by page ID, and each inner list contains
-- the ID of every page that links to that page.
type IncomingLinks = [[Int]]

-- | Map from page ID to the reciprocal of the number of pages
-- they've linked to.
type LinkFactors = [Double]

-- | Indices of silent pages: those with no outgoing links.  (As
-- computed by 'transpose', incoming links are not checked.)
type Silents = [Int]
39 |
-- | Invert an outgoing-link matrix.
--
-- Returns a triple of:
--
-- * the incoming-link matrix, with one entry per input row; pages that
--   nobody links to get an empty list, and sources are consed on in
--   row order, so the most recently visited source comes first;
--
-- * for every page, the reciprocal of its number of outgoing links
--   (for a page with no outgoing links this is @recip 0@);
--
-- * the indices of the pages that have no outgoing links.
transpose :: OutgoingLinks -> (IncomingLinks, LinkFactors, Silents)
transpose outgoingLinks = (incomingLinks, linkFactors, silent)
  where
    linkFactors =
        [ recip (fromIntegral (length targets)) | targets <- outgoingLinks ]

    silent =
        [ page | (page, targets) <- imap (,) outgoingLinks, null targets ]

    incomingLinks = generate outgoingLinks sourcesOf

    -- Sources recorded for a page, or none if it never appears as a
    -- link target.
    sourcesOf page = maybe [] id (H.lookup page incoming)

    -- Map from target page to the pages linking to it.
    incoming = ifoldl' addPage H.empty outgoingLinks

    addPage acc src targets =
        foldl' (\m dst -> H.insertWith (++) dst [src] m) acc targets
51 |
-- | One step of the iterative rank computation.
data Rank = Rank
    { rankIter   :: {-# UNPACK #-} !Int  -- ^ Number of iterations performed so far.
    , rankVector :: [Double]             -- ^ Current score of every page, indexed by page ID.
    }
56 |
-- | The infinite sequence of successive rank estimates.
--
-- The first element is the uniform vector (every page scored @1/n@,
-- where @n@ is the number of entries in the link-factor list).  Each
-- later element is computed from its predecessor: a page's new score
-- is the sum of
--
-- * @alpha@ times the dot product of its linkers' old scores with the
--   linkers' link factors,
--
-- * @alpha@ times the total old score of the silent pages, divided by
--   @n@, and
--
-- * @(1 - alpha)@ times the total old score, divided by @n@.
--
-- On every 16th iteration (when the low four bits of the iteration
-- counter are all set) the previous vector is first rescaled to sum
-- to 1.
ranks :: IncomingLinks -> LinkFactors -> Silents -> Double
      -> [Rank]
ranks incoming factors silent alpha =
    iterate iter $ Rank 0 (replicate count (1/n))
  where
    iter (Rank k old0) = Rank (k+1) (map step incoming)
      where
        step link = h + a + i
          where
            -- Contribution from the pages that link to this one.
            h | null link = 0
              | otherwise = alpha * backpermute old link `dot`
                            backpermute factors link

        -- The next two terms are the same for every page, so they are
        -- bound once per iteration rather than once per page.
        i = (1 - alpha) * sum old / n
        a | null silent = 0
          | otherwise   = alpha * sum (backpermute old silent) / n

        -- Mask with 15, not 16: @k .&. 16@ is only ever 0 or 16, so a
        -- comparison against 15 could never succeed and the vector
        -- would never be renormalised.
        old | k .&. 15 == 15 = map (/ sum old0) old0
            | otherwise      = old0
    count = length factors
    n = fromIntegral count
76 |
-- | Iterate 'ranks' over the given link matrix until it settles.
--
-- Consecutive estimates are paired with the 'distance' between their
-- rank vectors; that stream is thinned with @every 10@ and cut to at
-- most 8 samples.  The result is the first sample whose distance is
-- below @epsilon@ times the number of pages.
--
-- NOTE: 'head' is applied to the filtered samples, so this fails if
-- none of the samples examined is below the threshold.
rank :: OutgoingLinks -> Double -> Double -> Rank
rank outgoing alpha epsilon = snd (head converged)
  where
    (incoming, factors, silent) = transpose outgoing
    estimates = ranks incoming factors silent alpha
    n         = fromIntegral (length incoming)

    -- Pair each estimate with its distance from the preceding one.
    withDelta prev cur = ((distance `on` rankVector) cur prev, cur)

    samples   = take 8 (every 10 (zipWith withDelta estimates (tail estimates)))
    converged = filter (\(delta, _) -> delta < epsilon * n) samples
85 |
-- | Euclidean distance between two vectors: the square root of the
-- dot product of their element-wise difference with itself.  Elements
-- beyond the length of the shorter list are ignored.
distance :: [Double] -> [Double] -> Double
distance a b =
    let delta = zipWith (-) a b
    in  sqrt (dot delta delta)
89 |
-- | Dot product of two vectors.  Elements beyond the length of the
-- shorter list are ignored.
dot :: [Double] -> [Double] -> Double
dot xs ys = sum [ x * y | (x, y) <- zip xs ys ]
92 |
-- | Select elements of a list by position: the result holds, for each
-- index in the second argument (in order), the element of the first
-- argument at that index.  Indexing uses '!!', so an out-of-range
-- index is an error.
backpermute :: [a] -> [Int] -> [a]
backpermute xs is = [ xs !! ix | ix <- is ]
95 |
-- | Map over a list, also passing each element's zero-based position
-- to the function.
imap :: (Int -> a -> b) -> [a] -> [b]
imap f = zipWith f [0 ..]
100 |
-- | Build a list with one element per element of the first argument,
-- the element at position @i@ being @f i@.  The contents of the first
-- argument are ignored; only its length matters.
generate :: [b] -> (Int -> a) -> [a]
generate xs f = imap (const . f) xs
103 |
-- | Strict left fold that also passes each element's zero-based
-- position to the folding function.  The position and each new
-- accumulator are forced before the fold moves on.
ifoldl' :: (a -> Int -> b -> a) -> a -> [b] -> a
ifoldl' f = loop 0
  where
    loop ix acc ys = ix `seq` case ys of
        []       -> acc
        (y:rest) -> let acc' = f acc ix y
                    in  acc' `seq` loop (ix + 1) acc' rest
109 |
--------------------------------------------------------------------------------
/src/Spider.hs:
--------------------------------------------------------------------------------
1 | import Download
2 | import Links
3 | import qualified Data.Map as Map
4 | import Data.Map (Map)
5 | import qualified Data.Set as Set
6 |
-- | A URL, kept as plain text.
type URL = String

-- | Crawl outwards from a starting URL, downloading at most @count@
-- pages, and return a map from each downloaded URL to the links found
-- on it.
--
-- Pending URLs are kept in a set and the smallest one is visited next.
-- Links that already have an entry in the result map are not queued
-- again.  The crawl stops once @count@ pages have been downloaded or
-- there is nothing left to visit.
spider :: Int -> URL -> IO (Map URL [URL])
spider count url0 = crawl 0 Map.empty (Set.singleton url0)
  where
    crawl visited graph frontier
      | visited >= count = return graph
      | otherwise        = maybe (return graph) visit (Set.minView frontier)
      where
        visit (url, rest) = do
            page <- download url
            let outbound = links url page
                graph'   = Map.insert url outbound graph
                fresh    = Set.fromList
                             [ u | u <- outbound, u `Map.notMember` graph' ]
            crawl (visited + 1) graph' (rest `Set.union` fresh)
25 |
--------------------------------------------------------------------------------
/strange-loop-2011.cabal:
--------------------------------------------------------------------------------
1 | name: strange-loop
2 | version: 0
3 | cabal-version: >= 1.2
4 | build-type: Simple
5 |
executable strange-loop
  hs-source-dirs: src
  main-is:        Main.hs
  -- Local modules imported by Main.hs.
  other-modules:
    Download,
    Links

  -- network supplies Network.URI, which Links.hs imports.
  build-depends:
    base,
    bytestring,
    http-enumerator,
    network,
    tagsoup,
    utf8-string
16 |
--------------------------------------------------------------------------------