├── .gitignore
├── LICENSE
├── README.md
├── Scraping.xlsm
├── examples.bas
├── src
│   ├── Scraping.cls
│   └── XmlScraping.cls
└── tests
    ├── Test_Scraping.bas
    └── Test_XmlScraping.bas
/.gitignore:
--------------------------------------------------------------------------------
1 | ~$*
2 | css
3 | .sass-cache
4 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2017 Excel VBA
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Web Scraping
2 | Extract data from websites easily.
3 |
4 | ## Examples
5 | ```vb
Sub do_a_search_on_wikipedia()
    ' Type a query into Wikipedia's search box and submit it.
    Dim query As String
    Dim browser As New Scraping

    query = "document object model"

    browser.gotoPage "https://en.wikipedia.org/wiki/Main_Page", True 'browser visible

    browser.id("searchInput").fieldValue query
    browser.id("searchButton").click
End Sub
19 | ```
20 |
21 | ```vb
Sub extract_the_titles_of_the_questions_in_stackoverflow()
    ' Write every question title found on the page into column A,
    ' one title per row starting at row 1.
    Dim page As New XmlScraping
    Dim totalTitles As Integer
    Dim rowNumber As Integer

    page.gotoPage "https://stackoverflow.com/"

    totalTitles = page.css(".summary h3 a").count

    ' Sheet rows are 1-based while the matched collection is 0-based.
    For rowNumber = 1 To totalTitles
        Cells(rowNumber, 1) = page.css(".summary h3 a").index(rowNumber - 1).text
    Next rowNumber
End Sub
37 | ```
38 |
--------------------------------------------------------------------------------
/Scraping.xlsm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/victorze-vba/Scraping/d6f134059e0bcdc6d2fa8fb50e86e5f4e17ae997/Scraping.xlsm
--------------------------------------------------------------------------------
/examples.bas:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/victorze-vba/Scraping/d6f134059e0bcdc6d2fa8fb50e86e5f4e17ae997/examples.bas
--------------------------------------------------------------------------------
/src/Scraping.cls:
--------------------------------------------------------------------------------
1 | VERSION 1.0 CLASS
2 | BEGIN
3 | MultiUse = -1 'True
4 | END
5 | Attribute VB_Name = "Scraping"
6 | Attribute VB_GlobalNameSpace = False
7 | Attribute VB_Creatable = False
8 | Attribute VB_PredeclaredId = False
9 | Attribute VB_Exposed = True
10 | ''
11 | ' Scraping v0.1.1 Alpha
12 | ' (c) Victor Zevallos - https://github.com/vba-dev/vba-scraping
13 | '
14 | ' Library used: Microsoft Internet Controls,
15 | ' Microsoft HTML Object Library
16 | '
17 | ' @class Scraping
18 | ' @author victorzevallos@protonmail.com
19 | ' @license MIT (http://www.opensource.org/licenses/mit-license.php)
20 | '' ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ '
21 |
22 | Option Explicit
23 |
24 | ' --------------------------------------------- '
25 | ' Properties
26 | ' --------------------------------------------- '
27 |
' Internet Explorer instance driven by this object; assigned in gotoPage.
Private ie As SHDocVw.InternetExplorer

' Document the selectors run against; reassigned after navigation and clicks.
Private doc As New MSHTML.HTMLDocument

' Element stored by the most recent at_css, index or id call.
Private element As MSHTML.IHTMLElement

' Collection stored by the most recent css call.
Private children As MSHTML.IHTMLDOMChildrenCollection
35 |
36 | ' --------------------------------------------- '
37 | ' Public Methods
38 | ' --------------------------------------------- '
39 |
''
' Open a new Internet Explorer instance, navigate to the url and wait
' until the page has finished loading.
'
' A hidden browser left over from an earlier call is closed first, so
' repeated navigation does not leave orphaned Internet Explorer
' processes behind. A visible browser is left open for the user.
'
' @param {String} url
' @param {Boolean} visibleBrowser  Show the browser window (default False)
' @return void
''
Public Sub gotoPage(url As String, Optional visibleBrowser As Boolean = False)
    Dim previousHidden As Boolean

    If Not ie Is Nothing Then
        ' The old window may already be gone (e.g. closed by the user), in
        ' which case reading its state fails and previousHidden stays False.
        On Error Resume Next
        previousHidden = Not ie.visible
        If previousHidden Then ie.quit
        On Error GoTo 0
    End If

    Set ie = New SHDocVw.InternetExplorer

    ie.visible = visibleBrowser

    ie.Navigate url

    loadBrowser

    ' Selectors must run against the page that was just loaded.
    Set doc = ie.Document
End Sub
58 |
''
' Run a CSS selector against the current document and keep every match
' as the active collection (see index and count).
'
' @param {String} selector
' @return Scraping  This same object, so calls can be chained
'
' Example selector id: "#nameId" | class: ".nameClass" | tag: "nameTag"
''
Public Function css(selector As String) As Scraping
    Set children = doc.querySelectorAll(selector)
    Set css = Me
End Function
72 |
''
' Make the first element matching the CSS selector the active element
'
' @param {String} selector
' @return Scraping  This same object, so calls can be chained
''
Public Function at_css(selector As String) As Scraping
    Set element = doc.querySelector(selector)
    Set at_css = Me
End Function
84 |
''
' Make the i-th item of the active collection the active element
'
' @param {Integer} i  Position in the collection filled by css
' @return Scraping  This same object, so calls can be chained
''
Public Function index(i As Integer) As Scraping
    ' Explicit call to the collection's default member.
    Set element = children.Item(i)
    Set index = Me
End Function
96 |
''
' Return the active element typed as a table row, for working with tables
'
' @return MSHTML.HTMLTableRow
''
Public Function rowTable() As MSHTML.HTMLTableRow
    Dim selectedRow As MSHTML.HTMLTableRow

    Set selectedRow = element
    Set rowTable = selectedRow
End Function
105 |
''
' Return how many items the active collection holds
'
' @return Integer
''
Public Function count() As Integer
    With children
        count = .Length
    End With
End Function
114 |
''
' Make the element with the specified id the active element
'
' @param {String} idName
' @return Scraping  This same object, so calls can be chained
''
Public Function id(idName As String) As Scraping
    Set element = doc.getElementById(idName)
    Set id = Me
End Function
126 |
''
' Return the inner text of the active element
'
' @return String
''
Public Function text() As String
    With element
        text = .innerText
    End With
End Function
135 |
''
' Return the inner html of the active element
'
' @return String
''
Public Function html() As String
    With element
        html = .innerHTML
    End With
End Function
144 |
''
' Return the value of the specified attribute of the active element
'
' @param {String} atributeName
' @return String  The attribute value, or "" when the element does not
'                 carry the attribute
''
Public Function attr(atributeName As String) As String
    Dim rawValue As Variant

    rawValue = element.getAttribute(atributeName)

    ' A missing attribute comes back as Null, and assigning Null to a
    ' String raises run-time error 94 (Invalid use of Null).
    If IsNull(rawValue) Then
        attr = vbNullString
    Else
        attr = rawValue
    End If
End Function
154 |
''
' Click the active element, wait for the browser to finish loading,
' pause, then point the object at the browser's current document
'
' @param {Byte} sleep  Seconds to pause after loading (default 3)
' @return void
''
Public Sub click(Optional sleep As Byte = 3)
    element.click

    Call loadBrowser
    Call pause(sleep)

    ' The click may have replaced the page, so refresh the document.
    Set doc = ie.Document
End Sub
169 |
''
' Assign a value to the active text or select element
'
' @param {String} str  Value to write into the element
' @return Scraping  This same object, so calls can be chained
''
Public Function fieldValue(str As String) As Scraping
    element.value = str
    Set fieldValue = Me
End Function
180 |
181 | ' --------------------------------------------- '
182 | ' Private Methods
183 | ' --------------------------------------------- '
184 |
''
' Wait until the browser reports the page as completely loaded and is
' no longer busy
'
' @return void
''
Private Sub loadBrowser()
    Do While ie.ReadyState <> READYSTATE_COMPLETE Or ie.Busy
        ' Yield to the operating system so the host application stays
        ' responsive instead of spinning in a tight loop.
        DoEvents
    Loop
End Sub
194 |
''
' Pause in seconds
'
' @param {Byte} Seconds  Number of seconds to wait (0-255)
' @return void
''
Private Sub pause(Seconds As Byte)
    ' TimeSerial rolls seconds over into minutes, so the whole Byte range
    ' is valid. A time string built by concatenation ("0:00:0" & n) is
    ' malformed for multi-digit values such as 60 or 120.
    Application.Wait Now + TimeSerial(0, 0, Seconds)
End Sub
203 |
''
' Runs when object is no longer used: closes the browser if it is hidden
' and releases the reference. A visible browser is left open.
'
' @return void
''
Private Sub Class_Terminate()
    Dim browserHidden As Boolean

    ' ie is Nothing when gotoPage was never called; touching it would
    ' raise run-time error 91 during teardown.
    If Not ie Is Nothing Then
        ' The window may already have been closed by the user, in which
        ' case reading its state fails and browserHidden stays False.
        On Error Resume Next
        browserHidden = Not ie.visible
        If browserHidden Then ie.quit
        On Error GoTo 0
    End If

    Set ie = Nothing
End Sub
214 |
--------------------------------------------------------------------------------
/src/XmlScraping.cls:
--------------------------------------------------------------------------------
1 | VERSION 1.0 CLASS
2 | BEGIN
3 | MultiUse = -1 'True
4 | END
5 | Attribute VB_Name = "XmlScraping"
6 | Attribute VB_GlobalNameSpace = False
7 | Attribute VB_Creatable = False
8 | Attribute VB_PredeclaredId = False
9 | Attribute VB_Exposed = False
10 | ''
11 | ' Scraping v0.1.1 Alpha
12 | ' (c) Victor Zevallos - https://github.com/vba-dev/vba-scraping
13 | '
14 | ' Library used: Microsoft XML v6.0
15 | ' Microsoft HTML Object Library
16 | '
17 | ' @class XmlScraping
18 | ' @author victorzevallos@protonmail.com
19 | ' @license MIT (http://www.opensource.org/licenses/mit-license.php)
20 | '' ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ '
21 |
22 | Option Explicit
23 |
24 | ' --------------------------------------------- '
25 | ' Properties
26 | ' --------------------------------------------- '
27 |
' Document the selectors run against; its body is filled by gotoPage.
Private doc As New MSHTML.HTMLDocument

' Element stored by the most recent at_css, index or id call.
Private element As MSHTML.IHTMLElement

' Collection stored by the most recent css call.
Private children As MSHTML.IHTMLDOMChildrenCollection
33 |
34 | ' --------------------------------------------- '
35 | ' Public Methods
36 | ' --------------------------------------------- '
37 |
''
' Download the url with a synchronous GET request and load the response
' text into the document body
'
' @param {String} url
' @return void
''
Public Sub gotoPage(url As String)
    Dim request As MSXML2.XMLHTTP60

    Set request = New MSXML2.XMLHTTP60

    ' Third argument False: send does not return until the response arrives.
    request.Open "GET", url, False
    request.send

    doc.body.innerHTML = request.responseText
End Sub
52 |
''
' Run a CSS selector against the document and keep every match as the
' active collection (see index and count).
'
' @param {String} selector
' @return XmlScraping  This same object, so calls can be chained
'
' Example selector id: "#nameId" | class: ".nameClass" | tag: "nameTag"
''
Public Function css(selector As String) As XmlScraping
    Set children = doc.querySelectorAll(selector)
    Set css = Me
End Function
66 |
''
' Make the first element matching the CSS selector the active element
'
' @param {String} selector
' @return XmlScraping  This same object, so calls can be chained
''
Public Function at_css(selector As String) As XmlScraping
    Set element = doc.querySelector(selector)
    Set at_css = Me
End Function
78 |
''
' Make the i-th item of the active collection the active element
'
' @param {Integer} i  Position in the collection filled by css
' @return XmlScraping  This same object, so calls can be chained
''
Public Function index(i As Integer) As XmlScraping
    ' Explicit call to the collection's default member.
    Set element = children.Item(i)
    Set index = Me
End Function
90 |
''
' Return the active element typed as a table row, for working with tables
'
' @return MSHTML.HTMLTableRow
''
Public Function rowTable() As MSHTML.HTMLTableRow
    Dim selectedRow As MSHTML.HTMLTableRow

    Set selectedRow = element
    Set rowTable = selectedRow
End Function
99 |
''
' Return how many items the active collection holds
'
' @return Integer
''
Public Function count() As Integer
    With children
        count = .Length
    End With
End Function
108 |
''
' Make the element with the specified id the active element
'
' @param {String} idName
' @return XmlScraping  This same object, so calls can be chained
''
Public Function id(idName As String) As XmlScraping
    Set element = doc.getElementById(idName)
    Set id = Me
End Function
120 |
''
' Return the inner text of the active element
'
' @return String
''
Public Function text() As String
    With element
        text = .innerText
    End With
End Function
129 |
''
' Return the inner html of the active element
'
' @return String
''
Public Function html() As String
    With element
        html = .innerHTML
    End With
End Function
138 |
''
' Return the value of the specified attribute of the active element
'
' @param {String} atributeName
' @return String  The attribute value, or "" when the element does not
'                 carry the attribute
''
Public Function attr(atributeName As String) As String
    Dim rawValue As Variant

    rawValue = element.getAttribute(atributeName)

    ' A missing attribute comes back as Null, and assigning Null to a
    ' String raises run-time error 94 (Invalid use of Null).
    If IsNull(rawValue) Then
        attr = vbNullString
    Else
        attr = rawValue
    End If
End Function
148 |
149 |
--------------------------------------------------------------------------------
/tests/Test_Scraping.bas:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/victorze-vba/Scraping/d6f134059e0bcdc6d2fa8fb50e86e5f4e17ae997/tests/Test_Scraping.bas
--------------------------------------------------------------------------------
/tests/Test_XmlScraping.bas:
--------------------------------------------------------------------------------
1 | Attribute VB_Name = "Test_XmlScraping"
2 | Option Explicit
3 |
' Checks text and html extraction against the Stack Overflow home page.
Sub TestTextHtml()
    Dim page As New XmlScraping
    Dim suite As New SpecSuite

    page.gotoPage "https://stackoverflow.com/"

    ' Lookup by id, reading the element's text.
    With suite.It("Extract the text of an element with id 'nav-questions'")
        .Expect(page.id("nav-questions").text).ToEqual "Questions"
    End With

    ' Lookup by class, reading the html of the first match.
    With suite.It("Extract to the html of the first element of a collection with class .js-gps-track")
        .Expect(page.css(".js-gps-track").index(0).html).ToEqual "Stack Overflow "
    End With

    InlineRunner.RunSuite suite
End Sub
20 |
' Checks tag, class and attribute selection against the project's demo page.
Sub TestCollection()
    Dim Specs As New SpecSuite
    Dim doc As New XmlScraping

    doc.gotoPage "https://vba-dev.github.io/vba-scraping/"

    With Specs.It("Select tag")
        .Expect(doc.css("span").index(0).text).ToEqual ""
        .Expect(doc.css("span").index(2).html).ToEqual "Submit"
    End With

    With Specs.It("Select class")
        .Expect(doc.at_css(".title").text).ToEqual "VBA Scraping"
        .Expect(doc.css(".title").index(0).text).ToEqual "VBA Scraping"
    End With

    ' This spec previously reused the description "Select class", so a
    ' failure report could not tell the two specs apart.
    With Specs.It("Select attribute")
        .Expect(doc.css(".download a").index(0).attr("href")).ToEqual "about:Scraping_web.xlsm"
    End With

    InlineRunner.RunSuite Specs
End Sub
43 |
44 |
--------------------------------------------------------------------------------