├── .gitignore ├── LICENSE ├── README.md ├── Scraping.xlsm ├── examples.bas ├── src ├── Scraping.cls └── XmlScraping.cls └── tests ├── Test_Scraping.bas └── Test_XmlScraping.bas /.gitignore: -------------------------------------------------------------------------------- 1 | ~$* 2 | css 3 | .sass-cache 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Excel VBA 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Web Scraping 2 | Extract data from websites easily. 
3 | 4 | ## Examples 5 | ```vb 6 | Sub do_a_search_on_wikipedia() 7 | 8 | Dim doc As New Scraping 9 | Dim search As String 10 | 11 | search = "document object model" 12 | 13 | doc.gotoPage "https://en.wikipedia.org/wiki/Main_Page", True 'browser visible 14 | 15 | doc.id("searchInput").fieldValue search 16 | doc.id("searchButton").click 17 | 18 | End Sub 19 | ``` 20 | 21 | ```vb 22 | Sub extract_the_titles_of_the_questions_in_stackoverflow() 23 | 24 | Dim i As Integer 25 | Dim doc As New XmlScraping 26 | Dim numberTitles As Integer 27 | 28 | doc.gotoPage "https://stackoverflow.com/" 29 | 30 | numberTitles = doc.css(".summary h3 a").count 31 | 32 | For i = 0 To numberTitles - 1 33 | Cells(i + 1, 1) = doc.css(".summary h3 a").index(i).text 34 | Next i 35 | 36 | End Sub 37 | ``` 38 | -------------------------------------------------------------------------------- /Scraping.xlsm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/victorze-vba/Scraping/d6f134059e0bcdc6d2fa8fb50e86e5f4e17ae997/Scraping.xlsm -------------------------------------------------------------------------------- /examples.bas: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/victorze-vba/Scraping/d6f134059e0bcdc6d2fa8fb50e86e5f4e17ae997/examples.bas -------------------------------------------------------------------------------- /src/Scraping.cls: -------------------------------------------------------------------------------- 1 | VERSION 1.0 CLASS 2 | BEGIN 3 | MultiUse = -1 'True 4 | END 5 | Attribute VB_Name = "Scraping" 6 | Attribute VB_GlobalNameSpace = False 7 | Attribute VB_Creatable = False 8 | Attribute VB_PredeclaredId = False 9 | Attribute VB_Exposed = True 10 | '' 11 | ' Scraping v0.1.1 Alpha 12 | ' (c) Victor Zevallos - https://github.com/vba-dev/vba-scraping 13 | ' 14 | ' Library used: Microsoft Internet Controls, 15 | ' Microsoft HTML Object Library 16 | ' 
' @class Scraping
' @author victorzevallos@protonmail.com
' @license MIT (http://www.opensource.org/licenses/mit-license.php)
'' ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ '

Option Explicit

' --------------------------------------------- '
' Properties
' --------------------------------------------- '

' Browser instance driven by this object (created in gotoPage)
Private ie As SHDocVw.InternetExplorer

' Document of the page currently loaded in the browser
Private doc As New MSHTML.HTMLDocument

' Element most recently selected by at_css, index or id
Private element As MSHTML.IHTMLElement

' Collection most recently selected by css
Private children As MSHTML.IHTMLDOMChildrenCollection

' --------------------------------------------- '
' Public Methods
' --------------------------------------------- '

''
' Open a new browser, navigate to the url and wait until the page has loaded.
' A hidden browser left over from an earlier call is closed first so that
' repeated calls do not pile up invisible Internet Explorer processes; a
' browser that was opened visible is left alone for the user.
'
' @param {String} url
' @param {Boolean} visibleBrowser  True to show the browser window
' @return void
''
Public Sub gotoPage(url As String, Optional visibleBrowser As Boolean = False)
    Dim previousHidden As Boolean

    If Not ie Is Nothing Then
        ' The previous window may already be gone, in which case any call on
        ' it fails; previousHidden then stays False and nothing is closed.
        On Error Resume Next
        previousHidden = Not ie.visible
        If previousHidden Then ie.quit
        On Error GoTo 0
    End If

    Set ie = New SHDocVw.InternetExplorer

    ie.visible = visibleBrowser

    ie.Navigate url

    loadBrowser

    Set doc = ie.Document
End Sub

''
' Select a collection of elements
'
' @param {String} selector
' @return Scraping
'
' Example selector id: "#nameId" | class: ".nameClass" | tag: "nameTag"
''
Public Function css(selector As String) As Scraping
    Set children = doc.querySelectorAll(selector)

    Set css = Me
End Function

''
' Select the first element that matches the selector
'
' @param {String} selector
' @return Scraping
''
Public Function at_css(selector As String) As Scraping
    Set element = doc.querySelector(selector)

    Set at_css = Me
End Function

''
' Select an item from the collection previously selected with css
'
' @param {Integer} i  zero-based position in the collection
' @return Scraping
''
Public Function index(i As Integer) As Scraping
    Set element = children(i)

    Set index = Me
End Function

''
' Return the selected element as an object to work with tables
'
' @return MSHTML.HTMLTableRow
''
Public Function rowTable() As MSHTML.HTMLTableRow
    Set rowTable = element
End Function

''
' Return the number of items in the collection
'
' @return Integer
''
Public Function count() As Integer
    count = children.Length
End Function

''
' Select the element with the specified id
'
' @param {String} idName
' @return Scraping
''
Public Function id(idName As String) As Scraping
    Set element = doc.getElementById(idName)

    Set id = Me
End Function

''
' Return the text of the selected element
'
' @return String
''
Public Function text() As String
    text = element.innerText
End Function

''
' Return the html of the selected element
'
' @return String
''
Public Function html() As String
    html = element.innerHTML
End Function

''
' Return the value of the specified attribute, or an empty string when the
' selected element does not carry that attribute.
'
' @param {String} atributeName
' @return String
''
Public Function attr(atributeName As String) As String
    Dim rawValue As Variant

    rawValue = element.getAttribute(atributeName)

    ' getAttribute yields Null for a missing attribute; assigning Null to the
    ' String result would raise "Invalid use of Null".
    If IsNull(rawValue) Then
        attr = ""
    Else
        attr = CStr(rawValue)
    End If
End Function

''
' Click the selected element, wait for the browser to finish loading, pause,
' and then pick up the document that is loaded afterwards.
'
' @param {Byte} sleep  extra pause in seconds after the load (default 3)
' @return void
''
Public Sub click(Optional sleep As Byte = 3)
    element.click

    loadBrowser

    pause sleep

    Set doc = ie.Document
End Sub

''
' Assigns a value to a text or select element
'
' @param {String} str
' @return Scraping
''
Public Function fieldValue(str As String) As Scraping
    element.value = str

    Set fieldValue = Me
End Function

' --------------------------------------------- '
' Private Methods
'
--------------------------------------------- '

''
' Wait for the browser to load
'
' @return void
''
Private Sub loadBrowser()
    Do While ie.ReadyState <> READYSTATE_COMPLETE Or ie.Busy
        ' Yield to the host on every pass so the application stays
        ' responsive instead of spinning in a tight loop.
        DoEvents
    Loop
End Sub

''
' Pause in seconds
'
' @param {Byte} Seconds  any Byte value (0-255)
' @return void
''
Private Sub pause(Seconds As Byte)
    ' TimeSerial accepts the whole Byte range; concatenating the number onto
    ' "0:00:0" only gives a well-formed time literal for the values 0-9.
    Application.wait Now + TimeSerial(0, 0, Seconds)
End Sub

''
' Runs when object is no longer used.
' Closes the browser if it was opened hidden; a visible browser is left open.
'
' @return void
''
Private Sub Class_Terminate()
    Dim browserHidden As Boolean

    ' ie is Nothing when gotoPage was never called, and its window may
    ' already have been closed; neither case may raise from a terminator.
    If Not ie Is Nothing Then
        On Error Resume Next
        browserHidden = Not ie.visible
        If browserHidden Then ie.quit
        On Error GoTo 0
    End If

    Set ie = Nothing
End Sub
--------------------------------------------------------------------------------
/src/XmlScraping.cls:
--------------------------------------------------------------------------------
VERSION 1.0 CLASS
BEGIN
  MultiUse = -1  'True
END
Attribute VB_Name = "XmlScraping"
Attribute VB_GlobalNameSpace = False
Attribute VB_Creatable = False
Attribute VB_PredeclaredId = False
Attribute VB_Exposed = False
''
' Scraping v0.1.1 Alpha
' (c) Victor Zevallos - https://github.com/vba-dev/vba-scraping
'
' Library used: Microsoft XML v6.0
'               Microsoft HTML Object Library
'
' @class XmlScraping
' @author victorzevallos@protonmail.com
' @license MIT (http://www.opensource.org/licenses/mit-license.php)
'' ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ '

Option Explicit

' --------------------------------------------- '
' Properties
' --------------------------------------------- '

Private doc As New MSHTML.HTMLDocument

Private element As MSHTML.IHTMLElement

Private children As MSHTML.IHTMLDOMChildrenCollection

' --------------------------------------------- '
' Public Methods
' --------------------------------------------- '

''
' Download the page at the url with a synchronous GET request and load the
' response text into the document body.
'
' @param {String} url
' @return void
''
Public Sub gotoPage(url As String)
    Dim request As MSXML2.XMLHTTP60

    Set request = New MSXML2.XMLHTTP60

    With request
        .Open "GET", url, False
        .send

        doc.body.innerHTML = .responseText
    End With
End Sub

''
' Remember every element matching the selector as the current collection
'
' @param {String} selector
' @return XmlScraping
'
' Example selector id: "#nameId" | class: ".nameClass" | tag: "nameTag"
''
Public Function css(selector As String) As XmlScraping
    Dim matches As MSHTML.IHTMLDOMChildrenCollection

    Set matches = doc.querySelectorAll(selector)
    Set children = matches

    Set css = Me
End Function

''
' Remember the first element matching the selector as the current element
'
' @param {String} selector
' @return XmlScraping
''
Public Function at_css(selector As String) As XmlScraping
    Dim firstMatch As MSHTML.IHTMLElement

    Set firstMatch = doc.querySelector(selector)
    Set element = firstMatch

    Set at_css = Me
End Function

''
' Make one item of the current collection the current element
'
' @param {Integer} i  position in the collection
' @return XmlScraping
''
Public Function index(i As Integer) As XmlScraping
    Set element = children.Item(i)

    Set index = Me
End Function

''
' Hand back the current element as an object to work with tables
'
' @return MSHTML.HTMLTableRow
''
Public Function rowTable() As MSHTML.HTMLTableRow
    Dim row As MSHTML.HTMLTableRow

    Set row = element

    Set rowTable = row
End Function

''
' Tell how many items the current collection holds
'
' @return Integer
''
Public Function count() As Integer
    With children
        count = .Length
    End With
End Function

''
' Remember the element with the specified id as the current element
'
' @param {String} idName
' @return XmlScraping
''
Public Function id(idName As String) As XmlScraping
    Dim found As MSHTML.IHTMLElement

    Set found = doc.getElementById(idName)
    Set element = found

    Set id = Me
End Function

''
' Return the text of the selected element
'
' @return String
''
Public Function text() As String
    text = element.innerText
End Function

''
' Return the html of the selected element
'
' @return String
''
Public Function html() As String
    html = element.innerHTML
End Function

''
' Return the value of the specified attribute, or an empty string when the
' selected element does not carry that attribute.
'
' @param {String} atributeName
' @return String
''
Public Function attr(atributeName As String) As String
    Dim rawValue As Variant

    rawValue = element.getAttribute(atributeName)

    ' getAttribute yields Null for a missing attribute; assigning Null to the
    ' String result would raise "Invalid use of Null".
    If IsNull(rawValue) Then
        attr = ""
    Else
        attr = CStr(rawValue)
    End If
End Function

--------------------------------------------------------------------------------
/tests/Test_Scraping.bas:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/victorze-vba/Scraping/d6f134059e0bcdc6d2fa8fb50e86e5f4e17ae997/tests/Test_Scraping.bas
--------------------------------------------------------------------------------
/tests/Test_XmlScraping.bas:
--------------------------------------------------------------------------------
Attribute VB_Name = "Test_XmlScraping"
Option Explicit

' Checks text and html extraction against the live Stack Overflow home page.
Sub TestTextHtml()
    Dim Specs As New SpecSuite
    Dim doc As New XmlScraping

    doc.gotoPage "https://stackoverflow.com/"

    With Specs.It("Extract the text of an element with id 'nav-questions'")
        .Expect(doc.id("nav-questions").text).ToEqual "Questions"
    End With

    With Specs.It("Extract to the html of the first element of a collection with class .js-gps-track")
        .Expect(doc.css(".js-gps-track").index(0).html).ToEqual "Stack Overflow "
    End With

    InlineRunner.RunSuite Specs
End Sub

' Checks tag, class and attribute selection against the project demo page.
Sub TestCollection()
    Dim Specs As New SpecSuite
    Dim doc As New XmlScraping

    doc.gotoPage "https://vba-dev.github.io/vba-scraping/"

    With Specs.It("Select tag")
        .Expect(doc.css("span").index(0).text).ToEqual ""
        .Expect(doc.css("span").index(2).html).ToEqual "Submit"
    End With

    With Specs.It("Select class")
        .Expect(doc.at_css(".title").text).ToEqual "VBA Scraping"
        .Expect(doc.css(".title").index(0).text).ToEqual "VBA Scraping"
    End With

    ' This spec reads an attribute; it used to repeat the "Select class" label.
    With Specs.It("Select attribute")
        .Expect(doc.css(".download a").index(0).attr("href")).ToEqual "about:Scraping_web.xlsm"
    End With

    InlineRunner.RunSuite Specs
End Sub

--------------------------------------------------------------------------------