├── LICENSE ├── NumberFormats ├── Docs and samples │ ├── ExcelGetNumberFormats Sample.xlsx │ ├── README.md │ ├── Sample ExcelGetNumberFormats Function Output.JPG │ ├── Sample ExcelWorkbook load.JPG │ └── Sample Source data.JPG ├── Excel.GetNumberFormats.pq └── README.md ├── README.md ├── README_RUS.md ├── RowsOutline ├── ExcelWorksheetsRowOutlines.pq ├── ExcelWorksheetsRowOutlines_Tester.xlsx ├── ExcelWorksheetsRowOutlines_TesterDataSet.xlsx └── README.md └── UnZip.pq /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Maxim Zelensky 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /NumberFormats/Docs and samples/ExcelGetNumberFormats Sample.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hohlick/Power-Query-Excel-Formats/920f3b7c70e08ef77b4008b1f1f64440fc09b040/NumberFormats/Docs and samples/ExcelGetNumberFormats Sample.xlsx -------------------------------------------------------------------------------- /NumberFormats/Docs and samples/README.md: -------------------------------------------------------------------------------- 1 | Documentation files and samples for Excel.GetNumberFormats 2 | -------------------------------------------------------------------------------- /NumberFormats/Docs and samples/Sample ExcelGetNumberFormats Function Output.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hohlick/Power-Query-Excel-Formats/920f3b7c70e08ef77b4008b1f1f64440fc09b040/NumberFormats/Docs and samples/Sample ExcelGetNumberFormats Function Output.JPG -------------------------------------------------------------------------------- /NumberFormats/Docs and samples/Sample ExcelWorkbook load.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hohlick/Power-Query-Excel-Formats/920f3b7c70e08ef77b4008b1f1f64440fc09b040/NumberFormats/Docs and samples/Sample ExcelWorkbook load.JPG -------------------------------------------------------------------------------- /NumberFormats/Docs and samples/Sample Source data.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hohlick/Power-Query-Excel-Formats/920f3b7c70e08ef77b4008b1f1f64440fc09b040/NumberFormats/Docs and samples/Sample Source data.JPG -------------------------------------------------------------------------------- /NumberFormats/Excel.GetNumberFormats.pq: -------------------------------------------------------------------------------- 1 | /* 2 | Name = Excel.GetNumberFormats, 3 | Description = 4 | Get cell number formats from specified Excel workbook, worksheet and column. The output is similar to Excel.Workbook, but additional column will be added 5 | 6 | Arguments: 7 | 8 | FullPath (text): full *.XLSX/M file path (for example, "C:\Temp\Test.xlsx") 9 | 10 | SheetName (text): single worksheet name to get formats from (for example, "Sheet1", "Report Table" etc.) 11 | 12 | ColumnN (whole number): optional column number in R1C1 notation (column C = 3, column D = 5 etc.). Default is 1 13 | 14 | AddToTable (logical): optional selector, defines whether to add column formats directly to datasheet table (true) or as separate table (false). Default is true 15 | 16 | Category = Accessing data functions, 17 | Source = local, 18 | Author = Maxim Zelensky: http://Excel-Inside.pro 19 | Project = https://github.com/hohlick/Power-Query-Excel-Formats 20 | Examples = 21 | Description: Get fromats from 1st column (""A"") on Sheet1 from file located at C:\Temp\Test.xlsx and add them to data table , 22 | Code: ExcelGetNumberFormats(""C:\Temp\Test.xlsx"", ""Sheet1"", 1), 23 | Result: "Formats from column A from Sheet1 worksheet added to Sheet1 data table as ""Column1.NumberFormat"" column, whole result placed in ""DataWithFormats"" column" 24 | */ 25 | let 26 | functionbody = 27 | ( FullPath as text, 28 | SheetName as text, 29 | optional ColumnN as nullable number, 30 | optional AddToTable as nullable logical 31 | ) as table => 32 | 33 | let 34 | 35 | // Parameters check 36 | SheetName = {SheetName}, 37 | /* list conversion for further sheets list compatibility 38 | if SheetName is list then SheetName else 39 | if not (try Text.From(SheetName))[HasError] then {SheetName} else if SheetName is null then {null} 40 | */ 41 | ColumnN = if ColumnN = null then 1 else ColumnN, // 1 by default 42 | AddToTable = if AddToTable = null then true else AddToTable, 43 | 44 | 45 | // UnZip 46 | UnZip = 47 | (ZIPFile) => 48 | let 49 | Header = BinaryFormat.Record([ 50 | MiscHeader = BinaryFormat.Binary(14), 51 | BinarySize = BinaryFormat.ByteOrder(BinaryFormat.UnsignedInteger32, ByteOrder.LittleEndian), 52 | FileSize = BinaryFormat.ByteOrder(BinaryFormat.UnsignedInteger32, ByteOrder.LittleEndian), 53 | FileNameLen= BinaryFormat.ByteOrder(BinaryFormat.UnsignedInteger16, ByteOrder.LittleEndian), 54 | ExtrasLen = BinaryFormat.ByteOrder(BinaryFormat.UnsignedInteger16, ByteOrder.LittleEndian) 55 | ]), 56 | 57 | HeaderChoice = BinaryFormat.Choice( 58 | BinaryFormat.ByteOrder(BinaryFormat.UnsignedInteger32, ByteOrder.LittleEndian), 59 | each if _ <> 67324752 // not the IsValid number? then return a dummy formatter 60 | then BinaryFormat.Record([IsValid = false, Filename=null, Content=null]) 61 | else BinaryFormat.Choice( 62 | BinaryFormat.Binary(26), // Header payload - 14+4+4+2+2 63 | each BinaryFormat.Record([ 64 | IsValid = true, 65 | Filename = BinaryFormat.Text(Header(_)[FileNameLen]), 66 | Extras = BinaryFormat.Text(Header(_)[ExtrasLen]), 67 | Content = BinaryFormat.Transform( 68 | BinaryFormat.Binary(Header(_)[BinarySize]), 69 | (x) => try Binary.Buffer(Binary.Decompress(x, Compression.Deflate)) otherwise null 70 | ) 71 | ]), 72 | type binary // enable streaming 73 | ) 74 | ), 75 | 76 | ZipFormat = BinaryFormat.List(HeaderChoice, each _[IsValid] = true), 77 | 78 | Entries = List.Transform( 79 | List.RemoveLastN( ZipFormat(ZIPFile), 1), 80 | (e) => [FileName = e[Filename], Content = e[Content] ] 81 | ) 82 | in 83 | Table.FromRecords(Entries), 84 | 85 | // fnColumnA1Number 86 | fnColumnA1Number = 87 | (A1ref as text) as record => 88 | 89 | let 90 | // A1ref = "XFD125", 91 | Source = Text.Upper(A1ref), 92 | ToList = Text.ToList(Source), 93 | RowN = Number.From( 94 | Text.Combine( 95 | List.Transform( 96 | ToList, 97 | each if Text.Lower(_)=_ then _ else null) 98 | ) 99 | ) as number, 100 | 101 | ColumnLettersRev = 102 | List.Buffer( 103 | List.Reverse( 104 | List.RemoveNulls( 105 | List.Transform(ToList, each if Text.Lower(_)<>_ then _ else null) ) ) ), 106 | 107 | ListAZ = List.Buffer({"A".."Z"}), 108 | ColumnN = 109 | List.Accumulate( 110 | List.Positions(ColumnLettersRev), 111 | 0, 112 | (s, c) => s + (List.PositionOf(ListAZ, ColumnLettersRev{c} ) + 1) * ( {1,26,676}{c} ) 113 | ) as number 114 | in 115 | [Row = RowN, Column = ColumnN], 116 | 117 | // UnZipped 118 | 119 | UnZipped = Table.Buffer(UnZip(File.Contents(FullPath))), 120 | 121 | 122 | // fnLinkFormatsToCells 123 | fnLinkFormatsToCells = (SheetRef) as table => 124 | let 125 | // styles 126 | styles = 127 | let 128 | Source = UnZipped{[FileName="xl/styles.xml"]}[Content], 129 | ImportedXML = Table.Buffer(Xml.Tables(Source, null, 65001)) 130 | in 131 | ImportedXML, 132 | 133 | // cellXfs 134 | cellXfs = 135 | let 136 | Source = styles, 137 | Table = Source{[Name = "cellXfs"]}[Table], 138 | xf = Table{0}[xf], 139 | AddedStyleID = Table.AddIndexColumn(xf, "StyleID", 0, 1), 140 | ChangedType = Table.TransformColumnTypes(AddedStyleID,{{"Attribute:numFmtId", type number}}), 141 | Merged_numFmts = Table.Join(ChangedType, {"Attribute:numFmtId"}, numFmts, {"numFmtId"}), 142 | SelectedColumns = Table.SelectColumns(Merged_numFmts,{"StyleID", "formatCode"}) 143 | in 144 | SelectedColumns, 145 | 146 | // numFmts 147 | numFmts = 148 | let 149 | 150 | numFmtsTable = styles{[Name = "numFmts"]}[Table][numFmt]{0}, 151 | Renamed = Table.RenameColumns(numFmtsTable,{{"Attribute:numFmtId", "numFmtId"}, {"Attribute:formatCode", "formatCode"}}), 152 | ChangedType = Table.TransformColumnTypes(Renamed,{{"numFmtId", type number}, {"formatCode", type text}}), 153 | numFmts_builtin = 154 | let 155 | Source = Table.FromRows( 156 | Json.Document( 157 | Binary.Decompress( 158 | Binary.FromText( 159 | "VZE7D8IgEID/SkNjorEP2lL7cKgOxsnEuBIGE5p0EAc78e8F7hCcet93jxxXzgklGbnO7/nzfBGRcVIZpi6qbVRQgMZAmqUpEPPk04Ot3cAAin2IFeJlj7WVHZwmUzkBNoCGUdjpSuVS5lqDaY2RuTLSm4M3gJ1rCene8DIqlZxv5f0GbkA3rmuka4oaqApFIOyyqpSl1smvqun8+5Pj1n134Pvg+WOW4i85REfDNhNBktE4GfWGitY90e/F7AH4IqJVGRxhXQv8SW4bO8KcHoxd4USE+AI=", 160 | BinaryEncoding.Base64), 161 | Compression.Deflate) 162 | ), 163 | let _t = ((type text) meta [Serialized.Text = true]) in type table [numFmtId = _t, formatCode = _t]), 164 | Typed = Table.TransformColumnTypes(Source, {{"numFmtId", type number}}) 165 | in 166 | Typed, 167 | 168 | AppendedBiultIn = Table.Combine({numFmts_builtin, ChangedType}) 169 | in 170 | AppendedBiultIn, 171 | 172 | // LinkFormatsToCells itself 173 | Source = Xml.Tables(UnZipped{[FileName = SheetRef]}[Content]), 174 | 175 | // sheet used range top-left cell 176 | dim = Source{[Name="dimension"]}[Table][#"Attribute:ref"]{0}, 177 | TLC = fnColumnA1Number(Text.Split(dim, ":"){0}), 178 | 179 | GetRows = Source{[Name="sheetData"]}[Table], 180 | GR2 =Table.SelectColumns(GetRows[Table]{0}, {"c", "Attribute:r"}), 181 | Renamed = Table.RenameColumns(GR2, {{"Attribute:r", "row"}}), 182 | Expanded = Table.ExpandTableColumn(Renamed, "c", {"Attribute:r", "Attribute:s"}, {"cell", "StyleID"}), 183 | GetColumnNumber = Table.AddColumn(Expanded, "column", each fnColumnA1Number([cell])[Column], type number), 184 | FilteredByColumnNumber = Table.SelectRows(GetColumnNumber, each ([column] = ColumnN)), 185 | ChangedType = Table.TransformColumnTypes(FilteredByColumnNumber,{{"StyleID", type number}, {"row", type number}, {"cell", type text}}), 186 | AddRowIndex = Table.AddColumn(ChangedType, "rowIndex", each [row]-TLC[Row], type number), 187 | MergeByStyleRef = Table.Join(AddRowIndex, {"StyleID"}, cellXfs, {"StyleID"}), 188 | Removed = Table.SelectColumns(MergeByStyleRef,{"rowIndex", "formatCode"}), 189 | Sorted = Table.Sort(Removed,{{"rowIndex", Order.Ascending}}), 190 | Result = if Table.IsEmpty(GetRows) 191 | then 192 | #table( 193 | type table [ 194 | rowIndex = number, 195 | formatCode=text 196 | ], 197 | {}) 198 | else Sorted 199 | in 200 | Table.RenameColumns(Result,{{"formatCode", "Column" & Text.From(ColumnN-TLC[Column]+1) & ".NumberFormat"}}), 201 | 202 | // SourceSheets 203 | SourceSheets = 204 | let 205 | Source = Excel.Workbook(File.Contents(FullPath), null, true), 206 | SheetsOnly = Table.SelectRows(Source, each ([Kind] = "Sheet")), 207 | SheetsIndex = Table.AddIndexColumn(SheetsOnly, "Index", 1, 1) 208 | in 209 | Table.SelectRows(SheetsIndex, each List.Contains(SheetName, [Name])), 210 | // 211 | SheetsWithFormats = 212 | let 213 | AddSheetRef = Table.TransformColumns(SourceSheets, {{"Index", each "xl/worksheets/sheet" & Text.From(_) & ".xml"}}), 214 | DataWithFormats = Table.AddColumn( 215 | AddSheetRef, 216 | "DataWithFormats", 217 | each Table.RemoveColumns(Table.Sort( 218 | Table.Join( 219 | fnLinkFormatsToCells([Index]), 220 | {"rowIndex"}, 221 | Table.AddIndexColumn([Data], "Index", 0, 1), 222 | {"Index"}, 223 | JoinKind.RightOuter 224 | ), {{"Index", Order.Ascending}}), 225 | {"rowIndex", "Index"} 226 | ) 227 | ), 228 | ColumnFormatsOnly = Table.AddColumn( 229 | AddSheetRef, 230 | "NumberFormats", 231 | each fnLinkFormatsToCells([Index]) 232 | ) 233 | in 234 | if AddToTable then DataWithFormats else ColumnFormatsOnly, 235 | 236 | Result = if Table.IsEmpty(SourceSheets) then SourceSheets else SheetsWithFormats 237 | in 238 | Result, 239 | 240 | ExcelGetNumberFormatsType = type function ( 241 | FullPath as (type text meta [ 242 | Documentation.FieldCaption = "Full Path to the workbook", 243 | Documentation.FieldDescription = "Full path to the Excel XLSX/M workbook", 244 | Documentation.SampleValues = {"C:\Temp\Test.xlsx"} 245 | ]), 246 | SheetName as (type text meta [ 247 | Documentation.FieldCaption = "Sheet name", 248 | Documentation.FieldDescription = "Name of the single worksheet to get formats from", 249 | Documentation.SampleValues = {"Sheet1", "Sample Sheet", "My Report"} 250 | ]), 251 | optional ColumnN as (type nullable number meta [ 252 | Documentation.FieldCaption = "Column number", 253 | Documentation.FieldDescription = "Column number in R1C1 notation (A =1, D = 4 etc.)", 254 | Documentation.SampleValues = {1, 4, 5} 255 | ]), 256 | optional AddToTable as (type nullable logical meta [ 257 | Documentation.FieldCaption = "Add parsed formats as column to data table", 258 | Documentation.FieldDescription = "Add parsed formats to worksheet data table or show formats as separate table", 259 | Documentation.AllowedValues = { true, false } 260 | ])) 261 | as table meta [ 262 | Documentation.Name = "Excel.GetNumberFormats", 263 | Documentation.Description = "Get cell number formats from specified Excel workbook, worksheet and column" , 264 | Documentation.LongDescription = "
Get cell number formats from specified Excel workbook, worksheet and column. The output is similar to Excel.Workbook
, but additional column will be added
Arguments:
266 |FullPath
(text)
: full *.XLSX/M file path (for example, ""C:\Temp\Test.xlsx""
)
267 | SheetName
(text)
: single worksheet name to get formats from (for example, ""Sheet1""
, ""Report Table""
etc.)
268 | ColumnN
(whole number)
: optional column number in R1C1 notation (column C = 3, column D = 5 etc.). Default is 1
269 | AddToTable
(logical)
: optional selector, defines whether to add column formats directly to datasheet table (true
) or as separate table (false
). Default is true
Part of the GitHub project https://github.com/hohlick/Power-Query-Excel-Formats
", 271 | Documentation.Category = "Accessing data functions", 272 | Documentation.Source = "local", 273 | Documentation.Author = "Maxim Zelensky: http://Excel-Inside.pro", 274 | Documentation.Examples = { 275 | [Description = "Get fromats from 1st column (""A"") onSheet1
from file located at C:\Temp\Test.xlsx
and add them to data table" ,
276 | Code = "ExcelGetNumberFormats(""C:\Temp\Test.xlsx"", ""Sheet1"", 1)",
277 | Result = "Formats from column A from Sheet1 worksheet added to Sheet1 data table as ""Column1.NumberFormat"" column, whole result placed in ""DataWithFormats"" column"]
278 | }
279 | ]
280 |
281 | in
282 | Value.ReplaceType(functionbody, ExcelGetNumberFormatsType)
283 |
--------------------------------------------------------------------------------
/NumberFormats/README.md:
--------------------------------------------------------------------------------
1 | ## Excel.GetNumberFormats
2 |
3 | ### Code
4 |
5 | [Excel.GetNumberFormats.pq](/NumberFormats/Excel.GetNumberFormats.pq)
6 |
7 | ### Function
8 |
9 | **`ExcelGetNumberFormats`**
10 | (
11 | `FullPath` *as text*,
12 | `SheetName` *as text*,
13 | *optional* `ColumnN` *as nullable number*,
14 | *optional* `AddToTable` *as nullable logical*
15 | ) as table
16 |
17 | ### Description
18 |
19 | Get cell number formats from specified Excel workbook, worksheet and column. The output is similar to Excel.Workbook, but additional column will be added
20 |
21 | ### Arguments:
22 |
23 | - **`FullPath`** *(text)*: full *.XLSX/M file path (for example, "C:\Temp\Test.xlsx")
24 | - **`SheetName`** *(text)*: single worksheet name to get formats from (for example, "Sheet1", "Report Table" etc.)
25 | - **`ColumnN`** *(whole number)*: optional column number in R1C1 notation (column C = 3, column D = 5 etc.). Default is 1
26 | - **`AddToTable`** *(logical)*: optional selector, defines whether to add column formats directly to datasheet table (`true`) or as separate table (`false`). Default is `true`
27 |
28 | ### Examples
29 | **Description:** Get fromats from 2nd column (column "B") on "Sample Sheet" from file located at "C:\Temp\Test.xlsx" and add them to data table
30 |
31 | Download a [sample file](/NumberFormats/Docs%20and%20samples/ExcelGetNumberFormats%20Sample.xlsx)
32 |
33 | 
34 |
35 | This source data contains some currency formats (you can see different EUR, USD and RUB financial and currency formats applied) and custom text formats (defining visible indent of text in the cells). If you try to get this data by built-in Excel.Workbook function, custom number formats will be almost completely lost (except date format):
36 |
37 | 
38 |
39 | **Code:** `ExcelGetNumberFormats("C:\Temp\Test.xlsx", "Sample Sheet", 2)`
40 |
41 | **Result:** Formats from column B from `Sample Sheet` worksheet added to `Sample Sheet` data table as ["Column1.NumberFormat"](#note) column, whole result placed in "DataWithFormats" column. This column could be extracted to the table and then parsed as needed:
42 |
43 | 
44 |
45 | #### Note:
46 | As worksheet used range in the sample file starts from column `B`, after import it became `Column1` (empty column `A` ignored). So Number formats column in this case also became `Column1.NumberFormat`.
47 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | [README на русском](README_RUS.md)
2 |
3 | # Power-Query-Excel-Formats
4 | A collection of M code to get various formats from Excel sheets in Power Query
5 |
6 | ## Main purpose
7 |
8 | Information, stored in the Excel workbooks, often has additional metadata, important for analyzis. This metadata could be stored in various forms, mostly as cell formats, number formats, colours, etc. Often a row, column or cell format is a critical element of the workbook data set.
9 |
10 | At the moment (Aug 2017) the Microsoft Power Query and corresponding "Query Editor" in Microsoft Power BI do not allow users to get additional information (stored in Excel workbooks and spreadsheets as various applied formats) natively, except (sometimes) the data types of calculated values.
11 |
12 | A wide range of formats and the complexity of extracting their parameters by other tools, such as Power Query, lead to the loss of a noticeable piece of information. Additional problem is storing extracted formats data in Power Query for further use.
13 | Задачи и методы
14 |
15 | ## Tasks
16 |
17 | Develop a set of functions to extract/import specific info about sheet and/or cell formats into Power Query.
18 |
19 | In the future - develop universal functions:
20 |
21 | * spreadsheet information (info about rows, columns, sheet in whole)
22 | * cells info (colors, fonts, alignment, number formats, indents etc.)
23 |
24 | The versatility of the methods due to the same tools (unzip and XML parsing) and the similarity of data sources. Specific kind of function result can be selected via function argument.
25 |
26 | ---
27 |
28 | ### Methods
29 |
30 | #### Unzip
31 |
32 | Main method is unpacking of XLSX/XLSM as zip and working with XML documents inside. Unpack performed via custom function [UnZip.pq](UnZip.pq) by Mike White. But any other analogue to unpack zip archives in Power Query can be used.
33 |
34 | #### XML Parsing
35 |
36 | After UnZip the XML files (`binary` type) from workbook structure become available for the (current) main function. Possible parse methods - with built-in functions `Xml.Tables` or `Xml.Document`, or with other suitable XML parsing methods.
37 |
38 | * Main problem: cell formats stored separate from cells, cells itself stored inside row element, cell address stored in A1 notation (need additional convert to R1C1-style or similar).
39 | * Additional problem: linking/mapping extracted format info with cell position in Power Query table.
40 |
41 | ---
42 | ## Work plan
43 | *(released projects have hyperlinks)*
44 |
45 | 1. Sheet structure:
46 | - [rows outline levels](../../tree/master/RowsOutline),
47 | - columns outline levels,
48 | - extended rows state (visibility, spans, outlines, collapsed, etc.),
49 | - extended columns state.
50 | 2. Cell indents and alignment
51 | 3. [Cell number formats](../../tree/master/NumberFormats)
52 | 4. Cell color
53 | 5. Top-left rows and columns addition to UsedRange/dimension (see this [post about UsedRange pitfall](http://excel-inside.pro/blog/2017/05/23/excel-sheet-as-a-source-to-power-query-and-power-bi-a-pitfall-of-usedrange/))
54 | 6. Additional formats, conditional formats and further development
55 |
--------------------------------------------------------------------------------
/README_RUS.md:
--------------------------------------------------------------------------------
1 |
2 | [README in English](README.md)
3 | # Power-Query-Excel-Formats
4 | Коллекция кода на языке M для извлечения разных форматов из листов Excel в Power Query / Power BI
5 |
6 |
7 | ## Проблема
8 | Информация на листах в книгах Excel часто имеет дополнительные метаданные, важные при анализе. Эти метаданные могут храниться в различных формах, в большинстве случаев - в виде форматов ячеек, числовых форматов, цвета и так далее. Часто формат строки, столбца или ячейки является критичным элементом набора данных.
9 |
10 | В настоящий момент (август 2017) Microsoft Power Query и соответствующий редактор запросов в Microsoft Power BI не позволяет пользователям стандартными средствами извлекать из книг и листов Excel информацию о примененном форматировании листов и ячеек, за исключением некоторых типов данных (например, Дата).
11 |
12 | Большое разнообразие форматов и сложность их извлечения сторонними программами (а не нативными средствами Excel), такими как Power BI или Power Query, приводит к потере значительной части информации при импорте. Дополнительной проблемой является сопоставление извлеченных форматов с табличными данными Power Query для дальнейшего использования.
13 |
14 | ---
15 | ## Задачи и методы
16 | ### Задачи
17 |
18 | Создаем набор отдельных функций, позволяющих извлечь заданную информацию о формате листа и/или ячейки.
19 |
20 | В перспективе - создаем универсальные функции:
21 | - для информации о листе (инфо о строках, столбцах, листе в целом)
22 | - для информации о ячейках (форматирование цветом, шрифт, выравнивание, числовые форматы, отступы и т.п.)
23 |
24 | Универсальность обусловлена одинаковыми применяемыми методами (распаковка и разбор XML), различный вывод в перспективе может задаваться параметрами функций.
25 |
26 | ### Методы
27 | #### Распаковка
28 |
29 | Основной метод - распаковка файла XLSX/XLSM как архива и работа с XML документами внутри.
30 | Распаковка осуществляется функцией [UnZip.pq](UnZip.pq), автор Mike White. В принципе, можно использовать любой аналог для работы с zip-архивами в Power Query.
31 |
32 | #### Разбор XML
33 |
34 | После распаковки на вход в основную функцию поступает набор XML файлов (binary).
35 | Возможные способы разбора - при помощи встроенных функций Xml.Tables или Xml.Document, либо при помощи других способов парсинга XML.
36 |
37 | * **Основная проблема**: внутри книги данные о форматах хранятся отдельно от ячеек, сами ячейки хранятся внутри строк, адрес ячейки задается в формате A1 (необходим дополнительный парсинг в R1C1 или аналог).
38 | * **Дополнительная проблема**: связывание информации о форматах с позицией ячейки в выводимой таблице Power Query.
39 |
40 | ---
41 | ## План работ
42 | *(готовые проекты снабжены ссылками)*
43 |
44 | 1. Структура листа:
45 | - [уровни группировки (вложенности) строк](../../tree/master/RowsOutline),
46 | - уровни группировки (вложенности) столбцов,
47 | - расширенная информация о строках (видимость, высота, вложенность, состояние группировки и т.д.),
48 | - расширенная информация о столбцах (видимость, ширина, вложенность, состояние группировки и т.д.).
49 | 2. Отступы и выравнивание ячеек
50 | 3. [Числовые форматы ячеек](../../tree/master/NumberFormats)
51 | 4. Цвет фона ячейки
52 | 5. Добавление пустых строк и столбцов сверху-слева от UsedRange/dimension ([о ловушке UsedRange см. этот пост](http://excel-inside.pro/ru/blog/2017/05/23/импорт-данных-в-power-query-и-power-bi-из-excel-ловушка-usedrange/))
53 | 6. Условное форматирование, дополнительные форматы и дальнейшая разработка
54 |
--------------------------------------------------------------------------------
/RowsOutline/ExcelWorksheetsRowOutlines.pq:
--------------------------------------------------------------------------------
1 | /*
2 | Author: Maxim Zelensky, info@excel-inside.pro, http://excel-inside.pro
3 | Date Updated: 2017-08-14
4 | Purpose: Get outline levels of rows from Excel worksheets in Power Query.
5 | Note: Included copy of Mark White's UnZip function.
6 |
7 | // Parameters:
8 |
9 | // FullPath
10 | Type: text,
11 | Description: full path to workbook. Mandatory
12 | Example: "C:\PQ\Outline\test2.xlsx"
13 |
14 | // SheetNames
15 | Type: any
16 | Description: text or list of worksheet names. Optional
17 | If argument:
18 | not provided,
19 | or null,
20 | or empty list {},
21 | or argument type is different from text/list,
22 | then all worksheets from workbook will be analyzed.
23 |
24 | Example: {"Sheet1", "Sheet3"}
25 | Example: "Sheet1"
26 |
27 | // AddOutlinesToData
28 | Type: nullable logical
29 | Description: defines whether add outlineLevel column to the sheet [Data] table. Optional
30 | If null or not provided then = true
31 | Example: true, false, null
32 |
33 | */
34 |
35 | // fnGetRowsOutline
36 |
37 | (FullPath as text, optional SheetNames as any, optional AddOutlinesToData as nullable logical) as table =>
38 |
39 | let
40 |
41 |
42 | /*
43 | /////////////////////////////////////////////////////////////////////////////////////////////
44 | // Functions
45 | /////////////////////////////////////////////////////////////////////////////////////////////
46 | */
47 |
48 | // UnZip function copy
49 |
50 | fnUnZip = (ZIPFile) =>
51 | let
52 | Header = BinaryFormat.Record([
53 | MiscHeader = BinaryFormat.Binary(14),
54 | BinarySize = BinaryFormat.ByteOrder(BinaryFormat.UnsignedInteger32, ByteOrder.LittleEndian),
55 | FileSize = BinaryFormat.ByteOrder(BinaryFormat.UnsignedInteger32, ByteOrder.LittleEndian),
56 | FileNameLen= BinaryFormat.ByteOrder(BinaryFormat.UnsignedInteger16, ByteOrder.LittleEndian),
57 | ExtrasLen = BinaryFormat.ByteOrder(BinaryFormat.UnsignedInteger16, ByteOrder.LittleEndian)
58 | ]),
59 |
60 | HeaderChoice = BinaryFormat.Choice(
61 | BinaryFormat.ByteOrder(BinaryFormat.UnsignedInteger32, ByteOrder.LittleEndian),
62 | each if _ <> 67324752 // not the IsValid number? then return a dummy formatter
63 | then BinaryFormat.Record([IsValid = false, Filename=null, Content=null])
64 | else BinaryFormat.Choice(
65 | BinaryFormat.Binary(26), // Header payload - 14+4+4+2+2
66 | each BinaryFormat.Record([
67 | IsValid = true,
68 | Filename = BinaryFormat.Text(Header(_)[FileNameLen]),
69 | Extras = BinaryFormat.Text(Header(_)[ExtrasLen]),
70 | Content = BinaryFormat.Transform(
71 | BinaryFormat.Binary(Header(_)[BinarySize]),
72 | (x) => try Binary.Buffer(Binary.Decompress(x, Compression.Deflate)) otherwise null
73 | )
74 | ]),
75 | type binary // enable streaming
76 | )
77 | ),
78 |
79 | ZipFormat = BinaryFormat.List(HeaderChoice, each _[IsValid] = true),
80 |
81 | Entries = List.Transform(
82 | List.RemoveLastN( ZipFormat(ZIPFile), 1),
83 | (e) => [FileName = e[Filename], Content = e[Content] ]
84 | )
85 | in
86 | Table.FromRecords(Entries),
87 |
88 | // RowsOutline
89 | fnRowsOutline = (WSrel as text) =>
90 | let
91 | Source = UnZipped,
92 | Sheet1 = Source{[FileName="xl/" & WSrel]}[Content],
93 | XMLContent = Xml.Tables(Sheet1,null,65001),
94 | sheetDataTable = XMLContent{[Name="sheetData"]}[Table],
95 | Filtered = Table.SelectRows(sheetDataTable, each [Name] = "row"),
96 | rowTable = Filtered{0}[Table],
97 | RemovedOtherColumns = Table.SelectColumns(rowTable,{"Attribute:r", "Attribute:outlineLevel"}, MissingField.UseNull),
98 | RenamedColumns = Table.RenameColumns(RemovedOtherColumns,{{"Attribute:r", "RowN"}, {"Attribute:outlineLevel", "outlineLevel"}}),
99 | Typed = Table.TransformColumnTypes(RenamedColumns,{{"RowN", Int64.Type}, {"outlineLevel", Int64.Type}}),
100 | RowIndices = List.Zip( { {List.Min(Typed[RowN])..(List.Max(Typed[RowN]))} }),
101 | RowIndTable = Table.AddIndexColumn(#table(type table [RowNumber=Int64.Type], RowIndices), "Index"),
102 | ExpandMissedRows = Table.Join(RowIndTable, {"RowNumber"}, Typed, {"RowN"},JoinKind.LeftOuter),
103 | RemovedOtherColumns1 = Table.SelectColumns(ExpandMissedRows,{"Index", "outlineLevel"}),
104 | PreSort = Table.Sort(RemovedOtherColumns1, {"Index"}),
105 | Result = if Table.IsEmpty(sheetDataTable) then #table(type table [Index = number, outlineLevel = Int64.Type],{}) else PreSort
106 | in
107 | Result,
108 | /////////////////////////////////////////////////////////////////////////////////////////////
109 |
110 |
111 | Source = Excel.Workbook(File.Contents(FullPath), false, true),
112 |
113 | // leave sheets only
114 | FilteredSheets = Table.SelectRows(Source, each ([Kind] = "Sheet")),
115 |
116 | // sheets in PQ initially in appearance order, i.e. sheets index (despite visibility)
117 | AddSheetsIndex = Table.AddIndexColumn(FilteredSheets, "Index", 1, 1),
118 |
119 | // check SheetNames parameter
120 | SheetNames = if SheetNames is text then {SheetNames} else if SheetNames is list then SheetNames else null,
121 |
122 | // filter sheets by name if provided
123 | FilteredByNames = if SheetNames = null or List.IsEmpty(SheetNames) then AddSheetsIndex else Table.SelectRows(AddSheetsIndex, each List.Contains(SheetNames, [Name])),
124 |
125 | // UnZip file
126 | UnZipped = Table.Buffer(fnUnZip(File.Contents(FullPath))),
127 | /*
128 | let
129 | Source = Folder.Files(Folder),
130 | file = Source{[Name = FileName, Folder Path = Folder & "\"]}[Content],
131 | UnZippedFile = Table.Buffer(fnUnZip(file))
132 | in
133 | Table.Buffer(UnZippedFile),
134 | */
135 |
136 | // relations id table for sheets
137 | workbook =
138 | let
139 | Source = UnZipped,
140 | Content = Source{[FileName ="xl/workbook.xml"]}[Content],
141 | ImportedXML = Xml.Tables(Content,null,TextEncoding.Utf8),
142 | sheetsTable = ImportedXML{[Name = "sheets"]}[Table],
143 | sheetTable = sheetsTable{[Name = "sheet"]}[Table],
144 | ExpandedRel = Table.ExpandTableColumn(sheetTable, "http://schemas.openxmlformats.org/officeDocument/2006/relationships", {"Attribute:id"}, {"Attribute:id"}),
145 | typed = Table.TransformColumnTypes(ExpandedRel,{{"Attribute:name", type text}, {"Attribute:sheetId", Int64.Type}, {"Attribute:id", type text}})
146 | in
147 | typed,
148 |
149 | // sheets relations id to XML target files
150 | workbook_rels =
151 | let
152 | Source = UnZipped,
153 | Filtered = Table.SelectRows(Source, each [FileName]="xl/_rels/workbook.xml.rels"),
154 | GetXML = Table.TransformColumns(Filtered, {"Content", each Xml.Tables(_,null,65001)}),
155 | XMLContent = GetXML{0}[Content]{[Name="Relationship"]}[Table],
156 | FilteredSheetsRel = Table.SelectRows(XMLContent, each [#"Attribute:Type"] = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/worksheet"),
157 | Removed = Table.RemoveColumns(FilteredSheetsRel,{"Attribute:Type"})
158 | in
159 | Removed,
160 |
161 | // merge relations id (via sheets name)
162 | MergedRelationsID = Table.Join(FilteredByNames, {"Name"}, workbook, {"Attribute:name"}),
163 |
164 | // join workbook relations
165 | MergedRelationsTarget = Table.Join(MergedRelationsID,{"Attribute:id"},workbook_rels,{"Attribute:Id"}),
166 |
167 | // invoke RowsOutline function to get separate index+outlineLevel table
168 | AddOutlineTable = Table.AddColumn(MergedRelationsTarget, "RowsOutline", each fnRowsOutline([#"Attribute:Target"]), type table),
169 |
170 | // join [AddOutlineTable] and [Data] by rows index
171 | AddedRowsOutline = Table.AddColumn(AddOutlineTable, "DataWithOutline", each Table.RemoveColumns(Table.Join([RowsOutline], {"Index"}, Table.AddIndexColumn([Data],"Index"), {"Index"}), {"Index"}), type table),
172 |
173 | // add outlines to Data or not
174 | Result = if AddOutlinesToData or AddOutlinesToData = null then AddedRowsOutline else AddOutlineTable,
175 |
176 | // sort order
177 | Sorted = Table.Sort(Result,{{"Index", Order.Ascending}}),
178 |
179 | // Clean table
180 | RemovedColumns = Table.RemoveColumns(Sorted, {"Index", "Attribute:Id", "Attribute:Target", "Attribute:name", "Attribute:sheetId", "Attribute:id"})
181 |
182 | in
183 | RemovedColumns
184 |
--------------------------------------------------------------------------------
/RowsOutline/ExcelWorksheetsRowOutlines_Tester.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hohlick/Power-Query-Excel-Formats/920f3b7c70e08ef77b4008b1f1f64440fc09b040/RowsOutline/ExcelWorksheetsRowOutlines_Tester.xlsx
--------------------------------------------------------------------------------
/RowsOutline/ExcelWorksheetsRowOutlines_TesterDataSet.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hohlick/Power-Query-Excel-Formats/920f3b7c70e08ef77b4008b1f1f64440fc09b040/RowsOutline/ExcelWorksheetsRowOutlines_TesterDataSet.xlsx
--------------------------------------------------------------------------------
/RowsOutline/README.md:
--------------------------------------------------------------------------------
1 | # Rows Outline
2 |
3 | *Easiest part* :)
4 |
5 | ## Idea
6 |
7 | As Power Query import data from Excel spreadsheet into a table structure, then row outline levels could be stored as additional column for such tables.
8 |
9 | ***
10 |
11 | ## Realisation
12 |
13 | fnGetRowsOutline(
14 | FullPath as text,
15 | optional SheetNames as any,
16 | optional AddOutlinesToData as nullable logical
17 | ) as table
18 |
19 | Sample is [there](#sample)
20 |
21 | ### Code:
22 | [ExcelWorksheetsRowOutlines.pq](../RowsOutline/ExcelWorksheetsRowOutlines.pq)
23 |
24 | ### Description:
25 | Returns spreadsheets (not tables) data from Excel workbook (xlsx or xlsm tested), adding information about rows outline levels.
26 | As rows outline levels is the property of rows (not cells), it is possible to return outline level for each used row.
27 |
28 | Based on `Excel.Workbook` built-in function, but adds (one or two, depending on the third argument) additional columns to its result:
29 |
30 | * `RowsOutline` column with a table of two columns:
31 | * `RowIndex` as number, (zero-based) - an index to further relations to [Data] column contents
32 | * `outlineLevel` as Int64.Type
33 | * `DataWithOutline` column, where `outlineLevel` column is added as the first column to raw sheet data (`Excel.Workbook` `[Data]` column).
34 |
35 | ### Function arguments:
36 |
37 | #### `FullPath`
38 |
39 | *Type:* text,
40 |
41 | *Description:* full path to workbook. **Mandatory**
42 |
43 | *Example:* "C:\PQ\Outline\test2.xlsx"
44 |
45 | #### `SheetNames`
46 |
47 | *Type*: any
48 |
49 | *Description*: text or list of worksheet names to extract. **Optional**
50 |
51 | If argument: ExcelWorksheetsRowOutlines_TesterDataSet.xlsx
52 |
53 | - not provided,
54 | - or null,
55 | - or empty list {},
56 | - or argument type is different from text/list,
57 |
58 | then all worksheets from workbook will be analyzed.
59 |
60 | *Example*:
61 | * {"Sheet1", "Sheet3"}
62 | * "Sheet1"
63 |
64 |
65 | #### `AddOutlinesToData`
66 |
67 | *Type*: nullable logical
68 |
69 | *Description*: defines whether add outlineLevel column to the sheet [Data] table. **Optional**
70 |
71 | If null or not provided then `true`
72 |
73 | *Example*:
74 | * true,
75 | * false,
76 | * null
77 |
78 | ***
79 |
80 | ## Notes:
81 | 1. Included copy of [Mark White's UnZip function](../UnZip.pq).
82 | 2. Both functions (`Excel.Workbook` and `ExcelWorksheetsRowOutlines`) return cells range from worksheet, based on `UsedRange` VBA property (or `dimension` sheet atteribute in SpreadsheetML schema).
83 |
84 | ***
85 |
86 | ## Sample
87 | There two files:
88 | * [`ExcelWorksheetsRowOutlines_TesterDataSet.xlsx`](ExcelWorksheetsRowOutlines_TesterDataSet.xlsx), which contains simple workbook with some rows grouping
89 | * [`ExcelWorksheetsRowOutlines_Tester.xlsx`](ExcelWorksheetsRowOutlines_Tester.xlsx), which contains code and several queries to show how function works
90 |
91 | Download both files, place them somewhere, and write down a full path to the `ExcelWorksheetsRowOutlines_TesterDataSet.xlsx` in the `ExcelWorksheetsRowOutlines_Tester.xlsx ` (see parameter table inside), then refresh loaded query. You can also look on the other possible function results in Power Query Editor.
92 |
--------------------------------------------------------------------------------
/UnZip.pq:
--------------------------------------------------------------------------------
1 | /*
2 | // Author: Mark White
3 | // Reference: http://sql10.blogspot.ru/2016/06/reading-zip-files-in-powerquery-m.html
4 | // Code Source: https://pastebin.com/KJ8MyAPb
5 |
6 | // ZIPFile should be a binary!
7 | */
8 |
9 | (ZIPFile) =>
10 | let
11 | Header = BinaryFormat.Record([
12 | MiscHeader = BinaryFormat.Binary(14),
13 | BinarySize = BinaryFormat.ByteOrder(BinaryFormat.UnsignedInteger32, ByteOrder.LittleEndian),
14 | FileSize = BinaryFormat.ByteOrder(BinaryFormat.UnsignedInteger32, ByteOrder.LittleEndian),
15 | FileNameLen= BinaryFormat.ByteOrder(BinaryFormat.UnsignedInteger16, ByteOrder.LittleEndian),
16 | ExtrasLen = BinaryFormat.ByteOrder(BinaryFormat.UnsignedInteger16, ByteOrder.LittleEndian)
17 | ]),
18 |
19 | HeaderChoice = BinaryFormat.Choice(
20 | BinaryFormat.ByteOrder(BinaryFormat.UnsignedInteger32, ByteOrder.LittleEndian),
21 | each if _ <> 67324752 // not the IsValid number? then return a dummy formatter
22 | then BinaryFormat.Record([IsValid = false, Filename=null, Content=null])
23 | else BinaryFormat.Choice(
24 | BinaryFormat.Binary(26), // Header payload - 14+4+4+2+2
25 | each BinaryFormat.Record([
26 | IsValid = true,
27 | Filename = BinaryFormat.Text(Header(_)[FileNameLen]),
28 | Extras = BinaryFormat.Text(Header(_)[ExtrasLen]),
29 | Content = BinaryFormat.Transform(
30 | BinaryFormat.Binary(Header(_)[BinarySize]),
31 | (x) => try Binary.Buffer(Binary.Decompress(x, Compression.Deflate)) otherwise null
32 | )
33 | ]),
34 | type binary // enable streaming
35 | )
36 | ),
37 |
38 | ZipFormat = BinaryFormat.List(HeaderChoice, each _[IsValid] = true),
39 |
40 | Entries = List.Transform(
41 | List.RemoveLastN( ZipFormat(ZIPFile), 1),
42 | (e) => [FileName = e[Filename], Content = e[Content] ]
43 | )
44 | in
45 | Table.FromRecords(Entries)
46 |
--------------------------------------------------------------------------------