├── .gitignore ├── tests └── data │ ├── Book1.xlsx │ ├── inlinestrings.xlsx │ └── UPS.Galaxy.VS.PX.xlsx ├── Cargo.toml ├── LICENSE.txt ├── src ├── main.rs ├── utils.rs ├── lib.rs ├── ws.rs └── wb.rs ├── .github └── workflows │ ├── pre-release.yml │ └── tagged-release.yml ├── README.md └── Cargo.lock /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | *.DS_Store 3 | *~$*xlsx 4 | -------------------------------------------------------------------------------- /tests/data/Book1.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xlprotips/xl/HEAD/tests/data/Book1.xlsx -------------------------------------------------------------------------------- /tests/data/inlinestrings.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xlprotips/xl/HEAD/tests/data/inlinestrings.xlsx -------------------------------------------------------------------------------- /tests/data/UPS.Galaxy.VS.PX.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xlprotips/xl/HEAD/tests/data/UPS.Galaxy.VS.PX.xlsx -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "xl" 3 | version = "0.1.8" 4 | edition = "2018" 5 | license = "MIT" 6 | description = "A package to let you process *big* Excel files very quickly" 7 | repository = "https://github.com/xlprotips/xl/" 8 | 9 | [dependencies] 10 | zip = "0.5.13" 11 | quick-xml = "0.22.0" 12 | chrono = "0.4" 13 | 14 | [lib] 15 | name = "xl" 16 | path = "src/lib.rs" 17 | 18 | [[bin]] 19 | name = "xlcat" 20 | path = "src/main.rs" 21 | -------------------------------------------------------------------------------- /LICENSE.txt: 
-------------------------------------------------------------------------------- 1 | Copyright 2021 Kevin Ryan 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of 4 | this software and associated documentation files (the "Software"), to deal in 5 | the Software without restriction, including without limitation the rights to 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 7 | of the Software, and to permit persons to whom the Software is furnished to do 8 | so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 
20 | -------------------------------------------------------------------------------- /src/main.rs: -------------------------------------------------------------------------------- 1 | use std::env; 2 | use std::process; 3 | 4 | fn main() { 5 | let args: Vec = env::args().collect(); 6 | let config = xl::Config::new(&args).unwrap_or_else(|err| { 7 | match err { 8 | xl::ConfigError::NeedPathAndTab(_) => { 9 | eprintln!("Error: {}", err); 10 | xl::usage(); 11 | }, 12 | xl::ConfigError::NeedTab => { 13 | eprintln!("Error: {}", err); 14 | if let Ok(mut wb) = xl::Workbook::open(&args[1]) { 15 | eprintln!("The following sheets are available in '{}':", &args[1]); 16 | for sheet_name in wb.sheets().by_name() { 17 | eprintln!(" {}", sheet_name); 18 | } 19 | } else { 20 | eprintln!("(that workbook also does not seem to exist or is not a valid xlsx file)"); 21 | } 22 | eprintln!("\nSee help by using -h flag."); 23 | }, 24 | _ => { 25 | eprintln!("Error: {}", err); 26 | eprintln!("\nSee help by using -h flag."); 27 | }, 28 | } 29 | process::exit(1); 30 | }); 31 | if let Err(e) = xl::run(config) { 32 | eprintln!("Runtime error: {}", e); 33 | process::exit(1); 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /.github/workflows/pre-release.yml: -------------------------------------------------------------------------------- 1 | name: pre-release 2 | on: 3 | push: 4 | branches: [ main ] 5 | 6 | env: 7 | CARGO_TERM_COLOR: always 8 | 9 | jobs: 10 | create-release: 11 | name: create-release 12 | runs-on: ubuntu-latest 13 | outputs: 14 | upload_url: ${{ steps.release.outputs.upload_url }} 15 | version: latest 16 | steps: 17 | - name: Create GitHub release 18 | id: release 19 | uses: marvinpinto/action-automatic-releases@latest 20 | with: 21 | repo_token: ${{ secrets.GITHUB_TOKEN }} 22 | automatic_release_tag: latest 23 | draft: false 24 | prerelease: true 25 | title: "Development Release" 26 | 27 | build-pre-release: 28 | name: 
build-pre-release 29 | runs-on: ${{ matrix.os }} 30 | needs: ['create-release'] 31 | strategy: 32 | matrix: 33 | build: [linux, macos, win-msvc, win32-msvc] 34 | include: 35 | - build: linux 36 | os: ubuntu-18.04 37 | rust: nightly 38 | target: x86_64-unknown-linux-musl 39 | - build: macos 40 | os: macos-latest 41 | rust: nightly 42 | target: x86_64-apple-darwin 43 | - build: win-msvc 44 | os: windows-2019 45 | rust: nightly 46 | target: x86_64-pc-windows-msvc 47 | - build: win32-msvc 48 | os: windows-2019 49 | rust: nightly 50 | target: i686-pc-windows-msvc 51 | 52 | steps: 53 | - name: Checkout repository 54 | uses: actions/checkout@v2 55 | - name: Install Rust 56 | uses: actions-rs/toolchain@v1 57 | with: 58 | toolchain: ${{ matrix.rust }} 59 | target: ${{ matrix.target }} 60 | - name: Run Tests 61 | run: cargo test 62 | - name: Build release binary 63 | run: cargo build --release 64 | - name: Build archive 65 | shell: bash 66 | run: | 67 | staging="xlcat-${{ needs.create-release.outputs.version }}-${{ matrix.target }}" 68 | mkdir -p "$staging" 69 | cp {README.md,LICENSE.txt} "$staging/" 70 | if [ "${{ matrix.os }}" = "windows-2019" ]; then 71 | cp "target/release/xlcat.exe" "$staging/" 72 | 7z a "$staging.zip" "$staging" 73 | echo "ASSET=$staging.zip" >> $GITHUB_ENV 74 | else 75 | cp "target/release/xlcat" "$staging/" 76 | tar czf "$staging.tar.gz" "$staging" 77 | echo "ASSET=$staging.tar.gz" >> $GITHUB_ENV 78 | fi 79 | - name: Upload release archive 80 | uses: actions/upload-release-asset@v1.0.1 81 | env: 82 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 83 | with: 84 | upload_url: ${{ needs.create-release.outputs.upload_url }} 85 | asset_path: ${{ env.ASSET }} 86 | asset_name: ${{ env.ASSET }} 87 | asset_content_type: application/octet-stream 88 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # xl / xlcat 2 | 3 | xlcat is like cat 
except for Excel files. Specifically, xlsx files (it won't 4 | work on xls files unfortunately). It can handle *extremely large* Excel files 5 | and will start spitting out the contents almost immediately. It is able to do 6 | this by making some assumptions about the underlying xml and then exploiting 7 | those assumptions via a [high-performance xml pull 8 | parser](https://github.com/tafia/quick-xml). 9 | 10 | xlcat takes the ideas from [sxl](https://github.com/ktr/sxl/), a Python library 11 | that does something very similar, and puts them into a command-line app. 12 | 13 | ## Getting Started 14 | 15 | You can download xlcat from the 16 | [releases](https://github.com/xlprotips/xl/releases) page. Once you've 17 | downloaded a binary for your operating system, you can use the tool to view an 18 | Excel file with: 19 | 20 | ```bash 21 | xlcat PATH TAB 22 | ``` 23 | 24 | This will start spitting out the entire tab to your screen. If you have 25 | a really big file, you may want to limit how many rows you print to screen. 
The 26 | following will print the first 10 lines of the "Book1.xlsx" file included in 27 | this repository: 28 | 29 | ```bash 30 | $ xlcat tests/data/Book1.xlsx Sheet1 -n 10 31 | 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18 32 | 19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36 33 | 37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54 34 | 55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72 35 | 73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90 36 | 91,92,93,94,95,2018-01-31,97,98,99,2018-02-28,101,102,103,104,105,106,107,108 37 | 109,110,111,112,113,114,115,116,117,2018-03-01,119,120,121,122,123,124,125,126 38 | 127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144 39 | 145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162 40 | 163,164,165,166,167,168,169,"Test",171,172,173,174,175,176,177,178,179,180 41 | ``` 42 | 43 | You could obviously limit the number of rows with `head` or something similar, 44 | but this makes it slightly easier to do without a separate tool. 45 | 46 | ## xl library 47 | 48 | If you install the Rust crate with something like: 49 | 50 | ```toml 51 | [dependencies] 52 | xl = "0.1.0" 53 | ``` 54 | 55 | You should be able to use the library as follows: 56 | 57 | ```rust 58 | use xl::Workbook; 59 | 60 | fn main () { 61 | let mut wb = xl::Workbook::open("tests/data/Book1.xlsx").unwrap(); 62 | let sheets = wb.sheets(); 63 | let sheet = sheets.get("Sheet1"); 64 | for row in sheet.rows(&mut wb).take(5) { 65 | println!("{}", row); 66 | } 67 | } 68 | ``` 69 | 70 | This API will likely change in the future. In particular, I do not like having 71 | to pass the wb object in to the rows iterator, so I will probably try to find a 72 | way to eliminate that part of the code. 73 | 74 | You can run tests with the standard `cargo test`. 
75 | 76 | ## License 77 | 78 | The project is licensed under the MIT License - see the [License](/LICENSE.txt) 79 | file for details 80 | -------------------------------------------------------------------------------- /.github/workflows/tagged-release.yml: -------------------------------------------------------------------------------- 1 | name: tagged-release 2 | 3 | on: 4 | push: 5 | tags: 6 | - "v*" 7 | 8 | env: 9 | CARGO_TERM_COLOR: always 10 | 11 | jobs: 12 | create-release: 13 | name: create-release 14 | runs-on: ubuntu-latest 15 | outputs: 16 | upload_url: ${{ steps.release.outputs.upload_url }} 17 | version: ${{ env.XL_VERSION }} 18 | steps: 19 | - name: Get the release version from the tag 20 | shell: bash 21 | if: env.XL_VERSION == '' 22 | run: | 23 | # See: https://tinyurl.com/35zwc9hp 24 | echo "XL_VERSION=${GITHUB_REF#refs/tags/}" >> $GITHUB_ENV 25 | echo "version is: ${{ env.XL_VERSION }}" 26 | - name: Create GitHub release 27 | id: release 28 | uses: marvinpinto/action-automatic-releases@latest 29 | with: 30 | repo_token: ${{ secrets.GITHUB_TOKEN }} 31 | prerelease: false 32 | 33 | build-tagged-release: 34 | name: build-tagged-release 35 | runs-on: ${{ matrix.os }} 36 | needs: ['create-release'] 37 | strategy: 38 | matrix: 39 | build: [linux, macos, win-msvc, win32-msvc] 40 | include: 41 | - build: linux 42 | os: ubuntu-18.04 43 | rust: nightly 44 | target: x86_64-unknown-linux-musl 45 | - build: macos 46 | os: macos-latest 47 | rust: nightly 48 | target: x86_64-apple-darwin 49 | - build: win-msvc 50 | os: windows-2019 51 | rust: nightly 52 | target: x86_64-pc-windows-msvc 53 | - build: win32-msvc 54 | os: windows-2019 55 | rust: nightly 56 | target: i686-pc-windows-msvc 57 | 58 | steps: 59 | - name: Checkout repository 60 | uses: actions/checkout@v2 61 | - name: Install Rust 62 | uses: actions-rs/toolchain@v1 63 | with: 64 | toolchain: ${{ matrix.rust }} 65 | target: ${{ matrix.target }} 66 | - name: Run Tests 67 | run: cargo test 68 | - name: 
/// Largest column number accepted by these helpers (column `XFD`).
const XL_MAX_COL: u16 = 16384;
/// Smallest column number accepted by these helpers (column `A`).
const XL_MIN_COL: u16 = 1;

/// Return column letter for column number `n`
///
/// Returns `None` when `n` is outside `XL_MIN_COL..=XL_MAX_COL`.
pub fn num2col(n: u16) -> Option<String> {
    if !(XL_MIN_COL..=XL_MAX_COL).contains(&n) {
        return None;
    }
    // Letters come out least-significant first, so collect and reverse.
    let mut letters = Vec::with_capacity(3);
    let mut n = n;
    while n > 0 {
        // Column letters are a bijective base-26 system (there is no zero
        // digit), hence the `- 1` before taking the remainder / quotient.
        let rem: u8 = ((n - 1) % 26)
            .try_into()
            .expect("a remainder of % 26 always fits in a u8");
        n = (n - 1) / 26;
        letters.push(char::from(b'A' + rem));
    }
    Some(letters.into_iter().rev().collect())
}

/// Return column number for column letter `letter`
///
/// Matching is case-insensitive but ASCII-only. Returns `None` when `letter`
/// is empty, contains anything other than `A`-`Z` / `a`-`z`, or names a
/// column beyond `XL_MAX_COL`. Arbitrarily long input is rejected without
/// overflowing.
pub fn col2num(letter: &str) -> Option<u16> {
    let mut num: u16 = 0;
    for c in letter.chars() {
        // ASCII-only on purpose: Unicode upper-casing would turn e.g. "ß"
        // into "SS" and silently accept it as a column name.
        if !c.is_ascii_alphabetic() {
            return None;
        }
        // `c` is ASCII here, so narrowing to `u8` cannot lose information.
        let digit = u16::from(c.to_ascii_uppercase() as u8 - b'A') + 1;
        // Checked arithmetic: four or more letters can exceed `u16::MAX`.
        num = num.checked_mul(26)?.checked_add(digit)?;
    }
    // Also rejects the empty string (num == 0).
    if !(XL_MIN_COL..=XL_MAX_COL).contains(&num) {
        return None;
    }
    Some(num)
}
81 | NaiveDate::from_ymd(1904, 1, 1).and_hms(0, 0, 0) 82 | } 83 | }; 84 | let days = number.trunc() as i64; 85 | if days < -693594 { 86 | return DateConversion::Number(days) 87 | } 88 | let partial_days = number - (days as f64); 89 | let seconds = (partial_days * 86400000.0).round() as i64; 90 | let milliseconds = Duration::milliseconds(seconds % 1000); 91 | let seconds = Duration::seconds(seconds / 1000); 92 | let date = base + Duration::days(days) + seconds + milliseconds; 93 | if days == 0 { 94 | DateConversion::Time(date.time()) 95 | } else if date.time() == NaiveTime::from_hms(0, 0, 0) { 96 | DateConversion::Date(date.date()) 97 | } else { 98 | DateConversion::DateTime(date) 99 | } 100 | } 101 | 102 | #[cfg(test)] 103 | mod tests { 104 | use super::*; 105 | 106 | #[test] 107 | fn num_to_letter_w() { 108 | assert_eq!(num2col(23), Some(String::from("W"))); 109 | } 110 | 111 | #[test] 112 | fn num_to_letter_aa() { 113 | assert_eq!(num2col(27), Some(String::from("AA"))); 114 | } 115 | 116 | #[test] 117 | fn num_to_letter_ab() { 118 | assert_eq!(num2col(28), Some(String::from("AB"))); 119 | } 120 | 121 | #[test] 122 | fn num_to_letter_xfd() { 123 | assert_eq!(num2col(16384), Some(String::from("XFD"))); 124 | } 125 | 126 | #[test] 127 | fn num_to_letter_xfe() { 128 | assert_eq!(num2col(16385), None); 129 | } 130 | 131 | #[test] 132 | fn num_to_letter_0() { 133 | assert_eq!(num2col(0), None); 134 | } 135 | 136 | #[test] 137 | fn letter_to_num_w() { 138 | assert_eq!(col2num("W"), Some(23)); 139 | } 140 | 141 | #[test] 142 | fn letter_to_num_aa() { 143 | assert_eq!(col2num("AA"), Some(27)); 144 | } 145 | 146 | #[test] 147 | fn letter_to_num_ab() { 148 | assert_eq!(col2num("AB"), Some(28)); 149 | } 150 | 151 | #[test] 152 | fn letter_to_num_xfd() { 153 | assert_eq!(col2num("XFD"), Some(16384)); 154 | } 155 | 156 | #[test] 157 | fn letter_to_num_xfe() { 158 | assert_eq!(col2num("XFE"), None); 159 | } 160 | 161 | #[test] 162 | fn letter_to_num_ab_lower() { 163 | 
assert_eq!(col2num("ab"), Some(28)); 164 | } 165 | 166 | #[test] 167 | fn letter_to_num_number() { 168 | assert_eq!(col2num("12"), None); 169 | } 170 | 171 | #[test] 172 | fn letter_to_num_semicolon() { 173 | assert_eq!(col2num(";"), None); 174 | } 175 | } 176 | -------------------------------------------------------------------------------- /Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 3 | version = 3 4 | 5 | [[package]] 6 | name = "adler" 7 | version = "1.0.2" 8 | source = "registry+https://github.com/rust-lang/crates.io-index" 9 | checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" 10 | 11 | [[package]] 12 | name = "autocfg" 13 | version = "1.0.1" 14 | source = "registry+https://github.com/rust-lang/crates.io-index" 15 | checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a" 16 | 17 | [[package]] 18 | name = "byteorder" 19 | version = "1.4.3" 20 | source = "registry+https://github.com/rust-lang/crates.io-index" 21 | checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" 22 | 23 | [[package]] 24 | name = "bzip2" 25 | version = "0.4.3" 26 | source = "registry+https://github.com/rust-lang/crates.io-index" 27 | checksum = "6afcd980b5f3a45017c57e57a2fcccbb351cc43a356ce117ef760ef8052b89b0" 28 | dependencies = [ 29 | "bzip2-sys", 30 | "libc", 31 | ] 32 | 33 | [[package]] 34 | name = "bzip2-sys" 35 | version = "0.1.11+1.0.8" 36 | source = "registry+https://github.com/rust-lang/crates.io-index" 37 | checksum = "736a955f3fa7875102d57c82b8cac37ec45224a07fd32d58f9f7a186b6cd4cdc" 38 | dependencies = [ 39 | "cc", 40 | "libc", 41 | "pkg-config", 42 | ] 43 | 44 | [[package]] 45 | name = "cc" 46 | version = "1.0.68" 47 | source = "registry+https://github.com/rust-lang/crates.io-index" 48 | checksum = 
"4a72c244c1ff497a746a7e1fb3d14bd08420ecda70c8f25c7112f2781652d787" 49 | 50 | [[package]] 51 | name = "cfg-if" 52 | version = "1.0.0" 53 | source = "registry+https://github.com/rust-lang/crates.io-index" 54 | checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" 55 | 56 | [[package]] 57 | name = "chrono" 58 | version = "0.4.19" 59 | source = "registry+https://github.com/rust-lang/crates.io-index" 60 | checksum = "670ad68c9088c2a963aaa298cb369688cf3f9465ce5e2d4ca10e6e0098a1ce73" 61 | dependencies = [ 62 | "libc", 63 | "num-integer", 64 | "num-traits", 65 | "time", 66 | "winapi", 67 | ] 68 | 69 | [[package]] 70 | name = "crc32fast" 71 | version = "1.2.1" 72 | source = "registry+https://github.com/rust-lang/crates.io-index" 73 | checksum = "81156fece84ab6a9f2afdb109ce3ae577e42b1228441eded99bd77f627953b1a" 74 | dependencies = [ 75 | "cfg-if", 76 | ] 77 | 78 | [[package]] 79 | name = "flate2" 80 | version = "1.0.20" 81 | source = "registry+https://github.com/rust-lang/crates.io-index" 82 | checksum = "cd3aec53de10fe96d7d8c565eb17f2c687bb5518a2ec453b5b1252964526abe0" 83 | dependencies = [ 84 | "cfg-if", 85 | "crc32fast", 86 | "libc", 87 | "miniz_oxide", 88 | ] 89 | 90 | [[package]] 91 | name = "libc" 92 | version = "0.2.98" 93 | source = "registry+https://github.com/rust-lang/crates.io-index" 94 | checksum = "320cfe77175da3a483efed4bc0adc1968ca050b098ce4f2f1c13a56626128790" 95 | 96 | [[package]] 97 | name = "memchr" 98 | version = "2.4.0" 99 | source = "registry+https://github.com/rust-lang/crates.io-index" 100 | checksum = "b16bd47d9e329435e309c58469fe0791c2d0d1ba96ec0954152a5ae2b04387dc" 101 | 102 | [[package]] 103 | name = "miniz_oxide" 104 | version = "0.4.4" 105 | source = "registry+https://github.com/rust-lang/crates.io-index" 106 | checksum = "a92518e98c078586bc6c934028adcca4c92a53d6a958196de835170a01d84e4b" 107 | dependencies = [ 108 | "adler", 109 | "autocfg", 110 | ] 111 | 112 | [[package]] 113 | name = "num-integer" 114 | version = 
"0.1.44" 115 | source = "registry+https://github.com/rust-lang/crates.io-index" 116 | checksum = "d2cc698a63b549a70bc047073d2949cce27cd1c7b0a4a862d08a8031bc2801db" 117 | dependencies = [ 118 | "autocfg", 119 | "num-traits", 120 | ] 121 | 122 | [[package]] 123 | name = "num-traits" 124 | version = "0.2.14" 125 | source = "registry+https://github.com/rust-lang/crates.io-index" 126 | checksum = "9a64b1ec5cda2586e284722486d802acf1f7dbdc623e2bfc57e65ca1cd099290" 127 | dependencies = [ 128 | "autocfg", 129 | ] 130 | 131 | [[package]] 132 | name = "pkg-config" 133 | version = "0.3.19" 134 | source = "registry+https://github.com/rust-lang/crates.io-index" 135 | checksum = "3831453b3449ceb48b6d9c7ad7c96d5ea673e9b470a1dc578c2ce6521230884c" 136 | 137 | [[package]] 138 | name = "proc-macro2" 139 | version = "1.0.27" 140 | source = "registry+https://github.com/rust-lang/crates.io-index" 141 | checksum = "f0d8caf72986c1a598726adc988bb5984792ef84f5ee5aa50209145ee8077038" 142 | dependencies = [ 143 | "unicode-xid", 144 | ] 145 | 146 | [[package]] 147 | name = "quick-xml" 148 | version = "0.22.0" 149 | source = "registry+https://github.com/rust-lang/crates.io-index" 150 | checksum = "8533f14c8382aaad0d592c812ac3b826162128b65662331e1127b45c3d18536b" 151 | dependencies = [ 152 | "memchr", 153 | ] 154 | 155 | [[package]] 156 | name = "quote" 157 | version = "1.0.9" 158 | source = "registry+https://github.com/rust-lang/crates.io-index" 159 | checksum = "c3d0b9745dc2debf507c8422de05d7226cc1f0644216dfdfead988f9b1ab32a7" 160 | dependencies = [ 161 | "proc-macro2", 162 | ] 163 | 164 | [[package]] 165 | name = "syn" 166 | version = "1.0.73" 167 | source = "registry+https://github.com/rust-lang/crates.io-index" 168 | checksum = "f71489ff30030d2ae598524f61326b902466f72a0fb1a8564c001cc63425bcc7" 169 | dependencies = [ 170 | "proc-macro2", 171 | "quote", 172 | "unicode-xid", 173 | ] 174 | 175 | [[package]] 176 | name = "thiserror" 177 | version = "1.0.26" 178 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 179 | checksum = "93119e4feac1cbe6c798c34d3a53ea0026b0b1de6a120deef895137c0529bfe2" 180 | dependencies = [ 181 | "thiserror-impl", 182 | ] 183 | 184 | [[package]] 185 | name = "thiserror-impl" 186 | version = "1.0.26" 187 | source = "registry+https://github.com/rust-lang/crates.io-index" 188 | checksum = "060d69a0afe7796bf42e9e2ff91f5ee691fb15c53d38b4b62a9a53eb23164745" 189 | dependencies = [ 190 | "proc-macro2", 191 | "quote", 192 | "syn", 193 | ] 194 | 195 | [[package]] 196 | name = "time" 197 | version = "0.1.44" 198 | source = "registry+https://github.com/rust-lang/crates.io-index" 199 | checksum = "6db9e6914ab8b1ae1c260a4ae7a49b6c5611b40328a735b21862567685e73255" 200 | dependencies = [ 201 | "libc", 202 | "wasi", 203 | "winapi", 204 | ] 205 | 206 | [[package]] 207 | name = "unicode-xid" 208 | version = "0.2.2" 209 | source = "registry+https://github.com/rust-lang/crates.io-index" 210 | checksum = "8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3" 211 | 212 | [[package]] 213 | name = "wasi" 214 | version = "0.10.0+wasi-snapshot-preview1" 215 | source = "registry+https://github.com/rust-lang/crates.io-index" 216 | checksum = "1a143597ca7c7793eff794def352d41792a93c481eb1042423ff7ff72ba2c31f" 217 | 218 | [[package]] 219 | name = "winapi" 220 | version = "0.3.9" 221 | source = "registry+https://github.com/rust-lang/crates.io-index" 222 | checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" 223 | dependencies = [ 224 | "winapi-i686-pc-windows-gnu", 225 | "winapi-x86_64-pc-windows-gnu", 226 | ] 227 | 228 | [[package]] 229 | name = "winapi-i686-pc-windows-gnu" 230 | version = "0.4.0" 231 | source = "registry+https://github.com/rust-lang/crates.io-index" 232 | checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" 233 | 234 | [[package]] 235 | name = "winapi-x86_64-pc-windows-gnu" 236 | version = "0.4.0" 237 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 238 | checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" 239 | 240 | [[package]] 241 | name = "xl" 242 | version = "0.1.8" 243 | dependencies = [ 244 | "chrono", 245 | "quick-xml", 246 | "zip", 247 | ] 248 | 249 | [[package]] 250 | name = "zip" 251 | version = "0.5.13" 252 | source = "registry+https://github.com/rust-lang/crates.io-index" 253 | checksum = "93ab48844d61251bb3835145c521d88aa4031d7139e8485990f60ca911fa0815" 254 | dependencies = [ 255 | "byteorder", 256 | "bzip2", 257 | "crc32fast", 258 | "flate2", 259 | "thiserror", 260 | "time", 261 | ] 262 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | //! This library is intended to help you deal with big Excel files. The library was originally 2 | //! created as a Python library () after learning that neither pandas, 3 | //! openpyxl, xlwings, nor win32com had the ability to open large Excel files without loading them 4 | //! completely into memory. This doesn't work when you have *huge* Excel files (especially if you 5 | //! only want to examine a bit of the file - the first 10 rows say). `sxl` (and this library) solve 6 | //! the problem by parsing the SpreadsheetML / XML xlsx files using a streaming parser. So you can 7 | //! see the first ten rows of any tab within any Excel file extremely quickly. 8 | //! 9 | //! This particular module provides the plumbing to connect the command-line interface to the xl 10 | //! library code. It parses arguments passed on the command line, determines if we can act on 11 | //! those arguments, and then provides a `Config` object back that can be passed into the `run` 12 | //! function if we can. 13 | //! 14 | //! In order to call `xlcat`, you need to provide a path to a valid workbook and a tab that can be 15 | //! 
found in that workbook (either by name or by number). You can (optionally) also pass the number 16 | //! of rows you want to see with the `-n` flag (e.g., `-n 10` limits the output to the first ten 17 | //! rows). 18 | //! 19 | //! # Example Usage 20 | //! 21 | //! Here is a sample of how you might use this library: 22 | //! 23 | //! use xl::Workbook; 24 | //! 25 | //! fn main () { 26 | //! let mut wb = xl::Workbook::open("tests/data/Book1.xlsx").unwrap(); 27 | //! let sheets = wb.sheets(); 28 | //! let sheet = sheets.get("Sheet1"); 29 | //! } 30 | 31 | mod wb; 32 | mod ws; 33 | mod utils; 34 | 35 | use std::fmt; 36 | pub use wb::Workbook; 37 | pub use ws::{Worksheet, ExcelValue}; 38 | pub use utils::{col2num, excel_number_to_date, num2col}; 39 | 40 | #[derive(Debug, Clone, PartialEq)] 41 | pub enum OutputFormat { 42 | Csv, 43 | Markdown, 44 | } 45 | 46 | enum SheetNameOrNum { 47 | Name(String), 48 | Num(usize), 49 | } 50 | 51 | pub struct Config { 52 | /// Which xlsx file should we print? 53 | workbook_path: String, 54 | /// Which tab should we print? 55 | tab: SheetNameOrNum, 56 | /// How many rows should we print? 57 | nrows: Option, 58 | /// Should we show usage information? 59 | want_help: bool, 60 | /// Should we show the current version? 61 | want_version: bool, 62 | /// What output format should we use? 63 | pub output_format: OutputFormat, 64 | } 65 | 66 | pub enum ConfigError<'a> { 67 | NeedPathAndTab(&'a str), 68 | NeedTab, 69 | RowsMustBeInt, 70 | NeedNumRows, 71 | UnknownFlag(&'a str), 72 | InvalidFormat(&'a str), 73 | NeedFormat, 74 | } 75 | 76 | impl<'a> fmt::Display for ConfigError<'a> { 77 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 78 | match self { 79 | ConfigError::NeedPathAndTab(exe) => write!(f, "need to provide path and tab when running '{}'. 
See usage below.", exe), 80 | ConfigError::NeedTab => write!(f, "must also provide which tab you want to view in workbook"), 81 | ConfigError::RowsMustBeInt => write!(f, "number of rows must be an integer value"), 82 | ConfigError::NeedNumRows => write!(f, "must provide number of rows when using -n"), 83 | ConfigError::UnknownFlag(flag) => write!(f, "unknown flag: {}", flag), 84 | ConfigError::InvalidFormat(fmt) => write!(f, "invalid format '{}'. Valid formats are 'csv' and 'markdown'", fmt), 85 | ConfigError::NeedFormat => write!(f, "must provide format when using --fmt"), 86 | } 87 | } 88 | } 89 | 90 | impl Config { 91 | pub fn new(args: &[String]) -> Result { 92 | if args.len() < 2 { 93 | return Err(ConfigError::NeedPathAndTab(&args[0])) 94 | } else if args.len() < 3 { 95 | return match args[1].as_ref() { 96 | "-h" | "--help" => Ok(Config { 97 | workbook_path: "".to_owned(), 98 | tab: SheetNameOrNum::Num(0), 99 | nrows: None, 100 | want_version: false, 101 | want_help: true, 102 | output_format: OutputFormat::Csv, 103 | }), 104 | "-v" | "--version" => Ok(Config { 105 | workbook_path: "".to_owned(), 106 | tab: SheetNameOrNum::Num(0), 107 | nrows: None, 108 | want_version: true, 109 | want_help: false, 110 | output_format: OutputFormat::Csv, 111 | }), 112 | _ => Err(ConfigError::NeedTab) 113 | } 114 | } 115 | let workbook_path = args[1].clone(); 116 | let tab = match args[2].parse::() { 117 | Ok(num) => SheetNameOrNum::Num(num), 118 | Err(_) => SheetNameOrNum::Name(args[2].clone()) 119 | }; 120 | let mut config = Config { workbook_path, tab, nrows: None, want_help: false, want_version: false, output_format: OutputFormat::Csv, }; 121 | let mut iter = args[3..].iter(); 122 | while let Some(flag) = iter.next() { 123 | let flag = &flag[..]; 124 | match flag { 125 | "-n" => { 126 | if let Some(nrows) = iter.next() { 127 | if let Ok(nrows) = nrows.parse::() { 128 | config.nrows = Some(nrows) 129 | } else { 130 | return Err(ConfigError::RowsMustBeInt) 131 | } 132 | } 
else { 133 | return Err(ConfigError::NeedNumRows) 134 | } 135 | }, 136 | "--fmt" => { 137 | if let Some(format) = iter.next() { 138 | match format.as_ref() { 139 | "csv" => config.output_format = OutputFormat::Csv, 140 | "markdown" => config.output_format = OutputFormat::Markdown, 141 | _ => return Err(ConfigError::InvalidFormat(format)), 142 | } 143 | } else { 144 | return Err(ConfigError::NeedFormat) 145 | } 146 | }, 147 | _ => return Err(ConfigError::UnknownFlag(flag)), 148 | } 149 | } 150 | Ok(config) 151 | } 152 | } 153 | 154 | pub fn run(config: Config) -> Result<(), String> { 155 | if config.want_help { 156 | usage(); 157 | std::process::exit(0); 158 | } 159 | if config.want_version { 160 | version(); 161 | std::process::exit(0); 162 | } 163 | match crate::Workbook::new(&config.workbook_path) { 164 | Ok(mut wb) => { 165 | let sheets = wb.sheets(); 166 | let sheet = match config.tab { 167 | SheetNameOrNum::Name(n) => sheets.get(&*n), 168 | SheetNameOrNum::Num(n) => sheets.get(n), 169 | }; 170 | if let Some(ws) = sheet { 171 | let nrows = if let Some(nrows) = config.nrows { 172 | nrows as usize 173 | } else { 174 | 1048576 // max number of rows in an Excel worksheet 175 | }; 176 | match config.output_format { 177 | OutputFormat::Csv => { 178 | for row in ws.rows(&mut wb).take(nrows) { 179 | println!("{}", row); 180 | } 181 | }, 182 | OutputFormat::Markdown => { 183 | // Collect all CSV rows first, then convert to markdown 184 | let mut csv_rows: Vec = Vec::new(); 185 | for row in ws.rows(&mut wb).take(nrows) { 186 | let csv_line = format!("{}", row); 187 | if !csv_line.trim().is_empty() { 188 | csv_rows.push(csv_line); 189 | } 190 | } 191 | 192 | // Convert CSV to markdown 193 | print_csv_as_markdown(&csv_rows); 194 | }, 195 | } 196 | } else { 197 | return Err("that sheet does not exist".to_owned()) 198 | } 199 | Ok(()) 200 | }, 201 | Err(e) => Err(e) 202 | } 203 | } 204 | 205 | pub fn usage() { 206 | println!(concat!( 207 | "\n", 208 | "xlcat 0.1.8\n", 209 | 
"Kevin Ryan \n", 210 | "\n", 211 | "xlcat is like cat, but for Excel files (xlsx files to be precise). You simply\n", 212 | "give it the path of the xlsx and the tab you want to view, and it prints the\n", 213 | "data in that tab to your screen in a comma-delimited format.\n", 214 | "\n", 215 | "You can read about the project at https://xlpro.tips/posts/xlcat. The project\n", 216 | "page is hosted at https://github.com/xlprotips/xl.\n", 217 | "\n", 218 | "USAGE:\n", 219 | " xlcat PATH TAB [-n NUM] [--fmt FORMAT] [-h | --help]\n", 220 | "\n", 221 | "ARGS:\n", 222 | " PATH Where the xlsx file is located on your filesystem.\n", 223 | " TAB Which tab in the xlsx you want to print to screen.\n", 224 | "\n", 225 | "OPTIONS:\n", 226 | " -n Limit the number of rows we print to .\n", 227 | " --fmt FORMAT Output format: 'csv' (default) or 'markdown'.\n", 228 | )); 229 | } 230 | 231 | pub fn version() { 232 | println!("xlcat 0.1.8"); 233 | } 234 | 235 | /// Convert CSV rows to markdown table format 236 | fn print_csv_as_markdown(csv_rows: &[String]) { 237 | if csv_rows.is_empty() { 238 | return; 239 | } 240 | 241 | let mut rows_data: Vec> = Vec::new(); 242 | 243 | // Parse CSV rows 244 | for csv_row in csv_rows { 245 | let fields = parse_csv_row(csv_row); 246 | if !fields.is_empty() && fields.iter().any(|f| !f.trim().is_empty()) { 247 | rows_data.push(fields); 248 | } 249 | } 250 | 251 | if rows_data.is_empty() { 252 | return; 253 | } 254 | 255 | // Find max columns 256 | let max_cols = rows_data.iter().map(|row| row.len()).max().unwrap_or(0); 257 | 258 | // Print header (first row) 259 | if let Some(header) = rows_data.first() { 260 | print!("|"); 261 | for i in 0..max_cols { 262 | let empty_string = String::new(); 263 | let cell = header.get(i).unwrap_or(&empty_string); 264 | let cleaned = clean_cell_for_markdown(cell); 265 | print!(" {} |", cleaned); 266 | } 267 | println!(); 268 | 269 | // Print separator row 270 | print!("|"); 271 | for _ in 0..max_cols { 272 | print!(" 
/// Split one CSV line into its fields.
///
/// Commas inside double-quoted sections do not split the field, and a doubled quote (`""`)
/// inside a quoted section yields a literal `"`. The surrounding quotes themselves are dropped
/// and every field is trimmed of leading/trailing whitespace. The result always contains at
/// least one field (an empty line gives a single empty field).
fn parse_csv_row(csv_row: &str) -> Vec<String> {
    let mut fields = Vec::new();
    let mut current_field = String::new();
    let mut in_quotes = false;
    let mut chars = csv_row.chars().peekable();

    // `while let` (not `for`) because the escape branch has to consume an extra character.
    while let Some(c) = chars.next() {
        match c {
            // "" inside a quoted section is an escaped quote
            '"' if in_quotes && chars.peek() == Some(&'"') => {
                current_field.push('"');
                chars.next();
            }
            '"' => in_quotes = !in_quotes,
            ',' if !in_quotes => {
                fields.push(current_field.trim().to_string());
                current_field.clear();
            }
            _ => current_field.push(c),
        }
    }

    // whatever follows the last comma is the final field
    fields.push(current_field.trim().to_string());
    fields
}

/// Make a cell's text safe to place inside a markdown table cell.
///
/// `|` is escaped as `\|` so it cannot terminate the cell, and every line break (`\r\n`, `\n`
/// or a lone `\r`) becomes a single space so the row stays on one line.
fn clean_cell_for_markdown(cell: &str) -> String {
    let mut cleaned = String::with_capacity(cell.len());
    let mut chars = cell.chars().peekable();
    while let Some(c) = chars.next() {
        match c {
            '|' => cleaned.push_str("\\|"),
            '\r' => {
                // treat CRLF as one line break, not two
                if chars.peek() == Some(&'\n') {
                    chars.next();
                }
                cleaned.push(' ');
            }
            '\n' => cleaned.push(' '),
            _ => cleaned.push(c),
        }
    }
    cleaned
}
This mostly means 2 | 3 | use crate::utils; 4 | 5 | use std::borrow::Cow; 6 | use std::cmp; 7 | use std::fmt; 8 | use std::io::BufReader; 9 | use std::mem; 10 | use std::ops::Index; 11 | use chrono::{NaiveDate, NaiveDateTime, NaiveTime}; 12 | use zip::read::ZipFile; 13 | use quick_xml::Reader; 14 | use quick_xml::events::Event; 15 | // use quick_xml::events::attributes::Attribute; 16 | use crate::wb::{DateSystem, Workbook}; 17 | 18 | /// The `SheetReader` is used in a `RowIter` to navigate a worksheet. It contains a pointer to the 19 | /// worksheet `ZipFile` in the xlsx file, the list of strings used in the workbook, the styles used 20 | /// in the workbook, and the date system of the workbook. None of these fields are "public," but 21 | /// must be provided through the `SheetReader::new` method. See that method for documentation of 22 | /// each item. 23 | pub struct SheetReader<'a> { 24 | reader: Reader>>, 25 | strings: &'a [String], 26 | styles: &'a [String], 27 | date_system: &'a DateSystem, 28 | } 29 | 30 | impl<'a> SheetReader<'a> { 31 | /// Create a new `SheetReader`. The parameters are: 32 | /// 33 | /// - The `reader` should be a reader object pointing to the sheets xml within the zip file. 34 | /// - The `strings` argument should be reference to the vector of strings used in the xlsx. As 35 | /// background, xlsx files do not store strings directly in each spreadsheet's xml file. 36 | /// Instead, there is a special file that contains all the strings in the workbook that 37 | /// basically boils down to a big list of strings. Whenever a string is needed in a 38 | /// particular worksheet, the xml has the index of the string in that file. So we need this 39 | /// information to print out any string values in a worksheet. 40 | /// - The `styles` are used to determine the data type (primarily for dates). While each cell 41 | /// has a 'cell type,' dates are a little trickier to get right. So we use the style 42 | /// information when we can. 
43 | /// - Lastly, the `date_system` is used to determine what date we are looking at for cells that 44 | /// contain date values. See the documentation for the `DateSystem` enum for more 45 | /// information. 46 | pub fn new( 47 | reader: Reader>>, 48 | strings: &'a [String], 49 | styles: &'a [String], 50 | date_system: &'a DateSystem) -> SheetReader<'a> { 51 | SheetReader { reader, strings, styles, date_system } 52 | } 53 | } 54 | 55 | /// find the number of rows and columns used in a particular worksheet. takes the workbook xlsx 56 | /// location as its first parameter, and the location of the worksheet in question (within the zip) 57 | /// as the second parameter. Returns a tuple of (rows, columns) in the worksheet. 58 | fn used_area(used_area_range: &str) -> (u32, u16) { 59 | let mut end: isize = -1; 60 | for (i, c) in used_area_range.chars().enumerate() { 61 | if c == ':' { end = i as isize; break } 62 | } 63 | if end == -1 { 64 | (0, 0) 65 | } else { 66 | let end_range = &used_area_range[end as usize..]; 67 | let mut end = 0; 68 | // note, the extra '1' (in various spots below) is to deal with the ':' part of the 69 | // range 70 | for (i, c) in end_range[1..].chars().enumerate() { 71 | if !c.is_ascii_alphabetic() { 72 | end = i + 1; 73 | break 74 | } 75 | } 76 | let col = utils::col2num(&end_range[1..end]).unwrap(); 77 | let row: u32 = end_range[end..].parse().unwrap(); 78 | (row, col) 79 | } 80 | } 81 | 82 | /// The Worksheet is the primary object in this module since this is where most of the valuable 83 | /// data is. See the methods below for how to use. 84 | #[derive(Debug)] 85 | pub struct Worksheet { 86 | pub name: String, 87 | pub position: u8, 88 | /// location where we can find this worksheet in its xlsx file 89 | target: String, 90 | } 91 | 92 | impl Worksheet { 93 | /// Create a new worksheet. Note that this method will probably not be called directly. 94 | /// Instead, you'll normally get a worksheet from a `Workbook` object. 
E.g.,: 95 | /// 96 | /// use xl::{Workbook, Worksheet}; 97 | /// 98 | /// let mut wb = Workbook::open("tests/data/Book1.xlsx").unwrap(); 99 | /// let sheets = wb.sheets(); 100 | /// let ws = sheets.get("Time"); 101 | /// assert!(ws.is_some()); 102 | pub fn new(name: String, position: u8, target: String) -> Self { 103 | Worksheet { name, position, target, } 104 | } 105 | 106 | /// Obtain a `RowIter` for this worksheet (that is in `workbook`). This is, arguably, the main 107 | /// part of the library. You use this method to iterate through all the values in this sheet. 108 | /// The simplest thing you can do is print the values out (which is what `xlcat` does), but you 109 | /// could do more if you wanted. 110 | /// 111 | /// # Example usage 112 | /// 113 | /// use xl::{Workbook, Worksheet, ExcelValue}; 114 | /// 115 | /// let mut wb = Workbook::open("tests/data/Book1.xlsx").unwrap(); 116 | /// let sheets = wb.sheets(); 117 | /// let ws = sheets.get("Sheet1").unwrap(); 118 | /// let mut rows = ws.rows(&mut wb); 119 | /// let row1 = rows.next().unwrap(); 120 | /// assert_eq!(row1[0].raw_value, "1"); 121 | /// assert_eq!(row1[1].value, ExcelValue::Number(2f64)); 122 | pub fn rows<'a>(&self, workbook: &'a mut Workbook) -> RowIter<'a> { 123 | let reader = workbook.sheet_reader(&self.target); 124 | RowIter { 125 | worksheet_reader: reader, 126 | want_row: 1, 127 | next_row: None, 128 | num_cols: 0, 129 | num_rows: 0, 130 | done_file: false, 131 | } 132 | } 133 | 134 | } 135 | 136 | /// `ExcelValue` is the enum that holds the equivalent "rust value" of a `Cell`s "raw_value." 
137 | #[derive(Debug, PartialEq)] 138 | pub enum ExcelValue<'a> { 139 | Bool(bool), 140 | Date(NaiveDate), 141 | DateTime(NaiveDateTime), 142 | Error(String), 143 | None, 144 | Number(f64), 145 | String(Cow<'a, str>), 146 | Time(NaiveTime), 147 | } 148 | 149 | impl fmt::Display for ExcelValue<'_> { 150 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 151 | match self { 152 | ExcelValue::Bool(b) => write!(f, "{}", b), 153 | ExcelValue::Date(d) => write!(f, "{}", d), 154 | ExcelValue::DateTime(d) => write!(f, "{}", d), 155 | ExcelValue::Error(e) => write!(f, "#{}", e), 156 | ExcelValue::None => write!(f, ""), 157 | ExcelValue::Number(n) => write!(f, "{}", n), 158 | ExcelValue::String(s) => write!(f, "\"{}\"", s.replace(r#"""#, r#""""#)), 159 | ExcelValue::Time(t) => write!(f, "\"{}\"", t), 160 | } 161 | } 162 | } 163 | 164 | #[derive(Debug)] 165 | pub struct Cell<'a> { 166 | /// The value you get by converting the raw_value (a string) into a Rust value 167 | pub value: ExcelValue<'a>, 168 | /// The formula (may be "empty") of the cell 169 | pub formula: String, 170 | /// What cell are we looking at? E.g., B3, A1, etc. 171 | pub reference: String, 172 | /// The cell style (e.g., the style you see in Excel by hitting Ctrl+1 and going to the 173 | /// "Number" tab). 174 | pub style: String, 175 | /// The type of cell as recorded by Excel (s = string using sharedStrings.xml, str = raw 176 | /// string, b = boolean, etc.). This may change from a `String` type to an `Enum` of some sorts 177 | /// in the future. 
178 | pub cell_type: String, 179 | /// The raw string value recorded in the xml 180 | pub raw_value: String, 181 | } 182 | 183 | impl Cell<'_> { 184 | /// return the row/column coordinates of the current cell 185 | pub fn coordinates(&self) -> (u16, u32) { 186 | // let (col, row) = split_cell_reference(&self.reference); 187 | let (col, row) = { 188 | let r = &self.reference; 189 | let mut end = 0; 190 | for (i, c) in r.chars().enumerate() { 191 | if !c.is_ascii_alphabetic() { 192 | end = i; 193 | break 194 | } 195 | } 196 | (&r[..end], &r[end..]) 197 | }; 198 | let col = utils::col2num(col).unwrap(); 199 | let row = row.parse().unwrap(); 200 | (col, row) 201 | } 202 | } 203 | 204 | #[derive(Debug)] 205 | pub struct Row<'a>(pub Vec>, pub usize); 206 | 207 | impl<'a> Index for Row<'a> { 208 | type Output = Cell<'a>; 209 | 210 | fn index(&self, column_index: u16) -> &Self::Output { 211 | &self.0[column_index as usize] 212 | } 213 | } 214 | 215 | impl fmt::Display for Row<'_> { 216 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 217 | let vec = &self.0; 218 | for (count, v) in vec.iter().enumerate() { 219 | if count != 0 { write!(f, ",")?; } 220 | write!(f, "{}", v)?; 221 | } 222 | write!(f, "") 223 | } 224 | } 225 | 226 | impl fmt::Display for Cell<'_> { 227 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 228 | write!(f, "{}", self.value) 229 | } 230 | } 231 | 232 | pub struct RowIter<'a> { 233 | worksheet_reader: SheetReader<'a>, 234 | want_row: usize, 235 | next_row: Option>, 236 | num_rows: u32, 237 | num_cols: u16, 238 | done_file: bool, 239 | } 240 | 241 | fn new_cell() -> Cell<'static> { 242 | Cell { 243 | value: ExcelValue::None, 244 | formula: "".to_string(), 245 | reference: "".to_string(), 246 | style: "".to_string(), 247 | cell_type: "".to_string(), 248 | raw_value: "".to_string(), 249 | } 250 | } 251 | 252 | fn empty_row(num_cols: u16, this_row: usize) -> Option> { 253 | let mut row = vec![]; 254 | for n in 0..num_cols { 255 | let mut c = 
new_cell(); 256 | c.reference.push_str(&utils::num2col(n + 1).unwrap()); 257 | c.reference.push_str(&this_row.to_string()); 258 | row.push(c); 259 | } 260 | Some(Row(row, this_row)) 261 | } 262 | 263 | impl<'a> Iterator for RowIter<'a> { 264 | type Item = Row<'a>; 265 | 266 | fn next(&mut self) -> Option { 267 | // the xml in the xlsx file will not contain elements for empty rows. So 268 | // we need to "simulate" the empty rows since the user expects to see 269 | // them when they iterate over the worksheet. 270 | if let Some(Row(_, row_num)) = &self.next_row { 271 | // since we are currently buffering a row, we know we will either return it or a 272 | // "simulated" (i.e., emtpy) row. So we grab the current row and update the fact that 273 | // we will soon want a new row. We then figure out if we have the row we want or if we 274 | // need to keep spitting out empty rows. 275 | let current_row = self.want_row; 276 | self.want_row += 1; 277 | if *row_num == current_row { 278 | // we finally hit the row we were looking for, so we reset the buffer and return 279 | // the row that was sitting in it. 280 | let mut r = None; 281 | mem::swap(&mut r, &mut self.next_row); 282 | return r 283 | } else { 284 | // otherwise, we must still be sitting behind the row we want. So we return an 285 | // empty row to simulate the row that exists in the spreadsheet. 
286 | return empty_row(self.num_cols, current_row) 287 | } 288 | } else if self.done_file && self.want_row < self.num_rows as usize { 289 | self.want_row += 1; 290 | return empty_row(self.num_cols, self.want_row - 1) 291 | } 292 | let mut buf = Vec::new(); 293 | let reader = &mut self.worksheet_reader.reader; 294 | let strings = self.worksheet_reader.strings; 295 | let styles = self.worksheet_reader.styles; 296 | let date_system = self.worksheet_reader.date_system; 297 | let next_row = { 298 | let mut row: Vec = Vec::with_capacity(self.num_cols as usize); 299 | let mut in_cell = false; 300 | let mut in_value = false; 301 | let mut c = new_cell(); 302 | let mut this_row: usize = 0; 303 | loop { 304 | match reader.read_event(&mut buf) { 305 | /* may be able to get a better estimate for the used area */ 306 | Ok(Event::Empty(ref e)) if e.name() == b"dimension" => { 307 | if let Some(used_area_range) = utils::get(e.attributes(), b"ref") { 308 | if used_area_range != "A1" { 309 | let (rows, cols) = used_area(&used_area_range); 310 | self.num_cols = cols; 311 | self.num_rows = rows; 312 | } 313 | } 314 | }, 315 | /* -- end search for used area */ 316 | Ok(Event::Start(ref e)) if e.name() == b"row" => { 317 | this_row = utils::get(e.attributes(), b"r").unwrap().parse().unwrap(); 318 | }, 319 | Ok(Event::Start(ref e)) if e.name() == b"c" => { 320 | in_cell = true; 321 | e.attributes() 322 | .for_each(|a| { 323 | let a = a.unwrap(); 324 | if a.key == b"r" { 325 | c.reference = utils::attr_value(&a); 326 | } 327 | if a.key == b"t" { 328 | c.cell_type = utils::attr_value(&a); 329 | } 330 | if a.key == b"s" { 331 | if let Ok(num) = utils::attr_value(&a).parse::() { 332 | if let Some(style) = styles.get(num) { 333 | c.style = style.to_string(); 334 | } 335 | } 336 | } 337 | }); 338 | }, 339 | Ok(Event::Start(ref e)) if e.name() == b"v" || e.name() == b"t" => { 340 | in_value = true; 341 | }, 342 | // note: because v elements are children of c elements, 343 | // need this check 
to go before the 'in_cell' check 344 | Ok(Event::Text(ref e)) if in_value => { 345 | c.raw_value = e.unescape_and_decode(&reader).unwrap(); 346 | c.value = match &c.cell_type[..] { 347 | "s" => { 348 | if let Ok(pos) = c.raw_value.parse::() { 349 | let s = &strings[pos]; // .to_string() 350 | ExcelValue::String(Cow::Borrowed(s)) 351 | } else { 352 | ExcelValue::String(Cow::Owned(c.raw_value.clone())) 353 | } 354 | }, 355 | "str" | "inlineStr" => { 356 | ExcelValue::String(Cow::Owned(c.raw_value.clone())) 357 | }, 358 | "b" => { 359 | if c.raw_value == "0" { 360 | ExcelValue::Bool(false) 361 | } else { 362 | ExcelValue::Bool(true) 363 | } 364 | }, 365 | "bl" => ExcelValue::None, 366 | "e" => ExcelValue::Error(c.raw_value.to_string()), 367 | _ if is_date(&c) => { 368 | let num = c.raw_value.parse::().unwrap(); 369 | match utils::excel_number_to_date(num, date_system) { 370 | utils::DateConversion::Date(date) => ExcelValue::Date(date), 371 | utils::DateConversion::DateTime(date) => ExcelValue::DateTime(date), 372 | utils::DateConversion::Time(time) => ExcelValue::Time(time), 373 | utils::DateConversion::Number(num) => ExcelValue::Number(num as f64), 374 | } 375 | 376 | }, 377 | _ => ExcelValue::Number(c.raw_value.parse::().unwrap()), 378 | }; 379 | }, 380 | Ok(Event::Text(ref e)) if in_cell => { 381 | let txt = e.unescape_and_decode(&reader).unwrap(); 382 | c.formula.push_str(&txt) 383 | }, 384 | Ok(Event::End(ref e)) if e.name() == b"v" || e.name() == b"t" => { 385 | in_value = false; 386 | }, 387 | Ok(Event::End(ref e)) if e.name() == b"c" => { 388 | if let Some(prev) = row.last() { 389 | let (mut last_col, _) = prev.coordinates(); 390 | let (this_col, this_row) = c.coordinates(); 391 | while this_col > last_col + 1 { 392 | let mut cell = new_cell(); 393 | cell.reference.push_str(&utils::num2col(last_col + 1).unwrap()); 394 | cell.reference.push_str(&this_row.to_string()); 395 | row.push(cell); 396 | last_col += 1; 397 | } 398 | row.push(c); 399 | } else { 400 | let 
(this_col, this_row) = c.coordinates(); 401 | for n in 1..this_col { 402 | let mut cell = new_cell(); 403 | cell.reference.push_str(&utils::num2col(n).unwrap()); 404 | cell.reference.push_str(&this_row.to_string()); 405 | row.push(cell); 406 | } 407 | row.push(c); 408 | } 409 | c = new_cell(); 410 | in_cell = false; 411 | }, 412 | Ok(Event::End(ref e)) if e.name() == b"row" => { 413 | self.num_cols = cmp::max(self.num_cols, row.len() as u16); 414 | while row.len() < self.num_cols as usize { 415 | let mut cell = new_cell(); 416 | cell.reference.push_str(&utils::num2col(row.len() as u16 + 1).unwrap()); 417 | cell.reference.push_str(&this_row.to_string()); 418 | row.push(cell); 419 | } 420 | let next_row = Some(Row(row, this_row)); 421 | if this_row == self.want_row { 422 | break next_row 423 | } else { 424 | self.next_row = next_row; 425 | break empty_row(self.num_cols, self.want_row) 426 | } 427 | }, 428 | Ok(Event::Eof) => break None, 429 | Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e), 430 | _ => (), 431 | } 432 | buf.clear(); 433 | } 434 | }; 435 | self.want_row += 1; 436 | if next_row.is_none() && self.want_row - 1 < self.num_rows as usize { 437 | self.done_file = true; 438 | return empty_row(self.num_cols, self.want_row - 1); 439 | } 440 | next_row 441 | } 442 | } 443 | 444 | fn is_date(cell: &Cell) -> bool { 445 | let is_d = cell.style == "d"; 446 | let is_like_d_and_not_like_red = cell.style.contains('d') && !cell.style.contains("Red"); 447 | let is_like_m = cell.style.contains('m'); 448 | if is_d || is_like_d_and_not_like_red || is_like_m { 449 | true 450 | } else { 451 | cell.style.contains('y') 452 | } 453 | } 454 | 455 | #[cfg(test)] 456 | mod tests { 457 | use crate::{ExcelValue, Workbook}; 458 | use std::borrow::Cow; 459 | 460 | #[test] 461 | fn test_ups() { 462 | let mut wb = Workbook::open("./tests/data/UPS.Galaxy.VS.PX.xlsx").unwrap(); 463 | let sheets = wb.sheets(); 464 | let ws = sheets.get("Table001 (Page 
/// Excel spreadsheets support two different date systems:
///
/// - the 1900 date system
/// - the 1904 date system
///
/// Under the 1900 system, the first date supported is January 1, 1900. Under the 1904 system,
/// the first date supported is January 1, 1904. Under either system, a date is represented as
/// the number of days that have elapsed since the first date. So you can't actually tell what
/// date a number represents unless you also know the date system the spreadsheet uses.
///
/// See Microsoft's support article "Date systems in Excel" for more information.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DateSystem {
    V1900,
    V1904,
}
33 | /// 34 | /// # Example usage: 35 | /// 36 | /// use xl::Workbook; 37 | /// let mut wb = Workbook::open("tests/data/Book1.xlsx").unwrap(); 38 | /// 39 | #[derive(Debug)] 40 | pub struct Workbook { 41 | pub path: String, 42 | xls: ZipArchive, 43 | // encoding: String, 44 | pub date_system: DateSystem, 45 | strings: Vec, 46 | styles: Vec, 47 | } 48 | 49 | /// A `SheetMap` is an object containing all the sheets in a given workbook. The only way to obtain 50 | /// a `SheetMap` is from an `xl::Worksheet` object. 51 | /// 52 | /// # Example usage: 53 | /// 54 | /// use xl::{Workbook, Worksheet}; 55 | /// 56 | /// let mut wb = Workbook::open("tests/data/Book1.xlsx").unwrap(); 57 | /// let sheets = wb.sheets(); 58 | #[derive(Debug)] 59 | pub struct SheetMap { 60 | sheets_by_name: HashMap::, 61 | sheets_by_num: Vec>, 62 | } 63 | 64 | impl SheetMap { 65 | /// After you obtain a `SheetMap`, `by_name` gives you a list of sheets in the `SheetMap` 66 | /// ordered by their position in the workbook. 67 | /// 68 | /// Example usage: 69 | /// 70 | /// use xl::{Workbook, Worksheet}; 71 | /// 72 | /// let mut wb = Workbook::open("tests/data/Book1.xlsx").unwrap(); 73 | /// let sheets = wb.sheets(); 74 | /// let sheet_names = sheets.by_name(); 75 | /// assert_eq!(sheet_names[2], "Time"); 76 | /// 77 | /// Note that the returned array is **ZERO** based rather than **ONE** based like `get`. The 78 | /// reason for this is that we want `get` to act like VBA, but here we are only looking for a 79 | /// list of names so the `Option` type seemed like overkill. (We have `get` act like VBA 80 | /// because I expect people who will use this library will be very used to that "style" and may 81 | /// expect the same thing in this library. If it becomes an issue, we can change it later). 
/// Enum that lets you refer to a sheet either by name or by position (1-based).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SheetNameOrNum<'a> {
    Name(&'a str),
    Pos(usize),
}

/// Trait that makes it easy to call `get` with either a sheet name or a sheet position. You
/// will probably not use this trait directly.
pub trait SheetAccessTrait {
    /// Convert the value into the matching `SheetNameOrNum` variant.
    fn go(&self) -> SheetNameOrNum<'_>;
}

impl SheetAccessTrait for &str {
    fn go(&self) -> SheetNameOrNum<'_> {
        SheetNameOrNum::Name(*self)
    }
}

impl SheetAccessTrait for usize {
    fn go(&self) -> SheetNameOrNum<'_> {
        SheetNameOrNum::Pos(*self)
    }
}
117 | /// 118 | /// # Example usage 119 | /// 120 | /// use xl::{Workbook, Worksheet}; 121 | /// 122 | /// let mut wb = Workbook::open("tests/data/Book1.xlsx").unwrap(); 123 | /// let sheets = wb.sheets(); 124 | /// 125 | /// // by sheet name 126 | /// let time_sheet = sheets.get("Time"); 127 | /// assert!(time_sheet.is_some()); 128 | /// 129 | /// // unknown sheet name 130 | /// let unknown_sheet = sheets.get("not in this workbook"); 131 | /// assert!(unknown_sheet.is_none()); 132 | /// 133 | /// // by position 134 | /// let unknown_sheet = sheets.get(1); 135 | /// assert_eq!(unknown_sheet.unwrap().name, "Sheet1"); 136 | pub fn get(&self, sheet: T) -> Option<&Worksheet> { 137 | let sheet = sheet.go(); 138 | match sheet { 139 | SheetNameOrNum::Name(n) => { 140 | match self.sheets_by_name.get(n) { 141 | Some(p) => self.sheets_by_num.get(*p as usize)?.as_ref(), 142 | None => None 143 | } 144 | }, 145 | SheetNameOrNum::Pos(n) => self.sheets_by_num.get(n)?.as_ref(), 146 | } 147 | } 148 | 149 | /// The number of active sheets in the workbook. 150 | /// 151 | /// # Example usage 152 | /// 153 | /// use xl::{Workbook, Worksheet}; 154 | /// 155 | /// let mut wb = Workbook::open("tests/data/Book1.xlsx").unwrap(); 156 | /// let sheets = wb.sheets(); 157 | /// assert_eq!(sheets.len(), 4); 158 | pub fn len(&self) -> u8 { 159 | (self.sheets_by_num.len() - 1) as u8 160 | } 161 | } 162 | 163 | impl Workbook { 164 | /// xlsx zips contain an xml file that has a mapping of "ids" to "targets." The ids are used 165 | /// to uniquely identify sheets within the file. The targets have information on where the 166 | /// sheets can be found within the zip. This function returns a hashmap of id -> target so that 167 | /// you can quickly determine the name of the sheet xml file within the zip. 
168 | fn rels(&mut self) -> HashMap { 169 | let mut map = HashMap::new(); 170 | match self.xls.by_name("xl/_rels/workbook.xml.rels") { 171 | Ok(rels) => { 172 | // Looking for tree structure like: 173 | // Relationships 174 | // Relationship(id = "abc", target = "def") 175 | // Relationship(id = "ghi", target = "lkm") 176 | // etc. 177 | // Each relationship contains an id that is used to reference 178 | // the sheet and a target which tells us where we can find the 179 | // sheet in the zip file. 180 | // 181 | // Uncomment the following line to print out a copy of what 182 | // the xml looks like (will probably not be too big). 183 | // let _ = std::io::copy(&mut rels, &mut std::io::stdout()); 184 | 185 | let reader = BufReader::new(rels); 186 | let mut reader = Reader::from_reader(reader); 187 | reader.trim_text(true); 188 | 189 | let mut buf = Vec::new(); 190 | loop { 191 | match reader.read_event(&mut buf) { 192 | Ok(Event::Empty(ref e)) if e.name() == b"Relationship" => { 193 | let mut id = String::new(); 194 | let mut target = String::new(); 195 | e.attributes() 196 | .for_each(|a| { 197 | let a = a.unwrap(); 198 | if a.key == b"Id" { 199 | id = utils::attr_value(&a); 200 | } 201 | if a.key == b"Target" { 202 | target = utils::attr_value(&a); 203 | } 204 | }); 205 | map.insert(id, target); 206 | }, 207 | Ok(Event::Eof) => break, // exits the loop when reaching end of file 208 | Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e), 209 | _ => (), // There are several other `Event`s we do not consider here 210 | } 211 | buf.clear(); 212 | } 213 | 214 | map 215 | }, 216 | Err(_) => map 217 | } 218 | } 219 | 220 | /// Return `SheetMap` of all sheets in this workbook. See `SheetMap` class and associated 221 | /// methods for more detailed documentation. 
222 | pub fn sheets(&mut self) -> SheetMap { 223 | let rels = self.rels(); 224 | let num_sheets = rels.iter().filter(|(_, v)| v.starts_with("worksheet")).count(); 225 | let mut sheets = SheetMap { 226 | sheets_by_name: HashMap::new(), 227 | sheets_by_num: Vec::with_capacity(num_sheets + 1), 228 | }; 229 | sheets.sheets_by_num.push(None); // never a "0" sheet (consistent with VBA) 230 | 231 | match self.xls.by_name("xl/workbook.xml") { 232 | Ok(wb) => { 233 | // let _ = std::io::copy(&mut wb, &mut std::io::stdout()); 234 | let reader = BufReader::new(wb); 235 | let mut reader = Reader::from_reader(reader); 236 | reader.trim_text(true); 237 | 238 | let mut buf = Vec::new(); 239 | let mut current_sheet_num: u8 = 0; 240 | loop { 241 | match reader.read_event(&mut buf) { 242 | Ok(Event::Empty(ref e)) if e.name() == b"sheet" => { 243 | current_sheet_num += 1; 244 | let mut name = String::new(); 245 | let mut id = String::new(); 246 | let mut num = 0; 247 | e.attributes() 248 | .for_each(|a| { 249 | let a = a.unwrap(); 250 | if a.key == b"r:id" { 251 | id = utils::attr_value(&a); 252 | } 253 | if a.key == b"name" { 254 | name = utils::attr_value(&a); 255 | } 256 | if a.key == b"sheetId" { 257 | if let Ok(r) = utils::attr_value(&a).parse() { 258 | num = r; 259 | } 260 | } 261 | }); 262 | sheets.sheets_by_name.insert(name.clone(), current_sheet_num); 263 | let target = { 264 | let s = rels.get(&id).unwrap(); 265 | if let Some(stripped) = s.strip_prefix('/') { 266 | stripped.to_string() 267 | } else { 268 | "xl/".to_owned() + s 269 | } 270 | }; 271 | let ws = Worksheet::new(name, current_sheet_num, target); 272 | sheets.sheets_by_num.push(Some(ws)); 273 | }, 274 | Ok(Event::Eof) => { 275 | break 276 | }, 277 | Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e), 278 | _ => (), 279 | } 280 | buf.clear(); 281 | } 282 | sheets 283 | }, 284 | Err(_) => sheets 285 | } 286 | } 287 | 288 | /// Open an existing workbook (xlsx file). 
Returns a `Result` in case there is an error opening 289 | /// the workbook. 290 | /// 291 | /// # Example usage: 292 | /// 293 | /// use xl::Workbook; 294 | /// 295 | /// let mut wb = Workbook::open("tests/data/Book1.xlsx"); 296 | /// assert!(wb.is_ok()); 297 | /// 298 | /// // non-existant file 299 | /// let mut wb = Workbook::open("Non-existant xlsx"); 300 | /// assert!(wb.is_err()); 301 | /// 302 | /// // non-xlsx file 303 | /// let mut wb = Workbook::open("src/main.rs"); 304 | /// assert!(wb.is_err()); 305 | pub fn new(path: &str) -> Result { 306 | if !std::path::Path::new(&path).exists() { 307 | let err = format!("'{}' does not exist", &path); 308 | return Err(err); 309 | } 310 | let zip_file = match fs::File::open(&path) { 311 | Ok(z) => z, 312 | Err(e) => return Err(e.to_string()), 313 | }; 314 | match zip::ZipArchive::new(zip_file) { 315 | Ok(mut xls) => { 316 | let strings = strings(&mut xls); 317 | let styles = find_styles(&mut xls); 318 | let date_system = get_date_system(&mut xls); 319 | Ok(Workbook { 320 | path: path.to_string(), 321 | xls, 322 | // encoding: String::from("utf8"), 323 | date_system, 324 | strings, 325 | styles, 326 | }) 327 | }, 328 | Err(e) => Err(e.to_string()) 329 | } 330 | } 331 | 332 | /// Alternative name for `Workbook::new`. 333 | pub fn open(path: &str) -> Result { Workbook::new(path) } 334 | 335 | /// Simple method to print out all the inner files of the xlsx zip. 336 | pub fn contents(&mut self) { 337 | for i in 0 .. self.xls.len() { 338 | let file = self.xls.by_index(i).unwrap(); 339 | let outpath = match file.enclosed_name() { 340 | Some(path) => path.to_owned(), 341 | None => continue, 342 | }; 343 | 344 | if (&*file.name()).ends_with('/') { 345 | println!("File {}: \"{}\"", i, outpath.display()); 346 | } else { 347 | println!( 348 | "File {}: \"{}\" ({} bytes)", 349 | i, 350 | outpath.display(), 351 | file.size() 352 | ); 353 | } 354 | } 355 | } 356 | 357 | /// Create a SheetReader for the given worksheet. 
A `SheetReader` is a struct in the 358 | /// `xl::Worksheet` class that can be used to iterate over rows, etc. See documentation in the 359 | /// `xl::Worksheet` module for more information. 360 | pub fn sheet_reader<'a>(&'a mut self, zip_target: &str) -> SheetReader<'a> { 361 | let target = match self.xls.by_name(zip_target) { 362 | Ok(ws) => ws, 363 | Err(_) => panic!("Could not find worksheet: {}", zip_target) 364 | }; 365 | // let _ = std::io::copy(&mut target, &mut std::io::stdout()); 366 | let reader = BufReader::new(target); 367 | let mut reader = Reader::from_reader(reader); 368 | reader.trim_text(true); 369 | SheetReader::new(reader, &self.strings, &self.styles, &self.date_system) 370 | } 371 | 372 | } 373 | 374 | 375 | fn strings(zip_file: &mut ZipArchive) -> Vec { 376 | let mut strings = Vec::new(); 377 | match zip_file.by_name("xl/sharedStrings.xml") { 378 | Ok(strings_file) => { 379 | let reader = BufReader::new(strings_file); 380 | let mut reader = Reader::from_reader(reader); 381 | reader.trim_text(true); 382 | let mut buf = Vec::new(); 383 | let mut this_string = String::new(); 384 | let mut preserve_space = false; 385 | loop { 386 | match reader.read_event(&mut buf) { 387 | Ok(Event::Start(ref e)) if e.name() == b"t" => { 388 | if let Some(att) = utils::get(e.attributes(), b"xml:space") { 389 | if att == "preserve" { 390 | preserve_space = true; 391 | } else { 392 | preserve_space = false; 393 | } 394 | } else { 395 | preserve_space = false; 396 | } 397 | }, 398 | Ok(Event::Text(ref e)) => this_string.push_str(&e.unescape_and_decode(&reader).unwrap()[..]), 399 | Ok(Event::Empty(ref e)) if e.name() == b"t" => strings.push("".to_owned()), 400 | Ok(Event::End(ref e)) if e.name() == b"t" => { 401 | if preserve_space { 402 | strings.push(this_string.to_owned()); 403 | } else { 404 | strings.push(this_string.trim().to_owned()); 405 | } 406 | this_string = String::new(); 407 | }, 408 | Ok(Event::Eof) => break, 409 | Err(e) => panic!("Error at position {}: 
{:?}", reader.buffer_position(), e), 410 | _ => (), 411 | } 412 | buf.clear(); 413 | } 414 | strings 415 | }, 416 | Err(_) => strings 417 | } 418 | } 419 | 420 | /// find the number of rows and columns used in a particular worksheet. takes the workbook xlsx 421 | /// location as its first parameter, and the location of the worksheet in question (within the zip) 422 | /// as the second parameter. Returns a tuple of (rows, columns) in the worksheet. 423 | fn find_styles(xlsx: &mut ZipArchive) -> Vec { 424 | let mut styles = Vec::new(); 425 | let mut number_formats = standard_styles(); 426 | let styles_xml = match xlsx.by_name("xl/styles.xml") { 427 | Ok(s) => s, 428 | Err(_) => return styles 429 | }; 430 | // let _ = std::io::copy(&mut styles_xml, &mut std::io::stdout()); 431 | let reader = BufReader::new(styles_xml); 432 | let mut reader = Reader::from_reader(reader); 433 | reader.trim_text(true); 434 | let mut buf = Vec::new(); 435 | let mut record_styles = false; 436 | loop { 437 | match reader.read_event(&mut buf) { 438 | Ok(Event::Empty(ref e)) if e.name() == b"numFmt" => { 439 | let id = utils::get(e.attributes(), b"numFmtId").unwrap(); 440 | let code = utils::get(e.attributes(), b"formatCode").unwrap(); 441 | number_formats.insert(id, code); 442 | }, 443 | Ok(Event::Start(ref e)) if e.name() == b"cellXfs" => { 444 | // Section 2.1.589 Part 1 Section 18.3.1.4, c (Cell) 445 | // Item g. states that Office specifies that @s indexes into the cellXfs collection 446 | // in the style part. See https://tinyurl.com/yju9a6ox for more information. 
447 | record_styles = true; 448 | }, 449 | Ok(Event::End(ref e)) if e.name() == b"cellXfs" => record_styles = false, 450 | Ok(Event::Start(ref e)) | Ok(Event::Empty(ref e)) if record_styles && e.name() == b"xf" => { 451 | let id = utils::get(e.attributes(), b"numFmtId").unwrap(); 452 | if number_formats.contains_key(&id) { 453 | styles.push(number_formats.get(&id).unwrap().to_string()); 454 | } 455 | }, 456 | Ok(Event::Eof) => break, 457 | Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e), 458 | _ => (), 459 | } 460 | buf.clear(); 461 | } 462 | styles 463 | } 464 | 465 | /// Return hashmap of standard styles (ISO/IEC 29500:2011 in Part 1, section 18.8.30) 466 | fn standard_styles() -> HashMap { 467 | let mut styles = HashMap::new(); 468 | let standard_styles = [ 469 | ["0", "General",], 470 | ["1", "0",], 471 | ["2", "0.00",], 472 | ["3", "#,##0",], 473 | ["4", "#,##0.00",], 474 | ["9", "0%",], 475 | ["10", "0.00%",], 476 | ["11", "0.00E+00",], 477 | ["12", "# ?/?",], 478 | ["13", "# ??/??",], 479 | ["14", "mm-dd-yy",], 480 | ["15", "d-mmm-yy",], 481 | ["16", "d-mmm",], 482 | ["17", "mmm-yy",], 483 | ["18", "h:mm AM/PM",], 484 | ["19", "h:mm:ss AM/PM",], 485 | ["20", "h:mm",], 486 | ["21", "h:mm:ss",], 487 | ["22", "m/d/yy h:mm",], 488 | ["37", "#,##0 ;(#,##0)",], 489 | ["38", "#,##0 ;[Red](#,##0)",], 490 | ["39", "#,##0.00;(#,##0.00)",], 491 | ["40", "#,##0.00;[Red](#,##0.00)",], 492 | ["45", "mm:ss",], 493 | ["46", "[h]:mm:ss",], 494 | ["47", "mmss.0",], 495 | ["48", "##0.0E+0",], 496 | ["49", "@",], 497 | ]; 498 | for style in standard_styles { 499 | let [id, code] = style; 500 | styles.insert(id.to_string(), code.to_string()); 501 | } 502 | styles 503 | } 504 | 505 | fn get_date_system(xlsx: &mut ZipArchive) -> DateSystem { 506 | match xlsx.by_name("xl/workbook.xml") { 507 | Ok(wb) => { 508 | let reader = BufReader::new(wb); 509 | let mut reader = Reader::from_reader(reader); 510 | reader.trim_text(true); 511 | let mut buf = 
Vec::new(); 512 | loop { 513 | match reader.read_event(&mut buf) { 514 | Ok(Event::Empty(ref e)) if e.name() == b"workbookPr" => { 515 | if let Some(system) = utils::get(e.attributes(), b"date1904") { 516 | if system == "1" { 517 | break DateSystem::V1904 518 | } 519 | } 520 | break DateSystem::V1900 521 | }, 522 | Ok(Event::Eof) => break DateSystem::V1900, 523 | Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e), 524 | _ => (), 525 | } 526 | buf.clear(); 527 | } 528 | }, 529 | Err(_) => panic!("Could not find xl/workbook.xml") 530 | } 531 | } 532 | 533 | #[cfg(test)] 534 | mod tests { 535 | mod access { 536 | use super::super::*; 537 | 538 | #[test] 539 | fn open_wb() { 540 | let wb = Workbook::open("tests/data/Book1.xlsx"); 541 | assert!(wb.is_ok()); 542 | } 543 | 544 | #[test] 545 | fn all_sheets() { 546 | let mut wb = Workbook::open("tests/data/Book1.xlsx").unwrap(); 547 | let num_sheets = wb.sheets().len(); 548 | assert_eq!(num_sheets, 4); 549 | } 550 | 551 | #[test] 552 | fn sheet_by_name_exists() { 553 | let mut wb = Workbook::open("tests/data/Book1.xlsx").unwrap(); 554 | let sheets = wb.sheets(); 555 | assert!(sheets.get("Time").is_some()); 556 | } 557 | 558 | #[test] 559 | fn sheet_by_num_exists() { 560 | let mut wb = Workbook::open("tests/data/Book1.xlsx").unwrap(); 561 | let sheets = wb.sheets(); 562 | assert!(sheets.get(1).is_some()); 563 | } 564 | 565 | #[test] 566 | fn sheet_by_name_not_exists() { 567 | let mut wb = Workbook::open("tests/data/Book1.xlsx").unwrap(); 568 | let sheets = wb.sheets(); 569 | assert!(!sheets.get("Unknown").is_some()); 570 | } 571 | 572 | #[test] 573 | fn sheet_by_num_not_exists() { 574 | let mut wb = Workbook::open("tests/data/Book1.xlsx").unwrap(); 575 | let sheets = wb.sheets(); 576 | assert!(!sheets.get(0).is_some()); 577 | } 578 | 579 | #[test] 580 | fn correct_sheet_name() { 581 | let mut wb = Workbook::open("tests/data/Book1.xlsx").unwrap(); 582 | let sheets = wb.sheets(); 583 | 
assert_eq!(sheets.get("Time").unwrap().name, "Time"); 584 | } 585 | 586 | #[test] 587 | fn inline_strings() { 588 | let mut wb = Workbook::open("tests/data/inlinestrings.xlsx").unwrap(); 589 | let sheets = wb.sheets(); 590 | let ws = sheets.get("Sheet Name").unwrap(); 591 | let row1 = ws.rows(&mut wb).nth(0).unwrap(); 592 | let v1 = &row1[0]; 593 | assert_eq!(v1.to_string(), "\"Cell A1\"".to_string()); 594 | } 595 | } 596 | } 597 | --------------------------------------------------------------------------------