├── .gitignore ├── LICENSE ├── README.md ├── build.sbt ├── docs ├── IcebergSparkSQL.bib ├── IcebergSparkSQL.org ├── IcebergSparkSQL.pdf ├── basicExample.sql ├── iceSnapshots.png ├── iceTblCommit.png ├── iceTblScan.png ├── insIcePlan.png ├── insPlan.png └── writePath.png ├── project ├── assembly.sbt ├── build.properties └── plugins.sbt ├── scalastyle-config.xml └── src ├── main └── scala │ └── org │ └── apache │ └── spark │ └── sql │ ├── execution │ └── datasources │ │ └── iceberg │ │ └── PartitioningUtils.scala │ └── iceberg │ ├── Config.scala │ ├── CreateIcebergTable.scala │ ├── IceTableScanExec.scala │ ├── IcebergFileCommitProtocol.scala │ ├── InsertIntoIcebergTable.scala │ ├── parsing │ ├── AbstractSparkSQLParser.scala │ └── IceParser.scala │ ├── planning │ ├── CreateTableRules.scala │ ├── IcebergScanStrategy.scala │ ├── IcebergTableValidationChecks.scala │ ├── IcebergTableWriteRule.scala │ └── SparkSessionExtensions.scala │ ├── table │ ├── SparkTableOperations.scala │ └── SparkTables.scala │ └── utils │ ├── ColumnDependenciesExprVisitor.scala │ ├── DataFileUtils.scala │ ├── DelegationUtils.scala │ ├── ExpressionConversions.scala │ ├── LiteralUtils.scala │ ├── ParquetMetrics.scala │ ├── SchemaConversions.scala │ ├── TableUtils.scala │ ├── Transforms.scala │ ├── TypeConversions.scala │ └── utils.scala └── test ├── resources └── store_sales │ ├── ss_sold_date_sk=0905245 │ ├── part-00004-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet │ ├── part-00010-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet │ ├── part-00017-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet │ ├── part-00022-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet │ └── part-00026-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet │ ├── ss_sold_date_sk=0906245 │ ├── part-00016-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet │ ├── part-00018-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet │ ├── part-00020-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet │ ├── part-00023-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet │ └── part-00029-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet │ ├── ss_sold_date_sk=0907245 │ ├── part-00000-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet │ ├── part-00003-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet │ ├── part-00008-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet │ ├── part-00012-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet │ └── part-00019-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet │ ├── ss_sold_date_sk=0908245 │ ├── part-00005-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet │ ├── part-00006-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet │ ├── part-00014-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet │ ├── part-00024-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet │ └── part-00025-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet │ ├── ss_sold_date_sk=0909245 │ ├── part-00001-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet │ ├── part-00002-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet │ ├── part-00009-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet │ ├── part-00013-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet │ └── part-00021-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet │ └── ss_sold_date_sk=0910245 │ ├── part-00007-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet │ ├── part-00011-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet │ ├── 
part-00015-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet │ ├── part-00027-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet │ └── part-00028-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet └── scala └── org └── apache └── spark └── sql └── hive └── test └── iceberg ├── AbstractTest.scala ├── BasicCreateAndInsertTest.scala ├── TestIcebergHive.scala └── TestTables.scala /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | *.log 3 | *.pyc 4 | sbt/*.jar 5 | *.crc 6 | 7 | # sbt specific 8 | .cache/ 9 | .history/ 10 | .lib/ 11 | dist/* 12 | target/ 13 | lib_managed/ 14 | src_managed/ 15 | project/boot/ 16 | project/plugins/project/ 17 | 18 | # idea 19 | .idea/ 20 | *.iml 21 | 22 | # Mac 23 | .DS_Store 24 | 25 | # emacs 26 | *.*~ 27 | 28 | # docs 29 | docs/private/ 30 | docs/*.html 31 | docs/*.tex 32 | docs/auto/ 33 | *-blx.bib 34 | *.bbl 35 | *.blg 36 | *.fdb_latexmk 37 | *.fls 38 | docs/*.xml 39 | docs/*.key 40 | *.tex 41 | *.tiff 42 | docs/benchmark/*.html 43 | **/auto/ 44 | docs/notes/*_ImplNotes.org 45 | 46 | 47 | derby.log 48 | 49 | #test 50 | src/test/resources/store_sales_out/ 51 | src/test/scala/org/apache/spark/sql/hive/test/iceberg/temp/ 52 | _SUCCESS 53 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /build.sbt: -------------------------------------------------------------------------------- 1 | scalaVersion in ThisBuild := "2.11.8" 2 | 3 | crossScalaVersions := Seq("2.10.5", "2.11.8") 4 | 5 | parallelExecution in Test := false 6 | 7 | val sparkVersion = "2.4.4" 8 | val icebergVersion = "0.6.3" 9 | val scalatestVersion = "3.0.5" 10 | val icebergSQLVersion = "0.0.1-SNAPSHOT" 11 | val derbyVersion = "10.11.1.1" 12 | 13 | resolvers += "jitpack" at "https://jitpack.io" 14 | 15 | val coreDependencies = Seq( 16 | "org.apache.spark" %% "spark-core" % sparkVersion % "provided", 17 | "org.apache.spark" %% "spark-sql" % sparkVersion % "provided", 18 | "org.apache.spark" %% "spark-hive" % sparkVersion % "provided", 19 | "org.apache.derby" % "derby" % derbyVersion % "provided" force(), 20 | "com.github.netflix.iceberg" % "iceberg-api" % icebergVersion , 21 | "com.github.netflix.iceberg" % "iceberg-common" % icebergVersion , 22 | "com.github.netflix.iceberg" % "iceberg-core" % icebergVersion 23 | ) 24 | 25 | val coreTestDependencies = Seq( 26 | "org.scalatest" %% "scalatest" % scalatestVersion % "test", 27 | "org.pegdown" % "pegdown" % "1.6.0" % "test" 28 | ) 29 | 30 | lazy val commonSettings = Seq( 31 | organization := "org.rhbutani", 32 | 33 | version := icebergSQLVersion, 34 | 35 | javaOptions := Seq("-Xms1g", "-Xmx3g", 36 | "-Duser.timezone=UTC", 37 | "-Dscalac.patmat.analysisBudget=512", 38 | "-XX:MaxPermSize=256M"), 39 | 40 | // Target Java 8 41 | scalacOptions += "-target:jvm-1.8", 42 | javacOptions in compile ++= Seq("-source", "1.8", "-target", "1.8"), 43 | 44 | scalacOptions := Seq("-feature", "-deprecation" 45 | //, "-Ylog-classpath" 46 | ), 47 | 48 | licenses := Seq("Apache License, Version 2.0" -> 49 | url("http://www.apache.org/licenses/LICENSE-2.0") 50 | ), 51 | 52 | homepage := Some(url("https://github.com/hbutani/icebergSparkSQL")), 53 | 54 | publishArtifact in (Test,packageDoc) := false, 55 | 56 | pomIncludeRepository := { _ => false }, 57 | 58 | test in assembly := {}, 59 | 60 | pomExtra := ( 61 | 62 | https://github.com/hbutani/icebergSparkSQL.git 63 | scm:git:git@github.com:hbutani/icebergSparkSQL.git 64 | 65 | 66 | 67 | Harish Butani 68 | hbutani 69 | http://github.com/hbutani 70 | 71 | ), 72 | 73 | fork in Test := false, 74 | parallelExecution in Test := false 75 | ) 76 | 77 | lazy val root = project.in(file(".")) 78 | // .enablePlugins(GitVersioning) 79 | .settings(commonSettings: _*) 80 | .settings( 81 | name := s"icebergSparkSQL", 82 | libraryDependencies ++= (coreDependencies ++ coreTestDependencies), 83 | publishArtifact in (Compile, packageBin) := false, 84 | publishArtifact in Test := false, 85 | testOptions in Test ++= Seq( 86 | Tests.Argument(TestFrameworks.ScalaTest, "-h", "target/test-reports"), 87 | Tests.Argument(TestFrameworks.ScalaTest, "-oDT") 88 | ), 89 | aggregate in assembly := false 90 | ) 91 | .settings(addArtifact(artifact in (Compile, assembly), assembly).settings: _*) 92 | 93 | -------------------------------------------------------------------------------- /docs/IcebergSparkSQL.bib: -------------------------------------------------------------------------------- 1 | @misc{icespec, 2 | title = {Iceberg Table Spec}, 3 | howpublished={\url{https://iceberg.apache.org/spec/}} 4 | } 5 | 6 | @misc{icepres, 7 | title = {Iceberg, A modern table format for big data}, 8 | howpublished={\url{https://tinyurl.com/y6z6rcg4}} 9 | } 10 | 11 | @misc{icenet, 12 | title = {Iceberg 
Netflix Github}, 13 | howpublished={\url{https://github.com/Netflix/iceberg}} 14 | } 15 | 16 | @misc{iceapache, 17 | title = {Iceberg Apache Github}, 18 | howpublished={\url{https://github.com/apache/incubator-iceberg}} 19 | } 20 | 21 | @misc{sparkdsv2, 22 | title = {Apache Spark Data Source V2}, 23 | howpublished={\url{https://databricks.com/session/apache-spark-data-source-v2}} 24 | } 25 | 26 | @misc{sparkdsv2jira1, 27 | title = {Data source API v2}, 28 | howpublished={\url{https://tinyurl.com/y5u576gk}} 29 | } 30 | 31 | @misc{sparkdsv2jira2, 32 | title = {Data Source V2 improvements}, 33 | howpublished={\url{https://tinyurl.com/yylna72p}} 34 | } 35 | 36 | @misc{oow, 37 | title={Innovating with the Oracle Platform for Data Science and Big Data}, 38 | author={Greg Pavlik and Diwakar Goel}, 39 | howpublished={\url{https://tinyurl.com/y59k23bf}} 40 | } 41 | 42 | @misc{orasnap, 43 | title={Interactive Data Lake Queries At Scale}, 44 | author={Peter Jeffcock}, 45 | howpublished = {\url{https://tinyurl.com/y8hbyp9q}} 46 | } 47 | 48 | @misc{icesparksql, 49 | title={Iceberg Spark SQL Github}, 50 | howpublished = {\url{https://github.com/hbutani/icebergSparkSQL}} 51 | } -------------------------------------------------------------------------------- /docs/IcebergSparkSQL.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbutani/icebergSQL/63c26ae00bb03bb7d05bfbeac4fe6787b019d505/docs/IcebergSparkSQL.pdf -------------------------------------------------------------------------------- /docs/basicExample.sql: -------------------------------------------------------------------------------- 1 | -- input table 2 | create table if not exists store_sales 3 | ( 4 | ss_sold_time_sk int, 5 | ss_item_sk int, 6 | ss_customer_sk int, 7 | ss_cdemo_sk int, 8 | ss_hdemo_sk int, 9 | ss_addr_sk int, 10 | ss_store_sk int, 11 | ss_promo_sk int, 12 | ss_quantity int, 13 | ss_wholesale_cost decimal(7,2), 14 | ss_list_price decimal(7,2), 15 | ss_sales_price decimal(7,2), 16 | ss_ext_sales_price decimal(7,2), 17 | ss_sold_month string, 18 | ss_sold_day string, 19 | ss_sold_date_sk string 20 | ) 21 | USING parquet 22 | OPTIONS ( 23 | path '/Users/hbutani/sparkline/icebergSQL/src/test/resources/store_sales' 24 | ) 25 | partitioned by (ss_sold_date_sk) 26 | ; 27 | 28 | msck repair table store_sales; 29 | 30 | -- managed table 31 | create table if not exists store_sales_out 32 | ( 33 | ss_sold_time_sk int, 34 | ss_item_sk int, 35 | ss_customer_sk int, 36 | ss_cdemo_sk int, 37 | ss_hdemo_sk int, 38 | ss_addr_sk int, 39 | ss_store_sk int, 40 | ss_promo_sk int, 41 | ss_quantity int, 42 | ss_wholesale_cost decimal(7,2), 43 | ss_list_price decimal(7,2), 44 | ss_sales_price decimal(7,2), 45 | ss_ext_sales_price decimal(7,2), 46 | ss_sold_month string, 47 | ss_sold_day string, 48 | ss_sold_date_sk string 49 | ) 50 | USING parquet 51 | OPTIONS ( 52 | path '/tmp/store_sales_out', 53 | addTableManagement "true", 54 | columnDependencies "ss_sold_date_sk=ss_sold_month:truncate[2],ss_sold_date_sk=ss_sold_day:truncate[4]" 55 | ) 56 | partitioned by (ss_sold_date_sk) 57 | ; 58 | 59 | -- first insert 60 | insert into store_sales_out 61 | select * from store_sales; 62 | 63 | -- show snapshots 64 | select * from `store_sales_out$snapshots`; 65 | 66 | -- count is 2109 67 | select count(*) from store_sales_out; 68 | 69 | -- query on ss_sold_date_sk='0906245' 70 | select count(*) 71 | from store_sales_out 72 | where ss_sold_date_sk='0906245'; 73 | 74 | -- insert data gain 75 | 
insert into store_sales_out 76 | select * from store_sales; 77 | 78 | -- observe 2 snapshots 79 | select * from `store_sales_out$snapshots`; 80 | 81 | -- table now has twice number of rows 82 | select count(*) from store_sales_out; 83 | 84 | -- query as of first snapshot 85 | -- 1568658741390 is some time between 2 snapshots 86 | -- observe 2109 rows 87 | as of '1568658741390' 88 | select count(*) from store_sales_out; 89 | 90 | -- now do an insert overwrite 91 | insert overwrite table store_sales_out 92 | select * from store_sales; 93 | 94 | -- observe 3 snapshots 95 | -- latest snapshot adds x files and deletes 2x files 96 | select * from `store_sales_out$snapshots`; 97 | 98 | -- table now has 2109 rows again 99 | select count(*) from store_sales_out; 100 | 101 | -- query as of second snapshot 102 | -- 1568659495070 is some time between 2 snapshots 103 | -- observe 4218 rows 104 | as of '1568659495070' 105 | select count(*) from store_sales_out; 106 | 107 | -- SnapShot on an insert overwrite of 1 partition: 108 | insert overwrite table store_sales_out partition ( ss_sold_date_sk='0906245' ) 109 | select ss_sold_time_sk,ss_item_sk,ss_customer_sk,ss_cdemo_sk,ss_hdemo_sk,ss_addr_sk,ss_store_sk,ss_promo_sk,ss_quantity,ss_wholesale_cost,ss_list_price,ss_sales_price,ss_ext_sales_price,ss_sold_month,ss_sold_day from store_sales 110 | where ss_sold_date_sk='0906245' ; 111 | 112 | -- observe 4 snapshots 113 | select * from `store_sales_out$snapshots`; 114 | 115 | -- table now has 2109 rows again 116 | select count(*) from store_sales_out; 117 | 118 | -- SnapShot on an insert overwrite of 1 partition, with source predicate: 5 files added, 5 files deleted 119 | insert overwrite table store_sales_out partition ( ss_sold_date_sk='0905245' ) 120 | select ss_sold_time_sk,ss_item_sk,ss_customer_sk,ss_cdemo_sk,ss_hdemo_sk,ss_addr_sk,ss_store_sk,ss_promo_sk,ss_quantity,ss_wholesale_cost,ss_list_price,ss_sales_price,ss_ext_sales_price,ss_sold_month,ss_sold_day from store_sales 121 | where ss_sold_date_sk='0905245' and ss_item_sk < 5000; 122 | 123 | -- observe 5 snapshots 124 | select * from `store_sales_out$snapshots`; 125 | 126 | -- table now has 1877 rows again 127 | select count(*) from store_sales_out; 128 | 129 | -- now a query with ss_item_sk > 5000 on ss_sold_date_sk=0905245 should be a null scan 130 | -- observe no new job listed in Spark UI 131 | select * from store_sales_out 132 | where ss_item_sk > 5000 and ss_sold_date_sk='0905245'; 133 | -------------------------------------------------------------------------------- /docs/iceSnapshots.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbutani/icebergSQL/63c26ae00bb03bb7d05bfbeac4fe6787b019d505/docs/iceSnapshots.png -------------------------------------------------------------------------------- /docs/iceTblCommit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbutani/icebergSQL/63c26ae00bb03bb7d05bfbeac4fe6787b019d505/docs/iceTblCommit.png -------------------------------------------------------------------------------- /docs/iceTblScan.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbutani/icebergSQL/63c26ae00bb03bb7d05bfbeac4fe6787b019d505/docs/iceTblScan.png -------------------------------------------------------------------------------- /docs/insIcePlan.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbutani/icebergSQL/63c26ae00bb03bb7d05bfbeac4fe6787b019d505/docs/insIcePlan.png -------------------------------------------------------------------------------- /docs/insPlan.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbutani/icebergSQL/63c26ae00bb03bb7d05bfbeac4fe6787b019d505/docs/insPlan.png -------------------------------------------------------------------------------- /docs/writePath.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbutani/icebergSQL/63c26ae00bb03bb7d05bfbeac4fe6787b019d505/docs/writePath.png -------------------------------------------------------------------------------- /project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.9") 2 | -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | sbt.version=0.13.18 19 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | // You may use this file to add plugin dependencies for sbt. 
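// The plugins added below provide dependency reporting (sbt-dependency-graph), release
// management and artifact signing (sbt-release, sbt-pgp), native packaging (sbt-native-packager),
// style checking (scalastyle-sbt-plugin) and git integration (sbt-git).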
2 | 3 | resolvers ++= Seq( 4 | "Central" at "https://oss.sonatype.org/content/repositories/releases/" 5 | ) 6 | resolvers += "Typesafe repository" at "http://repo.typesafe.com/typesafe/releases/" 7 | 8 | addSbtPlugin("net.virtual-void" % "sbt-dependency-graph" % "0.9.2") 9 | 10 | addSbtPlugin("com.github.gseitz" % "sbt-release" % "1.0.11") 11 | 12 | addSbtPlugin("com.jsuereth" % "sbt-pgp" % "1.0.0") 13 | 14 | addSbtPlugin("com.typesafe.sbt" % "sbt-native-packager" % "1.3.22") 15 | 16 | addSbtPlugin("org.scalastyle" %% "scalastyle-sbt-plugin" % "1.0.0") 17 | 18 | addSbtPlugin("com.typesafe.sbt" % "sbt-git" % "1.0.0") 19 | -------------------------------------------------------------------------------- /scalastyle-config.xml: -------------------------------------------------------------------------------- 1 | 17 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | Scalastyle standard configuration 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | true 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/execution/datasources/iceberg/PartitioningUtils.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.execution.datasources.iceberg 19 | 20 | import org.apache.hadoop.fs.Path 21 | import org.apache.spark.sql.SparkSession 22 | import org.apache.spark.sql.catalyst.InternalRow 23 | import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec 24 | import org.apache.spark.sql.catalyst.expressions.{And, AttributeReference, Cast, EqualTo, Expression, GenericInternalRow, Literal} 25 | import org.apache.spark.sql.catalyst.util.{CaseInsensitiveMap, DateTimeUtils} 26 | import org.apache.spark.sql.execution.datasources.{PartitionPath, PartitionSpec => SparkPSec, PartitioningUtils => SparkPartitioningUtils} 27 | import org.apache.spark.sql.types.{StringType, StructType} 28 | import org.apache.spark.unsafe.types.UTF8String 29 | 30 | /** 31 | * '''Partitioning Data Structures and concepts''' 32 | * 33 | * 1. ''Partitioning Scheme:'' represents the structure of a Table's partitions. 
Represented 34 | * as a Spark [[StructType]] 35 | * 2. ''Partition Value or Row:'' represents the values of a Table partition. Represented 36 | * as a Spark [[InternalRow]] 37 | * 3. ''Partition Specification:'' represents a set of partitions being acted on in a ''Insert'' 38 | * commad. Capture as a [[TablePartitionSpec]] 39 | * 4. ''Partition Info : '' details about a Table Partition captured as [[PartitionPath]] (this 40 | * name is confusing) 41 | * 5. ''Partition Set :'' represents details on a set of Table Partitions captured as 42 | * [[PartitionSpec]](terrible name) 43 | */ 44 | object PartitioningUtils { 45 | 46 | /** 47 | * A wrapper for the [[SparkPartitioningUtils]] parsePartition function. 48 | * Used to introspect a set of filesystem paths and construct the set of 49 | * [[PartitionPath]] at these paths, and combine them into a [[PartitionSpec]] 50 | * 51 | * @param paths 52 | * @param basePaths 53 | * @param partSchema 54 | * @param sparkSession 55 | * @return 56 | */ 57 | def parsePartitions(paths: Seq[Path], 58 | basePaths: Set[Path], 59 | partSchema: StructType 60 | )(implicit sparkSession: SparkSession) : SparkPSec = { 61 | 62 | val tzId = sparkSession.sessionState.conf.sessionLocalTimeZone 63 | 64 | SparkPartitioningUtils.parsePartitions( 65 | paths, 66 | typeInference = sparkSession.sessionState.conf.partitionColumnTypeInferenceEnabled, 67 | basePaths = basePaths, 68 | Some(partSchema), 69 | sparkSession.sqlContext.conf.caseSensitiveAnalysis, 70 | DateTimeUtils.getTimeZone(tzId)) 71 | } 72 | 73 | /** 74 | * A thin wrapper on `parsePartitions` method. This converts a filesystem path 75 | * into a ''Partition Info'', returning it as a [[PartitionPath]] 76 | * @param p 77 | * @param partSchema 78 | * @param sparkSession 79 | * @return 80 | */ 81 | def parsePath(p : Path, 82 | partSchema: StructType 83 | )(implicit sparkSession: SparkSession) : PartitionPath = { 84 | val sPSpec = parsePartitions(Seq(p), Set.empty, partSchema) 85 | sPSpec.partitions.head 86 | } 87 | 88 | /** 89 | * Convert a fileSystem path into a ''Partition Value'' return it as an [[InternalRow]] 90 | * @param p 91 | * @param partSchema 92 | * @return 93 | */ 94 | def partitionRowFromFullPath(p : String, partSchema : StructType) : InternalRow = { 95 | val pathString = removeFileName(p) 96 | val m = CaseInsensitiveMap(partitionValuesFromFullPath(pathString)) 97 | 98 | assert(m.size == partSchema.size) 99 | 100 | val casts : Array[Cast] = for( 101 | f <- partSchema.fields 102 | ) yield { 103 | val s = m(f.name) 104 | val l = Literal.create(UTF8String.fromString(s), StringType) 105 | Cast(l, f.dataType) 106 | } 107 | val values : Seq[Any] = casts.map(c => c.eval(null)) 108 | new GenericInternalRow(values.toArray) 109 | 110 | } 111 | 112 | /** 113 | * Convert a ''Partition Value'' into a [[Expression][Spark Expression]]. 114 | * The Expression is a conjunct of Equalty conditions. For each partition 115 | * column we have an equality condition that the partition column equals the value in 116 | * the given partition Row/value. 
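 *
 * Illustrative example (the column name and value below come from the bundled test data and
 * are not assumed by this method): for a partition schema `(ss_sold_date_sk string)` and a
 * partition row holding `'0906245'`, the result is the single conjunct
 * `ss_sold_date_sk = '0906245'`; with multiple partition columns the equality conditions
 * are combined with `And`.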
117 | * 118 | * @param partSchema 119 | * @param partRow 120 | * @return 121 | */ 122 | def partitionRowToPredicate(partSchema : StructType, 123 | partRow : InternalRow) : Expression = { 124 | 125 | val exprs = for((f,i) <- partSchema.zipWithIndex) yield { 126 | val attr = AttributeReference(f.name, f.dataType, f.nullable, f.metadata)() 127 | EqualTo(attr, Literal(partRow.get(i, f.dataType))) 128 | } 129 | exprs.reduce(And) 130 | } 131 | 132 | /** 133 | * Parse a filesystem path into a ''Partition Specification:'' and return it as 134 | * [[TablePartitionSpec]]. Look for folder names of the form 'name=value' at the end of the 135 | * path and map them into entries in the [[TablePartitionSpec]] 136 | * 137 | * @param pathString 138 | * @return 139 | */ 140 | def partitionValuesFromFullPath(pathString : String) : TablePartitionSpec = { 141 | var m = Map.empty[String, String] 142 | def add(col : String, value: String) : Unit = 143 | m += (col -> value) 144 | 145 | mapPartitionSpecs(pathString, 146 | (path, col, value) => add(col, value) 147 | ) 148 | m 149 | } 150 | 151 | /** 152 | * Map a [[TablePartitionSpec]] into a [Expression][Spark Expression]]. 153 | * The Expression is a conjunct of Equalty conditions. For each partition 154 | * column we have an equality condition that the partition column equals the value in 155 | * the given [[TablePartitionSpec]] 156 | * 157 | * @param partitionSchema 158 | * @param pSpec 159 | * @return 160 | */ 161 | def expressionFromPartitionSpec(partitionSchema : StructType, 162 | pSpec : CaseInsensitiveMap[String] 163 | ) : Option[Expression] = { 164 | val exprs = for( 165 | attr <- partitionSchema.toAttributes; 166 | v <- pSpec.get(attr.name) 167 | ) yield { 168 | EqualTo(attr, Cast(Literal(UTF8String.fromString(v), StringType), attr.dataType)) 169 | } 170 | 171 | exprs match { 172 | case e if e.size == 0 => None 173 | case e if e.size == 1 => Some(e.head) 174 | case _ => Some(exprs.reduce(And)) 175 | } 176 | } 177 | 178 | private def mapPartitionSpecs(fullPath : String, 179 | op : (String, String, String) => Unit) : Unit = { 180 | var p = new Path(fullPath) 181 | var reachedNonPartName = false 182 | 183 | while(!reachedNonPartName) { 184 | val name = p.getName 185 | partitionSpec(name) match { 186 | case Some((col, value)) => op(name, col,value) 187 | case None => { 188 | reachedNonPartName = true 189 | } 190 | } 191 | p = p.getParent 192 | } 193 | } 194 | 195 | private def partitionSpec(fileName : String) : Option[(String,String)] = { 196 | val equalSignIndex = fileName.indexOf('=') 197 | if (equalSignIndex == -1) { 198 | None 199 | } else { 200 | val columnName = fileName.take(equalSignIndex) 201 | if (columnName.isEmpty) return None 202 | val rawColumnValue = fileName.drop(equalSignIndex + 1) 203 | if (rawColumnValue.nonEmpty) { 204 | Some((columnName, rawColumnValue)) 205 | } else None 206 | } 207 | } 208 | 209 | def removeFileName(pathStr : String): String = { 210 | if (pathStr.endsWith("/")) { 211 | pathStr 212 | } else { 213 | val path = new Path(pathStr) 214 | path.getParent.toString 215 | } 216 | } 217 | 218 | } 219 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/iceberg/Config.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. 
See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.iceberg 19 | 20 | import org.apache.spark.sql.catalyst.catalog.CatalogTable 21 | import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap 22 | 23 | object Config { 24 | 25 | /** 26 | * Used to specify that the system should provide additional management capabilities 27 | * for a table. 28 | */ 29 | val ADD_MANAGEMENT = "addTableManagement" 30 | val DEFAULT_ADD_MANAGEMENT = "false" 31 | 32 | 33 | private def getOption(tblDesc : CatalogTable, 34 | optionName : String) : Option[String] = { 35 | 36 | def get(m : Map[String, String]) : Option[String] = { 37 | m.get(optionName).orElse(CaseInsensitiveMap(m).get(optionName)) 38 | } 39 | 40 | get(tblDesc.storage.properties).orElse(get(tblDesc.properties)) 41 | 42 | } 43 | 44 | def provideManagement(tblDesc : CatalogTable) : Boolean = { 45 | getOption(tblDesc, ADD_MANAGEMENT). 46 | getOrElse(DEFAULT_ADD_MANAGEMENT).toBoolean 47 | } 48 | 49 | def isManaged(tblDesc : CatalogTable) : Boolean = provideManagement(tblDesc) 50 | 51 | 52 | /** 53 | * For tables with `addTableManagement=true` use this option to specify 54 | * column dependencies. This is specified as a comma-separated list of 55 | * column dependences. A 'column dependence' is of the form 56 | * `srcCol=destCol:transformFn`, for example 57 | * `date_col=day_col:extract[2]` where `date_col` is a string in the form `DD-MM-YYYY`. 58 | * Semantically a column dependence implies that the destCol value can be determined 59 | * from a srcCol value; the columns are in a `one-to-one` or `many-to-one` relationship. 60 | * The src and dest columns can be any column (data or partition columns) of the table. 61 | * 62 | * Currently we support [[Transforms][Iceberg Transforms]] as mapping functions; 63 | * they must be a string representation of an [[Transforms][Iceberg Transforms]]. 64 | * So users can relate columns based on `date` or `timestamp` elements, 65 | * based on `truncating` values or `value buckets`. 66 | * 67 | * During a table scan we will attempt to transform a predicate on the `srcCol` 68 | * into a predicate on the `destCol`. For example `date_col='09-12-2019'` will be transformed 69 | * into `day_col='09'` and applied. If the `destCol` is a partition column 70 | * this can lead to partition pruning. For example, if the table is partitioned by 71 | * `day_col`, then from the predicate `date_col='09-12-2019'` the inferred predicate 72 | * `day_col='09'` will cause only partitions for the 9th day of each month to 73 | * be scanned. In case the `destCol` is a data column the inferred predicate can lead 74 | * to datafiles being pruned based on the statistics available on the column.
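 *
 * For example, `docs/basicExample.sql` declares a managed table with:
 * {{{
 *   OPTIONS (
 *     path '/tmp/store_sales_out',
 *     addTableManagement "true",
 *     columnDependencies "ss_sold_date_sk=ss_sold_month:truncate[2],ss_sold_date_sk=ss_sold_day:truncate[4]"
 *   )
 * }}}
 * so a predicate such as `ss_sold_date_sk='0906245'` can additionally be applied as
 * `ss_sold_month='09'` and `ss_sold_day='0906'` (assuming, as in the bundled test data, that
 * the first two and first four characters of `ss_sold_date_sk` encode the month and day values).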
75 | */ 76 | 77 | val COLUMN_DEPENDENCIES = "columnDependencies" 78 | 79 | def columnDependencies(tblDesc : CatalogTable) : Option[String] = { 80 | getOption(tblDesc, COLUMN_DEPENDENCIES) 81 | } 82 | 83 | } 84 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/iceberg/CreateIcebergTable.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.iceberg 19 | 20 | import org.apache.spark.sql.catalyst.catalog.CatalogTable 21 | import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan 22 | import org.apache.spark.sql.execution.SparkPlan 23 | import org.apache.spark.sql.execution.command.{CreateDataSourceTableAsSelectCommand, CreateDataSourceTableCommand, DataWritingCommand, RunnableCommand} 24 | import org.apache.spark.sql.iceberg.table.SparkTablesForCreate 25 | import org.apache.spark.sql.iceberg.utils.{SchemaConversions, _} 26 | import org.apache.spark.sql.{Row, SparkSession} 27 | 28 | trait IcebergTableCreation { 29 | val catalogTable: CatalogTable 30 | 31 | def createIcebergTable(sparkSession: SparkSession) : Unit = { 32 | implicit val ss = currentSparkSession 33 | import scala.collection.JavaConversions._ 34 | val (schema, partSpec) = SchemaConversions.toIcebergSpec(catalogTable) 35 | val iceTables = new SparkTablesForCreate(ss, catalogTable) 36 | val tId = parseIdentifier(catalogTable.qualifiedName, ss) 37 | iceTables.create(schema, partSpec, Map.empty[String,String], catalogTable.qualifiedName) 38 | } 39 | } 40 | 41 | case class CreateIcebergTable(createTblCmd : CreateDataSourceTableCommand) 42 | extends RunnableCommand with IcebergTableCreation { 43 | 44 | val catalogTable = createTblCmd.table 45 | 46 | override def run(sparkSession: SparkSession): Seq[Row] = { 47 | val rows = createTblCmd.run(sparkSession) 48 | createIcebergTable(sparkSession) 49 | rows 50 | } 51 | } 52 | 53 | 54 | case class CreateIcebergTableAsSelect(createTblCmd : CreateDataSourceTableAsSelectCommand) 55 | extends DataWritingCommand with IcebergTableCreation { 56 | 57 | override def query: LogicalPlan = createTblCmd.query 58 | 59 | override def outputColumnNames: Seq[String] = createTblCmd.outputColumnNames 60 | 61 | override def run(sparkSession: SparkSession, child: SparkPlan): Seq[Row] = { 62 | val rows = createTblCmd.run(sparkSession, child) 63 | createIcebergTable(sparkSession) 64 | rows 65 | } 66 | 67 | override val catalogTable: CatalogTable = createTblCmd.table 68 | } -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/iceberg/IceTableScanExec.scala: 
-------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.iceberg 19 | 20 | import com.netflix.iceberg.DataFile 21 | import com.netflix.iceberg.expressions.{ExpressionVisitors, Expression => IceExpression} 22 | import org.apache.spark.rdd.RDD 23 | import org.apache.spark.sql.catalyst.InternalRow 24 | import org.apache.spark.sql.catalyst.catalog.CatalogTable 25 | import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode} 26 | import org.apache.spark.sql.catalyst.expressions.{And, Attribute, Expression, Literal, SortOrder} 27 | import org.apache.spark.sql.catalyst.plans.physical.Partitioning 28 | import org.apache.spark.sql.execution.datasources.PartitionDirectory 29 | import org.apache.spark.sql.execution.{CodegenSupport, FileSourceScanExec, SparkPlan, UnaryExecNode} 30 | import org.apache.spark.sql.iceberg.table.SparkTables 31 | import org.apache.spark.sql.iceberg.utils.DelegationUtils.{DelegatedField, DelegatedMethod0, DelegatedMethod1} 32 | import org.apache.spark.sql.iceberg.utils.{ColumnDependenciesExprVisitor, ExpressionConversions, TableUtils, Transforms} 33 | import com.netflix.iceberg.transforms.{Transform => IceTransform} 34 | import org.apache.spark.sql.iceberg.utils.TypeConversions.TypeMapping 35 | 36 | /** 37 | * Setup as a parent Physical Operator of a [[FileSourceScanExec]]. During execution 38 | * before handing over control to its child [[FileSourceScanExec]] operator it updates its 39 | * `selectedPartitions` member. 40 | * 41 | * This is computed based on the `partitionFilters` and `dataFilters` associated with this scan. 42 | * These are converted to an [[IceExpression]], further [[IceExpression]] are added based on 43 | * `column dependencies` defined for this table. From the current Iceberg snaphot a list of 44 | * [[DataFile]] are computed. Finally the [[FileSourceScanExec]] list of selected partitions 45 | * list is updated to remove files from [[PartitionDirectory]] instances not in this list of 46 | * DataFiles. 
47 | * 48 | * @param child 49 | * @param catalogTable 50 | */ 51 | case class IceTableScanExec(child: FileSourceScanExec, 52 | @transient catalogTable: CatalogTable 53 | ) 54 | extends UnaryExecNode with CodegenSupport { 55 | 56 | @transient private val iceTable = { 57 | implicit val ss = sqlContext.sparkSession 58 | val iceTables = new SparkTables(ss, catalogTable) 59 | val tbId = utils.qualifiedTableIdentifier(catalogTable.identifier, ss) 60 | iceTables.load(tbId.database.getOrElse(null), tbId.table) 61 | } 62 | 63 | lazy val sparkExpressionToPushToIce: Expression = { 64 | val alwaysTrue : Expression = Literal(true) 65 | (child.partitionFilters ++ child.dataFilters).foldLeft(alwaysTrue)(And) 66 | } 67 | 68 | @transient private lazy val icebergFilter: IceExpression = { 69 | val iceExpr = ExpressionConversions.convert(sparkExpressionToPushToIce) 70 | columnDependencies.map { cds => 71 | val exprVisitor = new ColumnDependenciesExprVisitor(iceTable.schema().asStruct(), cds) 72 | ExpressionVisitors.visit(iceExpr, exprVisitor) 73 | }.getOrElse(iceExpr) 74 | } 75 | 76 | @transient private lazy val iceDataFiles : Seq[DataFile] = { 77 | val iceScan = iceTable.newScan(). 78 | useSnapshot(TableUtils.getThreadSnapShotId(iceTable)). 79 | filter(icebergFilter) 80 | import scala.collection.JavaConversions._ 81 | iceScan.planFiles().map(_.file).toSeq 82 | } 83 | 84 | @transient private val selectedPartitionsDelegate = 85 | new DelegatedField[Array[PartitionDirectory], FileSourceScanExec, FileSourceScanExec]( 86 | child, "org$apache$spark$sql$execution$FileSourceScanExec$$selectedPartitions") 87 | 88 | @transient private val doExecute_d = 89 | new DelegatedMethod0[SparkPlan, FileSourceScanExec, RDD[InternalRow]](child, "doExecute") 90 | 91 | @transient private val doProduce_d = 92 | new DelegatedMethod1[CodegenSupport, FileSourceScanExec, String, CodegenContext]( 93 | child, "doProduce") 94 | 95 | 96 | private var updateSelectedPartitionsDone : Boolean = false 97 | 98 | private def updateSelectedPartitions : Unit = { 99 | if (!updateSelectedPartitionsDone) { 100 | child.metadata // trigger construction of child selectedPartitions 101 | 102 | val dataFiles = iceDataFiles 103 | val dFilePaths = dataFiles.map(_.path().toString).toSet 104 | val sParts = selectedPartitionsDelegate.value 105 | 106 | val updtSelParts = sParts.map { sPart => 107 | PartitionDirectory( 108 | sPart.values, 109 | sPart.files.filter(fs => dFilePaths.contains(fs.getPath.toString)) 110 | ) 111 | } 112 | selectedPartitionsDelegate.value = updtSelParts 113 | updateSelectedPartitionsDone = true 114 | } 115 | } 116 | 117 | override def inputRDDs(): Seq[RDD[InternalRow]] = { 118 | updateSelectedPartitions 119 | child.inputRDDs() 120 | } 121 | 122 | override protected def doExecute(): RDD[InternalRow] = { 123 | updateSelectedPartitions 124 | doExecute_d.apply 125 | } 126 | 127 | override def output: Seq[Attribute] = child.output 128 | 129 | override protected def doProduce(ctx: CodegenContext): String = { 130 | updateSelectedPartitions 131 | child.produce(ctx, this) 132 | } 133 | 134 | override def doConsume(ctx: CodegenContext, input: Seq[ExprCode], row: ExprCode): String = { 135 | updateSelectedPartitions 136 | parent.doConsume(ctx, input, row) 137 | } 138 | 139 | override lazy val (outputPartitioning, outputOrdering) : (Partitioning, Seq[SortOrder]) = 140 | (child.outputPartitioning, child.outputOrdering) 141 | 142 | override def doCanonicalize(): IceTableScanExec = { 143 | IceTableScanExec(child.doCanonicalize(), catalogTable) 144 | } 
145 | 146 | private def columnDependencies : Option[Map[String, Map[String, IceTransform[_,_]]]] = { 147 | Config.columnDependencies(catalogTable).flatMap { colDeps => 148 | val typMapping = TypeMapping(catalogTable.schema, iceTable.schema().asStruct()) 149 | val t = Transforms.fromOption(colDeps, typMapping)(sqlContext.sparkSession) 150 | t.right.toOption 151 | } 152 | } 153 | 154 | // for testing only 155 | def showPartitionsScanned : Seq[PartitionDirectory] = { 156 | updateSelectedPartitions 157 | selectedPartitionsDelegate.value 158 | } 159 | 160 | // for testing only 161 | def getIceFilter = icebergFilter 162 | 163 | } 164 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/iceberg/IcebergFileCommitProtocol.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.iceberg 19 | 20 | import com.netflix.iceberg.{DataFile, PartitionSpec} 21 | import org.apache.hadoop.fs.{FileSystem, Path} 22 | import org.apache.hadoop.mapreduce.{JobContext, TaskAttemptContext, TaskAttemptID} 23 | import org.apache.spark.internal.io.FileCommitProtocol 24 | import org.apache.spark.sql.iceberg.utils.{DataFileUtils, _} 25 | import org.apache.spark.sql.types.StructType 26 | 27 | import scala.collection.mutable.{ArrayBuffer, Map => MMap} 28 | 29 | /** 30 | * Provide the following function on top of the 'normal' Commit Protocol. 31 | * Commit actions are simply deferred to the 'designate' except in the following: 32 | * - track files created for each Task in a [[TaskPaths]] instance. This 33 | * tracks the temporary file location and also the location that the 34 | * file will be moved to on a commit. 35 | * - on Task Commit build an Iceberg [[DataFile]] instance. Currently only if the 36 | * file is a parquet file we will also build column level stats. 37 | * - The [[TaskCommitMessage]] we send back has a payload of [[IcebergTaskCommitMessage]], 38 | * which encapsulates the TaskCommitMessage build by the 'designate' and the 39 | * DataFile instances. 40 | * - we ignore ''deleteWithJob'' invocations, as we want to keep historical files around. 41 | * These will be removed via a `clear snapshot` command. 42 | * - on a ''commitJob'' we extract all the [[DataFile]] instances from the 43 | * [[IcebergTaskCommitMessage]] messages and expose a ''addedDataFiles'' list 44 | * which is used by [[IceTableScanExec]] to build the new Iceberg Table Snapshot. 
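 *
 * A rough sketch of the flow (summarizing the methods below):
 * {{{
 *   newTaskTempFile / newTaskTempFileAbsPath -> record TaskPaths(taskPath, finalPath) per task
 *   commitTask  -> build a DataFile per file (with parquet column stats where available) and
 *                  wrap them, with the designate's payload, in an IcebergTaskCommitMessage
 *   commitJob   -> commit the designate's messages and collect all DataFiles into addedDataFiles
 * }}}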
45 | * 46 | * @param designate 47 | * @param path 48 | * @param partitionSchema 49 | * @param icePartSpec 50 | * @param iceSchema 51 | * @param iceFmt 52 | */ 53 | class IcebergFileCommitProtocol(val designate: FileCommitProtocol, 54 | val path: String, 55 | val partitionSchema: StructType, 56 | val icePartSpec: PartitionSpec, 57 | val iceSchema: IceSchema, 58 | val iceFmt: IceFormat 59 | ) 60 | extends FileCommitProtocol with Serializable { 61 | 62 | import org.apache.spark.sql.iceberg.IcebergFileCommitProtocol._ 63 | 64 | @transient var taskFiles = MMap[TaskAttemptID, ArrayBuffer[TaskPaths]]() 65 | @transient val addedDataFiles = ArrayBuffer[DataFile]() 66 | 67 | private def setupTaskFileMap: Unit = { 68 | if (taskFiles == null) { 69 | taskFiles = MMap[TaskAttemptID, ArrayBuffer[TaskPaths]]() 70 | } 71 | } 72 | 73 | override def setupJob(jobContext: JobContext): Unit = 74 | designate.setupJob(jobContext) 75 | 76 | override def commitJob(jobContext: JobContext, 77 | taskCommits: Seq[FileCommitProtocol.TaskCommitMessage] 78 | ): Unit = { 79 | val desgMgs = taskCommits.map(m => 80 | new FileCommitProtocol.TaskCommitMessage( 81 | m.obj.asInstanceOf[IcebergTaskCommitMessage].designatePayload 82 | ) 83 | ) 84 | designate.commitJob(jobContext, desgMgs) 85 | for ( 86 | tasmCmtMsg <- taskCommits 87 | ) { 88 | val iceTaskMsg = tasmCmtMsg.obj.asInstanceOf[IcebergTaskCommitMessage] 89 | addedDataFiles.appendAll(iceTaskMsg.dataFiles) 90 | } 91 | } 92 | 93 | override def abortJob(jobContext: JobContext): Unit = 94 | designate.abortJob(jobContext) 95 | 96 | override def setupTask(taskContext: TaskAttemptContext): Unit = { 97 | designate.setupTask(taskContext) 98 | setupTaskFileMap 99 | taskFiles(taskContext.getTaskAttemptID) = ArrayBuffer.empty[TaskPaths] 100 | } 101 | 102 | override def newTaskTempFile(taskContext: TaskAttemptContext, dir: Option[String], ext: String) 103 | : String = { 104 | val taskfilePath = designate.newTaskTempFile(taskContext, dir, ext) 105 | val fNm = new Path(taskfilePath).getName 106 | val finalPath = dir.map { d => 107 | new Path(new Path(path, d), fNm) 108 | }.getOrElse { 109 | new Path(path, fNm) 110 | } 111 | 112 | setupTaskFileMap 113 | taskFiles(taskContext.getTaskAttemptID).append(TaskPaths(taskfilePath, finalPath)) 114 | taskfilePath 115 | } 116 | 117 | override def newTaskTempFileAbsPath(taskContext: TaskAttemptContext, 118 | absoluteDir: String, ext: String 119 | ): String = { 120 | val taskfilePath = designate.newTaskTempFileAbsPath(taskContext, absoluteDir, ext) 121 | val fNm = new Path(taskfilePath).getName 122 | val finalPath = new Path(absoluteDir, fNm) 123 | taskFiles(taskContext.getTaskAttemptID).append(TaskPaths(taskfilePath, finalPath)) 124 | taskfilePath 125 | } 126 | 127 | override def commitTask(taskContext: TaskAttemptContext 128 | ): FileCommitProtocol.TaskCommitMessage = { 129 | val tPaths = taskFiles(taskContext.getTaskAttemptID) 130 | val dFiles = tPaths.map { 131 | case TaskPaths(taskPath, finalPath) => 132 | DataFileUtils.buildDataFileFormPath( 133 | taskPath, iceSchema, partitionSchema, 134 | icePartSpec, iceFmt, 135 | taskContext.getConfiguration, 136 | finalPath.toString 137 | ) 138 | } 139 | 140 | val dsgMsg = designate.commitTask(taskContext) 141 | new FileCommitProtocol.TaskCommitMessage( 142 | IcebergTaskCommitMessage(dsgMsg.obj, dFiles) 143 | ) 144 | } 145 | 146 | override def abortTask(taskContext: TaskAttemptContext): Unit = 147 | designate.abortTask(taskContext) 148 | 149 | override def deleteWithJob(fs: FileSystem, path: Path, 
recursive: Boolean): Boolean = { 150 | false 151 | } 152 | } 153 | 154 | object IcebergFileCommitProtocol { 155 | val REAL_FILE_COMMIT_PROTOCOL_CLASS = "spark.sql.sources.real.commitProtocolClass" 156 | 157 | case class TaskPaths(taskPath: String, finalPath: Path) 158 | 159 | case class IcebergTaskCommitMessage( 160 | designatePayload: Any, 161 | dataFiles: Seq[DataFile] 162 | ) 163 | 164 | } 165 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/iceberg/InsertIntoIcebergTable.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.iceberg 19 | 20 | import com.netflix.iceberg.{DataFile, PendingUpdate, Snapshot} 21 | import org.apache.hadoop.fs.{FileSystem, Path} 22 | import org.apache.spark.internal.io.FileCommitProtocol 23 | import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec 24 | import org.apache.spark.sql.catalyst.catalog.{BucketSpec, CatalogTable} 25 | import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression} 26 | import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan 27 | import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap 28 | import org.apache.spark.sql.execution.SparkPlan 29 | import org.apache.spark.sql.execution.command.{AlterTableAddPartitionCommand, AlterTableDropPartitionCommand, CommandUtils, DataWritingCommand} 30 | import org.apache.spark.sql.execution.datasources._ 31 | import org.apache.spark.sql.execution.datasources.iceberg.{PartitioningUtils => IcePartitioningUtils} 32 | import org.apache.spark.sql.iceberg.table.SparkTables 33 | import org.apache.spark.sql.iceberg.utils.{DataFileUtils, ExpressionConversions, IceTable} 34 | import org.apache.spark.sql.internal.SQLConf.PartitionOverwriteMode 35 | import org.apache.spark.sql.util.SchemaUtils 36 | import org.apache.spark.sql.{AnalysisException, Row, SaveMode, SparkSession} 37 | 38 | /** 39 | * A drop-in replacement for [[InsertIntoHadoopFsRelationCommand]] setup by the 40 | * [[org.apache.spark.sql.iceberg.planning.IcebergTableWriteRule]]. By and large 41 | * follows the same execution flow as [[InsertIntoHadoopFsRelationCommand]] with the 42 | * following behavior overrides. 43 | * 44 | * - The write must be on a [[CatalogTable]]. So `catalogTable` parameter is not optional. 45 | * - Since this is a `iceberg managed table` we load the [[IceTable]] metadata for this table. 
46 | * - `initialMatchingPartitions` is computed from the [[IceTable]] metadata; see method 47 | * `matchIceDataFiles` 48 | * - since data files must be managed by iceberg, we require that ''custom partition locations'' 49 | * are not configured for this table. 50 | * - we set up an [[IcebergFileCommitProtocol]] that wraps the underlying [[FileCommitProtocol]]. 51 | * The [[IcebergFileCommitProtocol]] mostly defers to the underlying commitProtocol instance; 52 | * in the process it ensures iceberg [[DataFile]] instances are created for new files on 53 | * task commit which are then delivered to the Driver [[IcebergFileCommitProtocol]] instance 54 | * via [[TaskCommitMessage]]. 55 | * - The underlying [[FileCommitProtocol]] is set up with `dynamicPartitionOverwrite` mode 56 | * set to false. Since [[IceTable]] metadata is used by scan operations to compute which 57 | * files to scan, we don't need the all-or-nothing replacement of files in a 58 | * partition that the [[FileCommitProtocol]] would otherwise perform for dynamic partition mode. 59 | * - in `dynamicPartitionOverwrite` mode we don't clear the specified source partitions, because 60 | * we want to keep the current files so that queries against older snapshots can still execute. 61 | * - once the job finishes, the Catalog is updated with 'new' and 'deleted' partitions just as it is in 62 | * a regular [[InsertIntoHadoopFsRelationCommand]] 63 | * - then, based on the 'initial set' of [[DataFile]]s and the set of [[DataFile]]s created by tasks 64 | * of this job, a new iceberg [[Snapshot]] is created. 65 | * - finally, cache invalidation and stats update actions happen just like in a regular 66 | * [[InsertIntoHadoopFsRelationCommand]] 67 | * 68 | * @param outputPath 69 | * @param staticPartitions 70 | * @param ifPartitionNotExists 71 | * @param partitionColumns 72 | * @param bucketSpec 73 | * @param fileFormat 74 | * @param options 75 | * @param query 76 | * @param mode 77 | * @param catalogTable 78 | * @param fileIndex 79 | * @param outputColumnNames 80 | */ 81 | case class InsertIntoIcebergTable(outputPath: Path, 82 | staticPartitions: TablePartitionSpec, 83 | ifPartitionNotExists: Boolean, 84 | partitionColumns: Seq[Attribute], 85 | bucketSpec: Option[BucketSpec], 86 | fileFormat: FileFormat, 87 | options: Map[String, String], 88 | query: LogicalPlan, 89 | mode: SaveMode, 90 | catalogTable: CatalogTable, 91 | fileIndex: Option[FileIndex], 92 | outputColumnNames: Seq[String] 93 | ) extends DataWritingCommand { 94 | 95 | val partitionSchema = catalogTable.partitionSchema 96 | 97 | /** 98 | * Construct an iceberg [[com.netflix.iceberg.TableScan]], optionally applying a filter based on 99 | * the `staticPartitions` provided on this invocation. Execute the scan to get the matching 100 | * [[DataFile]]s, and build a set of [[TablePartitionSpec]] from those data files. 101 | * 102 | * @param icebergTable 103 | * @return 104 | */ 105 | private def matchIceDataFiles(icebergTable :IceTable) : 106 | (Seq[DataFile], Set[TablePartitionSpec]) = { 107 | val spkPartExpression : Option[Expression] = IcePartitioningUtils.
108 | expressionFromPartitionSpec(partitionSchema,CaseInsensitiveMap(staticPartitions)) 109 | val iceExpr = spkPartExpression.flatMap(ExpressionConversions.convertStrict(_)) 110 | var iceScan = icebergTable.newScan() 111 | 112 | if (iceExpr.isDefined) { 113 | iceScan = iceScan.filter(iceExpr.get) 114 | } 115 | 116 | import scala.collection.JavaConversions._ 117 | val dfs = iceScan.planFiles().map(_.file).toSeq 118 | val matchingPSepc = (dfs.map(df => DataFileUtils.sparkPartitionMap(df))).toSet 119 | (dfs, matchingPSepc) 120 | } 121 | 122 | private def checkNoCustomPartitionLocations(fs: FileSystem, 123 | qualifiedOutputPath: Path 124 | )(implicit sparkSession: SparkSession) : Unit = { 125 | 126 | val customLocPart = 127 | sparkSession.sessionState.catalog.listPartitions(catalogTable.identifier).find { p => 128 | val defaultLocation = qualifiedOutputPath.suffix( 129 | "/" + PartitioningUtils.getPathFragment(p.spec, partitionSchema)).toString 130 | val catalogLocation = new Path(p.location).makeQualified( 131 | fs.getUri, fs.getWorkingDirectory).toString 132 | catalogLocation != defaultLocation 133 | } 134 | 135 | if (customLocPart.isDefined) { 136 | throw new AnalysisException( 137 | s"Cannot have custom partition locations for a managed table" 138 | ) 139 | } 140 | } 141 | 142 | private def createSnapShot(iceTable : IceTable, 143 | delFiles : Seq[DataFile], 144 | addFiles : Seq[DataFile] 145 | ) : Unit = { 146 | 147 | val hasDelFiles = delFiles.nonEmpty 148 | val hasAddFiles = addFiles.nonEmpty 149 | 150 | import scala.collection.JavaConversions._ 151 | var updt : PendingUpdate[Snapshot] = null 152 | (hasDelFiles, hasAddFiles) match { 153 | case (true, true) => { 154 | val r = iceTable.newRewrite() 155 | val ds = delFiles.toSet 156 | val as = addFiles.toSet 157 | updt = r.rewriteFiles(ds, as) 158 | } 159 | case (false, true) => { 160 | var r = iceTable.newAppend() 161 | for(f <- addFiles) { 162 | r =r .appendFile(f) 163 | } 164 | updt = r 165 | } 166 | case (true, false) => { 167 | var r = iceTable.newDelete() 168 | for(f <- delFiles) { 169 | r = r.deleteFile(f) 170 | } 171 | updt = r 172 | } 173 | case (false, false) => () 174 | } 175 | 176 | if (updt != null) { 177 | updt.commit() 178 | } 179 | } 180 | 181 | private def _run(child: SparkPlan, 182 | iceTable : IceTable 183 | )(implicit sparkSession: SparkSession): Unit = { 184 | 185 | // Most formats don't do well with duplicate columns, so lets not allow that 186 | SchemaUtils.checkColumnNameDuplication( 187 | outputColumnNames, 188 | s"when inserting into $outputPath", 189 | sparkSession.sessionState.conf.caseSensitiveAnalysis) 190 | 191 | val hadoopConf = sparkSession.sessionState.newHadoopConfWithOptions(options) 192 | val fs = outputPath.getFileSystem(hadoopConf) 193 | val qualifiedOutputPath = outputPath.makeQualified(fs.getUri, fs.getWorkingDirectory) 194 | 195 | val partitionsTrackedByCatalog = sparkSession.sessionState.conf.manageFilesourcePartitions && 196 | catalogTable.partitionColumnNames.nonEmpty && 197 | catalogTable.tracksPartitionsInCatalog 198 | 199 | val (initialMatchingDataFiles: Seq[DataFile], 200 | initialMatchingPartitions: Set[TablePartitionSpec]) = matchIceDataFiles(iceTable) 201 | 202 | checkNoCustomPartitionLocations(fs, qualifiedOutputPath) 203 | 204 | val pathExists = fs.exists(qualifiedOutputPath) 205 | val parameters = CaseInsensitiveMap(options) 206 | val partitionOverwriteMode = parameters.get("partitionOverwriteMode") 207 | // scalastyle:off caselocale 208 | .map(mode => 
PartitionOverwriteMode.withName(mode.toUpperCase)) 209 | // scalastyle:on caselocale 210 | .getOrElse(sparkSession.sessionState.conf.partitionOverwriteMode) 211 | 212 | val enableDynamicOverwrite = partitionOverwriteMode == PartitionOverwriteMode.DYNAMIC 213 | // This config only makes sense when we are overwriting a partitioned dataset with dynamic 214 | // partition columns. 215 | val dynamicPartitionOverwrite = enableDynamicOverwrite && mode == SaveMode.Overwrite && 216 | staticPartitions.size < partitionColumns.length 217 | 218 | val committer : IcebergFileCommitProtocol = { 219 | val designate = FileCommitProtocol.instantiate( 220 | sparkSession.sessionState.conf.fileCommitProtocolClass, 221 | jobId = java.util.UUID.randomUUID().toString, 222 | outputPath = outputPath.toString, 223 | dynamicPartitionOverwrite = false 224 | ) 225 | 226 | new IcebergFileCommitProtocol(designate, 227 | outputPath.toString, 228 | partitionSchema, 229 | iceTable.spec(), 230 | iceTable.schema(), 231 | utils.iceFormat(fileFormat) 232 | ) 233 | } 234 | 235 | val doInsertion = (mode, pathExists) match { 236 | case (SaveMode.ErrorIfExists, true) => 237 | throw new AnalysisException(s"path $qualifiedOutputPath already exists.") 238 | case (SaveMode.Overwrite, true) => 239 | if (ifPartitionNotExists && initialMatchingDataFiles.nonEmpty) { 240 | false 241 | } else { 242 | true 243 | } 244 | case (SaveMode.Append, _) | (SaveMode.Overwrite, _) | (SaveMode.ErrorIfExists, false) => 245 | true 246 | case (SaveMode.Ignore, exists) => 247 | !exists 248 | case (s, exists) => 249 | throw new IllegalStateException(s"unsupported save mode $s ($exists)") 250 | } 251 | 252 | if (doInsertion) { 253 | 254 | def refreshUpdatedPartitions(updatedPartitionPaths: Set[String]): Unit = { 255 | if (partitionsTrackedByCatalog) { 256 | val updatedPartitions = updatedPartitionPaths.map(PartitioningUtils.parsePathFragment) 257 | val newPartitions = updatedPartitions -- initialMatchingPartitions 258 | if (newPartitions.nonEmpty) { 259 | AlterTableAddPartitionCommand( 260 | catalogTable.identifier, newPartitions.toSeq.map(p => (p, None)), 261 | ifNotExists = true).run(sparkSession) 262 | } 263 | // For dynamic partition overwrite, we never remove partitions but only update existing 264 | // ones. 
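        // For a plain (non-dynamic) Overwrite, partitions that received no new data are
        // dropped from the metastore below; the data files are left in place
        // (retainData = true), so older Iceberg snapshots remain queryable.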
265 | if (mode == SaveMode.Overwrite && !dynamicPartitionOverwrite) { 266 | val deletedPartitions = initialMatchingPartitions.toSet -- updatedPartitions 267 | if (deletedPartitions.nonEmpty) { 268 | AlterTableDropPartitionCommand( 269 | catalogTable.identifier, deletedPartitions.toSeq, 270 | ifExists = true, purge = false, 271 | retainData = true /* already deleted */).run(sparkSession) 272 | } 273 | } 274 | } 275 | } 276 | 277 | val updatedPartitionPaths = 278 | FileFormatWriter.write( 279 | sparkSession = sparkSession, 280 | plan = child, 281 | fileFormat = fileFormat, 282 | committer = committer, 283 | outputSpec = FileFormatWriter.OutputSpec( 284 | qualifiedOutputPath.toString, Map.empty, outputColumns), 285 | hadoopConf = hadoopConf, 286 | partitionColumns = partitionColumns, 287 | bucketSpec = bucketSpec, 288 | statsTrackers = Seq(basicWriteJobStatsTracker(hadoopConf)), 289 | options = options) 290 | 291 | createSnapShot(iceTable, 292 | if (mode == SaveMode.Overwrite) initialMatchingDataFiles else Seq.empty, 293 | committer.addedDataFiles 294 | ) 295 | 296 | // update metastore partition metadata 297 | if (updatedPartitionPaths.isEmpty && staticPartitions.nonEmpty 298 | && partitionColumns.length == staticPartitions.size) { 299 | // Avoid empty static partition can't loaded to datasource table. 300 | val staticPathFragment = 301 | PartitioningUtils.getPathFragment(staticPartitions, partitionColumns) 302 | refreshUpdatedPartitions(Set(staticPathFragment)) 303 | } else { 304 | refreshUpdatedPartitions(updatedPartitionPaths) 305 | } 306 | 307 | // refresh cached files in FileIndex 308 | fileIndex.foreach(_.refresh()) 309 | // refresh data cache if table is cached 310 | sparkSession.catalog.refreshByPath(outputPath.toString) 311 | 312 | CommandUtils.updateTableStats(sparkSession, catalogTable) 313 | 314 | } else { 315 | logInfo("Skipping insertion into a relation that already exists.") 316 | } 317 | 318 | } 319 | 320 | override def run(sparkSession: SparkSession, child: SparkPlan): Seq[Row] = { 321 | 322 | implicit val ss = sparkSession 323 | val iceTables = new SparkTables(ss, catalogTable) 324 | val tbId = utils.qualifiedTableIdentifier(catalogTable.identifier, ss) 325 | val iceTable = iceTables.load(tbId.database.getOrElse(null), tbId.table) 326 | 327 | _run(child, iceTable) 328 | Seq.empty[Row] 329 | } 330 | } 331 | 332 | object InsertIntoIcebergTable { 333 | def apply(ihr: InsertIntoHadoopFsRelationCommand, 334 | catalogTable: CatalogTable 335 | ) : InsertIntoIcebergTable = { 336 | InsertIntoIcebergTable( 337 | ihr.outputPath, 338 | ihr.staticPartitions, 339 | ihr.ifPartitionNotExists, 340 | ihr.partitionColumns, 341 | ihr.bucketSpec, 342 | ihr.fileFormat, 343 | ihr.options, 344 | ihr.query, 345 | ihr.mode, 346 | catalogTable, 347 | ihr.fileIndex, 348 | ihr.outputColumnNames 349 | ) 350 | } 351 | } -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/iceberg/parsing/AbstractSparkSQLParser.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.iceberg.parsing 19 | 20 | import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan 21 | 22 | import scala.util.parsing.combinator.PackratParsers 23 | import scala.util.parsing.combinator.lexical.StdLexical 24 | import scala.util.parsing.combinator.syntactical.StandardTokenParsers 25 | import scala.util.parsing.input.CharArrayReader.EofCh 26 | import scala.language.implicitConversions 27 | 28 | private[parsing] abstract class AbstractSparkSQLParser 29 | extends StandardTokenParsers with PackratParsers { 30 | 31 | def parse(input: String): LogicalPlan = synchronized { 32 | // Initialize the Keywords. 33 | initLexical 34 | phrase(start)(new lexical.Scanner(input)) match { 35 | case Success(plan, _) => plan 36 | case failureOrError => sys.error(failureOrError.toString) 37 | } 38 | } 39 | /* One time initialization of lexical.This avoid reinitialization of lexical in parse method */ 40 | protected lazy val initLexical: Unit = lexical.initialize(reservedWords) 41 | 42 | protected case class Keyword(str: String) { 43 | def normalize: String = lexical.normalizeKeyword(str) 44 | def parser: Parser[String] = normalize 45 | } 46 | 47 | protected implicit def asParser(k: Keyword): Parser[String] = k.parser 48 | 49 | // By default, use Reflection to find the reserved words defined in the sub class. 50 | // NOTICE, Since the Keyword properties defined by sub class, we couldn't call this 51 | // method during the parent class instantiation, because the sub class instance 52 | // isn't created yet. 53 | protected lazy val reservedWords: Seq[String] = 54 | this 55 | .getClass 56 | .getMethods 57 | .filter(_.getReturnType == classOf[Keyword]) 58 | .map(_.invoke(this).asInstanceOf[Keyword].normalize) 59 | 60 | // Set the keywords as empty by default, will change that later. 
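  // For example, a concrete subclass such as IceParser (later in this module) declares
  // its reserved words as Keyword vals and wires them into its grammar:
  //
  //   protected val SELECT = Keyword("SELECT")
  //   protected val FROM = Keyword("FROM")
  //   protected override lazy val start: Parser[LogicalPlan] =
  //     selectSnapshotViewStar | selectSnapshotViewProject | asOfSelect
  //
  // reservedWords (above) picks these vals up via reflection when initLexical runs.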
61 | override val lexical = new SqlLexical 62 | 63 | protected def start: Parser[LogicalPlan] 64 | 65 | // Returns the whole input string 66 | protected lazy val wholeInput: Parser[String] = new Parser[String] { 67 | def apply(in: Input): ParseResult[String] = 68 | Success(in.source.toString, in.drop(in.source.length())) 69 | } 70 | 71 | // Returns the rest of the input string that are not parsed yet 72 | protected lazy val restInput: Parser[String] = new Parser[String] { 73 | def apply(in: Input): ParseResult[String] = 74 | Success( 75 | in.source.subSequence(in.offset, in.source.length()).toString, 76 | in.drop(in.source.length())) 77 | } 78 | } 79 | 80 | class SqlLexical extends StdLexical { 81 | case class DecimalLit(chars: String) extends Token { 82 | override def toString: String = chars 83 | } 84 | 85 | /* This is a work around to support the lazy setting */ 86 | def initialize(keywords: Seq[String]): Unit = { 87 | reserved.clear() 88 | reserved ++= keywords 89 | } 90 | 91 | /* Normal the keyword string */ 92 | def normalizeKeyword(str: String): String = str.toLowerCase 93 | 94 | delimiters += ( 95 | "@", "*", "+", "-", "<", "=", "<>", "!=", "<=", ">=", ">", "/", "(", ")", 96 | ",", ";", "%", "{", "}", ":", "[", "]", ".", "&", "|", "^", "~", "<=>" 97 | ) 98 | 99 | protected override def processIdent(name: String) = { 100 | val token = normalizeKeyword(name) 101 | if (reserved contains token) Keyword(token) else Identifier(name) 102 | } 103 | 104 | override lazy val token: Parser[Token] = 105 | ( rep1(digit) ~ scientificNotation ^^ { case i ~ s => DecimalLit(i.mkString + s) } 106 | | '.' ~> (rep1(digit) ~ scientificNotation) ^^ 107 | { case i ~ s => DecimalLit("0." + i.mkString + s) } 108 | | rep1(digit) ~ ('.' ~> digit.*) ~ scientificNotation ^^ 109 | { case i1 ~ i2 ~ s => DecimalLit(i1.mkString + "." + i2.mkString + s) } 110 | | digit.* ~ identChar ~ (identChar | digit).* ^^ 111 | { case first ~ middle ~ rest => processIdent((first ++ (middle :: rest)).mkString) } 112 | | rep1(digit) ~ ('.' ~> digit.*).? ^^ { 113 | case i ~ None => NumericLit(i.mkString) 114 | case i ~ Some(d) => DecimalLit(i.mkString + "." + d.mkString) 115 | } 116 | | '\'' ~> chrExcept('\'', '\n', EofCh).* <~ '\'' ^^ 117 | { case chars => StringLit(chars mkString "") } 118 | | '"' ~> chrExcept('"', '\n', EofCh).* <~ '"' ^^ 119 | { case chars => StringLit(chars mkString "") } 120 | | '`' ~> chrExcept('`', '\n', EofCh).* <~ '`' ^^ 121 | { case chars => Identifier(chars mkString "") } 122 | | EofCh ^^^ EOF 123 | | '\'' ~> failure("unclosed string literal") 124 | | '"' ~> failure("unclosed string literal") 125 | | delim 126 | | failure("illegal character") 127 | ) 128 | 129 | override def identChar: Parser[Elem] = letter | elem('_') 130 | 131 | private lazy val scientificNotation: Parser[String] = 132 | (elem('e') | elem('E')) ~> (elem('+') | elem('-')).? 
~ rep1(digit) ^^ { 133 | case s ~ rest => "e" + s.mkString + rest.mkString 134 | } 135 | 136 | override def whitespace: Parser[Any] = 137 | ( whitespaceChar 138 | | '/' ~ '*' ~ comment 139 | | '/' ~ '/' ~ chrExcept(EofCh, '\n').* 140 | | '#' ~ chrExcept(EofCh, '\n').* 141 | | '-' ~ '-' ~ chrExcept(EofCh, '\n').* 142 | | '/' ~ '*' ~ failure("unclosed comment") 143 | ).* 144 | } 145 | 146 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/iceberg/parsing/IceParser.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.iceberg.parsing 19 | 20 | import org.apache.spark.sql.SparkSession 21 | import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute 22 | import org.apache.spark.sql.catalyst.expressions.Expression 23 | import org.apache.spark.sql.catalyst.{FunctionIdentifier, TableIdentifier} 24 | import org.apache.spark.sql.catalyst.parser.{ParseException, ParserInterface} 25 | import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project} 26 | import org.apache.spark.sql.iceberg.utils 27 | import org.apache.spark.sql.iceberg.utils.TableUtils 28 | import org.apache.spark.sql.iceberg.utils.TableUtils.SNAPSHOTSVIEW_SUFFIX 29 | import org.apache.spark.sql.types.{DataType, StructType} 30 | 31 | import scala.util.Try 32 | 33 | class SparkIceParser(baseParser : ParserInterface) extends ParserInterface { 34 | 35 | val iceParser = new IceParser(baseParser) 36 | 37 | override def parsePlan(sqlText: String): LogicalPlan = { 38 | 39 | val splParsedPlan = Try { 40 | iceParser.parse2(sqlText) 41 | }.getOrElse(iceParser.Failure("Not valid ice extension", null)) 42 | 43 | if (splParsedPlan.successful ) { 44 | splParsedPlan.get 45 | } else { 46 | try { 47 | baseParser.parsePlan(sqlText) 48 | } catch { 49 | case pe : ParseException => { 50 | val splFailureDetails = splParsedPlan.asInstanceOf[IceParser#NoSuccess].msg 51 | throw new ParseException(pe.command, 52 | pe.message + s"\nIce parse attempt message: $splFailureDetails", 53 | pe.start, 54 | pe.stop 55 | ) 56 | } 57 | } 58 | } 59 | } 60 | 61 | def parseExpression(sqlText: String): Expression = 62 | baseParser.parseExpression(sqlText) 63 | 64 | def parseTableIdentifier(sqlText: String): TableIdentifier = 65 | baseParser.parseTableIdentifier(sqlText) 66 | 67 | override def parseFunctionIdentifier(sqlText: String): FunctionIdentifier = 68 | baseParser.parseFunctionIdentifier(sqlText) 69 | 70 | override def parseTableSchema(sqlText: String): StructType = 71 | baseParser.parseTableSchema(sqlText) 72 | 73 | override def parseDataType(sqlText: String): DataType = 74 | 
baseParser.parseDataType(sqlText) 75 | } 76 | 77 | 78 | class IceParser(val baseParser : ParserInterface) extends AbstractSparkSQLParser { 79 | 80 | def sparkSession = SparkSession.getActiveSession.get 81 | 82 | def parse2(input: String): ParseResult[LogicalPlan] = synchronized { 83 | // Initialize the Keywords. 84 | initLexical 85 | phrase(start)(new lexical.Scanner(input)) 86 | } 87 | 88 | protected override lazy val start: Parser[LogicalPlan] = 89 | selectSnapshotViewStar | selectSnapshotViewProject | asOfSelect 90 | 91 | private lazy val selectSnapshotViewStar : Parser[LogicalPlan] = 92 | (SELECT ~ STAR ~ FROM) ~> qualifiedId ^^ { 93 | case tblNm if tblNm.endsWith(TableUtils.SNAPSHOTSVIEW_SUFFIX) => 94 | TableUtils.snapShotsLocalRelation( 95 | tblNm.substring(0, tblNm.length - SNAPSHOTSVIEW_SUFFIX.length) 96 | )(sparkSession) 97 | } 98 | 99 | private lazy val selectSnapshotViewProject : Parser[LogicalPlan] = 100 | (SELECT ~> qualifiedCols) ~ (FROM ~> qualifiedId) ^^ { 101 | case qCols ~ tblNm if tblNm.endsWith(TableUtils.SNAPSHOTSVIEW_SUFFIX) => 102 | val sRel = TableUtils.snapShotsLocalRelation( 103 | tblNm.substring(0, tblNm.length - SNAPSHOTSVIEW_SUFFIX.length) 104 | )(sparkSession) 105 | Project(qCols.map(UnresolvedAttribute(_)), sRel) 106 | } 107 | 108 | private lazy val asOfSelect : Parser[LogicalPlan] = 109 | (AS ~ OF) ~> stringLit ~ restInput ^^ { 110 | case asOfTime ~ query => { 111 | TableUtils.setThreadSnapShotMillis(utils.convertToEpoch(asOfTime)) 112 | baseParser.parsePlan(query) 113 | } 114 | } 115 | 116 | private lazy val qualifiedCols : Parser[Seq[String]] = 117 | repsep(qualifiedCol, ",") 118 | 119 | private lazy val qualifiedCol : Parser[String] = 120 | (ident ~ opt("." ~> ident) ~ opt("." ~> ident)) ^^ { 121 | case c ~ None ~ None => c 122 | case t ~ Some(c) ~ None => s"$t.$c" 123 | case d ~ Some(t) ~ Some(c) => s"$d.$t.$c" 124 | } 125 | 126 | private lazy val qualifiedId : Parser[String] = 127 | (ident ~ ("." ~> ident).?) ^^ { 128 | case ~(n, None) => n 129 | case ~(q, Some(n)) => s"$q.$n" 130 | } 131 | 132 | protected val SELECT = Keyword("SELECT") 133 | protected val STAR = Keyword("*") 134 | protected val FROM = Keyword("FROM") 135 | protected val AS = Keyword("AS") 136 | protected val OF = Keyword("OF") 137 | 138 | } 139 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/iceberg/planning/CreateTableRules.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.apache.spark.sql.iceberg.planning 19 | 20 | import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan 21 | import org.apache.spark.sql.catalyst.rules.Rule 22 | import org.apache.spark.sql.execution.command.{CreateDataSourceTableAsSelectCommand, CreateDataSourceTableCommand} 23 | import org.apache.spark.sql.iceberg.Config.provideManagement 24 | import org.apache.spark.sql.iceberg.{CreateIcebergTable, CreateIcebergTableAsSelect} 25 | import org.apache.spark.sql.{SaveMode, SparkSession} 26 | 27 | import scala.collection.mutable.ArrayBuffer 28 | 29 | object CreateTableCheck extends Rule[LogicalPlan] { 30 | 31 | import IcebergTableValidationChecks.validate 32 | 33 | private def checkPlan(plan : LogicalPlan) : Unit = plan match { 34 | case CreateDataSourceTableCommand(table, ignoreIfExists) if provideManagement(table) => { 35 | val errs = ArrayBuffer[String]() 36 | if (ignoreIfExists) { 37 | errs += "managed table supported only when 'ignoreIfExists' option is false" 38 | } 39 | validate("create", table, errs) 40 | 41 | } 42 | case CreateDataSourceTableAsSelectCommand(table, mode, _, _) if provideManagement(table) => { 43 | val errs = ArrayBuffer[String]() 44 | if (mode != SaveMode.ErrorIfExists) { 45 | errs += "managed table supported only for SaveMode = 'ErrorIfExists'" 46 | } 47 | validate("create as", table, errs) 48 | } 49 | case _ => () 50 | } 51 | 52 | override def apply(plan: LogicalPlan): LogicalPlan = { 53 | checkPlan(plan) 54 | plan 55 | } 56 | 57 | } 58 | 59 | case class CreateIcebergTableRule(ss: SparkSession) extends Rule[LogicalPlan] { 60 | 61 | override def apply(plan: LogicalPlan): LogicalPlan = plan transformUp { 62 | case ct@CreateDataSourceTableCommand(table, ignoreIfExists) if provideManagement(table) => { 63 | CreateIcebergTable(ct) 64 | } 65 | case ct@CreateDataSourceTableAsSelectCommand(table, mode, _, _) if provideManagement(table) => { 66 | CreateIcebergTableAsSelect(ct) 67 | } 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/iceberg/planning/IcebergScanStrategy.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License.
16 | */ 17 | 18 | package org.apache.spark.sql.iceberg.planning 19 | 20 | import org.apache.spark.internal.Logging 21 | import org.apache.spark.sql.{SparkSession, Strategy} 22 | import org.apache.spark.sql.catalyst.planning.PhysicalOperation 23 | import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan 24 | import org.apache.spark.sql.execution.{FileSourceScanExec, SparkPlan} 25 | import org.apache.spark.sql.execution.datasources.{FileSourceStrategy, HadoopFsRelation, LogicalRelation} 26 | import org.apache.spark.sql.iceberg.Config.isManaged 27 | import org.apache.spark.sql.iceberg.IceTableScanExec 28 | 29 | case class IcebergScanStrategy(ss: SparkSession) extends Strategy with Logging { 30 | 31 | def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { 32 | case PhysicalOperation(projects, filters, 33 | l @ LogicalRelation(fsRelation: HadoopFsRelation, _, table, _)) 34 | if table.isDefined && isManaged(table.get) => { 35 | FileSourceStrategy(plan).map { plan => 36 | plan.transformUp { 37 | case fsScan: FileSourceScanExec => { 38 | IceTableScanExec(fsScan, table.get) 39 | } 40 | } 41 | } 42 | } 43 | case _ => Nil 44 | } 45 | 46 | } 47 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/iceberg/planning/IcebergTableValidationChecks.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.apache.spark.sql.iceberg.planning 19 | 20 | import org.apache.spark.sql.AnalysisException 21 | import org.apache.spark.sql.catalyst.catalog.CatalogTable 22 | import org.apache.spark.sql.iceberg.Config.columnDependencies 23 | import org.apache.spark.sql.iceberg.utils.{Transforms, TypeConversions, currentSparkSession} 24 | 25 | import scala.collection.mutable.ArrayBuffer 26 | 27 | object IcebergTableValidationChecks { 28 | 29 | private def validationRules(table : CatalogTable, 30 | errs : ArrayBuffer[String]) : Unit = { 31 | val typMapping = TypeConversions.convertStructType(table.schema) 32 | 33 | if (table.bucketSpec.isDefined) { 34 | errs += "not support for bucketed table" 35 | } 36 | 37 | if (table.partitionColumnNames.isEmpty) { 38 | errs += "no support for non partitioned table" 39 | } 40 | 41 | for (partMappingStr <- columnDependencies(table)) { 42 | Transforms.fromOption(partMappingStr, typMapping)(currentSparkSession) match { 43 | case Left(tErrs) => errs ++= tErrs 44 | case _ => () 45 | } 46 | } 47 | } 48 | 49 | def validate(action : String, 50 | table : CatalogTable, 51 | errs : ArrayBuffer[String] = ArrayBuffer[String]()) : Unit = { 52 | validationRules(table, errs) 53 | 54 | if (errs.nonEmpty) { 55 | throw new AnalysisException( 56 | s"""Cannot ${action} ${table.qualifiedName} as a managed table: 57 | |${errs.mkString("\n\t", "\n\t", "\n")})""".stripMargin 58 | ) 59 | } 60 | } 61 | 62 | 63 | } 64 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/iceberg/planning/IcebergTableWriteRule.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.apache.spark.sql.iceberg.planning 19 | 20 | import org.apache.spark.sql.SparkSession 21 | import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan 22 | import org.apache.spark.sql.catalyst.rules.Rule 23 | import org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand 24 | import org.apache.spark.sql.iceberg.{Config, InsertIntoIcebergTable} 25 | 26 | case class IcebergTableWriteRule(ss: SparkSession) extends Rule[LogicalPlan] { 27 | 28 | import Config._ 29 | 30 | override def apply(plan: LogicalPlan): LogicalPlan = plan transformUp { 31 | case ihr: InsertIntoHadoopFsRelationCommand 32 | if ihr.catalogTable.isDefined && isManaged(ihr.catalogTable.get) 33 | => { 34 | val catalogTable = { 35 | val ct = ihr.catalogTable.get 36 | IcebergTableValidationChecks.validate("insert into", ct) 37 | ct 38 | } 39 | InsertIntoIcebergTable(ihr, catalogTable) 40 | } 41 | } 42 | } -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/iceberg/planning/SparkSessionExtensions.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.iceberg.planning 19 | 20 | import org.apache.spark.sql.iceberg.parsing.SparkIceParser 21 | import org.apache.spark.sql.{SparkSessionExtensions => ApacheSparkSessionExtensions} 22 | 23 | /** 24 | * Include this class in ''spark.sql.extensions'' for these extensions to take effect. 25 | */ 26 | class SparkSessionExtensions extends Function1[ApacheSparkSessionExtensions, Unit] { 27 | 28 | override def apply(spkExtensions: ApacheSparkSessionExtensions): Unit = { 29 | 30 | spkExtensions.injectPostHocResolutionRule {spkSession => 31 | CreateTableCheck 32 | } 33 | 34 | spkExtensions.injectOptimizerRule(CreateIcebergTableRule) 35 | spkExtensions.injectOptimizerRule(IcebergTableWriteRule) 36 | spkExtensions.injectPlannerStrategy(IcebergScanStrategy) 37 | spkExtensions.injectParser((_, pI) => new SparkIceParser(pI)) 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/iceberg/table/SparkTableOperations.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.iceberg.table 19 | 20 | import java.lang.String.format 21 | import java.util.{Locale, Objects} 22 | 23 | import com.netflix.iceberg.exceptions.CommitFailedException 24 | import com.netflix.iceberg.{BaseMetastoreTableOperations, TableMetadata} 25 | import org.apache.spark.internal.Logging 26 | import org.apache.spark.sql.SparkSession 27 | import org.apache.spark.sql.catalyst.QualifiedTableName 28 | import org.apache.spark.sql.catalyst.catalog.CatalogTable 29 | import org.apache.spark.sql.execution.command.AlterTableSetPropertiesCommand 30 | 31 | /** 32 | * Very similar to `HiveTableOperations` from `iceberg-hive`. 33 | * The `refresh` step checks that the `catalogTable` is setup as 34 | * a Iceberg Table. 35 | * The ''metadata_location'' property points to the latest iceberg metadata file. 36 | * 37 | * As part of a metadata commit an [[AlterTableSetPropertiesCommand]] is issued 38 | * that updates the ''metadata_location'' property. 39 | * 40 | * @param sparkSession 41 | * @param catalogTable 42 | */ 43 | class SparkTableOperations private[iceberg] (val sparkSession: SparkSession, 44 | val catalogTable: CatalogTable) 45 | extends BaseMetastoreTableOperations(sparkSession.sparkContext.hadoopConfiguration) with Logging { 46 | 47 | val tableIdentifier = catalogTable.identifier 48 | 49 | val qualTblNm = { 50 | QualifiedTableName( 51 | catalogTable.identifier.database.getOrElse( 52 | sparkSession.sessionState.catalog.getCurrentDatabase 53 | ), 54 | catalogTable.identifier.table 55 | ) 56 | } 57 | 58 | private var metadataLocation : Option[String] = None 59 | 60 | private def setMetadataLocation(l : String) : Unit = { 61 | metadataLocation = Some(l) 62 | } 63 | 64 | override def refresh(): TableMetadata = { 65 | 66 | import BaseMetastoreTableOperations.{ICEBERG_TABLE_TYPE_VALUE, METADATA_LOCATION_PROP, TABLE_TYPE_PROP} 67 | 68 | if (!metadataLocation.isDefined) { 69 | val catTable = catalogTable 70 | val tableType = catTable.properties.get(TABLE_TYPE_PROP) 71 | 72 | if (!tableType.isDefined || !tableType.get.equalsIgnoreCase(ICEBERG_TABLE_TYPE_VALUE)) { 73 | throw new IllegalArgumentException( 74 | format("Invalid tableName, not Iceberg: %s", tableIdentifier.unquotedString) 75 | ) 76 | } 77 | metadataLocation = catTable.properties.get(METADATA_LOCATION_PROP) 78 | } 79 | 80 | if (!metadataLocation.isDefined) { 81 | throw new IllegalArgumentException( 82 | format(format("%s is missing %s property", 83 | tableIdentifier.unquotedString, METADATA_LOCATION_PROP) 84 | ) 85 | ) 86 | } 87 | refreshFromMetadataLocation(metadataLocation.get) 88 | current 89 | } 90 | 91 | override def commit(base: TableMetadata, metadata: TableMetadata): Unit = { 92 | 93 | import BaseMetastoreTableOperations.{ICEBERG_TABLE_TYPE_VALUE, METADATA_LOCATION_PROP, PREVIOUS_METADATA_LOCATION_PROP, TABLE_TYPE_PROP} 94 | 95 | // if the metadata is already out of date, reject it 96 | if (base != current) { 97 | throw new CommitFailedException(format("stale table metadata for %s", 98 | tableIdentifier.unquotedString)) 99 | } 100 | 101 | // if the 
metadata is not changed, return early 102 | if (base == metadata) { 103 | logInfo("Nothing to commit.") 104 | return 105 | } 106 | 107 | val newMetadataLocation: String = writeNewMetadata(metadata, currentVersion + 1) 108 | 109 | var threw: Boolean = true 110 | try { 111 | val catTable = catalogTable 112 | 113 | val metadataLocation = catTable.properties.getOrElse(METADATA_LOCATION_PROP, null) 114 | 115 | if (!Objects.equals(currentMetadataLocation, metadataLocation)) { 116 | throw new CommitFailedException( 117 | format("metadataLocation = %s is not same as table metadataLocation %s for %s", 118 | currentMetadataLocation, metadataLocation, tableIdentifier.unquotedString)) 119 | } 120 | 121 | val currentTimeMillis = System.currentTimeMillis 122 | var properties: List[(String, String)] = List( 123 | (TABLE_TYPE_PROP, ICEBERG_TABLE_TYPE_VALUE.toUpperCase(Locale.ENGLISH)), 124 | (METADATA_LOCATION_PROP, newMetadataLocation) 125 | ) 126 | 127 | if (currentMetadataLocation != null && !currentMetadataLocation.isEmpty) { 128 | properties = properties :+ (PREVIOUS_METADATA_LOCATION_PROP, currentMetadataLocation) 129 | } 130 | 131 | val alterTableCmd = new AlterTableSetPropertiesCommand( 132 | tableIdentifier, 133 | properties.toMap, 134 | false) 135 | alterTableCmd.run(sparkSession) 136 | log.info(s"Updated table $tableIdentifier with new metadata location: ${newMetadataLocation}") 137 | 138 | sparkSession.sessionState.catalog.invalidateCachedTable(qualTblNm) 139 | setMetadataLocation(newMetadataLocation) 140 | 141 | threw = false 142 | } finally { 143 | if (threw) { // if anything went wrong, clean up the uncommitted metadata file 144 | io.deleteFile(newMetadataLocation) 145 | } 146 | } 147 | requestRefresh() 148 | 149 | } 150 | } 151 | 152 | class SparkTableOperationsForCreate private[iceberg] (sparkSession: SparkSession, 153 | catalogTable: CatalogTable) 154 | extends SparkTableOperations(sparkSession, catalogTable) { 155 | 156 | override def refresh(): TableMetadata = { 157 | refreshFromMetadataLocation(null) 158 | current() 159 | } 160 | } -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/iceberg/table/SparkTables.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.apache.spark.sql.iceberg.table 19 | 20 | import java.util 21 | 22 | import com.netflix.iceberg.TableMetadata.newTableMetadata 23 | import com.netflix.iceberg.{BaseMetastoreTableOperations, BaseMetastoreTables, BaseTable, PartitionSpec, Schema, Table} 24 | import org.apache.hadoop.conf.Configuration 25 | import org.apache.spark.sql.SparkSession 26 | import org.apache.spark.sql.catalyst.catalog.{CatalogTable, CatalogUtils} 27 | import org.apache.spark.sql.iceberg.utils._ 28 | 29 | /** 30 | * Very similar to `HiveTables` from `iceberg-hive`. 31 | * Setups a [[SparkTableOperations]], `tableIdentifier` is interpreted using the session parser. 32 | * 33 | * Uses of this should pass the Spark table's [[CatalogTable]] on which Table operations will be 34 | * applied and the current [[SparkSession]]. 35 | * 36 | * @param sparkSession 37 | * @param catalogTable 38 | */ 39 | private[iceberg] class SparkTables(val sparkSession: SparkSession, 40 | val catalogTable: CatalogTable) 41 | extends BaseMetastoreTables(sparkSession.sparkContext.hadoopConfiguration) { 42 | 43 | override def newTableOps(configuration: Configuration, 44 | database: String, 45 | table: String 46 | ): BaseMetastoreTableOperations = { 47 | new SparkTableOperations(sparkSession, catalogTable) 48 | } 49 | 50 | override def create(schema: Schema, 51 | partitionSpec: PartitionSpec, 52 | map: util.Map[String, String], 53 | tableIdentifier: String 54 | ): Table = { 55 | val tId = parseIdentifier(tableIdentifier, sparkSession) 56 | create(schema, partitionSpec, map, tId.database.getOrElse(null), tId.table) 57 | } 58 | 59 | override def create(schema: Schema, spec: PartitionSpec, 60 | properties: util.Map[String, String], 61 | database: String, 62 | table: String 63 | ): Table = { 64 | val conf = sparkSession.sparkContext.hadoopConfiguration 65 | val ops = newTableOps(conf, database, table) 66 | val location = catalogTable.storage.locationUri.map(CatalogUtils.URIToString(_)).get 67 | val metadata = newTableMetadata(ops, schema, spec, location, properties) 68 | ops.commit(null, metadata) 69 | new BaseTable(ops, database + "." + table) 70 | } 71 | 72 | override def load(tableIdentifier: String): Table = { 73 | val tId = parseIdentifier(tableIdentifier, sparkSession) 74 | val qtid = qualifiedTableIdentifier(tId, sparkSession) 75 | load(qtid.database.getOrElse(null), qtid.table) 76 | } 77 | } 78 | 79 | private[iceberg] class SparkTablesForCreate(sparkSession: SparkSession, 80 | catalogTable: CatalogTable) 81 | extends SparkTables(sparkSession, catalogTable) { 82 | 83 | override def newTableOps(configuration: Configuration, 84 | database: String, 85 | table: String 86 | ): BaseMetastoreTableOperations = { 87 | new SparkTableOperationsForCreate(sparkSession, catalogTable) 88 | } 89 | 90 | } 91 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/iceberg/utils/ColumnDependenciesExprVisitor.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.iceberg.utils 19 | 20 | import com.netflix.iceberg.expressions.{BoundPredicate, Expression, Expressions, UnboundPredicate} 21 | import com.netflix.iceberg.expressions.ExpressionVisitors.ExpressionVisitor 22 | import com.netflix.iceberg.transforms.{Transform => IceTransform} 23 | import com.netflix.iceberg.types.Types 24 | import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap 25 | 26 | class ColumnDependenciesExprVisitor(iceType : Types.StructType, 27 | columnDep : Map[String, Map[String, IceTransform[_,_]]] 28 | ) extends ExpressionVisitor[Expression] { 29 | 30 | val caseInsensitiveMap : Map[String, Map[String, IceTransform[_,_]]] 31 | = CaseInsensitiveMap(columnDep) 32 | 33 | override def alwaysTrue(): Expression = Expressions.alwaysTrue 34 | 35 | override def alwaysFalse(): Expression = Expressions.alwaysFalse() 36 | 37 | override def not(result: Expression): Expression = Expressions.not(result) 38 | 39 | override def and(leftResult: Expression, rightResult: Expression): Expression = 40 | Expressions.and(leftResult, rightResult) 41 | 42 | override def or(leftResult: Expression, rightResult: Expression): Expression = 43 | Expressions.or(leftResult, rightResult) 44 | 45 | override def predicate[T](pred: BoundPredicate[T]): Expression = pred 46 | 47 | override def predicate[T](pred: UnboundPredicate[T]): Expression = { 48 | 49 | val boundExpr = pred.bind(iceType.asStructType(), false) 50 | val srcCol = pred.ref().name 51 | 52 | if (boundExpr.isInstanceOf[BoundPredicate[_]]) { 53 | val bndPred = boundExpr.asInstanceOf[BoundPredicate[T]] 54 | val transformPreds : Seq[Expression] = (for ( 55 | transformsMap <- caseInsensitiveMap.get(srcCol).toSeq; 56 | (destCol, iceTrans) <- transformsMap.iterator 57 | ) yield { 58 | iceTrans.asInstanceOf[IceTransform[T,_]].project(destCol, bndPred) 59 | }).filter(_ != null) 60 | 61 | transformPreds.foldLeft(pred.asInstanceOf[Expression])(Expressions.and(_,_)) 62 | 63 | } else { 64 | pred 65 | } 66 | } 67 | 68 | } 69 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/iceberg/utils/DataFileUtils.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.iceberg.utils 19 | 20 | import com.netflix.iceberg.hadoop.HadoopInputFile 21 | import com.netflix.iceberg.{DataFile, DataFiles, Metrics, PartitionSpec} 22 | import org.apache.hadoop.conf.Configuration 23 | import org.apache.hadoop.fs.Path 24 | import org.apache.spark.sql.execution.datasources.iceberg.PartitioningUtils 25 | import org.apache.spark.sql.types.StructType 26 | 27 | /** 28 | * Utilities to construct and extract information from [[DataFile]] 29 | */ 30 | object DataFileUtils { 31 | 32 | def buildDataFileFormPath(pathString : String, 33 | iceSchema: IceSchema, 34 | partSchema: StructType, 35 | partSpec : PartitionSpec, 36 | iceFmt : IceFormat, 37 | conf : Configuration, 38 | finalPath : String 39 | ) : DataFile = { 40 | 41 | val path = new Path(pathString) 42 | var bldr = DataFiles.builder(partSpec) 43 | val inFile = HadoopInputFile.fromPath(path, conf) 44 | bldr.withInputFile(inFile) 45 | 46 | val metrics : Option[Metrics] = iceMetrics(inFile, iceSchema, iceFmt) 47 | val pSpec = icePartStruct(pathString, partSchema) 48 | 49 | bldr.withPartition(pSpec) 50 | bldr.withFormat(iceFmt) 51 | if (metrics.isDefined) { 52 | bldr.withMetrics(metrics.get) 53 | } 54 | bldr.withPath(finalPath) 55 | bldr.build() 56 | } 57 | 58 | def sparkPartitionMap(df : DataFile) : Map[String, String] = 59 | PartitioningUtils.partitionValuesFromFullPath( 60 | PartitioningUtils.removeFileName(df.path().toString) 61 | ) 62 | 63 | 64 | } 65 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/iceberg/utils/DelegationUtils.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.apache.spark.sql.iceberg.utils 19 | 20 | import scala.reflect.ClassTag 21 | 22 | object DelegationUtils { 23 | 24 | /* 25 | * FT : field type 26 | * IT : instance type 27 | * ST : super type 28 | */ 29 | 30 | 31 | class DelegatedField[FT, ST : ClassTag, IT <: ST](val instance : IT, fieldName : String) { 32 | val field = { 33 | val clz = implicitly[ClassTag[ST]].runtimeClass 34 | val f = clz.getDeclaredField(fieldName) 35 | f.setAccessible(true) 36 | f 37 | } 38 | 39 | def value : FT = field.get(instance).asInstanceOf[FT] 40 | 41 | def `value_=`(value : FT) : Unit = { 42 | field.set(instance, value) 43 | } 44 | 45 | } 46 | 47 | abstract class DelegatedMethod[ST : ClassTag, IT <: ST, RT] 48 | (instance : IT, methodName : String) { 49 | val method = { 50 | val clz = implicitly[ClassTag[ST]].runtimeClass 51 | val m = clz.getDeclaredMethod(methodName) 52 | m.setAccessible(true) 53 | m 54 | } 55 | } 56 | 57 | class DelegatedMethod0[ST : ClassTag, IT <: ST, RT](val instance : IT, methodName : String) { 58 | 59 | val method = { 60 | val clz = implicitly[ClassTag[ST]].runtimeClass 61 | val m = clz.getDeclaredMethod(methodName) 62 | m.setAccessible(true) 63 | m 64 | } 65 | 66 | def apply : RT = { 67 | method.invoke(instance).asInstanceOf[RT] 68 | } 69 | } 70 | 71 | class DelegatedMethod1[ST : ClassTag, IT <: ST, RT, AT1 : ClassTag] 72 | (val instance : IT, methodName : String) { 73 | 74 | val method = { 75 | val clz = implicitly[ClassTag[ST]].runtimeClass 76 | val m = clz.getDeclaredMethod(methodName, implicitly[ClassTag[AT1]].runtimeClass) 77 | m.setAccessible(true) 78 | m 79 | } 80 | 81 | def apply(arg1 : AT1) : RT = { 82 | method.invoke(instance, arg1.asInstanceOf[AnyRef]).asInstanceOf[RT] 83 | } 84 | } 85 | 86 | class DelegatedMethod2[ST : ClassTag, IT <: ST, RT, AT1 : ClassTag, AT2 : ClassTag] 87 | (val instance : IT, methodName : String) { 88 | 89 | val method = { 90 | val clz = implicitly[ClassTag[ST]].runtimeClass 91 | val m = clz.getDeclaredMethod(methodName, 92 | implicitly[ClassTag[AT1]].runtimeClass, 93 | implicitly[ClassTag[AT2]].runtimeClass) 94 | m.setAccessible(true) 95 | m 96 | } 97 | 98 | def apply(arg1 : AT1, arg2 : AT2) : RT = { 99 | method.invoke(instance, arg1.asInstanceOf[AnyRef], arg2.asInstanceOf[AnyRef]).asInstanceOf[RT] 100 | } 101 | } 102 | 103 | // scalastyle:off line.size.limit 104 | class DelegatedMethod3[ST : ClassTag, IT <: ST, RT, AT1 : ClassTag, AT2 : ClassTag, AT3 : ClassTag] 105 | (val instance : IT, methodName : String) { 106 | 107 | val method = { 108 | val clz = implicitly[ClassTag[ST]].runtimeClass 109 | val m = clz.getDeclaredMethod(methodName, 110 | implicitly[ClassTag[AT1]].runtimeClass, 111 | implicitly[ClassTag[AT2]].runtimeClass, 112 | implicitly[ClassTag[AT3]].runtimeClass) 113 | m.setAccessible(true) 114 | m 115 | } 116 | 117 | def apply(arg1 : AT1, arg2 : AT2, arg3 : AT3) : RT = { 118 | method.invoke(instance, 119 | arg1.asInstanceOf[AnyRef], 120 | arg2.asInstanceOf[AnyRef], 121 | arg3.asInstanceOf[AnyRef]).asInstanceOf[RT] 122 | } 123 | } 124 | // scalastyle:on line.size.limit 125 | 126 | } 127 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/iceberg/utils/ExpressionConversions.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. 
See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.iceberg.utils 19 | 20 | import com.netflix.iceberg.expressions.Expression.Operation 21 | import com.netflix.iceberg.expressions.Projections.ProjectionEvaluator 22 | import com.netflix.iceberg.expressions.{BoundPredicate, ExpressionVisitors, Expressions, NamedReference, Predicate, UnboundPredicate, Expression => IceExpression} 23 | import com.netflix.iceberg.transforms.{Transform => IceTransform} 24 | import com.netflix.iceberg.types.Types 25 | import org.apache.spark.sql.catalyst.expressions.{And, Attribute, BinaryComparison, Expression, In, InSet, IsNotNull, IsNull, Not, Or} 26 | 27 | object ExpressionConversions { 28 | 29 | import org.apache.spark.sql.iceberg.utils.LiteralUtils._ 30 | 31 | private[iceberg] object Internals { 32 | 33 | val sparkToIcebergCompOp = Map[String, Operation]( 34 | "<" -> Operation.LT, 35 | "<=" -> Operation.LT_EQ, 36 | "=" -> Operation.EQ, 37 | ">=" -> Operation.GT_EQ, 38 | ">" -> Operation.GT, 39 | "<=>" -> Operation.EQ 40 | ) 41 | 42 | def predicate(at: Attribute, op: Operation, il: IceLiteral[_]): 43 | Predicate[_, NamedReference] = { 44 | Expressions.predicate(op, at.name, il) 45 | } 46 | 47 | object IcebergAttrLiteralCompare { 48 | def unapply(e: Expression): Option[IceExpression] = e match { 49 | case bc: BinaryComparison => (bc.left, bc.right) match { 50 | case (at: Attribute, IcebergLiteral(iL)) => 51 | Some(predicate(at, sparkToIcebergCompOp(bc.symbol), iL)) 52 | case (IcebergLiteral(iL), at: Attribute) => 53 | Some(predicate(at, sparkToIcebergCompOp(bc.symbol).flipLR(), iL)) 54 | case _ => None 55 | } 56 | case _ => None 57 | } 58 | } 59 | 60 | object IcebergIn { 61 | 62 | def inIceExpr(at: Attribute, vals: Seq[IceLiteral[_]]) = { 63 | val aF: IceExpression = Expressions.alwaysFalse() 64 | Some( 65 | vals.foldLeft(aF) { 66 | case (c, l) => Expressions.or(c, predicate(at, Operation.EQ, l)) 67 | } 68 | ) 69 | } 70 | 71 | def unapply(e: Expression): Option[IceExpression] = e match { 72 | case In(at: Attribute, IcebergLiteralList(vals)) => inIceExpr(at, vals) 73 | case InSet(at: Attribute, IcebergLiteralList(vals)) => inIceExpr(at, vals) 74 | case _ => None 75 | } 76 | } 77 | 78 | object StrictIcebergExpression { 79 | self => 80 | 81 | def unapply(expr: Expression): Option[IceExpression] = expr match { 82 | case IsNull(a: Attribute) => Some(Expressions.isNull(a.name)) 83 | case IsNotNull(a: Attribute) => Some(Expressions.notNull(a.name)) 84 | case IcebergAttrLiteralCompare(iceExpr) => Some(iceExpr) 85 | case IcebergIn(iceExpr) => Some(iceExpr) 86 | case Not(self(iceExpr)) => Some(Expressions.not(iceExpr)) 87 | case And(self(lIceExpr), l@self(rICeExpr)) => Some(Expressions.and(lIceExpr, rICeExpr)) 88 | case Or(self(lIceExpr), l@self(rICeExpr)) => Some(Expressions.or(lIceExpr, rICeExpr)) 89 | case _ 
=> None 90 | } 91 | 92 | } 93 | 94 | private object CopyIceRewriteNotTransform 95 | extends ExpressionVisitors.ExpressionVisitor[IceExpression] { 96 | override def alwaysTrue(): IceExpression = Expressions.alwaysTrue() 97 | 98 | override def alwaysFalse(): IceExpression = Expressions.alwaysFalse() 99 | 100 | override def not(result: IceExpression): IceExpression = result.negate() 101 | 102 | override def and(leftResult: IceExpression, rightResult: IceExpression): IceExpression = 103 | Expressions.and(leftResult, rightResult) 104 | 105 | override def or(leftResult: IceExpression, rightResult: IceExpression): IceExpression = 106 | Expressions.or(leftResult, rightResult) 107 | 108 | override def predicate[T](pred: BoundPredicate[T]): IceExpression = pred 109 | 110 | override def predicate[T](pred: UnboundPredicate[T]): IceExpression = pred 111 | } 112 | 113 | class ApplyPartitionTransforms(val iceSchema: Types.StructType, 114 | val transformMap: Map[String, Map[String, IceTransform[_, _]]], 115 | val strict: Boolean 116 | ) extends ProjectionEvaluator { 117 | override def project(expr: IceExpression): IceExpression = 118 | ExpressionVisitors.visit(ExpressionVisitors.visit(expr, CopyIceRewriteNotTransform), this) 119 | 120 | override def alwaysTrue(): IceExpression = Expressions.alwaysTrue() 121 | 122 | override def alwaysFalse(): IceExpression = Expressions.alwaysFalse() 123 | 124 | override def not(result: IceExpression): IceExpression = { 125 | throw new UnsupportedOperationException("[BUG] project called on expression with a not") 126 | } 127 | 128 | override def and(leftResult: IceExpression, rightResult: IceExpression): IceExpression = { 129 | Expressions.and(leftResult, rightResult) 130 | } 131 | 132 | override def or(leftResult: IceExpression, rightResult: IceExpression): IceExpression = { 133 | Expressions.or(leftResult, rightResult) 134 | } 135 | 136 | override def predicate[T](pred: BoundPredicate[T]): IceExpression = { 137 | val sNm = iceSchema.field(pred.ref().fieldId()).name() 138 | val transformPreds = transformMap.get(sNm).map { transforms => 139 | for ((pNm, pT) <- transforms) yield { 140 | if (strict) { 141 | (pT.asInstanceOf[IceTransform[T, _]]).projectStrict(pNm, pred) 142 | } else { 143 | (pT.asInstanceOf[IceTransform[T, _]]).project(pNm, pred) 144 | } 145 | } 146 | } 147 | 148 | transformPreds.map { tPreds => 149 | tPreds.foldLeft(Expressions.alwaysTrue().asInstanceOf[IceExpression]) { 150 | case (c, tp) => Expressions.and(c, tp) 151 | } 152 | }.getOrElse(pred) 153 | 154 | } 155 | 156 | override def predicate[T](pred: UnboundPredicate[T]): IceExpression = { 157 | val bPred = pred.bind(iceSchema, true) 158 | 159 | bPred match { 160 | case bp: BoundPredicate[_] => predicate(bp) 161 | case _ => pred 162 | } 163 | } 164 | } 165 | 166 | } 167 | 168 | import Internals._ 169 | 170 | def convertStrict(e: Expression): Option[IceExpression] = 171 | StrictIcebergExpression.unapply(e) 172 | 173 | def convert(e: Expression): IceExpression = e match { 174 | case StrictIcebergExpression(ice) => ice 175 | case And(l, r) => Expressions.and(convert(l), convert(r)) 176 | case _ => Expressions.alwaysTrue() 177 | } 178 | 179 | def pushFiltersForScan(e : Expression, 180 | iceSchema: Types.StructType, 181 | transformMap: Map[String, Map[String, IceTransform[_, _]]] 182 | ) : IceExpression = { 183 | val convExpr = convert(e) 184 | val partTransform = 185 | new ApplyPartitionTransforms(iceSchema, transformMap, false) 186 | partTransform.project(convExpr) 187 | } 188 | 189 | } 190 | 
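A minimal usage sketch, not part of the repository: the column name ss_item_sk, the integer literal, and the wrapper object are assumptions for illustration. It shows how a Catalyst predicate might be handed to the conversions above before being pushed down to Iceberg.

    import org.apache.spark.sql.catalyst.expressions.{AttributeReference, GreaterThan, Literal}
    import org.apache.spark.sql.iceberg.utils.ExpressionConversions
    import org.apache.spark.sql.types.IntegerType

    object ExpressionConversionSketch {
      def main(args: Array[String]): Unit = {
        // Catalyst form of the predicate: ss_item_sk > 100
        val attr = AttributeReference("ss_item_sk", IntegerType)()
        val pred = GreaterThan(attr, Literal(100))

        // Strict conversion returns None for shapes Iceberg cannot express exactly;
        // convert() instead degrades such sub-expressions to alwaysTrue().
        val strict  = ExpressionConversions.convertStrict(pred)
        val lenient = ExpressionConversions.convert(pred)
        println(strict)
        println(lenient)
      }
    }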
-------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/iceberg/utils/LiteralUtils.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.iceberg.utils 19 | 20 | import java.lang.{Double => JavaDouble, Float => JavaFloat, Long => JavaLong} 21 | import java.math.{BigDecimal => JavaBigDecimal, BigInteger => JavaBigInteger} 22 | 23 | import com.netflix.iceberg.expressions.Literal.{of => ofIceLiteral} 24 | import com.netflix.iceberg.types.Types 25 | import org.apache.commons.io.Charsets 26 | import org.apache.spark.sql.catalyst.expressions.{Cast, Literal} 27 | import org.apache.spark.sql.types._ 28 | import org.apache.parquet.io.api.Binary 29 | 30 | /** 31 | * Utilities to convert from and to [[IceLiteral]] 32 | */ 33 | object LiteralUtils { 34 | 35 | object IcebergLiteral { 36 | def unapply(e: Any): Option[IceLiteral[_]] = e match { 37 | case l: Literal => (l.value, l.dataType) match { 38 | case (_, StringType) => Some(ofIceLiteral(l.value.toString)) 39 | case (_, dt: DecimalType) => 40 | Some(ofIceLiteral(l.value.asInstanceOf[Decimal].toJavaBigDecimal)) 41 | case (v: Double, DoubleType) => Some(ofIceLiteral(v)) 42 | case (v: Float, FloatType) => Some(ofIceLiteral(v)) 43 | case (v: Long, LongType) => Some(ofIceLiteral(v)) 44 | case (v: Byte, ByteType) => Some(ofIceLiteral(v)) 45 | case (v: Short, ShortType) => Some(ofIceLiteral(v)) 46 | case (v: Int, IntegerType) => Some(ofIceLiteral(v)) 47 | case (v: Boolean, BooleanType) => Some(ofIceLiteral(v)) 48 | case (v: Array[Byte], dt: ArrayType) if dt.elementType == ByteType => 49 | Some(ofIceLiteral(v)) 50 | case _ => None 51 | } 52 | case cst@Cast(lt : Literal, dt, _) => { 53 | val cVal = cst.eval(null) 54 | IcebergLiteral.unapply(Literal(cVal, dt)) 55 | } 56 | case _ => None 57 | } 58 | } 59 | 60 | def fromParquetPrimitive[T](v: T, 61 | iceType: IceType 62 | ): IceLiteral[T] = (v, iceType) match { 63 | case (v : Boolean, _) => ofIceLiteral(v).to(iceType) 64 | case (v : Integer, _) => ofIceLiteral(v).to(iceType) 65 | case (v : JavaLong, _) => ofIceLiteral(v).to(iceType) 66 | case (v : JavaFloat, _) => ofIceLiteral(v).to(iceType) 67 | case (v : JavaDouble, _) => ofIceLiteral(v).to(iceType) 68 | case (v : Binary, t : Types.StringType) => 69 | ofIceLiteral(Charsets.UTF_8.decode(v.toByteBuffer)).to(iceType) 70 | case (v : Binary, iT : Types.DecimalType) => 71 | ofIceLiteral(new JavaBigDecimal(new JavaBigInteger(v.getBytes), iT.scale())).to(iceType) 72 | case (v : Binary, t : Types.BinaryType) => ofIceLiteral(v.toByteBuffer).to(iceType) 73 | case _ => throw new 
IllegalArgumentException("Unsupported primitive type: " + iceType) 74 | } 75 | 76 | object IcebergLiteralList { 77 | def unapply(eL: Seq[Any]): Option[Seq[IceLiteral[_]]] = { 78 | val litOps: Seq[Option[IceLiteral[_]]] = eL.map(IcebergLiteral.unapply(_)) 79 | val zero: Option[Seq[IceLiteral[_]]] = Some(Seq.empty[IceLiteral[_]]) 80 | litOps.foldLeft(zero) { 81 | case (None, _) => None 82 | case (Some(l), None) => None 83 | case (Some(l), Some(iL)) => Some(iL +: l) 84 | } 85 | } 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/iceberg/utils/ParquetMetrics.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.iceberg.utils 19 | 20 | import java.io.IOException 21 | import java.lang.{Integer => JInt, Long => JLong} 22 | import java.nio.ByteBuffer 23 | import java.util 24 | 25 | import com.netflix.iceberg.Metrics 26 | import com.netflix.iceberg.exceptions.RuntimeIOException 27 | import com.netflix.iceberg.expressions.Literal 28 | import com.netflix.iceberg.hadoop.HadoopInputFile 29 | import com.netflix.iceberg.types.Conversions 30 | import org.apache.parquet.hadoop.ParquetFileReader 31 | import org.apache.parquet.hadoop.metadata.ParquetMetadata 32 | 33 | import scala.collection.mutable.{Map => MMap, Set => MSet} 34 | import scala.language.existentials 35 | 36 | object ParquetMetrics { 37 | 38 | def fromInputFile(file: HadoopInputFile, 39 | iceSchema: IceSchema): Metrics = { 40 | val parqFile = 41 | org.apache.parquet.hadoop.util.HadoopInputFile.fromStatus(file.getStat, file.getConf) 42 | val reader = ParquetFileReader.open(parqFile) 43 | try { 44 | fromMetadata(reader.getFooter, iceSchema) 45 | } catch { 46 | case e: IOException => 47 | throw new RuntimeIOException(e, "Failed to read footer of file: %s", file) 48 | } finally if (reader != null) reader.close() 49 | } 50 | 51 | def fromMetadata(metadata: ParquetMetadata, 52 | fileSchema: IceSchema 53 | ): Metrics = { 54 | 55 | var rowCount: Long = 0L 56 | val columnSizes = MMap[JInt, JLong]() 57 | val valueCounts = MMap[JInt, JLong]() 58 | val nullValueCounts = MMap[JInt, JLong]() 59 | val lowerBounds = MMap[JInt, IceLiteral[_]]() 60 | val upperBounds = MMap[JInt, IceLiteral[_]]() 61 | 62 | val missingStats = MSet[JInt]() 63 | 64 | val parquetType = metadata.getFileMetaData.getSchema 65 | val blocks = metadata.getBlocks 66 | import scala.collection.JavaConversions._ 67 | for (block <- blocks) { 68 | rowCount += block.getRowCount 69 | import scala.collection.JavaConversions._ 70 | for (column <- block.getColumns) { 71 | 72 | val field = 
fileSchema.findField(column.getPath.toDotString) 73 | 74 | if (field != null) { 75 | val fieldId = field.fieldId() 76 | increment(columnSizes, fieldId, column.getTotalSize) 77 | increment(valueCounts, fieldId, column.getValueCount) 78 | val stats = column.getStatistics 79 | 80 | if (stats == null) { 81 | missingStats.add(fieldId) 82 | } 83 | else if (!stats.isEmpty) { 84 | increment(nullValueCounts, fieldId, stats.getNumNulls) 85 | val field = fileSchema.asStruct.field(fieldId) 86 | if (field != null && stats.hasNonNullValue) { 87 | updateMin( 88 | lowerBounds, 89 | fieldId, 90 | LiteralUtils.fromParquetPrimitive(stats.genericGetMin, field.`type`) 91 | ) 92 | updateMax( 93 | upperBounds, 94 | fieldId, 95 | LiteralUtils.fromParquetPrimitive(stats.genericGetMax, field.`type`) 96 | ) 97 | } 98 | } 99 | } 100 | } 101 | } 102 | // discard accumulated values if any stats were missing 103 | for (fieldId <- missingStats) { 104 | nullValueCounts.remove(fieldId) 105 | lowerBounds.remove(fieldId) 106 | upperBounds.remove(fieldId) 107 | } 108 | 109 | new Metrics( 110 | rowCount, 111 | javaMap(columnSizes.toMap), 112 | javaMap(valueCounts.toMap), 113 | javaMap(nullValueCounts.toMap), 114 | javaMap(toBufferMap(fileSchema, lowerBounds)), 115 | javaMap(toBufferMap(fileSchema, upperBounds)) 116 | ) 117 | } 118 | 119 | private def increment(columns: MMap[JInt, JLong], 120 | fieldId: Int, 121 | amount: JLong): Unit = { 122 | val v: JLong = columns.getOrElse(fieldId, 0L) 123 | columns(fieldId) = (v + amount) 124 | } 125 | 126 | private def updateMin[T](lowerBounds: MMap[JInt, Literal[_]], 127 | id: Int, 128 | min: Literal[T] 129 | ) = { 130 | val currentMinO: Option[Literal[_]] = lowerBounds.get(id) 131 | 132 | def currentMin: Literal[T] = currentMinO.get.asInstanceOf[Literal[T]] 133 | 134 | if (!currentMinO.isDefined || 135 | min.comparator().compare(min.value(), currentMin.value()) < 0 136 | ) { 137 | lowerBounds(id) = min 138 | } 139 | } 140 | 141 | private def updateMax[T](upperBounds: MMap[JInt, Literal[_]], 142 | id: Int, 143 | max: Literal[T] 144 | ) = { 145 | val currentMaxO: Option[Literal[_]] = upperBounds.get(id) 146 | 147 | def currentMax: Literal[T] = currentMaxO.get.asInstanceOf[Literal[T]] 148 | 149 | if (!currentMaxO.isDefined || 150 | max.comparator().compare(max.value(), currentMax.value()) > 0 151 | ) { 152 | upperBounds(id) = max 153 | } 154 | } 155 | 156 | private def toBufferMap(schema: IceSchema, 157 | map: MMap[JInt, IceLiteral[_]] 158 | ): Map[JInt, ByteBuffer] = { 159 | 160 | val mb = MMap[JInt, ByteBuffer]() 161 | 162 | for ((i, l) <- map.iterator) { 163 | val t = schema.findType(i) 164 | mb(i) = Conversions.toByteBuffer(t, l.value()) 165 | } 166 | 167 | mb.toMap 168 | } 169 | 170 | private def javaMap[K,V](m : Map[K,V]) : java.util.Map[K,V] = { 171 | val jm = new util.HashMap[K,V](m.size) 172 | for( 173 | (k,v) <- m 174 | ) { 175 | jm.put(k,v) 176 | } 177 | jm 178 | } 179 | 180 | } 181 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/iceberg/utils/SchemaConversions.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.iceberg.utils 19 | 20 | import com.netflix.iceberg.{PartitionSpec, Schema} 21 | import org.apache.spark.sql.SparkSession 22 | import org.apache.spark.sql.catalyst.catalog.CatalogTable 23 | import org.apache.spark.sql.iceberg.utils.TypeConversions.TypeMapping 24 | 25 | object SchemaConversions { 26 | 27 | private def toIcebergSchema(typeMapping: TypeMapping) : Schema = { 28 | new Schema(typeMapping.iceType.fields()) 29 | } 30 | 31 | private def toIcebergPartitionSpec(catalogTable: CatalogTable, 32 | typeMapping: TypeMapping, 33 | schema : Schema 34 | )(implicit sparkSession : SparkSession) : PartitionSpec = { 35 | 36 | if (catalogTable.partitionColumnNames.nonEmpty) { 37 | var bldr = PartitionSpec.builderFor(schema) 38 | for( 39 | pField <- catalogTable.partitionSchema.fields; 40 | iceF = schema.findField(pField.name) 41 | ) { 42 | bldr = bldr.identity(iceF.name()) 43 | } 44 | bldr.build() 45 | } else { 46 | null 47 | } 48 | } 49 | 50 | def toIcebergSpec(catalogTable: CatalogTable)(implicit sparkSession : SparkSession) : 51 | (Schema, PartitionSpec) = { 52 | 53 | val typMap = TypeConversions.convertStructType(catalogTable.schema) 54 | val schema = toIcebergSchema(typMap) 55 | val pSpec = toIcebergPartitionSpec(catalogTable, typMap, schema) 56 | 57 | (schema, pSpec) 58 | 59 | } 60 | 61 | } 62 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/iceberg/utils/TableUtils.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.apache.spark.sql.iceberg.utils 19 | 20 | import java.util.Locale 21 | 22 | import com.netflix.iceberg.Snapshot 23 | import org.apache.spark.sql.catalyst.InternalRow 24 | import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, GenericRow} 25 | import org.apache.spark.sql.catalyst.plans.logical.LocalRelation 26 | import org.apache.spark.sql.iceberg.table.SparkTables 27 | import org.apache.spark.sql.types._ 28 | import org.apache.spark.sql.{DataFrame, Row, SparkSession} 29 | import org.apache.spark.unsafe.types.UTF8String 30 | 31 | object TableUtils { 32 | 33 | protected[this] def formatName(name: String)(implicit ss : SparkSession) : String = { 34 | if (ss.sqlContext.conf.caseSensitiveAnalysis) name else name.toLowerCase(Locale.ROOT) 35 | } 36 | 37 | def iceTable(tblNm : String)(implicit ss : SparkSession) : IceTable = { 38 | val name = parseIdentifier(tblNm, ss) 39 | 40 | val db = formatName(name.database.getOrElse(ss.sessionState.catalog.getCurrentDatabase)) 41 | val table = formatName(name.table) 42 | val catalogTable = ss.sessionState.catalog.externalCatalog.getTable(db, table) 43 | val iceTables = new SparkTables(ss, catalogTable) 44 | val tbId = qualifiedTableIdentifier(catalogTable.identifier, ss) 45 | iceTables.load(tbId.database.getOrElse(null), tbId.table) 46 | } 47 | 48 | val SNAPSHOT_SCHEMA : StructType = { 49 | StructType( 50 | Seq( 51 | StructField("id", LongType), 52 | StructField("parentId", LongType), 53 | StructField("timeMillis", LongType), 54 | StructField("numAddedFiles", IntegerType), 55 | StructField("numdDeletedFiles", IntegerType), 56 | StructField("manifestListLocation", StringType) 57 | ) 58 | ) 59 | } 60 | 61 | def toRow(pId : Long, 62 | sShot : Snapshot) : Row = { 63 | import scala.collection.JavaConversions._ 64 | new GenericRow( 65 | Array[Any]( 66 | sShot.snapshotId(), 67 | pId, 68 | sShot.timestampMillis(), 69 | sShot.addedFiles().size, 70 | sShot.deletedFiles().size, 71 | sShot.manifestListLocation() 72 | ) 73 | ) 74 | } 75 | 76 | def toIRow(pId : Long, 77 | sShot : Snapshot) : InternalRow = { 78 | import scala.collection.JavaConversions._ 79 | new GenericInternalRow( 80 | Array[Any]( 81 | sShot.snapshotId(), 82 | pId, 83 | sShot.timestampMillis(), 84 | sShot.addedFiles().size, 85 | sShot.deletedFiles().size, 86 | UTF8String.fromString(sShot.manifestListLocation()) 87 | ) 88 | ) 89 | } 90 | 91 | def snapShots[T](iceTable : IceTable, 92 | toRow : (Long, Snapshot) => T)(implicit ss : SparkSession) : Seq[T] = { 93 | import scala.collection.JavaConversions._ 94 | val rows = for (sShot <- iceTable.snapshots()) yield { 95 | 96 | val pId : Long = { 97 | val l = sShot.parentId() 98 | if (l == null) -1 else l 99 | } 100 | toRow(pId, sShot) 101 | } 102 | rows.toSeq 103 | } 104 | 105 | private val snapshot_millis = new ThreadLocal[Long] { 106 | override def initialValue = -1L 107 | } 108 | 109 | def setThreadSnapShotMillis(sMillis : Long) = { 110 | snapshot_millis.set(sMillis) 111 | } 112 | 113 | def getThreadSnapShotId(iceTable: IceTable) : Long = { 114 | val sMillis = snapshot_millis.get 115 | setThreadSnapShotMillis(-1L) 116 | if (sMillis == -1) { 117 | iceTable.currentSnapshot().snapshotId() 118 | } else { 119 | snapShotId(iceTable, sMillis) 120 | } 121 | } 122 | 123 | def snapShotId(iceTable : IceTable, timeMs : Long) : Long = { 124 | import scala.collection.JavaConversions._ 125 | var spId = iceTable.currentSnapshot().snapshotId() 126 | for (sShot <- iceTable.snapshots()) { 127 | if (sShot.timestampMillis() <= timeMs) { 
128 | spId = sShot.snapshotId() 129 | } 130 | } 131 | spId 132 | } 133 | 134 | def snapShotsDF(iceTable : IceTable)(implicit ss : SparkSession) : DataFrame = 135 | dataframe(snapShots(iceTable, toRow), SNAPSHOT_SCHEMA) 136 | 137 | def snapShotsDF(tblNm : String)(implicit ss : SparkSession) : DataFrame = { 138 | val iTbl = iceTable(tblNm) 139 | dataframe(snapShots(iTbl, toRow), SNAPSHOT_SCHEMA) 140 | } 141 | 142 | val SNAPSHOTSVIEW_SUFFIX = "$snapshots" 143 | 144 | def snapShotsLocalRelation(tblNm : String)(implicit ss : SparkSession) : LocalRelation = { 145 | val iTbl = iceTable(tblNm) 146 | LocalRelation(SNAPSHOT_SCHEMA.toAttributes, snapShots(iTbl, toIRow)) 147 | } 148 | } 149 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/iceberg/utils/Transforms.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.iceberg.utils 19 | 20 | import com.netflix.iceberg.transforms.{Transform => IceTransform, Transforms => IcebergTransforms} 21 | import com.netflix.iceberg.types.{Type, Types} 22 | import org.apache.spark.sql.SparkSession 23 | import org.apache.spark.sql.iceberg.Config 24 | import org.apache.spark.sql.iceberg.utils.TypeConversions.TypeMapping 25 | import org.apache.spark.sql.types.StructField 26 | 27 | import scala.collection.mutable.{ArrayBuffer, Map => MMap} 28 | import scala.util.Try 29 | 30 | object Transforms { 31 | 32 | val TRANSFORM_REGX = """(([^\s,=:]+=[^\s,=:]+:[^\s,=:]+)(?:,\s*)?)+""".r 33 | 34 | /** 35 | * Validate [[Config.COLUMN_DEPENDENCIES]] specified for a ''Managed Table''. 
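 * For illustration only (hypothetical column names): a value such as
 * ''ss_sold_date=ss_sold_month:month, ss_item_sk=ss_item_bkt:bucket[16]'' declares that
 * partition column ''ss_sold_month'' is derived from source column ''ss_sold_date'' via the
 * ''month'' transform, and ''ss_item_bkt'' from ''ss_item_sk'' via a 16-way ''bucket'' transform.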
36 | * - the option must be in the form of a comma-separated list of ''srCol=destCol:tnExpr'' 37 | * - each ''srCol'' must resolve to a attribute in the given [[TypeMapping.sparkType]] 38 | * - each ''destCol'' must resolve to a attribute in the given [[TypeMapping.sparkType]] 39 | * - each ''tnExpr'' must resolve to a valid [[Transforms]] for the column's [[Type]] 40 | * 41 | * @param valueMappings 42 | * @param typeMapping 43 | * @param sparkSession 44 | * @return 45 | */ 46 | def fromOption(valueMappings : String, 47 | typeMapping : TypeMapping 48 | )(implicit sparkSession : SparkSession) : 49 | Either[Array[String],Map[String, Map[String, IceTransform[_,_]]]] = { 50 | 51 | val errs = ArrayBuffer[String]() 52 | val transforms = MMap[String, Map[String, IceTransform[_,_]]]() 53 | 54 | def parseTransforms : Array[String] = { 55 | valueMappings.split(",").map(_.trim) 56 | } 57 | 58 | def parseEntry(s : String) : Option[(String,String,String)] = { 59 | val kv = s.split("=") 60 | if (kv.size != 2) { 61 | errs += 62 | s"Cannot parse transform ${s} in partition transformations: ${valueMappings}" 63 | None 64 | } else { 65 | val vSplit = kv(1).split(":") 66 | if (vSplit.size != 2) { 67 | errs += 68 | s"Cannot parse transform ${s} in partition transformations: ${valueMappings}" 69 | None 70 | } else { 71 | Some((kv(0).trim, vSplit(0).trim, vSplit(1).trim)) 72 | } 73 | } 74 | } 75 | 76 | def validateAttr(attr : String) : Option[(StructField, Types.NestedField)] = { 77 | typeMapping.findField(attr) 78 | } 79 | 80 | def validateTransform(iceF : Types.NestedField, 81 | t: String) : Option[IceTransform[_,_]] = { 82 | 83 | Try { 84 | Some(IcebergTransforms.fromString(iceF.`type`(), t)) 85 | }.recover { 86 | case throwable: Throwable => 87 | errs += s"Failed to parse transform ${t} in partition transformations: " + 88 | s"${valueMappings}; (${throwable.getMessage})" 89 | None 90 | }.get 91 | } 92 | 93 | for( 94 | transform <- parseTransforms; 95 | (sc, pc,v) <- parseEntry(transform) ; 96 | (_, sIceF) <- validateAttr(sc); 97 | t <- validateTransform(sIceF, v); 98 | (_, pIceF) <- validateAttr(pc) 99 | ) { 100 | val tm = transforms.getOrElse(sIceF.name, Map[String, IceTransform[_,_]]()) 101 | transforms(sIceF.name) = (tm + ((pIceF.name(), t))) 102 | } 103 | 104 | if (errs.nonEmpty) { 105 | Left(errs.toArray) 106 | } else { 107 | Right(transforms.toMap) 108 | } 109 | 110 | } 111 | } 112 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/iceberg/utils/TypeConversions.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.apache.spark.sql.iceberg.utils 19 | 20 | import com.netflix.iceberg.types.{Type, Types} 21 | import org.apache.spark.sql.SparkSession 22 | import org.apache.spark.sql.types._ 23 | 24 | object TypeConversions { 25 | 26 | def convertStructType(st : StructType) : TypeMapping = { 27 | implicit val nextId = new Function0[Int] { 28 | var cnt = -1 29 | def apply : Int = { 30 | cnt += 1 31 | cnt 32 | } 33 | } 34 | TypeMapping(st, structType(st).asInstanceOf[Types.StructType]) 35 | } 36 | 37 | def toIcebergType(dt : DataType)(implicit nextId : () => Int) : Type = dt match { 38 | case st : StructType => structType(st) 39 | case mt : MapType => mapType(mt) 40 | case at : ArrayType => arrType(at) 41 | case at : AtomicType => atomicType(at) 42 | case _ => 43 | throw new UnsupportedOperationException(s"Spark datatype ${dt.toString} is not supported") 44 | } 45 | 46 | def structType(st : StructType)(implicit nextId : () => Int) : Type = { 47 | import scala.collection.JavaConversions._ 48 | 49 | val iceFields : java.util.List[Types.NestedField] = ( 50 | for( (field, i) <- st.fields.zipWithIndex) yield { 51 | 52 | val (id, nm, typ) = (nextId(), field.name, toIcebergType(field.dataType)) 53 | if (field.nullable) { 54 | Types.NestedField.optional(id, nm, typ) 55 | } else Types.NestedField.required(id, nm, typ) 56 | 57 | }).toList 58 | 59 | Types.StructType.of(iceFields) 60 | } 61 | 62 | def mapType(mT : MapType)(implicit nextId : () => Int) : Type = { 63 | 64 | val (kId,vId,kT, vT) = 65 | (nextId(), nextId(), toIcebergType(mT.keyType), toIcebergType(mT.valueType)) 66 | 67 | if (mT.valueContainsNull) { 68 | Types.MapType.ofOptional(kId, vId, kT, vT) 69 | } 70 | else Types.MapType.ofRequired(kId,vId,kT, vT) 71 | } 72 | 73 | def arrType(aT : ArrayType)(implicit nextId : () => Int) : Type = { 74 | 75 | val (eId,eT) = (nextId(), toIcebergType(aT.elementType)) 76 | 77 | if (aT.containsNull) { 78 | Types.ListType.ofOptional(eId, eT) 79 | } 80 | else Types.ListType.ofRequired(eId, eT) 81 | } 82 | 83 | 84 | def atomicType(aT: AtomicType): Type = aT match { 85 | case b : BooleanType => Types.BooleanType.get 86 | case i : IntegerType => Types.IntegerType.get 87 | case s : ShortType => Types.IntegerType.get 88 | case b : ByteType => Types.IntegerType.get 89 | case l : LongType => Types.LongType.get 90 | case f : FloatType => Types.FloatType.get 91 | case d : DoubleType => Types.DoubleType.get 92 | case s : StringType => Types.StringType.get 93 | case c : CharType => Types.StringType.get 94 | case c : VarcharType => Types.StringType.get 95 | case d : DateType => Types.DateType.get 96 | case t : TimestampType => Types.TimestampType.withZone 97 | case d : DecimalType => Types.DecimalType.of(d.precision, d.scale) 98 | case b : BinaryType => Types.BinaryType.get 99 | case _ => throw new UnsupportedOperationException("Not a supported type: " + aT.catalogString) 100 | } 101 | 102 | case class TypeMapping(sparkType : StructType, 103 | iceType : Types.StructType) { 104 | 105 | def findField(nm : String)(implicit sparkSession : SparkSession) : 106 | Option[(StructField, Types.NestedField)] = { 107 | (sparkType.fields.find { 108 | case f if sparkSession.sqlContext.conf.resolver(f.name, nm) => true 109 | case _ => false 110 | }).map(t => (t, iceType.field(t.name) )) 111 | } 112 | } 113 | 114 | } 115 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/iceberg/utils/utils.scala: 
-------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.iceberg 19 | 20 | import com.netflix.iceberg.hadoop.HadoopInputFile 21 | import com.netflix.iceberg.{DataFile, FileFormat, StructLike, Table} 22 | import org.apache.spark.sql.catalyst.catalog.SessionCatalog 23 | import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan 24 | import org.apache.spark.sql.catalyst.{InternalRow, TableIdentifier} 25 | import org.apache.spark.sql.execution.datasources.iceberg.PartitioningUtils 26 | import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat 27 | import org.apache.spark.sql.execution.datasources.{FileFormat => SparkFileFormat} 28 | import org.apache.spark.sql.types._ 29 | import org.apache.spark.sql._ 30 | import org.apache.spark.sql.catalyst.util.DateTimeUtils 31 | import org.apache.spark.unsafe.types.UTF8String 32 | 33 | import scala.util.Try 34 | 35 | trait PlanUtils { 36 | 37 | def dataFrame(lP: LogicalPlan)( 38 | implicit sqlContext: SQLContext): DataFrame = { 39 | Dataset.ofRows(sqlContext.sparkSession, lP) 40 | } 41 | 42 | // for testing 43 | def iceTableScanExec(df : DataFrame, 44 | strict : Boolean = true 45 | ) : Option[IceTableScanExec] = { 46 | val icScanO = (df.queryExecution.executedPlan find { 47 | case is : IceTableScanExec => true 48 | case _ => false 49 | }).map(_.asInstanceOf[IceTableScanExec]) 50 | 51 | if (!icScanO.isDefined & strict) { 52 | throw new AnalysisException(s"Failed to find a IceTableScan in given dataframe") 53 | } 54 | icScanO 55 | } 56 | 57 | // for testing 58 | def filesScanned(df : DataFrame, 59 | strict : Boolean = true 60 | ) : Seq[String] = { 61 | for( 62 | iScan <- iceTableScanExec(df,strict).toSeq; 63 | pdirs = iScan.showPartitionsScanned; 64 | pd <- pdirs; 65 | fs <- pd.files 66 | ) yield { 67 | fs.getPath.toString 68 | } 69 | } 70 | } 71 | 72 | trait ExprUtils { 73 | 74 | def isApproximateNumeric(dt: DataType) = dt.isInstanceOf[FractionalType] 75 | def isNumeric(dt: DataType) = NumericType.acceptsType(dt) 76 | 77 | } 78 | 79 | trait SessionUtils { 80 | 81 | def currentSparkSession: SparkSession = { 82 | var spkSessionO = SparkSession.getActiveSession 83 | if (!spkSessionO.isDefined) { 84 | spkSessionO = SparkSession.getDefaultSession 85 | } 86 | spkSessionO.getOrElse(???) 
87 | } 88 | 89 | def parseIdentifier(name: String, 90 | sparkSession: SparkSession): TableIdentifier = { 91 | sparkSession.sessionState.sqlParser.parseTableIdentifier(name) 92 | } 93 | 94 | def qualifiedTableIdentifier(tabId: TableIdentifier, 95 | sparkSession: SparkSession): TableIdentifier = { 96 | val catalog = sparkSession.sessionState.catalog 97 | 98 | val db = tabId.database.getOrElse(catalog.getCurrentDatabase) 99 | db match { 100 | case SessionCatalog.DEFAULT_DATABASE => TableIdentifier(tabId.table) 101 | case _ => TableIdentifier(tabId.table, Some(db)) 102 | } 103 | } 104 | 105 | def dataframe(rows: Seq[Row], 106 | schema: StructType 107 | )(implicit ss: SparkSession): DataFrame = { 108 | ss.createDataFrame( 109 | ss.sparkContext.parallelize(rows, 1), 110 | schema 111 | ) 112 | } 113 | 114 | def convertToEpoch(s : String) : Long = { 115 | Try { 116 | DateTimeUtils.stringToTimestamp(UTF8String.fromString(s)) 117 | .map(ts => DateTimeUtils.toMillis(ts)) 118 | .getOrElse(s.toLong) 119 | } getOrElse( 120 | throw new AnalysisException(s"Cannot intepret value '${s}' as a timestamp") 121 | ) 122 | } 123 | 124 | def convertToTimestampString(millis : Long) : String = { 125 | DateTimeUtils.timestampToString(millis * 1000) 126 | } 127 | } 128 | 129 | package object utils extends PlanUtils with ExprUtils with SessionUtils { 130 | 131 | type IcePSpec = com.netflix.iceberg.PartitionSpec 132 | type IceStruct = com.netflix.iceberg.StructLike 133 | type IceTable = com.netflix.iceberg.Table 134 | type IceType = com.netflix.iceberg.types.Type 135 | type IceSchema = com.netflix.iceberg.Schema 136 | type IceFormat = com.netflix.iceberg.FileFormat 137 | type IceMetrics = com.netflix.iceberg.Metrics 138 | type IceLiteral[T] = com.netflix.iceberg.expressions.Literal[T] 139 | 140 | class StructLikeInternalRow(val struct: StructType) extends StructLike { 141 | val fields : Array[StructField] = struct.fields 142 | val types : Array[DataType] = fields.map(_.dataType) 143 | 144 | private var row : InternalRow = null 145 | 146 | def setRow(row: InternalRow) : StructLikeInternalRow = { 147 | this.row = row 148 | this 149 | } 150 | 151 | override def size: Int = types.length 152 | 153 | @SuppressWarnings(Array("unchecked")) 154 | override def get[T](pos: Int, javaClass: Class[T]): T = types(pos) match { 155 | case StringType => row.getUTF8String(pos).toString.asInstanceOf[T] 156 | case _ => javaClass.cast(row.get(pos, types(pos))) 157 | } 158 | 159 | override def set[T](pos: Int, value: T): Unit = ??? 
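    // Note: set is deliberately left unimplemented (???). This wrapper only adapts Spark rows
    // so Iceberg can read them (for example as partition values), so mutation is not expected
    // to be exercised on this path.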
160 | } 161 | 162 | def toIceRow(sr : InternalRow, ss : StructType) : StructLike = { 163 | val r = new StructLikeInternalRow(ss) 164 | r.setRow(sr) 165 | r 166 | } 167 | 168 | def iceFormat(ffName : String) : IceFormat = { 169 | Try { 170 | com.netflix.iceberg.FileFormat.valueOf(ffName) 171 | }.getOrElse(com.netflix.iceberg.FileFormat.AVRO) 172 | } 173 | 174 | def partitionSpec(iceTable : Table) = iceTable.spec() 175 | 176 | def iceFormat(ff : SparkFileFormat) : FileFormat = { 177 | if (ff.isInstanceOf[ParquetFileFormat]) { 178 | FileFormat.PARQUET 179 | } else { 180 | FileFormat.AVRO 181 | } 182 | } 183 | 184 | def iceMetrics(inFile: HadoopInputFile, 185 | iceSchema: IceSchema, 186 | iceFmt : IceFormat 187 | ) : Option[IceMetrics] = iceFmt match { 188 | case com.netflix.iceberg.FileFormat.PARQUET => 189 | Some(ParquetMetrics.fromInputFile(inFile, iceSchema)) 190 | case _ => None 191 | } 192 | 193 | def icePartStruct(pathString : String, 194 | partSchema: StructType 195 | ) : IceStruct = { 196 | val iRow = PartitioningUtils.partitionRowFromFullPath(pathString, partSchema) 197 | toIceRow(iRow, partSchema) 198 | } 199 | 200 | } 201 | -------------------------------------------------------------------------------- /src/test/resources/store_sales/ss_sold_date_sk=0905245/part-00004-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbutani/icebergSQL/63c26ae00bb03bb7d05bfbeac4fe6787b019d505/src/test/resources/store_sales/ss_sold_date_sk=0905245/part-00004-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet -------------------------------------------------------------------------------- /src/test/resources/store_sales/ss_sold_date_sk=0905245/part-00010-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbutani/icebergSQL/63c26ae00bb03bb7d05bfbeac4fe6787b019d505/src/test/resources/store_sales/ss_sold_date_sk=0905245/part-00010-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet -------------------------------------------------------------------------------- /src/test/resources/store_sales/ss_sold_date_sk=0905245/part-00017-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbutani/icebergSQL/63c26ae00bb03bb7d05bfbeac4fe6787b019d505/src/test/resources/store_sales/ss_sold_date_sk=0905245/part-00017-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet -------------------------------------------------------------------------------- /src/test/resources/store_sales/ss_sold_date_sk=0905245/part-00022-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbutani/icebergSQL/63c26ae00bb03bb7d05bfbeac4fe6787b019d505/src/test/resources/store_sales/ss_sold_date_sk=0905245/part-00022-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet -------------------------------------------------------------------------------- /src/test/resources/store_sales/ss_sold_date_sk=0905245/part-00026-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/hbutani/icebergSQL/63c26ae00bb03bb7d05bfbeac4fe6787b019d505/src/test/resources/store_sales/ss_sold_date_sk=0905245/part-00026-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet -------------------------------------------------------------------------------- /src/test/resources/store_sales/ss_sold_date_sk=0906245/part-00016-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbutani/icebergSQL/63c26ae00bb03bb7d05bfbeac4fe6787b019d505/src/test/resources/store_sales/ss_sold_date_sk=0906245/part-00016-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet -------------------------------------------------------------------------------- /src/test/resources/store_sales/ss_sold_date_sk=0906245/part-00018-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbutani/icebergSQL/63c26ae00bb03bb7d05bfbeac4fe6787b019d505/src/test/resources/store_sales/ss_sold_date_sk=0906245/part-00018-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet -------------------------------------------------------------------------------- /src/test/resources/store_sales/ss_sold_date_sk=0906245/part-00020-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbutani/icebergSQL/63c26ae00bb03bb7d05bfbeac4fe6787b019d505/src/test/resources/store_sales/ss_sold_date_sk=0906245/part-00020-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet -------------------------------------------------------------------------------- /src/test/resources/store_sales/ss_sold_date_sk=0906245/part-00023-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbutani/icebergSQL/63c26ae00bb03bb7d05bfbeac4fe6787b019d505/src/test/resources/store_sales/ss_sold_date_sk=0906245/part-00023-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet -------------------------------------------------------------------------------- /src/test/resources/store_sales/ss_sold_date_sk=0906245/part-00029-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbutani/icebergSQL/63c26ae00bb03bb7d05bfbeac4fe6787b019d505/src/test/resources/store_sales/ss_sold_date_sk=0906245/part-00029-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet -------------------------------------------------------------------------------- /src/test/resources/store_sales/ss_sold_date_sk=0907245/part-00000-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbutani/icebergSQL/63c26ae00bb03bb7d05bfbeac4fe6787b019d505/src/test/resources/store_sales/ss_sold_date_sk=0907245/part-00000-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet -------------------------------------------------------------------------------- /src/test/resources/store_sales/ss_sold_date_sk=0907245/part-00003-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/hbutani/icebergSQL/63c26ae00bb03bb7d05bfbeac4fe6787b019d505/src/test/resources/store_sales/ss_sold_date_sk=0907245/part-00003-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet -------------------------------------------------------------------------------- /src/test/resources/store_sales/ss_sold_date_sk=0907245/part-00008-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbutani/icebergSQL/63c26ae00bb03bb7d05bfbeac4fe6787b019d505/src/test/resources/store_sales/ss_sold_date_sk=0907245/part-00008-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet -------------------------------------------------------------------------------- /src/test/resources/store_sales/ss_sold_date_sk=0907245/part-00012-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbutani/icebergSQL/63c26ae00bb03bb7d05bfbeac4fe6787b019d505/src/test/resources/store_sales/ss_sold_date_sk=0907245/part-00012-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet -------------------------------------------------------------------------------- /src/test/resources/store_sales/ss_sold_date_sk=0907245/part-00019-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbutani/icebergSQL/63c26ae00bb03bb7d05bfbeac4fe6787b019d505/src/test/resources/store_sales/ss_sold_date_sk=0907245/part-00019-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet -------------------------------------------------------------------------------- /src/test/resources/store_sales/ss_sold_date_sk=0908245/part-00005-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbutani/icebergSQL/63c26ae00bb03bb7d05bfbeac4fe6787b019d505/src/test/resources/store_sales/ss_sold_date_sk=0908245/part-00005-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet -------------------------------------------------------------------------------- /src/test/resources/store_sales/ss_sold_date_sk=0908245/part-00006-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbutani/icebergSQL/63c26ae00bb03bb7d05bfbeac4fe6787b019d505/src/test/resources/store_sales/ss_sold_date_sk=0908245/part-00006-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet -------------------------------------------------------------------------------- /src/test/resources/store_sales/ss_sold_date_sk=0908245/part-00014-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbutani/icebergSQL/63c26ae00bb03bb7d05bfbeac4fe6787b019d505/src/test/resources/store_sales/ss_sold_date_sk=0908245/part-00014-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet -------------------------------------------------------------------------------- /src/test/resources/store_sales/ss_sold_date_sk=0908245/part-00024-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/hbutani/icebergSQL/63c26ae00bb03bb7d05bfbeac4fe6787b019d505/src/test/resources/store_sales/ss_sold_date_sk=0908245/part-00024-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet -------------------------------------------------------------------------------- /src/test/resources/store_sales/ss_sold_date_sk=0908245/part-00025-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbutani/icebergSQL/63c26ae00bb03bb7d05bfbeac4fe6787b019d505/src/test/resources/store_sales/ss_sold_date_sk=0908245/part-00025-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet -------------------------------------------------------------------------------- /src/test/resources/store_sales/ss_sold_date_sk=0909245/part-00001-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbutani/icebergSQL/63c26ae00bb03bb7d05bfbeac4fe6787b019d505/src/test/resources/store_sales/ss_sold_date_sk=0909245/part-00001-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet -------------------------------------------------------------------------------- /src/test/resources/store_sales/ss_sold_date_sk=0909245/part-00002-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbutani/icebergSQL/63c26ae00bb03bb7d05bfbeac4fe6787b019d505/src/test/resources/store_sales/ss_sold_date_sk=0909245/part-00002-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet -------------------------------------------------------------------------------- /src/test/resources/store_sales/ss_sold_date_sk=0909245/part-00009-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbutani/icebergSQL/63c26ae00bb03bb7d05bfbeac4fe6787b019d505/src/test/resources/store_sales/ss_sold_date_sk=0909245/part-00009-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet -------------------------------------------------------------------------------- /src/test/resources/store_sales/ss_sold_date_sk=0909245/part-00013-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbutani/icebergSQL/63c26ae00bb03bb7d05bfbeac4fe6787b019d505/src/test/resources/store_sales/ss_sold_date_sk=0909245/part-00013-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet -------------------------------------------------------------------------------- /src/test/resources/store_sales/ss_sold_date_sk=0909245/part-00021-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbutani/icebergSQL/63c26ae00bb03bb7d05bfbeac4fe6787b019d505/src/test/resources/store_sales/ss_sold_date_sk=0909245/part-00021-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet -------------------------------------------------------------------------------- /src/test/resources/store_sales/ss_sold_date_sk=0910245/part-00007-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/hbutani/icebergSQL/63c26ae00bb03bb7d05bfbeac4fe6787b019d505/src/test/resources/store_sales/ss_sold_date_sk=0910245/part-00007-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet -------------------------------------------------------------------------------- /src/test/resources/store_sales/ss_sold_date_sk=0910245/part-00011-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbutani/icebergSQL/63c26ae00bb03bb7d05bfbeac4fe6787b019d505/src/test/resources/store_sales/ss_sold_date_sk=0910245/part-00011-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet -------------------------------------------------------------------------------- /src/test/resources/store_sales/ss_sold_date_sk=0910245/part-00015-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbutani/icebergSQL/63c26ae00bb03bb7d05bfbeac4fe6787b019d505/src/test/resources/store_sales/ss_sold_date_sk=0910245/part-00015-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet -------------------------------------------------------------------------------- /src/test/resources/store_sales/ss_sold_date_sk=0910245/part-00027-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbutani/icebergSQL/63c26ae00bb03bb7d05bfbeac4fe6787b019d505/src/test/resources/store_sales/ss_sold_date_sk=0910245/part-00027-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet -------------------------------------------------------------------------------- /src/test/resources/store_sales/ss_sold_date_sk=0910245/part-00028-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hbutani/icebergSQL/63c26ae00bb03bb7d05bfbeac4fe6787b019d505/src/test/resources/store_sales/ss_sold_date_sk=0910245/part-00028-eeb13bdd-e31e-4992-aa61-60c2c1729751.c000.snappy.parquet -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/sql/hive/test/iceberg/AbstractTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.apache.spark.sql.hive.test.iceberg 19 | 20 | import java.io.File 21 | import java.util.TimeZone 22 | 23 | import org.apache.spark.internal.Logging 24 | import org.apache.spark.sql.catalyst.plans.logical.LocalRelation 25 | import org.apache.spark.sql.iceberg.planning.SparkSessionExtensions 26 | import org.apache.spark.sql.types.DataType 27 | import org.apache.spark.sql.{DataFrame, Row} 28 | import org.joda.time.DateTimeZone 29 | import org.scalatest.{BeforeAndAfterAll, fixture} 30 | import org.apache.spark.sql.iceberg.utils 31 | 32 | abstract class AbstractTest extends fixture.FunSuite 33 | with fixture.TestDataFixture with BeforeAndAfterAll with TestTables with Logging { 34 | 35 | override def beforeAll() = { 36 | println("*** Starting TestCase " ++ this.toString() ) 37 | // System.setProperty("user.timezone", "UTC") 38 | // TimeZone.setDefault(TimeZone.getTimeZone("UTC")) 39 | // DateTimeZone.setDefault(DateTimeZone.forID("UTC")) 40 | 41 | new SparkSessionExtensions()(TestIcebergHive.sparkSession.extensions) 42 | 43 | TestIcebergHive.sparkContext.setLogLevel("ERROR") 44 | TestIcebergHive.setConf("spark.sql.files.openCostInBytes", (128 * 1024 * 1024).toString) 45 | TestIcebergHive.setConf("spark.sql.files.maxPartitionBytes", (16 * 1024 * 1024).toString) 46 | 47 | setupStoreSalesTables 48 | 49 | } 50 | 51 | def result(df: DataFrame): Array[Row] = { 52 | df.collect() 53 | } 54 | 55 | def test(nm: String, sql: String, 56 | showPlan: Boolean = false, 57 | showResults: Boolean = false, 58 | setupSQL: Option[(String, String)] = None): Unit = { 59 | test(nm) { td => 60 | println("*** *** Running Test " ++ nm) 61 | 62 | try { 63 | 64 | for((s,e) <- setupSQL) { 65 | TestIcebergHive.sql(s) 66 | } 67 | 68 | try { 69 | val df = sqlAndLog(nm, sql) 70 | if (showPlan) { 71 | logPlan(nm, df) 72 | } 73 | if (showResults) { 74 | df.show(20, false) 75 | } 76 | } finally { 77 | } 78 | } finally { 79 | for((s,e) <- setupSQL) { 80 | TestIcebergHive.sql(e) 81 | } 82 | } 83 | } 84 | } 85 | 86 | def cTest(nm: String, 87 | dsql: String, 88 | bsql: String, 89 | devAllowedInAproxNumeric: Double = 0.0): Unit = { 90 | test(nm) { td => 91 | println("*** *** Running Correctness Test " ++ nm) 92 | 93 | try { 94 | val df1 = sqlAndLog(nm, dsql) 95 | val df2 = sqlAndLog(nm, bsql) 96 | assert(isTwoDataFrameEqual(df1, df2, devAllowedInAproxNumeric)) 97 | } finally { 98 | } 99 | } 100 | } 101 | 102 | def sqlAndLog(nm: String, sqlStr: String): DataFrame = { 103 | logDebug(s"\n$nm SQL:\n" + sqlStr) 104 | TestIcebergHive.sql(sqlStr) 105 | } 106 | 107 | def logPlan(nm: String, df: DataFrame): Unit = { 108 | logInfo(s"\n$nm Plan:") 109 | logInfo(s"\nLogical Plan:\n" + df.queryExecution.logical.toString) 110 | logInfo(s"\nOptimized Plan:\n" + df.queryExecution.optimizedPlan.toString) 111 | logInfo(s"\nPhysical Plan:\n" + df.queryExecution.sparkPlan.toString) 112 | } 113 | 114 | def roundValue(chooseRounding: Boolean, v: Any, dt: DataType): Any = { 115 | if (chooseRounding && v != null && 116 | utils.isNumeric(dt) && 117 | !Set[Any](Double.PositiveInfinity, Double.NegativeInfinity, 118 | Float.PositiveInfinity, Float.NegativeInfinity).contains(v) 119 | ) { 120 | BigDecimal(v.toString).setScale(1, BigDecimal.RoundingMode.HALF_UP).toDouble 121 | } else if (v == Float.PositiveInfinity){ 122 | Double.PositiveInfinity 123 | } else if (v == Float.NegativeInfinity){ 124 | Double.NegativeInfinity 125 | } else { 126 | v 127 | } 128 | } 129 | 130 | def isTwoDataFrameEqual(df1: DataFrame, 131 | df2: DataFrame, 132 | 
devAllowedInAproxNumeric: Double, 133 | Sorted: Boolean = false, 134 | chooseRounding: Boolean = true): Boolean = { 135 | 136 | 137 | if (df1.schema != df2.schema) { 138 | logWarning( 139 | s""" 140 | |different schemas issue: 141 | | df1 schema = ${df1.schema} 142 | | df2.schema = ${df2.schema} 143 | """.stripMargin) 144 | // return false 145 | } 146 | 147 | import collection.JavaConversions._ 148 | 149 | var df11 = df1 150 | var df21 = df2 151 | 152 | var df1_ilist = df11.queryExecution.executedPlan.executeCollect() 153 | var df2_ilist = df21.queryExecution.executedPlan.executeCollect() 154 | 155 | 156 | if (!Sorted && df1_ilist.size > 1) { 157 | 158 | val sortCols = df11.columns 159 | 160 | df1_ilist = { 161 | df11 = utils.dataFrame( 162 | LocalRelation(df11.queryExecution.optimizedPlan.output, df1_ilist) 163 | )(TestIcebergHive.sparkSession.sqlContext) 164 | df11 = df11.sort(sortCols.head, sortCols.tail:_*). 165 | select(sortCols.head, sortCols.tail:_*) 166 | df11.queryExecution.executedPlan.executeCollect() 167 | } 168 | 169 | df2_ilist = { 170 | df21 = utils.dataFrame( 171 | LocalRelation(df21.queryExecution.optimizedPlan.output, df2_ilist) 172 | )(TestIcebergHive.sparkSession.sqlContext) 173 | df21 = df21.sort(sortCols.head, sortCols.tail:_*). 174 | select(sortCols.head, sortCols.tail:_*) 175 | df21.queryExecution.executedPlan.executeCollect() 176 | } 177 | } 178 | 179 | val df1_count = df1_ilist.size 180 | val df2_count = df2_ilist.size 181 | if (df1_count != df2_count) { 182 | println(df1_count + "\t" + df2_count) 183 | println("The row counts are not equal") 184 | println(s"""df1=\n${df1_ilist.mkString("\n")}\ndf2=\n ${df2_ilist.mkString("\n")}""") 185 | return false 186 | } 187 | 188 | for (i <- 0 to df1_count.toInt - 1) { 189 | for (j <- 0 to df1.columns.size - 1) { 190 | val res1 = roundValue(chooseRounding, 191 | df1_ilist(i).get(j, df11.schema(j).dataType), 192 | df11.schema(j).dataType) 193 | val res2 = roundValue(chooseRounding, 194 | df2_ilist(i).get(j, df21.schema(j).dataType), 195 | df21.schema(j).dataType) 196 | // tolerate a null on one side when the other side produced a sentinel value (min/max, +/-infinity or 0) 197 | if (res2 == null && res1 != null) { 198 | if (!Set(Int.MaxValue, Int.MinValue, 199 | Long.MaxValue, Long.MinValue, 200 | Double.PositiveInfinity, Double.NegativeInfinity, 201 | Float.PositiveInfinity, Float.NegativeInfinity, 202 | 0).contains(res1)) { 203 | println(s"values in row $i, column $j don't match: ${res1} != ${res2}") 204 | println(s"""df1=\n${df1_ilist.mkString("\n")}\ndf2=\n ${df2_ilist.mkString("\n")}""") 205 | return false 206 | } 207 | } else if ((utils.isApproximateNumeric(df1.schema(j).dataType) && 208 | (Math.abs(res1.asInstanceOf[Double] - res2.asInstanceOf[Double]) > 209 | devAllowedInAproxNumeric)) || 210 | (!utils.isApproximateNumeric(df1.schema(j).dataType) && res1 != res2)) { 211 | println(s"values in row $i, column $j don't match: ${res1} != ${res2}") 212 | println(s"""df1=\n${df1_ilist.mkString("\n")}\ndf2=\n ${df2_ilist.mkString("\n")}""") 213 | return false 214 | } 215 | } 216 | } 217 | logDebug("The two DataFrames are equal, row count = " + df1_count) 218 | // println(df1_list.mkString("", "\n", "")) 219 | return true 220 | } 221 | 222 | def delete(f : File) : Unit = { 223 | if (f.exists()) { 224 | if (f.isDirectory) { 225 | f.listFiles().foreach(delete(_)) 226 | f.delete() 227 | } else { 228 | f.delete() 229 | } 230 | } 231 | } 232 | 233 | 234 | } 235 | --------------------------------------------------------------------------------
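For orientation, the following is a small, hypothetical suite (ExampleUsageTest and its SQL strings are illustrative and not files in this repository) sketching how the `test` and `cTest` helpers defined in AbstractTest above are intended to be used against the store_sales fixture created by TestTables:

package org.apache.spark.sql.hive.test.iceberg

// Hypothetical usage sketch only -- not part of the repository.
class ExampleUsageTest extends AbstractTest {

  // `test(nm, sql, ...)` registers a test that runs one SQL statement,
  // optionally logging the plan and printing results; setupSQL can carry an
  // optional (setup, teardown) SQL pair run before and after the query.
  test("count store_sales",
    "select count(*) from store_sales",
    showPlan = true,
    showResults = true)

  // `cTest` runs two queries and asserts the result sets match via
  // isTwoDataFrameEqual, within the given tolerance for approximate numerics.
  // Here the same deterministic query is used on both sides purely to show the mechanics.
  cTest("store_sales grouped counts are stable",
    dsql = "select ss_sold_date_sk, count(*) from store_sales group by ss_sold_date_sk",
    bsql = "select ss_sold_date_sk, count(*) from store_sales group by ss_sold_date_sk",
    devAllowedInAproxNumeric = 0.0)
}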
/src/test/scala/org/apache/spark/sql/hive/test/iceberg/BasicCreateAndInsertTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.hive.test.iceberg 19 | 20 | import java.util.Date 21 | 22 | import org.apache.spark.sql.DataFrame 23 | import org.apache.spark.sql.iceberg.utils 24 | import org.apache.spark.sql.iceberg.utils.TableUtils 25 | 26 | class BasicCreateAndInsertTest extends AbstractTest { 27 | 28 | def showSnapShots(table: String): Unit = { 29 | val sqlText = "select * from `" + 30 | table + TableUtils.SNAPSHOTSVIEW_SUFFIX + 31 | "`" 32 | println(sqlText) 33 | TestIcebergHive.sql(sqlText). 34 | show(10000, false) 35 | } 36 | 37 | val allColumns: String = " * " 38 | val excludeSoldDate: String = "ss_sold_time_sk,ss_item_sk,ss_customer_sk,ss_cdemo_sk," + 39 | "ss_hdemo_sk,ss_addr_sk,ss_store_sk,ss_promo_sk,ss_quantity,ss_wholesale_cost," + 40 | "ss_list_price,ss_sales_price,ss_ext_sales_price,ss_sold_month,ss_sold_day" 41 | 42 | def _insertStatement(insClause: String, 43 | toTable: String, 44 | fromTable: String, 45 | partitionPredicate: String, 46 | sourcePredicate: String, 47 | showSnapShot: Boolean 48 | ): Unit = { 49 | 50 | val partSpec = if (partitionPredicate != null) { 51 | s"partition ( ${partitionPredicate} )" 52 | } else "" 53 | 54 | val whereClause = (partitionPredicate, sourcePredicate) match { 55 | case (null, null) => "" 56 | case (p, null) => s"where ${p} " 57 | case (null, s) => s"where ${s} " 58 | case (p, s) => s"where ${p} and ${s} " 59 | } 60 | 61 | val selList = if (partitionPredicate != null) { 62 | excludeSoldDate 63 | } else allColumns 64 | 65 | val insStat = 66 | s""" 67 | |${insClause} ${toTable} ${partSpec} 68 | |select ${selList} from ${fromTable} 69 | |${whereClause} 70 | """.stripMargin 71 | 72 | println(insStat) 73 | 74 | TestIcebergHive.sql(insStat) 75 | if (showSnapShot) { 76 | showSnapShots(toTable) 77 | } 78 | 79 | } 80 | 81 | def insert(toTable: String, 82 | fromTable: String, 83 | partitionPredicate: String = null, 84 | sourcePredicate: String = null, 85 | showSnapShot: Boolean = true 86 | ): Unit = { 87 | _insertStatement( 88 | "insert into ", 89 | toTable, 90 | fromTable, 91 | partitionPredicate, 92 | sourcePredicate, 93 | showSnapShot 94 | ) 95 | } 96 | 97 | def insertOverwrite(toTable: String, 98 | fromTable: String, 99 | partitionPredicate: String = null, 100 | sourcePredicate: String = null, 101 | showSnapShot: Boolean = true 102 | ): Unit = { 103 | 104 | _insertStatement( 105 | "insert overwrite table ", 106 | toTable, 107 | fromTable, 108 | partitionPredicate, 109 | sourcePredicate, 110 | showSnapShot 111 | ) 112 | 113 | } 114 |
115 | def asOf(tStr : String, 116 | tblNm : String 117 | ) : Unit = { 118 | 119 | val sqlText = 120 | s""" 121 | |as of '${tStr}' 122 | |select count(*) from ${tblNm} 123 | |""".stripMargin 124 | 125 | println(sqlText) 126 | TestIcebergHive.sql(sqlText). 127 | show(10000, false) 128 | 129 | } 130 | 131 | test("test1") { td => 132 | 133 | var df: DataFrame = null 134 | 135 | println("Initially no snapShots:") 136 | showSnapShots("store_sales_out") 137 | 138 | println("SnapShot on an insert: 30 files added") 139 | insert("store_sales_out", "store_sales") 140 | TestIcebergHive.sql("select count(*) from store_sales_out"). 141 | show(10000, false) 142 | 143 | val timeAtFirstInsert = utils.convertToTimestampString(System.currentTimeMillis()) 144 | // wait 10 ms, so we can demo AS OF queries against the first insert 145 | Thread.sleep(10) 146 | 147 | // a query with predicate ss_sold_date_sk='0906245' 148 | // has predicate ss_sold_month='09' added 149 | println("Query with ss_sold_date_sk='0906245' has predicate ss_sold_month='09' added") 150 | df = TestIcebergHive.sql( 151 | "select count(*) from store_sales_out where ss_sold_date_sk='0906245'") 152 | assert( 153 | utils.iceTableScanExec(df).get.getIceFilter.toString. 154 | contains("""ref(name="ss_sold_month") == "09"""") 155 | ) 156 | df.show(1000, false) 157 | 158 | println("SnapShot on another insert: 30 files added again") 159 | insert("store_sales_out", "store_sales") 160 | 161 | TestIcebergHive.sql("select count(*) from store_sales_out"). 162 | show(10000, false) 163 | 164 | asOf(timeAtFirstInsert, "store_sales_out") 165 | 166 | println("SnapShot on an insert overwrite: 30 files added, 60 files deleted") 167 | insertOverwrite("store_sales_out", "store_sales") 168 | TestIcebergHive.sql("select count(*) from store_sales_out"). 169 | show(10000, false) 170 | 171 | println("SnapShot on an insert overwrite of 1 partition: 5 files added, 5 files deleted") 172 | insertOverwrite("store_sales_out", "store_sales", "ss_sold_date_sk='0906245'") 173 | TestIcebergHive.sql("select count(*) from store_sales_out"). 174 | show(10000, false) 175 | 176 | 177 | println("SnapShot on an insert overwrite of 1 partition, with source predicate:" + 178 | " 5 files added, 5 files deleted") 179 | insertOverwrite("store_sales_out", "store_sales", 180 | "ss_sold_date_sk='0905245'", "ss_item_sk < 5000") 181 | TestIcebergHive.sql("select count(*) from store_sales_out").
182 | show(10000, false) 183 | asOf(timeAtFirstInsert, "store_sales_out") 184 | 185 | // now a query with ss_item_sk > 5000 on ss_sold_date_sk=0905245 should be a null scan 186 | println("now a query with ss_item_sk > 5000 on ss_sold_date_sk=0905245 should be a null scan") 187 | df = TestIcebergHive.sql( 188 | "select * from store_sales_out where ss_item_sk > 5000 and ss_sold_date_sk='0905245'") 189 | assert(utils.filesScanned(df).size == 0) 190 | df.show(10000, false) 191 | 192 | println("Test for fix Issue 5: DelegatedMethod0 is not serializable") 193 | df = TestIcebergHive.sql( 194 | """ 195 | |select * 196 | |from store_sales_out a 197 | | join store_sales b on a.ss_item_sk=b.ss_item_sk 198 | |where a.ss_item_sk=17908 199 | |""".stripMargin 200 | ) 201 | df.show(10000, false) 202 | } 203 | 204 | } 205 | -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/sql/hive/test/iceberg/TestIcebergHive.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.hive.test.iceberg 19 | 20 | import org.apache.spark.sql.catalyst.optimizer.ConvertToLocalRelation 21 | import org.apache.spark.{SparkConf, SparkContext} 22 | import org.apache.spark.sql.hive.test.TestHiveContext 23 | import org.apache.spark.sql.internal.SQLConf 24 | 25 | object TestIcebergHive extends TestHiveContext( 26 | new SparkContext( 27 | System.getProperty("spark.sql.test.master", "local[1]"), 28 | "TestSQLContext", 29 | new SparkConf() 30 | .set("spark.sql.test", "") 31 | .set(SQLConf.CODEGEN_FALLBACK.key, "false") 32 | .set("spark.sql.hive.metastore.barrierPrefixes", 33 | "org.apache.spark.sql.hive.execution.PairSerDe") 34 | .set("spark.sql.warehouse.dir", TestHiveContext.makeWarehouseDir().toURI.getPath) 35 | // SPARK-8910 36 | .set("spark.ui.enabled", "false") 37 | .set("spark.unsafe.exceptionOnMemoryLeak", "true") 38 | // Disable ConvertToLocalRelation for better test coverage. Test cases built on 39 | // LocalRelation will exercise the optimization rules better by disabling it as 40 | // this rule may potentially block testing of other optimization rules such as 41 | // ConstantPropagation etc. 
42 | .set(SQLConf.OPTIMIZER_EXCLUDED_RULES.key, ConvertToLocalRelation.ruleName) 43 | .set("spark.sql.extensions", "org.apache.spark.sql.iceberg.planning.SparkSessionExtensions") 44 | ), false 45 | ) -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/sql/hive/test/iceberg/TestTables.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.hive.test.iceberg 19 | 20 | import java.io.File 21 | import java.nio.file.Files 22 | 23 | trait TestTables { 24 | self : AbstractTest => 25 | 26 | val data_folder = new File("src/test/resources").getAbsoluteFile 27 | val data_path = data_folder.getAbsolutePath 28 | 29 | val store_sales_out_folder = new File(data_folder, "store_sales_out") 30 | 31 | 32 | def createStoreSalesTable(tName : String, 33 | tFolder : String, 34 | tableManagement : Boolean = false, 35 | tableTransforms : Seq[String] = Seq.empty 36 | ) : Unit = { 37 | 38 | val transformOption = if (tableManagement && tableTransforms.nonEmpty) { 39 | s""", 40 | |columnDependencies "${tableTransforms.mkString(",")}" 41 | |""".stripMargin 42 | } else "" 43 | 44 | TestIcebergHive.sql( 45 | s"""create table if not exists ${tName} 46 | |( 47 | | ss_sold_time_sk int, 48 | | ss_item_sk int, 49 | | ss_customer_sk int, 50 | | ss_cdemo_sk int, 51 | | ss_hdemo_sk int, 52 | | ss_addr_sk int, 53 | | ss_store_sk int, 54 | | ss_promo_sk int, 55 | | ss_quantity int, 56 | | ss_wholesale_cost decimal(7,2), 57 | | ss_list_price decimal(7,2), 58 | | ss_sales_price decimal(7,2), 59 | | ss_ext_sales_price decimal(7,2), 60 | | ss_sold_month string, 61 | | ss_sold_day string, 62 | | ss_sold_date_sk string 63 | | 64 | |) 65 | |USING parquet 66 | |OPTIONS ( 67 | |path "$data_path/$tFolder", 68 | |addTableManagement "${tableManagement}" ${transformOption} 69 | |) 70 | |partitioned by (ss_sold_date_sk)""".stripMargin 71 | ) 72 | TestIcebergHive.sql(s"msck repair table ${tName}") 73 | } 74 | 75 | def setupStoreSalesTables : Unit = { 76 | 77 | delete(store_sales_out_folder) 78 | Files.createDirectories(store_sales_out_folder.toPath) 79 | createStoreSalesTable("store_sales_out", 80 | "store_sales_out", 81 | true, 82 | Seq("ss_sold_date_sk=ss_sold_month:truncate[2]", "ss_sold_date_sk=ss_sold_day:truncate[4]") 83 | ) 84 | 85 | createStoreSalesTable("store_sales", "store_sales") 86 | } 87 | 88 | 89 | } 90 | --------------------------------------------------------------------------------
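As a closing illustration, here is a hypothetical extension of the TestTables trait (ExtraTestTables, store_sales_monthly and setupMonthlyTable are invented names, not files in this repository) sketching how createStoreSalesTable could register another Iceberg-managed table with a different columnDependencies transform; it assumes the same truncate-based transform syntax used in setupStoreSalesTables above.

package org.apache.spark.sql.hive.test.iceberg

import java.io.File
import java.nio.file.Files

// Hypothetical sketch only -- not part of the repository.
trait ExtraTestTables extends TestTables {
  self: AbstractTest =>

  val store_sales_monthly_folder = new File(data_folder, "store_sales_monthly")

  def setupMonthlyTable(): Unit = {
    // Start from an empty target folder, mirroring setupStoreSalesTables.
    delete(store_sales_monthly_folder)
    Files.createDirectories(store_sales_monthly_folder.toPath)

    // tableManagement = true turns on Iceberg table management for the new table;
    // the single transform declares that ss_sold_month can be derived from
    // ss_sold_date_sk by truncating it to 2 characters, as in the setup above.
    createStoreSalesTable(
      "store_sales_monthly",
      "store_sales_monthly",
      tableManagement = true,
      tableTransforms = Seq("ss_sold_date_sk=ss_sold_month:truncate[2]")
    )
  }
}

A concrete suite would mix this trait in (it still requires AbstractTest via the self-type) and call setupMonthlyTable from beforeAll, alongside the existing setupStoreSalesTables.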