├── tests
│   ├── __init__.py
│   ├── constants.py
│   ├── input_data
│   │   ├── receipt.parquet
│   │   ├── running-example.parquet
│   │   ├── receipt
│   │   │   ├── @@partitioning=0
│   │   │   │   └── 9c8faa65a8bf4e3398a0143ccbf91002.parquet
│   │   │   ├── @@partitioning=1
│   │   │   │   └── 5fae4bc2bbbe4166b609bd1e4ee26e40.parquet
│   │   │   ├── @@partitioning=10
│   │   │   │   └── 2af530ddd33a4239b3e25d1c0cc1388d.parquet
│   │   │   ├── @@partitioning=100
│   │   │   │   └── 3cfd18de55814ca28210b1b1ee5f1a13.parquet
│   │   │   ├── @@partitioning=101
│   │   │   │   └── 5f83fdd1c6804aaf90be72e3e079eb84.parquet
│   │   │   ├── @@partitioning=102
│   │   │   │   └── c5d4b5fa7e2248d4b0a45c3013ec41ac.parquet
│   │   │   ├── @@partitioning=103
│   │   │   │   └── 77f1a134807c410f9969687cd8070722.parquet
│   │   │   ├── @@partitioning=104
│   │   │   │   └── 8eb240bdd2d34fc6b179ebaf568070ef.parquet
│   │   │   ├── @@partitioning=105
│   │   │   │   └── 92867c1d04884232a2c5333f3a5e5259.parquet
│   │   │   ├── @@partitioning=106
│   │   │   │   └── a3176abb78284217a893ca7a1eccbc7e.parquet
│   │   │   ├── @@partitioning=107
│   │   │   │   └── 9a5fab8daee14ccca9fa05a3c41efa13.parquet
│   │   │   ├── @@partitioning=108
│   │   │   │   └── c8cfffc751a54cb39037159b4e0059ec.parquet
│   │   │   ├── @@partitioning=109
│   │   │   │   └── 80ba8c72141a4d5dbbe4272446379ffb.parquet
│   │   │   ├── @@partitioning=11
│   │   │   │   └── 3fc4b68a53dd4442a57af05c5fd05e7e.parquet
│   │   │   ├── @@partitioning=110
│   │   │   │   └── ee1e7eea73ed408a8766600ed69b68ce.parquet
│   │   │   ├── @@partitioning=111
│   │   │   │   └── e3e73d910e234359a1fde9167e612e52.parquet
│   │   │   ├── @@partitioning=112
│   │   │   │   └── a5f765fc7a7a40e083e99dd4e508b5ff.parquet
│   │   │   ├── @@partitioning=113
│   │   │   │   └── 27f98cfc4fae432ca5da83b8dc2e7012.parquet
│   │   │   ├── @@partitioning=114
│   │   │   │   └── 5b8ea367199143e3a2813314e802902c.parquet
│   │   │   ├── @@partitioning=115
│   │   │   │   └── ffcbfd9354094dd59e049abfef584f79.parquet
│   │   │   ├── @@partitioning=116
│   │   │   │   └── 86a3dda83b37495e8cc192b1f20c6fc7.parquet
│   │   │   ├── @@partitioning=117
│   │   │   │   └── 3a638945265c49a4909b4eaf8147b5dd.parquet
│   │   │   ├── @@partitioning=118
│   │   │   │   └── ecc09fc5afa04b94b462d021c4c5eb7f.parquet
│   │   │   ├── @@partitioning=119
│   │   │   │   └── 2ae73a13137c4965a432f8caa89bf51c.parquet
│   │   │   ├── @@partitioning=12
│   │   │   │   └── d5320e0ac9a547d58c6dee3b5aaf03b8.parquet
│   │   │   ├── @@partitioning=120
│   │   │   │   └── 16c4ac329c644b1388c0010f5578dd88.parquet
│   │   │   ├── @@partitioning=121
│   │   │   │   └── 0e6ae36fc31b4bef96ea22ff3ef3d333.parquet
│   │   │   ├── @@partitioning=122
│   │   │   │   └── ae940c9cd5a3401baf7f90ac2eff8f77.parquet
│   │   │   ├── @@partitioning=123
│   │   │   │   └── c7c7d9179cc341b39ea57abe756e5c44.parquet
│   │   │   ├── @@partitioning=124
│   │   │   │   └── 85c5945eab66433e859898bea8b27870.parquet
│   │   │   ├── @@partitioning=125
│   │   │   │   └── 7b45f583130e4787bcecf21d4e2f3916.parquet
│   │   │   ├── @@partitioning=126
│   │   │   │   └── 141d2b08a1c14633a747a2b02be325b7.parquet
│   │   │   ├── @@partitioning=127
│   │   │   │   └── 70b3cb2e622641af8361a5820ed05270.parquet
│   │   │   ├── @@partitioning=13
│   │   │   │   └── 56fa658c37e64286b108c9b4f33362c9.parquet
│   │   │   ├── @@partitioning=14
│   │   │   │   └── a1ce0e3591ba441891d49702f68e4b03.parquet
│   │   │   ├── @@partitioning=15
│   │   │   │   └── a095226766ef49398fa0351dc36d7bfb.parquet
│   │   │   ├── @@partitioning=16
│   │   │   │   └── d021cca539ff4e9bbe4bdb20fc15be6b.parquet
│   │   │   ├── @@partitioning=17
│   │   │   │   └── ce3b54422119424eb9ebdd2eebd19420.parquet
│   │   │   ├── @@partitioning=18
│   │   │   │   └── 85f8b9ca6f1845cfb1d7d04c3b31a08e.parquet
│   │   │   ├── @@partitioning=19
│   │   │   │   └── 442c9f75853d41e1986407827db2b5ec.parquet
│   │   │   ├── @@partitioning=2
│   │   │   │   └── 9d33d9e15fa54b398d3d7ba90ba561ef.parquet
│   │   │   ├── @@partitioning=20
│   │   │   │   └── a54b66b9a8d348edaf522e08e981dd8d.parquet
│   │   │   ├── @@partitioning=21
│   │   │   │   └── 7962182880ed437d9b2636bb9f0c36e1.parquet
│   │   │   ├── @@partitioning=22
│   │   │   │   └── 956efb04488c42b59fb1e47b9c58981c.parquet
│   │   │   ├── @@partitioning=23
│   │   │   │   └── 16cb1b8b464d46baafa51c36d558ffa4.parquet
│   │   │   ├── @@partitioning=24
│   │   │   │   └── d6da9e22500842499eaca917854201cd.parquet
│   │   │   ├── @@partitioning=25
│   │   │   │   └── 4570d89881ed42c08a41f7359c988892.parquet
│   │   │   ├── @@partitioning=26
│   │   │   │   └── da7095c4ba924cfd870eca452c2210a7.parquet
│   │   │   ├── @@partitioning=27
│   │   │   │   └── f10fd728cf4a4dd29ca578e6f5b7d163.parquet
│   │   │   ├── @@partitioning=28
│   │   │   │   └── d70798504e6e49738414f91c2c2a0b07.parquet
│   │   │   ├── @@partitioning=29
│   │   │   │   └── d8234dd8cc4b4460989ce7cd171a393b.parquet
│   │   │   ├── @@partitioning=3
│   │   │   │   └── 367b71119b144bb889b2c8b21147ea63.parquet
│   │   │   ├── @@partitioning=30
│   │   │   │   └── 1341a73cc9e640559d9a3232d573bb97.parquet
│   │   │   ├── @@partitioning=31
│   │   │   │   └── bedebc130a3c466fba78906bf3937b9f.parquet
│   │   │   ├── @@partitioning=32
│   │   │   │   └── 7998f9ce1b7d42589721eb1276107c26.parquet
│   │   │   ├── @@partitioning=33
│   │   │   │   └── 61aeb8def350485c9df448d656f023c8.parquet
│   │   │   ├── @@partitioning=34
│   │   │   │   └── 4cb8882347514d77b9fe1026f54ca57c.parquet
│   │   │   ├── @@partitioning=35
│   │   │   │   └── 359477e5e4c54c71a7b21d09a9380ecc.parquet
│   │   │   ├── @@partitioning=36
│   │   │   │   └── 98a5add33458417fbc332815298aca12.parquet
│   │   │   ├── @@partitioning=37
│   │   │   │   └── 4cf32527217d4a38b2b6122d5f251526.parquet
│   │   │   ├── @@partitioning=38
│   │   │   │   └── 2d329b5523d0457797ee2bd0b557adf1.parquet
│   │   │   ├── @@partitioning=39
│   │   │   │   └── e764ef673bdf4060bc5f617e60bd654c.parquet
│   │   │   ├── @@partitioning=4
│   │   │   │   └── edc1e19d80444f2589370b07c056530e.parquet
│   │   │   ├── @@partitioning=40
│   │   │   │   └── e3900574306440b89b69f32404ecb276.parquet
│   │   │   ├── @@partitioning=41
│   │   │   │   └── 04ec822e7c8e4954b3d205870ff71cc1.parquet
│   │   │   ├── @@partitioning=42
│   │   │   │   └── 437565a352bc4a60b7c805a35c22d5d0.parquet
│   │   │   ├── @@partitioning=43
│   │   │   │   └── 4ff5fc069830432998ffbdf2ea686bbe.parquet
│   │   │   ├── @@partitioning=44
│   │   │   │   └── 612dd93522824aa08d85b1a2bc95f640.parquet
│   │   │   ├── @@partitioning=45
│   │   │   │   └── e15ebcf7cd704c76a76cea624964bdcc.parquet
│   │   │   ├── @@partitioning=46
│   │   │   │   └── 35bbd6866c464676a0778738d51a52dd.parquet
│   │   │   ├── @@partitioning=47
│   │   │   │   └── f1a82c702e954b358488128476df877e.parquet
│   │   │   ├── @@partitioning=48
│   │   │   │   └── 0d23fd2c7ec74cefba3d85438920fd01.parquet
│   │   │   ├── @@partitioning=49
│   │   │   │   └── 36649b37fe534496ba6fb7d695f07685.parquet
│   │   │   ├── @@partitioning=5
│   │   │   │   └── 0cf85db0014c426faf4ab95c699faf3c.parquet
│   │   │   ├── @@partitioning=50
│   │   │   │   └── a1eadd40a1bc4777be42317886824b6a.parquet
│   │   │   ├── @@partitioning=51
│   │   │   │   └── 432e9c7e1fe3413699665cc0d190b8da.parquet
│   │   │   ├── @@partitioning=52
│   │   │   │   └── 25faadf083c944738c1c6b515cb484c4.parquet
│   │   │   ├── @@partitioning=53
│   │   │   │   └── 7eecd66696b3491bb88f55945e4eda79.parquet
│   │   │   ├── @@partitioning=54
│   │   │   │   └── c0fbc5af9d8e48eca911bdae3635f628.parquet
│   │   │   ├── @@partitioning=55
│   │   │   │   └── 03e70784390148219fa843c230f6145d.parquet
│   │   │   ├── @@partitioning=56
│   │   │   │   └── 446948113fc1472d8ebe12e9109146a6.parquet
│   │   │   ├── @@partitioning=57
│   │   │   │   └── 93095d5dcd574806a3cbcad084213536.parquet
│   │   │   ├── @@partitioning=58
│   │   │   │   └── a988de5f748a430087f63b0c42f0d0bb.parquet
│   │   │   ├── @@partitioning=59
│   │   │   │   └── 79e4ff740ce6425287e4bb319de86d10.parquet
│   │   │   ├── @@partitioning=6
│   │   │   │   └── 74d11133dfe64b47b26d60a4f8fc3fcb.parquet
│   │   │   ├── @@partitioning=60
│   │   │   │   └── 64f2c5cd7f2c41ff8c41c6983770f597.parquet
│   │   │   ├── @@partitioning=61
│   │   │   │   └── 176b2d6dd5a94507a9b8a97fb34b6448.parquet
│   │   │   ├── @@partitioning=62
│   │   │   │   └── dde9034be47d49cd9c2c9cf1d445f7ec.parquet
│   │   │   ├── @@partitioning=63
│   │   │   │   └── 5021e7fb3c104231a35faf20f51876cb.parquet
│   │   │   ├── @@partitioning=64
│   │   │   │   └── 1bdfebb4143c441c9b0009ee5acc1c03.parquet
│   │   │   ├── @@partitioning=65
│   │   │   │   └── 55625bbdd0f54667a6eb7e2fd6269847.parquet
│   │   │   ├── @@partitioning=66
│   │   │   │   └── 0e076c36a7004c9fbdae493a3fa003bf.parquet
│   │   │   ├── @@partitioning=67
│   │   │   │   └── e402d5bd805a4ed5bfc266b36fb695ef.parquet
│   │   │   ├── @@partitioning=68
│   │   │   │   └── 8f4608a274924f5eb4b388c5cc83843c.parquet
│   │   │   ├── @@partitioning=69
│   │   │   │   └── ef10536765264b4a91b84cd59ebb94f4.parquet
│   │   │   ├── @@partitioning=7
│   │   │   │   └── ddd5d4826f5649098345c92168eaacba.parquet
│   │   │   ├── @@partitioning=70
│   │   │   │   └── aaa24d7d4bb34ad2a91e24529ea0b539.parquet
│   │   │   ├── @@partitioning=71
│   │   │   │   └── 2ebbfd301f334541b6c70529f862e7c5.parquet
│   │   │   ├── @@partitioning=72
│   │   │   │   └── 02385bb7a79e4b61a6583adf9d1092fb.parquet
│   │   │   ├── @@partitioning=73
│   │   │   │   └── 60c5e48107494b5493ff6b258a3c01b0.parquet
│   │   │   ├── @@partitioning=74
│   │   │   │   └── e8e8e52bdbd448aea7ee599403f47694.parquet
│   │   │   ├── @@partitioning=75
│   │   │   │   └── 14fe0948605c447b9cde78a75e5ac4f5.parquet
│   │   │   ├── @@partitioning=76
│   │   │   │   └── c55ef0ee492a46cab562a9e4e9d7e0eb.parquet
│   │   │   ├── @@partitioning=77
│   │   │   │   └── 47cbfa7312ca43b494b434510de137b9.parquet
│   │   │   ├── @@partitioning=78
│   │   │   │   └── 438c11bff4dd4ccd9fe30162973847f7.parquet
│   │   │   ├── @@partitioning=79
│   │   │   │   └── 4da8267db40f47748911336a91f3ffb3.parquet
│   │   │   ├── @@partitioning=8
│   │   │   │   └── df050f729b464424b2aaa417bc94284d.parquet
│   │   │   ├── @@partitioning=80
│   │   │   │   └── 95fb4bd665c642de848c941eee38c499.parquet
│   │   │   ├── @@partitioning=81
│   │   │   │   └── d2f54e1aeec9411f85a063ed49456c80.parquet
│   │   │   ├── @@partitioning=82
│   │   │   │   └── f876e1a583a94201b7fdb6e6a99ee936.parquet
│   │   │   ├── @@partitioning=83
│   │   │   │   └── 2e48ca0da28c4cad9840184e4dbc69a0.parquet
│   │   │   ├── @@partitioning=84
│   │   │   │   └── e5d459b80f6f4bc8a6d27cabfb18e905.parquet
│   │   │   ├── @@partitioning=85
│   │   │   │   └── d39c9243df794592a35f5defc5e9cab2.parquet
│   │   │   ├── @@partitioning=86
│   │   │   │   └── 082bbff63aa14064b95e5ab8a55360ee.parquet
│   │   │   ├── @@partitioning=87
│   │   │   │   └── 952670c1d29844d7adb1c3c829e05d7b.parquet
│   │   │   ├── @@partitioning=88
│   │   │   │   └── 99dee8f7dccb43e281ad846122fddd44.parquet
│   │   │   ├── @@partitioning=89
│   │   │   │   └── 7fbf92b524db460ab887984a6d550245.parquet
│   │   │   ├── @@partitioning=9
│   │   │   │   └── f1d8c3c34acc4534bdf98f7404470e0b.parquet
│   │   │   ├── @@partitioning=90
│   │   │   │   └── 44d3684fb5d34557b6918f643069623c.parquet
│   │   │   ├── @@partitioning=91
│   │   │   │   └── a3e8d08f9fc3415e8125d3cf85cc8996.parquet
│   │   │   ├── @@partitioning=92
│   │   │   │   └── 8295e31735b14dcba827e0153d0b1ad0.parquet
│   │   │   ├── @@partitioning=93
│   │   │   │   └── 97cae9e1281b4179ad123142e4018e9a.parquet
│   │   │   ├── @@partitioning=94
│   │   │   │   └── 30e2a0ee8d484502aca9921d6fa403f8.parquet
│   │   │   ├── @@partitioning=95
│   │   │   │   └── 640aa952cbbf40ecbc36edb57796d2cd.parquet
│   │   │   ├── @@partitioning=96
│   │   │   │   └── 8211320dc69d4e3cbd0356fdea9539f8.parquet
│   │   │   ├── @@partitioning=97
│   │   │   │   └── e4032ee22725400d879d7608c9a739a3.parquet
│   │   │   ├── @@partitioning=98
│   │   │   │   └── a88b3e98a5a14ae4a67a94715f55714b.parquet
│   │   │   └── @@partitioning=99
│   │   │       └── 45e2d237bad1461ebf527c82f117696a.parquet
│   │   └── running-example.csv
│   ├── filtering_paths_test.py
│   ├── filtering_cases_test.py
│   ├── filtering_start_test.py
│   ├── filtering_ts_test.py
│   ├── parquet_export_test.py
│   ├── dfg_test.py
│   ├── filtering_end_test.py
│   ├── csv_import_test.py
│   ├── parquet_import_test.py
│   ├── filtering_attr_test.py
│   ├── filtering_var_test.py
│   └── test_output_data
│       ├── running-example_freq.svg
│       └── running-example_perf.svg
├── pm4pyspark
│   ├── importer
│   │   ├── constants.py
│   │   ├── csv
│   │   │   ├── __init__.py
│   │   │   └── spark_df_imp.py
│   │   ├── __init__.py
│   │   └── parquet
│   │       ├── __init__.py
│   │       └── spark_df_imp.py
│   ├── __init__.py
│   ├── exporter
│   │   ├── __init__.py
│   │   └── parquet
│   │       ├── __init__.py
│   │       └── spark_df_exp.py
│   └── algo
│       ├── __init__.py
│       ├── discovery
│       │   ├── __init__.py
│       │   └── dfg
│       │       ├── __init__.py
│       │       ├── df_statistics.py
│       │       └── factory.py
│       └── filtering
│           ├── cases
│           │   ├── __init__.py
│           │   └── cases_filter.py
│           ├── paths
│           │   ├── __init__.py
│           │   └── paths_filter.py
│           ├── variants
│           │   ├── __init__.py
│           │   └── variants_filter.py
│           ├── timestamp
│           │   ├── __init__.py
│           │   └── timestamp_filter.py
│           ├── attributes
│           │   ├── __init__.py
│           │   └── attributes_filter.py
│           ├── end_activities
│           │   ├── __init__.py
│           │   └── end_activities_filter.py
│           ├── start_activities
│           │   ├── __init__.py
│           │   └── start_activities_filter.py
│           └── __init__.py
├── README.md
└── .gitignore

/tests/__init__.py:
--------------------------------------------------------------------------------
1 | from tests import constants
2 |
--------------------------------------------------------------------------------
/pm4pyspark/importer/constants.py:
--------------------------------------------------------------------------------
1 | DEFAULT_NUM_PARTITION = 3
2 |
--------------------------------------------------------------------------------
/pm4pyspark/__init__.py:
--------------------------------------------------------------------------------
1 | from pm4pyspark import algo, importer, exporter
2 |
--------------------------------------------------------------------------------
/pm4pyspark/exporter/__init__.py:
--------------------------------------------------------------------------------
1 | from pm4pyspark.exporter import parquet
2 |
--------------------------------------------------------------------------------
/pm4pyspark/algo/__init__.py:
--------------------------------------------------------------------------------
1 | from pm4pyspark.algo import discovery, filtering
2 |
--------------------------------------------------------------------------------
/pm4pyspark/algo/discovery/__init__.py:
--------------------------------------------------------------------------------
1 | from pm4pyspark.algo.discovery import dfg
2 |
--------------------------------------------------------------------------------
/pm4pyspark/importer/csv/__init__.py:
--------------------------------------------------------------------------------
1 | from pm4pyspark.importer.csv import spark_df_imp
2 |
--------------------------------------------------------------------------------
/pm4pyspark/importer/__init__.py:
--------------------------------------------------------------------------------
1 | from pm4pyspark.importer import csv, parquet, constants
2 |
--------------------------------------------------------------------------------
/tests/constants.py:
--------------------------------------------------------------------------------
1 | INPUT_DATA_DIR = "input_data"
2 | OUTPUT_DATA_DIR = "test_output_data"
3 |
--------------------------------------------------------------------------------
/pm4pyspark/exporter/parquet/__init__.py:
--------------------------------------------------------------------------------
1 | from pm4pyspark.exporter.parquet import spark_df_exp
2 |
--------------------------------------------------------------------------------
/pm4pyspark/importer/parquet/__init__.py:
--------------------------------------------------------------------------------
1 | from pm4pyspark.importer.parquet import spark_df_imp
2 |
--------------------------------------------------------------------------------
/pm4pyspark/algo/filtering/cases/__init__.py:
--------------------------------------------------------------------------------
1 | from pm4pyspark.algo.filtering.cases import cases_filter
2 |
--------------------------------------------------------------------------------
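The two constants modules above are the only configuration visible in this chunk: `pm4pyspark/importer/constants.py` fixes a default partition count of 3, and `tests/constants.py` names the fixture directories. A hypothetical sketch (not taken from the repository's test modules, whose bodies are not shown here) of how such constants are typically turned into fixture paths:

```python
# Hypothetical illustration only: the test modules listed in the tree above are
# not shown here, so their actual path handling may differ.
import os

from tests import constants

# Resolve fixtures relative to the tests package, e.g. tests/input_data/running-example.csv
TESTS_DIR = os.path.dirname(os.path.abspath(__file__))
csv_fixture = os.path.join(TESTS_DIR, constants.INPUT_DATA_DIR, "running-example.csv")
svg_output = os.path.join(TESTS_DIR, constants.OUTPUT_DATA_DIR, "running-example_freq.svg")
```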
/pm4pyspark/algo/filtering/paths/__init__.py:
--------------------------------------------------------------------------------
1 | from pm4pyspark.algo.filtering.paths import paths_filter
2 |
--------------------------------------------------------------------------------
/pm4pyspark/algo/discovery/dfg/__init__.py:
--------------------------------------------------------------------------------
1 | from pm4pyspark.algo.discovery.dfg import df_statistics, factory
2 |
--------------------------------------------------------------------------------
/pm4pyspark/algo/filtering/variants/__init__.py:
--------------------------------------------------------------------------------
1 | from pm4pyspark.algo.filtering.variants import variants_filter
2 |
--------------------------------------------------------------------------------
/pm4pyspark/algo/filtering/timestamp/__init__.py:
--------------------------------------------------------------------------------
1 | from pm4pyspark.algo.filtering.timestamp import timestamp_filter
2 |
--------------------------------------------------------------------------------
/pm4pyspark/algo/filtering/attributes/__init__.py:
--------------------------------------------------------------------------------
1 | from pm4pyspark.algo.filtering.attributes import attributes_filter
2 |
--------------------------------------------------------------------------------
/pm4pyspark/algo/filtering/end_activities/__init__.py:
--------------------------------------------------------------------------------
1 | from pm4pyspark.algo.filtering.end_activities import end_activities_filter
2 |
--------------------------------------------------------------------------------
/tests/input_data/receipt.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt.parquet
--------------------------------------------------------------------------------
/pm4pyspark/algo/filtering/start_activities/__init__.py:
--------------------------------------------------------------------------------
1 | from pm4pyspark.algo.filtering.start_activities import start_activities_filter
2 |
--------------------------------------------------------------------------------
/tests/input_data/running-example.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/running-example.parquet
--------------------------------------------------------------------------------
/pm4pyspark/algo/filtering/__init__.py:
--------------------------------------------------------------------------------
1 | from pm4pyspark.algo.filtering import start_activities, end_activities, attributes, cases, \
2 |     variants, paths, timestamp
3 |
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=0/9c8faa65a8bf4e3398a0143ccbf91002.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=0/9c8faa65a8bf4e3398a0143ccbf91002.parquet
--------------------------------------------------------------------------------
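The `@@partitioning=<n>` sub-directories whose part files are listed here follow Spark's standard partitioned-Parquet layout, so the whole `tests/input_data/receipt` directory reads back as a single DataFrame. A minimal sketch in plain PySpark (the local SparkSession and relative fixture path are assumptions; the project's own reader lives in `pm4pyspark/importer/parquet/spark_df_imp.py`, whose API is not shown in this dump):

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]").appName("receipt-fixture").getOrCreate()

# Partition discovery turns the @@partitioning=<n> directory names into a column,
# so the DataFrame exposes "@@partitioning" alongside the event-log columns.
receipt_df = spark.read.parquet("tests/input_data/receipt")
receipt_df.printSchema()
print(receipt_df.rdd.getNumPartitions())
```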
/tests/input_data/receipt/@@partitioning=1/5fae4bc2bbbe4166b609bd1e4ee26e40.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=1/5fae4bc2bbbe4166b609bd1e4ee26e40.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=10/2af530ddd33a4239b3e25d1c0cc1388d.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=10/2af530ddd33a4239b3e25d1c0cc1388d.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=100/3cfd18de55814ca28210b1b1ee5f1a13.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=100/3cfd18de55814ca28210b1b1ee5f1a13.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=101/5f83fdd1c6804aaf90be72e3e079eb84.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=101/5f83fdd1c6804aaf90be72e3e079eb84.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=102/c5d4b5fa7e2248d4b0a45c3013ec41ac.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=102/c5d4b5fa7e2248d4b0a45c3013ec41ac.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=103/77f1a134807c410f9969687cd8070722.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=103/77f1a134807c410f9969687cd8070722.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=104/8eb240bdd2d34fc6b179ebaf568070ef.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=104/8eb240bdd2d34fc6b179ebaf568070ef.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=105/92867c1d04884232a2c5333f3a5e5259.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=105/92867c1d04884232a2c5333f3a5e5259.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=106/a3176abb78284217a893ca7a1eccbc7e.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=106/a3176abb78284217a893ca7a1eccbc7e.parquet 
-------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=107/9a5fab8daee14ccca9fa05a3c41efa13.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=107/9a5fab8daee14ccca9fa05a3c41efa13.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=108/c8cfffc751a54cb39037159b4e0059ec.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=108/c8cfffc751a54cb39037159b4e0059ec.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=109/80ba8c72141a4d5dbbe4272446379ffb.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=109/80ba8c72141a4d5dbbe4272446379ffb.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=11/3fc4b68a53dd4442a57af05c5fd05e7e.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=11/3fc4b68a53dd4442a57af05c5fd05e7e.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=110/ee1e7eea73ed408a8766600ed69b68ce.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=110/ee1e7eea73ed408a8766600ed69b68ce.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=111/e3e73d910e234359a1fde9167e612e52.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=111/e3e73d910e234359a1fde9167e612e52.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=112/a5f765fc7a7a40e083e99dd4e508b5ff.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=112/a5f765fc7a7a40e083e99dd4e508b5ff.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=113/27f98cfc4fae432ca5da83b8dc2e7012.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=113/27f98cfc4fae432ca5da83b8dc2e7012.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=114/5b8ea367199143e3a2813314e802902c.parquet: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=114/5b8ea367199143e3a2813314e802902c.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=115/ffcbfd9354094dd59e049abfef584f79.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=115/ffcbfd9354094dd59e049abfef584f79.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=116/86a3dda83b37495e8cc192b1f20c6fc7.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=116/86a3dda83b37495e8cc192b1f20c6fc7.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=117/3a638945265c49a4909b4eaf8147b5dd.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=117/3a638945265c49a4909b4eaf8147b5dd.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=118/ecc09fc5afa04b94b462d021c4c5eb7f.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=118/ecc09fc5afa04b94b462d021c4c5eb7f.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=119/2ae73a13137c4965a432f8caa89bf51c.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=119/2ae73a13137c4965a432f8caa89bf51c.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=12/d5320e0ac9a547d58c6dee3b5aaf03b8.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=12/d5320e0ac9a547d58c6dee3b5aaf03b8.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=120/16c4ac329c644b1388c0010f5578dd88.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=120/16c4ac329c644b1388c0010f5578dd88.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=121/0e6ae36fc31b4bef96ea22ff3ef3d333.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=121/0e6ae36fc31b4bef96ea22ff3ef3d333.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=122/ae940c9cd5a3401baf7f90ac2eff8f77.parquet: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=122/ae940c9cd5a3401baf7f90ac2eff8f77.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=123/c7c7d9179cc341b39ea57abe756e5c44.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=123/c7c7d9179cc341b39ea57abe756e5c44.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=124/85c5945eab66433e859898bea8b27870.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=124/85c5945eab66433e859898bea8b27870.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=125/7b45f583130e4787bcecf21d4e2f3916.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=125/7b45f583130e4787bcecf21d4e2f3916.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=126/141d2b08a1c14633a747a2b02be325b7.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=126/141d2b08a1c14633a747a2b02be325b7.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=127/70b3cb2e622641af8361a5820ed05270.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=127/70b3cb2e622641af8361a5820ed05270.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=13/56fa658c37e64286b108c9b4f33362c9.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=13/56fa658c37e64286b108c9b4f33362c9.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=14/a1ce0e3591ba441891d49702f68e4b03.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=14/a1ce0e3591ba441891d49702f68e4b03.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=15/a095226766ef49398fa0351dc36d7bfb.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=15/a095226766ef49398fa0351dc36d7bfb.parquet -------------------------------------------------------------------------------- 
/tests/input_data/receipt/@@partitioning=16/d021cca539ff4e9bbe4bdb20fc15be6b.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=16/d021cca539ff4e9bbe4bdb20fc15be6b.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=17/ce3b54422119424eb9ebdd2eebd19420.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=17/ce3b54422119424eb9ebdd2eebd19420.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=18/85f8b9ca6f1845cfb1d7d04c3b31a08e.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=18/85f8b9ca6f1845cfb1d7d04c3b31a08e.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=19/442c9f75853d41e1986407827db2b5ec.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=19/442c9f75853d41e1986407827db2b5ec.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=2/9d33d9e15fa54b398d3d7ba90ba561ef.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=2/9d33d9e15fa54b398d3d7ba90ba561ef.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=20/a54b66b9a8d348edaf522e08e981dd8d.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=20/a54b66b9a8d348edaf522e08e981dd8d.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=21/7962182880ed437d9b2636bb9f0c36e1.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=21/7962182880ed437d9b2636bb9f0c36e1.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=22/956efb04488c42b59fb1e47b9c58981c.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=22/956efb04488c42b59fb1e47b9c58981c.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=23/16cb1b8b464d46baafa51c36d558ffa4.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=23/16cb1b8b464d46baafa51c36d558ffa4.parquet 
-------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=24/d6da9e22500842499eaca917854201cd.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=24/d6da9e22500842499eaca917854201cd.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=25/4570d89881ed42c08a41f7359c988892.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=25/4570d89881ed42c08a41f7359c988892.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=26/da7095c4ba924cfd870eca452c2210a7.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=26/da7095c4ba924cfd870eca452c2210a7.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=27/f10fd728cf4a4dd29ca578e6f5b7d163.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=27/f10fd728cf4a4dd29ca578e6f5b7d163.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=28/d70798504e6e49738414f91c2c2a0b07.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=28/d70798504e6e49738414f91c2c2a0b07.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=29/d8234dd8cc4b4460989ce7cd171a393b.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=29/d8234dd8cc4b4460989ce7cd171a393b.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=3/367b71119b144bb889b2c8b21147ea63.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=3/367b71119b144bb889b2c8b21147ea63.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=30/1341a73cc9e640559d9a3232d573bb97.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=30/1341a73cc9e640559d9a3232d573bb97.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=31/bedebc130a3c466fba78906bf3937b9f.parquet: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=31/bedebc130a3c466fba78906bf3937b9f.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=32/7998f9ce1b7d42589721eb1276107c26.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=32/7998f9ce1b7d42589721eb1276107c26.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=33/61aeb8def350485c9df448d656f023c8.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=33/61aeb8def350485c9df448d656f023c8.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=34/4cb8882347514d77b9fe1026f54ca57c.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=34/4cb8882347514d77b9fe1026f54ca57c.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=35/359477e5e4c54c71a7b21d09a9380ecc.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=35/359477e5e4c54c71a7b21d09a9380ecc.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=36/98a5add33458417fbc332815298aca12.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=36/98a5add33458417fbc332815298aca12.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=37/4cf32527217d4a38b2b6122d5f251526.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=37/4cf32527217d4a38b2b6122d5f251526.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=38/2d329b5523d0457797ee2bd0b557adf1.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=38/2d329b5523d0457797ee2bd0b557adf1.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=39/e764ef673bdf4060bc5f617e60bd654c.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=39/e764ef673bdf4060bc5f617e60bd654c.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=4/edc1e19d80444f2589370b07c056530e.parquet: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=4/edc1e19d80444f2589370b07c056530e.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=40/e3900574306440b89b69f32404ecb276.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=40/e3900574306440b89b69f32404ecb276.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=41/04ec822e7c8e4954b3d205870ff71cc1.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=41/04ec822e7c8e4954b3d205870ff71cc1.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=42/437565a352bc4a60b7c805a35c22d5d0.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=42/437565a352bc4a60b7c805a35c22d5d0.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=43/4ff5fc069830432998ffbdf2ea686bbe.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=43/4ff5fc069830432998ffbdf2ea686bbe.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=44/612dd93522824aa08d85b1a2bc95f640.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=44/612dd93522824aa08d85b1a2bc95f640.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=45/e15ebcf7cd704c76a76cea624964bdcc.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=45/e15ebcf7cd704c76a76cea624964bdcc.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=46/35bbd6866c464676a0778738d51a52dd.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=46/35bbd6866c464676a0778738d51a52dd.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=47/f1a82c702e954b358488128476df877e.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=47/f1a82c702e954b358488128476df877e.parquet -------------------------------------------------------------------------------- 
/tests/input_data/receipt/@@partitioning=48/0d23fd2c7ec74cefba3d85438920fd01.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=48/0d23fd2c7ec74cefba3d85438920fd01.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=49/36649b37fe534496ba6fb7d695f07685.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=49/36649b37fe534496ba6fb7d695f07685.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=5/0cf85db0014c426faf4ab95c699faf3c.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=5/0cf85db0014c426faf4ab95c699faf3c.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=50/a1eadd40a1bc4777be42317886824b6a.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=50/a1eadd40a1bc4777be42317886824b6a.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=51/432e9c7e1fe3413699665cc0d190b8da.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=51/432e9c7e1fe3413699665cc0d190b8da.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=52/25faadf083c944738c1c6b515cb484c4.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=52/25faadf083c944738c1c6b515cb484c4.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=53/7eecd66696b3491bb88f55945e4eda79.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=53/7eecd66696b3491bb88f55945e4eda79.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=54/c0fbc5af9d8e48eca911bdae3635f628.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=54/c0fbc5af9d8e48eca911bdae3635f628.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=55/03e70784390148219fa843c230f6145d.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=55/03e70784390148219fa843c230f6145d.parquet 
-------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=56/446948113fc1472d8ebe12e9109146a6.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=56/446948113fc1472d8ebe12e9109146a6.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=57/93095d5dcd574806a3cbcad084213536.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=57/93095d5dcd574806a3cbcad084213536.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=58/a988de5f748a430087f63b0c42f0d0bb.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=58/a988de5f748a430087f63b0c42f0d0bb.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=59/79e4ff740ce6425287e4bb319de86d10.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=59/79e4ff740ce6425287e4bb319de86d10.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=6/74d11133dfe64b47b26d60a4f8fc3fcb.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=6/74d11133dfe64b47b26d60a4f8fc3fcb.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=60/64f2c5cd7f2c41ff8c41c6983770f597.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=60/64f2c5cd7f2c41ff8c41c6983770f597.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=61/176b2d6dd5a94507a9b8a97fb34b6448.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=61/176b2d6dd5a94507a9b8a97fb34b6448.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=62/dde9034be47d49cd9c2c9cf1d445f7ec.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=62/dde9034be47d49cd9c2c9cf1d445f7ec.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=63/5021e7fb3c104231a35faf20f51876cb.parquet: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=63/5021e7fb3c104231a35faf20f51876cb.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=64/1bdfebb4143c441c9b0009ee5acc1c03.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=64/1bdfebb4143c441c9b0009ee5acc1c03.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=65/55625bbdd0f54667a6eb7e2fd6269847.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=65/55625bbdd0f54667a6eb7e2fd6269847.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=66/0e076c36a7004c9fbdae493a3fa003bf.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=66/0e076c36a7004c9fbdae493a3fa003bf.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=67/e402d5bd805a4ed5bfc266b36fb695ef.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=67/e402d5bd805a4ed5bfc266b36fb695ef.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=68/8f4608a274924f5eb4b388c5cc83843c.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=68/8f4608a274924f5eb4b388c5cc83843c.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=69/ef10536765264b4a91b84cd59ebb94f4.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=69/ef10536765264b4a91b84cd59ebb94f4.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=7/ddd5d4826f5649098345c92168eaacba.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=7/ddd5d4826f5649098345c92168eaacba.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=70/aaa24d7d4bb34ad2a91e24529ea0b539.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=70/aaa24d7d4bb34ad2a91e24529ea0b539.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=71/2ebbfd301f334541b6c70529f862e7c5.parquet: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=71/2ebbfd301f334541b6c70529f862e7c5.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=72/02385bb7a79e4b61a6583adf9d1092fb.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=72/02385bb7a79e4b61a6583adf9d1092fb.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=73/60c5e48107494b5493ff6b258a3c01b0.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=73/60c5e48107494b5493ff6b258a3c01b0.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=74/e8e8e52bdbd448aea7ee599403f47694.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=74/e8e8e52bdbd448aea7ee599403f47694.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=75/14fe0948605c447b9cde78a75e5ac4f5.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=75/14fe0948605c447b9cde78a75e5ac4f5.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=76/c55ef0ee492a46cab562a9e4e9d7e0eb.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=76/c55ef0ee492a46cab562a9e4e9d7e0eb.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=77/47cbfa7312ca43b494b434510de137b9.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=77/47cbfa7312ca43b494b434510de137b9.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=78/438c11bff4dd4ccd9fe30162973847f7.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=78/438c11bff4dd4ccd9fe30162973847f7.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=79/4da8267db40f47748911336a91f3ffb3.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=79/4da8267db40f47748911336a91f3ffb3.parquet -------------------------------------------------------------------------------- 
/tests/input_data/receipt/@@partitioning=8/df050f729b464424b2aaa417bc94284d.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=8/df050f729b464424b2aaa417bc94284d.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=80/95fb4bd665c642de848c941eee38c499.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=80/95fb4bd665c642de848c941eee38c499.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=81/d2f54e1aeec9411f85a063ed49456c80.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=81/d2f54e1aeec9411f85a063ed49456c80.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=82/f876e1a583a94201b7fdb6e6a99ee936.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=82/f876e1a583a94201b7fdb6e6a99ee936.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=83/2e48ca0da28c4cad9840184e4dbc69a0.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=83/2e48ca0da28c4cad9840184e4dbc69a0.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=84/e5d459b80f6f4bc8a6d27cabfb18e905.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=84/e5d459b80f6f4bc8a6d27cabfb18e905.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=85/d39c9243df794592a35f5defc5e9cab2.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=85/d39c9243df794592a35f5defc5e9cab2.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=86/082bbff63aa14064b95e5ab8a55360ee.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=86/082bbff63aa14064b95e5ab8a55360ee.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=87/952670c1d29844d7adb1c3c829e05d7b.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=87/952670c1d29844d7adb1c3c829e05d7b.parquet 
-------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=88/99dee8f7dccb43e281ad846122fddd44.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=88/99dee8f7dccb43e281ad846122fddd44.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=89/7fbf92b524db460ab887984a6d550245.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=89/7fbf92b524db460ab887984a6d550245.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=9/f1d8c3c34acc4534bdf98f7404470e0b.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=9/f1d8c3c34acc4534bdf98f7404470e0b.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=90/44d3684fb5d34557b6918f643069623c.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=90/44d3684fb5d34557b6918f643069623c.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=91/a3e8d08f9fc3415e8125d3cf85cc8996.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=91/a3e8d08f9fc3415e8125d3cf85cc8996.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=92/8295e31735b14dcba827e0153d0b1ad0.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=92/8295e31735b14dcba827e0153d0b1ad0.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=93/97cae9e1281b4179ad123142e4018e9a.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=93/97cae9e1281b4179ad123142e4018e9a.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=94/30e2a0ee8d484502aca9921d6fa403f8.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=94/30e2a0ee8d484502aca9921d6fa403f8.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=95/640aa952cbbf40ecbc36edb57796d2cd.parquet: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=95/640aa952cbbf40ecbc36edb57796d2cd.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=96/8211320dc69d4e3cbd0356fdea9539f8.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=96/8211320dc69d4e3cbd0356fdea9539f8.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=97/e4032ee22725400d879d7608c9a739a3.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=97/e4032ee22725400d879d7608c9a739a3.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=98/a88b3e98a5a14ae4a67a94715f55714b.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=98/a88b3e98a5a14ae4a67a94715f55714b.parquet -------------------------------------------------------------------------------- /tests/input_data/receipt/@@partitioning=99/45e2d237bad1461ebf527c82f117696a.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=99/45e2d237bad1461ebf527c82f117696a.parquet -------------------------------------------------------------------------------- /tests/filtering_paths_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | from tests.constants import INPUT_DATA_DIR, OUTPUT_DATA_DIR 3 | from pm4pyspark.importer.csv import spark_df_imp as importer 4 | 5 | from pm4pyspark.algo.filtering.paths import paths_filter 6 | 7 | 8 | df_ex = importer.import_sparkdf_from_path(os.path.join(INPUT_DATA_DIR, "running-example.csv"), header=True, inferSchema=True) 9 | filtered_df = paths_filter.apply(df_ex, [('check ticket', 'decide')]) 10 | filtered_df.show(filtered_df.count(), truncate=False) 11 | -------------------------------------------------------------------------------- /pm4pyspark/exporter/parquet/spark_df_exp.py: -------------------------------------------------------------------------------- 1 | import pyspark.sql.functions as F 2 | 3 | from pyspark.sql import SparkSession 4 | from pyspark.sql.types import * 5 | 6 | 7 | 8 | 9 | def export_sparkdf(df, path, case_id_key="case:concept:name", mode=None, partitionBy="@@partitioning", compression=None, num_partitions=128): 10 | 11 | get_hash = F.udf(lambda x: abs(hash(x)) % num_partitions, LongType()) 12 | 13 | df = df.withColumn(partitionBy, get_hash(case_id_key)) 14 | for c in df.columns: 15 | df = df.withColumnRenamed(c, c.replace(':', 'AAA')) 16 | 17 | df.write.parquet(path, mode=mode, partitionBy=partitionBy, compression=compression) 18 | -------------------------------------------------------------------------------- /tests/filtering_cases_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | from tests.constants import INPUT_DATA_DIR, OUTPUT_DATA_DIR 3 | from 
pm4pyspark.importer.csv import spark_df_imp as importer 4 | 5 | from pm4pyspark.algo.filtering.cases import cases_filter 6 | 7 | 8 | spark_df = importer.import_sparkdf_from_path(os.path.join(INPUT_DATA_DIR, "running-example.csv"), header=True, inferSchema=True) 9 | spark_df.cache() 10 | 11 | cases_filter.filter_on_ncases(spark_df, max_no_cases=3).show() 12 | 13 | case_size_df = cases_filter.filter_on_case_size(spark_df, min_case_size=9, max_case_size=9) 14 | 15 | perf_df = cases_filter.filter_on_case_performance(spark_df, max_case_performance=800000) 16 | 17 | spark_df.unpersist() 18 | -------------------------------------------------------------------------------- /tests/filtering_start_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | from tests.constants import INPUT_DATA_DIR 3 | from pm4pyspark.importer.csv import spark_df_imp as importer 4 | 5 | from pm4pyspark.algo.filtering.start_activities import start_activities_filter 6 | 7 | 8 | spark_df = importer.import_sparkdf_from_path(os.path.join(INPUT_DATA_DIR, "running-example.csv"), header=True, inferSchema=True) 9 | spark_df.cache() 10 | 11 | print(start_activities_filter.get_start_activities(spark_df)) 12 | 13 | filtered_df = start_activities_filter.filter_df_on_start_activities(spark_df, ["check ticket"], grouped_df=spark_df.groupby("org:resource")) 14 | 15 | filtered_df_nocc = start_activities_filter.filter_df_on_start_activities_nocc(spark_df, 6) 16 | 17 | applied_auto_df = start_activities_filter.apply_auto_filter(spark_df) 18 | 19 | spark_df.unpersist() 20 | -------------------------------------------------------------------------------- /tests/filtering_ts_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import pandas as pd 4 | from tests.constants import INPUT_DATA_DIR 5 | from pm4pyspark.importer.csv import spark_df_imp as importer 6 | from datetime import datetime 7 | 8 | from pm4pyspark.algo.filtering.timestamp import timestamp_filter 9 | 10 | spark_df = importer.import_sparkdf_from_path(os.path.join(INPUT_DATA_DIR, "receipt.csv"), header=True) 11 | spark_df.cache() 12 | 13 | df_timest_contained = timestamp_filter.filter_traces_contained(spark_df, "2011-03-09 00:00:00", "2012-01-18 23:59:59") 14 | print(df_timest_contained.count()) 15 | print(df_timest_contained.groupby("case:concept:name").count().count()) 16 | 17 | df_timest_intersecting = timestamp_filter.filter_traces_intersecting(spark_df, "2011-03-09 00:00:00", "2012-01-18 23:59:59") 18 | df_timest_intersecting.show() 19 | 20 | spark_df.unpersist() 21 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Big Data Process Mining in Python
Integration of Spark in PM4Py for Preprocessing Event Data and Discovering Process Models 2 | 3 | 4 | [PM4Py](https://github.com/pm4py) is the Process Mining library in Python; it aims at seamless integration with any kind of database and technology. 5 | 6 | **PM4PySpark** is the integration of [*Apache Spark*](https://spark.apache.org) in PM4Py. This set of Big Data connectors for PM4Py embraces the big data world and handles huge amounts of event data, with a particular focus on the Spark ecosystem: 7 | 8 | - Loading CSV files into Apache Spark 9 | - Loading and writing Parquet files into Apache Spark 10 | - Calculating the Directly Follows Graph (DFG) efficiently on top of Apache Spark DataFrames 11 | - Managing filtering operations (timeframe, attributes, start/end activities, paths, variants, cases) on top of Apache Spark 12 | -------------------------------------------------------------------------------- /tests/parquet_export_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | 4 | from tests.constants import INPUT_DATA_DIR, OUTPUT_DATA_DIR 5 | from pm4pyspark.importer.parquet import spark_df_imp as parquet_importer 6 | from pm4pyspark.exporter.parquet import spark_df_exp as parquet_exporter 7 | 8 | 9 | 10 | 11 | dir_path = os.path.join(INPUT_DATA_DIR, "receipt") 12 | spark_df_dir = parquet_importer.import_sparkdf_from_path(dir_path) 13 | print(spark_df_dir.count()) 14 | 15 | out_path = os.path.join(OUTPUT_DATA_DIR, "receipt128") 16 | parquet_exporter.export_sparkdf(spark_df_dir, out_path, mode="overwrite") 17 | 18 | out_path2 = os.path.join(OUTPUT_DATA_DIR, "receipt64") 19 | parquet_exporter.export_sparkdf(spark_df_dir, out_path2, num_partitions=64, mode="overwrite") 20 | 21 | 22 | test_df_128 = parquet_importer.import_sparkdf_from_path(out_path) 23 | test_df_64 = parquet_importer.import_sparkdf_from_path(out_path2) 24 | print(test_df_128.count()) 25 | print(test_df_64.count()) 26 | -------------------------------------------------------------------------------- /tests/dfg_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | from tests.constants import INPUT_DATA_DIR, OUTPUT_DATA_DIR 3 | from pm4py.visualization.dfg import factory as dfg_vis_factory 4 | from pm4pyspark.importer.csv import spark_df_imp as importer 5 | from pm4pyspark.algo.discovery.dfg import factory as dfg_factory 6 | 7 | 8 | parameters = {"format":"svg"} 9 | 10 | 11 | event_stream_ex = importer.import_event_stream(os.path.join(INPUT_DATA_DIR, "running-example.csv"), header=True) 12 | log_ex = importer.transform_event_stream_to_event_log(event_stream_ex) 13 | df_ex = importer.import_sparkdf_from_path(os.path.join(INPUT_DATA_DIR, "running-example.csv"), header=True, inferSchema=True) 14 | 15 | dfg_freq = dfg_factory.apply(df_ex) 16 | gviz_freq = dfg_vis_factory.apply(dfg_freq, log=log_ex, parameters=parameters, variant="frequency") 17 | dfg_vis_factory.save(gviz_freq, os.path.join(OUTPUT_DATA_DIR, "running-example_freq.svg")) 18 | 19 | dfg_perf = dfg_factory.apply(df_ex, variant="performance") 20 | gviz_perf = dfg_vis_factory.apply(dfg_perf, log=log_ex, parameters=parameters, variant="performance") 21 | dfg_vis_factory.save(gviz_perf, os.path.join(OUTPUT_DATA_DIR, "running-example_perf.svg")) 22 | --------------------------------------------------------------------------------
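A condensed usage sketch of what the test scripts above exercise: importing the example CSV log into a Spark DataFrame, filtering it on an end activity, and discovering the frequency Directly Follows Graph (DFG). The module and function names are the ones defined in this repository; the literal input path and the chosen end activity "pay compensation" are illustrative (they assume the repository's tests/input_data layout and the running-example log) and a local Spark installation is assumed:

import os

from pm4pyspark.importer.csv import spark_df_imp as csv_importer
from pm4pyspark.algo.filtering.end_activities import end_activities_filter
from pm4pyspark.algo.discovery.dfg import factory as dfg_factory

# Import the CSV event log as a Spark DataFrame; the header row is used,
# the schema is inferred, and "time:timestamp" is converted to UTC on import.
df = csv_importer.import_sparkdf_from_path(
    os.path.join("tests", "input_data", "running-example.csv"),
    header=True, inferSchema=True)

# Keep only the traces ending with "pay compensation", then compute the
# frequency DFG as a dictionary mapping (activity, next activity) to a count.
df_paid = end_activities_filter.apply(df, ["pay compensation"])
dfg_frequency = dfg_factory.apply(df_paid)
print(dfg_frequency)

/tests/filtering_end_test.py: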
-------------------------------------------------------------------------------- 1 | import os 2 | from tests.constants import INPUT_DATA_DIR 3 | from pm4pyspark.importer.csv import spark_df_imp as importer 4 | 5 | from pm4pyspark.algo.filtering.end_activities import end_activities_filter 6 | 7 | 8 | spark_df = importer.import_sparkdf_from_path(os.path.join(INPUT_DATA_DIR, "running-example.csv"), header=True, inferSchema=True) 9 | spark_df.cache() 10 | 11 | end_ac = end_activities_filter.get_end_activities(spark_df) 12 | filtered_df = end_activities_filter.filter_df_on_end_activities(spark_df, {'T07-5 Draft intern advice aspect 5'}) 13 | filtered_df_apply = end_activities_filter.apply(spark_df, ["T05 Print and send confirmation of receipt", "T10 Determine necessity to stop indication"]) 14 | print(filtered_df_apply.count()) 15 | print(filtered_df_apply.groupby("case:concept:name").count().count()) 16 | 17 | filtered_df_nocc, rdict = end_activities_filter.filter_df_on_end_activities_nocc(spark_df, 400, return_dict=True) 18 | filtered_df_nocc.show(filtered_df_nocc.count()) 19 | print(filtered_df_nocc.count()) 20 | print(rdict) 21 | 22 | filtered_auto_filter = end_activities_filter.apply_auto_filter(spark_df) 23 | print(filtered_auto_filter.count()) 24 | spark_df.unpersist() 25 | -------------------------------------------------------------------------------- /tests/csv_import_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | 4 | from tests.constants import INPUT_DATA_DIR 5 | from pm4pyspark.importer.csv import spark_df_imp as csv_importer 6 | 7 | file_path = os.path.join(INPUT_DATA_DIR, "running-example.csv") 8 | file_path2 = os.path.join(INPUT_DATA_DIR, "receipt.csv") 9 | 10 | 11 | spark_df_wo_timeconversion = csv_importer.import_sparkdf_from_path_wo_timeconversion(file_path, header=True) 12 | spark_df = csv_importer.import_sparkdf_from_path(file_path, header=True, inferSchema=True) 13 | spark_df_sorted = csv_importer.import_sparkdf_from_path(file_path, header=True, sort=True) 14 | 15 | spark_df_wo_timeconversion1 = csv_importer.import_sparkdf_from_path_wo_timeconversion(file_path2, header=True) 16 | spark_df1 = csv_importer.import_sparkdf_from_path(file_path2, header=True) 17 | spark_df_sorted1 = csv_importer.import_sparkdf_from_path(file_path2, header=True, sort=True) 18 | 19 | spark_df_wo_timeconversion.show(truncate=False) 20 | spark_df.show() 21 | spark_df_sorted.show() 22 | 23 | spark_df_wo_timeconversion1.show(truncate=False) 24 | spark_df1.show() 25 | spark_df_sorted1.show() 26 | 27 | event_stream = csv_importer.import_event_stream(file_path, header=True) 28 | log = csv_importer.transform_event_stream_to_event_log(event_stream) 29 | print(log) 30 | -------------------------------------------------------------------------------- /tests/parquet_import_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | 4 | from tests.constants import INPUT_DATA_DIR 5 | from pm4pyspark.importer.parquet import spark_df_imp as parquet_importer 6 | 7 | 8 | 9 | 10 | file_path = os.path.join(INPUT_DATA_DIR, "running-example.parquet") 11 | file_path2 = os.path.join(INPUT_DATA_DIR, "receipt.parquet") 12 | dir_path = os.path.join(INPUT_DATA_DIR, "receipt") 13 | 14 | 15 | spark_df = parquet_importer.import_sparkdf_from_path(file_path) 16 | spark_df_sorted = parquet_importer.import_sparkdf_from_path(file_path, sort=True) 17 | 18 | spark_df1 = 
parquet_importer.import_sparkdf_from_path(file_path2) 19 | spark_df_sorted1 = parquet_importer.import_sparkdf_from_path(file_path2, sort=True) 20 | 21 | spark_df_dir = parquet_importer.import_sparkdf_from_path(dir_path) 22 | spark_df_dir_sorted = parquet_importer.import_sparkdf_from_path(dir_path, sort=True) 23 | 24 | spark_df.show() 25 | spark_df_sorted.show() 26 | 27 | spark_df1.show() 28 | spark_df_sorted1.show() 29 | 30 | spark_df_dir.show() 31 | spark_df_dir_sorted.show() 32 | 33 | print(spark_df_dir.count()) 34 | print(spark_df_dir_sorted.count()) 35 | 36 | event_stream = parquet_importer.import_event_stream(file_path) 37 | log = parquet_importer.transform_event_stream_to_event_log(event_stream) 38 | print(log) 39 | -------------------------------------------------------------------------------- /tests/filtering_attr_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | from tests.constants import INPUT_DATA_DIR 4 | from pm4pyspark.importer.csv import spark_df_imp as importer 5 | 6 | from pm4pyspark.algo.filtering.attributes import attributes_filter 7 | 8 | spark_df = importer.import_sparkdf_from_path(os.path.join(INPUT_DATA_DIR, "running-example.csv"), header=True, inferSchema=True) 9 | spark_df.cache() 10 | 11 | activities = attributes_filter.get_attribute_values(spark_df, attribute_key="concept:name") 12 | resources = attributes_filter.get_attribute_values(spark_df, attribute_key="org:resource") 13 | 14 | filtered_df = attributes_filter.filter_df_on_attribute_values(spark_df, {'examine casually'}, positive=False) 15 | filtered_df.show(filtered_df.count()) 16 | 17 | filtered_num_df = attributes_filter.apply_numeric_events(spark_df, 50, 100, parameters={"pm4py:param:attribute_key":"Costs"}) 18 | 19 | filtered_num_tr_df2 = attributes_filter.apply_numeric(spark_df, 0, 7000, parameters={"pm4py:param:attribute_key":"_c0"}) 20 | 21 | filtered_event_df = attributes_filter.apply_events(spark_df, values={"examine casually"}) 22 | 23 | filtered_thresh_df = attributes_filter.filter_df_keeping_activ_exc_thresh(spark_df, 7, most_common_variant={"reject request"}) 24 | 25 | filtered_top_5_act_df = attributes_filter.filter_df_keeping_spno_activities(spark_df, max_no_activities=5) 26 | 27 | print(attributes_filter.get_kde_date_attribute(spark_df)) 28 | 29 | spark_df.unpersist() 30 | -------------------------------------------------------------------------------- /tests/filtering_var_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | from tests.constants import INPUT_DATA_DIR 4 | from pm4pyspark.importer.csv import spark_df_imp as importer 5 | 6 | from pm4pyspark.algo.filtering.variants import variants_filter 7 | 8 | spark_df = importer.import_sparkdf_from_path(os.path.join(INPUT_DATA_DIR, "receipt.csv"), header=True) 9 | spark_df.cache() 10 | 11 | variants_df = variants_filter.get_variants_df(spark_df) 12 | variants_df.show(variants_df.count()) 13 | 14 | print(variants_filter.get_variant_statistics(spark_df, parameters={'max_variants_to_return': 3})) 15 | ddf, dlist = variants_filter.get_variants_df_and_list(spark_df) 16 | print(dlist) 17 | 18 | start_time = time.time() 19 | variants_df2 = variants_filter.get_variants_df_with_case_duration(spark_df) 20 | variants_df2.show() 21 | 22 | event_with_caseid1 = variants_filter.get_events(spark_df, 1) 23 | print(event_with_caseid1) 24 | stat_with_duration = 
variants_filter.get_variant_statistics_with_case_duration(spark_df) 25 | print(stat_with_duration) 26 | 27 | case_description = variants_filter.get_cases_description(spark_df) 28 | print(case_description) 29 | 30 | applied_df = variants_filter.apply(spark_df, ["Confirmation of receipt,T02 Check confirmation of receipt,T04 Determine confirmation of receipt,T05 Print and send confirmation of receipt,T06 Determine necessity of stop advice,T10 Determine necessity to stop indication"]) 31 | variants_count_applied_df = variants_filter.get_variant_statistics(applied_df) 32 | print(variants_count_applied_df) 33 | 34 | auto_applied_df = variants_filter.apply_auto_filter(spark_df) 35 | print(auto_applied_df.count()) 36 | 37 | spark_df.unpersist() 38 | -------------------------------------------------------------------------------- /pm4pyspark/algo/filtering/paths/paths_filter.py: -------------------------------------------------------------------------------- 1 | import pyspark.sql.functions as F 2 | 3 | from pm4py.algo.filtering.common.filtering_constants import CASE_CONCEPT_NAME 4 | from pm4py.objects.log.util.xes import DEFAULT_NAME_KEY, DEFAULT_TIMESTAMP_KEY 5 | from pm4py.util.constants import PARAMETER_CONSTANT_TIMESTAMP_KEY 6 | from pm4py.util.constants import PARAMETER_CONSTANT_ATTRIBUTE_KEY 7 | from pm4py.util.constants import PARAMETER_CONSTANT_CASEID_KEY 8 | from pyspark.sql.window import Window 9 | 10 | 11 | 12 | 13 | def apply(df, paths, parameters=None): 14 | """Applies a filter on traces containing / not containing a path 15 | """ 16 | 17 | if parameters is None: 18 | parameters = {} 19 | paths = [path[0] + "," + path[1] for path in paths] 20 | 21 | timestamp_key = parameters[ 22 | PARAMETER_CONSTANT_TIMESTAMP_KEY] if PARAMETER_CONSTANT_TIMESTAMP_KEY in parameters else DEFAULT_TIMESTAMP_KEY 23 | case_id_glue = parameters[ 24 | PARAMETER_CONSTANT_CASEID_KEY] if PARAMETER_CONSTANT_CASEID_KEY in parameters else CASE_CONCEPT_NAME 25 | attribute_key = parameters[ 26 | PARAMETER_CONSTANT_ATTRIBUTE_KEY] if PARAMETER_CONSTANT_ATTRIBUTE_KEY in parameters else DEFAULT_NAME_KEY 27 | positive = parameters["positive"] if "positive" in parameters else True 28 | 29 | df_reduced = df.select(case_id_glue, attribute_key) 30 | 31 | w = Window().partitionBy(df_reduced[case_id_glue]).orderBy(df_reduced[case_id_glue]) 32 | df_reduced_shift = df_reduced.withColumn(case_id_glue + "_1", F.lag(case_id_glue, -1, 'NaN').over(w)) 33 | df_reduced_shift = df_reduced_shift.withColumn(attribute_key + "_1", F.lag(attribute_key, -1, 'NaN').over(w)) 34 | stacked_df = df_reduced_shift.withColumn("@@path", F.concat(df_reduced_shift[attribute_key], F.lit(","), df_reduced_shift[attribute_key + "_1"])) 35 | stacked_df = stacked_df.filter(stacked_df["@@path"].isin(paths)).select(case_id_glue) 36 | 37 | if positive: 38 | return df.join(F.broadcast(stacked_df), case_id_glue) 39 | else: 40 | return df.join(F.broadcast(stacked_df), case_id_glue, 'leftanti') 41 | 42 | 43 | def apply_auto_filter(df, parameters=None): 44 | del df 45 | del parameters 46 | raise Exception("apply_auto_filter method not available for paths filter on dataframe") 47 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | .idea/ 4 | *.py[cod] 5 | *$py.class 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | 
dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | pip-wheel-metadata/ 25 | share/python-wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .nox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # celery beat schedule file 95 | celerybeat-schedule 96 | 97 | # SageMath parsed files 98 | *.sage.py 99 | 100 | # Environments 101 | .env 102 | .venv 103 | env/ 104 | venv/ 105 | ENV/ 106 | env.bak/ 107 | venv.bak/ 108 | 109 | # Spyder project settings 110 | .spyderproject 111 | .spyproject 112 | 113 | # Rope project settings 114 | .ropeproject 115 | 116 | # mkdocs documentation 117 | /site 118 | 119 | # mypy 120 | .mypy_cache/ 121 | .dmypy.json 122 | dmypy.json 123 | 124 | # Pyre type checker 125 | .pyre/ 126 | -------------------------------------------------------------------------------- /pm4pyspark/algo/filtering/cases/cases_filter.py: -------------------------------------------------------------------------------- 1 | import pyspark.sql.functions as F 2 | 3 | 4 | 5 | 6 | def filter_on_ncases(df, case_id_glue="case:concept:name", max_no_cases=1000): 7 | """Filters the Spark dataframe keeping only the specified maximum number of traces 8 | """ 9 | 10 | # With conversion to RDD. 11 | #cases_to_keep = df.select(case_id_glue).distinct().rdd.map(lambda row : row[0]).collect() 12 | #cases_to_keep = cases_to_keep[0:min(len(cases_to_keep), max_no_cases)] 13 | #return df.filter(df[case_id_glue].isin(cases_to_keep)) 14 | 15 | #Without conversion to RDD (better). 
16 | grouped_df = df.groupBy(case_id_glue).count().limit(max_no_cases).drop("count") 17 | 18 | return df.join(F.broadcast(grouped_df), case_id_glue) 19 | 20 | 21 | def filter_on_case_size(df, case_id_glue="case:concept:name", min_case_size=2, max_case_size=None): 22 | """Filters the Spark dataframe keeping only traces with at least the specified number of events 23 | """ 24 | 25 | size_df = df.groupBy(case_id_glue).count() 26 | if max_case_size: 27 | size_df = size_df.filter((size_df["count"] >= min_case_size) & (size_df["count"] <= max_case_size)) 28 | else: 29 | size_df = size_df.filter(size_df["count"] >= min_case_size) 30 | return df.join(F.broadcast(size_df), case_id_glue).drop("count") 31 | 32 | 33 | def filter_on_case_performance(df, case_id_glue="case:concept:name", timestamp_key="time:timestamp", 34 | min_case_performance=0, max_case_performance=10000000000): 35 | """Filters the Spark dataframe on case performance 36 | """ 37 | 38 | grouped_df = df.groupby(case_id_glue) 39 | start_end_df = grouped_df.agg(F.min(timestamp_key).alias(timestamp_key), F.max(timestamp_key).alias(timestamp_key+"_1")) 40 | 41 | start_end_df = start_end_df.withColumn("caseDuration", F.unix_timestamp(start_end_df[timestamp_key+"_1"]) - F.unix_timestamp(start_end_df[timestamp_key])) 42 | start_end_df = start_end_df.filter((start_end_df["caseDuration"] > min_case_performance) & (start_end_df["caseDuration"] < max_case_performance))\ 43 | .select(case_id_glue) 44 | 45 | return df.join(F.broadcast(start_end_df), case_id_glue) 46 | 47 | 48 | def apply(df, parameters=None): 49 | del df 50 | del parameters 51 | raise Exception("apply method not available for case filter") 52 | 53 | 54 | def apply_auto_filter(df, parameters=None): 55 | del df 56 | del parameters 57 | raise Exception("apply_auto_filter method not available for case filter") 58 | -------------------------------------------------------------------------------- /pm4pyspark/algo/discovery/dfg/df_statistics.py: -------------------------------------------------------------------------------- 1 | import pyspark.sql.functions as F 2 | 3 | from pyspark.sql.window import Window 4 | 5 | 6 | 7 | 8 | def get_dfg_graph(df, measure="frequency", activity_key="concept:name", case_id_glue="case:concept:name", 9 | timestamp_key="time:timestamp", perf_aggregation_key="mean", sort_caseid_required=True, 10 | sort_timestamp_along_case_id=True, window=1): 11 | """Gets DFG graph from the Spark dataframe 12 | """ 13 | 14 | #if sort_caseid_required: 15 | # if sort_timestamp_along_case_id: 16 | # df = df.orderBy(case_id_glue, timestamp_key) 17 | # else: 18 | # df = df.orderBy(case_id_glue) 19 | 20 | if measure == "frequency": 21 | df_reduced = df.select(case_id_glue, activity_key) 22 | else: 23 | df_reduced = df.select(case_id_glue, activity_key, timestamp_key) 24 | 25 | w = Window.partitionBy(case_id_glue).orderBy(case_id_glue) 26 | df_reduced_shift = df_reduced.withColumn(case_id_glue + "_1", F.lag(case_id_glue, -window, 'NaN').over(w)) 27 | df_reduced_shift = df_reduced_shift.withColumn(activity_key + "_1", F.lag(activity_key, -window, 'NaN').over(w)) 28 | if measure != "frequency": 29 | df_reduced_shift = df_reduced_shift.withColumn(timestamp_key + "_1", F.lag(timestamp_key, -window, 'NaN').over(w)) 30 | df_successive_rows = df_reduced_shift.filter(df_reduced_shift[case_id_glue] == df_reduced_shift[case_id_glue + "_1"]) 31 | 32 | if measure == "performance" or measure == "both": 33 | df_successive_rows = df_successive_rows.withColumn("caseDuration", 34 | 
F.unix_timestamp(df_successive_rows[timestamp_key+"_1"]) 35 | - F.unix_timestamp(df_successive_rows[timestamp_key])) 36 | directly_follows_grouping = df_successive_rows.groupby(activity_key, activity_key + "_1") 37 | 38 | if measure == "frequency" or measure == "both": 39 | dfg_frequency = directly_follows_grouping.count().rdd.map(lambda row: ((row[0], row[1]), row[2])) 40 | 41 | if measure == "performance" or measure == "both": 42 | dfg_performance = directly_follows_grouping.agg({"caseDuration":perf_aggregation_key})\ 43 | .rdd.map(lambda row: ((row[0], row[1]), row[2])) 44 | if measure == "frequency": 45 | return dfg_frequency.collectAsMap() 46 | 47 | if measure == "performance": 48 | return dfg_performance.collectAsMap() 49 | 50 | if measure == "both": 51 | return [dfg_frequency.collectAsMap(), dfg_performance.collectAsMap()] 52 | -------------------------------------------------------------------------------- /pm4pyspark/importer/parquet/spark_df_imp.py: -------------------------------------------------------------------------------- 1 | from pm4py.objects.log import log as log_instance 2 | from pm4py.objects.conversion.log.versions import to_event_log 3 | from pm4pyspark.importer.constants import DEFAULT_NUM_PARTITION 4 | from pyspark.sql import SparkSession 5 | 6 | 7 | def apply(path, parameters=None): 8 | """Imports a Parquet file 9 | """ 10 | 11 | if parameters is None: 12 | parameters = {} 13 | 14 | numPartition = parameters["numPartition"] if "numPartition" in parameters else DEFAULT_NUM_PARTITION 15 | 16 | spark = (SparkSession. 17 | builder. 18 | master('local[*]'). 19 | config('spark.sql.shuffle.partitions', numPartition). 20 | getOrCreate()) 21 | 22 | spark_df = spark.read.parquet(path) 23 | for c in spark_df.columns: 24 | spark_df = spark_df.withColumnRenamed(c, c.replace('AAA', ':')) 25 | 26 | return spark_df 27 | 28 | 29 | def import_sparkdf_from_path(path, sort=False, sort_field="time:timestamp", ascending=True, numPartition=DEFAULT_NUM_PARTITION): 30 | """Imports a Spark DataFrame from the given path of PARQUET format file 31 | """ 32 | 33 | parameters = {} 34 | parameters["numPartition"] = numPartition 35 | 36 | spark_df = apply(path, parameters=parameters) 37 | 38 | if sort and sort_field: 39 | if ascending is True: 40 | spark_df = spark_df.orderBy(sort_field) 41 | else: 42 | spark_df = spark_df.orderBy(sort_field, ascending=False) 43 | 44 | return spark_df 45 | 46 | 47 | def import_event_stream(path, sort=True, sort_field="time:timestamp", ascending=True, numPartition=DEFAULT_NUM_PARTITION): 48 | """Imports an `EventStream` from the given path of PARQUET format file 49 | """ 50 | 51 | spark_df = import_sparkdf_from_path(path, sort=sort, sort_field=sort_field, ascending=ascending, numPartition=numPartition) 52 | rdd = spark_df.rdd.map(lambda row: row.asDict()) 53 | event_stream = rdd.collect() 54 | event_stream = log_instance.EventStream(event_stream, attributes={'origin': 'parquet'}) 55 | return event_stream 56 | 57 | 58 | def transform_event_stream_to_event_log(event_stream, case_glue="case:concept:name", include_case_attributes=True, enable_deepcopy=False): 59 | """Transforms an `EventStream` to an `EventLog` 60 | """ 61 | 62 | log = to_event_log.transform_event_stream_to_event_log(event_stream, 63 | case_glue=case_glue, 64 | include_case_attributes=include_case_attributes, 65 | enable_deepcopy=enable_deepcopy) 66 | 67 | return log 68 | -------------------------------------------------------------------------------- /pm4pyspark/algo/discovery/dfg/factory.py: 
-------------------------------------------------------------------------------- 1 | import pyspark 2 | 3 | from pm4py import util as pmutil 4 | from pm4py.algo.discovery.dfg.versions import native, performance 5 | from pm4py.objects.conversion.log import factory as log_conversion 6 | from pm4py.objects.log.util import general as log_util 7 | from pm4py.objects.log.util import xes as xes_util 8 | 9 | from pm4pyspark.algo.discovery.dfg import df_statistics 10 | from pm4pyspark.importer.csv import spark_df_imp as importer 11 | 12 | DFG_NATIVE = 'native' 13 | DFG_FREQUENCY = 'frequency' 14 | DFG_PERFORMANCE = 'performance' 15 | DFG_FREQUENCY_GREEDY = 'frequency_greedy' 16 | DFG_PERFORMANCE_GREEDY = 'performance_greedy' 17 | 18 | VERSIONS = {DFG_NATIVE: native.apply, DFG_FREQUENCY: native.apply, DFG_PERFORMANCE: performance.apply, 19 | DFG_FREQUENCY_GREEDY: native.apply, DFG_PERFORMANCE_GREEDY: performance.apply} 20 | 21 | def apply(log, parameters=None, variant=DFG_NATIVE): 22 | """Calculates DFG graph (frequency or performance) starting from the Spark DataFrame 23 | """ 24 | 25 | if parameters is None: 26 | parameters = {} 27 | if pmutil.constants.PARAMETER_CONSTANT_ACTIVITY_KEY not in parameters: 28 | parameters[pmutil.constants.PARAMETER_CONSTANT_ACTIVITY_KEY] = xes_util.DEFAULT_NAME_KEY 29 | if pmutil.constants.PARAMETER_CONSTANT_TIMESTAMP_KEY not in parameters: 30 | parameters[pmutil.constants.PARAMETER_CONSTANT_TIMESTAMP_KEY] = xes_util.DEFAULT_TIMESTAMP_KEY 31 | if pmutil.constants.PARAMETER_CONSTANT_CASEID_KEY not in parameters: 32 | parameters[pmutil.constants.PARAMETER_CONSTANT_CASEID_KEY] = log_util.CASE_ATTRIBUTE_GLUE 33 | 34 | if isinstance(log, pyspark.sql.DataFrame): 35 | df = importer.convert_timestamp_to_utc_in_df(log, timest_columns=[ 36 | parameters[pmutil.constants.PARAMETER_CONSTANT_TIMESTAMP_KEY]]) 37 | dfg_frequency, dfg_performance = df_statistics.get_dfg_graph(df, measure="both", 38 | activity_key=parameters[ 39 | pmutil.constants.PARAMETER_CONSTANT_ACTIVITY_KEY], 40 | timestamp_key=parameters[ 41 | pmutil.constants.PARAMETER_CONSTANT_TIMESTAMP_KEY], 42 | case_id_glue=parameters[ 43 | pmutil.constants.PARAMETER_CONSTANT_CASEID_KEY]) 44 | if 'native' in variant or 'frequency' in variant: 45 | return dfg_frequency 46 | else: 47 | return dfg_performance 48 | 49 | return VERSIONS[variant](log_conversion.apply(log, parameters, log_conversion.TO_EVENT_LOG), parameters=parameters) 50 | -------------------------------------------------------------------------------- /tests/input_data/running-example.csv: -------------------------------------------------------------------------------- 1 | ,Activity,Costs,Resource,case:concept:name,case:creator,concept:name,org:resource,time:timestamp 2 | 0,register request,50,Pete,3,Fluxicon Nitro,register request,Pete,2010-12-30 14:32:00+01:00 3 | 1,examine casually,400,Mike,3,Fluxicon Nitro,examine casually,Mike,2010-12-30 15:06:00+01:00 4 | 2,check ticket,100,Ellen,3,Fluxicon Nitro,check ticket,Ellen,2010-12-30 16:34:00+01:00 5 | 3,decide,200,Sara,3,Fluxicon Nitro,decide,Sara,2011-01-06 09:18:00+01:00 6 | 4,reinitiate request,200,Sara,3,Fluxicon Nitro,reinitiate request,Sara,2011-01-06 12:18:00+01:00 7 | 5,examine thoroughly,400,Sean,3,Fluxicon Nitro,examine thoroughly,Sean,2011-01-06 13:06:00+01:00 8 | 6,check ticket,100,Pete,3,Fluxicon Nitro,check ticket,Pete,2011-01-08 11:43:00+01:00 9 | 7,decide,200,Sara,3,Fluxicon Nitro,decide,Sara,2011-01-09 09:55:00+01:00 10 | 8,pay compensation,200,Ellen,3,Fluxicon Nitro,pay 
compensation,Ellen,2011-01-15 10:45:00+01:00 11 | 9,register request,50,Mike,2,Fluxicon Nitro,register request,Mike,2010-12-30 11:32:00+01:00 12 | 10,check ticket,100,Mike,2,Fluxicon Nitro,check ticket,Mike,2010-12-30 12:12:00+01:00 13 | 11,examine casually,400,Sean,2,Fluxicon Nitro,examine casually,Sean,2010-12-30 14:16:00+01:00 14 | 12,decide,200,Sara,2,Fluxicon Nitro,decide,Sara,2011-01-05 11:22:00+01:00 15 | 13,pay compensation,200,Ellen,2,Fluxicon Nitro,pay compensation,Ellen,2011-01-08 12:05:00+01:00 16 | 14,register request,50,Pete,1,Fluxicon Nitro,register request,Pete,2010-12-30 11:02:00+01:00 17 | 15,examine thoroughly,400,Sue,1,Fluxicon Nitro,examine thoroughly,Sue,2010-12-31 10:06:00+01:00 18 | 16,check ticket,100,Mike,1,Fluxicon Nitro,check ticket,Mike,2011-01-05 15:12:00+01:00 19 | 17,decide,200,Sara,1,Fluxicon Nitro,decide,Sara,2011-01-06 11:18:00+01:00 20 | 18,reject request,200,Pete,1,Fluxicon Nitro,reject request,Pete,2011-01-07 14:24:00+01:00 21 | 19,register request,50,Mike,6,Fluxicon Nitro,register request,Mike,2011-01-06 15:02:00+01:00 22 | 20,examine casually,400,Ellen,6,Fluxicon Nitro,examine casually,Ellen,2011-01-06 16:06:00+01:00 23 | 21,check ticket,100,Mike,6,Fluxicon Nitro,check ticket,Mike,2011-01-07 16:22:00+01:00 24 | 22,decide,200,Sara,6,Fluxicon Nitro,decide,Sara,2011-01-07 16:52:00+01:00 25 | 23,pay compensation,200,Mike,6,Fluxicon Nitro,pay compensation,Mike,2011-01-16 11:47:00+01:00 26 | 24,register request,50,Ellen,5,Fluxicon Nitro,register request,Ellen,2011-01-06 09:02:00+01:00 27 | 25,examine casually,400,Mike,5,Fluxicon Nitro,examine casually,Mike,2011-01-07 10:16:00+01:00 28 | 26,check ticket,100,Pete,5,Fluxicon Nitro,check ticket,Pete,2011-01-08 11:22:00+01:00 29 | 27,decide,200,Sara,5,Fluxicon Nitro,decide,Sara,2011-01-10 13:28:00+01:00 30 | 28,reinitiate request,200,Sara,5,Fluxicon Nitro,reinitiate request,Sara,2011-01-11 16:18:00+01:00 31 | 29,check ticket,100,Ellen,5,Fluxicon Nitro,check ticket,Ellen,2011-01-14 14:33:00+01:00 32 | 30,examine casually,400,Mike,5,Fluxicon Nitro,examine casually,Mike,2011-01-16 15:50:00+01:00 33 | 31,decide,200,Sara,5,Fluxicon Nitro,decide,Sara,2011-01-19 11:18:00+01:00 34 | 32,reinitiate request,200,Sara,5,Fluxicon Nitro,reinitiate request,Sara,2011-01-20 12:48:00+01:00 35 | 33,examine casually,400,Sue,5,Fluxicon Nitro,examine casually,Sue,2011-01-21 09:06:00+01:00 36 | 34,check ticket,100,Pete,5,Fluxicon Nitro,check ticket,Pete,2011-01-21 11:34:00+01:00 37 | 35,decide,200,Sara,5,Fluxicon Nitro,decide,Sara,2011-01-23 13:12:00+01:00 38 | 36,reject request,200,Mike,5,Fluxicon Nitro,reject request,Mike,2011-01-24 14:56:00+01:00 39 | 37,register request,50,Pete,4,Fluxicon Nitro,register request,Pete,2011-01-06 15:02:00+01:00 40 | 38,check ticket,100,Mike,4,Fluxicon Nitro,check ticket,Mike,2011-01-07 12:06:00+01:00 41 | 39,examine thoroughly,400,Sean,4,Fluxicon Nitro,examine thoroughly,Sean,2011-01-08 14:43:00+01:00 42 | 40,decide,200,Sara,4,Fluxicon Nitro,decide,Sara,2011-01-09 12:02:00+01:00 43 | 41,reject request,200,Ellen,4,Fluxicon Nitro,reject request,Ellen,2011-01-12 15:44:00+01:00 44 | -------------------------------------------------------------------------------- /pm4pyspark/algo/filtering/timestamp/timestamp_filter.py: -------------------------------------------------------------------------------- 1 | import pyspark.sql.functions as F 2 | 3 | from pm4py.algo.filtering.common.filtering_constants import CASE_CONCEPT_NAME 4 | from pm4py.algo.filtering.common.timestamp.timestamp_common import 
get_dt_from_string 5 | from pm4py.util.constants import PARAMETER_CONSTANT_TIMESTAMP_KEY, PARAMETER_CONSTANT_CASEID_KEY 6 | from pm4py.objects.log.util.xes import DEFAULT_TIMESTAMP_KEY 7 | from pm4pyspark.importer.csv import spark_df_imp as importer 8 | from pm4pyspark.importer.csv import spark_df_imp as csv_importer 9 | 10 | from pyspark.sql.window import Window 11 | from pyspark.sql.types import * 12 | 13 | 14 | 15 | 16 | def filter_traces_contained(df, dt1, dt2, parameters=None): 17 | """Gets traces that are contained in the given interval 18 | """ 19 | 20 | if parameters is None: 21 | parameters = {} 22 | timestamp_key = parameters[ 23 | PARAMETER_CONSTANT_TIMESTAMP_KEY] if PARAMETER_CONSTANT_TIMESTAMP_KEY in parameters else DEFAULT_TIMESTAMP_KEY 24 | case_id_glue = parameters[ 25 | PARAMETER_CONSTANT_CASEID_KEY] if PARAMETER_CONSTANT_CASEID_KEY in parameters else CASE_CONCEPT_NAME 26 | dt1 = get_dt_from_string(dt1) 27 | dt2 = get_dt_from_string(dt2) 28 | df_converted = importer.convert_timestamp_to_utc_in_df(df, timest_columns={timestamp_key}) 29 | df_ordered = df_converted.orderBy(case_id_glue, timestamp_key) 30 | w = Window().partitionBy(case_id_glue).orderBy(timestamp_key) 31 | w2 = Window().partitionBy(case_id_glue).orderBy(F.desc(timestamp_key)) 32 | stacked = df_ordered.withColumn(timestamp_key + "_last", F.max(df_ordered[timestamp_key]).over(w2)) 33 | stacked = stacked.withColumn(timestamp_key + "_first", F.min(stacked[timestamp_key]).over(w)) 34 | stacked = stacked.filter(stacked[timestamp_key + "_first"] > dt1) 35 | stacked = stacked.filter(stacked[timestamp_key + "_last"] < dt2) 36 | stacked_dropped = stacked.drop(timestamp_key + "_last", timestamp_key + "_first") 37 | 38 | return stacked_dropped 39 | 40 | 41 | def filter_traces_intersecting(df, dt1, dt2, parameters=None): 42 | """Filters traces intersecting the given interval 43 | """ 44 | 45 | if parameters is None: 46 | parameters = {} 47 | timestamp_key = parameters[ 48 | PARAMETER_CONSTANT_TIMESTAMP_KEY] if PARAMETER_CONSTANT_TIMESTAMP_KEY in parameters else DEFAULT_TIMESTAMP_KEY 49 | case_id_glue = parameters[ 50 | PARAMETER_CONSTANT_CASEID_KEY] if PARAMETER_CONSTANT_CASEID_KEY in parameters else CASE_CONCEPT_NAME 51 | dt1 = get_dt_from_string(dt1) 52 | dt2 = get_dt_from_string(dt2) 53 | df_converted = importer.convert_timestamp_to_utc_in_df(df, timest_columns={timestamp_key}) 54 | df_ordered = df_converted.orderBy(case_id_glue, timestamp_key) 55 | w = Window().partitionBy(case_id_glue).orderBy(timestamp_key) 56 | w2 = Window().partitionBy(case_id_glue).orderBy(F.desc(timestamp_key)) 57 | stacked = df_ordered.withColumn(timestamp_key + "_last", F.max(df_ordered[timestamp_key]).over(w2)) 58 | stacked = stacked.withColumn(timestamp_key + "_first", F.min(stacked[timestamp_key]).over(w)) 59 | 60 | #stacked1 = stacked.filter(stacked[timestamp_key + "_first"].between(dt1, dt2)) 61 | stacked1 = stacked.filter((stacked[timestamp_key + "_first"] > dt1) & (stacked[timestamp_key + "_first"] < dt2)) 62 | #stacked2 = stacked.filter(stacked[timestamp_key + "_last"].between(dt1, dt2)) 63 | stacked2 = stacked.filter((stacked[timestamp_key + "_last"] > dt1) & (stacked[timestamp_key + "_last"] < dt2)) 64 | stacked3 = stacked.filter(stacked[timestamp_key + "_first"] < dt1) 65 | stacked3 = stacked3.filter(stacked3[timestamp_key + "_last"] > dt2) 66 | 67 | 68 | stacked = stacked1.union(stacked2) 69 | stacked = stacked.union(stacked3) 70 | stacked = stacked.drop(timestamp_key + "_last", timestamp_key + "_first")\ 71 | 
.distinct().orderBy(case_id_glue, timestamp_key) 72 | 73 | return stacked 74 | 75 | 76 | def apply_events(df, dt1, dt2, parameters=None): 77 | """Gets a new Spark DataFrame with all the events contained in the given interval 78 | """ 79 | 80 | if parameters is None: 81 | parameters = {} 82 | timestamp_key = parameters[ 83 | PARAMETER_CONSTANT_TIMESTAMP_KEY] if PARAMETER_CONSTANT_TIMESTAMP_KEY in parameters else DEFAULT_TIMESTAMP_KEY 84 | 85 | if df.schema[timestamp_key].dataType != StringType(): 86 | dt1 = get_dt_from_string(dt1) 87 | dt2 = get_dt_from_string(dt2) 88 | filtered_df = df.filter((df[timestamp_key] > dt1) & (df[timestamp_key] < dt2)) 89 | 90 | return filtered_df 91 | -------------------------------------------------------------------------------- /pm4pyspark/importer/csv/spark_df_imp.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import pyspark 3 | import pyspark.sql.functions as F 4 | 5 | from dateutil import parser, tz 6 | from pm4py.objects.log import log as log_instance 7 | from pm4py.objects.conversion.log.versions import to_event_log 8 | from pm4pyspark.importer.constants import DEFAULT_NUM_PARTITION 9 | from pyspark.sql import SparkSession 10 | from pyspark.sql.types import * 11 | 12 | 13 | 14 | 15 | def convert_timestamp_to_utc_in_df(df, timest_columns=None): 16 | """Converts datatype of column "time:timestamp" from `StringType` to `TimestampType` as UTC timezone 17 | """ 18 | 19 | if timest_columns is None: 20 | timest_columns = {"time:timestamp"} 21 | for col in timest_columns: 22 | if df.schema[col].dataType == StringType(): 23 | utc_zone = tz.gettz("UTC") 24 | func = F.udf(lambda x: parser.parse(x).astimezone(utc_zone).isoformat(timespec='milliseconds'), StringType()) 25 | func2 = F.udf(lambda x: datetime.datetime.strptime(''.join(x[:-6].rsplit(':', 0)), '%Y-%m-%dT%H:%M:%S.%f'), TimestampType()) 26 | #df = df.withColumn(col + "_utc", func2(func(df[col]))) 27 | #df = df.drop(col).withColumnRenamed(col + "_utc", col) 28 | df = df.withColumn(col, func2(func(df[col]))) 29 | 30 | return df 31 | 32 | 33 | def import_sparkdf_from_path_wo_timeconversion(path, sep=None, quote=None, header=None, inferSchema=None, numPartition=DEFAULT_NUM_PARTITION): 34 | """Imports a Spark DataFrame from the given path of CSV format file (without time conversion) 35 | """ 36 | 37 | spark = (SparkSession. 38 | builder. 39 | master('local[*]'). 40 | config('spark.sql.shuffle.partitions', numPartition). 
41 | getOrCreate()) 42 | 43 | spark_df = spark.read.csv(path, sep=sep, quote=quote, header=header, inferSchema=inferSchema) 44 | 45 | return spark_df 46 | 47 | 48 | def convert_caseid_column_to_str(df, case_id_glue="case:concept:name"): 49 | """Converts Case ID column to StringType 50 | """ 51 | 52 | df = df.withColumn(case_id_glue, df[case_id_glue].cast(StringType())) 53 | 54 | return df 55 | 56 | 57 | 58 | def import_sparkdf_from_path(path, sep=None, quote=None, header=None, inferSchema=None, timest_columns=None, 59 | sort=False, sort_field="time:timestamp", ascending=True, numPartition=DEFAULT_NUM_PARTITION): 60 | """Imports a Spark DataFrame from the given path of CSV format file (with time conversion) 61 | """ 62 | 63 | spark_df = import_sparkdf_from_path_wo_timeconversion(path, sep=sep, quote=quote, header=header, 64 | inferSchema=inferSchema, numPartition=numPartition) 65 | spark_df = convert_timestamp_to_utc_in_df(spark_df, timest_columns=timest_columns) 66 | 67 | if sort and sort_field: 68 | if ascending is True: 69 | spark_df = spark_df.orderBy(sort_field) 70 | else: 71 | spark_df = spark_df.orderBy(sort_field, ascending=False) 72 | 73 | return spark_df 74 | 75 | 76 | def import_event_stream(path, sep=None, quote=None, header=None, inferSchema=True, timest_columns=None, sort=True, 77 | sort_field="time:timestamp", ascending=True, numPartition=DEFAULT_NUM_PARTITION): 78 | """Imports an `EventStream` from the given path of CSV format file 79 | """ 80 | 81 | spark_df = import_sparkdf_from_path(path, sep=sep, quote=quote, header=header, inferSchema=inferSchema, 82 | timest_columns=timest_columns, sort=sort, sort_field=sort_field, 83 | ascending=ascending, numPartition=numPartition) 84 | rdd = spark_df.rdd.map(lambda row: row.asDict()) 85 | event_stream = rdd.collect() 86 | event_stream = log_instance.EventStream(event_stream, attributes={'origin': 'csv'}) 87 | #pair_rdd = rdd.map(lambda s: (s[0], (s[1], s[2]))) 88 | #pair_rdd_group = pair_rdd.groupByKey().mapVal0ues(list) 89 | #return pair_rdd_group.collect() 90 | return event_stream 91 | 92 | 93 | def transform_event_stream_to_event_log(event_stream, case_glue="case:concept:name", include_case_attributes=True, enable_deepcopy=False): 94 | """Transforms an `EventStream` to an `EventLog` 95 | """ 96 | 97 | log = to_event_log.transform_event_stream_to_event_log(event_stream, 98 | case_glue=case_glue, 99 | include_case_attributes=include_case_attributes, 100 | enable_deepcopy=enable_deepcopy) 101 | 102 | return log 103 | -------------------------------------------------------------------------------- /pm4pyspark/algo/filtering/start_activities/start_activities_filter.py: -------------------------------------------------------------------------------- 1 | import pyspark.sql.functions as F 2 | 3 | from pm4py.algo.filtering.common.filtering_constants import CASE_CONCEPT_NAME, DECREASING_FACTOR 4 | from pm4py.algo.filtering.common.start_activities import start_activities_common 5 | from pm4py.objects.log.util.xes import DEFAULT_NAME_KEY, DEFAULT_TIMESTAMP_KEY 6 | from pm4py.util.constants import PARAMETER_CONSTANT_TIMESTAMP_KEY 7 | from pm4py.util.constants import PARAMETER_CONSTANT_ACTIVITY_KEY 8 | from pm4py.util.constants import PARAMETER_CONSTANT_CASEID_KEY 9 | from pm4py.util.constants import GROUPED_DATAFRAME 10 | from pyspark.sql.window import Window 11 | 12 | 13 | 14 | 15 | def apply(df, values, parameters=None): 16 | """Filters the Spark dataframe on start activities 17 | """ 18 | 19 | if parameters is None: 20 | parameters = {} 21 
| 22 | timestamp_key = parameters[ 23 | PARAMETER_CONSTANT_TIMESTAMP_KEY] if PARAMETER_CONSTANT_TIMESTAMP_KEY in parameters else DEFAULT_TIMESTAMP_KEY 24 | case_id_glue = parameters[ 25 | PARAMETER_CONSTANT_CASEID_KEY] if PARAMETER_CONSTANT_CASEID_KEY in parameters else CASE_CONCEPT_NAME 26 | activity_key = parameters[ 27 | PARAMETER_CONSTANT_ACTIVITY_KEY] if PARAMETER_CONSTANT_ACTIVITY_KEY in parameters else DEFAULT_NAME_KEY 28 | grouped_df = parameters[GROUPED_DATAFRAME] if GROUPED_DATAFRAME in parameters else None 29 | positive = parameters["positive"] if "positive" in parameters else True 30 | 31 | return filter_df_on_start_activities(df, values, timestamp_key=timestamp_key, case_id_glue=case_id_glue, activity_key=activity_key, 32 | positive=positive, grouped_df=grouped_df) 33 | 34 | 35 | def apply_auto_filter(df, parameters=None): 36 | """Applies auto filter on start activities 37 | """ 38 | 39 | if parameters is None: 40 | parameters = {} 41 | 42 | timestamp_key = parameters[ 43 | PARAMETER_CONSTANT_TIMESTAMP_KEY] if PARAMETER_CONSTANT_TIMESTAMP_KEY in parameters else DEFAULT_TIMESTAMP_KEY 44 | case_id_glue = parameters[ 45 | PARAMETER_CONSTANT_CASEID_KEY] if PARAMETER_CONSTANT_CASEID_KEY in parameters else CASE_CONCEPT_NAME 46 | activity_key = parameters[ 47 | PARAMETER_CONSTANT_ACTIVITY_KEY] if PARAMETER_CONSTANT_ACTIVITY_KEY in parameters else DEFAULT_NAME_KEY 48 | decreasing_factor = parameters[ 49 | "decreasingFactor"] if "decreasingFactor" in parameters else DECREASING_FACTOR 50 | grouped_df = parameters[GROUPED_DATAFRAME] if GROUPED_DATAFRAME in parameters else None 51 | 52 | start_activities = get_start_activities(df, parameters=parameters) 53 | salist = start_activities_common.get_sorted_start_activities_list(start_activities) 54 | sathreshold = start_activities_common.get_start_activities_threshold(salist, decreasing_factor) 55 | 56 | return filter_df_on_start_activities_nocc(df, sathreshold, sa_count0=start_activities, timestamp_key=timestamp_key, case_id_glue=case_id_glue, 57 | activity_key=activity_key, grouped_df=grouped_df) 58 | 59 | 60 | def get_start_activities(df, parameters=None): 61 | """Gets start activities count 62 | """ 63 | 64 | if parameters is None: 65 | parameters = {} 66 | 67 | timestamp_key = parameters[ 68 | PARAMETER_CONSTANT_TIMESTAMP_KEY] if PARAMETER_CONSTANT_TIMESTAMP_KEY in parameters else DEFAULT_TIMESTAMP_KEY 69 | case_id_glue = parameters[ 70 | PARAMETER_CONSTANT_CASEID_KEY] if PARAMETER_CONSTANT_CASEID_KEY in parameters else CASE_CONCEPT_NAME 71 | activity_key = parameters[ 72 | PARAMETER_CONSTANT_ACTIVITY_KEY] if PARAMETER_CONSTANT_ACTIVITY_KEY in parameters else DEFAULT_NAME_KEY 73 | grouped_df = parameters[GROUPED_DATAFRAME] if GROUPED_DATAFRAME in parameters else df.groupby(case_id_glue) 74 | 75 | df_start = grouped_df.agg(F.first(activity_key).alias(activity_key)).select(activity_key) 76 | rdd_start = df_start.rdd.map(lambda row: (row[0], 1)).reduceByKey(lambda x, y : x + y) 77 | 78 | return rdd_start.collectAsMap() 79 | 80 | 81 | def filter_df_on_start_activities(df, values, timestamp_key=DEFAULT_TIMESTAMP_KEY, case_id_glue=CASE_CONCEPT_NAME, 82 | activity_key=DEFAULT_NAME_KEY, grouped_df=None, positive=True): 83 | """Filters the Spark dataframe on start activities 84 | """ 85 | 86 | if grouped_df is None: 87 | grouped_df = df.groupby(case_id_glue) 88 | 89 | grouped_df = grouped_df.agg(F.first(activity_key).alias(activity_key+"_1")) 90 | df_start = grouped_df.filter(grouped_df[activity_key+"_1"].isin(values)) 91 | 92 | if positive: 93 | 
return df.join(F.broadcast(df_start), grouped_df.columns[0]).drop(activity_key+"_1") 94 | else: 95 | return df.join(F.broadcast(df_start), grouped_df.columns[0], "leftanti") 96 | 97 | 98 | def filter_df_on_start_activities_nocc(df, nocc, sa_count0=None, timestamp_key=DEFAULT_TIMESTAMP_KEY, 99 | case_id_glue=CASE_CONCEPT_NAME, activity_key=DEFAULT_NAME_KEY, grouped_df=None): 100 | """Filters the Spark dataframe on start activities number of occurrences 101 | """ 102 | 103 | if grouped_df is None: 104 | grouped_df = df.groupby(case_id_glue) 105 | if sa_count0 is None: 106 | parameters = { 107 | PARAMETER_CONSTANT_TIMESTAMP_KEY: timestamp_key, 108 | PARAMETER_CONSTANT_CASEID_KEY: case_id_glue, 109 | PARAMETER_CONSTANT_ACTIVITY_KEY: activity_key, 110 | GROUPED_DATAFRAME: grouped_df 111 | } 112 | sa_count0 = get_start_activities(df, parameters=parameters) 113 | sa_count = [k for k, v in sa_count0.items() if v >= nocc] 114 | 115 | if len(sa_count) < len(sa_count0): 116 | grouped_df = grouped_df.agg(F.first(activity_key).alias(activity_key+"_1")) 117 | df_start = grouped_df.filter(grouped_df[activity_key+"_1"].isin(sa_count)) 118 | return df.join(F.broadcast(df_start), grouped_df.columns[0]).drop(activity_key+"_1") 119 | return df 120 | -------------------------------------------------------------------------------- /pm4pyspark/algo/filtering/end_activities/end_activities_filter.py: -------------------------------------------------------------------------------- 1 | import pyspark.sql.functions as F 2 | 3 | from pm4py.algo.filtering.common.end_activities import end_activities_common 4 | from pm4py.algo.filtering.common.filtering_constants import CASE_CONCEPT_NAME, DECREASING_FACTOR 5 | from pm4py.objects.log.util.xes import DEFAULT_NAME_KEY, DEFAULT_TIMESTAMP_KEY 6 | from pm4py.util.constants import PARAMETER_CONSTANT_TIMESTAMP_KEY 7 | from pm4py.util.constants import PARAMETER_CONSTANT_ACTIVITY_KEY 8 | from pm4py.util.constants import PARAMETER_CONSTANT_CASEID_KEY 9 | from pm4py.util.constants import PARAM_MOST_COMMON_VARIANT 10 | from pm4py.util.constants import RETURN_EA_COUNT_DICT_AUTOFILTER 11 | from pm4py.util.constants import GROUPED_DATAFRAME 12 | 13 | 14 | 15 | 16 | def apply(df, values, parameters=None): 17 | """Filters the Spark dataframe on end activities 18 | """ 19 | if parameters is None: 20 | parameters = {} 21 | timestamp_key = parameters[ 22 | PARAMETER_CONSTANT_TIMESTAMP_KEY] if PARAMETER_CONSTANT_TIMESTAMP_KEY in parameters else DEFAULT_TIMESTAMP_KEY 23 | case_id_glue = parameters[ 24 | PARAMETER_CONSTANT_CASEID_KEY] if PARAMETER_CONSTANT_CASEID_KEY in parameters else CASE_CONCEPT_NAME 25 | activity_key = parameters[ 26 | PARAMETER_CONSTANT_ACTIVITY_KEY] if PARAMETER_CONSTANT_ACTIVITY_KEY in parameters else DEFAULT_NAME_KEY 27 | grouped_df = parameters[GROUPED_DATAFRAME] if GROUPED_DATAFRAME in parameters else None 28 | positive = parameters["positive"] if "positive" in parameters else True 29 | 30 | return filter_df_on_end_activities(df, values, timestamp_key=timestamp_key, case_id_glue=case_id_glue, activity_key=activity_key, 31 | positive=positive, grouped_df=grouped_df) 32 | 33 | 34 | def apply_auto_filter(df, parameters=None): 35 | """Applies auto filter on end activities 36 | """ 37 | if parameters is None: 38 | parameters = {} 39 | 40 | most_common_variant = parameters[PARAM_MOST_COMMON_VARIANT] if PARAM_MOST_COMMON_VARIANT in parameters else None 41 | 42 | if most_common_variant is None: 43 | most_common_variant = [] 44 | 45 | timestamp_key = parameters[ 46 | 
PARAMETER_CONSTANT_TIMESTAMP_KEY] if PARAMETER_CONSTANT_TIMESTAMP_KEY in parameters else DEFAULT_TIMESTAMP_KEY 47 | case_id_glue = parameters[ 48 | PARAMETER_CONSTANT_CASEID_KEY] if PARAMETER_CONSTANT_CASEID_KEY in parameters else CASE_CONCEPT_NAME 49 | activity_key = parameters[ 50 | PARAMETER_CONSTANT_ACTIVITY_KEY] if PARAMETER_CONSTANT_ACTIVITY_KEY in parameters else DEFAULT_NAME_KEY 51 | grouped_df = parameters[GROUPED_DATAFRAME] if GROUPED_DATAFRAME in parameters else None 52 | return_dict = parameters[ 53 | RETURN_EA_COUNT_DICT_AUTOFILTER] if RETURN_EA_COUNT_DICT_AUTOFILTER in parameters else False 54 | 55 | decreasing_factor = parameters[ 56 | "decreasingFactor"] if "decreasingFactor" in parameters else DECREASING_FACTOR 57 | if df.count() > 0: 58 | end_activities = get_end_activities(df, parameters=parameters) 59 | ealist = end_activities_common.get_sorted_end_activities_list(end_activities) 60 | eathreshold = end_activities_common.get_end_activities_threshold(ealist, decreasing_factor) 61 | 62 | return filter_df_on_end_activities_nocc(df, eathreshold, ea_count0=end_activities, timestamp_key=timestamp_key, 63 | case_id_glue=case_id_glue, activity_key=activity_key, grouped_df=grouped_df, 64 | return_dict=return_dict, most_common_variant=most_common_variant) 65 | 66 | if return_dict: 67 | return df, {} 68 | 69 | return df 70 | 71 | 72 | def get_end_activities(df, parameters=None): 73 | """Gets end activities count 74 | """ 75 | 76 | if parameters is None: 77 | parameters = {} 78 | 79 | timestamp_key = parameters[ 80 | PARAMETER_CONSTANT_TIMESTAMP_KEY] if PARAMETER_CONSTANT_TIMESTAMP_KEY in parameters else DEFAULT_TIMESTAMP_KEY 81 | case_id_glue = parameters[ 82 | PARAMETER_CONSTANT_CASEID_KEY] if PARAMETER_CONSTANT_CASEID_KEY in parameters else CASE_CONCEPT_NAME 83 | activity_key = parameters[ 84 | PARAMETER_CONSTANT_ACTIVITY_KEY] if PARAMETER_CONSTANT_ACTIVITY_KEY in parameters else DEFAULT_NAME_KEY 85 | grouped_df = parameters[GROUPED_DATAFRAME] if GROUPED_DATAFRAME in parameters else df.groupby(case_id_glue) 86 | 87 | df_end = grouped_df.agg(F.last(activity_key).alias(activity_key)).select(activity_key) 88 | rdd_end = df_end.rdd.map(lambda row: (row[0], 1)).reduceByKey(lambda x, y : x + y) 89 | 90 | return rdd_end.collectAsMap() 91 | 92 | 93 | def filter_df_on_end_activities(df, values, timestamp_key=DEFAULT_TIMESTAMP_KEY, case_id_glue=CASE_CONCEPT_NAME, 94 | activity_key=DEFAULT_NAME_KEY, grouped_df=None, positive=True): 95 | """Filters the Spark dataframe on end activities 96 | """ 97 | 98 | if grouped_df is None: 99 | grouped_df = df.groupby(case_id_glue) 100 | 101 | grouped_df = grouped_df.agg(F.last(activity_key).alias(activity_key+"_1")) 102 | df_end = grouped_df.filter(grouped_df[activity_key+"_1"].isin(values)) 103 | 104 | if positive: 105 | return df.join(F.broadcast(df_end), grouped_df.columns[0]).drop(activity_key+"_1") 106 | else: 107 | return df.join(F.broadcast(df_end), grouped_df.columns[0], "leftanti") 108 | 109 | 110 | def filter_df_on_end_activities_nocc(df, nocc, ea_count0=None, timestamp_key=DEFAULT_TIMESTAMP_KEY, 111 | case_id_glue=CASE_CONCEPT_NAME, activity_key=DEFAULT_NAME_KEY, 112 | grouped_df=None, return_dict=False, most_common_variant=None): 113 | """Filters the Spark dataframe on end activities number of occurrences 114 | """ 115 | 116 | if most_common_variant is None: 117 | most_common_variant = [] 118 | 119 | if df.count() > 0: 120 | if grouped_df is None: 121 | grouped_df = df.groupby(case_id_glue) 122 | if ea_count0 is None: 123 | parameters = { 
124 | PARAMETER_CONSTANT_TIMESTAMP_KEY: timestamp_key, 125 | PARAMETER_CONSTANT_CASEID_KEY: case_id_glue, 126 | PARAMETER_CONSTANT_ACTIVITY_KEY: activity_key, 127 | GROUPED_DATAFRAME: grouped_df 128 | } 129 | ea_count0 = get_end_activities(df, parameters=parameters) 130 | ea_count = [k for k, v in ea_count0.items() if 131 | v >= nocc or (len(most_common_variant) > 0 and k == most_common_variant[-1])] 132 | ea_count_dict = {k: v for k, v in ea_count0.items() if 133 | v >= nocc or (len(most_common_variant) > 0 and k == most_common_variant[-1])} 134 | 135 | # Using join operation 136 | if len(ea_count) < len(ea_count0): 137 | grouped_df = grouped_df.agg(F.last(activity_key).alias(activity_key+"_1")) 138 | df_end = grouped_df.filter(grouped_df[activity_key+"_1"].isin(ea_count)) 139 | if return_dict: 140 | return df.join(F.broadcast(df_end), grouped_df.columns[0]).drop(activity_key+"_1"), ea_count_dict 141 | return df.join(F.broadcast(df_end), grouped_df.columns[0]).drop(activity_key+"_1") 142 | if return_dict: 143 | return df, ea_count_dict 144 | return df 145 | -------------------------------------------------------------------------------- /pm4pyspark/algo/filtering/attributes/attributes_filter.py: -------------------------------------------------------------------------------- 1 | import pyspark.sql.functions as F 2 | 3 | from pm4py.algo.filtering.common.attributes import attributes_common 4 | from pm4py.algo.filtering.common.filtering_constants import CASE_CONCEPT_NAME, DECREASING_FACTOR 5 | from pm4py.objects.log.util.xes import DEFAULT_NAME_KEY, DEFAULT_TIMESTAMP_KEY 6 | from pm4py.util.constants import PARAMETER_CONSTANT_ATTRIBUTE_KEY 7 | from pm4py.util.constants import PARAMETER_CONSTANT_CASEID_KEY 8 | from pm4py.util.constants import PARAM_MOST_COMMON_VARIANT 9 | from pm4py.util.constants import PARAMETER_CONSTANT_ACTIVITY_KEY 10 | 11 | 12 | 13 | def apply_numeric_events(df, int1, int2, parameters=None): 14 | """Applies a filter on events (numerical filter) 15 | """ 16 | 17 | if parameters is None: 18 | parameters = {} 19 | attribute_key = parameters[ 20 | PARAMETER_CONSTANT_ATTRIBUTE_KEY] if PARAMETER_CONSTANT_ATTRIBUTE_KEY in parameters else DEFAULT_NAME_KEY 21 | positive = parameters["positive"] if "positive" in parameters else True 22 | if positive: 23 | return df.filter(df[attribute_key].between(int1, int2)) 24 | else: 25 | return df.filter(~df[attribute_key].between(int1, int2)) 26 | 27 | 28 | def apply_numeric(df, int1, int2, parameters=None): 29 | """Filters the Spark dataframe on attribute values (filter cases) 30 | """ 31 | 32 | if parameters is None: 33 | parameters = {} 34 | attribute_key = parameters[ 35 | PARAMETER_CONSTANT_ATTRIBUTE_KEY] if PARAMETER_CONSTANT_ATTRIBUTE_KEY in parameters else DEFAULT_NAME_KEY 36 | case_id_glue = parameters[ 37 | PARAMETER_CONSTANT_CASEID_KEY] if PARAMETER_CONSTANT_CASEID_KEY in parameters else CASE_CONCEPT_NAME 38 | positive = parameters["positive"] if "positive" in parameters else True 39 | 40 | df_filtered = df.filter(df[attribute_key].between(int1, int2)) 41 | df_filtered = df_filtered.groupBy(case_id_glue).count() 42 | #filtered_index = df_filtered.select(case_id_glue).rdd.map(lambda x: x[0]).collect() 43 | if positive: 44 | return df.join(F.broadcast(df_filtered), case_id_glue).drop("count") 45 | else: 46 | df_left_joined = df.join(F.broadcast(df_filtered), case_id_glue, "left") 47 | return df_left_joined.filter(df_left_joined["count"].isNull()).drop("count") 48 | 49 | 50 | def apply_events(df, values, parameters=None): 51 | """Filters the Spark dataframe on attribute values (filter events) 52 | """ 53 | 54 | if
parameters is None: 55 | parameters = {} 56 | attribute_key = parameters[ 57 | PARAMETER_CONSTANT_ATTRIBUTE_KEY] if PARAMETER_CONSTANT_ATTRIBUTE_KEY in parameters else DEFAULT_NAME_KEY 58 | positive = parameters["positive"] if "positive" in parameters else True 59 | if positive: 60 | return df.filter(df[attribute_key].isin(values)) 61 | else: 62 | return df.filter(~df[attribute_key].isin(values)) 63 | 64 | 65 | def apply(df, values, parameters=None): 66 | """Filters the Spark dataframe on attribute values (filter traces) 67 | """ 68 | 69 | if parameters is None: 70 | parameters = {} 71 | 72 | case_id_glue = parameters[ 73 | PARAMETER_CONSTANT_CASEID_KEY] if PARAMETER_CONSTANT_CASEID_KEY in parameters else CASE_CONCEPT_NAME 74 | attribute_key = parameters[ 75 | PARAMETER_CONSTANT_ATTRIBUTE_KEY] if PARAMETER_CONSTANT_ATTRIBUTE_KEY in parameters else DEFAULT_NAME_KEY 76 | positive = parameters["positive"] if "positive" in parameters else True 77 | 78 | return filter_df_on_attribute_values(df, values, case_id_glue=case_id_glue, attribute_key=attribute_key, 79 | positive=positive) 80 | 81 | 82 | def apply_auto_filter(df, parameters=None): 83 | """Applies auto filter on activity values 84 | """ 85 | if parameters is None: 86 | parameters = {} 87 | 88 | most_common_variant = parameters[PARAM_MOST_COMMON_VARIANT] if PARAM_MOST_COMMON_VARIANT in parameters else None 89 | 90 | if most_common_variant is None: 91 | most_common_variant = [] 92 | 93 | activity_key = parameters[ 94 | PARAMETER_CONSTANT_ACTIVITY_KEY] if PARAMETER_CONSTANT_ACTIVITY_KEY in parameters else DEFAULT_NAME_KEY 95 | decreasing_factor = parameters[ 96 | "decreasingFactor"] if "decreasingFactor" in parameters else DECREASING_FACTOR 97 | 98 | if df.count() > 0: 99 | activities = get_attribute_values(df, activity_key) 100 | alist = attributes_common.get_sorted_attributes_list(activities) 101 | thresh = attributes_common.get_attributes_threshold(alist, decreasing_factor) 102 | 103 | return filter_df_keeping_activ_exc_thresh(df, thresh, activity_key=activity_key, act_count0=activities, 104 | most_common_variant=most_common_variant) 105 | return df 106 | 107 | 108 | def get_attribute_values(df, attribute_key, parameters=None): 109 | """Returns a list of attribute values contained in the specified column of the CSV 110 | """ 111 | 112 | if parameters is None: 113 | parameters = {} 114 | str(parameters) 115 | df = df.select(attribute_key) 116 | rdd_df = df.rdd.map(lambda event: (event[0], 1)).reduceByKey(lambda x, y : x + y)\ 117 | .sortBy(lambda x: -x[1]) 118 | 119 | return rdd_df.collectAsMap() 120 | 121 | 122 | def filter_df_on_attribute_values(df, values, case_id_glue="case:concept:name", attribute_key="concept:name", 123 | positive=True): 124 | """Filters the Spark dataframe on attribute values 125 | """ 126 | 127 | df_filtered = df.filter(df[attribute_key].isin(values)) 128 | df_filtered = df_filtered.groupBy(case_id_glue).count() 129 | if positive: 130 | return df.join(F.broadcast(df_filtered), case_id_glue).drop("count") 131 | else: 132 | return df.join(F.broadcast(df_filtered), case_id_glue, "leftanti") 133 | 134 | 135 | def filter_df_keeping_activ_exc_thresh(df, thresh, act_count0=None, activity_key="concept:name", 136 | most_common_variant=None): 137 | """Filters the Spark dataframe keeping activities exceeding the threshold 138 | """ 139 | 140 | if most_common_variant is None: 141 | most_common_variant = [] 142 | 143 | if act_count0 is None: 144 | act_count0 = get_attribute_values(df, activity_key) 145 | act_count = [k for k, 
v in act_count0.items() if v >= thresh or k in most_common_variant] 146 | if len(act_count) < len(act_count0): 147 | df = df.filter(df[activity_key].isin(act_count)) 148 | return df 149 | 150 | 151 | def filter_df_keeping_spno_activities(df, activity_key="concept:name", max_no_activities=25): 152 | """Filters the Spark dataframe on the specified number of attributes 153 | """ 154 | 155 | activity_values_dict = get_attribute_values(df, activity_key) 156 | activity_values_ordered_list = [] 157 | for act in activity_values_dict: 158 | activity_values_ordered_list.append([act, activity_values_dict[act]]) 159 | activity_values_ordered_list = sorted(activity_values_ordered_list, key=lambda x: (x[1], x[0]), reverse=True) 160 | activity_values_ordered_list = activity_values_ordered_list[ 161 | 0:min(len(activity_values_ordered_list), max_no_activities)] 162 | activity_to_keep = [x[0] for x in activity_values_ordered_list] 163 | 164 | if len(activity_to_keep) < len(activity_values_dict): 165 | df = df.filter(df[activity_key].isin(activity_to_keep)) 166 | return df 167 | 168 | 169 | def get_kde_numeric_attribute(df, attribute, parameters=None): 170 | """Gets the KDE estimation for the distribution of a numeric attribute values 171 | """ 172 | values = df.select(attribute).rdd.map(lambda row : row[0]).collect() 173 | 174 | return attributes_common.get_kde_numeric_attribute(values, parameters=parameters) 175 | 176 | 177 | def get_kde_numeric_attribute_json(df, attribute, parameters=None): 178 | """ 179 | Gets the KDE estimation for the distribution of a numeric attribute values 180 | (expressed as JSON) 181 | """ 182 | values = df.select(attribute).rdd.map(lambda row : row[0]).collect() 183 | 184 | return attributes_common.get_kde_numeric_attribute_json(values, parameters=parameters) 185 | 186 | 187 | def get_kde_date_attribute(df, attribute=DEFAULT_TIMESTAMP_KEY, parameters=None): 188 | """Gets the KDE estimation for the distribution of a date attribute values 189 | """ 190 | date_values = df.select(attribute).rdd.map(lambda row : row[0]).collect() 191 | 192 | return attributes_common.get_kde_date_attribute(date_values, parameters=parameters) 193 | 194 | 195 | def get_kde_date_attribute_json(df, attribute=DEFAULT_TIMESTAMP_KEY, parameters=None): 196 | """ 197 | Gets the KDE estimation for the distribution of a date attribute values 198 | (expressed as JSON) 199 | """ 200 | values = df.select(attribute).rdd.map(lambda row : row[0]).collect() 201 | 202 | return attributes_common.get_kde_date_attribute_json(values, parameters=parameters) 203 | -------------------------------------------------------------------------------- /pm4pyspark/algo/filtering/variants/variants_filter.py: -------------------------------------------------------------------------------- 1 | import pyspark.sql.functions as F 2 | 3 | from pm4py.algo.filtering.common.filtering_constants import CASE_CONCEPT_NAME, DECREASING_FACTOR 4 | from pm4py.objects.log.util.xes import DEFAULT_NAME_KEY, DEFAULT_TIMESTAMP_KEY 5 | from pm4py.statistics.traces.common import case_duration as case_duration_commons 6 | from pm4py.util.constants import PARAMETER_CONSTANT_TIMESTAMP_KEY 7 | from pm4py.util.constants import PARAMETER_CONSTANT_ACTIVITY_KEY 8 | from pm4py.util.constants import PARAMETER_CONSTANT_CASEID_KEY 9 | from pyspark.sql.window import Window 10 | 11 | 12 | 13 | 14 | def apply_auto_filter(df, parameters=None): 15 | """Applies an automatic filter on variants 16 | """ 17 | if parameters is None: 18 | parameters = {} 19 | case_id_glue = 
parameters[ 20 | PARAMETER_CONSTANT_CASEID_KEY] if PARAMETER_CONSTANT_CASEID_KEY in parameters else CASE_CONCEPT_NAME 21 | variants_df = get_variants_df(df, parameters=parameters) 22 | parameters["variants_df"] = variants_df 23 | variants = get_variant_statistics(df, parameters=parameters) 24 | decreasing_factor = parameters[ 25 | "decreasingFactor"] if "decreasingFactor" in parameters else DECREASING_FACTOR 26 | 27 | admitted_variants = [] 28 | if len(variants) > 0: 29 | current_variant_count = variants[0][case_id_glue] 30 | 31 | for i in range(len(variants)): 32 | if variants[i][case_id_glue] >= decreasing_factor * current_variant_count: 33 | admitted_variants.append(variants[i]["variant"]) 34 | else: 35 | break 36 | current_variant_count = variants[i][case_id_glue] 37 | 38 | return apply(df, admitted_variants, parameters=parameters) 39 | 40 | 41 | def apply(df, admitted_variants, parameters=None): 42 | """Applies a filter on variants 43 | """ 44 | if parameters is None: 45 | parameters = {} 46 | 47 | case_id_glue = parameters[ 48 | PARAMETER_CONSTANT_CASEID_KEY] if PARAMETER_CONSTANT_CASEID_KEY in parameters else CASE_CONCEPT_NAME 49 | positive = parameters["positive"] if "positive" in parameters else True 50 | variants_df = parameters["variants_df"] if "variants_df" in parameters else get_variants_df(df, 51 | parameters=parameters) 52 | variants_df = variants_df.filter(variants_df["variant"].isin(admitted_variants)) 53 | 54 | if positive: 55 | return df.join(F.broadcast(variants_df), case_id_glue) 56 | else: 57 | return df.join(F.broadcast(variants_df), case_id_glue, "leftanti") 58 | 59 | 60 | def get_variant_statistics(df, parameters=None): 61 | """Gets variants from the Spark dataframe 62 | """ 63 | if parameters is None: 64 | parameters = {} 65 | 66 | case_id_glue = parameters[ 67 | PARAMETER_CONSTANT_CASEID_KEY] if PARAMETER_CONSTANT_CASEID_KEY in parameters else CASE_CONCEPT_NAME 68 | max_variants_to_return = parameters["max_variants_to_return"] if "max_variants_to_return" in parameters else None 69 | variants_df = parameters["variants_df"] if "variants_df" in parameters else get_variants_df(df, 70 | parameters=parameters) 71 | 72 | variants_df_count = variants_df.groupby("variant").count().orderBy("count", ascending=False) 73 | variants_df_count = variants_df_count.withColumnRenamed("count", case_id_glue) 74 | rdd = variants_df_count.rdd.map(lambda row: row.asDict()) 75 | if max_variants_to_return: 76 | return rdd.take(max_variants_to_return) 77 | return rdd.collect() 78 | 79 | 80 | def get_variant_statistics_with_case_duration(df, parameters=None): 81 | """Gets variants from the Spark dataframe with case duration 82 | """ 83 | if parameters is None: 84 | parameters = {} 85 | case_id_glue = parameters[ 86 | PARAMETER_CONSTANT_CASEID_KEY] if PARAMETER_CONSTANT_CASEID_KEY in parameters else CASE_CONCEPT_NAME 87 | max_variants_to_return = parameters["max_variants_to_return"] if "max_variants_to_return" in parameters else None 88 | variants_df = parameters["variants_df"] if "variants_df" in parameters else get_variants_df_with_case_duration(df, 89 | parameters=parameters) 90 | 91 | variants_df = variants_df.groupby("variant").agg( 92 | F.mean("caseDuration").alias("caseDuration"), 93 | F.count(F.lit(1)).alias("count") 94 | ).orderBy("count", ascending=False) 95 | variants_list = variants_df.rdd.map(lambda row: row.asDict()) 96 | if max_variants_to_return: 97 | return variants_list.take(max_variants_to_return) 98 | return variants_list.collect() 99 | 100 | 101 | def 
get_variants_df_and_list(df, parameters=None): 102 | """(Technical method) Provides variants_df and variants_list out of the box 103 | """ 104 | if parameters is None: 105 | parameters = {} 106 | case_id_glue = parameters[ 107 | PARAMETER_CONSTANT_CASEID_KEY] if PARAMETER_CONSTANT_CASEID_KEY in parameters else CASE_CONCEPT_NAME 108 | variants_df = get_variants_df(df, parameters=parameters) 109 | parameters["variants_df"] = variants_df 110 | variants_stats = get_variant_statistics(df, parameters=parameters) 111 | variants_list = [] 112 | for vd in variants_stats: 113 | variant = vd["variant"] 114 | count = vd[case_id_glue] 115 | variants_list.append([variant, count]) 116 | variants_list = sorted(variants_list, key=lambda x: (x[1], x[0]), reverse=True) 117 | return variants_df, variants_list 118 | 119 | 120 | def get_cases_description(df, parameters=None): 121 | """Gets a description of traces present in the Spark dataframe 122 | """ 123 | if parameters is None: 124 | parameters = {} 125 | 126 | case_id_glue = parameters[ 127 | PARAMETER_CONSTANT_CASEID_KEY] if PARAMETER_CONSTANT_CASEID_KEY in parameters else CASE_CONCEPT_NAME 128 | timestamp_key = parameters[ 129 | PARAMETER_CONSTANT_TIMESTAMP_KEY] if PARAMETER_CONSTANT_TIMESTAMP_KEY in parameters else DEFAULT_TIMESTAMP_KEY 130 | enable_sort = parameters["enable_sort"] if "enable_sort" in parameters else True 131 | sort_by_column = parameters["sort_by_column"] if "sort_by_column" in parameters else "startTime" 132 | sort_ascending = parameters["sort_ascending"] if "sort_ascending" in parameters else True 133 | max_ret_cases = parameters["max_ret_cases"] if "max_ret_cases" in parameters else None 134 | 135 | ordered_df = df.orderBy(timestamp_key).select(case_id_glue, timestamp_key) 136 | grouped_df = ordered_df.groupby(case_id_glue) 137 | 138 | start_df = grouped_df.agg(F.min(timestamp_key).alias(timestamp_key)) 139 | first_eve_df = ordered_df.join(F.broadcast(start_df), start_df.columns) 140 | end_df = grouped_df.agg(F.max(timestamp_key).alias(timestamp_key)) 141 | last_eve_df = ordered_df.join(F.broadcast(end_df), end_df.columns) 142 | last_eve_df = last_eve_df.withColumnRenamed(timestamp_key, timestamp_key+"_2") 143 | 144 | stacked_df = first_eve_df.join(last_eve_df, case_id_glue).orderBy(case_id_glue) 145 | stacked_df = stacked_df.withColumn("caseDuration", F.unix_timestamp(stacked_df[timestamp_key+"_2"]) - F.unix_timestamp(stacked_df[timestamp_key])) 146 | stacked_df = stacked_df.withColumn("startTime", F.unix_timestamp(stacked_df[timestamp_key])).drop(timestamp_key) 147 | stacked_df = stacked_df.withColumn("endTime", F.unix_timestamp(stacked_df[timestamp_key+"_2"])).drop(timestamp_key+"_2") 148 | 149 | if enable_sort: 150 | stacked_df = stacked_df.orderBy(sort_by_column, ascending=sort_ascending) 151 | if max_ret_cases is not None: 152 | stacked_df = stacked_df.limit(max_ret_cases) 153 | rdd = stacked_df.rdd.map(lambda x: (x[case_id_glue], {'caseDuration': x['caseDuration'], 154 | 'startTime': x['startTime'], 155 | 'endTime': x['endTime']})) 156 | return rdd.collectAsMap() 157 | 158 | 159 | def get_variants_df(df, parameters=None): 160 | """Gets variants dataframe from the Spark dataframe 161 | """ 162 | if parameters is None: 163 | parameters = {} 164 | 165 | timestamp_key = parameters[ 166 | PARAMETER_CONSTANT_TIMESTAMP_KEY] if PARAMETER_CONSTANT_TIMESTAMP_KEY in parameters else DEFAULT_TIMESTAMP_KEY 167 | case_id_glue = parameters[ 168 | PARAMETER_CONSTANT_CASEID_KEY] if PARAMETER_CONSTANT_CASEID_KEY in parameters else 
CASE_CONCEPT_NAME 169 | activity_key = parameters[ 170 | PARAMETER_CONSTANT_ACTIVITY_KEY] if PARAMETER_CONSTANT_ACTIVITY_KEY in parameters else DEFAULT_NAME_KEY 171 | 172 | df = df.select(case_id_glue, activity_key) 173 | grouped_df = df.withColumn("@@id", F.monotonically_increasing_id())\ 174 | .groupBy(case_id_glue)\ 175 | .agg(F.collect_list(F.struct("@@id", activity_key)).alias("variant"))\ 176 | .select(case_id_glue, F.sort_array("variant").getItem(activity_key).alias("variant")) 177 | grouped_df = grouped_df.withColumn("variant", F.concat_ws(",", "variant")) 178 | 179 | return grouped_df 180 | 181 | 182 | def get_variants_df_with_case_duration(df, parameters=None): 183 | """Gets variants dataframe from the Spark dataframe, with case duration that is included 184 | """ 185 | if parameters is None: 186 | parameters = {} 187 | 188 | case_id_glue = parameters[ 189 | PARAMETER_CONSTANT_CASEID_KEY] if PARAMETER_CONSTANT_CASEID_KEY in parameters else CASE_CONCEPT_NAME 190 | activity_key = parameters[ 191 | PARAMETER_CONSTANT_ACTIVITY_KEY] if PARAMETER_CONSTANT_ACTIVITY_KEY in parameters else DEFAULT_NAME_KEY 192 | timestamp_key = parameters[ 193 | PARAMETER_CONSTANT_TIMESTAMP_KEY] if PARAMETER_CONSTANT_TIMESTAMP_KEY in parameters else DEFAULT_TIMESTAMP_KEY 194 | 195 | ordered_df = df.orderBy(timestamp_key).select(case_id_glue, timestamp_key, activity_key) 196 | grouped_df = ordered_df.groupby(case_id_glue) 197 | df1 = grouped_df.agg(F.collect_list(activity_key).alias("variant")) 198 | df1 = df1.withColumn("variant", F.concat_ws(",", "variant")).orderBy(case_id_glue) 199 | 200 | start_df = grouped_df.agg(F.min(timestamp_key).alias(timestamp_key)) 201 | first_eve_df = ordered_df.join(F.broadcast(start_df), start_df.columns) 202 | end_df = grouped_df.agg(F.max(timestamp_key).alias(timestamp_key)) 203 | last_eve_df = ordered_df.join(F.broadcast(end_df), end_df.columns) 204 | last_eve_df = last_eve_df.withColumnRenamed(timestamp_key, timestamp_key+"_2") 205 | last_eve_df = last_eve_df.withColumnRenamed(activity_key, activity_key+"_2") 206 | 207 | stacked_df = first_eve_df.join(last_eve_df, case_id_glue).orderBy(case_id_glue) 208 | stacked_df = stacked_df.withColumn("caseDuration", F.unix_timestamp(stacked_df[timestamp_key+"_2"]) - F.unix_timestamp(stacked_df[timestamp_key])) 209 | new_df = df1.join(stacked_df, case_id_glue) 210 | return new_df 211 | 212 | 213 | def get_events(df, case_id, parameters=None): 214 | """Gets events belonging to the specified case 215 | """ 216 | if parameters is None: 217 | parameters = {} 218 | case_id_glue = parameters[ 219 | PARAMETER_CONSTANT_CASEID_KEY] if PARAMETER_CONSTANT_CASEID_KEY in parameters else CASE_CONCEPT_NAME 220 | return df.filter(df[case_id_glue] == case_id).rdd.map(lambda row: row.asDict()).collect() 221 | 222 | 223 | def get_kde_caseduration(df, parameters=None): 224 | """Gets the estimation of KDE density for the case durations calculated on the Spark dataframe 225 | """ 226 | cases = get_cases_description(df, parameters=parameters) 227 | duration_values = [x["caseDuration"] for x in cases.values()] 228 | 229 | return case_duration_commons.get_kde_caseduration(duration_values, parameters=parameters) 230 | 231 | 232 | def get_kde_caseduration_json(df, parameters=None): 233 | """ 234 | Gets the estimation of KDE density for the case durations calculated on the Spark dataframe 235 | (expressed as JSON) 236 | """ 237 | cases = get_cases_description(df, parameters=parameters) 238 | duration_values = [x["caseDuration"] for x in cases.values()] 239 | 
240 | return case_duration_commons.get_kde_caseduration_json(duration_values, parameters=parameters) 241 | -------------------------------------------------------------------------------- /tests/test_output_data/running-example_freq.svg: --------------------------------------------------------------------------------
[Graphviz-rendered SVG, markup omitted: directly-follows graph of the running example, nodes labelled with activity frequencies (e.g. "register request (6)", "check ticket (9)", "decide (9)", "pay compensation (3)") and edges labelled with arc frequencies.]
-------------------------------------------------------------------------------- /tests/test_output_data/running-example_perf.svg: --------------------------------------------------------------------------------
[Graphviz-rendered SVG, markup omitted: the same directly-follows graph with edges labelled by aggregate transition times (e.g. "register request -> check ticket: 10h", "decide -> pay compensation: 5D").]
--------------------------------------------------------------------------------
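Usage note. The filter modules above share one small API: each exposes apply(df, values, parameters) plus helpers such as get_start_activities, get_end_activities and get_variant_statistics, all of which reduce a per-case frame and broadcast-join it back against the full event frame. The snippet below is a minimal, illustrative sketch (not a file of this repository): it assumes a local Spark session, that the pm4pyspark.algo.filtering packages are importable as laid out above, and uses a made-up toy event log with the default pm4py column names.

from pyspark.sql import SparkSession

from pm4pyspark.algo.filtering.start_activities import start_activities_filter
from pm4pyspark.algo.filtering.end_activities import end_activities_filter
from pm4pyspark.algo.filtering.attributes import attributes_filter
from pm4pyspark.algo.filtering.variants import variants_filter

spark = SparkSession.builder.master("local[*]").appName("pm4pyspark-filters-demo").getOrCreate()

# Toy event log (illustrative data) using the default column names expected by the filters.
events = [
    ("c1", "register request", "2019-01-01 09:00:00"),
    ("c1", "check ticket", "2019-01-01 10:00:00"),
    ("c1", "decide", "2019-01-01 11:00:00"),
    ("c1", "pay compensation", "2019-01-02 09:00:00"),
    ("c2", "register request", "2019-01-02 09:00:00"),
    ("c2", "examine casually", "2019-01-02 10:00:00"),
    ("c2", "decide", "2019-01-02 11:00:00"),
    ("c2", "reject request", "2019-01-03 09:00:00"),
]
df = spark.createDataFrame(events, ["case:concept:name", "concept:name", "time:timestamp"])
df = df.withColumn("time:timestamp", df["time:timestamp"].cast("timestamp"))

# Start/end activity counts as plain dicts, e.g. {'register request': 2}.
print(start_activities_filter.get_start_activities(df))
print(end_activities_filter.get_end_activities(df))

# Keep only cases that start with "register request" and end with one of the
# given activities; passing parameters={"positive": False} would drop them instead.
filtered = start_activities_filter.apply(df, ["register request"])
filtered = end_activities_filter.apply(filtered, ["pay compensation", "reject request"],
                                       parameters={"positive": True})

# Event-level attribute filter: keep only the "decide" events.
decide_events = attributes_filter.apply_events(df, ["decide"])
print(decide_events.count())

# Variant statistics: one dict per variant with its case count, ordered by frequency.
print(variants_filter.get_variant_statistics(filtered))

spark.stop()

Because the case-level result of each filter is small (one row per case), broadcasting it keeps the join against the full event frame shuffle-free on the large side, which is the main design choice these filters rely on.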