├── tests
├── __init__.py
├── constants.py
├── input_data
│ ├── receipt.parquet
│ ├── running-example.parquet
│ ├── receipt
│ │ ├── @@partitioning=0
│ │ │ └── 9c8faa65a8bf4e3398a0143ccbf91002.parquet
│ │ ├── @@partitioning=1
│ │ │ └── 5fae4bc2bbbe4166b609bd1e4ee26e40.parquet
│ │ ├── @@partitioning=10
│ │ │ └── 2af530ddd33a4239b3e25d1c0cc1388d.parquet
│ │ ├── @@partitioning=100
│ │ │ └── 3cfd18de55814ca28210b1b1ee5f1a13.parquet
│ │ ├── @@partitioning=101
│ │ │ └── 5f83fdd1c6804aaf90be72e3e079eb84.parquet
│ │ ├── @@partitioning=102
│ │ │ └── c5d4b5fa7e2248d4b0a45c3013ec41ac.parquet
│ │ ├── @@partitioning=103
│ │ │ └── 77f1a134807c410f9969687cd8070722.parquet
│ │ ├── @@partitioning=104
│ │ │ └── 8eb240bdd2d34fc6b179ebaf568070ef.parquet
│ │ ├── @@partitioning=105
│ │ │ └── 92867c1d04884232a2c5333f3a5e5259.parquet
│ │ ├── @@partitioning=106
│ │ │ └── a3176abb78284217a893ca7a1eccbc7e.parquet
│ │ ├── @@partitioning=107
│ │ │ └── 9a5fab8daee14ccca9fa05a3c41efa13.parquet
│ │ ├── @@partitioning=108
│ │ │ └── c8cfffc751a54cb39037159b4e0059ec.parquet
│ │ ├── @@partitioning=109
│ │ │ └── 80ba8c72141a4d5dbbe4272446379ffb.parquet
│ │ ├── @@partitioning=11
│ │ │ └── 3fc4b68a53dd4442a57af05c5fd05e7e.parquet
│ │ ├── @@partitioning=110
│ │ │ └── ee1e7eea73ed408a8766600ed69b68ce.parquet
│ │ ├── @@partitioning=111
│ │ │ └── e3e73d910e234359a1fde9167e612e52.parquet
│ │ ├── @@partitioning=112
│ │ │ └── a5f765fc7a7a40e083e99dd4e508b5ff.parquet
│ │ ├── @@partitioning=113
│ │ │ └── 27f98cfc4fae432ca5da83b8dc2e7012.parquet
│ │ ├── @@partitioning=114
│ │ │ └── 5b8ea367199143e3a2813314e802902c.parquet
│ │ ├── @@partitioning=115
│ │ │ └── ffcbfd9354094dd59e049abfef584f79.parquet
│ │ ├── @@partitioning=116
│ │ │ └── 86a3dda83b37495e8cc192b1f20c6fc7.parquet
│ │ ├── @@partitioning=117
│ │ │ └── 3a638945265c49a4909b4eaf8147b5dd.parquet
│ │ ├── @@partitioning=118
│ │ │ └── ecc09fc5afa04b94b462d021c4c5eb7f.parquet
│ │ ├── @@partitioning=119
│ │ │ └── 2ae73a13137c4965a432f8caa89bf51c.parquet
│ │ ├── @@partitioning=12
│ │ │ └── d5320e0ac9a547d58c6dee3b5aaf03b8.parquet
│ │ ├── @@partitioning=120
│ │ │ └── 16c4ac329c644b1388c0010f5578dd88.parquet
│ │ ├── @@partitioning=121
│ │ │ └── 0e6ae36fc31b4bef96ea22ff3ef3d333.parquet
│ │ ├── @@partitioning=122
│ │ │ └── ae940c9cd5a3401baf7f90ac2eff8f77.parquet
│ │ ├── @@partitioning=123
│ │ │ └── c7c7d9179cc341b39ea57abe756e5c44.parquet
│ │ ├── @@partitioning=124
│ │ │ └── 85c5945eab66433e859898bea8b27870.parquet
│ │ ├── @@partitioning=125
│ │ │ └── 7b45f583130e4787bcecf21d4e2f3916.parquet
│ │ ├── @@partitioning=126
│ │ │ └── 141d2b08a1c14633a747a2b02be325b7.parquet
│ │ ├── @@partitioning=127
│ │ │ └── 70b3cb2e622641af8361a5820ed05270.parquet
│ │ ├── @@partitioning=13
│ │ │ └── 56fa658c37e64286b108c9b4f33362c9.parquet
│ │ ├── @@partitioning=14
│ │ │ └── a1ce0e3591ba441891d49702f68e4b03.parquet
│ │ ├── @@partitioning=15
│ │ │ └── a095226766ef49398fa0351dc36d7bfb.parquet
│ │ ├── @@partitioning=16
│ │ │ └── d021cca539ff4e9bbe4bdb20fc15be6b.parquet
│ │ ├── @@partitioning=17
│ │ │ └── ce3b54422119424eb9ebdd2eebd19420.parquet
│ │ ├── @@partitioning=18
│ │ │ └── 85f8b9ca6f1845cfb1d7d04c3b31a08e.parquet
│ │ ├── @@partitioning=19
│ │ │ └── 442c9f75853d41e1986407827db2b5ec.parquet
│ │ ├── @@partitioning=2
│ │ │ └── 9d33d9e15fa54b398d3d7ba90ba561ef.parquet
│ │ ├── @@partitioning=20
│ │ │ └── a54b66b9a8d348edaf522e08e981dd8d.parquet
│ │ ├── @@partitioning=21
│ │ │ └── 7962182880ed437d9b2636bb9f0c36e1.parquet
│ │ ├── @@partitioning=22
│ │ │ └── 956efb04488c42b59fb1e47b9c58981c.parquet
│ │ ├── @@partitioning=23
│ │ │ └── 16cb1b8b464d46baafa51c36d558ffa4.parquet
│ │ ├── @@partitioning=24
│ │ │ └── d6da9e22500842499eaca917854201cd.parquet
│ │ ├── @@partitioning=25
│ │ │ └── 4570d89881ed42c08a41f7359c988892.parquet
│ │ ├── @@partitioning=26
│ │ │ └── da7095c4ba924cfd870eca452c2210a7.parquet
│ │ ├── @@partitioning=27
│ │ │ └── f10fd728cf4a4dd29ca578e6f5b7d163.parquet
│ │ ├── @@partitioning=28
│ │ │ └── d70798504e6e49738414f91c2c2a0b07.parquet
│ │ ├── @@partitioning=29
│ │ │ └── d8234dd8cc4b4460989ce7cd171a393b.parquet
│ │ ├── @@partitioning=3
│ │ │ └── 367b71119b144bb889b2c8b21147ea63.parquet
│ │ ├── @@partitioning=30
│ │ │ └── 1341a73cc9e640559d9a3232d573bb97.parquet
│ │ ├── @@partitioning=31
│ │ │ └── bedebc130a3c466fba78906bf3937b9f.parquet
│ │ ├── @@partitioning=32
│ │ │ └── 7998f9ce1b7d42589721eb1276107c26.parquet
│ │ ├── @@partitioning=33
│ │ │ └── 61aeb8def350485c9df448d656f023c8.parquet
│ │ ├── @@partitioning=34
│ │ │ └── 4cb8882347514d77b9fe1026f54ca57c.parquet
│ │ ├── @@partitioning=35
│ │ │ └── 359477e5e4c54c71a7b21d09a9380ecc.parquet
│ │ ├── @@partitioning=36
│ │ │ └── 98a5add33458417fbc332815298aca12.parquet
│ │ ├── @@partitioning=37
│ │ │ └── 4cf32527217d4a38b2b6122d5f251526.parquet
│ │ ├── @@partitioning=38
│ │ │ └── 2d329b5523d0457797ee2bd0b557adf1.parquet
│ │ ├── @@partitioning=39
│ │ │ └── e764ef673bdf4060bc5f617e60bd654c.parquet
│ │ ├── @@partitioning=4
│ │ │ └── edc1e19d80444f2589370b07c056530e.parquet
│ │ ├── @@partitioning=40
│ │ │ └── e3900574306440b89b69f32404ecb276.parquet
│ │ ├── @@partitioning=41
│ │ │ └── 04ec822e7c8e4954b3d205870ff71cc1.parquet
│ │ ├── @@partitioning=42
│ │ │ └── 437565a352bc4a60b7c805a35c22d5d0.parquet
│ │ ├── @@partitioning=43
│ │ │ └── 4ff5fc069830432998ffbdf2ea686bbe.parquet
│ │ ├── @@partitioning=44
│ │ │ └── 612dd93522824aa08d85b1a2bc95f640.parquet
│ │ ├── @@partitioning=45
│ │ │ └── e15ebcf7cd704c76a76cea624964bdcc.parquet
│ │ ├── @@partitioning=46
│ │ │ └── 35bbd6866c464676a0778738d51a52dd.parquet
│ │ ├── @@partitioning=47
│ │ │ └── f1a82c702e954b358488128476df877e.parquet
│ │ ├── @@partitioning=48
│ │ │ └── 0d23fd2c7ec74cefba3d85438920fd01.parquet
│ │ ├── @@partitioning=49
│ │ │ └── 36649b37fe534496ba6fb7d695f07685.parquet
│ │ ├── @@partitioning=5
│ │ │ └── 0cf85db0014c426faf4ab95c699faf3c.parquet
│ │ ├── @@partitioning=50
│ │ │ └── a1eadd40a1bc4777be42317886824b6a.parquet
│ │ ├── @@partitioning=51
│ │ │ └── 432e9c7e1fe3413699665cc0d190b8da.parquet
│ │ ├── @@partitioning=52
│ │ │ └── 25faadf083c944738c1c6b515cb484c4.parquet
│ │ ├── @@partitioning=53
│ │ │ └── 7eecd66696b3491bb88f55945e4eda79.parquet
│ │ ├── @@partitioning=54
│ │ │ └── c0fbc5af9d8e48eca911bdae3635f628.parquet
│ │ ├── @@partitioning=55
│ │ │ └── 03e70784390148219fa843c230f6145d.parquet
│ │ ├── @@partitioning=56
│ │ │ └── 446948113fc1472d8ebe12e9109146a6.parquet
│ │ ├── @@partitioning=57
│ │ │ └── 93095d5dcd574806a3cbcad084213536.parquet
│ │ ├── @@partitioning=58
│ │ │ └── a988de5f748a430087f63b0c42f0d0bb.parquet
│ │ ├── @@partitioning=59
│ │ │ └── 79e4ff740ce6425287e4bb319de86d10.parquet
│ │ ├── @@partitioning=6
│ │ │ └── 74d11133dfe64b47b26d60a4f8fc3fcb.parquet
│ │ ├── @@partitioning=60
│ │ │ └── 64f2c5cd7f2c41ff8c41c6983770f597.parquet
│ │ ├── @@partitioning=61
│ │ │ └── 176b2d6dd5a94507a9b8a97fb34b6448.parquet
│ │ ├── @@partitioning=62
│ │ │ └── dde9034be47d49cd9c2c9cf1d445f7ec.parquet
│ │ ├── @@partitioning=63
│ │ │ └── 5021e7fb3c104231a35faf20f51876cb.parquet
│ │ ├── @@partitioning=64
│ │ │ └── 1bdfebb4143c441c9b0009ee5acc1c03.parquet
│ │ ├── @@partitioning=65
│ │ │ └── 55625bbdd0f54667a6eb7e2fd6269847.parquet
│ │ ├── @@partitioning=66
│ │ │ └── 0e076c36a7004c9fbdae493a3fa003bf.parquet
│ │ ├── @@partitioning=67
│ │ │ └── e402d5bd805a4ed5bfc266b36fb695ef.parquet
│ │ ├── @@partitioning=68
│ │ │ └── 8f4608a274924f5eb4b388c5cc83843c.parquet
│ │ ├── @@partitioning=69
│ │ │ └── ef10536765264b4a91b84cd59ebb94f4.parquet
│ │ ├── @@partitioning=7
│ │ │ └── ddd5d4826f5649098345c92168eaacba.parquet
│ │ ├── @@partitioning=70
│ │ │ └── aaa24d7d4bb34ad2a91e24529ea0b539.parquet
│ │ ├── @@partitioning=71
│ │ │ └── 2ebbfd301f334541b6c70529f862e7c5.parquet
│ │ ├── @@partitioning=72
│ │ │ └── 02385bb7a79e4b61a6583adf9d1092fb.parquet
│ │ ├── @@partitioning=73
│ │ │ └── 60c5e48107494b5493ff6b258a3c01b0.parquet
│ │ ├── @@partitioning=74
│ │ │ └── e8e8e52bdbd448aea7ee599403f47694.parquet
│ │ ├── @@partitioning=75
│ │ │ └── 14fe0948605c447b9cde78a75e5ac4f5.parquet
│ │ ├── @@partitioning=76
│ │ │ └── c55ef0ee492a46cab562a9e4e9d7e0eb.parquet
│ │ ├── @@partitioning=77
│ │ │ └── 47cbfa7312ca43b494b434510de137b9.parquet
│ │ ├── @@partitioning=78
│ │ │ └── 438c11bff4dd4ccd9fe30162973847f7.parquet
│ │ ├── @@partitioning=79
│ │ │ └── 4da8267db40f47748911336a91f3ffb3.parquet
│ │ ├── @@partitioning=8
│ │ │ └── df050f729b464424b2aaa417bc94284d.parquet
│ │ ├── @@partitioning=80
│ │ │ └── 95fb4bd665c642de848c941eee38c499.parquet
│ │ ├── @@partitioning=81
│ │ │ └── d2f54e1aeec9411f85a063ed49456c80.parquet
│ │ ├── @@partitioning=82
│ │ │ └── f876e1a583a94201b7fdb6e6a99ee936.parquet
│ │ ├── @@partitioning=83
│ │ │ └── 2e48ca0da28c4cad9840184e4dbc69a0.parquet
│ │ ├── @@partitioning=84
│ │ │ └── e5d459b80f6f4bc8a6d27cabfb18e905.parquet
│ │ ├── @@partitioning=85
│ │ │ └── d39c9243df794592a35f5defc5e9cab2.parquet
│ │ ├── @@partitioning=86
│ │ │ └── 082bbff63aa14064b95e5ab8a55360ee.parquet
│ │ ├── @@partitioning=87
│ │ │ └── 952670c1d29844d7adb1c3c829e05d7b.parquet
│ │ ├── @@partitioning=88
│ │ │ └── 99dee8f7dccb43e281ad846122fddd44.parquet
│ │ ├── @@partitioning=89
│ │ │ └── 7fbf92b524db460ab887984a6d550245.parquet
│ │ ├── @@partitioning=9
│ │ │ └── f1d8c3c34acc4534bdf98f7404470e0b.parquet
│ │ ├── @@partitioning=90
│ │ │ └── 44d3684fb5d34557b6918f643069623c.parquet
│ │ ├── @@partitioning=91
│ │ │ └── a3e8d08f9fc3415e8125d3cf85cc8996.parquet
│ │ ├── @@partitioning=92
│ │ │ └── 8295e31735b14dcba827e0153d0b1ad0.parquet
│ │ ├── @@partitioning=93
│ │ │ └── 97cae9e1281b4179ad123142e4018e9a.parquet
│ │ ├── @@partitioning=94
│ │ │ └── 30e2a0ee8d484502aca9921d6fa403f8.parquet
│ │ ├── @@partitioning=95
│ │ │ └── 640aa952cbbf40ecbc36edb57796d2cd.parquet
│ │ ├── @@partitioning=96
│ │ │ └── 8211320dc69d4e3cbd0356fdea9539f8.parquet
│ │ ├── @@partitioning=97
│ │ │ └── e4032ee22725400d879d7608c9a739a3.parquet
│ │ ├── @@partitioning=98
│ │ │ └── a88b3e98a5a14ae4a67a94715f55714b.parquet
│ │ └── @@partitioning=99
│ │ │ └── 45e2d237bad1461ebf527c82f117696a.parquet
│ └── running-example.csv
├── filtering_paths_test.py
├── filtering_cases_test.py
├── filtering_start_test.py
├── filtering_ts_test.py
├── parquet_export_test.py
├── dfg_test.py
├── filtering_end_test.py
├── csv_import_test.py
├── parquet_import_test.py
├── filtering_attr_test.py
├── filtering_var_test.py
└── test_output_data
│ ├── running-example_freq.svg
│ └── running-example_perf.svg
├── pm4pyspark
├── importer
│ ├── constants.py
│ ├── csv
│ │ ├── __init__.py
│ │ └── spark_df_imp.py
│ ├── __init__.py
│ └── parquet
│ │ ├── __init__.py
│ │ └── spark_df_imp.py
├── __init__.py
├── exporter
│ ├── __init__.py
│ └── parquet
│ │ ├── __init__.py
│ │ └── spark_df_exp.py
└── algo
│ ├── __init__.py
│ ├── discovery
│ │ ├── __init__.py
│ │ └── dfg
│ │ │ ├── __init__.py
│ │ │ ├── df_statistics.py
│ │ │ └── factory.py
│ └── filtering
│ │ ├── cases
│ │ │ ├── __init__.py
│ │ │ └── cases_filter.py
│ │ ├── paths
│ │ │ ├── __init__.py
│ │ │ └── paths_filter.py
│ │ ├── variants
│ │ │ ├── __init__.py
│ │ │ └── variants_filter.py
│ │ ├── timestamp
│ │ │ ├── __init__.py
│ │ │ └── timestamp_filter.py
│ │ ├── attributes
│ │ │ ├── __init__.py
│ │ │ └── attributes_filter.py
│ │ ├── end_activities
│ │ │ ├── __init__.py
│ │ │ └── end_activities_filter.py
│ │ ├── start_activities
│ │ │ ├── __init__.py
│ │ │ └── start_activities_filter.py
│ │ └── __init__.py
├── README.md
└── .gitignore
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | from tests import constants
2 |
--------------------------------------------------------------------------------
/pm4pyspark/importer/constants.py:
--------------------------------------------------------------------------------
1 | DEFAULT_NUM_PARTITION = 3
2 |
--------------------------------------------------------------------------------
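DEFAULT_NUM_PARTITION is presumably the fallback partition count used by the importers. As a minimal sketch (the actual wiring inside spark_df_imp.py is not reproduced in this listing), this is how such a constant is typically applied when loading one of the test CSV logs with plain PySpark:

    from pyspark.sql import SparkSession

    from pm4pyspark.importer import constants

    spark = SparkSession.builder.appName("pm4pyspark-example").getOrCreate()

    # Load a test log and spread it across the default number of partitions.
    df = (spark.read
          .option("header", True)
          .option("inferSchema", True)
          .csv("tests/input_data/running-example.csv")
          .repartition(constants.DEFAULT_NUM_PARTITION))

    print(df.rdd.getNumPartitions())  # 3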
/pm4pyspark/__init__.py:
--------------------------------------------------------------------------------
1 | from pm4pyspark import algo, importer, exporter
2 |
--------------------------------------------------------------------------------
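Since the root __init__ re-exports algo, importer and exporter, a single top-level import exposes every subpackage shown in the tree above:

    import pm4pyspark

    # The submodules are reachable directly from the root namespace.
    print(pm4pyspark.importer.csv.spark_df_imp)
    print(pm4pyspark.exporter.parquet.spark_df_exp)
    print(pm4pyspark.algo.discovery.dfg.factory)
    print(pm4pyspark.algo.filtering.timestamp.timestamp_filter)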
/pm4pyspark/exporter/__init__.py:
--------------------------------------------------------------------------------
1 | from pm4pyspark.exporter import parquet
2 |
--------------------------------------------------------------------------------
/pm4pyspark/algo/__init__.py:
--------------------------------------------------------------------------------
1 | from pm4pyspark.algo import discovery, filtering
2 |
--------------------------------------------------------------------------------
/pm4pyspark/algo/discovery/__init__.py:
--------------------------------------------------------------------------------
1 | from pm4pyspark.algo.discovery import dfg
2 |
--------------------------------------------------------------------------------
/pm4pyspark/importer/csv/__init__.py:
--------------------------------------------------------------------------------
1 | from pm4pyspark.importer.csv import spark_df_imp
2 |
--------------------------------------------------------------------------------
/pm4pyspark/importer/__init__.py:
--------------------------------------------------------------------------------
1 | from pm4pyspark.importer import csv, parquet, constants
2 |
--------------------------------------------------------------------------------
/tests/constants.py:
--------------------------------------------------------------------------------
1 | INPUT_DATA_DIR = "input_data"
2 | OUTPUT_DATA_DIR = "test_output_data"
3 |
--------------------------------------------------------------------------------
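The two constants are relative directory names; the test modules presumably join them with concrete file names, along these lines (the chosen files are just examples from input_data and test_output_data):

    import os

    from tests import constants

    log_path = os.path.join(constants.INPUT_DATA_DIR, "running-example.csv")
    svg_path = os.path.join(constants.OUTPUT_DATA_DIR, "running-example_freq.svg")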
/pm4pyspark/exporter/parquet/__init__.py:
--------------------------------------------------------------------------------
1 | from pm4pyspark.exporter.parquet import spark_df_exp
2 |
--------------------------------------------------------------------------------
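The @@partitioning=N directories under tests/input_data/receipt are a Hive-style partitioned-dataset layout (one directory per distinct value of a column named @@partitioning). Whether spark_df_exp.py produces it via Spark's partitionBy writer or some other mechanism is not shown in this listing; the sketch below only illustrates how an equivalent layout can be produced with standard PySpark, and the way the partitioning column is derived here is an assumption:

    from pyspark.sql import SparkSession
    from pyspark.sql import functions as F

    spark = SparkSession.builder.getOrCreate()
    df = spark.read.parquet("tests/input_data/receipt.parquet")

    # ASSUMPTION: the partitioning column is derived from the Spark partition id;
    # the real exporter may compute it differently.
    (df.withColumn("@@partitioning", F.spark_partition_id())
       .write.mode("overwrite")
       .partitionBy("@@partitioning")
       .parquet("tests/test_output_data/receipt"))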
/pm4pyspark/importer/parquet/__init__.py:
--------------------------------------------------------------------------------
1 | from pm4pyspark.importer.parquet import spark_df_imp
2 |
--------------------------------------------------------------------------------
/pm4pyspark/algo/filtering/cases/__init__.py:
--------------------------------------------------------------------------------
1 | from pm4pyspark.algo.filtering.cases import cases_filter
2 |
--------------------------------------------------------------------------------
/pm4pyspark/algo/filtering/paths/__init__.py:
--------------------------------------------------------------------------------
1 | from pm4pyspark.algo.filtering.paths import paths_filter
2 |
--------------------------------------------------------------------------------
/pm4pyspark/algo/discovery/dfg/__init__.py:
--------------------------------------------------------------------------------
1 | from pm4pyspark.algo.discovery.dfg import df_statistics, factory
2 |
--------------------------------------------------------------------------------
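df_statistics.py and factory.py compute directly-follows graph (DFG) statistics over a Spark DataFrame; their actual signatures are not reproduced in this listing. As a purely conceptual illustration, a directly-follows frequency count can be obtained in plain PySpark with a window over each case ordered by timestamp:

    from pyspark.sql import SparkSession, Window
    from pyspark.sql import functions as F

    spark = SparkSession.builder.getOrCreate()

    # Toy event log with pm4py-style column names.
    events = spark.createDataFrame(
        [("c1", "register", 1), ("c1", "check", 2), ("c1", "decide", 3),
         ("c2", "register", 1), ("c2", "decide", 2)],
        ["case:concept:name", "concept:name", "time:timestamp"],
    )

    # Pair every event with the next event of the same case, then count the pairs.
    w = Window.partitionBy("case:concept:name").orderBy("time:timestamp")
    dfg = (events
           .withColumn("next_activity", F.lead("concept:name").over(w))
           .where(F.col("next_activity").isNotNull())
           .groupBy("concept:name", "next_activity")
           .count())
    dfg.show()  # (register, check) -> 1, (check, decide) -> 1, (register, decide) -> 1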
/pm4pyspark/algo/filtering/variants/__init__.py:
--------------------------------------------------------------------------------
1 | from pm4pyspark.algo.filtering.variants import variants_filter
2 |
--------------------------------------------------------------------------------
/pm4pyspark/algo/filtering/timestamp/__init__.py:
--------------------------------------------------------------------------------
1 | from pm4pyspark.algo.filtering.timestamp import timestamp_filter
2 |
--------------------------------------------------------------------------------
/pm4pyspark/algo/filtering/attributes/__init__.py:
--------------------------------------------------------------------------------
1 | from pm4pyspark.algo.filtering.attributes import attributes_filter
2 |
--------------------------------------------------------------------------------
/pm4pyspark/algo/filtering/end_activities/__init__.py:
--------------------------------------------------------------------------------
1 | from pm4pyspark.algo.filtering.end_activities import end_activities_filter
2 |
--------------------------------------------------------------------------------
/tests/input_data/receipt.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt.parquet
--------------------------------------------------------------------------------
/pm4pyspark/algo/filtering/start_activities/__init__.py:
--------------------------------------------------------------------------------
1 | from pm4pyspark.algo.filtering.start_activities import start_activities_filter
2 |
--------------------------------------------------------------------------------
/tests/input_data/running-example.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/running-example.parquet
--------------------------------------------------------------------------------
/pm4pyspark/algo/filtering/__init__.py:
--------------------------------------------------------------------------------
1 | from pm4pyspark.algo.filtering import start_activities, end_activities, attributes, cases, \
2 |     variants, paths, timestamp
3 |
--------------------------------------------------------------------------------
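Each filtering subpackage wraps one family of Spark DataFrame transformations (cases, paths, variants, timestamps, attributes, start/end activities). As a conceptual sketch only (not the code of start_activities_filter.py), keeping the cases whose first event is an allowed start activity looks like this in plain PySpark:

    from pyspark.sql import SparkSession, Window
    from pyspark.sql import functions as F

    spark = SparkSession.builder.getOrCreate()
    events = spark.createDataFrame(
        [("c1", "register", 1), ("c1", "decide", 2),
         ("c2", "examine", 1), ("c2", "decide", 2)],
        ["case:concept:name", "concept:name", "time:timestamp"],
    )

    # Find the cases whose first event (by timestamp) is an allowed start activity...
    w = Window.partitionBy("case:concept:name").orderBy("time:timestamp")
    starting_cases = (events
                      .withColumn("rn", F.row_number().over(w))
                      .where((F.col("rn") == 1) & F.col("concept:name").isin("register"))
                      .select("case:concept:name"))

    # ...and keep only the events belonging to those cases.
    filtered = events.join(starting_cases, "case:concept:name")
    filtered.show()  # only case c1 survives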
/tests/input_data/receipt/@@partitioning=0/9c8faa65a8bf4e3398a0143ccbf91002.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=0/9c8faa65a8bf4e3398a0143ccbf91002.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=1/5fae4bc2bbbe4166b609bd1e4ee26e40.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=1/5fae4bc2bbbe4166b609bd1e4ee26e40.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=10/2af530ddd33a4239b3e25d1c0cc1388d.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=10/2af530ddd33a4239b3e25d1c0cc1388d.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=100/3cfd18de55814ca28210b1b1ee5f1a13.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=100/3cfd18de55814ca28210b1b1ee5f1a13.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=101/5f83fdd1c6804aaf90be72e3e079eb84.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=101/5f83fdd1c6804aaf90be72e3e079eb84.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=102/c5d4b5fa7e2248d4b0a45c3013ec41ac.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=102/c5d4b5fa7e2248d4b0a45c3013ec41ac.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=103/77f1a134807c410f9969687cd8070722.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=103/77f1a134807c410f9969687cd8070722.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=104/8eb240bdd2d34fc6b179ebaf568070ef.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=104/8eb240bdd2d34fc6b179ebaf568070ef.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=105/92867c1d04884232a2c5333f3a5e5259.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=105/92867c1d04884232a2c5333f3a5e5259.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=106/a3176abb78284217a893ca7a1eccbc7e.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=106/a3176abb78284217a893ca7a1eccbc7e.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=107/9a5fab8daee14ccca9fa05a3c41efa13.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=107/9a5fab8daee14ccca9fa05a3c41efa13.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=108/c8cfffc751a54cb39037159b4e0059ec.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=108/c8cfffc751a54cb39037159b4e0059ec.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=109/80ba8c72141a4d5dbbe4272446379ffb.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=109/80ba8c72141a4d5dbbe4272446379ffb.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=11/3fc4b68a53dd4442a57af05c5fd05e7e.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=11/3fc4b68a53dd4442a57af05c5fd05e7e.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=110/ee1e7eea73ed408a8766600ed69b68ce.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=110/ee1e7eea73ed408a8766600ed69b68ce.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=111/e3e73d910e234359a1fde9167e612e52.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=111/e3e73d910e234359a1fde9167e612e52.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=112/a5f765fc7a7a40e083e99dd4e508b5ff.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=112/a5f765fc7a7a40e083e99dd4e508b5ff.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=113/27f98cfc4fae432ca5da83b8dc2e7012.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=113/27f98cfc4fae432ca5da83b8dc2e7012.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=114/5b8ea367199143e3a2813314e802902c.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=114/5b8ea367199143e3a2813314e802902c.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=115/ffcbfd9354094dd59e049abfef584f79.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=115/ffcbfd9354094dd59e049abfef584f79.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=116/86a3dda83b37495e8cc192b1f20c6fc7.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=116/86a3dda83b37495e8cc192b1f20c6fc7.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=117/3a638945265c49a4909b4eaf8147b5dd.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=117/3a638945265c49a4909b4eaf8147b5dd.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=118/ecc09fc5afa04b94b462d021c4c5eb7f.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=118/ecc09fc5afa04b94b462d021c4c5eb7f.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=119/2ae73a13137c4965a432f8caa89bf51c.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=119/2ae73a13137c4965a432f8caa89bf51c.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=12/d5320e0ac9a547d58c6dee3b5aaf03b8.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=12/d5320e0ac9a547d58c6dee3b5aaf03b8.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=120/16c4ac329c644b1388c0010f5578dd88.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=120/16c4ac329c644b1388c0010f5578dd88.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=121/0e6ae36fc31b4bef96ea22ff3ef3d333.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=121/0e6ae36fc31b4bef96ea22ff3ef3d333.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=122/ae940c9cd5a3401baf7f90ac2eff8f77.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=122/ae940c9cd5a3401baf7f90ac2eff8f77.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=123/c7c7d9179cc341b39ea57abe756e5c44.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=123/c7c7d9179cc341b39ea57abe756e5c44.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=124/85c5945eab66433e859898bea8b27870.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=124/85c5945eab66433e859898bea8b27870.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=125/7b45f583130e4787bcecf21d4e2f3916.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=125/7b45f583130e4787bcecf21d4e2f3916.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=126/141d2b08a1c14633a747a2b02be325b7.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=126/141d2b08a1c14633a747a2b02be325b7.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=127/70b3cb2e622641af8361a5820ed05270.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=127/70b3cb2e622641af8361a5820ed05270.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=13/56fa658c37e64286b108c9b4f33362c9.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=13/56fa658c37e64286b108c9b4f33362c9.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=14/a1ce0e3591ba441891d49702f68e4b03.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=14/a1ce0e3591ba441891d49702f68e4b03.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=15/a095226766ef49398fa0351dc36d7bfb.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=15/a095226766ef49398fa0351dc36d7bfb.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=16/d021cca539ff4e9bbe4bdb20fc15be6b.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=16/d021cca539ff4e9bbe4bdb20fc15be6b.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=17/ce3b54422119424eb9ebdd2eebd19420.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=17/ce3b54422119424eb9ebdd2eebd19420.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=18/85f8b9ca6f1845cfb1d7d04c3b31a08e.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=18/85f8b9ca6f1845cfb1d7d04c3b31a08e.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=19/442c9f75853d41e1986407827db2b5ec.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=19/442c9f75853d41e1986407827db2b5ec.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=2/9d33d9e15fa54b398d3d7ba90ba561ef.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=2/9d33d9e15fa54b398d3d7ba90ba561ef.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=20/a54b66b9a8d348edaf522e08e981dd8d.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=20/a54b66b9a8d348edaf522e08e981dd8d.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=21/7962182880ed437d9b2636bb9f0c36e1.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=21/7962182880ed437d9b2636bb9f0c36e1.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=22/956efb04488c42b59fb1e47b9c58981c.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=22/956efb04488c42b59fb1e47b9c58981c.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=23/16cb1b8b464d46baafa51c36d558ffa4.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=23/16cb1b8b464d46baafa51c36d558ffa4.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=24/d6da9e22500842499eaca917854201cd.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=24/d6da9e22500842499eaca917854201cd.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=25/4570d89881ed42c08a41f7359c988892.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=25/4570d89881ed42c08a41f7359c988892.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=26/da7095c4ba924cfd870eca452c2210a7.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=26/da7095c4ba924cfd870eca452c2210a7.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=27/f10fd728cf4a4dd29ca578e6f5b7d163.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=27/f10fd728cf4a4dd29ca578e6f5b7d163.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=28/d70798504e6e49738414f91c2c2a0b07.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=28/d70798504e6e49738414f91c2c2a0b07.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=29/d8234dd8cc4b4460989ce7cd171a393b.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=29/d8234dd8cc4b4460989ce7cd171a393b.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=3/367b71119b144bb889b2c8b21147ea63.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=3/367b71119b144bb889b2c8b21147ea63.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=30/1341a73cc9e640559d9a3232d573bb97.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=30/1341a73cc9e640559d9a3232d573bb97.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=31/bedebc130a3c466fba78906bf3937b9f.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=31/bedebc130a3c466fba78906bf3937b9f.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=32/7998f9ce1b7d42589721eb1276107c26.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=32/7998f9ce1b7d42589721eb1276107c26.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=33/61aeb8def350485c9df448d656f023c8.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=33/61aeb8def350485c9df448d656f023c8.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=34/4cb8882347514d77b9fe1026f54ca57c.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=34/4cb8882347514d77b9fe1026f54ca57c.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=35/359477e5e4c54c71a7b21d09a9380ecc.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=35/359477e5e4c54c71a7b21d09a9380ecc.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=36/98a5add33458417fbc332815298aca12.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=36/98a5add33458417fbc332815298aca12.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=37/4cf32527217d4a38b2b6122d5f251526.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=37/4cf32527217d4a38b2b6122d5f251526.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=38/2d329b5523d0457797ee2bd0b557adf1.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=38/2d329b5523d0457797ee2bd0b557adf1.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=39/e764ef673bdf4060bc5f617e60bd654c.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=39/e764ef673bdf4060bc5f617e60bd654c.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=4/edc1e19d80444f2589370b07c056530e.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=4/edc1e19d80444f2589370b07c056530e.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=40/e3900574306440b89b69f32404ecb276.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=40/e3900574306440b89b69f32404ecb276.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=41/04ec822e7c8e4954b3d205870ff71cc1.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=41/04ec822e7c8e4954b3d205870ff71cc1.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=42/437565a352bc4a60b7c805a35c22d5d0.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=42/437565a352bc4a60b7c805a35c22d5d0.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=43/4ff5fc069830432998ffbdf2ea686bbe.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=43/4ff5fc069830432998ffbdf2ea686bbe.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=44/612dd93522824aa08d85b1a2bc95f640.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=44/612dd93522824aa08d85b1a2bc95f640.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=45/e15ebcf7cd704c76a76cea624964bdcc.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=45/e15ebcf7cd704c76a76cea624964bdcc.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=46/35bbd6866c464676a0778738d51a52dd.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=46/35bbd6866c464676a0778738d51a52dd.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=47/f1a82c702e954b358488128476df877e.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=47/f1a82c702e954b358488128476df877e.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=48/0d23fd2c7ec74cefba3d85438920fd01.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=48/0d23fd2c7ec74cefba3d85438920fd01.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=49/36649b37fe534496ba6fb7d695f07685.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=49/36649b37fe534496ba6fb7d695f07685.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=5/0cf85db0014c426faf4ab95c699faf3c.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=5/0cf85db0014c426faf4ab95c699faf3c.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=50/a1eadd40a1bc4777be42317886824b6a.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=50/a1eadd40a1bc4777be42317886824b6a.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=51/432e9c7e1fe3413699665cc0d190b8da.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=51/432e9c7e1fe3413699665cc0d190b8da.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=52/25faadf083c944738c1c6b515cb484c4.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=52/25faadf083c944738c1c6b515cb484c4.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=53/7eecd66696b3491bb88f55945e4eda79.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=53/7eecd66696b3491bb88f55945e4eda79.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=54/c0fbc5af9d8e48eca911bdae3635f628.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=54/c0fbc5af9d8e48eca911bdae3635f628.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=55/03e70784390148219fa843c230f6145d.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=55/03e70784390148219fa843c230f6145d.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=56/446948113fc1472d8ebe12e9109146a6.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=56/446948113fc1472d8ebe12e9109146a6.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=57/93095d5dcd574806a3cbcad084213536.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=57/93095d5dcd574806a3cbcad084213536.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=58/a988de5f748a430087f63b0c42f0d0bb.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=58/a988de5f748a430087f63b0c42f0d0bb.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=59/79e4ff740ce6425287e4bb319de86d10.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=59/79e4ff740ce6425287e4bb319de86d10.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=6/74d11133dfe64b47b26d60a4f8fc3fcb.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=6/74d11133dfe64b47b26d60a4f8fc3fcb.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=60/64f2c5cd7f2c41ff8c41c6983770f597.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=60/64f2c5cd7f2c41ff8c41c6983770f597.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=61/176b2d6dd5a94507a9b8a97fb34b6448.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=61/176b2d6dd5a94507a9b8a97fb34b6448.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=62/dde9034be47d49cd9c2c9cf1d445f7ec.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=62/dde9034be47d49cd9c2c9cf1d445f7ec.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=63/5021e7fb3c104231a35faf20f51876cb.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=63/5021e7fb3c104231a35faf20f51876cb.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=64/1bdfebb4143c441c9b0009ee5acc1c03.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=64/1bdfebb4143c441c9b0009ee5acc1c03.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=65/55625bbdd0f54667a6eb7e2fd6269847.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=65/55625bbdd0f54667a6eb7e2fd6269847.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=66/0e076c36a7004c9fbdae493a3fa003bf.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=66/0e076c36a7004c9fbdae493a3fa003bf.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=67/e402d5bd805a4ed5bfc266b36fb695ef.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=67/e402d5bd805a4ed5bfc266b36fb695ef.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=68/8f4608a274924f5eb4b388c5cc83843c.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=68/8f4608a274924f5eb4b388c5cc83843c.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=69/ef10536765264b4a91b84cd59ebb94f4.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=69/ef10536765264b4a91b84cd59ebb94f4.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=7/ddd5d4826f5649098345c92168eaacba.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=7/ddd5d4826f5649098345c92168eaacba.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=70/aaa24d7d4bb34ad2a91e24529ea0b539.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=70/aaa24d7d4bb34ad2a91e24529ea0b539.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=71/2ebbfd301f334541b6c70529f862e7c5.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=71/2ebbfd301f334541b6c70529f862e7c5.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=72/02385bb7a79e4b61a6583adf9d1092fb.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=72/02385bb7a79e4b61a6583adf9d1092fb.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=73/60c5e48107494b5493ff6b258a3c01b0.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=73/60c5e48107494b5493ff6b258a3c01b0.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=74/e8e8e52bdbd448aea7ee599403f47694.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=74/e8e8e52bdbd448aea7ee599403f47694.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=75/14fe0948605c447b9cde78a75e5ac4f5.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=75/14fe0948605c447b9cde78a75e5ac4f5.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=76/c55ef0ee492a46cab562a9e4e9d7e0eb.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=76/c55ef0ee492a46cab562a9e4e9d7e0eb.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=77/47cbfa7312ca43b494b434510de137b9.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=77/47cbfa7312ca43b494b434510de137b9.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=78/438c11bff4dd4ccd9fe30162973847f7.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=78/438c11bff4dd4ccd9fe30162973847f7.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=79/4da8267db40f47748911336a91f3ffb3.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=79/4da8267db40f47748911336a91f3ffb3.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=8/df050f729b464424b2aaa417bc94284d.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=8/df050f729b464424b2aaa417bc94284d.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=80/95fb4bd665c642de848c941eee38c499.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=80/95fb4bd665c642de848c941eee38c499.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=81/d2f54e1aeec9411f85a063ed49456c80.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=81/d2f54e1aeec9411f85a063ed49456c80.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=82/f876e1a583a94201b7fdb6e6a99ee936.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=82/f876e1a583a94201b7fdb6e6a99ee936.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=83/2e48ca0da28c4cad9840184e4dbc69a0.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=83/2e48ca0da28c4cad9840184e4dbc69a0.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=84/e5d459b80f6f4bc8a6d27cabfb18e905.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=84/e5d459b80f6f4bc8a6d27cabfb18e905.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=85/d39c9243df794592a35f5defc5e9cab2.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=85/d39c9243df794592a35f5defc5e9cab2.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=86/082bbff63aa14064b95e5ab8a55360ee.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=86/082bbff63aa14064b95e5ab8a55360ee.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=87/952670c1d29844d7adb1c3c829e05d7b.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=87/952670c1d29844d7adb1c3c829e05d7b.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=88/99dee8f7dccb43e281ad846122fddd44.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=88/99dee8f7dccb43e281ad846122fddd44.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=89/7fbf92b524db460ab887984a6d550245.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=89/7fbf92b524db460ab887984a6d550245.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=9/f1d8c3c34acc4534bdf98f7404470e0b.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=9/f1d8c3c34acc4534bdf98f7404470e0b.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=90/44d3684fb5d34557b6918f643069623c.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=90/44d3684fb5d34557b6918f643069623c.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=91/a3e8d08f9fc3415e8125d3cf85cc8996.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=91/a3e8d08f9fc3415e8125d3cf85cc8996.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=92/8295e31735b14dcba827e0153d0b1ad0.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=92/8295e31735b14dcba827e0153d0b1ad0.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=93/97cae9e1281b4179ad123142e4018e9a.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=93/97cae9e1281b4179ad123142e4018e9a.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=94/30e2a0ee8d484502aca9921d6fa403f8.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=94/30e2a0ee8d484502aca9921d6fa403f8.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=95/640aa952cbbf40ecbc36edb57796d2cd.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=95/640aa952cbbf40ecbc36edb57796d2cd.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=96/8211320dc69d4e3cbd0356fdea9539f8.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=96/8211320dc69d4e3cbd0356fdea9539f8.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=97/e4032ee22725400d879d7608c9a739a3.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=97/e4032ee22725400d879d7608c9a739a3.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=98/a88b3e98a5a14ae4a67a94715f55714b.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=98/a88b3e98a5a14ae4a67a94715f55714b.parquet
--------------------------------------------------------------------------------
/tests/input_data/receipt/@@partitioning=99/45e2d237bad1461ebf527c82f117696a.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jihoon-yang/pm4pyspark-source/HEAD/tests/input_data/receipt/@@partitioning=99/45e2d237bad1461ebf527c82f117696a.parquet
--------------------------------------------------------------------------------
/tests/filtering_paths_test.py:
--------------------------------------------------------------------------------
1 | import os
2 | from tests.constants import INPUT_DATA_DIR, OUTPUT_DATA_DIR
3 | from pm4pyspark.importer.csv import spark_df_imp as importer
4 |
5 | from pm4pyspark.algo.filtering.paths import paths_filter
6 |
7 |
8 | df_ex = importer.import_sparkdf_from_path(os.path.join(INPUT_DATA_DIR, "running-example.csv"), header=True, inferSchema=True)
9 | filtered_df = paths_filter.apply(df_ex, [('check ticket', 'decide')])
10 | filtered_df.show(filtered_df.count(), truncate=False)
11 |
--------------------------------------------------------------------------------
/pm4pyspark/exporter/parquet/spark_df_exp.py:
--------------------------------------------------------------------------------
1 | import pyspark.sql.functions as F
2 |
3 | from pyspark.sql import SparkSession
4 | from pyspark.sql.types import *
5 |
6 |
7 |
8 |
 9 | def export_sparkdf(df, path, case_id_key="case:concept:name", mode=None, partitionBy="@@partitioning", compression=None, num_partitions=128):
10 |     """Exports the Spark dataframe as a Parquet dataset, partitioned by a hash of the case id
11 |     """
12 |
13 |     # Assign each case id to one of `num_partitions` buckets
14 |     get_hash = F.udf(lambda x: abs(hash(x)) % num_partitions, LongType())
15 |
16 |     df = df.withColumn(partitionBy, get_hash(case_id_key))
17 |     # ':' in column names is replaced by 'AAA' before writing; the Parquet importer restores it
18 |     for c in df.columns:
19 |         df = df.withColumnRenamed(c, c.replace(':', 'AAA'))
20 |
21 |     df.write.parquet(path, mode=mode, partitionBy=partitionBy, compression=compression)
22 |
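23 | # Usage sketch (illustrative; mirrors tests/parquet_export_test.py — the output path here is an assumption):
24 | #
25 | #   from pm4pyspark.importer.parquet import spark_df_imp as parquet_importer
26 | #   from pm4pyspark.exporter.parquet import spark_df_exp as parquet_exporter
27 | #
28 | #   df = parquet_importer.import_sparkdf_from_path("tests/input_data/receipt")
29 | #   parquet_exporter.export_sparkdf(df, "output_data/receipt64", num_partitions=64, mode="overwrite")
30 | #
31 | # This writes one "@@partitioning=<bucket>" sub-directory per non-empty bucket under the output
32 | # path, matching the layout of tests/input_data/receipt shown above.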
--------------------------------------------------------------------------------
/tests/filtering_cases_test.py:
--------------------------------------------------------------------------------
1 | import os
2 | from tests.constants import INPUT_DATA_DIR, OUTPUT_DATA_DIR
3 | from pm4pyspark.importer.csv import spark_df_imp as importer
4 |
5 | from pm4pyspark.algo.filtering.cases import cases_filter
6 |
7 |
8 | spark_df = importer.import_sparkdf_from_path(os.path.join(INPUT_DATA_DIR, "running-example.csv"), header=True, inferSchema=True)
9 | spark_df.cache()
10 |
11 | cases_filter.filter_on_ncases(spark_df, max_no_cases=3).show()
12 |
13 | case_size_df = cases_filter.filter_on_case_size(spark_df, min_case_size=9, max_case_size=9)
14 |
15 | perf_df = cases_filter.filter_on_case_performance(spark_df, max_case_performance=800000)
16 |
17 | spark_df.unpersist()
18 |
--------------------------------------------------------------------------------
/tests/filtering_start_test.py:
--------------------------------------------------------------------------------
1 | import os
2 | from tests.constants import INPUT_DATA_DIR
3 | from pm4pyspark.importer.csv import spark_df_imp as importer
4 |
5 | from pm4pyspark.algo.filtering.start_activities import start_activities_filter
6 |
7 |
8 | spark_df = importer.import_sparkdf_from_path(os.path.join(INPUT_DATA_DIR, "running-example.csv"), header=True, inferSchema=True)
9 | spark_df.cache()
10 |
11 | print(start_activities_filter.get_start_activities(spark_df))
12 |
13 | filtered_df = start_activities_filter.filter_df_on_start_activities(spark_df, ["check ticket"], grouped_df=spark_df.groupby("org:resource"))
14 |
15 | filtered_df_nocc = start_activities_filter.filter_df_on_start_activities_nocc(spark_df, 6)
16 |
17 | applied_auto_df = start_activities_filter.apply_auto_filter(spark_df)
18 |
19 | spark_df.unpersist()
20 |
--------------------------------------------------------------------------------
/tests/filtering_ts_test.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 | import pandas as pd
4 | from tests.constants import INPUT_DATA_DIR
5 | from pm4pyspark.importer.csv import spark_df_imp as importer
6 | from datetime import datetime
7 |
8 | from pm4pyspark.algo.filtering.timestamp import timestamp_filter
9 |
10 | spark_df = importer.import_sparkdf_from_path(os.path.join(INPUT_DATA_DIR, "receipt.csv"), header=True)
11 | spark_df.cache()
12 |
13 | df_timest_contained = timestamp_filter.filter_traces_contained(spark_df, "2011-03-09 00:00:00", "2012-01-18 23:59:59")
14 | print(df_timest_contained.count())
15 | print(df_timest_contained.groupby("case:concept:name").count().count())
16 |
17 | df_timest_intersecting = timestamp_filter.filter_traces_intersecting(spark_df, "2011-03-09 00:00:00", "2012-01-18 23:59:59")
18 | df_timest_intersecting.show()
19 |
20 | spark_df.unpersist()
21 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Big Data Process Mining in Python
Integration of Spark in PM4Py for Preprocessing Event Data and Discovering Process Models
2 |
3 |
 4 | [PM4Py](https://github.com/pm4py) is the process mining library in Python; it aims at seamless integration with any kind of database and technology.
5 |
 6 | **PM4PySpark** is the integration of [*Apache Spark*](https://spark.apache.org) into PM4Py. These Big Data connectors for PM4Py focus on embracing the big data world and handling huge amounts of event data, with a particular focus on the Spark ecosystem (see the example below):
7 |
8 | - Loading CSV files into Apache Spark
9 | - Loading and writing Parquet files into Apache Spark
10 | - Calculating the Directly-Follows Graph (DFG) efficiently on top of Apache Spark DataFrames
11 | - Managing filtering operations (timeframe, attributes, start/end activities, paths, variants, cases) on top of Apache Spark
12 |
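13 | ## Example
14 |
15 | A minimal usage sketch, based on the scripts in `tests/` (the CSV path and the activity name are illustrative):
16 |
17 | ```python
18 | from pm4pyspark.importer.csv import spark_df_imp as csv_importer
19 | from pm4pyspark.algo.filtering.start_activities import start_activities_filter
20 | from pm4pyspark.algo.discovery.dfg import factory as dfg_factory
21 |
22 | # Load a CSV event log into a Spark DataFrame (timestamps are converted to UTC)
23 | df = csv_importer.import_sparkdf_from_path("tests/input_data/running-example.csv",
24 |                                            header=True, inferSchema=True)
25 |
26 | # Keep only the traces that start with "register request", then discover a frequency DFG
27 | filtered_df = start_activities_filter.apply(df, ["register request"])
28 | dfg_frequency = dfg_factory.apply(filtered_df)
29 | print(dfg_frequency)
30 | ```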
--------------------------------------------------------------------------------
/tests/parquet_export_test.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 |
4 | from tests.constants import INPUT_DATA_DIR, OUTPUT_DATA_DIR
5 | from pm4pyspark.importer.parquet import spark_df_imp as parquet_importer
6 | from pm4pyspark.exporter.parquet import spark_df_exp as parquet_exporter
7 |
8 |
9 |
10 |
11 | dir_path = os.path.join(INPUT_DATA_DIR, "receipt")
12 | spark_df_dir = parquet_importer.import_sparkdf_from_path(dir_path)
13 | print(spark_df_dir.count())
14 |
15 | out_path = os.path.join(OUTPUT_DATA_DIR, "receipt128")
16 | parquet_exporter.export_sparkdf(spark_df_dir, out_path, mode="overwrite")
17 |
18 | out_path2 = os.path.join(OUTPUT_DATA_DIR, "receipt64")
19 | parquet_exporter.export_sparkdf(spark_df_dir, out_path2, num_partitions=64, mode="overwrite")
20 |
21 |
22 | test_df_128 = parquet_importer.import_sparkdf_from_path(out_path)
23 | test_df_64 = parquet_importer.import_sparkdf_from_path(out_path2)
24 | print(test_df_128.count())
25 | print(test_df_64.count())
26 |
--------------------------------------------------------------------------------
/tests/dfg_test.py:
--------------------------------------------------------------------------------
1 | import os
2 | from tests.constants import INPUT_DATA_DIR, OUTPUT_DATA_DIR
3 | from pm4py.visualization.dfg import factory as dfg_vis_factory
4 | from pm4pyspark.importer.csv import spark_df_imp as importer
5 | from pm4pyspark.algo.discovery.dfg import factory as dfg_factory
6 |
7 |
8 | parameters = {"format":"svg"}
9 |
10 |
11 | event_stream_ex = importer.import_event_stream(os.path.join(INPUT_DATA_DIR, "running-example.csv"), parameters={"header": True})
12 | log_ex = importer.transform_event_stream_to_event_log(event_stream_ex)
13 | df_ex = importer.import_sparkdf_from_path(os.path.join(INPUT_DATA_DIR, "running-example.csv"), header=True, inferSchema=True)
14 |
15 | dfg_freq = dfg_factory.apply(df_ex)
16 | gviz_freq = dfg_vis_factory.apply(dfg_freq, log=log_ex, parameters=parameters, variant="frequency")
17 | dfg_vis_factory.save(gviz_freq, os.path.join(OUTPUT_DATA_DIR, "running-example_freq.svg"))
18 |
19 | dfg_perf = dfg_factory.apply(df_ex, variant="performance")
20 | gviz_perf = dfg_vis_factory.apply(dfg_perf, log=log_ex, parameters=parameters, variant="performance")
21 | dfg_vis_factory.save(gviz_perf, os.path.join(OUTPUT_DATA_DIR, "running-example_perf.svg"))
22 |
--------------------------------------------------------------------------------
/tests/filtering_end_test.py:
--------------------------------------------------------------------------------
1 | import os
2 | from tests.constants import INPUT_DATA_DIR
3 | from pm4pyspark.importer.csv import spark_df_imp as importer
4 |
5 | from pm4pyspark.algo.filtering.end_activities import end_activities_filter
6 |
7 |
8 | spark_df = importer.import_sparkdf_from_path(os.path.join(INPUT_DATA_DIR, "running-example.csv"), header=True, inferSchema=True)
9 | spark_df.cache()
10 |
11 | end_ac = end_activities_filter.get_end_activities(spark_df)
12 | filtered_df = end_activities_filter.filter_df_on_end_activities(spark_df, {'T07-5 Draft intern advice aspect 5'})
13 | filtered_df_apply = end_activities_filter.apply(spark_df, ["T05 Print and send confirmation of receipt", "T10 Determine necessity to stop indication"])
14 | print(filtered_df_apply.count())
15 | print(filtered_df_apply.groupby("case:concept:name").count().count())
16 |
17 | filtered_df_nocc, rdict = end_activities_filter.filter_df_on_end_activities_nocc(spark_df, 400, return_dict=True)
18 | filtered_df_nocc.show(filtered_df_nocc.count())
19 | print(filtered_df_nocc.count())
20 | print(rdict)
21 |
22 | filtered_auto_filter = end_activities_filter.apply_auto_filter(spark_df)
23 | print(filtered_auto_filter.count())
24 | spark_df.unpersist()
25 |
--------------------------------------------------------------------------------
/tests/csv_import_test.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 |
4 | from tests.constants import INPUT_DATA_DIR
5 | from pm4pyspark.importer.csv import spark_df_imp as csv_importer
6 |
7 | file_path = os.path.join(INPUT_DATA_DIR, "running-example.csv")
8 | file_path2 = os.path.join(INPUT_DATA_DIR, "receipt.csv")
9 |
10 |
11 | spark_df_wo_timeconversion = csv_importer.import_sparkdf_from_path_wo_timeconversion(file_path, header=True)
12 | spark_df = csv_importer.import_sparkdf_from_path(file_path, header=True, inferSchema=True)
13 | spark_df_sorted = csv_importer.import_sparkdf_from_path(file_path, header=True, sort=True)
14 |
15 | spark_df_wo_timeconversion1 = csv_importer.import_sparkdf_from_path_wo_timeconversion(file_path2, header=True)
16 | spark_df1 = csv_importer.import_sparkdf_from_path(file_path2, header=True)
17 | spark_df_sorted1 = csv_importer.import_sparkdf_from_path(file_path2, header=True, sort=True)
18 |
19 | spark_df_wo_timeconversion.show(truncate=False)
20 | spark_df.show()
21 | spark_df_sorted.show()
22 |
23 | spark_df_wo_timeconversion1.show(truncate=False)
24 | spark_df1.show()
25 | spark_df_sorted1.show()
26 |
27 | event_stream = csv_importer.import_event_stream(file_path, header=True)
28 | log = csv_importer.transform_event_stream_to_event_log(event_stream)
29 | print(log)
30 |
--------------------------------------------------------------------------------
/tests/parquet_import_test.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 |
4 | from tests.constants import INPUT_DATA_DIR
5 | from pm4pyspark.importer.parquet import spark_df_imp as parquet_importer
6 |
7 |
8 |
9 |
10 | file_path = os.path.join(INPUT_DATA_DIR, "running-example.parquet")
11 | file_path2 = os.path.join(INPUT_DATA_DIR, "receipt.parquet")
12 | dir_path = os.path.join(INPUT_DATA_DIR, "receipt")
13 |
14 |
15 | spark_df = parquet_importer.import_sparkdf_from_path(file_path)
16 | spark_df_sorted = parquet_importer.import_sparkdf_from_path(file_path, sort=True)
17 |
18 | spark_df1 = parquet_importer.import_sparkdf_from_path(file_path2)
19 | spark_df_sorted1 = parquet_importer.import_sparkdf_from_path(file_path2, sort=True)
20 |
21 | spark_df_dir = parquet_importer.import_sparkdf_from_path(dir_path)
22 | spark_df_dir_sorted = parquet_importer.import_sparkdf_from_path(dir_path, sort=True)
23 |
24 | spark_df.show()
25 | spark_df_sorted.show()
26 |
27 | spark_df1.show()
28 | spark_df_sorted1.show()
29 |
30 | spark_df_dir.show()
31 | spark_df_dir_sorted.show()
32 |
33 | print(spark_df_dir.count())
34 | print(spark_df_dir_sorted.count())
35 |
36 | event_stream = parquet_importer.import_event_stream(file_path)
37 | log = parquet_importer.transform_event_stream_to_event_log(event_stream)
38 | print(log)
39 |
--------------------------------------------------------------------------------
/tests/filtering_attr_test.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 | from tests.constants import INPUT_DATA_DIR
4 | from pm4pyspark.importer.csv import spark_df_imp as importer
5 |
6 | from pm4pyspark.algo.filtering.attributes import attributes_filter
7 |
8 | spark_df = importer.import_sparkdf_from_path(os.path.join(INPUT_DATA_DIR, "running-example.csv"), header=True, inferSchema=True)
9 | spark_df.cache()
10 |
11 | activities = attributes_filter.get_attribute_values(spark_df, attribute_key="concept:name")
12 | resources = attributes_filter.get_attribute_values(spark_df, attribute_key="org:resource")
13 |
14 | filtered_df = attributes_filter.filter_df_on_attribute_values(spark_df, {'examine casually'}, positive=False)
15 | filtered_df.show(filtered_df.count())
16 |
17 | filtered_num_df = attributes_filter.apply_numeric_events(spark_df, 50, 100, parameters={"pm4py:param:attribute_key":"Costs"})
18 |
19 | filtered_num_tr_df2 = attributes_filter.apply_numeric(spark_df, 0, 7000, parameters={"pm4py:param:attribute_key":"_c0"})
20 |
21 | filtered_event_df = attributes_filter.apply_events(spark_df, values={"examine casually"})
22 |
23 | filtered_thresh_df = attributes_filter.filter_df_keeping_activ_exc_thresh(spark_df, 7, most_common_variant={"reject request"})
24 |
25 | filtered_top_5_act_df = attributes_filter.filter_df_keeping_spno_activities(spark_df, max_no_activities=5)
26 |
27 | print(attributes_filter.get_kde_date_attribute(spark_df))
28 |
29 | spark_df.unpersist()
30 |
--------------------------------------------------------------------------------
/tests/filtering_var_test.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 | from tests.constants import INPUT_DATA_DIR
4 | from pm4pyspark.importer.csv import spark_df_imp as importer
5 |
6 | from pm4pyspark.algo.filtering.variants import variants_filter
7 |
8 | spark_df = importer.import_sparkdf_from_path(os.path.join(INPUT_DATA_DIR, "receipt.csv"), header=True)
9 | spark_df.cache()
10 |
11 | variants_df = variants_filter.get_variants_df(spark_df)
12 | variants_df.show(variants_df.count())
13 |
14 | print(variants_filter.get_variant_statistics(spark_df, parameters={'max_variants_to_return': 3}))
15 | ddf, dlist = variants_filter.get_variants_df_and_list(spark_df)
16 | print(dlist)
17 |
18 | start_time = time.time()
19 | variants_df2 = variants_filter.get_variants_df_with_case_duration(spark_df)
20 | variants_df2.show()
21 |
22 | event_with_caseid1 = variants_filter.get_events(spark_df, 1)
23 | print(event_with_caseid1)
24 | stat_with_duration = variants_filter.get_variant_statistics_with_case_duration(spark_df)
25 | print(stat_with_duration)
26 |
27 | case_description = variants_filter.get_cases_description(spark_df)
28 | print(case_description)
29 |
30 | applied_df = variants_filter.apply(spark_df, ["Confirmation of receipt,T02 Check confirmation of receipt,T04 Determine confirmation of receipt,T05 Print and send confirmation of receipt,T06 Determine necessity of stop advice,T10 Determine necessity to stop indication"])
31 | variants_count_applied_df = variants_filter.get_variant_statistics(applied_df)
32 | print(variants_count_applied_df)
33 |
34 | auto_applied_df = variants_filter.apply_auto_filter(spark_df)
35 | print(auto_applied_df.count())
36 |
37 | spark_df.unpersist()
38 |
--------------------------------------------------------------------------------
/pm4pyspark/algo/filtering/paths/paths_filter.py:
--------------------------------------------------------------------------------
1 | import pyspark.sql.functions as F
2 |
3 | from pm4py.algo.filtering.common.filtering_constants import CASE_CONCEPT_NAME
4 | from pm4py.objects.log.util.xes import DEFAULT_NAME_KEY, DEFAULT_TIMESTAMP_KEY
5 | from pm4py.util.constants import PARAMETER_CONSTANT_TIMESTAMP_KEY
6 | from pm4py.util.constants import PARAMETER_CONSTANT_ATTRIBUTE_KEY
7 | from pm4py.util.constants import PARAMETER_CONSTANT_CASEID_KEY
8 | from pyspark.sql.window import Window
9 |
10 |
11 |
12 |
13 | def apply(df, paths, parameters=None):
14 | """Applies a filter on traces containing / not containing a path
15 | """
16 |
17 | if parameters is None:
18 | parameters = {}
19 | paths = [path[0] + "," + path[1] for path in paths]
20 |
21 | timestamp_key = parameters[
22 | PARAMETER_CONSTANT_TIMESTAMP_KEY] if PARAMETER_CONSTANT_TIMESTAMP_KEY in parameters else DEFAULT_TIMESTAMP_KEY
23 | case_id_glue = parameters[
24 | PARAMETER_CONSTANT_CASEID_KEY] if PARAMETER_CONSTANT_CASEID_KEY in parameters else CASE_CONCEPT_NAME
25 | attribute_key = parameters[
26 | PARAMETER_CONSTANT_ATTRIBUTE_KEY] if PARAMETER_CONSTANT_ATTRIBUTE_KEY in parameters else DEFAULT_NAME_KEY
27 | positive = parameters["positive"] if "positive" in parameters else True
28 |
29 | df_reduced = df.select(case_id_glue, attribute_key)
30 |
31 | w = Window().partitionBy(df_reduced[case_id_glue]).orderBy(df_reduced[case_id_glue])
32 | df_reduced_shift = df_reduced.withColumn(case_id_glue + "_1", F.lag(case_id_glue, -1, 'NaN').over(w))
33 | df_reduced_shift = df_reduced_shift.withColumn(attribute_key + "_1", F.lag(attribute_key, -1, 'NaN').over(w))
34 | stacked_df = df_reduced_shift.withColumn("@@path", F.concat(df_reduced_shift[attribute_key], F.lit(","), df_reduced_shift[attribute_key + "_1"]))
35 | stacked_df = stacked_df.filter(stacked_df["@@path"].isin(paths)).select(case_id_glue)
36 |
37 | if positive:
38 | return df.join(F.broadcast(stacked_df), case_id_glue)
39 | else:
40 | return df.join(F.broadcast(stacked_df), case_id_glue, 'leftanti')
41 |
42 |
43 | def apply_auto_filter(df, parameters=None):
44 | del df
45 | del parameters
46 | raise Exception("apply_auto_filter method not available for paths filter on dataframe")
47 |
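48 | # Implementation note (sketch): F.lag with a negative offset acts as a lead, pairing each event
49 | # with the next event of the same case; the pair is serialized as "activity,next_activity" in the
50 | # "@@path" column, and the case ids of the matching pairs are broadcast-joined back onto the full
51 | # dataframe (an inner join keeps those cases, 'leftanti' removes them when positive=False).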
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | .idea/
4 | *.py[cod]
5 | *$py.class
6 |
7 | # C extensions
8 | *.so
9 |
10 | # Distribution / packaging
11 | .Python
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | wheels/
24 | pip-wheel-metadata/
25 | share/python-wheels/
26 | *.egg-info/
27 | .installed.cfg
28 | *.egg
29 | MANIFEST
30 |
31 | # PyInstaller
32 | # Usually these files are written by a python script from a template
33 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
34 | *.manifest
35 | *.spec
36 |
37 | # Installer logs
38 | pip-log.txt
39 | pip-delete-this-directory.txt
40 |
41 | # Unit test / coverage reports
42 | htmlcov/
43 | .tox/
44 | .nox/
45 | .coverage
46 | .coverage.*
47 | .cache
48 | nosetests.xml
49 | coverage.xml
50 | *.cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # celery beat schedule file
95 | celerybeat-schedule
96 |
97 | # SageMath parsed files
98 | *.sage.py
99 |
100 | # Environments
101 | .env
102 | .venv
103 | env/
104 | venv/
105 | ENV/
106 | env.bak/
107 | venv.bak/
108 |
109 | # Spyder project settings
110 | .spyderproject
111 | .spyproject
112 |
113 | # Rope project settings
114 | .ropeproject
115 |
116 | # mkdocs documentation
117 | /site
118 |
119 | # mypy
120 | .mypy_cache/
121 | .dmypy.json
122 | dmypy.json
123 |
124 | # Pyre type checker
125 | .pyre/
126 |
--------------------------------------------------------------------------------
/pm4pyspark/algo/filtering/cases/cases_filter.py:
--------------------------------------------------------------------------------
1 | import pyspark.sql.functions as F
2 |
3 |
4 |
5 |
6 | def filter_on_ncases(df, case_id_glue="case:concept:name", max_no_cases=1000):
7 | """Filters the Spark dataframe keeping only the specified maximum number of traces
8 | """
9 |
10 | # With conversion to RDD.
11 | #cases_to_keep = df.select(case_id_glue).distinct().rdd.map(lambda row : row[0]).collect()
12 | #cases_to_keep = cases_to_keep[0:min(len(cases_to_keep), max_no_cases)]
13 | #return df.filter(df[case_id_glue].isin(cases_to_keep))
14 |
15 | #Without conversion to RDD (better).
16 | grouped_df = df.groupBy(case_id_glue).count().limit(max_no_cases).drop("count")
17 |
18 | return df.join(F.broadcast(grouped_df), case_id_glue)
19 |
20 |
21 | def filter_on_case_size(df, case_id_glue="case:concept:name", min_case_size=2, max_case_size=None):
22 | """Filters the Spark dataframe keeping only traces with at least the specified number of events
23 | """
24 |
25 | size_df = df.groupBy(case_id_glue).count()
26 | if max_case_size:
27 | size_df = size_df.filter((size_df["count"] >= min_case_size) & (size_df["count"] <= max_case_size))
28 | else:
29 | size_df = size_df.filter(size_df["count"] >= min_case_size)
30 | return df.join(F.broadcast(size_df), case_id_glue).drop("count")
31 |
32 |
33 | def filter_on_case_performance(df, case_id_glue="case:concept:name", timestamp_key="time:timestamp",
34 | min_case_performance=0, max_case_performance=10000000000):
35 | """Filters the Spark dataframe on case performance
36 | """
37 |
38 | grouped_df = df.groupby(case_id_glue)
39 | start_end_df = grouped_df.agg(F.min(timestamp_key).alias(timestamp_key), F.max(timestamp_key).alias(timestamp_key+"_1"))
40 |
41 | start_end_df = start_end_df.withColumn("caseDuration", F.unix_timestamp(start_end_df[timestamp_key+"_1"]) - F.unix_timestamp(start_end_df[timestamp_key]))
42 | start_end_df = start_end_df.filter((start_end_df["caseDuration"] > min_case_performance) & (start_end_df["caseDuration"] < max_case_performance))\
43 | .select(case_id_glue)
44 |
45 | return df.join(F.broadcast(start_end_df), case_id_glue)
46 |
47 |
48 | def apply(df, parameters=None):
49 | del df
50 | del parameters
51 | raise Exception("apply method not available for case filter")
52 |
53 |
54 | def apply_auto_filter(df, parameters=None):
55 | del df
56 | del parameters
57 | raise Exception("apply_auto_filter method not available for case filter")
58 |
--------------------------------------------------------------------------------
/pm4pyspark/algo/discovery/dfg/df_statistics.py:
--------------------------------------------------------------------------------
1 | import pyspark.sql.functions as F
2 |
3 | from pyspark.sql.window import Window
4 |
5 |
6 |
7 |
8 | def get_dfg_graph(df, measure="frequency", activity_key="concept:name", case_id_glue="case:concept:name",
9 | timestamp_key="time:timestamp", perf_aggregation_key="mean", sort_caseid_required=True,
10 | sort_timestamp_along_case_id=True, window=1):
11 | """Gets DFG graph from the Spark dataframe
12 | """
13 |
14 | #if sort_caseid_required:
15 | # if sort_timestamp_along_case_id:
16 | # df = df.orderBy(case_id_glue, timestamp_key)
17 | # else:
18 | # df = df.orderBy(case_id_glue)
19 |
20 | if measure == "frequency":
21 | df_reduced = df.select(case_id_glue, activity_key)
22 | else:
23 | df_reduced = df.select(case_id_glue, activity_key, timestamp_key)
24 |
25 | w = Window.partitionBy(case_id_glue).orderBy(case_id_glue)
26 | df_reduced_shift = df_reduced.withColumn(case_id_glue + "_1", F.lag(case_id_glue, -window, 'NaN').over(w))
27 | df_reduced_shift = df_reduced_shift.withColumn(activity_key + "_1", F.lag(activity_key, -window, 'NaN').over(w))
28 | if measure != "frequency":
29 | df_reduced_shift = df_reduced_shift.withColumn(timestamp_key + "_1", F.lag(timestamp_key, -window, 'NaN').over(w))
30 | df_successive_rows = df_reduced_shift.filter(df_reduced_shift[case_id_glue] == df_reduced_shift[case_id_glue + "_1"])
31 |
32 | if measure == "performance" or measure == "both":
33 | df_successive_rows = df_successive_rows.withColumn("caseDuration",
34 | F.unix_timestamp(df_successive_rows[timestamp_key+"_1"])
35 | - F.unix_timestamp(df_successive_rows[timestamp_key]))
36 | directly_follows_grouping = df_successive_rows.groupby(activity_key, activity_key + "_1")
37 |
38 | if measure == "frequency" or measure == "both":
39 | dfg_frequency = directly_follows_grouping.count().rdd.map(lambda row: ((row[0], row[1]), row[2]))
40 |
41 | if measure == "performance" or measure == "both":
42 | dfg_performance = directly_follows_grouping.agg({"caseDuration":perf_aggregation_key})\
43 | .rdd.map(lambda row: ((row[0], row[1]), row[2]))
44 | if measure == "frequency":
45 | return dfg_frequency.collectAsMap()
46 |
47 | if measure == "performance":
48 | return dfg_performance.collectAsMap()
49 |
50 | if measure == "both":
51 | return [dfg_frequency.collectAsMap(), dfg_performance.collectAsMap()]
52 |
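53 | # Implementation note (sketch): within each case, F.lag(col, -window, 'NaN') looks `window` rows
54 | # ahead, so every event is paired with its successor; the 'NaN' default fills the successor columns
55 | # of the last event of a case, and the equality filter on the case-id columns drops those rows.
56 | # The remaining (activity, successor activity) pairs are then counted (frequency) and/or aggregated
57 | # on their time difference (performance, "mean" by default) to obtain the DFG.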
--------------------------------------------------------------------------------
/pm4pyspark/importer/parquet/spark_df_imp.py:
--------------------------------------------------------------------------------
1 | from pm4py.objects.log import log as log_instance
2 | from pm4py.objects.conversion.log.versions import to_event_log
3 | from pm4pyspark.importer.constants import DEFAULT_NUM_PARTITION
4 | from pyspark.sql import SparkSession
5 |
6 |
7 | def apply(path, parameters=None):
8 | """Imports a Parquet file
9 | """
10 |
11 | if parameters is None:
12 | parameters = {}
13 |
14 | numPartition = parameters["numPartition"] if "numPartition" in parameters else DEFAULT_NUM_PARTITION
15 |
16 | spark = (SparkSession.
17 | builder.
18 | master('local[*]').
19 | config('spark.sql.shuffle.partitions', numPartition).
20 | getOrCreate())
21 |
22 | spark_df = spark.read.parquet(path)
23 | for c in spark_df.columns:
24 | spark_df = spark_df.withColumnRenamed(c, c.replace('AAA', ':'))
25 |
26 | return spark_df
27 |
28 |
29 | def import_sparkdf_from_path(path, sort=False, sort_field="time:timestamp", ascending=True, numPartition=DEFAULT_NUM_PARTITION):
30 | """Imports a Spark DataFrame from the given path of PARQUET format file
31 | """
32 |
33 | parameters = {}
34 | parameters["numPartition"] = numPartition
35 |
36 | spark_df = apply(path, parameters=parameters)
37 |
38 | if sort and sort_field:
39 | if ascending is True:
40 | spark_df = spark_df.orderBy(sort_field)
41 | else:
42 | spark_df = spark_df.orderBy(sort_field, ascending=False)
43 |
44 | return spark_df
45 |
46 |
47 | def import_event_stream(path, sort=True, sort_field="time:timestamp", ascending=True, numPartition=DEFAULT_NUM_PARTITION):
48 | """Imports an `EventStream` from the given path of PARQUET format file
49 | """
50 |
51 | spark_df = import_sparkdf_from_path(path, sort=sort, sort_field=sort_field, ascending=ascending, numPartition=numPartition)
52 | rdd = spark_df.rdd.map(lambda row: row.asDict())
53 | event_stream = rdd.collect()
54 | event_stream = log_instance.EventStream(event_stream, attributes={'origin': 'parquet'})
55 | return event_stream
56 |
57 |
58 | def transform_event_stream_to_event_log(event_stream, case_glue="case:concept:name", include_case_attributes=True, enable_deepcopy=False):
59 | """Transforms an `EventStream` to an `EventLog`
60 | """
61 |
62 | log = to_event_log.transform_event_stream_to_event_log(event_stream,
63 | case_glue=case_glue,
64 | include_case_attributes=include_case_attributes,
65 | enable_deepcopy=enable_deepcopy)
66 |
67 | return log
68 |
--------------------------------------------------------------------------------
/pm4pyspark/algo/discovery/dfg/factory.py:
--------------------------------------------------------------------------------
1 | import pyspark
2 |
3 | from pm4py import util as pmutil
4 | from pm4py.algo.discovery.dfg.versions import native, performance
5 | from pm4py.objects.conversion.log import factory as log_conversion
6 | from pm4py.objects.log.util import general as log_util
7 | from pm4py.objects.log.util import xes as xes_util
8 |
9 | from pm4pyspark.algo.discovery.dfg import df_statistics
10 | from pm4pyspark.importer.csv import spark_df_imp as importer
11 |
12 | DFG_NATIVE = 'native'
13 | DFG_FREQUENCY = 'frequency'
14 | DFG_PERFORMANCE = 'performance'
15 | DFG_FREQUENCY_GREEDY = 'frequency_greedy'
16 | DFG_PERFORMANCE_GREEDY = 'performance_greedy'
17 |
18 | VERSIONS = {DFG_NATIVE: native.apply, DFG_FREQUENCY: native.apply, DFG_PERFORMANCE: performance.apply,
19 | DFG_FREQUENCY_GREEDY: native.apply, DFG_PERFORMANCE_GREEDY: performance.apply}
20 |
21 | def apply(log, parameters=None, variant=DFG_NATIVE):
22 | """Calculates DFG graph (frequency or performance) starting from the Spark DataFrame
23 | """
24 |
25 | if parameters is None:
26 | parameters = {}
27 | if pmutil.constants.PARAMETER_CONSTANT_ACTIVITY_KEY not in parameters:
28 | parameters[pmutil.constants.PARAMETER_CONSTANT_ACTIVITY_KEY] = xes_util.DEFAULT_NAME_KEY
29 | if pmutil.constants.PARAMETER_CONSTANT_TIMESTAMP_KEY not in parameters:
30 | parameters[pmutil.constants.PARAMETER_CONSTANT_TIMESTAMP_KEY] = xes_util.DEFAULT_TIMESTAMP_KEY
31 | if pmutil.constants.PARAMETER_CONSTANT_CASEID_KEY not in parameters:
32 | parameters[pmutil.constants.PARAMETER_CONSTANT_CASEID_KEY] = log_util.CASE_ATTRIBUTE_GLUE
33 |
34 | if isinstance(log, pyspark.sql.DataFrame):
35 | df = importer.convert_timestamp_to_utc_in_df(log, timest_columns=[
36 | parameters[pmutil.constants.PARAMETER_CONSTANT_TIMESTAMP_KEY]])
37 | dfg_frequency, dfg_performance = df_statistics.get_dfg_graph(df, measure="both",
38 | activity_key=parameters[
39 | pmutil.constants.PARAMETER_CONSTANT_ACTIVITY_KEY],
40 | timestamp_key=parameters[
41 | pmutil.constants.PARAMETER_CONSTANT_TIMESTAMP_KEY],
42 | case_id_glue=parameters[
43 | pmutil.constants.PARAMETER_CONSTANT_CASEID_KEY])
44 | if 'native' in variant or 'frequency' in variant:
45 | return dfg_frequency
46 | else:
47 | return dfg_performance
48 |
49 | return VERSIONS[variant](log_conversion.apply(log, parameters, log_conversion.TO_EVENT_LOG), parameters=parameters)
50 |
--------------------------------------------------------------------------------
/tests/input_data/running-example.csv:
--------------------------------------------------------------------------------
1 | ,Activity,Costs,Resource,case:concept:name,case:creator,concept:name,org:resource,time:timestamp
2 | 0,register request,50,Pete,3,Fluxicon Nitro,register request,Pete,2010-12-30 14:32:00+01:00
3 | 1,examine casually,400,Mike,3,Fluxicon Nitro,examine casually,Mike,2010-12-30 15:06:00+01:00
4 | 2,check ticket,100,Ellen,3,Fluxicon Nitro,check ticket,Ellen,2010-12-30 16:34:00+01:00
5 | 3,decide,200,Sara,3,Fluxicon Nitro,decide,Sara,2011-01-06 09:18:00+01:00
6 | 4,reinitiate request,200,Sara,3,Fluxicon Nitro,reinitiate request,Sara,2011-01-06 12:18:00+01:00
7 | 5,examine thoroughly,400,Sean,3,Fluxicon Nitro,examine thoroughly,Sean,2011-01-06 13:06:00+01:00
8 | 6,check ticket,100,Pete,3,Fluxicon Nitro,check ticket,Pete,2011-01-08 11:43:00+01:00
9 | 7,decide,200,Sara,3,Fluxicon Nitro,decide,Sara,2011-01-09 09:55:00+01:00
10 | 8,pay compensation,200,Ellen,3,Fluxicon Nitro,pay compensation,Ellen,2011-01-15 10:45:00+01:00
11 | 9,register request,50,Mike,2,Fluxicon Nitro,register request,Mike,2010-12-30 11:32:00+01:00
12 | 10,check ticket,100,Mike,2,Fluxicon Nitro,check ticket,Mike,2010-12-30 12:12:00+01:00
13 | 11,examine casually,400,Sean,2,Fluxicon Nitro,examine casually,Sean,2010-12-30 14:16:00+01:00
14 | 12,decide,200,Sara,2,Fluxicon Nitro,decide,Sara,2011-01-05 11:22:00+01:00
15 | 13,pay compensation,200,Ellen,2,Fluxicon Nitro,pay compensation,Ellen,2011-01-08 12:05:00+01:00
16 | 14,register request,50,Pete,1,Fluxicon Nitro,register request,Pete,2010-12-30 11:02:00+01:00
17 | 15,examine thoroughly,400,Sue,1,Fluxicon Nitro,examine thoroughly,Sue,2010-12-31 10:06:00+01:00
18 | 16,check ticket,100,Mike,1,Fluxicon Nitro,check ticket,Mike,2011-01-05 15:12:00+01:00
19 | 17,decide,200,Sara,1,Fluxicon Nitro,decide,Sara,2011-01-06 11:18:00+01:00
20 | 18,reject request,200,Pete,1,Fluxicon Nitro,reject request,Pete,2011-01-07 14:24:00+01:00
21 | 19,register request,50,Mike,6,Fluxicon Nitro,register request,Mike,2011-01-06 15:02:00+01:00
22 | 20,examine casually,400,Ellen,6,Fluxicon Nitro,examine casually,Ellen,2011-01-06 16:06:00+01:00
23 | 21,check ticket,100,Mike,6,Fluxicon Nitro,check ticket,Mike,2011-01-07 16:22:00+01:00
24 | 22,decide,200,Sara,6,Fluxicon Nitro,decide,Sara,2011-01-07 16:52:00+01:00
25 | 23,pay compensation,200,Mike,6,Fluxicon Nitro,pay compensation,Mike,2011-01-16 11:47:00+01:00
26 | 24,register request,50,Ellen,5,Fluxicon Nitro,register request,Ellen,2011-01-06 09:02:00+01:00
27 | 25,examine casually,400,Mike,5,Fluxicon Nitro,examine casually,Mike,2011-01-07 10:16:00+01:00
28 | 26,check ticket,100,Pete,5,Fluxicon Nitro,check ticket,Pete,2011-01-08 11:22:00+01:00
29 | 27,decide,200,Sara,5,Fluxicon Nitro,decide,Sara,2011-01-10 13:28:00+01:00
30 | 28,reinitiate request,200,Sara,5,Fluxicon Nitro,reinitiate request,Sara,2011-01-11 16:18:00+01:00
31 | 29,check ticket,100,Ellen,5,Fluxicon Nitro,check ticket,Ellen,2011-01-14 14:33:00+01:00
32 | 30,examine casually,400,Mike,5,Fluxicon Nitro,examine casually,Mike,2011-01-16 15:50:00+01:00
33 | 31,decide,200,Sara,5,Fluxicon Nitro,decide,Sara,2011-01-19 11:18:00+01:00
34 | 32,reinitiate request,200,Sara,5,Fluxicon Nitro,reinitiate request,Sara,2011-01-20 12:48:00+01:00
35 | 33,examine casually,400,Sue,5,Fluxicon Nitro,examine casually,Sue,2011-01-21 09:06:00+01:00
36 | 34,check ticket,100,Pete,5,Fluxicon Nitro,check ticket,Pete,2011-01-21 11:34:00+01:00
37 | 35,decide,200,Sara,5,Fluxicon Nitro,decide,Sara,2011-01-23 13:12:00+01:00
38 | 36,reject request,200,Mike,5,Fluxicon Nitro,reject request,Mike,2011-01-24 14:56:00+01:00
39 | 37,register request,50,Pete,4,Fluxicon Nitro,register request,Pete,2011-01-06 15:02:00+01:00
40 | 38,check ticket,100,Mike,4,Fluxicon Nitro,check ticket,Mike,2011-01-07 12:06:00+01:00
41 | 39,examine thoroughly,400,Sean,4,Fluxicon Nitro,examine thoroughly,Sean,2011-01-08 14:43:00+01:00
42 | 40,decide,200,Sara,4,Fluxicon Nitro,decide,Sara,2011-01-09 12:02:00+01:00
43 | 41,reject request,200,Ellen,4,Fluxicon Nitro,reject request,Ellen,2011-01-12 15:44:00+01:00
44 |
--------------------------------------------------------------------------------
/pm4pyspark/algo/filtering/timestamp/timestamp_filter.py:
--------------------------------------------------------------------------------
1 | import pyspark.sql.functions as F
2 |
3 | from pm4py.algo.filtering.common.filtering_constants import CASE_CONCEPT_NAME
4 | from pm4py.algo.filtering.common.timestamp.timestamp_common import get_dt_from_string
5 | from pm4py.util.constants import PARAMETER_CONSTANT_TIMESTAMP_KEY, PARAMETER_CONSTANT_CASEID_KEY
6 | from pm4py.objects.log.util.xes import DEFAULT_TIMESTAMP_KEY
7 | from pm4pyspark.importer.csv import spark_df_imp as importer
8 | from pm4pyspark.importer.csv import spark_df_imp as csv_importer
9 |
10 | from pyspark.sql.window import Window
11 | from pyspark.sql.types import *
12 |
13 |
14 |
15 |
16 | def filter_traces_contained(df, dt1, dt2, parameters=None):
17 | """Gets traces that are contained in the given interval
18 | """
19 |
20 | if parameters is None:
21 | parameters = {}
22 | timestamp_key = parameters[
23 | PARAMETER_CONSTANT_TIMESTAMP_KEY] if PARAMETER_CONSTANT_TIMESTAMP_KEY in parameters else DEFAULT_TIMESTAMP_KEY
24 | case_id_glue = parameters[
25 | PARAMETER_CONSTANT_CASEID_KEY] if PARAMETER_CONSTANT_CASEID_KEY in parameters else CASE_CONCEPT_NAME
26 | dt1 = get_dt_from_string(dt1)
27 | dt2 = get_dt_from_string(dt2)
28 | df_converted = importer.convert_timestamp_to_utc_in_df(df, timest_columns={timestamp_key})
29 | df_ordered = df_converted.orderBy(case_id_glue, timestamp_key)
30 | w = Window().partitionBy(case_id_glue).orderBy(timestamp_key)
31 | w2 = Window().partitionBy(case_id_glue).orderBy(F.desc(timestamp_key))
32 | stacked = df_ordered.withColumn(timestamp_key + "_last", F.max(df_ordered[timestamp_key]).over(w2))
33 | stacked = stacked.withColumn(timestamp_key + "_first", F.min(stacked[timestamp_key]).over(w))
34 | stacked = stacked.filter(stacked[timestamp_key + "_first"] > dt1)
35 | stacked = stacked.filter(stacked[timestamp_key + "_last"] < dt2)
36 | stacked_dropped = stacked.drop(timestamp_key + "_last", timestamp_key + "_first")
37 |
38 | return stacked_dropped
39 |
40 |
41 | def filter_traces_intersecting(df, dt1, dt2, parameters=None):
42 | """Filters traces intersecting the given interval
43 | """
44 |
45 | if parameters is None:
46 | parameters = {}
47 | timestamp_key = parameters[
48 | PARAMETER_CONSTANT_TIMESTAMP_KEY] if PARAMETER_CONSTANT_TIMESTAMP_KEY in parameters else DEFAULT_TIMESTAMP_KEY
49 | case_id_glue = parameters[
50 | PARAMETER_CONSTANT_CASEID_KEY] if PARAMETER_CONSTANT_CASEID_KEY in parameters else CASE_CONCEPT_NAME
51 | dt1 = get_dt_from_string(dt1)
52 | dt2 = get_dt_from_string(dt2)
53 | df_converted = importer.convert_timestamp_to_utc_in_df(df, timest_columns={timestamp_key})
54 | df_ordered = df_converted.orderBy(case_id_glue, timestamp_key)
55 | w = Window().partitionBy(case_id_glue).orderBy(timestamp_key)
56 | w2 = Window().partitionBy(case_id_glue).orderBy(F.desc(timestamp_key))
57 | stacked = df_ordered.withColumn(timestamp_key + "_last", F.max(df_ordered[timestamp_key]).over(w2))
58 | stacked = stacked.withColumn(timestamp_key + "_first", F.min(stacked[timestamp_key]).over(w))
59 |
60 | #stacked1 = stacked.filter(stacked[timestamp_key + "_first"].between(dt1, dt2))
61 | stacked1 = stacked.filter((stacked[timestamp_key + "_first"] > dt1) & (stacked[timestamp_key + "_first"] < dt2))
62 | #stacked2 = stacked.filter(stacked[timestamp_key + "_last"].between(dt1, dt2))
63 | stacked2 = stacked.filter((stacked[timestamp_key + "_last"] > dt1) & (stacked[timestamp_key + "_last"] < dt2))
64 | stacked3 = stacked.filter(stacked[timestamp_key + "_first"] < dt1)
65 | stacked3 = stacked3.filter(stacked3[timestamp_key + "_last"] > dt2)
66 |
67 |
68 | stacked = stacked1.union(stacked2)
69 | stacked = stacked.union(stacked3)
70 | stacked = stacked.drop(timestamp_key + "_last", timestamp_key + "_first")\
71 | .distinct().orderBy(case_id_glue, timestamp_key)
72 |
73 | return stacked
74 |
75 |
76 | def apply_events(df, dt1, dt2, parameters=None):
77 | """Gets a new Spark DataFrame with all the events contained in the given interval
78 | """
79 |
80 | if parameters is None:
81 | parameters = {}
82 | timestamp_key = parameters[
83 | PARAMETER_CONSTANT_TIMESTAMP_KEY] if PARAMETER_CONSTANT_TIMESTAMP_KEY in parameters else DEFAULT_TIMESTAMP_KEY
84 |
85 | if df.schema[timestamp_key].dataType != StringType():
86 | dt1 = get_dt_from_string(dt1)
87 | dt2 = get_dt_from_string(dt2)
88 | filtered_df = df.filter((df[timestamp_key] > dt1) & (df[timestamp_key] < dt2))
89 |
90 | return filtered_df
91 |
--------------------------------------------------------------------------------
/pm4pyspark/importer/csv/spark_df_imp.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import pyspark
3 | import pyspark.sql.functions as F
4 |
5 | from dateutil import parser, tz
6 | from pm4py.objects.log import log as log_instance
7 | from pm4py.objects.conversion.log.versions import to_event_log
8 | from pm4pyspark.importer.constants import DEFAULT_NUM_PARTITION
9 | from pyspark.sql import SparkSession
10 | from pyspark.sql.types import *
11 |
12 |
13 |
14 |
15 | def convert_timestamp_to_utc_in_df(df, timest_columns=None):
16 | """Converts datatype of column "time:timestamp" from `StringType` to `TimestampType` as UTC timezone
17 | """
18 |
19 | if timest_columns is None:
20 | timest_columns = {"time:timestamp"}
21 | for col in timest_columns:
22 | if df.schema[col].dataType == StringType():
23 | utc_zone = tz.gettz("UTC")
24 | func = F.udf(lambda x: parser.parse(x).astimezone(utc_zone).isoformat(timespec='milliseconds'), StringType())
25 | func2 = F.udf(lambda x: datetime.datetime.strptime(''.join(x[:-6].rsplit(':', 0)), '%Y-%m-%dT%H:%M:%S.%f'), TimestampType())
26 | #df = df.withColumn(col + "_utc", func2(func(df[col])))
27 | #df = df.drop(col).withColumnRenamed(col + "_utc", col)
28 | df = df.withColumn(col, func2(func(df[col])))
29 |
30 | return df
31 |
32 |
33 | def import_sparkdf_from_path_wo_timeconversion(path, sep=None, quote=None, header=None, inferSchema=None, numPartition=DEFAULT_NUM_PARTITION):
34 | """Imports a Spark DataFrame from the given path of CSV format file (without time conversion)
35 | """
36 |
37 | spark = (SparkSession.
38 | builder.
39 | master('local[*]').
40 | config('spark.sql.shuffle.partitions', numPartition).
41 | getOrCreate())
42 |
43 | spark_df = spark.read.csv(path, sep=sep, quote=quote, header=header, inferSchema=inferSchema)
44 |
45 | return spark_df
46 |
47 |
48 | def convert_caseid_column_to_str(df, case_id_glue="case:concept:name"):
49 | """Converts Case ID column to StringType
50 | """
51 |
52 | df = df.withColumn(case_id_glue, df[case_id_glue].cast(StringType()))
53 |
54 | return df
55 |
56 |
57 |
58 | def import_sparkdf_from_path(path, sep=None, quote=None, header=None, inferSchema=None, timest_columns=None,
59 | sort=False, sort_field="time:timestamp", ascending=True, numPartition=DEFAULT_NUM_PARTITION):
60 | """Imports a Spark DataFrame from the given path of CSV format file (with time conversion)
61 | """
62 |
63 | spark_df = import_sparkdf_from_path_wo_timeconversion(path, sep=sep, quote=quote, header=header,
64 | inferSchema=inferSchema, numPartition=numPartition)
65 | spark_df = convert_timestamp_to_utc_in_df(spark_df, timest_columns=timest_columns)
66 |
67 | if sort and sort_field:
68 | if ascending is True:
69 | spark_df = spark_df.orderBy(sort_field)
70 | else:
71 | spark_df = spark_df.orderBy(sort_field, ascending=False)
72 |
73 | return spark_df
74 |
75 |
76 | def import_event_stream(path, sep=None, quote=None, header=None, inferSchema=True, timest_columns=None, sort=True,
77 | sort_field="time:timestamp", ascending=True, numPartition=DEFAULT_NUM_PARTITION):
78 | """Imports an `EventStream` from the given path of CSV format file
79 | """
80 |
81 | spark_df = import_sparkdf_from_path(path, sep=sep, quote=quote, header=header, inferSchema=inferSchema,
82 | timest_columns=timest_columns, sort=sort, sort_field=sort_field,
83 | ascending=ascending, numPartition=numPartition)
84 | rdd = spark_df.rdd.map(lambda row: row.asDict())
85 | event_stream = rdd.collect()
86 | event_stream = log_instance.EventStream(event_stream, attributes={'origin': 'csv'})
87 | #pair_rdd = rdd.map(lambda s: (s[0], (s[1], s[2])))
88 |     #pair_rdd_group = pair_rdd.groupByKey().mapValues(list)
89 | #return pair_rdd_group.collect()
90 | return event_stream
91 |
92 |
93 | def transform_event_stream_to_event_log(event_stream, case_glue="case:concept:name", include_case_attributes=True, enable_deepcopy=False):
94 | """Transforms an `EventStream` to an `EventLog`
95 | """
96 |
97 | log = to_event_log.transform_event_stream_to_event_log(event_stream,
98 | case_glue=case_glue,
99 | include_case_attributes=include_case_attributes,
100 | enable_deepcopy=enable_deepcopy)
101 |
102 | return log
103 |
--------------------------------------------------------------------------------
/pm4pyspark/algo/filtering/start_activities/start_activities_filter.py:
--------------------------------------------------------------------------------
1 | import pyspark.sql.functions as F
2 |
3 | from pm4py.algo.filtering.common.filtering_constants import CASE_CONCEPT_NAME, DECREASING_FACTOR
4 | from pm4py.algo.filtering.common.start_activities import start_activities_common
5 | from pm4py.objects.log.util.xes import DEFAULT_NAME_KEY, DEFAULT_TIMESTAMP_KEY
6 | from pm4py.util.constants import PARAMETER_CONSTANT_TIMESTAMP_KEY
7 | from pm4py.util.constants import PARAMETER_CONSTANT_ACTIVITY_KEY
8 | from pm4py.util.constants import PARAMETER_CONSTANT_CASEID_KEY
9 | from pm4py.util.constants import GROUPED_DATAFRAME
10 | from pyspark.sql.window import Window
11 |
12 |
13 |
14 |
15 | def apply(df, values, parameters=None):
16 | """Filters the Spark dataframe on start activities
17 | """
18 |
19 | if parameters is None:
20 | parameters = {}
21 |
22 | timestamp_key = parameters[
23 | PARAMETER_CONSTANT_TIMESTAMP_KEY] if PARAMETER_CONSTANT_TIMESTAMP_KEY in parameters else DEFAULT_TIMESTAMP_KEY
24 | case_id_glue = parameters[
25 | PARAMETER_CONSTANT_CASEID_KEY] if PARAMETER_CONSTANT_CASEID_KEY in parameters else CASE_CONCEPT_NAME
26 | activity_key = parameters[
27 | PARAMETER_CONSTANT_ACTIVITY_KEY] if PARAMETER_CONSTANT_ACTIVITY_KEY in parameters else DEFAULT_NAME_KEY
28 | grouped_df = parameters[GROUPED_DATAFRAME] if GROUPED_DATAFRAME in parameters else None
29 | positive = parameters["positive"] if "positive" in parameters else True
30 |
31 | return filter_df_on_start_activities(df, values, timestamp_key=timestamp_key, case_id_glue=case_id_glue, activity_key=activity_key,
32 | positive=positive, grouped_df=grouped_df)
33 |
34 |
35 | def apply_auto_filter(df, parameters=None):
36 | """Applies auto filter on end activities
37 | """
38 |
39 | if parameters is None:
40 | parameters = {}
41 |
42 | timestamp_key = parameters[
43 | PARAMETER_CONSTANT_TIMESTAMP_KEY] if PARAMETER_CONSTANT_TIMESTAMP_KEY in parameters else DEFAULT_TIMESTAMP_KEY
44 | case_id_glue = parameters[
45 | PARAMETER_CONSTANT_CASEID_KEY] if PARAMETER_CONSTANT_CASEID_KEY in parameters else CASE_CONCEPT_NAME
46 | activity_key = parameters[
47 | PARAMETER_CONSTANT_ACTIVITY_KEY] if PARAMETER_CONSTANT_ACTIVITY_KEY in parameters else DEFAULT_NAME_KEY
48 | decreasing_factor = parameters[
49 | "decreasingFactor"] if "decreasingFactor" in parameters else DECREASING_FACTOR
50 | grouped_df = parameters[GROUPED_DATAFRAME] if GROUPED_DATAFRAME in parameters else None
51 |
52 | start_activities = get_start_activities(df, parameters=parameters)
53 | salist = start_activities_common.get_sorted_start_activities_list(start_activities)
54 | sathreshold = start_activities_common.get_start_activities_threshold(salist, decreasing_factor)
55 |
56 | return filter_df_on_start_activities_nocc(df, sathreshold, sa_count0=start_activities, timestamp_key=timestamp_key, case_id_glue=case_id_glue,
57 | activity_key=activity_key, grouped_df=grouped_df)
58 |
59 |
60 | def get_start_activities(df, parameters=None):
61 | """Gets start activities count
62 | """
63 |
64 | if parameters is None:
65 | parameters = {}
66 |
67 | timestamp_key = parameters[
68 | PARAMETER_CONSTANT_TIMESTAMP_KEY] if PARAMETER_CONSTANT_TIMESTAMP_KEY in parameters else DEFAULT_TIMESTAMP_KEY
69 | case_id_glue = parameters[
70 | PARAMETER_CONSTANT_CASEID_KEY] if PARAMETER_CONSTANT_CASEID_KEY in parameters else CASE_CONCEPT_NAME
71 | activity_key = parameters[
72 | PARAMETER_CONSTANT_ACTIVITY_KEY] if PARAMETER_CONSTANT_ACTIVITY_KEY in parameters else DEFAULT_NAME_KEY
73 | grouped_df = parameters[GROUPED_DATAFRAME] if GROUPED_DATAFRAME in parameters else df.groupby(case_id_glue)
74 |
75 | df_start = grouped_df.agg(F.first(activity_key).alias(activity_key)).select(activity_key)
76 | rdd_start = df_start.rdd.map(lambda row: (row[0], 1)).reduceByKey(lambda x, y : x + y)
77 |
78 | return rdd_start.collectAsMap()
79 |
80 |
81 | def filter_df_on_start_activities(df, values, timestamp_key=DEFAULT_TIMESTAMP_KEY, case_id_glue=CASE_CONCEPT_NAME,
82 | activity_key=DEFAULT_NAME_KEY, grouped_df=None, positive=True):
83 | """Filters the Spark dataframe on start activities
84 | """
85 |
86 | if grouped_df is None:
87 | grouped_df = df.groupby(case_id_glue)
88 |
89 | grouped_df = grouped_df.agg(F.first(activity_key).alias(activity_key+"_1"))
90 | df_start = grouped_df.filter(grouped_df[activity_key+"_1"].isin(values))
91 |
92 | if positive:
93 | return df.join(F.broadcast(df_start), grouped_df.columns[0]).drop(activity_key+"_1")
94 | else:
95 | return df.join(F.broadcast(df_start), grouped_df.columns[0], "leftanti")
96 |
97 |
98 | def filter_df_on_start_activities_nocc(df, nocc, sa_count0=None, timestamp_key=DEFAULT_TIMESTAMP_KEY,
99 | case_id_glue=CASE_CONCEPT_NAME, activity_key=DEFAULT_NAME_KEY, grouped_df=None):
100 | """Filters the Spark dataframe on start activities number of occurrences
101 | """
102 |
103 | if grouped_df is None:
104 | grouped_df = df.groupby(case_id_glue)
105 | if sa_count0 is None:
106 | parameters = {
107 | PARAMETER_CONSTANT_TIMESTAMP_KEY: timestamp_key,
108 | PARAMETER_CONSTANT_CASEID_KEY: case_id_glue,
109 | PARAMETER_CONSTANT_ACTIVITY_KEY: activity_key,
110 | GROUPED_DATAFRAME: grouped_df
111 | }
112 | sa_count0 = get_start_activities(df, parameters=parameters)
113 | sa_count = [k for k, v in sa_count0.items() if v >= nocc]
114 |
115 | if len(sa_count) < len(sa_count0):
116 | grouped_df = grouped_df.agg(F.first(activity_key).alias(activity_key+"_1"))
117 | df_start = grouped_df.filter(grouped_df[activity_key+"_1"].isin(sa_count))
118 | return df.join(F.broadcast(df_start), grouped_df.columns[0]).drop(activity_key+"_1")
119 | return df
120 |
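
A short usage sketch for this filter, assuming a Spark dataframe `spark_df` already ordered by timestamp (for example, produced by the importer with `sort=True`). Since `get_start_activities` takes `F.first` per case group, the input row order determines which event counts as the start of a case.

    from pm4pyspark.algo.filtering.start_activities import start_activities_filter

    # Count how many cases begin with each activity: {activity_name: number_of_cases}
    sa_counts = start_activities_filter.get_start_activities(spark_df)

    # Keep only the cases that start with the given activities
    # ("register request" is a hypothetical activity name)
    filtered_df = start_activities_filter.apply(
        spark_df, ["register request"], parameters={"positive": True})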
--------------------------------------------------------------------------------
/pm4pyspark/algo/filtering/end_activities/end_activities_filter.py:
--------------------------------------------------------------------------------
1 | import pyspark.sql.functions as F
2 |
3 | from pm4py.algo.filtering.common.end_activities import end_activities_common
4 | from pm4py.algo.filtering.common.filtering_constants import CASE_CONCEPT_NAME, DECREASING_FACTOR
5 | from pm4py.objects.log.util.xes import DEFAULT_NAME_KEY, DEFAULT_TIMESTAMP_KEY
6 | from pm4py.util.constants import PARAMETER_CONSTANT_TIMESTAMP_KEY
7 | from pm4py.util.constants import PARAMETER_CONSTANT_ACTIVITY_KEY
8 | from pm4py.util.constants import PARAMETER_CONSTANT_CASEID_KEY
9 | from pm4py.util.constants import PARAM_MOST_COMMON_VARIANT
10 | from pm4py.util.constants import RETURN_EA_COUNT_DICT_AUTOFILTER
11 | from pm4py.util.constants import GROUPED_DATAFRAME
12 |
13 |
14 |
15 |
16 | def apply(df, values, parameters=None):
17 | """Filters the Spark dataframe on end activities
18 | """
19 | if parameters is None:
20 | parameters = {}
21 | timestamp_key = parameters[
22 | PARAMETER_CONSTANT_TIMESTAMP_KEY] if PARAMETER_CONSTANT_TIMESTAMP_KEY in parameters else DEFAULT_TIMESTAMP_KEY
23 | case_id_glue = parameters[
24 | PARAMETER_CONSTANT_CASEID_KEY] if PARAMETER_CONSTANT_CASEID_KEY in parameters else CASE_CONCEPT_NAME
25 | activity_key = parameters[
26 | PARAMETER_CONSTANT_ACTIVITY_KEY] if PARAMETER_CONSTANT_ACTIVITY_KEY in parameters else DEFAULT_NAME_KEY
27 | grouped_df = parameters[GROUPED_DATAFRAME] if GROUPED_DATAFRAME in parameters else None
28 | positive = parameters["positive"] if "positive" in parameters else True
29 |
30 | return filter_df_on_end_activities(df, values, timestamp_key=timestamp_key, case_id_glue=case_id_glue, activity_key=activity_key,
31 | positive=positive, grouped_df=grouped_df)
32 |
33 |
34 | def apply_auto_filter(df, parameters=None):
35 | """Applies auto filter on end activities
36 | """
37 | if parameters is None:
38 | parameters = {}
39 |
40 | most_common_variant = parameters[PARAM_MOST_COMMON_VARIANT] if PARAM_MOST_COMMON_VARIANT in parameters else None
41 |
42 | if most_common_variant is None:
43 | most_common_variant = []
44 |
45 | timestamp_key = parameters[
46 | PARAMETER_CONSTANT_TIMESTAMP_KEY] if PARAMETER_CONSTANT_TIMESTAMP_KEY in parameters else DEFAULT_TIMESTAMP_KEY
47 | case_id_glue = parameters[
48 | PARAMETER_CONSTANT_CASEID_KEY] if PARAMETER_CONSTANT_CASEID_KEY in parameters else CASE_CONCEPT_NAME
49 | activity_key = parameters[
50 | PARAMETER_CONSTANT_ACTIVITY_KEY] if PARAMETER_CONSTANT_ACTIVITY_KEY in parameters else DEFAULT_NAME_KEY
51 | grouped_df = parameters[GROUPED_DATAFRAME] if GROUPED_DATAFRAME in parameters else None
52 | return_dict = parameters[
53 | RETURN_EA_COUNT_DICT_AUTOFILTER] if RETURN_EA_COUNT_DICT_AUTOFILTER in parameters else False
54 |
55 | decreasing_factor = parameters[
56 | "decreasingFactor"] if "decreasingFactor" in parameters else DECREASING_FACTOR
57 | if df.count() > 0:
58 | end_activities = get_end_activities(df, parameters=parameters)
59 | ealist = end_activities_common.get_sorted_end_activities_list(end_activities)
60 | eathreshold = end_activities_common.get_end_activities_threshold(ealist, decreasing_factor)
61 |
62 | return filter_df_on_end_activities_nocc(df, eathreshold, ea_count0=end_activities, timestamp_key=timestamp_key,
63 | case_id_glue=case_id_glue, activity_key=activity_key, grouped_df=grouped_df,
64 | return_dict=return_dict, most_common_variant=most_common_variant)
65 |
66 | if return_dict:
67 | return df, {}
68 |
69 | return df
70 |
71 |
72 | def get_end_activities(df, parameters=None):
73 | """Gets end activities count
74 | """
75 |
76 | if parameters is None:
77 | parameters = {}
78 |
79 | timestamp_key = parameters[
80 | PARAMETER_CONSTANT_TIMESTAMP_KEY] if PARAMETER_CONSTANT_TIMESTAMP_KEY in parameters else DEFAULT_TIMESTAMP_KEY
81 | case_id_glue = parameters[
82 | PARAMETER_CONSTANT_CASEID_KEY] if PARAMETER_CONSTANT_CASEID_KEY in parameters else CASE_CONCEPT_NAME
83 | activity_key = parameters[
84 | PARAMETER_CONSTANT_ACTIVITY_KEY] if PARAMETER_CONSTANT_ACTIVITY_KEY in parameters else DEFAULT_NAME_KEY
85 | grouped_df = parameters[GROUPED_DATAFRAME] if GROUPED_DATAFRAME in parameters else df.groupby(case_id_glue)
86 |
87 | df_end = grouped_df.agg(F.last(activity_key).alias(activity_key)).select(activity_key)
88 | rdd_end = df_end.rdd.map(lambda row: (row[0], 1)).reduceByKey(lambda x, y : x + y)
89 |
90 | return rdd_end.collectAsMap()
91 |
92 |
93 | def filter_df_on_end_activities(df, values, timestamp_key=DEFAULT_TIMESTAMP_KEY, case_id_glue=CASE_CONCEPT_NAME,
94 | activity_key=DEFAULT_NAME_KEY, grouped_df=None, positive=True):
95 | """Filters the Spark dataframe on end activities
96 | """
97 |
98 | if grouped_df is None:
99 | grouped_df = df.groupby(case_id_glue)
100 |
101 | grouped_df = grouped_df.agg(F.last(activity_key).alias(activity_key+"_1"))
102 | df_end = grouped_df.filter(grouped_df[activity_key+"_1"].isin(values))
103 |
104 | if positive:
105 | return df.join(F.broadcast(df_end), grouped_df.columns[0]).drop(activity_key+"_1")
106 | else:
107 | return df.join(F.broadcast(df_end), grouped_df.columns[0], "leftanti")
108 |
109 |
110 | def filter_df_on_end_activities_nocc(df, nocc, ea_count0=None, timestamp_key=DEFAULT_TIMESTAMP_KEY,
111 | case_id_glue=CASE_CONCEPT_NAME, activity_key=DEFAULT_NAME_KEY,
112 | grouped_df=None, return_dict=False, most_common_variant=None):
113 | """Filters the Spark dataframe on end activities number of occurrences
114 | """
115 |
116 | if most_common_variant is None:
117 | most_common_variant = []
118 |
119 | if df.count() > 0:
120 | if grouped_df is None:
121 | grouped_df = df.groupby(case_id_glue)
122 | if ea_count0 is None:
123 | parameters = {
124 | PARAMETER_CONSTANT_TIMESTAMP_KEY: timestamp_key,
125 | PARAMETER_CONSTANT_CASEID_KEY: case_id_glue,
126 | PARAMETER_CONSTANT_ACTIVITY_KEY: activity_key,
127 | GROUPED_DATAFRAME: grouped_df
128 | }
129 | ea_count0 = get_end_activities(df, parameters=parameters)
130 | ea_count = [k for k, v in ea_count0.items() if
131 | v >= nocc or (len(most_common_variant) > 0 and k == most_common_variant[-1])]
132 | ea_count_dict = {k: v for k, v in ea_count0.items() if
133 | v >= nocc or (len(most_common_variant) > 0 and k == most_common_variant[-1])}
134 |
135 | # Using join operation
136 | if len(ea_count) < len(ea_count0):
137 | grouped_df = grouped_df.agg(F.last(activity_key).alias(activity_key+"_1"))
138 | df_end = grouped_df.filter(grouped_df[activity_key+"_1"].isin(ea_count))
139 | if return_dict:
140 | return df.join(F.broadcast(df_end), grouped_df.columns[0]).drop(activity_key+"_1"), ea_count_dict
141 | return df.join(F.broadcast(df_end), grouped_df.columns[0]).drop(activity_key+"_1")
142 | if return_dict:
143 | return df, ea_count_dict
144 | return df
145 |
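
A sketch of the auto filter above, again assuming a prepared Spark dataframe `spark_df`. With `RETURN_EA_COUNT_DICT_AUTOFILTER` set, `apply_auto_filter` returns both the filtered dataframe and the dictionary of end activities that met the threshold, as the code above shows.

    from pm4pyspark.algo.filtering.end_activities import end_activities_filter
    from pm4py.util.constants import RETURN_EA_COUNT_DICT_AUTOFILTER

    # Drop cases whose end activity falls below the automatically derived threshold;
    # also return the {end_activity: count} dictionary of the activities that were kept
    filtered_df, ea_counts = end_activities_filter.apply_auto_filter(
        spark_df, parameters={RETURN_EA_COUNT_DICT_AUTOFILTER: True})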
--------------------------------------------------------------------------------
/pm4pyspark/algo/filtering/attributes/attributes_filter.py:
--------------------------------------------------------------------------------
1 | import pyspark.sql.functions as F
2 |
3 | from pm4py.algo.filtering.common.attributes import attributes_common
4 | from pm4py.algo.filtering.common.filtering_constants import CASE_CONCEPT_NAME, DECREASING_FACTOR
5 | from pm4py.objects.log.util.xes import DEFAULT_NAME_KEY, DEFAULT_TIMESTAMP_KEY
6 | from pm4py.util.constants import PARAMETER_CONSTANT_ATTRIBUTE_KEY
7 | from pm4py.util.constants import PARAMETER_CONSTANT_CASEID_KEY
8 | from pm4py.util.constants import PARAMETER_CONSTANT_ACTIVITY_KEY
9 | from pm4py.util.constants import PARAM_MOST_COMMON_VARIANT
10 |
11 |
12 |
13 | def apply_numeric_events(df, int1, int2, parameters=None):
14 | """Applies a filter on events (numerical filter)
15 | """
16 |
17 | if parameters is None:
18 | parameters = {}
19 | attribute_key = parameters[
20 | PARAMETER_CONSTANT_ATTRIBUTE_KEY] if PARAMETER_CONSTANT_ATTRIBUTE_KEY in parameters else DEFAULT_NAME_KEY
21 | positive = parameters["positive"] if "positive" in parameters else True
22 | if positive:
23 | return df.filter(df[attribute_key].between(int1, int2))
24 | else:
25 | return df.filter(~df[attribute_key].between(int1, int2))
26 |
27 |
28 | def apply_numeric(df, int1, int2, parameters=None):
29 | """Filters the Spark dataframe on attribute values (filter cases)
30 | """
31 |
32 | if parameters is None:
33 | parameters = {}
34 | attribute_key = parameters[
35 | PARAMETER_CONSTANT_ATTRIBUTE_KEY] if PARAMETER_CONSTANT_ATTRIBUTE_KEY in parameters else DEFAULT_NAME_KEY
36 | case_id_glue = parameters[
37 | PARAMETER_CONSTANT_CASEID_KEY] if PARAMETER_CONSTANT_CASEID_KEY in parameters else CASE_CONCEPT_NAME
38 | positive = parameters["positive"] if "positive" in parameters else True
39 |
40 | df_filtered = df.filter(df[attribute_key].between(int1, int2))
41 | df_filtered = df_filtered.groupBy(case_id_glue).count()
42 | #filtered_index = df_filtered.select(case_id_glue).rdd.map(lambda x: x[0]).collect()
43 | if positive:
44 | return df.join(F.broadcast(df_filtered), case_id_glue).drop("count")
45 | else:
46 | df_left_joined = df.join(F.broadcast(df_filtered), case_id_glue, "left")
47 | return df_left_joined.filter(df_left_joined["count"].isNull()).drop("count")
48 |
49 |
50 | def apply_events(df, values, parameters=None):
51 | """Filters the Spark dataframe on attribute values (filter events)
52 | """
53 |
54 | if parameters is None:
55 | parameters = {}
56 | attribute_key = parameters[
57 | PARAMETER_CONSTANT_ATTRIBUTE_KEY] if PARAMETER_CONSTANT_ATTRIBUTE_KEY in parameters else DEFAULT_NAME_KEY
58 | positive = parameters["positive"] if "positive" in parameters else True
59 | if positive:
60 | return df.filter(df[attribute_key].isin(values))
61 | else:
62 | return df.filter(~df[attribute_key].isin(values))
63 |
64 |
65 | def apply(df, values, parameters=None):
66 | """Filters the Spark dataframe on attribute values (filter traces)
67 | """
68 |
69 | if parameters is None:
70 | parameters = {}
71 |
72 | case_id_glue = parameters[
73 | PARAMETER_CONSTANT_CASEID_KEY] if PARAMETER_CONSTANT_CASEID_KEY in parameters else CASE_CONCEPT_NAME
74 | attribute_key = parameters[
75 | PARAMETER_CONSTANT_ATTRIBUTE_KEY] if PARAMETER_CONSTANT_ATTRIBUTE_KEY in parameters else DEFAULT_NAME_KEY
76 | positive = parameters["positive"] if "positive" in parameters else True
77 |
78 | return filter_df_on_attribute_values(df, values, case_id_glue=case_id_glue, attribute_key=attribute_key,
79 | positive=positive)
80 |
81 |
82 | def apply_auto_filter(df, parameters=None):
83 | """Applies auto filter on activity values
84 | """
85 | if parameters is None:
86 | parameters = {}
87 |
88 | most_common_variant = parameters[PARAM_MOST_COMMON_VARIANT] if PARAM_MOST_COMMON_VARIANT in parameters else None
89 |
90 | if most_common_variant is None:
91 | most_common_variant = []
92 |
93 | activity_key = parameters[
94 | PARAMETER_CONSTANT_ACTIVITY_KEY] if PARAMETER_CONSTANT_ACTIVITY_KEY in parameters else DEFAULT_NAME_KEY
95 | decreasing_factor = parameters[
96 | "decreasingFactor"] if "decreasingFactor" in parameters else DECREASING_FACTOR
97 |
98 | if df.count() > 0:
99 | activities = get_attribute_values(df, activity_key)
100 | alist = attributes_common.get_sorted_attributes_list(activities)
101 | thresh = attributes_common.get_attributes_threshold(alist, decreasing_factor)
102 |
103 | return filter_df_keeping_activ_exc_thresh(df, thresh, activity_key=activity_key, act_count0=activities,
104 | most_common_variant=most_common_variant)
105 | return df
106 |
107 |
108 | def get_attribute_values(df, attribute_key, parameters=None):
109 | """Returns a list of attribute values contained in the specified column of the CSV
110 | """
111 |
112 | if parameters is None:
113 | parameters = {}
114 | str(parameters)
115 | df = df.select(attribute_key)
116 | rdd_df = df.rdd.map(lambda event: (event[0], 1)).reduceByKey(lambda x, y : x + y)\
117 | .sortBy(lambda x: -x[1])
118 |
119 | return rdd_df.collectAsMap()
120 |
121 |
122 | def filter_df_on_attribute_values(df, values, case_id_glue="case:concept:name", attribute_key="concept:name",
123 | positive=True):
124 | """Filters the Spark dataframe on attribute values
125 | """
126 |
127 | df_filtered = df.filter(df[attribute_key].isin(values))
128 | df_filtered = df_filtered.groupBy(case_id_glue).count()
129 | if positive:
130 | return df.join(F.broadcast(df_filtered), case_id_glue).drop("count")
131 | else:
132 | return df.join(F.broadcast(df_filtered), case_id_glue, "leftanti")
133 |
134 |
135 | def filter_df_keeping_activ_exc_thresh(df, thresh, act_count0=None, activity_key="concept:name",
136 | most_common_variant=None):
137 | """Filters the Spark dataframe keeping activities exceeding the threshold
138 | """
139 |
140 | if most_common_variant is None:
141 | most_common_variant = []
142 |
143 | if act_count0 is None:
144 | act_count0 = get_attribute_values(df, activity_key)
145 | act_count = [k for k, v in act_count0.items() if v >= thresh or k in most_common_variant]
146 | if len(act_count) < len(act_count0):
147 | df = df.filter(df[activity_key].isin(act_count))
148 | return df
149 |
150 |
151 | def filter_df_keeping_spno_activities(df, activity_key="concept:name", max_no_activities=25):
152 | """Filters the Spark dataframe on the specified number of attributes
153 | """
154 |
155 | activity_values_dict = get_attribute_values(df, activity_key)
156 | activity_values_ordered_list = []
157 | for act in activity_values_dict:
158 | activity_values_ordered_list.append([act, activity_values_dict[act]])
159 | activity_values_ordered_list = sorted(activity_values_ordered_list, key=lambda x: (x[1], x[0]), reverse=True)
160 | activity_values_ordered_list = activity_values_ordered_list[
161 | 0:min(len(activity_values_ordered_list), max_no_activities)]
162 | activity_to_keep = [x[0] for x in activity_values_ordered_list]
163 |
164 | if len(activity_to_keep) < len(activity_values_dict):
165 | df = df.filter(df[activity_key].isin(activity_to_keep))
166 | return df
167 |
168 |
169 | def get_kde_numeric_attribute(df, attribute, parameters=None):
170 | """Gets the KDE estimation for the distribution of a numeric attribute values
171 | """
172 | values = df.select(attribute).rdd.map(lambda row : row[0]).collect()
173 |
174 | return attributes_common.get_kde_numeric_attribute(values, parameters=parameters)
175 |
176 |
177 | def get_kde_numeric_attribute_json(df, attribute, parameters=None):
178 | """
179 | Gets the KDE estimation for the distribution of a numeric attribute's values
180 | (expressed as JSON)
181 | """
182 | values = df.select(attribute).rdd.map(lambda row : row[0]).collect()
183 |
184 | return attributes_common.get_kde_numeric_attribute_json(values, parameters=parameters)
185 |
186 |
187 | def get_kde_date_attribute(df, attribute=DEFAULT_TIMESTAMP_KEY, parameters=None):
188 | """Gets the KDE estimation for the distribution of a date attribute values
189 | """
190 | date_values = df.select(attribute).rdd.map(lambda row : row[0]).collect()
191 |
192 | return attributes_common.get_kde_date_attribute(date_values, parameters=parameters)
193 |
194 |
195 | def get_kde_date_attribute_json(df, attribute=DEFAULT_TIMESTAMP_KEY, parameters=None):
196 | """
197 | Gets the KDE estimation for the distribution of a date attribute's values
198 | (expressed as JSON)
199 | """
200 | values = df.select(attribute).rdd.map(lambda row : row[0]).collect()
201 |
202 | return attributes_common.get_kde_date_attribute_json(values, parameters=parameters)
203 |
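
A usage sketch for the attribute filters above. `spark_df` and the "amount" column are assumptions; the parameter constant comes from pm4py and the module path from the file header above.

    from pm4pyspark.algo.filtering.attributes import attributes_filter
    from pm4py.util.constants import PARAMETER_CONSTANT_ATTRIBUTE_KEY

    # {attribute_value: number_of_events}, sorted by decreasing frequency
    activity_counts = attributes_filter.get_attribute_values(spark_df, "concept:name")

    # Keep the cases containing at least one event whose (hypothetical) "amount"
    # attribute lies in the closed interval [34, 500]
    cases_in_range = attributes_filter.apply_numeric(
        spark_df, 34, 500,
        parameters={PARAMETER_CONSTANT_ATTRIBUTE_KEY: "amount", "positive": True})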
--------------------------------------------------------------------------------
/pm4pyspark/algo/filtering/variants/variants_filter.py:
--------------------------------------------------------------------------------
1 | import pyspark.sql.functions as F
2 |
3 | from pm4py.algo.filtering.common.filtering_constants import CASE_CONCEPT_NAME, DECREASING_FACTOR
4 | from pm4py.objects.log.util.xes import DEFAULT_NAME_KEY, DEFAULT_TIMESTAMP_KEY
5 | from pm4py.statistics.traces.common import case_duration as case_duration_commons
6 | from pm4py.util.constants import PARAMETER_CONSTANT_TIMESTAMP_KEY
7 | from pm4py.util.constants import PARAMETER_CONSTANT_ACTIVITY_KEY
8 | from pm4py.util.constants import PARAMETER_CONSTANT_CASEID_KEY
9 | from pyspark.sql.window import Window
10 |
11 |
12 |
13 |
14 | def apply_auto_filter(df, parameters=None):
15 | """Applies an automatic filter on variants
16 | """
17 | if parameters is None:
18 | parameters = {}
19 | case_id_glue = parameters[
20 | PARAMETER_CONSTANT_CASEID_KEY] if PARAMETER_CONSTANT_CASEID_KEY in parameters else CASE_CONCEPT_NAME
21 | variants_df = get_variants_df(df, parameters=parameters)
22 | parameters["variants_df"] = variants_df
23 | variants = get_variant_statistics(df, parameters=parameters)
24 | decreasing_factor = parameters[
25 | "decreasingFactor"] if "decreasingFactor" in parameters else DECREASING_FACTOR
26 |
27 | admitted_variants = []
28 | if len(variants) > 0:
29 | current_variant_count = variants[0][case_id_glue]
30 |
31 | for i in range(len(variants)):
32 | if variants[i][case_id_glue] >= decreasing_factor * current_variant_count:
33 | admitted_variants.append(variants[i]["variant"])
34 | else:
35 | break
36 | current_variant_count = variants[i][case_id_glue]
37 |
38 | return apply(df, admitted_variants, parameters=parameters)
39 |
40 |
41 | def apply(df, admitted_variants, parameters=None):
42 | """Applies a filter on variants
43 | """
44 | if parameters is None:
45 | parameters = {}
46 |
47 | case_id_glue = parameters[
48 | PARAMETER_CONSTANT_CASEID_KEY] if PARAMETER_CONSTANT_CASEID_KEY in parameters else CASE_CONCEPT_NAME
49 | positive = parameters["positive"] if "positive" in parameters else True
50 | variants_df = parameters["variants_df"] if "variants_df" in parameters else get_variants_df(df,
51 | parameters=parameters)
52 | variants_df = variants_df.filter(variants_df["variant"].isin(admitted_variants))
53 |
54 | if positive:
55 | return df.join(F.broadcast(variants_df), case_id_glue)
56 | else:
57 | return df.join(F.broadcast(variants_df), case_id_glue, "leftanti")
58 |
59 |
60 | def get_variant_statistics(df, parameters=None):
61 | """Gets variants from the Spark dataframe
62 | """
63 | if parameters is None:
64 | parameters = {}
65 |
66 | case_id_glue = parameters[
67 | PARAMETER_CONSTANT_CASEID_KEY] if PARAMETER_CONSTANT_CASEID_KEY in parameters else CASE_CONCEPT_NAME
68 | max_variants_to_return = parameters["max_variants_to_return"] if "max_variants_to_return" in parameters else None
69 | variants_df = parameters["variants_df"] if "variants_df" in parameters else get_variants_df(df,
70 | parameters=parameters)
71 |
72 | variants_df_count = variants_df.groupby("variant").count().orderBy("count", ascending=False)
73 | variants_df_count = variants_df_count.withColumnRenamed("count", case_id_glue)
74 | rdd = variants_df_count.rdd.map(lambda row: row.asDict())
75 | if max_variants_to_return:
76 | return rdd.take(max_variants_to_return)
77 | return rdd.collect()
78 |
79 |
80 | def get_variant_statistics_with_case_duration(df, parameters=None):
81 | """Gets variants from the Spark dataframe with case duration
82 | """
83 | if parameters is None:
84 | parameters = {}
85 | case_id_glue = parameters[
86 | PARAMETER_CONSTANT_CASEID_KEY] if PARAMETER_CONSTANT_CASEID_KEY in parameters else CASE_CONCEPT_NAME
87 | max_variants_to_return = parameters["max_variants_to_return"] if "max_variants_to_return" in parameters else None
88 | variants_df = parameters["variants_df"] if "variants_df" in parameters else get_variants_df_with_case_duration(df,
89 | parameters=parameters)
90 |
91 | variants_df = variants_df.groupby("variant").agg(
92 | F.mean("caseDuration").alias("caseDuration"),
93 | F.count(F.lit(1)).alias("count")
94 | ).orderBy("count", ascending=False)
95 | variants_list = variants_df.rdd.map(lambda row: row.asDict())
96 | if max_variants_to_return:
97 | return variants_list.take(max_variants_to_return)
98 | return variants_list.collect()
99 |
100 |
101 | def get_variants_df_and_list(df, parameters=None):
102 | """(Technical method) Provides variants_df and variants_list out of the box
103 | """
104 | if parameters is None:
105 | parameters = {}
106 | case_id_glue = parameters[
107 | PARAMETER_CONSTANT_CASEID_KEY] if PARAMETER_CONSTANT_CASEID_KEY in parameters else CASE_CONCEPT_NAME
108 | variants_df = get_variants_df(df, parameters=parameters)
109 | parameters["variants_df"] = variants_df
110 | variants_stats = get_variant_statistics(df, parameters=parameters)
111 | variants_list = []
112 | for vd in variants_stats:
113 | variant = vd["variant"]
114 | count = vd[case_id_glue]
115 | variants_list.append([variant, count])
116 | variants_list = sorted(variants_list, key=lambda x: (x[1], x[0]), reverse=True)
117 | return variants_df, variants_list
118 |
119 |
120 | def get_cases_description(df, parameters=None):
121 | """Gets a description of traces present in the Spark dataframe
122 | """
123 | if parameters is None:
124 | parameters = {}
125 |
126 | case_id_glue = parameters[
127 | PARAMETER_CONSTANT_CASEID_KEY] if PARAMETER_CONSTANT_CASEID_KEY in parameters else CASE_CONCEPT_NAME
128 | timestamp_key = parameters[
129 | PARAMETER_CONSTANT_TIMESTAMP_KEY] if PARAMETER_CONSTANT_TIMESTAMP_KEY in parameters else DEFAULT_TIMESTAMP_KEY
130 | enable_sort = parameters["enable_sort"] if "enable_sort" in parameters else True
131 | sort_by_column = parameters["sort_by_column"] if "sort_by_column" in parameters else "startTime"
132 | sort_ascending = parameters["sort_ascending"] if "sort_ascending" in parameters else True
133 | max_ret_cases = parameters["max_ret_cases"] if "max_ret_cases" in parameters else None
134 |
135 | ordered_df = df.orderBy(timestamp_key).select(case_id_glue, timestamp_key)
136 | grouped_df = ordered_df.groupby(case_id_glue)
137 |
138 | start_df = grouped_df.agg(F.min(timestamp_key).alias(timestamp_key))
139 | first_eve_df = ordered_df.join(F.broadcast(start_df), start_df.columns)
140 | end_df = grouped_df.agg(F.max(timestamp_key).alias(timestamp_key))
141 | last_eve_df = ordered_df.join(F.broadcast(end_df), end_df.columns)
142 | last_eve_df = last_eve_df.withColumnRenamed(timestamp_key, timestamp_key+"_2")
143 |
144 | stacked_df = first_eve_df.join(last_eve_df, case_id_glue).orderBy(case_id_glue)
145 | stacked_df = stacked_df.withColumn("caseDuration", F.unix_timestamp(stacked_df[timestamp_key+"_2"]) - F.unix_timestamp(stacked_df[timestamp_key]))
146 | stacked_df = stacked_df.withColumn("startTime", F.unix_timestamp(stacked_df[timestamp_key])).drop(timestamp_key)
147 | stacked_df = stacked_df.withColumn("endTime", F.unix_timestamp(stacked_df[timestamp_key+"_2"])).drop(timestamp_key+"_2")
148 |
149 | if enable_sort:
150 | stacked_df = stacked_df.orderBy(sort_by_column, ascending=sort_ascending)
151 | if max_ret_cases is not None:
152 | stacked_df = stacked_df.limit(max_ret_cases)
153 | rdd = stacked_df.rdd.map(lambda x: (x[case_id_glue], {'caseDuration': x['caseDuration'],
154 | 'startTime': x['startTime'],
155 | 'endTime': x['endTime']}))
156 | return rdd.collectAsMap()
157 |
158 |
159 | def get_variants_df(df, parameters=None):
160 | """Gets variants dataframe from the Spark dataframe
161 | """
162 | if parameters is None:
163 | parameters = {}
164 |
165 | timestamp_key = parameters[
166 | PARAMETER_CONSTANT_TIMESTAMP_KEY] if PARAMETER_CONSTANT_TIMESTAMP_KEY in parameters else DEFAULT_TIMESTAMP_KEY
167 | case_id_glue = parameters[
168 | PARAMETER_CONSTANT_CASEID_KEY] if PARAMETER_CONSTANT_CASEID_KEY in parameters else CASE_CONCEPT_NAME
169 | activity_key = parameters[
170 | PARAMETER_CONSTANT_ACTIVITY_KEY] if PARAMETER_CONSTANT_ACTIVITY_KEY in parameters else DEFAULT_NAME_KEY
171 |
172 | df = df.select(case_id_glue, activity_key)
173 | grouped_df = df.withColumn("@@id", F.monotonically_increasing_id())\
174 | .groupBy(case_id_glue)\
175 | .agg(F.collect_list(F.struct("@@id", activity_key)).alias("variant"))\
176 | .select(case_id_glue, F.sort_array("variant").getItem(activity_key).alias("variant"))
177 | grouped_df = grouped_df.withColumn("variant", F.concat_ws(",", "variant"))
178 |
179 | return grouped_df
180 |
181 |
182 | def get_variants_df_with_case_duration(df, parameters=None):
183 | """Gets variants dataframe from the Spark dataframe, with case duration that is included
184 | """
185 | if parameters is None:
186 | parameters = {}
187 |
188 | case_id_glue = parameters[
189 | PARAMETER_CONSTANT_CASEID_KEY] if PARAMETER_CONSTANT_CASEID_KEY in parameters else CASE_CONCEPT_NAME
190 | activity_key = parameters[
191 | PARAMETER_CONSTANT_ACTIVITY_KEY] if PARAMETER_CONSTANT_ACTIVITY_KEY in parameters else DEFAULT_NAME_KEY
192 | timestamp_key = parameters[
193 | PARAMETER_CONSTANT_TIMESTAMP_KEY] if PARAMETER_CONSTANT_TIMESTAMP_KEY in parameters else DEFAULT_TIMESTAMP_KEY
194 |
195 | ordered_df = df.orderBy(timestamp_key).select(case_id_glue, timestamp_key, activity_key)
196 | grouped_df = ordered_df.groupby(case_id_glue)
197 | df1 = grouped_df.agg(F.collect_list(activity_key).alias("variant"))
198 | df1 = df1.withColumn("variant", F.concat_ws(",", "variant")).orderBy(case_id_glue)
199 |
200 | start_df = grouped_df.agg(F.min(timestamp_key).alias(timestamp_key))
201 | first_eve_df = ordered_df.join(F.broadcast(start_df), start_df.columns)
202 | end_df = grouped_df.agg(F.max(timestamp_key).alias(timestamp_key))
203 | last_eve_df = ordered_df.join(F.broadcast(end_df), end_df.columns)
204 | last_eve_df = last_eve_df.withColumnRenamed(timestamp_key, timestamp_key+"_2")
205 | last_eve_df = last_eve_df.withColumnRenamed(activity_key, activity_key+"_2")
206 |
207 | stacked_df = first_eve_df.join(last_eve_df, case_id_glue).orderBy(case_id_glue)
208 | stacked_df = stacked_df.withColumn("caseDuration", F.unix_timestamp(stacked_df[timestamp_key+"_2"]) - F.unix_timestamp(stacked_df[timestamp_key]))
209 | new_df = df1.join(stacked_df, case_id_glue)
210 | return new_df
211 |
212 |
213 | def get_events(df, case_id, parameters=None):
214 | """Gets events belonging to the specified case
215 | """
216 | if parameters is None:
217 | parameters = {}
218 | case_id_glue = parameters[
219 | PARAMETER_CONSTANT_CASEID_KEY] if PARAMETER_CONSTANT_CASEID_KEY in parameters else CASE_CONCEPT_NAME
220 | return df.filter(df[case_id_glue] == case_id).rdd.map(lambda row: row.asDict()).collect()
221 |
222 |
223 | def get_kde_caseduration(df, parameters=None):
224 | """Gets the estimation of KDE density for the case durations calculated on the Spark dataframe
225 | """
226 | cases = get_cases_description(df, parameters=parameters)
227 | duration_values = [x["caseDuration"] for x in cases.values()]
228 |
229 | return case_duration_commons.get_kde_caseduration(duration_values, parameters=parameters)
230 |
231 |
232 | def get_kde_caseduration_json(df, parameters=None):
233 | """
234 | Gets the KDE density estimation for the case durations calculated on the Spark dataframe
235 | (expressed as JSON)
236 | """
237 | cases = get_cases_description(df, parameters=parameters)
238 | duration_values = [x["caseDuration"] for x in cases.values()]
239 |
240 | return case_duration_commons.get_kde_caseduration_json(duration_values, parameters=parameters)
241 |
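
A sketch of the variants API above, assuming a prepared `spark_df`. `get_variant_statistics` returns one dict per variant, keyed by "variant" (the comma-separated activity sequence built in `get_variants_df`) and by the case-id column name holding the number of cases; those variant strings can be fed back into `apply`.

    from pm4pyspark.algo.filtering.variants import variants_filter

    # Top 5 variants, e.g. [{"variant": "A,B,C", "case:concept:name": 12}, ...] (illustrative values)
    top_variants = variants_filter.get_variant_statistics(
        spark_df, parameters={"max_variants_to_return": 5})

    # Keep only the cases following one of those variants
    filtered_df = variants_filter.apply(spark_df, [v["variant"] for v in top_variants])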
--------------------------------------------------------------------------------
/tests/test_output_data/running-example_freq.svg:
--------------------------------------------------------------------------------
(SVG markup omitted from this listing)
--------------------------------------------------------------------------------
/tests/test_output_data/running-example_perf.svg:
--------------------------------------------------------------------------------
(SVG markup omitted from this listing)
--------------------------------------------------------------------------------