Horje
pyspark split dataframe by rows Code Example
pyspark split dataframe by rows
from pyspark.sql.window import Window
from pyspark.sql.functions import monotonically_increasing_id, ntile

# Sample data: 100 one-field rows holding the strings "0" through "99".
values = [(str(n),) for n in range(100)]
# NOTE(review): `spark` is not defined in this snippet; presumably the ambient
# SparkSession (e.g. from the pyspark shell) -- confirm before running as a script.
df = spark.createDataFrame(values, schema=('value',))

def split_by_row_index(df, num_partitions=4):
    """Split ``df`` into ``num_partitions`` DataFrames by row position.

    A temporary ``_row_id`` column is added with
    ``monotonically_increasing_id()``, rows are ordered by it, and ``ntile``
    assigns each row to one of ``num_partitions`` buckets. One filtered
    DataFrame is returned per bucket, with both helper columns dropped.

    Args:
        df: The DataFrame to split.
        num_partitions: Number of output DataFrames to produce.

    Returns:
        A list of ``num_partitions`` DataFrames, in bucket order.
    """
    # Let's assume you don't have a row_id column that has the row order
    indexed = df.withColumn('_row_id', monotonically_increasing_id())
    # Using ntile() because monotonically_increasing_id is discontinuous across partitions
    # NOTE(review): the window has no partitionBy; Spark presumably evaluates it
    # in a single partition -- confirm this is acceptable for large inputs.
    bucketed = indexed.withColumn(
        '_partition',
        ntile(num_partitions).over(Window.orderBy(indexed._row_id)),
    )
    # ntile() numbers its buckets starting at 1, so iterate 1..num_partitions.
    return [
        bucketed.filter(bucketed._partition == bucket).drop('_row_id', '_partition')
        for bucket in range(1, num_partitions + 1)
    ]

# Pull each split back to the driver as a list of rows; in an interactive
# session the resulting list of lists is echoed.
[part.collect() for part in split_by_row_index(df)]




Whatever

Related
waves of ECG Code Example waves of ECG Code Example
what does boyfriend says in friday night funkin Code Example what does boyfriend says in friday night funkin Code Example
how to change hot keys in vim Code Example how to change hot keys in vim Code Example
domino's large pizza slices Code Example domino's large pizza slices Code Example
intellij ignore workspace.xml Code Example intellij ignore workspace.xml Code Example

Type:
Code Example
Category:
Coding
Sub Category:
Code Example
Uploaded by:
Admin
Views:
7