One possible way to do this is:
import pyspark.sql.functions as F
from pyspark import SparkContext, SQLContext
from pyspark.sql.window import Window

sc = SparkContext("local")
sqlContext = SQLContext(sc)
data1 = [
[1, "2023-09-13", 2.976479132636267, 2, "2023-09-11"],
[1, "2023-09-14", 20.956374354696695, 2, "2023-09-12"],
[1, "2023-09-15", 98.13910624090528, 2, "2023-09-13"],
[1, "2023-09-16", 84.8245242496161, 2, "2023-09-14"],
[1, "2023-09-17", 56.3138752292941, 2, "2023-09-15"],
]
df1Columns = ["id", "date", "amount", "ndxDays", "ndxDaysDate"]
df1 = sqlContext.createDataFrame(data=data1, schema=df1Columns)
df1.show(n=10, truncate=False)
# Running total per id: frame spans from the first row of the partition up to the current row
window_spec = (
    Window.partitionBy("id")
    .orderBy(F.col("date"), F.col("ndxDaysDate"))
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)
spark_df = df1.withColumn("cum_sum", F.sum(F.col("amount")).over(window_spec))
print("cum sum dataframe")
spark_df.show(n=100, truncate=False)
Output:
+---+----------+------------------+-------+-----------+
|id |date |amount |ndxDays|ndxDaysDate|
+---+----------+------------------+-------+-----------+
|1 |2023-09-13|2.976479132636267 |2 |2023-09-11 |
|1 |2023-09-14|20.956374354696695|2 |2023-09-12 |
|1 |2023-09-15|98.13910624090528 |2 |2023-09-13 |
|1 |2023-09-16|84.8245242496161 |2 |2023-09-14 |
|1 |2023-09-17|56.3138752292941 |2 |2023-09-15 |
+---+----------+------------------+-------+-----------+
cum sum dataframe
+---+----------+------------------+-------+-----------+------------------+
|id |date |amount |ndxDays|ndxDaysDate|cum_sum |
+---+----------+------------------+-------+-----------+------------------+
|1 |2023-09-13|2.976479132636267 |2 |2023-09-11 |2.976479132636267 |
|1 |2023-09-14|20.956374354696695|2 |2023-09-12 |23.932853487332963|
|1 |2023-09-15|98.13910624090528 |2 |2023-09-13 |122.07195972823824|
|1 |2023-09-16|84.8245242496161 |2 |2023-09-14 |206.89648397785436|
|1 |2023-09-17|56.3138752292941 |2 |2023-09-15 |263.2103592071485 |
+---+----------+------------------+-------+-----------+------------------+
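On recent Spark versions, SQLContext is deprecated in favor of SparkSession. Below is a minimal sketch of the same cumulative sum using the SparkSession entry point, reusing data1 and df1Columns from above; the appName string is just an illustrative placeholder:

from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.window import Window

# SparkSession is the unified entry point since Spark 2.0
spark = SparkSession.builder.master("local").appName("cumsum-sketch").getOrCreate()

df1 = spark.createDataFrame(data=data1, schema=df1Columns)

# Same window as above: per-id running frame ordered by the two date columns
window_spec = (
    Window.partitionBy("id")
    .orderBy(F.col("date"), F.col("ndxDaysDate"))
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)
spark_df = df1.withColumn("cum_sum", F.sum("amount").over(window_spec))
spark_df.show(truncate=False)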