from pyspark.sql import SparkSession
from pyspark.sql.functions import col
# Demo script: build a two-column DataFrame and filter rows where num1 >= num2.
if __name__ == '__main__':
    # Initialize the Spark session.
    spark = SparkSession.builder.appName("FilterDataFrame").getOrCreate()

    # Sample data: (num1, num2) pairs; the None value exercises NULL comparison.
    data = [(4000, None), (4000, 5000)]
    df = spark.createDataFrame(data, ["num1", "num2"])

    # Show the original data.
    df.show()

    # Keep rows where num1 >= num2.
    # NOTE(review): the original comment said "greater than" but the code uses
    # >= ; the operator is preserved as-is — confirm intended semantics.
    # Rows where num2 is NULL are dropped (NULL comparisons are never true).
    filtered_df = df.filter(col("num1") >= col("num2"))

    # Show the filtered data.
    filtered_df.show()

    # Stop the session so the backing JVM shuts down cleanly.
    spark.stop()