Spark DataFrame

from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)
sqlContext 

df  = sqlContext.sql("""
  SELECT *
  FROM   some_schema.some_table
  WHERE  eff_date >= '2022-05-01'
  AND    eff_date < '2022-06-01'
""")
print(type(df))
<class 'pyspark.sql.dataframe.DataFrame'>
data = [('A', "1"),
        ('B', "2"),
        ('C', "3"),
        ('D', "4"),
        ("Cate", "ID")
        ]
print(type(data))  # <class 'list'>
df = spark.createDataFrame(data)
print(type(df))    # <class 'pyspark.sql.dataframe.DataFrame'>
data = [{"Category": 'A', "ID": 1, "Value": 121.44, "Truth": True},
        {"Category": 'B', "ID": 2, "Value": 300.01, "Truth": False},
        {"Category": 'C', "ID": 3, "Value": 10.99, "Truth": None},
        {"Category": 'D', "ID": 4, "Value": 33.87, "Truth": True}
        ]
print(type(data))  # <class 'list'>
df = spark.createDataFrame(data)
print(type(df))    # <class 'pyspark.sql.dataframe.DataFrame'>
li = df.collect()
print(type(li))    # <class 'list'>
print(li[0])
# Row(Category='A', ID=1, Truth=True, Value=121.44)

กำหนดค่าเป็น null ได้ด้วยการให้ค่า None หรือ ไม่ให้ค่าตัวแปรนั้น

data = [{"Category": 'A', "ID": 1, "Value": 121.44, "Truth": True, "date": "20220927"},
        {"Category": 'B', "ID": 2, "Value": 300.01, "Truth": False},
        {"Category": 'C', "ID": 3, "Value": 10.99, "Truth": None},
        {"Category": 'D', "ID": 4, "Value": 33.87, "Truth": True}
        ]
df = spark.createDataFrame(data)
display(df)