from pyspark.sql import SQLContext

# Wrap the SparkContext `sc` (not defined in this snippet; it must already
# exist in the session) in a SQLContext so SQL text can be executed.
# NOTE(review): the other snippets in this file go through `spark` instead;
# SQLContext is the older entry point -- confirm which one the target Spark
# version expects before standardising.
sqlContext = SQLContext(sc)
sqlContext  # bare name: no effect in a script; presumably a notebook echo

# Select every column of some_schema.some_table for rows whose eff_date lies
# in the half-open range ['2022-05-01', '2022-06-01').
# The adjacent literals concatenate to exactly the original query text.
df = sqlContext.sql(
    " SELECT * FROM some_schema.some_table"
    " WHERE eff_date >= '2022-05-01'"
    " AND eff_date < '2022-06-01' "
)
print(type(df))
# <class 'pyspark.sql.dataframe.DataFrame'>
# Build a DataFrame from a plain list of 2-tuples of strings.
# No column names or schema are passed to createDataFrame.
# NOTE(review): ("Cate", "ID") reads like header labels, but here it is just
# one more data row -- confirm that this is intended.
data = [
    ('A', "1"),
    ('B', "2"),
    ('C', "3"),
    ('D', "4"),
    ("Cate", "ID"),
]
print(type(data))  # <class 'list'>

# `spark` is not defined in this snippet; it must already exist in the session.
df = spark.createDataFrame(data)
print(type(df))  # <class 'pyspark.sql.dataframe.DataFrame'>
# Build a DataFrame from a list of dicts, one dict per row.
# Every dict carries the same four keys; "Truth" is None for category 'C'.
data = [
    {"Category": 'A', "ID": 1, "Value": 121.44, "Truth": True},
    {"Category": 'B', "ID": 2, "Value": 300.01, "Truth": False},
    {"Category": 'C', "ID": 3, "Value": 10.99, "Truth": None},
    {"Category": 'D', "ID": 4, "Value": 33.87, "Truth": True},
]
print(type(data))  # <class 'list'>

# No explicit schema is passed to createDataFrame.
# `spark` is not defined in this snippet; it must already exist in the session.
df = spark.createDataFrame(data)
print(type(df))  # <class 'pyspark.sql.dataframe.DataFrame'>
# Materialise the DataFrame `df` (created earlier in the session) as a
# Python list.
li = df.collect()
print(type(li))  # <class 'list'>

# Inspect the first collected element; the sample output shows it is a Row.
print(li[0])  # Row(Category='A', ID=1, Truth=True, Value=121.44)
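กำหนดค่าให้เป็น null ได้ 2 วิธี:
ให้ค่าเป็น None (เช่น "Truth": None)
หรือไม่ใส่คีย์นั้นใน dict ของแถวนั้นเลย (เช่น คีย์ "date" ในตัวอย่างถัดไป ซึ่งมีเฉพาะในแถวแรก)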
# Rows as dicts where one key is optional: only the first row has "date";
# the remaining rows omit that key. Row 'C' also sets "Truth" to None.
data = [
    {
        "Category": 'A',
        "ID": 1,
        "Value": 121.44,
        "Truth": True,
        "date": "20220927",
    },
    {"Category": 'B', "ID": 2, "Value": 300.01, "Truth": False},
    {"Category": 'C', "ID": 3, "Value": 10.99, "Truth": None},
    {"Category": 'D', "ID": 4, "Value": 33.87, "Truth": True},
]
print(type(data))  # <class 'list'>
print(type(data[0]))  # <class 'dict'>

# `spark` and `display` are not defined in this snippet; presumably both are
# supplied by the notebook environment -- confirm before running as a script.
df = spark.createDataFrame(data)
display(df)
