from pyspark.sql import SQLContext sqlContext = SQLContext(sc) sqlContext df = sqlContext.sql(""" SELECT * FROM some_schema.some_table WHERE eff_date >= '2022-05-01' AND eff_date < '2022-06-01' """) print(type(df))
<class 'pyspark.sql.dataframe.DataFrame'>
data = [('A', "1"), ('B', "2"), ('C', "3"), ('D', "4"), ("Cate", "ID") ] print(type(data)) # <class 'list'> df = spark.createDataFrame(data) print(type(df)) # <class 'pyspark.sql.dataframe.DataFrame'>
data = [{"Category": 'A', "ID": 1, "Value": 121.44, "Truth": True}, {"Category": 'B', "ID": 2, "Value": 300.01, "Truth": False}, {"Category": 'C', "ID": 3, "Value": 10.99, "Truth": None}, {"Category": 'D', "ID": 4, "Value": 33.87, "Truth": True} ] print(type(data)) # <class 'list'> df = spark.createDataFrame(data) print(type(df)) # <class 'pyspark.sql.dataframe.DataFrame'>
li = df.collect() print(type(li)) # <class 'list'>
print(li[0]) # Row(Category='A', ID=1, Truth=True, Value=121.44)
กำหนดค่าเป็น null
ได้ด้วยการให้ค่า None
หรือ ไม่ให้ค่าตัวแปรนั้น
data = [{"Category": 'A', "ID": 1, "Value": 121.44, "Truth": True, "date": "20220927"}, {"Category": 'B', "ID": 2, "Value": 300.01, "Truth": False}, {"Category": 'C', "ID": 3, "Value": 10.99, "Truth": None}, {"Category": 'D', "ID": 4, "Value": 33.87, "Truth": True} ] print(type(data)) # <class 'list'> print(type(data[0])) # <class 'dict'> df = spark.createDataFrame(data) display(df)
Pyspark add new row to dataframe
create 1st dataframe
%python columns = ['Identifier', 'Value', 'Extra Discount'] vals = [(1, 150, 0), (2, 160, 12)] df = spark.createDataFrame(vals, columns) df.show()
+----------+-----+--------------+ |Identifier|Value|Extra Discount| +----------+-----+--------------+ | 1| 150| 0| | 2| 160| 12| +----------+-----+--------------+
create 2nd dataframe
%python newRow = spark.createDataFrame([(3,205,7)], columns) newRow.show()
+----------+-----+--------------+ |Identifier|Value|Extra Discount| +----------+-----+--------------+ | 3| 205| 7| +----------+-----+--------------+
union dataframe
%python new_df = df.union(newRow) new_df.show()
+----------+-----+--------------+ |Identifier|Value|Extra Discount| +----------+-----+--------------+ | 1| 150| 0| | 2| 160| 12| | 3| 205| 7| +----------+-----+--------------+
Merge pandas dataFrame
create dataframe
%python import pyspark.pandas as ps df1 = ps.DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'], 'value': [1, 2, 3, 5]}, columns=['lkey', 'value']) display(df1) # lkey value # 0 foo 1 # 1 bar 2 # 2 baz 3 # 3 foo 5 # df2 df2 = ps.DataFrame({'rkey': ['foo', 'bar', 'baz', 'foo'], 'value': [5, 6, 7, 8]}, columns=['rkey', 'value']) display(df2) # rkey value # 0 foo 5 # 1 bar 6 # 2 baz 7 # 3 foo 8
merge
%python merged = df1.merge(df2, left_on='lkey', right_on='rkey') display(merged) # lkey value_x rkey value_y # bar 2 bar 6 # baz 3 baz 7 # foo 1 foo 5 # foo 1 foo 8 # foo 5 foo 5 # foo 5 foo 8