1. เตรียมตารางด้วย SQL
สร้างตารางตัวอย่างชื่อ table_name
(เป็น managed table)
%sql CREATE OR REPLACE TABLE table_name ( id BIGINT GENERATED ALWAYS AS IDENTITY (START WITH 1 INCREMENT BY 1), name STRING, age INT, flag BOOLEAN, timestamp TIMESTAMP )
คำสั่งต่างๆที่เกี่ยวข้องกับตาราง
%sql SHOW CREATE TABLE table_name
%sql DESCRIBE TABLE table_name
%sql DESCRIBE HISTORY table_name
เพิ่มข้อมูลลงตาราง table_name
(ที่ timestamp
บวกเวลาไป 7 ชั่วโมง)
%sql INSERT INTO table_name (name, age, flag, timestamp) VALUES( 'jack', '18', false, TIMESTAMPADD(HOUR, + 7, CURRENT_TIMESTAMP()) )
ดูข้อมูลตาราง table_name
%sql SELECT * FROM table_name
หรือดูด้วย Delta Time Travel
%sql SELECT * FROM table_name VERSION AS OF 1
+----+------+-----+-------+------------------------------+ | id | name | age | flag | timestamp | +----+------+-----+-------+------------------------------+ | 1 | jack | 18 | false | 2022-09-07T16:29:05.071+0000 | | 2 | tip | 14 | false | 2022-09-07T17:17:00.285+0000 | +----+------+-----+-------+------------------------------+
ลบตาราง table_name
%sql DROP TABLE table_name
2. Spark SQL (Python)
เตรียมคิวรี
%python query = """ SELECT * FROM table_name """
รันคิวรี
%python df = spark.sql(query)
ดู schema
%python df.printSchema()
root |-- id: long (nullable = true) |-- name: string (nullable = true) |-- age: integer (nullable = true) |-- flag: boolean (nullable = true) |-- timestamp: timestamp (nullable = true)
%python print(df)
DataFrame[id: bigint, name: string, age: int, flag: boolean, timestamp: timestamp]
ดูข้อมูล
%python display(df)
+----+------+-----+-------+------------------------------+ | id | name | age | flag | timestamp | +----+------+-----+-------+------------------------------+ | 1 | jack | 18 | false | 2022-09-07T16:29:05.071+0000 | | 2 | tip | 14 | false | 2022-09-07T17:17:00.285+0000 | +----+------+-----+-------+------------------------------+
%python df.show()
+---+----+---+-----+--------------------+ | id|name|age| flag| timestamp| +---+----+---+-----+--------------------+ | 1|jack| 18|false|2022-09-07 17:10:...| | 2| tip| 14|false|2022-09-07 17:17:...| +---+----+---+-----+--------------------+
%python # Select only the "name" column df.select("name").show()
+----+ |name| +----+ |jack| | tip| +----+
%python # Select everybody, but increment the age by 1 df.select(df['name'], df['age'] + 1).show()
+----+---------+ |name|(age + 1)| +----+---------+ |jack| 19| | tip| 15| +----+---------+
%python # Select people older than 15 df.filter(df['age'] > 15).show()
+---+----+---+-----+--------------------+ | id|name|age| flag| timestamp| +---+----+---+-----+--------------------+ | 1|jack| 18|false|2022-09-07 17:10:...| +---+----+---+-----+--------------------+
%python # Count people by age df.groupBy("age").count().show()
+---+-----+ |age|count| +---+-----+ | 18| 1| | 14| 1| +---+-----+
%python print(type(df.collect())) df.collect()
[Row(id=1, name='jack', age=18, flag=False, timestamp=datetime.datetime(2022, 9, 7, 17, 10, 34, 193000)), Row(id=2, name='tip', age=14, flag=False, timestamp=datetime.datetime(2022, 9, 7, 17, 17, 0, 285000))]
%python for row in df.collect(): print(row['name'])
jack tip