Table

1. เตรียมตารางด้วย SQL

สร้างตารางตัวอย่างชื่อ table_name (เป็น managed table)

%sql 
CREATE OR REPLACE TABLE table_name (
  id BIGINT GENERATED ALWAYS AS IDENTITY (START WITH 1 INCREMENT BY 1),
  name STRING,
  age INT,
  flag BOOLEAN,
  timestamp TIMESTAMP
)

คำสั่งต่างๆที่เกี่ยวข้องกับตาราง

%sql
SHOW CREATE TABLE table_name
%sql
DESCRIBE TABLE table_name
%sql
DESCRIBE HISTORY table_name

เพิ่มข้อมูลลงตาราง table_name (ที่ timestamp บวกเวลาไป 7 ชั่วโมง)

%sql
INSERT INTO
  table_name (name, age, flag, timestamp)
VALUES(
    'jack',
    '18',
    false,
    TIMESTAMPADD(HOUR, + 7, CURRENT_TIMESTAMP())
  )

ดูข้อมูลตาราง table_name

%sql
SELECT
  *
FROM
  table_name

หรือดูด้วย Delta Time Travel

%sql
SELECT
  *
FROM
  table_name VERSION AS OF 1
+----+------+-----+-------+------------------------------+
| id | name | age | flag  | timestamp                    |
+----+------+-----+-------+------------------------------+
| 1  | jack | 18  | false | 2022-09-07T16:29:05.071+0000 |
| 2  | tip  | 14  | false | 2022-09-07T17:17:00.285+0000 |
+----+------+-----+-------+------------------------------+

ลบตาราง table_name

%sql
DROP TABLE table_name

2. Spark SQL (Python)

เตรียมคิวรี

%python
query = """
SELECT * 
FROM   table_name
"""

รันคิวรี

%python
df = spark.sql(query)

ดู schema

%python
df.printSchema()
root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- flag: boolean (nullable = true)
 |-- timestamp: timestamp (nullable = true)
%python
print(df)
DataFrame[id: bigint, name: string, age: int, flag: boolean, timestamp: timestamp]

ดูข้อมูล

%python
display(df)
+----+------+-----+-------+------------------------------+
| id | name | age | flag  | timestamp                    |
+----+------+-----+-------+------------------------------+
| 1  | jack | 18  | false | 2022-09-07T16:29:05.071+0000 |
| 2  | tip  | 14  | false | 2022-09-07T17:17:00.285+0000 |
+----+------+-----+-------+------------------------------+
%python
df.show()
+---+----+---+-----+--------------------+
| id|name|age| flag|           timestamp|
+---+----+---+-----+--------------------+
|  1|jack| 18|false|2022-09-07 17:10:...|
|  2| tip| 14|false|2022-09-07 17:17:...|
+---+----+---+-----+--------------------+
%python
# Select only the "name" column
df.select("name").show()
+----+
|name|
+----+
|jack|
| tip|
+----+
%python
# Select everybody, but increment the age by 1
df.select(df['name'], df['age'] + 1).show()
+----+---------+
|name|(age + 1)|
+----+---------+
|jack|       19|
| tip|       15|
+----+---------+
%python
# Select people older than 15
df.filter(df['age'] > 15).show()
+---+----+---+-----+--------------------+
| id|name|age| flag|           timestamp|
+---+----+---+-----+--------------------+
|  1|jack| 18|false|2022-09-07 17:10:...|
+---+----+---+-----+--------------------+
%python
# Count people by age
df.groupBy("age").count().show()
+---+-----+
|age|count|
+---+-----+
| 18|    1|
| 14|    1|
+---+-----+
%python
print(type(df.collect()))
df.collect()
[Row(id=1, name='jack', age=18, flag=False, timestamp=datetime.datetime(2022, 9, 7, 17, 10, 34, 193000)),
 Row(id=2, name='tip', age=14, flag=False, timestamp=datetime.datetime(2022, 9, 7, 17, 17, 0, 285000))]
%python
for row in df.collect():
  print(row['name'])
jack
tip