SQL reference

Time Travel

DESCRIBE HISTORY customer_data_delta
SELECT COUNT(*)
FROM customer_data_delta
VERSION AS OF 1
SELECT SUM(total_orders) - (
  SELECT SUM(total_orders)
  FROM customer_counts
  VERSION AS OF 0
  WHERE Country='Sweden') AS new_entries
FROM customer_counts
WHERE Country='Sweden'

ZORDER

OPTIMIZE customer_counts
ZORDER by (CustomerID)
SELECT CustomerID, SUM(total_orders) AS total
FROM customer_counts
GROUP BY CustomerID
ORDER BY total DESC
SELECT CustomerID, COUNT(Country) AS num_countries
FROM customer_counts
GROUP BY CustomerID
SORT BY num_countries DESC

VACUUM

VACUUM customer_data_delta RETAIN 0 HOURS;

uuid

Returns an universally unique identifier (UUID) string.

> SELECT uuid();
 46707d92-02f4-4817-8116-a4c3b23e6266

คำสั่ง uuid() ใช้ได้ใน SELECT

ถ้าจะใช้ uuid() ใน MERGE ด้วยการให้ค่าตอน INSERT ตรงๆเลย จะ ERROR ประมาณว่า

AnalysisException: nondeterministic expressions are only allowed in Project, Filter, Aggregate, Window, or Generate

ให้ไปใช้ uuid() ที่ SELECT ใน USING แล้วค่อยอ้างอิงมาให้ค่าตรง INSERT แล้วจะไม่ ERROR

cast

Casts the value expr to the target data type type.

> SELECT cast(NULL AS INT);
  NULL

> SELECT cast(5.6 AS INT);
  5

> SELECT cast(5.6 AS DECIMAL(2, 0);
  6

> SELECT cast(-5.6 AS INT);
  -5

> SELECT cast(-5.6 AS DECIMAL(2, 0);
  -6

> SELECT cast(128 AS TINYINT);
  Overflow

> SELECT cast(128 AS DECIMAL(2, 0));
  Overflow

> SELECT cast('123' AS INT);
  123

> SELECT cast('123.0' AS INT);
  Invalid format

> SELECT cast(TIMESTAMP'1970-01-01 00:00:01' AS LONG);
  1

> SELECT cast(TIMESTAMP'1970-01-01 00:00:00.000001' AS DOUBLE);
  1.0E-6

> SELECT cast(TIMESTAMP'2022-02-01 00:00:00' AS SMALLINT);
  error: overflow

> SELECT cast(true AS BOOLEAN);
  1

ฟังก์ชันเกี่ยวกับเวลา

current_timestamp และ now

Returns the current timestamp at the start of query evaluation.

> SELECT current_timestamp();
 2022-06-08T10:22:43.561+0000
> SELECT current_timestamp;
 2022-06-08T10:23:00.961+0000
> SELECT now();
 2022-06-08T10:23:44.145+0000

current_timezone

Returns the current session local timezone.

> SELECT current_timezone();
 Etc/UTC
> SELECT current_timezone();
 Asia/Shanghai

from_utc_timestamp

Returns a timestamp in expr specified in UTC in the timezone timeZone.

> SELECT from_utc_timestamp(current_timestamp, 'Asia/Bangkok');
 2022-06-08T17:30:43.343+0000
> SELECT from_utc_timestamp('2022-06-08', 'Asia/Bangkok');
 2022-06-08T07:00:00.000+0000
> SELECT from_utc_timestamp('2022-06-08 00:30:00.0', 'GMT+7');
 2022-06-08T07:30:00.000+0000

to_utc_timestamp

Returns the timestamp in expr in a different timezone as UTC.

> SELECT to_utc_timestamp('2022-06-01', 'Asia/Bangkok');
 2022-05-31T17:00:00.000+0000
> SELECT to_utc_timestamp('2022-06-01', 'GMT+7');
 2022-05-31T17:00:00.000+0000
> SELECT to_utc_timestamp('2022-06-01 00:00:00 UTC+07:00', 'GMT+7');
 2022-05-31T10:00:00.000+0000
> SELECT to_utc_timestamp('2022-06-01 00:00:00 UTC+00:00', 'GMT+7');
 2022-05-31T17:00:00.000+0000
> SELECT to_utc_timestamp('2022-06-01 00:00:00 UTC-07:00', 'GMT+7');
 2022-06-01T00:00:00.000+0000

ใช้แบบ 'GMT+7' (กำหนด time zone offsets) ดีกว่าแบบ 'Asia/Bangkok' เพราะบาง timezone มีเรื่องของ daylight savings time (DST) Dates and timestamps | Databricks on AWS

As you can see from the preceding examples, the mapping of time zone names to offsets is ambiguous, and is not one to one. In the cases when it is possible, when constructing timestamps we recommend specifying exact time zone offsets, for example 2019-11-03 01:30:00 UTC-07:00.
SELECT CURRENT_TIMESTAMP AS UTC_time
,      from_utc_timestamp(CURRENT_TIMESTAMP, 'GMT+7') AS local_time
,      to_utc_timestamp(from_utc_timestamp(CURRENT_TIMESTAMP, 'GMT+7'), 'GMT+7') AS UTC_time_again;

timestampadd

Adds value units to a timestamp expr.

timestampadd(unit, value, expr)

unit
 { MICROSECOND |
   MILLISECOND |
   SECOND |
   MINUTE |
   HOUR |
   DAY | DAYOFYEAR |
   WEEK |
   MONTH |
   QUARTER |
   YEAR }
> SELECT timestampadd(HOUR, 7, TIMESTAMP'2022-02-28 00:00:00');
 2022-02-28T07:00:00.000+0000
> SELECT timestampadd(MONTH, -1, TIMESTAMP'2022-03-31 00:00:00');
 2022-02-28T00:00:00.000+0000
> SELECT TIMESTAMPADD(HOUR, +7, CURRENT_TIMESTAMP());
 2022-06-09T09:41:24.763+0000

to_date

Returns expr cast to a date using an optional formatting.

to_date(expr [, fmt] )
> SELECT to_date('2009-07-30 04:17:52');
 2009-07-30
> SELECT to_date('2016-12-31', 'yyyy-MM-dd');
 2016-12-31

to_timestamp

Returns expr cast to a timestamp using an optional formatting.

> SELECT to_timestamp('2016-12-31 00:12:00');
 2016-12-31T00:12:00.000+0000

> SELECT to_timestamp('2016-12-31', 'yyyy-MM-dd');
 2016-12-31T00:00:00.000+0000

> SELECT to_timestamp('2022-06-09 10:41:48' , 'yyyy-MM-dd HH:mm:ss');
 2022-06-09T10:41:48.000+0000

ฟังก์ชันเกี่ยวกับสตริง

substr และ substring

Returns the substring of expr that starts at pos and is of length len.

substr(expr, pos [, len] )
substr(expr FROM pos[ FOR len])

substring(expr, pos [, len])
substring(expr FROM pos [FOR len] ] )
> SELECT substring('Spark SQL', 5);
 k SQL
> SELECT substring('Spark SQL', -3);
 SQL
> SELECT substring('Spark SQL', 5, 1);
 k
> SELECT substring('Spark SQL' FROM 5);
 k SQL
> SELECT substring('Spark SQL' FROM -3);
 SQL
> SELECT substring('Spark SQL' FROM 5 FOR 1);
 k
> SELECT substring('Spark SQL' FROM -10 FOR 5);
 Spar
> SELECT current_timestamp;
 2022-06-16T15:58:43.875+0000
> SELECT cast(current_timestamp() AS STRING);
 2022-06-16 15:58:43.875
> SELECT substring(cast(current_timestamp() AS STRING) FROM -23 FOR 19);
 2022-06-16 15:58:43

md5

Returns an MD5 128-bit checksum of expr as a hex string.

> SELECT md5('Spark');
 8cde774d6f7333752ed72cacddb05126

Concatenation

concat

Returns the concatenation of the arguments.

> SELECT concat('Spark', 'SQL');
 SparkSQL
> SELECT concat(array(1, 2, 3), array(4, 5), array(6));
 [1,2,3,4,5,6]

concat_ws

Returns the concatenation strings separated by sep.

> SELECT concat_ws(' ', 'Spark', 'SQL');
  Spark SQL
> SELECT concat_ws('s');
  ''
> SELECT concat_ws(',', 'Spark', array('S', 'Q', NULL, 'L'), NULL);
  Spark,S,Q,L

array_join

Concatenates the elements of array.

> SELECT array_join(array('hello', 'world'), ' ');
 hello world
> SELECT array_join(array('hello', NULL ,'world'), ' ');
 hello world
> SELECT array_join(array('hello', NULL ,'world'), ' ', ',');
 Hello,world

collect_list

Returns an array consisting of all values in expr within the group.

collect_list ( [ALL | DISTINCT] expr ) [FILTER ( WHERE cond ) ]
> SELECT collect_list(col) FROM VALUES (1), (2), (NULL), (1) AS tab(col);
 [1,2,1]
> SELECT collect_list(DISTINCT col) FROM VALUES (1), (2), (NULL), (1) AS tab(col);
 [1,2]

รวมอาร์เรย์จาก collect_list มาเป็นสตริงด้วย array_join

> SELECT array_join(collect_list(col), ',') 
  FROM VALUES (1), (2), (NULL), (1) AS tab(col);
 1,2,1

ฟังก์ชันเกี่ยวกับตัวเลข

abs

Returns the absolute value of the numeric value in expr.

> SELECT abs(-1);
 1

> SELECT abs(cast(-32768 AS Smallint))
 Error: ARITHMETIC_OVERFLOW

mod

Returns the remainder after dividend / divisor.

> SELECT MOD(2, 1.8);
 0.2

> SELECT MOD(2, 0);
 Error: DIVIDE_BY_ZERO

hash

Returns a hash value of the arguments.

> SELECT hash('Spark', array(123), 2);
 -1321691492
> SELECT hash('Spark', array(123));
 1262264271
> SELECT hash('Spark');
 228093765

crc32

Returns a cyclic redundancy check value of expr.

> SELECT crc32('Spark');
 1557323817