Retail Revenue & Supply Chain – Databricks SQL

Analyze key retail and supply chain performance indicators for a fictional enterprise.

  1. Counter – Overall Customer Count
  2. Counter – TPCH – Number Suppliers
  3. Map – National Revenue Map
  4. Bar- National Revenue Trends
  5. Table – Customer Value
  6. Line – Order Revenue

1. Counter – Overall Customer Count

SELECT
  COUNT(distinct(c_custkey))
FROM
  `samples`.`tpch`.`customer`

2. Counter – TPCH – Number Suppliers

SELECT
  COUNT(distinct(s_suppkey)) AS num_suppliers
FROM
  `samples`.`tpch`.`supplier`

3. Map – National Revenue Map

SELECT
    initcap(n_name) AS `Nation`, 
    SUM(l_extendedprice * (1 - l_discount) * (length(n_name)/100)) AS revenue
FROM
    `samples`.`tpch`.`customer`,
    `samples`.`tpch`.`orders`,
    `samples`.`tpch`.`lineitem`,
    `samples`.`tpch`.`supplier`,
    `samples`.`tpch`.`nation`,
    `samples`.`tpch`.`region`
WHERE
    c_custkey = o_custkey
    AND l_orderkey = o_orderkey
    AND l_suppkey = s_suppkey
    AND c_nationkey = s_nationkey
    AND s_nationkey = n_nationkey
    AND n_regionkey = r_regionkey
GROUP BY
    INITCAP(n_name)
ORDER BY
    revenue DESC;

4. Bar- National Revenue Trends

SELECT
    year(o_orderdate) AS year,
    n_name AS nation,
    sum(l_extendedprice * (1 - l_discount) * (((length(n_name))/100) + (year(o_orderdate)-1993)/100)) AS revenue
FROM
    `samples`.`tpch`.`customer`,
    `samples`.`tpch`.`orders`,
    `samples`.`tpch`.`lineitem`,
    `samples`.`tpch`.`supplier`,
    `samples`.`tpch`.`nation`,
    `samples`.`tpch`.`region`
WHERE
    c_custkey = o_custkey
    AND l_orderkey = o_orderkey
    AND l_suppkey = s_suppkey
    AND c_nationkey = s_nationkey
    AND s_nationkey = n_nationkey
    AND n_regionkey = r_regionkey
    AND n_name in ('ARGENTINA', 'UNITED KINGDOM', 'FRANCE','BRAZIL', 'CHINA', 'UNITED STATES', 'JAPAN', 'JORDAN')
    AND o_orderdate >= DATE '1994-01-01'
GROUP BY
    1,2
ORDER BY
    nation ASC LIMIT 1000;

5. Table – Customer Value

SELECT
  customer_id AS `Customer ID #`,
  concat(
    '<div class="bg-',
    CASE
      WHEN total_revenue BETWEEN 0
      AND 1500000 THEN 'success'
      WHEN total_revenue BETWEEN 1500001
      AND 3000000 THEN 'warning'
      WHEN total_revenue BETWEEN 3000001
      AND 5000000 THEN 'danger'
      ELSE 'danger'
    END,
    '  text-center"> $',
    format_number(total_revenue, 0),
    '</div>'
  ) AS `Total Customer Revenue`
FROM
  (
    SELECT
      o_custkey AS customer_id,
      sum(o_totalprice) as total_revenue
    FROM
      `samples`.`tpch`.`orders`
    GROUP BY
      1
    HAVING
      total_revenue > 0
  )
ORDER BY
  1
LIMIT
  400

6. Line – Order Revenue

SELECT
  o_orderdate AS Date,
  o_orderpriority AS Priority,
  sum(o_totalprice) AS `Total Price`
FROM
  `samples`.`tpch`.`orders`
WHERE
  o_orderdate > '1994-01-01'
  AND o_orderdate < '1994-01-31'
GROUP BY
  1,
  2
ORDER BY
  1,
  2

NYC Taxi Trip Analysis – Databricks SQL

Explore NYC taxi rides over a one month time frame.

  1. ตาราง nyctaxi.trips
  2. Counter – Total Trips
  3. Table – Route Revenues
  4. Chart – Pickup Hour Distribution
  5. Scatter – Daily Fare to Distance Analysis

1. ตาราง nyctaxi.trips

SHOW CREATE TABLE samples.nyctaxi.trips
CREATE TABLE samples.nyctaxi.trips (
  tpep_pickup_datetime TIMESTAMP,
  tpep_dropoff_datetime TIMESTAMP,
  trip_distance DOUBLE,
  fare_amount DOUBLE,
  pickup_zip INT,
  dropoff_zip INT
) USING delta LOCATION 'dbfs:/databricks-datasets/nyctaxi-with-zipcodes/subsampled'
SELECT * 
FROM samples.nyctaxi.trips 
LIMIT 5
#tpep_pickup_datetimetpep_dropoff_datetimetrip_distancefare_amountpickup_zipdropoff_zip
12016-02-14 16:52:13.0002016-02-14 17:16:04.0004.9419.001028210171
22016-02-04 18:44:19.0002016-02-04 18:46:00.0000.283.501011010110
32016-02-17 17:13:57.0002016-02-17 17:17:55.0000.705.001010310023
42016-02-18 10:36:07.0002016-02-18 10:41:45.0000.806.001002210017
52016-02-22 14:14:41.0002016-02-22 14:31:52.0004.5117.001011010282

2. Counter – Total Trips

USE CATALOG SAMPLES;
SELECT
  count(*) as total_trips
FROM
  `samples`.`nyctaxi`.`trips`
WHERE
  tpep_pickup_datetime BETWEEN TIMESTAMP '{{ pickup_date.start }}'
  AND TIMESTAMP '{{ pickup_date.end }}'
  AND pickup_zip IN ({{ pickupzip }})

Counter

3. Table – Route Revenues

USE CATALOG SAMPLES;
SELECT
  T.route as `Route`,
  T.frequency as `Route Frequency`,
  T.total_fare as `Total Fares`
FROM
  (
    SELECT
      concat(pickup_zip, '-', dropoff_zip) AS route,
      count(*) as frequency,
      SUM(fare_amount) as total_fare
    FROM
      `samples`.`nyctaxi`.`trips`
    WHERE
      tpep_pickup_datetime BETWEEN TIMESTAMP '{{ pickup_date.start }}'
      AND TIMESTAMP '{{ pickup_date.end }}'
      AND pickup_zip IN ({{ pickupzip }})
    GROUP BY
      1
  ) T
ORDER BY
  1 ASC
LIMIT
  200

Table

4. Chart – Pickup Hour Distribution

USE CATALOG SAMPLES;
SELECT
  CASE
    WHEN T.pickup_hour = 0 THEN '00:00'
    WHEN T.pickup_hour = 1 THEN '01:00'
    WHEN T.pickup_hour = 2 THEN '02:00'
    WHEN T.pickup_hour = 3 THEN '03:00'
    WHEN T.pickup_hour = 4 THEN '04:00'
    WHEN T.pickup_hour = 5 THEN '05:00'
    WHEN T.pickup_hour = 6 THEN '06:00'
    WHEN T.pickup_hour = 7 THEN '07:00'
    WHEN T.pickup_hour = 8 THEN '08:00'
    WHEN T.pickup_hour = 9 THEN '09:00'
    WHEN T.pickup_hour = 10 THEN '10:00'
    WHEN T.pickup_hour = 11 THEN '11:00'
    WHEN T.pickup_hour = 12 THEN '12:00'
    WHEN T.pickup_hour = 13 THEN '13:00'
    WHEN T.pickup_hour = 14 THEN '14:00'
    WHEN T.pickup_hour = 15 THEN '15:00'
    WHEN T.pickup_hour = 16 THEN '16:00'
    WHEN T.pickup_hour = 17 THEN '17:00'
    WHEN T.pickup_hour = 18 THEN '18:00'
    WHEN T.pickup_hour = 19 THEN '19:00'
    WHEN T.pickup_hour = 20 THEN '20:00'
    WHEN T.pickup_hour = 21 THEN '21:00'
    WHEN T.pickup_hour = 22 THEN '22:00'
    WHEN T.pickup_hour = 23 THEN '23:00'
    ELSE 'N/A'
  END AS `Pickup Hour`,
  T.num AS `Number of Rides`
FROM
  (
    SELECT
      hour(tpep_pickup_datetime) AS pickup_hour,
      COUNT(*) AS num
    FROM
      `samples`.`nyctaxi`.`trips`
    WHERE
      tpep_pickup_datetime BETWEEN TIMESTAMP '{{ pickup_date.start }}'
      AND TIMESTAMP '{{ pickup_date.end }}'
      AND pickup_zip IN ({{ pickupzip }})
    GROUP BY
      1
  ) T

Chart

5. Scatter – Daily Fare to Distance Analysis

USE CATALOG SAMPLES;
SELECT
  T.weekday,
  CASE
    WHEN T.weekday = 1 THEN 'Sunday'
    WHEN T.weekday = 2 THEN 'Monday'
    WHEN T.weekday = 3 THEN 'Tuesday'
    WHEN T.weekday = 4 THEN 'Wednesday'
    WHEN T.weekday = 5 THEN 'Thursday'
    WHEN T.weekday = 6 THEN 'Friday'
    WHEN T.weekday = 7 THEN 'Saturday'
    ELSE 'N/A'
  END AS day_of_week,
  T.fare_amount,
  T.trip_distance
FROM
  (
    SELECT
      dayofweek(tpep_pickup_datetime) as weekday,
      *
    FROM
      `samples`.`nyctaxi`.`trips`
    WHERE
      (
        pickup_zip in ({{ pickupzip }})
        OR pickup_zip in (10018)
      )
      AND tpep_pickup_datetime BETWEEN TIMESTAMP '{{ pickup_date.start }}'
      AND TIMESTAMP '{{ pickup_date.end }}'
      AND trip_distance < 10
  ) T
ORDER BY
  T.weekday

Scatter

Base64 encode and decode in Scala

import java.util.Base64
import java.nio.charset.StandardCharsets

val plainText: String = "abcdefg"

val encoded: String = Base64.getEncoder.encodeToString(plainText.getBytes("UTF-8"))

val decoded: Array[Byte] = Base64.getDecoder.decode(encoded)

val str1 = new String(decoded)
val str2 = new String(decoded, StandardCharsets.UTF_8)

output

plainText: String = abcdefg
encoded: String = YWJjZGVmZw==
decoded: Array[Byte] = Array(97, 98, 99, 100, 101, 102, 103)
str1: String = abcdefg
str2: String = abcdefg

Scala – AES Encryption with CBC and PKCS7Padding

import java.io.UnsupportedEncodingException;
import java.security.InvalidAlgorithmParameterException;
import java.security.InvalidKeyException;
import java.security.NoSuchAlgorithmException;
import java.security.Security;
import java.util.Base64;

import javax.crypto.BadPaddingException;
import javax.crypto.Cipher;
import javax.crypto.IllegalBlockSizeException;
import javax.crypto.NoSuchPaddingException;
import javax.crypto.spec.IvParameterSpec;
import javax.crypto.spec.SecretKeySpec;

import org.apache.commons.codec.binary.Hex

val ENCRYPT_KEY = "aaaaaaaaaaaaaaaa"
val ENCRYPT_IV = "bbbbbbbbbbbbbbbb"

def encrypt(value : String) : String = {
    val plainText: String = value
    try {
        val key: Array[Byte] = ENCRYPT_KEY.getBytes("UTF-8")
        val ivs: Array[Byte] = ENCRYPT_IV.getBytes("UTF-8")
        val cipher:Cipher = Cipher.getInstance("AES/CBC/PKCS5Padding") // PKCS7Padding
        val secretKeySpec: SecretKeySpec = new SecretKeySpec(key, "AES")
        val paramSpec  = new IvParameterSpec(ivs)
        cipher.init(Cipher.ENCRYPT_MODE, secretKeySpec, paramSpec)
        val escapedString: String = Base64.getEncoder.encodeToString(cipher.doFinal(plainText.getBytes("UTF-8"))).trim()
        val decoded: Array[Byte] = Base64.getDecoder.decode(escapedString)
        val hexString: String = Hex.encodeHexString(decoded)
  
        return hexString;
    }
    catch {
        case e: Throwable =>println("Cannot encode "+ e)
        return value
    }  
}

val hexString = encrypt("test message");

output

hexString: String = be7dec3aba8964e18cf7a942f9a89880

validate result with The X Online Tools (the-x.cn)

Scala – Convert Base64 to Hex String

import java.util.Base64
import org.apache.commons.codec.binary.Hex

val guid: String = "YxRfXk827kPgkmMUX15PNg=="
val decoded: Array[Byte] = Base64.getDecoder.decode(guid)
val hexString: String = Hex.encodeHexString(decoded)

output

guid: String = YxRfXk827kPgkmMUX15PNg==
decoded: Array[Byte] = Array(99, 20, 95, 94, 79, 54, -18, 67, -32, -110, 99, 20, 95, 94, 79, 54)
hexString: String = 63145f5e4f36ee43e09263145f5e4f36

MkDocs

MkDocs is a fast, simple and downright gorgeous static site generator that’s geared towards building project documentation. Documentation source files are written in Markdown, and configured with a single YAML

Installation

To install MkDocs, run the following command from the command line:

pip install mkdocs
> mkdocs -h
Usage: mkdocs [OPTIONS] COMMAND [ARGS]...

  MkDocs - Project documentation with Markdown.

Options:
  -V, --version  Show the version and exit.
  -q, --quiet    Silence warnings
  -v, --verbose  Enable verbose output
  -h, --help     Show this message and exit.

Commands:
  build      Build the MkDocs documentation
  gh-deploy  Deploy your documentation to GitHub Pages
  new        Create a new MkDocs project
  serve      Run the builtin development server
> mkdocs --version
mkdocs, version 1.3.1 from c:\users\jack\python38-venv\env\lib\site-packages\mkdocs (Python 3.8)

For more details, see the Installation Guide.

Creating a new project

Getting started is super easy. To create a new project, run the following command from the command line:

mkdocs new my-project
cd my-project
$ tree
.
├── docs
│   └── index.md
└── mkdocs.yml

There’s a single configuration file named mkdocs.yml, and a folder named docs that will contain your documentation source files (docs is the default value for the docs_dir configuration setting). Right now the docs folder just contains a single documentation page, named index.md.

ไฟล์ mkdocs.yml

site_name: My Docs

ไฟล์ docs/index.md

# Welcome to MkDocs

For full documentation visit [mkdocs.org](https://www.mkdocs.org).

## Commands

* `mkdocs new [dir-name]` - Create a new project.
* `mkdocs serve` - Start the live-reloading docs server.
* `mkdocs build` - Build the documentation site.
* `mkdocs -h` - Print help message and exit.

## Project layout

    mkdocs.yml    # The configuration file.
    docs/
        index.md  # The documentation homepage.
        ...       # Other markdown pages, images and other files.

MkDocs comes with a built-in dev-server that lets you preview your documentation as you work on it. Make sure you’re in the same directory as the mkdocs.yml configuration file, and then start the server by running the mkdocs serve command:

> mkdocs serve
INFO     -  Building documentation...
INFO     -  Cleaning site directory
INFO     -  Documentation built in 0.24 seconds
INFO     -  [21:14:45] Watching paths for changes: 'docs', 'mkdocs.yml'
INFO     -  [21:14:45] Serving on http://127.0.0.1:8000/

Open up http://127.0.0.1:8000/ in your browser, and you’ll see the default home page being displayed:

Fenced code blocks

The fenced code blocks extension adds an alternate method of defining code blocks without indentation.

The first line should contain 3 or more backtick (`) characters, and the last line should contain the same number of backtick characters (`):

```
Fenced code blocks are like Standard
Markdown’s regular code blocks, except that
they’re not indented and instead rely on
start and end fence lines to delimit the
code block.
```

With this approach, the language can optionally be specified on the first line after the backticks which informs any syntax highlighters of the language used:

```python
def fn():
    pass
```

```sql
SELECT * FROM FOO
```

```c
main() {
    print('Hello');
}
```

Note that fenced code blocks can not be indented. Therefore, they cannot be nested inside list items, blockquotes, etc.

Building the site

That’s looking good. You’re ready to deploy the first pass of your MkLorum documentation. First build the documentation:

mkdocs build

This will create a new directory, named site. Take a look inside the directory:

$ ls site
about  fonts  index.html  license  search.html
css    img    js          mkdocs   sitemap.xml

Notice that your source documentation has been output as two HTML files named index.html and about/index.html. You also have various other media that’s been copied into the site directory as part of the documentation theme. You even have a sitemap.xml file and mkdocs/search_index.json.

If you’re using source code control such as git you probably don’t want to check your documentation builds into the repository. Add a line containing site/ to your .gitignore file.

echo "site/" >> .gitignore

If you’re using another source code control tool you’ll want to check its documentation on how to ignore specific directories.

Python installed path

sys.executable

A string giving the absolute path of the executable binary for the Python interpreter, on systems where this makes sense. If Python is unable to retrieve the real path to its executable, sys.executable will be an empty string or None.

>>> import sys
>>> sys.executable
'C:\\Python310\\python.exe'

os.path.dirname(path)

Return the directory name of pathname path. This is the first element of the pair returned by passing path to the function split().

>>> import os
>>> import sys
>>> os.path.dirname(sys.executable)
'C:\\Python310'
>>> import os
>>> import sys
>>> os.path.dirname(sys.executable)
'C:\\Users\\<username>\\AppData\\Local\\Programs\\Python\\Python38\\python.exe'

Databricks – Date and Time

current_date function (Databricks SQL)

Returns the current date at the start of query evaluation.

current_date()

Arguments

This function takes no arguments.

Returns

A DATE.

The braces are optional.

Examples

> SELECT current_date()
 2022-08-23
> SELECT current_date;
 2022-08-23

now function (Databricks SQL)

Returns the current timestamp at the start of query evaluation.

now()

Arguments

This function takes no arguments.

Returns

A TIMESTAMP.

Examples

> SELECT now()
 2022-08-23T04:57:51.871+0000
> SELECT current_timestamp()
> SELECT current_timestamp

to_date function (Databricks SQL)

Returns expr cast to a date using an optional formatting.

to_date(expr [, fmt] )

Returns

A DATE.

> SELECT to_date('2022-08-24 07:00:00');
 2022-08-24
> SELECT to_date('2022-08-24', 'yyyy-MM-dd');
 2022-08-24

from_unixtime function (Databricks SQL)

Returns unixTime in fmt.

from_unixtime(unixTime [, fmt])

Arguments

  • unixTime: A BIGINT expression representing seconds elapsed since 1969-12-31 at 16:00:00 (แต่เหมือนจะเป็น 1970-01-01 at 00:00:00).
  • fmt: An optional STRING expression with a valid format.

Returns

A STRING.

See Datetime patterns (Databricks SQL) for valid formats. The ‘yyyy-MM-dd HH:mm:ss’ pattern is used if omitted.

Examples

> SELECT from_unixtime(0);
 1970-01-01 00:00:00
> SELECT from_unixtime(0, 'yyyy-MM-dd HH:mm:ss');
 1970-01-01 00:00:00

to_unix_timestamp function (Databricks SQL)

Returns the timestamp in expr as a UNIX timestamp.

to_unix_timestamp(expr [, fmt] )

Arguments

  • expr: A STRING expression representing a timestamp.
  • fmt: An optional format STRING expression.

Returns

A BIGINT.

If fmt is supplied, it must conform with Datetime patterns (Databricks SQL).

If fmt is not supplied, the function is a synonym for cast(expr AS TIMESTAMP).

If fmt is malformed or its application does not result in a well formed timestamp, the function raises an error.

Examples

> SELECT to_unix_timestamp(current_date())
 1661299200
> SELECT to_unix_timestamp('2022-08-24', 'yyyy-MM-dd')
 1661299200
> SELECT to_unix_timestamp(current_timestamp())
 1661328640
> SELECT  to_unix_timestamp('2022-08-24 08:11:00', 'yyyy-MM-dd HH:mm:ss')
 1661328660
> SELECT to_unix_timestamp(current_timestamp()) - to_unix_timestamp(current_date())
 29230

Python – List all files and directories in a directory

List all files and directories in a directory

from os import listdir
my_path = '/home/jack'
files_list = listdir(my_path)
print(files_list)

List all files in a directory

from os import listdir
from os.path import isfile, join
my_path = '/home/jack'
files_list = [f for f in listdir(my_path) if isfile(join(my_path, f))]
print(files_list)

List all directories in a directory

from genericpath import isdir
from os import listdir
from os.path import isfile, join
my_path = '/home/jack'
files_list = [f for f in listdir(my_path) if isdir(join(my_path, f))]
print(files_list)

Python DateTime

datetime.date

from datetime import date
today = date.today()

print(type(today))
# <class 'datetime.date'>
print(today)
# 2022-08-19

datetime.datetime

เวลาปัจจุบันของ server

from datetime import datetime
now = datetime.now() # current date and time

print(type(now))
# <class 'datetime.datetime'>
print(now)
# 2022-08-19 07:51:55.893002

นำเวลาปัจจุบันของ server มาบวก 7 ชั่วโมง

import datetime
now = datetime.datetime.now()
print(now)
# 2022-08-19 08:32:36.803853

now = now + datetime.timedelta(hours=7)
print(now)
# 2022-08-19 15:32:36.803853
from datetime import datetime
now = datetime.now() # current date and time

year = now.strftime("%Y")
print("year:", year)
# year: 2022

month = now.strftime("%m")
print("month:", month)
# month: 08

day = now.strftime("%d")
print("day:", day)
# day: 19

time = now.strftime("%H:%M:%S")
print("time:", time)
# time: 08:12:42

date_time = now.strftime("%m/%d/%Y, %H:%M:%S")
print("date and time:",date_time)	
# date and time: 08/19/2022, 08:12:42

time.struct_time

from datetime import datetime
now = datetime.now() # current date and time
time_tuple = now.timetuple()

print(type(time_tuple))
# <class 'time.struct_time'>

print(now)
# 2022-08-19 07:55:03.144893

print(time_tuple)
# time.struct_time(tm_year=2022, tm_mon=8, tm_mday=19, tm_hour=7, tm_min=55, tm_sec=3, tm_wday=4, tm_yday=231, tm_isdst=-1)

timestamp

หาค่า timestamp จาก current date และ time

import time
from datetime import datetime
now = datetime.now() # current date and time
time_tuple = now.timetuple()
timestamp = int(time.mktime(time_tuple))

print(now)
# 2022-08-19 07:59:21.232243

print(timestamp)
# 1660895961

เปลี่ยน type ของข้อมูล (float, int, str, bytes)

import time
from datetime import datetime
now = datetime.now() # current date and time
time_tuple = now.timetuple()
timestamp = time.mktime(time_tuple)

print(type(timestamp))
print(timestamp)
# <class 'float'>
# 1660898887.0

timestamp = int(timestamp)
print(type(timestamp))
print(timestamp)
# <class 'int'>
# 1660898887

timestamp = str(timestamp)
print(type(timestamp))
print(timestamp)
# <class 'str'>
# 1660898887

timestamp = str.encode(timestamp)
print(type(timestamp))
print(timestamp)
# <class 'bytes'>
# b'1660898887'

timestamp = timestamp.decode()
print(type(timestamp))
print(timestamp)
# <class 'str'>
# 1660898887

หาค่า timestamp จากวันที่ ที่กำหนด

import time
from datetime import datetime

# date in string format
dt="19/08/2022"

time_tuple = datetime.strptime(dt, "%d/%m/%Y").timetuple()
print(time_tuple)
# time.struct_time(tm_year=2022, tm_mon=8, tm_mday=19, tm_hour=0, tm_min=0, tm_sec=0, tm_wday=4, tm_yday=231, tm_isdst=-1)

timestamp = time.mktime(time_tuple)
print(timestamp)
# 1660867200.0