NYC Taxi Trip Analysis – Databricks SQL

Explore NYC taxi rides over a one month time frame.

  1. ตาราง nyctaxi.trips
  2. Counter – Total Trips
  3. Table – Route Revenues
  4. Chart – Pickup Hour Distribution
  5. Scatter – Daily Fare to Distance Analysis

1. ตาราง nyctaxi.trips

SHOW CREATE TABLE samples.nyctaxi.trips
CREATE TABLE samples.nyctaxi.trips (
  tpep_pickup_datetime TIMESTAMP,
  tpep_dropoff_datetime TIMESTAMP,
  trip_distance DOUBLE,
  fare_amount DOUBLE,
  pickup_zip INT,
  dropoff_zip INT
) USING delta LOCATION 'dbfs:/databricks-datasets/nyctaxi-with-zipcodes/subsampled'
SELECT * 
FROM samples.nyctaxi.trips 
LIMIT 5
#tpep_pickup_datetimetpep_dropoff_datetimetrip_distancefare_amountpickup_zipdropoff_zip
12016-02-14 16:52:13.0002016-02-14 17:16:04.0004.9419.001028210171
22016-02-04 18:44:19.0002016-02-04 18:46:00.0000.283.501011010110
32016-02-17 17:13:57.0002016-02-17 17:17:55.0000.705.001010310023
42016-02-18 10:36:07.0002016-02-18 10:41:45.0000.806.001002210017
52016-02-22 14:14:41.0002016-02-22 14:31:52.0004.5117.001011010282

2. Counter – Total Trips

USE CATALOG SAMPLES;
SELECT
  count(*) as total_trips
FROM
  `samples`.`nyctaxi`.`trips`
WHERE
  tpep_pickup_datetime BETWEEN TIMESTAMP '{{ pickup_date.start }}'
  AND TIMESTAMP '{{ pickup_date.end }}'
  AND pickup_zip IN ({{ pickupzip }})

Counter

3. Table – Route Revenues

USE CATALOG SAMPLES;
SELECT
  T.route as `Route`,
  T.frequency as `Route Frequency`,
  T.total_fare as `Total Fares`
FROM
  (
    SELECT
      concat(pickup_zip, '-', dropoff_zip) AS route,
      count(*) as frequency,
      SUM(fare_amount) as total_fare
    FROM
      `samples`.`nyctaxi`.`trips`
    WHERE
      tpep_pickup_datetime BETWEEN TIMESTAMP '{{ pickup_date.start }}'
      AND TIMESTAMP '{{ pickup_date.end }}'
      AND pickup_zip IN ({{ pickupzip }})
    GROUP BY
      1
  ) T
ORDER BY
  1 ASC
LIMIT
  200

Table

4. Chart – Pickup Hour Distribution

USE CATALOG SAMPLES;
SELECT
  CASE
    WHEN T.pickup_hour = 0 THEN '00:00'
    WHEN T.pickup_hour = 1 THEN '01:00'
    WHEN T.pickup_hour = 2 THEN '02:00'
    WHEN T.pickup_hour = 3 THEN '03:00'
    WHEN T.pickup_hour = 4 THEN '04:00'
    WHEN T.pickup_hour = 5 THEN '05:00'
    WHEN T.pickup_hour = 6 THEN '06:00'
    WHEN T.pickup_hour = 7 THEN '07:00'
    WHEN T.pickup_hour = 8 THEN '08:00'
    WHEN T.pickup_hour = 9 THEN '09:00'
    WHEN T.pickup_hour = 10 THEN '10:00'
    WHEN T.pickup_hour = 11 THEN '11:00'
    WHEN T.pickup_hour = 12 THEN '12:00'
    WHEN T.pickup_hour = 13 THEN '13:00'
    WHEN T.pickup_hour = 14 THEN '14:00'
    WHEN T.pickup_hour = 15 THEN '15:00'
    WHEN T.pickup_hour = 16 THEN '16:00'
    WHEN T.pickup_hour = 17 THEN '17:00'
    WHEN T.pickup_hour = 18 THEN '18:00'
    WHEN T.pickup_hour = 19 THEN '19:00'
    WHEN T.pickup_hour = 20 THEN '20:00'
    WHEN T.pickup_hour = 21 THEN '21:00'
    WHEN T.pickup_hour = 22 THEN '22:00'
    WHEN T.pickup_hour = 23 THEN '23:00'
    ELSE 'N/A'
  END AS `Pickup Hour`,
  T.num AS `Number of Rides`
FROM
  (
    SELECT
      hour(tpep_pickup_datetime) AS pickup_hour,
      COUNT(*) AS num
    FROM
      `samples`.`nyctaxi`.`trips`
    WHERE
      tpep_pickup_datetime BETWEEN TIMESTAMP '{{ pickup_date.start }}'
      AND TIMESTAMP '{{ pickup_date.end }}'
      AND pickup_zip IN ({{ pickupzip }})
    GROUP BY
      1
  ) T

Chart

5. Scatter – Daily Fare to Distance Analysis

USE CATALOG SAMPLES;
SELECT
  T.weekday,
  CASE
    WHEN T.weekday = 1 THEN 'Sunday'
    WHEN T.weekday = 2 THEN 'Monday'
    WHEN T.weekday = 3 THEN 'Tuesday'
    WHEN T.weekday = 4 THEN 'Wednesday'
    WHEN T.weekday = 5 THEN 'Thursday'
    WHEN T.weekday = 6 THEN 'Friday'
    WHEN T.weekday = 7 THEN 'Saturday'
    ELSE 'N/A'
  END AS day_of_week,
  T.fare_amount,
  T.trip_distance
FROM
  (
    SELECT
      dayofweek(tpep_pickup_datetime) as weekday,
      *
    FROM
      `samples`.`nyctaxi`.`trips`
    WHERE
      (
        pickup_zip in ({{ pickupzip }})
        OR pickup_zip in (10018)
      )
      AND tpep_pickup_datetime BETWEEN TIMESTAMP '{{ pickup_date.start }}'
      AND TIMESTAMP '{{ pickup_date.end }}'
      AND trip_distance < 10
  ) T
ORDER BY
  T.weekday

Scatter