About this Block
BigQuery allows for the creation of User Defined SQL function written in either SQL or Javascript. This block shows how to create and use a MEDIAN function written in SQL.
LookML explores have a parameter sql_preamble: that allow you to specify sql statements that execute before the query is run. BigQuery has a CREATE TEMP FUNCTION statement that lets you create user defined functions. We have written a function computes median and added it to the explore.
Every query that runs from this explore will have the median function inserted before the SQL code for the query.
explore: sf_salary {
sql_preamble:
CREATE TEMP FUNCTION MEDIAN(a_num ARRAY<FLOAT64>)
RETURNS FLOAT64 AS ((
SELECT
AVG(num)
FROM (
SELECT
row_number() OVER (ORDER BY num) -1 as rn
, num
FROM UNNEST(a_num) num
)
WHERE
rn = TRUNC(ARRAY_LENGTH(a_num)/2)
OR (
MOD(ARRAY_LENGTH(a_num), 2) = 0 AND
rn = TRUNC(ARRAY_LENGTH(a_num)/2)-1 )
));
;;
}
Functions in BigQuery can take Array of any type (read about arrays in BigQuery here). We first collect all the numeric values into an array, useing the ARRAY_AGG() and then pass the array to the MEDIAN() user defined function which scans the array and looks for the median value.
measure: average_base_pay {
type: average
sql: ${pay} ;;
value_format_name: "decimal_0"
}
measure: median_base_pay {
type: number
sql: MEDIAN(ARRAY_AGG(${pay})) ;;
value_format_name: "decimal_0"
}
connection: "bigquery_publicdata_standard_sql"
explore: sf_salary {
sql_preamble:
CREATE TEMP FUNCTION MEDIAN(a_num ARRAY<FLOAT64>)
RETURNS FLOAT64 AS ((
SELECT
AVG(num)
FROM (
SELECT
row_number() OVER (ORDER BY num) -1 as rn
, num
FROM UNNEST(a_num) num
)
WHERE
rn = TRUNC(ARRAY_LENGTH(a_num)/2)
OR (
MOD(ARRAY_LENGTH(a_num), 2) = 0 AND
rn = TRUNC(ARRAY_LENGTH(a_num)/2)-1 )
));
;;
}
view: sf_salary {
sql_table_name: `lookerdata.sfsalary.salaries` ;;
measure: count {
type: count
drill_fields: [detail*]
}
dimension: id {
type: number
sql: ${TABLE}.Id ;;
}
dimension: employee_name {
type: string
sql: ${TABLE}.EmployeeName ;;
}
dimension: job_title {
type: string
sql: ${TABLE}.JobTitle ;;
}
# Pay is stored as a string, safely convert it to a float.
dimension: pay {
type: string
sql: CASE WHEN
REGEXP_CONTAINS(${TABLE}.TotalPay, r'^[\d\.]+$')
THEN CAST(${TABLE}.TotalPay AS FLOAT64)
END ;;
}
measure: average_base_pay {
type: average
sql: ${pay} ;;
value_format_name: "decimal_0"
}
measure: median_base_pay {
type: number
sql: MEDIAN(ARRAY_AGG(${pay})) ;;
value_format_name: "decimal_0"
}
dimension: year {
type: string
sql: ${TABLE}.Year ;;
}
dimension: status {
type: string
sql: ${TABLE}.Status ;;
}
set: detail {
fields: [
id,
employee_name,
job_title,
pay,
year,
status
]
}
}
The SQL for the Query
CREATE TEMP FUNCTION MEDIAN(a_num ARRAY<FLOAT64>)
RETURNS FLOAT64 AS ((
SELECT
AVG(num)
FROM (
SELECT
row_number() OVER (ORDER BY num) -1 as rn
, num
FROM UNNEST(a_num) num
)
WHERE
rn = TRUNC(ARRAY_LENGTH(a_num)/2)
OR (
MOD(ARRAY_LENGTH(a_num), 2) = 0 AND
rn = TRUNC(ARRAY_LENGTH(a_num)/2)-1 )
));
SELECT
sf_salary.JobTitle AS sf_salary_job_title,
COUNT(*) AS sf_salary_count,
MEDIAN(ARRAY_AGG((CASE WHEN
REGEXP_CONTAINS(sf_salary.TotalPay, r'^[\d\.]+$')
THEN CAST(sf_salary.TotalPay AS FLOAT64)
END))) AS sf_salary_median_base_pay,
AVG((CASE WHEN
REGEXP_CONTAINS(sf_salary.TotalPay, r'^[\d\.]+$')
THEN CAST(sf_salary.TotalPay AS FLOAT64)
END) ) AS sf_salary_average_base_pay
FROM `lookerdata.sfsalary.salaries` AS sf_salary
GROUP BY 1
ORDER BY 2 DESC
LIMIT 500