I recently came accross the hackernews dataset on bigquery public directory. I played with it a little bit and then decided to make this post. We’re going to use the bigquery-public-data.hacker_news.full
table to build a few analytics metrics such as MOM growth of users, churn, retention and … for hackernews forum. The engagement and activity data that we have access to is basically limited to comments and stories on hn. Which is the reason why these metrics must be way off and wrong by defention. But yet I find it interesting to build such metrics using a public dataset, from outside of a company.
WITH
monthly_activity AS (
SELECT
DATE_TRUNC(DATE(timestamp),MONTH) AS month,
CAST(COUNT(DISTINCT `by` ) AS FLOAT64) AS mau
FROM
bigquery-public-data.hacker_news.full
GROUP BY
month ),
total_users AS (
SELECT
DISTINCT DATE_TRUNC(DATE(timestamp),MONTH) AS month,
COUNT(`by`) AS total
FROM
bigquery-public-data.hacker_news.full
GROUP BY
1 )
SELECT
monthly_activity.month,
ROUND((monthly_activity.mau/total_users.total)*100,2) AS percentage_of_active_users
FROM
monthly_activity
JOIN
total_users
ON
monthly_activity.month = total_users.month
WHERE
monthly_activity.month >= DATE("2007-02-01")
ORDER BY
monthly_activity.month DESC
WITH
monthly_activity AS (
SELECT
DISTINCT DATE_TRUNC(DATE(timestamp),MONTH) AS month,
`by` AS userid
FROM
bigquery-public-data.hacker_news.full
),
all_users_running_sum AS (
SELECT
month,
SUM(all_users) OVER (ORDER BY month) AS running_sum
FROM (
SELECT
DISTINCT DATE_TRUNC(DATE(timestamp),MONTH) AS month,
COUNT(`by`) AS all_users
FROM
bigquery-public-DATA.hacker_news.full
GROUP BY 1
) user_cnt )
SELECT
churn.month,
ROUND((churn.churned_users/ (all_users_running_sum.running_sum))*100.00000,5) AS churn_percentage
FROM (
SELECT
DATE_ADD(last_month.month, INTERVAL 1 MONTH) AS month,
COUNT( DISTINCT last_month.userid) AS churned_users
FROM
monthly_activity last_month
LEFT JOIN monthly_activity this_month
ON
this_month.userid = last_month.userid
AND this_month.month = DATE_ADD(last_month.month, INTERVAL 1 MONTH)
WHERE
this_month.userid IS NULL
GROUP BY
last_month.month
ORDER BY month DESC ) churn
JOIN
all_users_running_sum
ON
churn.month = all_users_running_sum.month
WHERE
churn.month > DATE("2010-01-01")
ORDER BY month DESC
WITH
monthly_activity AS (
SELECT
DATE_TRUNC(DATE(timestamp),MONTH) AS month,
CAST(COUNT(DISTINCT `by` ) AS FLOAT64) AS mau
FROM
bigquery-public-data.hacker_news.full
GROUP BY
month )
SELECT
this_month.month,
ROUND(((this_month.mau - last_month.mau)/last_month.mau)*100,2) AS MOM_MAU
FROM
monthly_activity this_month
JOIN
monthly_activity last_month
ON
this_month.month = DATE_ADD(last_month.month, INTERVAL 1 MONTH)
WHERE
this_month.month < ( SELECT MAX(month) FROM monthly_activity)
AND this_month.month > DATE("2007-06-01")
ORDER BY this_month.month DESC
WITH
new_users AS (
SELECT
`by` AS userid,
MIN(DATE_TRUNC(DATE(timestamp),MONTH)) AS first_month,
FROM
bigquery-public-data.hacker_news.full
GROUP BY
userid )
SELECT
new_users.first_month AS month,
COUNT(new_users.userid) AS new_users_cnt
FROM
new_users
WHERE
new_users.first_month < ( SELECT MAX(first_month) FROM new_users)
GROUP BY
new_users.first_month
ORDER BY
new_users.first_month DESC
WITH
monthly_activity AS (
SELECT
DISTINCT DATE_TRUNC(DATE(timestamp),MONTH) AS month,
`by` AS userid
FROM
bigquery-public-data.hacker_news.full ),
first_activity AS (
SELECT
`by` AS userid,
MIN(DATE_TRUNC(DATE(timestamp),MONTH)) AS month,
FROM
bigquery-public-data.hacker_news.full
GROUP BY
userid ),
total_users AS (
SELECT
DISTINCT DATE_TRUNC(DATE(timestamp),MONTH) AS month,
COUNT(DISTINCT `by`) AS total
FROM
bigquery-public-data.hacker_news.full
GROUP BY
1 )
SELECT
reactivated.month,
ROUND((reactivated.reactivated_users / total_users.total)*100,2) AS reactivation_percentage
FROM (
SELECT
this_month.month AS month,
COUNT(DISTINCT this_month.userid) AS reactivated_users,
FROM
monthly_activity this_month
LEFT JOIN
monthly_activity last_month
ON
this_month.userid = last_month.userid
AND this_month.month = DATE_ADD(last_month.month, INTERVAL 1 MONTH)
LEFT JOIN
first_activity
ON
this_month.userid = first_activity.userid
AND first_activity.month != this_month.month
WHERE
last_month.userid IS NULL
AND this_month.month < (
SELECT
MAX(month)
FROM
monthly_activity)
GROUP BY
this_month.month ) reactivated
LEFT JOIN
total_users
ON
reactivated.month = total_users.month
WHERE
DATE_TRUNC(reactivated.month,MONTH) >= DATE("2007-05-01")
ORDER BY
reactivated.month DESC
WITH
January2012Pool AS (
SELECT
DISTINCT `by` AS userid
FROM
bigquery-public-data.hacker_news.full
WHERE
DATE_TRUNC(DATE(timestamp),MONTH) = "2012-01-01" ),
monthly_activity AS (
SELECT
DATE_TRUNC(DATE(timestamp),MONTH) AS month,
`by` AS userid
FROM
bigquery-public-data.hacker_news.full )
SELECT
monthly_activity.month,
COUNT(DISTINCT monthly_activity.userid) AS still_active_users
FROM
monthly_activity
WHERE
DATE_TRUNC(monthly_activity.month,YEAR) >= DATE("2012-01-01")
AND userid IN ( SELECT * FROM January2012Pool)
GROUP BY 1
ORDER BY
monthly_activity.month ASC
WITH
monthly_activity AS (
SELECT
DATE_TRUNC(DATE(timestamp),MONTH) AS month,
`by` AS userid
FROM
bigquery-public-data.hacker_news.full ),
total_users AS (
SELECT
DISTINCT DATE_TRUNC(DATE(timestamp),MONTH) AS month,
COUNT(DISTINCT `by`) AS total
FROM
bigquery-public-data.hacker_news.full
GROUP BY
1 )
SELECT
retained.month,
ROUND(((retained.retained_users / total_users.total))*100,2) AS retained_percentage
FROM (
SELECT
this_month.month,
COUNT(DISTINCT this_month.userid) AS retained_users
FROM
monthly_activity this_month
JOIN
monthly_activity last_month
ON
this_month.month = DATE_ADD(last_month.month, INTERVAL 1 MONTH)
AND this_month.userid = last_month.userid
GROUP BY
this_month.month) retained
LEFT JOIN
total_users
ON
retained.month = total_users.month
WHERE
DATE_TRUNC(retained.month,MONTH) >= DATE("2007-05-01")
ORDER BY
month DESC