Time series analysis for enrollments.total_active_time and active user counts

Pete5484
Community Participant

Wondering if anyone has any suggestions regarding some "time series-like" analytics for enrollments total_activity_time and active user counts?

I'm conscious that CD2 is "eventual consistency" so time series analysis of things like weekly total_activity_time and overall active user count trends arent possible. There's no real way to time travel back to earlier dates. So I'm wondering if something like a SCD could work, maybe on a weekly 4am trigger (but even then enrollments will get huge) though active users wouldn't be a problem. 

I've been thinking of doing something like:

 

INSERT INTO canvas_analytics.enrollments_activity_time
(   timestamp_column,
    year_week,
    user_id,
    enrollment_id,
    course_id,
    course_section_id,
    enrollment_type,
    role_id,
    total_activity_time,
    incremental_activity_time,
    workflow_state
    )
SELECT 
    current_timestamp(),
    CONCAT(
        EXTRACT(YEAR FROM DATE_SUB(current_date(), 1)), 
        '-', 
        LPAD(EXTRACT(WEEK FROM DATE_SUB(current_date(), 1)), 2, '0')
    ) AS year_week,
    e.user_id,
    e.id AS enrollment_id,
    e.course_id,
    e.course_section_id,
    e.type,
    e.role_id,
    e.total_activity_time,
    -- e.total_activity_time AS incremental_activity_time, --first time run only
    e.total_activity_time - ua.max_total_activity_time AS incremental_activity_time, --subsequent runs
    e.workflow_state
FROM canvas.enrollments e
JOIN canvas.pseudonyms p ON e.user_id = p.user_id
LEFT JOIN (
    SELECT enrollment_id, MAX(total_activity_time) AS max_total_activity_time
    FROM canvas_analytics.user_activity
    GROUP BY enrollment_id
) ua ON e.id = ua.enrollment_id
WHERE e.completed_at is NULL    
    AND p.last_request_at >= date_sub(current_date(), 7)
    AND e.type NOT IN ('StudentViewEnrollment', 'DesignerEnrollment','ObserverEnrollment')
    AND e.total_activity_time > ua.max_total_activity_time --remove for first time run only

 

  and

 

INSERT INTO canvas_analytics.user_counts
(timestamp_column, year_week, active_user_count, created_user_count, updated_user_count, deleted_user_count, suspended_user_count)

SELECT 
    current_timestamp(),
    CONCAT(
        EXTRACT(YEAR FROM DATE_SUB(current_date(), 1)), 
        '-', 
        LPAD(EXTRACT(WEEK FROM DATE_SUB(current_date(), 1)), 2, '0')
    ) AS year_week,
    COUNT(DISTINCT p.user_id),
    COUNT(DISTINCT CASE 
                        WHEN YEAR(p.created_at) = YEAR(DATE_SUB(current_date(), 1))
                        THEN p.user_id
                        ELSE NULL
                    END),
    COUNT(DISTINCT CASE 
                        WHEN YEAR(p.updated_at) = YEAR(DATE_SUB(current_date(), 1)) THEN p.user_id
                        ELSE NULL
                    END),
    COUNT(DISTINCT CASE 
                        WHEN YEAR(p.deleted_at) = YEAR(DATE_SUB(current_date(), 1)) THEN p.user_id
                        ELSE NULL
                    END),
    COUNT(DISTINCT CASE 
                    WHEN p.workflow_state IN ('suspended') AND YEAR(p.updated_at)= YEAR(DATE_SUB(current_date(), 1)) THEN p.user_id
                    ELSE NULL
                END)
FROM canvas.users u 
LEFT JOIN canvas.pseudonyms p ON p.user_id = u.id
WHERE YEAR(last_request_at) = YEAR(DATE_SUB(current_date(), 1))

 

 

Labels (2)