BigQuery + Open Source
BigQuery Scheduled Query Open Source Contribution GitHub Archive Data Pipeline ETL Dashboard Looker Studio Community Metrics Contributor Analytics Production
| Metric | SQL Aggregation | Insight | Dashboard |
|---|---|---|---|
| Daily Commits | COUNT WHERE type=PushEvent | Development activity trend | Line chart |
| PR Merge Rate | Merged / Total PRs | Review efficiency | Gauge |
| Issue Response Time | AVG(first_comment - created) | Community responsiveness | Metric card |
| New Contributors | COUNT DISTINCT new actors | Community growth | Bar chart |
| Star Growth | COUNT WatchEvent per day | Project popularity | Area chart |
| Top Contributors | COUNT events GROUP BY actor | Key contributors | Leaderboard |
Scheduled Query Setup
# === BigQuery Scheduled Query ===
# SQL Query — Daily GitHub Contribution Summary
# CREATE OR REPLACE TABLE `project.dataset.daily_contributions`
# PARTITION BY date
# AS
# SELECT
# DATE(created_at) AS date,
# repo.name AS repo_name,
# type AS event_type,
# COUNT(*) AS event_count,
# COUNT(DISTINCT actor.login) AS unique_contributors
# FROM `githubarchive.day.20*`
# WHERE
# _TABLE_SUFFIX = FORMAT_DATE('%y%m%d', @run_date)
# AND repo.name IN ('kubernetes/kubernetes', 'pytorch/pytorch', 'facebook/react')
# GROUP BY date, repo_name, event_type
# ORDER BY date DESC, event_count DESC;
# Schedule via CLI:
# bq mk --transfer_config \
# --project_id=my-project \
# --data_source=scheduled_query \
# --target_dataset=github_analytics \
# --display_name="Daily GitHub Contributions" \
# --schedule="every 24 hours" \
# --params='{
# "query": "SELECT ... FROM githubarchive.day.* ...",
# "destination_table_name_template": "daily_contributions_{run_date}",
# "write_disposition": "WRITE_TRUNCATE"
# }'
# Terraform:
# resource "google_bigquery_data_transfer_config" "github_daily" {
# display_name = "Daily GitHub Contributions"
# data_source_id = "scheduled_query"
# schedule = "every 24 hours"
# destination_dataset_id = google_bigquery_dataset.github.dataset_id
# params = {
# query = file("queries/daily_contributions.sql")
#     destination_table_name_template = "daily_contributions"
#     write_disposition               = "WRITE_APPEND"
#     # NOTE(review): WRITE_APPEND into a static table duplicates rows if the
#     # transfer re-runs; prefer a {run_date} template with WRITE_TRUNCATE
#     # (as in the CLI example above) or a MERGE-based upsert.
# }
# }
from dataclasses import dataclass


@dataclass
class ScheduleConfig:
    """One BigQuery scheduled query: cadence, purpose, target table, cost."""

    name: str
    schedule: str
    query_type: str
    destination: str
    cost_estimate: str


# (name, schedule, query description, destination table, estimated cost)
_SCHEDULE_ROWS = (
    ("Daily Contributions", "every 24 hours", "Aggregate events by repo/type",
     "daily_contributions", "~$0.05/day"),
    ("Weekly Summary", "every week on Monday 08:00", "Weekly rollup metrics",
     "weekly_summary", "~$0.10/week"),
    ("Contributor Leaderboard", "every 24 hours", "Top contributors ranked",
     "contributor_leaderboard", "~$0.03/day"),
    ("PR Metrics", "every 6 hours", "PR open/merge/close rates",
     "pr_metrics", "~$0.08/day"),
    ("Issue Health", "every 12 hours", "Issue response and resolution time",
     "issue_health", "~$0.04/day"),
)
configs = [ScheduleConfig(*row) for row in _SCHEDULE_ROWS]

print("=== Scheduled Queries ===")
for cfg in configs:
    print(f" [{cfg.name}] Schedule: {cfg.schedule}")
    print(f" Query: {cfg.query_type}")
    print(f" Dest: {cfg.destination} | Cost: {cfg.cost_estimate}")
GitHub Analytics Queries
# === GitHub Archive Analytics ===
# Top Contributors Query
# SELECT
# actor.login AS contributor,
# COUNT(*) AS total_events,
# COUNTIF(type = 'PushEvent') AS commits,
# COUNTIF(type = 'PullRequestEvent') AS pull_requests,
# COUNTIF(type = 'IssuesEvent') AS issues,
# COUNTIF(type = 'PullRequestReviewEvent') AS reviews,
# MIN(created_at) AS first_contribution,
# MAX(created_at) AS last_contribution
# FROM `githubarchive.day.20*`
# WHERE repo.name = 'kubernetes/kubernetes'
# AND _TABLE_SUFFIX BETWEEN '240101' AND '241231'
# GROUP BY contributor
# ORDER BY total_events DESC
# LIMIT 50;
# Community Health Score
# WITH metrics AS (
# SELECT
# DATE(created_at) AS date,
# COUNTIF(type = 'PushEvent') AS pushes,
# COUNTIF(type = 'PullRequestEvent') AS prs,
# COUNTIF(type = 'IssuesEvent') AS issues,
# COUNT(DISTINCT actor.login) AS contributors
# FROM `githubarchive.day.20*`
# WHERE repo.name = @repo_name
# GROUP BY date
# )
# SELECT
# date,
# pushes, prs, issues, contributors,
# (pushes * 1 + prs * 3 + issues * 2 + contributors * 5) AS health_score
# FROM metrics
# ORDER BY date DESC;
@dataclass
class RepoMetric:
    """Activity/health snapshot for a single GitHub repository."""

    repo: str
    daily_commits: int
    weekly_prs: int
    open_issues: int
    contributors_30d: int
    health_score: int
    trend: str


# (repo, commits/day, PRs/week, open issues, 30-day contributors, score, trend)
_REPO_ROWS = (
    ("kubernetes/kubernetes", 85, 120, 2500, 450, 92, "Stable"),
    ("pytorch/pytorch", 65, 95, 1800, 380, 88, "Growing"),
    ("facebook/react", 25, 40, 800, 150, 75, "Stable"),
    ("golang/go", 30, 50, 6000, 200, 70, "Stable"),
    ("rust-lang/rust", 45, 80, 3500, 300, 85, "Growing"),
)
repos = [RepoMetric(*row) for row in _REPO_ROWS]

print("\n=== Repository Health Dashboard ===")
for metric in repos:
    print(f" [{metric.repo}] Health: {metric.health_score}/100 | Trend: {metric.trend}")
    print(f" Commits/day: {metric.daily_commits} | PRs/week: {metric.weekly_prs}")
    print(f" Open Issues: {metric.open_issues} | Contributors (30d): {metric.contributors_30d}")
Cost Optimization
# === BigQuery Cost Optimization ===
@dataclass
class CostTip:
    """A BigQuery cost-saving technique: what it is, what it saves, how to apply it."""

    tip: str
    savings: str
    implementation: str


# (tip, expected savings, concrete implementation hint)
_TIP_ROWS = (
    ("Use partition filter", "50-90% query cost reduction",
     "WHERE _TABLE_SUFFIX = FORMAT_DATE('%y%m%d', @run_date)"),
    ("SELECT specific columns", "30-70% reduction",
     "SELECT actor.login, type instead of SELECT *"),
    ("Use LIMIT for exploration", "Variable savings",
     "LIMIT 1000 when exploring data"),
    ("Materialized Views", "Cache repeated aggregations",
     "CREATE MATERIALIZED VIEW for daily summaries"),
    ("Clustering", "Faster filtered queries",
     "CLUSTER BY repo_name, event_type"),
    ("Budget alerts", "Prevent bill shock",
     "Set budget alert at $10/month threshold"),
)
tips = [CostTip(*row) for row in _TIP_ROWS]

print("Cost Optimization Tips:")
for entry in tips:
    print(f" [{entry.tip}] Savings: {entry.savings}")
    print(f" How: {entry.implementation}")
# Rough monthly spend for the whole pipeline; most usage fits the free tier.
monthly_cost = dict(
    [
        ("Scheduled Queries (5)", "$5-10/month"),
        ("Ad-hoc Exploration", "$2-5/month"),
        ("Storage (Summary Tables)", "$0.50/month"),
        ("Looker Studio", "Free"),
        ("Total Estimated", "$8-16/month"),
        ("Free Tier Covers", "Up to 1TB query/month"),
    ]
)
print("\n\nMonthly Cost Estimate:")
for label, amount in monthly_cost.items():
    print(f" [{label}]: {amount}")
เคล็ดลับ
- Partition: ใช้ Partition Filter ทุก Query ลด Cost 50-90%
- MERGE: ใช้ MERGE Statement สำหรับ Upsert ป้องกัน Duplicate
- Terraform: จัดการ Scheduled Query ด้วย Terraform เป็น Code
- Alert: ตั้ง Email Notification เมื่อ Query Fail
- Budget: ตั้ง Budget Alert ป้องกันค่าใช้จ่ายเกิน
BigQuery Scheduled Query คืออะไร
ตั้งเวลารัน SQL อัตโนมัติ ทุกชั่วโมง วัน สัปดาห์ Destination Table ETL Console CLI Terraform @run_time @run_date Email Notification Service Account
วิเคราะห์ Open Source Contribution อย่างไร
GitHub Archive Public Dataset Event Push PR Issues Stars Forks Contributor Activity Community Health Response Time Scheduled Query Dashboard Leaderboard
สร้าง Data Pipeline อย่างไร
Scheduled Query ดึงข้อมูลทุกวัน Transform Aggregate Summary Table Looker Studio Dashboard Alert Anomaly MERGE Upsert Duplicate
ใช้ BigQuery ฟรีได้ไหม
Free Tier 1TB Query 10GB Storage Scheduled Query ฟรี Public Dataset LIMIT Partition Filter Materialized View Budget Alert Billing Dashboard
สรุป
BigQuery Scheduled Query Open Source GitHub Archive Data Pipeline ETL Dashboard Looker Studio Community Metrics Contributor Analytics Cost Optimization Production
