BigQuery Scheduled Query Open Source
BigQuery + Open Source
BigQuery Scheduled Query Open Source Contribution GitHub Archive Data Pipeline ETL Dashboard Looker Studio Community Metrics Contributor Analytics Production
| Metric | SQL Aggregation | Insight | Dashboard |
|---|---|---|---|
| Daily Commits | COUNT WHERE type=PushEvent | Development activity trend | Line chart |
| PR Merge Rate | Merged / Total PRs | Review efficiency | Gauge |
| Issue Response Time | AVG(first_comment - created) | Community responsiveness | Metric card |
| New Contributors | COUNT DISTINCT new actors | Community growth | Bar chart |
| Star Growth | COUNT WatchEvent per day | Project popularity | Area chart |
| Top Contributors | COUNT events GROUP BY actor | Key contributors | Leaderboard |
Scheduled Query Setup
=== BigQuery Scheduled Query ===
SQL Query — Daily GitHub Contribution Summary
CREATE OR REPLACE TABLE `project.dataset.daily_contributions`
PARTITION BY date
AS
SELECT
DATE(created_at) AS date,
repo.name AS repo_name,
type AS event_type,
COUNT(*) AS event_count,
COUNT(DISTINCT actor.login) AS unique_contributors
FROM `githubarchive.day.20*`
WHERE
_TABLE_SUFFIX = FORMAT_DATE('%y%m%d', @run_date)
AND repo.name IN ('kubernetes/kubernetes', 'pytorch/pytorch', 'facebook/react')
GROUP BY date, repo_name, event_type
ORDER BY date DESC, event_count DESC;
Schedule via CLI:
bq mk --transfer_config \
--project_id=my-project \
--data_source=scheduled_query \
--target_dataset=github_analytics \
--display_name="Daily GitHub Contributions" \
--schedule="every 24 hours" \
--params='{
"query": "SELECT ... FROM githubarchive.day.* ...",
"destination_table_name_template": "daily_contributions_{run_date}",
"write_disposition": "WRITE_TRUNCATE"
}'
Terraform:
resource "google_bigquery_data_transfer_config" "github_daily" {
display_name = "Daily GitHub Contributions"
data_source_id = "scheduled_query"
schedule = "every 24 hours"
destination_dataset_id = google_bigquery_dataset.github.dataset_id
params = {
query = file("queries/daily_contributions.sql")
destination_table_name_template = "daily_contributions"
write_disposition = "WRITE_APPEND"
}
}
from dataclasses import dataclass
@dataclass
class ScheduleConfig:
name: str
schedule: str
query_type: str
destination: str
cost_estimate: str
configs = [
ScheduleConfig("Daily Contributions", "every 24 hours", "Aggregate events by repo/type",
"daily_contributions", "~$0.05/day"),
ScheduleConfig("Weekly Summary", "every week on Monday 08:00", "Weekly rollup metrics",
"weekly_summary", "~$0.10/week"),
ScheduleConfig("Contributor Leaderboard", "every 24 hours", "Top contributors ranked",
"contributor_leaderboard", "~$0.03/day"),
ScheduleConfig("PR Metrics", "every 6 hours", "PR open/merge/close rates",
"pr_metrics", "~$0.08/day"),
ScheduleConfig("Issue Health", "every 12 hours", "Issue response and resolution time",
"issue_health", "~$0.04/day"),
]
print("=== Scheduled Queries ===")
for c in configs:
print(f" [{c.name}] Schedule: {c.schedule}")
print(f" Query: {c.query_type}")
print(f" Dest: {c.destination} | Cost: {c.cost_estimate}")
GitHub Analytics Queries
=== GitHub Archive Analytics ===
Top Contributors Query
SELECT
actor.login AS contributor,
COUNT(*) AS total_events,
COUNTIF(type = 'PushEvent') AS commits,
COUNTIF(type = 'PullRequestEvent') AS pull_requests,
COUNTIF(type = 'IssuesEvent') AS issues,
COUNTIF(type = 'PullRequestReviewEvent') AS reviews,
MIN(created_at) AS first_contribution,
MAX(created_at) AS last_contribution
FROM `githubarchive.day.20*`
WHERE repo.name = 'kubernetes/kubernetes'
AND _TABLE_SUFFIX BETWEEN '240101' AND '241231'
GROUP BY contributor
ORDER BY total_events DESC
LIMIT 50;
Community Health Score
WITH metrics AS (
SELECT
DATE(created_at) AS date,
COUNTIF(type = 'PushEvent') AS pushes,
COUNTIF(type = 'PullRequestEvent') AS prs,
COUNTIF(type = 'IssuesEvent') AS issues,
COUNT(DISTINCT actor.login) AS contributors
FROM `githubarchive.day.20*`
WHERE repo.name = @repo_name
GROUP BY date
)
SELECT
date,
pushes, prs, issues, contributors,
(pushes * 1 + prs * 3 + issues * 2 + contributors * 5) AS health_score
FROM metrics
ORDER BY date DESC;
@dataclass
class RepoMetric:
repo: str
daily_commits: int
weekly_prs: int
open_issues: int
contributors_30d: int
health_score: int
trend: str
repos = [
RepoMetric("kubernetes/kubernetes", 85, 120, 2500, 450, 92, "Stable"),
RepoMetric("pytorch/pytorch", 65, 95, 1800, 380, 88, "Growing"),
RepoMetric("facebook/react", 25, 40, 800, 150, 75, "Stable"),
RepoMetric("golang/go", 30, 50, 6000, 200, 70, "Stable"),
RepoMetric("rust-lang/rust", 45, 80, 3500, 300, 85, "Growing"),
]
print("\n=== Repository Health Dashboard ===")
for r in repos:
print(f" [{r.repo}] Health: {r.health_score}/100 | Trend: {r.trend}")
print(f" Commits/day: {r.daily_commits} | PRs/week: {r.weekly_prs}")
print(f" Open Issues: {r.open_issues} | Contributors (30d): {r.contributors_30d}")
Cost Optimization
# === BigQuery Cost Optimization ===
@dataclass
class CostTip:
tip: str
savings: str
implementation: str
tips = [
CostTip("Use partition filter", "50-90% query cost reduction",
"WHERE _TABLE_SUFFIX = FORMAT_DATE('%y%m%d', @run_date)"),
CostTip("SELECT specific columns", "30-70% reduction",
"SELECT actor.login, type instead of SELECT *"),
CostTip("Use LIMIT for exploration", "Variable savings",
"LIMIT 1000 when exploring data"),
CostTip("Materialized Views", "Cache repeated aggregations",
"CREATE MATERIALIZED VIEW for daily summaries"),
CostTip("Clustering", "Faster filtered queries",
"CLUSTER BY repo_name, event_type"),
CostTip("Budget alerts", "Prevent bill shock",
"Set budget alert at $10/month threshold"),
]
print("Cost Optimization Tips:")
for t in tips:
print(f" [{t.tip}] Savings: {t.savings}")
print(f" How: {t.implementation}")
monthly_cost = {
"Scheduled Queries (5)": "$5-10/month",
"Ad-hoc Exploration": "$2-5/month",
"Storage (Summary Tables)": "$0.50/month",
"Looker Studio": "Free",
"Total Estimated": "$8-16/month",
"Free Tier Covers": "Up to 1TB query/month",
}
print(f"\n\nMonthly Cost Estimate:")
for k, v in monthly_cost.items():
print(f" [{k}]: {v}")
เคล็ดลับ
- Partition: ใช้ Partition Filter ทุก Query ลด Cost 50-90%
- MERGE: ใช้ MERGE Statement สำหรับ Upsert ป้องกัน Duplicate
- Terraform: จัดการ Scheduled Query ด้วย Terraform เป็น Code
- Alert: ตั้ง Email Notification เมื่อ Query Fail
- Budget: ตั้ง Budget Alert ป้องกันค่าใช้จ่ายเกิน
BigQuery Scheduled Query คืออะไร
ตั้งเวลารัน SQL อัตโนมัติ ทุกชั่วโมง วัน สัปดาห์ Destination Table ETL Console CLI Terraform @run_time @run_date Email Notification Service Account