Technology

BigQuery Scheduled Query Open Source Contribution

bigquery scheduled query open source contribution
BigQuery Scheduled Query Open Source Contribution | SiamCafe Blog
2026-04-06· อ. บอม — SiamCafe.net· 9,266 คำ

BigQuery + Open Source

BigQuery Scheduled Query Open Source Contribution GitHub Archive Data Pipeline ETL Dashboard Looker Studio Community Metrics Contributor Analytics Production

| Metric | SQL Aggregation | Insight | Dashboard |
|---------------------|------------------------------|--------------------------|-------------|
| Daily Commits | COUNT WHERE type=PushEvent | Development activity trend | Line chart |
| PR Merge Rate | Merged / Total PRs | Review efficiency | Gauge |
| Issue Response Time | AVG(first_comment - created) | Community responsiveness | Metric card |
| New Contributors | COUNT DISTINCT new actors | Community growth | Bar chart |
| Star Growth | COUNT WatchEvent per day | Project popularity | Area chart |
| Top Contributors | COUNT events GROUP BY actor | Key contributors | Leaderboard |

Scheduled Query Setup

# === BigQuery Scheduled Query ===

# SQL Query — Daily GitHub Contribution Summary
# CREATE OR REPLACE TABLE `project.dataset.daily_contributions`
# PARTITION BY date
# AS
# SELECT
#   DATE(created_at) AS date,
#   repo.name AS repo_name,
#   type AS event_type,
#   COUNT(*) AS event_count,
#   COUNT(DISTINCT actor.login) AS unique_contributors
# FROM `githubarchive.day.20*`
# WHERE
#   _TABLE_SUFFIX = FORMAT_DATE('%y%m%d', @run_date)
#   AND repo.name IN ('kubernetes/kubernetes', 'pytorch/pytorch', 'facebook/react')
# GROUP BY date, repo_name, event_type
# ORDER BY date DESC, event_count DESC;

# Schedule via CLI:
# bq mk --transfer_config \
#   --project_id=my-project \
#   --data_source=scheduled_query \
#   --target_dataset=github_analytics \
#   --display_name="Daily GitHub Contributions" \
#   --schedule="every 24 hours" \
#   --params='{
#     "query": "SELECT ... FROM githubarchive.day.* ...",
#     "destination_table_name_template": "daily_contributions_{run_date}",
#     "write_disposition": "WRITE_TRUNCATE"
#   }'

# Terraform:
# resource "google_bigquery_data_transfer_config" "github_daily" {
#   display_name   = "Daily GitHub Contributions"
#   data_source_id = "scheduled_query"
#   schedule       = "every 24 hours"
#   destination_dataset_id = google_bigquery_dataset.github.dataset_id
#   params = {
#     query                      = file("queries/daily_contributions.sql")
#     destination_table_name_template = "daily_contributions"
#     write_disposition          = "WRITE_APPEND"
#   }
# }

from dataclasses import dataclass

@dataclass
class ScheduleConfig:
    """Configuration record for one BigQuery scheduled query.

    Holds the display metadata printed in the summary below; it does not
    create or run the BigQuery transfer config itself.
    """

    name: str           # human-readable display name of the scheduled query
    schedule: str       # BigQuery schedule expression, e.g. "every 24 hours"
    query_type: str     # short description of what the SQL aggregates
    destination: str    # destination table name in the target dataset
    cost_estimate: str  # rough on-demand query cost, as a display string

# (name, schedule, query description, destination table, cost estimate)
_schedule_rows = [
    ("Daily Contributions", "every 24 hours",
     "Aggregate events by repo/type", "daily_contributions", "~$0.05/day"),
    ("Weekly Summary", "every week on Monday 08:00",
     "Weekly rollup metrics", "weekly_summary", "~$0.10/week"),
    ("Contributor Leaderboard", "every 24 hours",
     "Top contributors ranked", "contributor_leaderboard", "~$0.03/day"),
    ("PR Metrics", "every 6 hours",
     "PR open/merge/close rates", "pr_metrics", "~$0.08/day"),
    ("Issue Health", "every 12 hours",
     "Issue response and resolution time", "issue_health", "~$0.04/day"),
]
configs = [ScheduleConfig(*row) for row in _schedule_rows]

# Print a three-line summary card per scheduled query.
print("=== Scheduled Queries ===")
for cfg in configs:
    summary = (
        f"  [{cfg.name}] Schedule: {cfg.schedule}\n"
        f"    Query: {cfg.query_type}\n"
        f"    Dest: {cfg.destination} | Cost: {cfg.cost_estimate}"
    )
    print(summary)

GitHub Analytics Queries

# === GitHub Archive Analytics ===

# Top Contributors Query
# SELECT
#   actor.login AS contributor,
#   COUNT(*) AS total_events,
#   COUNTIF(type = 'PushEvent') AS commits,
#   COUNTIF(type = 'PullRequestEvent') AS pull_requests,
#   COUNTIF(type = 'IssuesEvent') AS issues,
#   COUNTIF(type = 'PullRequestReviewEvent') AS reviews,
#   MIN(created_at) AS first_contribution,
#   MAX(created_at) AS last_contribution
# FROM `githubarchive.day.20*`
# WHERE repo.name = 'kubernetes/kubernetes'
#   AND _TABLE_SUFFIX BETWEEN '240101' AND '241231'
# GROUP BY contributor
# ORDER BY total_events DESC
# LIMIT 50;

# Community Health Score
# WITH metrics AS (
#   SELECT
#     DATE(created_at) AS date,
#     COUNTIF(type = 'PushEvent') AS pushes,
#     COUNTIF(type = 'PullRequestEvent') AS prs,
#     COUNTIF(type = 'IssuesEvent') AS issues,
#     COUNT(DISTINCT actor.login) AS contributors
#   FROM `githubarchive.day.20*`
#   WHERE repo.name = @repo_name
#   GROUP BY date
# )
# SELECT
#   date,
#   pushes, prs, issues, contributors,
#   (pushes * 1 + prs * 3 + issues * 2 + contributors * 5) AS health_score
# FROM metrics
# ORDER BY date DESC;

@dataclass
class RepoMetric:
    """Snapshot of activity and health figures for one GitHub repository.

    Values below are hard-coded sample data for the dashboard printout;
    in a real pipeline they would come from the scheduled-query tables.
    """

    repo: str              # "owner/name" repository slug
    daily_commits: int     # commits per day (printed as "Commits/day")
    weekly_prs: int        # pull requests per week (printed as "PRs/week")
    open_issues: int       # currently open issue count
    contributors_30d: int  # distinct contributors in the last 30 days
    health_score: int      # composite score, printed as "<score>/100"
    trend: str             # qualitative trend label, e.g. "Stable" / "Growing"

# (repo, commits/day, PRs/week, open issues, contributors 30d, health, trend)
_repo_rows = [
    ("kubernetes/kubernetes", 85, 120, 2500, 450, 92, "Stable"),
    ("pytorch/pytorch", 65, 95, 1800, 380, 88, "Growing"),
    ("facebook/react", 25, 40, 800, 150, 75, "Stable"),
    ("golang/go", 30, 50, 6000, 200, 70, "Stable"),
    ("rust-lang/rust", 45, 80, 3500, 300, 85, "Growing"),
]
repos = [RepoMetric(*row) for row in _repo_rows]

# Print a three-line health card per repository.
print("\n=== Repository Health Dashboard ===")
for metric in repos:
    card = (
        f"  [{metric.repo}] Health: {metric.health_score}/100 | Trend: {metric.trend}\n"
        f"    Commits/day: {metric.daily_commits} | PRs/week: {metric.weekly_prs}\n"
        f"    Open Issues: {metric.open_issues} | Contributors (30d): {metric.contributors_30d}"
    )
    print(card)

Cost Optimization

# === BigQuery Cost Optimization ===

@dataclass
class CostTip:
    """A single BigQuery cost-optimization recommendation for display."""

    tip: str             # short name of the technique
    savings: str         # expected saving or benefit, as a display string
    implementation: str  # concrete example of how to apply the technique

# (technique, expected saving, concrete example)
_tip_rows = [
    ("Use partition filter", "50-90% query cost reduction",
     "WHERE _TABLE_SUFFIX = FORMAT_DATE('%y%m%d', @run_date)"),
    ("SELECT specific columns", "30-70% reduction",
     "SELECT actor.login, type instead of SELECT *"),
    ("Use LIMIT for exploration", "Variable savings",
     "LIMIT 1000 when exploring data"),
    ("Materialized Views", "Cache repeated aggregations",
     "CREATE MATERIALIZED VIEW for daily summaries"),
    ("Clustering", "Faster filtered queries",
     "CLUSTER BY repo_name, event_type"),
    ("Budget alerts", "Prevent bill shock",
     "Set budget alert at $10/month threshold"),
]
tips = [CostTip(*row) for row in _tip_rows]

# Print each tip with its saving and a concrete how-to line.
print("Cost Optimization Tips:")
for hint in tips:
    print(f"  [{hint.tip}] Savings: {hint.savings}")
    print(f"    How: {hint.implementation}")

# Ordered (label, amount) pairs; dict() preserves this insertion order.
_cost_pairs = [
    ("Scheduled Queries (5)", "$5-10/month"),
    ("Ad-hoc Exploration", "$2-5/month"),
    ("Storage (Summary Tables)", "$0.50/month"),
    ("Looker Studio", "Free"),
    ("Total Estimated", "$8-16/month"),
    ("Free Tier Covers", "Up to 1TB query/month"),
]
monthly_cost = dict(_cost_pairs)

# Print one "[label]: amount" line per cost item.
print("\n\nMonthly Cost Estimate:")
for label, amount in monthly_cost.items():
    print(f"  [{label}]: {amount}")

เคล็ดลับ

BigQuery Scheduled Query คืออะไร

ตั้งเวลารัน SQL อัตโนมัติ ทุกชั่วโมง วัน สัปดาห์ Destination Table ETL Console CLI Terraform @run_time @run_date Email Notification Service Account

วิเคราะห์ Open Source Contribution อย่างไร

GitHub Archive Public Dataset Event Push PR Issues Stars Forks Contributor Activity Community Health Response Time Scheduled Query Dashboard Leaderboard

สร้าง Data Pipeline อย่างไร

Scheduled Query ดึงข้อมูลทุกวัน Transform Aggregate Summary Table Looker Studio Dashboard Alert Anomaly MERGE Upsert Duplicate

ใช้ BigQuery ฟรีได้ไหม

Free Tier 1TB Query 10GB Storage Scheduled Query ฟรี Public Dataset LIMIT Partition Filter Materialized View Budget Alert Billing Dashboard

สรุป

BigQuery Scheduled Query Open Source GitHub Archive Data Pipeline ETL Dashboard Looker Studio Community Metrics Contributor Analytics Cost Optimization Production

📖 บทความที่เกี่ยวข้อง

- BigQuery Scheduled Query Distributed System — อ่านบทความ →
- BigQuery Scheduled Query Zero Downtime Deployment — อ่านบทความ →
- Linux Namespaces Open Source Contribution — อ่านบทความ →
- React Query TanStack Open Source Contribution — อ่านบทความ →
- BigQuery Scheduled Query Team Productivity — อ่านบทความ →

📚 ดูบทความทั้งหมด →