Databricks Unity Catalog
Unity Catalog provides centralized governance for Databricks Lakehouse data assets — tables, views, volumes, models, and functions — through a 3-level namespace (catalog.schema.table), fine-grained access control, data lineage, and audit logs.
Interview Preparation Data Engineering Delta Lake Spark Optimization Medallion Architecture ETL Patterns Data Quality Cost Optimization
Unity Catalog Architecture
# unity_catalog.py — Databricks Unity Catalog
from dataclasses import dataclass, field
from typing import List, Dict, Optional
@dataclass
class Table:
    """A table registered under a Unity Catalog schema."""
    name: str
    table_type: str  # MANAGED, EXTERNAL
    format: str  # DELTA, PARQUET, CSV
    # Column descriptors, e.g. [{"name": "event_id", "type": "STRING"}]
    columns: List[Dict[str, str]] = field(default_factory=list)
    owner: str = ""
@dataclass
class Schema:
    """A schema (database) inside a catalog, holding its tables."""
    name: str
    tables: List[Table] = field(default_factory=list)
    owner: str = ""
@dataclass
class Catalog:
    """Top level of the 3-level namespace; holds schemas."""
    name: str
    schemas: List[Schema] = field(default_factory=list)
    owner: str = ""
class UnityCatalog:
    """In-memory model of Unity Catalog's catalog.schema.table hierarchy
    plus a flat record of privilege grants (bookkeeping only — nothing is
    enforced)."""

    def __init__(self):
        self.catalogs: List[Catalog] = []
        self.grants: List[dict] = []

    def _find_catalog(self, name: str) -> Optional[Catalog]:
        """Return the catalog named *name*, or None if it does not exist."""
        return next((c for c in self.catalogs if c.name == name), None)

    def create_catalog(self, name: str, owner: str) -> Catalog:
        """Register a new catalog and return it."""
        cat = Catalog(name=name, owner=owner)
        self.catalogs.append(cat)
        return cat

    def create_schema(self, catalog_name: str, schema_name: str, owner: str):
        """Create a schema inside an existing catalog and return it.

        Raises:
            ValueError: if the catalog does not exist. (The previous
                implementation silently returned None, hiding the error.)
        """
        cat = self._find_catalog(catalog_name)
        if cat is None:
            raise ValueError(f"catalog not found: {catalog_name}")
        schema = Schema(name=schema_name, owner=owner)
        cat.schemas.append(schema)
        return schema

    def create_table(self, catalog: str, schema: str, table: Table):
        """Attach *table* under catalog.schema.

        Raises:
            ValueError: if the catalog or schema does not exist. (The
                previous implementation silently dropped the table.)
        """
        cat = self._find_catalog(catalog)
        if cat is None:
            raise ValueError(f"catalog not found: {catalog}")
        sch = next((s for s in cat.schemas if s.name == schema), None)
        if sch is None:
            raise ValueError(f"schema not found: {catalog}.{schema}")
        sch.tables.append(table)

    def grant(self, principal: str, privilege: str, target: str):
        """Record a privilege grant for later display; no enforcement."""
        self.grants.append({
            "principal": principal,
            "privilege": privilege,
            "target": target,
        })

    def show_hierarchy(self):
        """Print the catalog -> schema -> table tree to stdout."""
        print(f"\n{'='*55}")
        print(f"Unity Catalog Hierarchy")
        print(f"{'='*55}")
        for cat in self.catalogs:
            print(f"\n Catalog: {cat.name} (Owner: {cat.owner})")
            for schema in cat.schemas:
                print(f" Schema: {schema.name}")
                for table in schema.tables:
                    print(f" Table: {table.name} [{table.table_type}] ({table.format})")
# -- SQL Commands --
# CREATE CATALOG production;
# CREATE SCHEMA production.bronze;
# CREATE SCHEMA production.silver;
# CREATE SCHEMA production.gold;
#
# CREATE TABLE production.bronze.raw_events (
# event_id STRING,
# event_type STRING,
# user_id STRING,
# timestamp TIMESTAMP,
# payload STRING
# ) USING DELTA;
#
# -- Access Control
# GRANT USE CATALOG ON CATALOG production TO `data-engineers`;
# GRANT USE SCHEMA ON SCHEMA production.bronze TO `data-engineers`;
# GRANT SELECT ON TABLE production.bronze.raw_events TO `data-analysts`;
# GRANT MODIFY ON TABLE production.silver.users TO `etl-service`;
#
# -- Column-level Security
# GRANT SELECT (user_id, event_type) ON TABLE production.bronze.raw_events TO `limited-access`;
# Demo: build a "production" catalog with medallion-layer schemas and tables.
uc = UnityCatalog()
cat = uc.create_catalog("production", "admin")

for layer, layer_owner in (
    ("bronze", "data-engineers"),
    ("silver", "data-engineers"),
    ("gold", "data-analysts"),
):
    uc.create_schema("production", layer, layer_owner)

# Raw (bronze) tables land first.
tables = [
    Table("raw_events", "MANAGED", "DELTA", [{"name": "event_id", "type": "STRING"}]),
    Table("raw_users", "MANAGED", "DELTA", [{"name": "user_id", "type": "STRING"}]),
]
for raw_table in tables:
    uc.create_table("production", "bronze", raw_table)

# Curated layers.
uc.create_table("production", "silver", Table("clean_events", "MANAGED", "DELTA"))
uc.create_table("production", "gold", Table("daily_metrics", "MANAGED", "DELTA"))

uc.grant("data-engineers", "USE CATALOG", "production")
uc.grant("data-analysts", "SELECT", "production.gold.*")

uc.show_hierarchy()

print(f"\n Grants:")
for entry in uc.grants:
    print(f" GRANT {entry['privilege']} ON {entry['target']} TO {entry['principal']}")
Interview Questions
# interview_questions.py — Databricks Interview Prep
# Interview prep material: topic -> Thai question list + English key concepts.
# Consumed verbatim by the printing loop below; values are display data.
interview = {
    "Delta Lake": {
        "questions": [
            "Delta Lake คืออะไร ต่างจาก Parquet อย่างไร",
            "ACID Transactions ใน Delta Lake ทำงานอย่างไร",
            "Time Travel คืออะไร ใช้อย่างไร",
            "Z-Ordering คืออะไร เมื่อไหร่ควรใช้",
            "VACUUM ทำอะไร ตั้ง Retention อย่างไร",
            "OPTIMIZE ทำอะไร ต่างจาก ZORDER อย่างไร",
        ],
        "key_concepts": "Transaction Log, ACID, Schema Evolution, CDF",
    },
    "Unity Catalog": {
        "questions": [
            "Unity Catalog คืออะไร ต่างจาก Hive Metastore อย่างไร",
            "3-Level Namespace คืออะไร",
            "Fine-grained Access Control ตั้งค่าอย่างไร",
            "Data Lineage ใช้อย่างไร",
            "External Tables vs Managed Tables",
            "Column-level Security ทำอย่างไร",
        ],
        "key_concepts": "RBAC, Lineage, Audit, Governance",
    },
    "Spark Optimization": {
        "questions": [
            "Spark Partitioning ตั้งค่าอย่างไร",
            "Broadcast Join ใช้เมื่อไหร่",
            "Shuffle คืออะไร ลดอย่างไร",
            "Caching vs Persist ต่างกันอย่างไร",
            "AQE (Adaptive Query Execution) คืออะไร",
            "Skew Join Optimization ทำอย่างไร",
        ],
        "key_concepts": "Partitioning, Shuffle, AQE, Catalyst",
    },
    "Medallion Architecture": {
        "questions": [
            "Medallion Architecture คืออะไร",
            "Bronze, Silver, Gold แตกต่างกันอย่างไร",
            "Data Quality ตรวจสอบที่ Layer ไหน",
            "SCD Type 2 ทำที่ Layer ไหน อย่างไร",
            "Streaming + Batch ใน Medallion ทำอย่างไร",
        ],
        "key_concepts": "Bronze (Raw), Silver (Clean), Gold (Business)",
    },
}
# Dump every topic with its key concepts followed by its question list.
print("Databricks Interview Questions:")
for topic_name, topic_info in interview.items():
    print(f"\n [{topic_name}]")
    print(f" Key Concepts: {topic_info['key_concepts']}")
    for question in topic_info["questions"]:
        print(f" Q: {question}")
PySpark Code Examples
# pyspark_examples.py — PySpark for Interview
# from pyspark.sql import SparkSession
# from pyspark.sql.functions import col, count, sum, avg, when, lit
# from delta.tables import DeltaTable
# spark = SparkSession.builder.appName("interview").getOrCreate()
# 1. Delta Lake Operations
# -- Create Delta Table
# CREATE TABLE production.bronze.events
# USING DELTA
# PARTITIONED BY (event_date)
# AS SELECT * FROM raw_events;
# -- Time Travel
# SELECT * FROM production.bronze.events VERSION AS OF 5;
# SELECT * FROM production.bronze.events TIMESTAMP AS OF '2024-01-15';
# -- MERGE (Upsert)
# MERGE INTO production.silver.users AS target
# USING staging.new_users AS source
# ON target.user_id = source.user_id
# WHEN MATCHED THEN UPDATE SET *
# WHEN NOT MATCHED THEN INSERT *;
# -- Z-Ordering
# OPTIMIZE production.silver.events ZORDER BY (user_id, event_type);
# -- VACUUM
# VACUUM production.bronze.events RETAIN 168 HOURS;
# 2. PySpark Optimization
# df = spark.read.table("production.bronze.events")
#
# # Broadcast Join (Small Table < 10MB)
# from pyspark.sql.functions import broadcast
# small_df = spark.read.table("production.silver.dim_users")
# result = df.join(broadcast(small_df), "user_id")
#
# # Partitioning
# df.repartition(200, "event_date") \
# .write.mode("overwrite") \
# .partitionBy("event_date") \
# .saveAsTable("production.silver.events")
#
# # Caching
# df.cache() # MEMORY_ONLY
# from pyspark import StorageLevel
# df.persist(StorageLevel.MEMORY_AND_DISK)
#
# # AQE
# spark.conf.set("spark.sql.adaptive.enabled", "true")
# spark.conf.set("spark.sql.adaptive.coalescePartitions.enabled", "true")
# spark.conf.set("spark.sql.adaptive.skewJoin.enabled", "true")
# Certification Path
# Certification name -> study metadata (difficulty/prep time in Thai).
# Display data only; printed by the loop below.
certs = {
    "Databricks Certified Data Engineer Associate": {
        "topics": "Delta Lake, ELT, Workflows, Unity Catalog Basics",
        "difficulty": "ปานกลาง",
        "prep_time": "2-4 สัปดาห์",
        "cost": "$200",
    },
    "Databricks Certified Data Engineer Professional": {
        "topics": "Advanced Delta, Streaming, Optimization, Production",
        "difficulty": "ยาก",
        "prep_time": "4-8 สัปดาห์",
        "cost": "$300",
    },
    "Databricks Certified ML Professional": {
        "topics": "MLflow, Feature Store, Model Serving, AutoML",
        "difficulty": "ยาก",
        "prep_time": "4-8 สัปดาห์",
        "cost": "$300",
    },
}
# Print each certification with all of its metadata fields.
print("\nDatabricks Certifications:")
for cert_name, cert_info in certs.items():
    print(f"\n [{cert_name}]")
    for field_name, field_value in cert_info.items():
        print(f" {field_name}: {field_value}")
เคล็ดลับ
- Hands-on: ฝึกบน Databricks Community Edition ฟรี
- Delta Lake: เข้าใจ Transaction Log, ACID, Time Travel ลึก
- Unity Catalog: รู้ 3-Level Namespace, RBAC, Lineage
- Medallion: อธิบาย Bronze/Silver/Gold ได้ชัดเจน
- Optimization: รู้ Broadcast Join, AQE, Z-Ordering
- Certification: สอบ Associate ก่อน แล้วค่อย Professional
Unity Catalog คืออะไร
Centralized Governance Databricks Lakehouse Data Assets Tables Views 3-Level Namespace Catalog.Schema.Table Fine-grained Access Control Data Lineage Audit Logs
Unity Catalog ต่างจาก Hive Metastore อย่างไร
Hive Metastore เป็นระบบ Legacy ที่ผูกกับ Workspace เดียวและไม่มี Fine-grained Access Control ส่วน Unity Catalog ใช้งานร่วมกันได้ข้าม Workspaces และรองรับ RBAC, Column-level/Row-level Security, Data Lineage, Audit Logs และ 3-Level Namespace
สัมภาษณ์ Data Engineer ถามอะไรเกี่ยวกับ Databricks
Delta Lake ACID Time Travel Z-Ordering Unity Catalog Access Control Lineage Spark Optimization Partitioning Caching Broadcast Join Medallion Architecture ETL
เตรียมสัมภาษณ์ Databricks อย่างไร
ศึกษา Delta Lake Unity Catalog Spark SQL ฝึก Hands-on Community Edition ฟรี สอบ Certified Data Engineer Associate Medallion Architecture PySpark SQL Mock Interview
สรุป
Databricks Unity Catalog ให้ Centralized Governance ผ่าน 3-Level Namespace, RBAC และ Data Lineage; Delta Lake ให้ ACID Transactions และ Time Travel; ควบคู่กับ Spark Optimization และ Medallion Architecture เป็นหัวข้อหลักสำหรับการเตรียมสัมภาษณ์และสอบ Certification โดยฝึก Hands-on ได้ฟรีบน Community Edition
