Cybersecurity

Databricks Unity Catalog Interview Preparation

databricks unity catalog interview preparation
Databricks Unity Catalog Interview Preparation | SiamCafe Blog
2025-06-07· อ. บอม — SiamCafe.net· 9,903 คำ

Databricks Unity Catalog

Unity Catalog Centralized Governance Databricks Lakehouse Data Assets Tables Views Volumes Models Functions 3-Level Namespace Catalog.Schema.Table Fine-grained Access Control Data Lineage Audit Logs

Interview Preparation Data Engineering Delta Lake Spark Optimization Medallion Architecture ETL Patterns Data Quality Cost Optimization

Unity Catalog Architecture

# unity_catalog.py — Databricks Unity Catalog
from dataclasses import dataclass, field
from typing import List, Dict, Optional

@dataclass
class Table:
    """A table registered under a schema — third level of the catalog.schema.table namespace."""
    name: str
    table_type: str  # MANAGED, EXTERNAL
    format: str  # DELTA, PARQUET, CSV
    # Column descriptors, e.g. {"name": "event_id", "type": "STRING"}
    columns: List[Dict[str, str]] = field(default_factory=list)
    owner: str = ""  # owning principal; empty string when unassigned

@dataclass
class Schema:
    """A schema (database) inside a catalog — second level of the namespace."""
    name: str
    tables: List[Table] = field(default_factory=list)  # tables registered in this schema
    owner: str = ""  # owning principal; empty string when unassigned

@dataclass
class Catalog:
    """Top-level container in the 3-level namespace (catalog.schema.table)."""
    name: str
    schemas: List[Schema] = field(default_factory=list)  # schemas registered in this catalog
    owner: str = ""  # owning principal; empty string when unassigned

class UnityCatalog:
    """In-memory model of Unity Catalog governance objects.

    Holds a list of catalogs (each containing schemas and tables) and a
    flat audit-style list of privilege grants. All lookups are linear
    scans, which is fine for this demo-sized model.
    """

    def __init__(self):
        self.catalogs: List[Catalog] = []  # registered catalogs
        self.grants: List[dict] = []       # grant records (no enforcement)

    def _find_catalog(self, name: str) -> Optional[Catalog]:
        """Return the catalog named *name*, or None if it does not exist."""
        return next((c for c in self.catalogs if c.name == name), None)

    def create_catalog(self, name: str, owner: str) -> Catalog:
        """Create a catalog, register it, and return it."""
        cat = Catalog(name=name, owner=owner)
        self.catalogs.append(cat)
        return cat

    def create_schema(self, catalog_name: str, schema_name: str, owner: str) -> Optional[Schema]:
        """Create a schema under *catalog_name*.

        Returns the new Schema, or None (silent no-op) when the catalog
        does not exist — mirroring the original best-effort behavior.
        """
        cat = self._find_catalog(catalog_name)
        if cat is None:
            return None
        schema = Schema(name=schema_name, owner=owner)
        cat.schemas.append(schema)
        return schema

    def create_table(self, catalog: str, schema: str, table: Table) -> None:
        """Attach *table* under catalog.schema; silent no-op if either level is missing."""
        cat = self._find_catalog(catalog)
        if cat is None:
            return
        sch = next((s for s in cat.schemas if s.name == schema), None)
        if sch is not None:
            sch.tables.append(table)

    def grant(self, principal: str, privilege: str, target: str) -> None:
        """Record a privilege grant (audit log only — nothing is enforced here)."""
        self.grants.append({
            "principal": principal,
            "privilege": privilege,
            "target": target,
        })

    def show_hierarchy(self) -> None:
        """Print the catalog -> schema -> table tree."""
        print(f"\n{'='*55}")
        print("Unity Catalog Hierarchy")
        print(f"{'='*55}")
        for cat in self.catalogs:
            print(f"\n  Catalog: {cat.name} (Owner: {cat.owner})")
            for schema in cat.schemas:
                print(f"    Schema: {schema.name}")
                for table in schema.tables:
                    print(f"      Table: {table.name} [{table.table_type}] ({table.format})")

# -- SQL Commands --
# CREATE CATALOG production;
# CREATE SCHEMA production.bronze;
# CREATE SCHEMA production.silver;
# CREATE SCHEMA production.gold;
#
# CREATE TABLE production.bronze.raw_events (
#   event_id STRING,
#   event_type STRING,
#   user_id STRING,
#   timestamp TIMESTAMP,
#   payload STRING
# ) USING DELTA;
#
# -- Access Control
# GRANT USE CATALOG ON CATALOG production TO `data-engineers`;
# GRANT USE SCHEMA ON SCHEMA production.bronze TO `data-engineers`;
# GRANT SELECT ON TABLE production.bronze.raw_events TO `data-analysts`;
# GRANT MODIFY ON TABLE production.silver.users TO `etl-service`;
#
# -- Column-level Security
# GRANT SELECT (user_id, event_type) ON TABLE production.bronze.raw_events TO `limited-access`;

# Demo: build a small "production" catalog laid out as a medallion architecture.
uc = UnityCatalog()
cat = uc.create_catalog("production", "admin")

# One schema per medallion layer, each owned by the responsible team.
for layer_name, owning_team in (
    ("bronze", "data-engineers"),
    ("silver", "data-engineers"),
    ("gold", "data-analysts"),
):
    uc.create_schema("production", layer_name, owning_team)

# Raw landing tables belong in the bronze layer.
bronze_tables = (
    Table("raw_events", "MANAGED", "DELTA", [{"name": "event_id", "type": "STRING"}]),
    Table("raw_users", "MANAGED", "DELTA", [{"name": "user_id", "type": "STRING"}]),
)
for tbl in bronze_tables:
    uc.create_table("production", "bronze", tbl)

# Curated and aggregated tables for silver/gold.
uc.create_table("production", "silver", Table("clean_events", "MANAGED", "DELTA"))
uc.create_table("production", "gold", Table("daily_metrics", "MANAGED", "DELTA"))

# A couple of illustrative access grants.
uc.grant("data-engineers", "USE CATALOG", "production")
uc.grant("data-analysts", "SELECT", "production.gold.*")

uc.show_hierarchy()

print("\n  Grants:")
for grant_record in uc.grants:
    print(f"    GRANT {grant_record['privilege']} ON {grant_record['target']} TO {grant_record['principal']}")

Interview Questions

# interview_questions.py — Databricks Interview Prep
# Topic -> question bank plus the key concepts to revise for each topic.
# Question text is in Thai (the article targets Thai-speaking readers).
interview = {
    "Delta Lake": {
        "questions": [
            "Delta Lake คืออะไร ต่างจาก Parquet อย่างไร",
            "ACID Transactions ใน Delta Lake ทำงานอย่างไร",
            "Time Travel คืออะไร ใช้อย่างไร",
            "Z-Ordering คืออะไร เมื่อไหร่ควรใช้",
            "VACUUM ทำอะไร ตั้ง Retention อย่างไร",
            "OPTIMIZE ทำอะไร ต่างจาก ZORDER อย่างไร",
        ],
        "key_concepts": "Transaction Log, ACID, Schema Evolution, CDF",
    },
    "Unity Catalog": {
        "questions": [
            "Unity Catalog คืออะไร ต่างจาก Hive Metastore อย่างไร",
            "3-Level Namespace คืออะไร",
            "Fine-grained Access Control ตั้งค่าอย่างไร",
            "Data Lineage ใช้อย่างไร",
            "External Tables vs Managed Tables",
            "Column-level Security ทำอย่างไร",
        ],
        "key_concepts": "RBAC, Lineage, Audit, Governance",
    },
    "Spark Optimization": {
        "questions": [
            "Spark Partitioning ตั้งค่าอย่างไร",
            "Broadcast Join ใช้เมื่อไหร่",
            "Shuffle คืออะไร ลดอย่างไร",
            "Caching vs Persist ต่างกันอย่างไร",
            "AQE (Adaptive Query Execution) คืออะไร",
            "Skew Join Optimization ทำอย่างไร",
        ],
        "key_concepts": "Partitioning, Shuffle, AQE, Catalyst",
    },
    "Medallion Architecture": {
        "questions": [
            "Medallion Architecture คืออะไร",
            "Bronze, Silver, Gold แตกต่างกันอย่างไร",
            "Data Quality ตรวจสอบที่ Layer ไหน",
            "SCD Type 2 ทำที่ Layer ไหน อย่างไร",
            "Streaming + Batch ใน Medallion ทำอย่างไร",
        ],
        "key_concepts": "Bronze (Raw), Silver (Clean), Gold (Business)",
    },
}

# Render each topic with its key concepts followed by the question list.
print("Databricks Interview Questions:")
for topic_name, details in interview.items():
    print(f"\n  [{topic_name}]")
    print(f"    Key Concepts: {details['key_concepts']}")
    for question in details["questions"]:
        print(f"    Q: {question}")

PySpark Code Examples

# pyspark_examples.py — PySpark for Interview
# from pyspark.sql import SparkSession
# from pyspark.sql.functions import col, count, sum, avg, when, lit
# from delta.tables import DeltaTable

# spark = SparkSession.builder.appName("interview").getOrCreate()

# 1. Delta Lake Operations
# -- Create Delta Table
# CREATE TABLE production.bronze.events
# USING DELTA
# PARTITIONED BY (event_date)
# AS SELECT * FROM raw_events;

# -- Time Travel
# SELECT * FROM production.bronze.events VERSION AS OF 5;
# SELECT * FROM production.bronze.events TIMESTAMP AS OF '2024-01-15';

# -- MERGE (Upsert)
# MERGE INTO production.silver.users AS target
# USING staging.new_users AS source
# ON target.user_id = source.user_id
# WHEN MATCHED THEN UPDATE SET *
# WHEN NOT MATCHED THEN INSERT *;

# -- Z-Ordering
# OPTIMIZE production.silver.events ZORDER BY (user_id, event_type);

# -- VACUUM
# VACUUM production.bronze.events RETAIN 168 HOURS;

# 2. PySpark Optimization
# df = spark.read.table("production.bronze.events")
#
# # Broadcast Join (Small Table < 10MB)
# from pyspark.sql.functions import broadcast
# small_df = spark.read.table("production.silver.dim_users")
# result = df.join(broadcast(small_df), "user_id")
#
# # Partitioning
# df.repartition(200, "event_date") \
#   .write.mode("overwrite") \
#   .partitionBy("event_date") \
#   .saveAsTable("production.silver.events")
#
# # Caching
# df.cache()  # MEMORY_ONLY
# df.persist(StorageLevel.MEMORY_AND_DISK)
#
# # AQE
# spark.conf.set("spark.sql.adaptive.enabled", "true")
# spark.conf.set("spark.sql.adaptive.coalescePartitions.enabled", "true")
# spark.conf.set("spark.sql.adaptive.skewJoin.enabled", "true")

# Certification Path
# Cert name -> exam metadata; difficulty/prep-time values are in Thai.
certs = {
    "Databricks Certified Data Engineer Associate": {
        "topics": "Delta Lake, ELT, Workflows, Unity Catalog Basics",
        "difficulty": "ปานกลาง",
        "prep_time": "2-4 สัปดาห์",
        "cost": "$200",
    },
    "Databricks Certified Data Engineer Professional": {
        "topics": "Advanced Delta, Streaming, Optimization, Production",
        "difficulty": "ยาก",
        "prep_time": "4-8 สัปดาห์",
        "cost": "$300",
    },
    "Databricks Certified ML Professional": {
        "topics": "MLflow, Feature Store, Model Serving, AutoML",
        "difficulty": "ยาก",
        "prep_time": "4-8 สัปดาห์",
        "cost": "$300",
    },
}

# Print every certification with all of its metadata fields.
print("\nDatabricks Certifications:")
for cert_name, details in certs.items():
    print(f"\n  [{cert_name}]")
    for field_name, field_value in details.items():
        print(f"    {field_name}: {field_value}")

เคล็ดลับ

Unity Catalog คืออะไร

Centralized Governance Databricks Lakehouse Data Assets Tables Views 3-Level Namespace Catalog.Schema.Table Fine-grained Access Control Data Lineage Audit Logs

Unity Catalog ต่างจาก Hive Metastore อย่างไร

Hive Metastore เป็นระบบ Legacy ที่ผูกกับ Workspace เดียวและไม่มี Fine-grained Access Control ส่วน Unity Catalog ใช้งานข้ามหลาย Workspaces ได้ รองรับ RBAC, Column-level และ Row-level Security, Data Lineage, Audit Logs และ 3-Level Namespace

สัมภาษณ์ Data Engineer ถามอะไรเกี่ยวกับ Databricks

Delta Lake ACID Time Travel Z-Ordering Unity Catalog Access Control Lineage Spark Optimization Partitioning Caching Broadcast Join Medallion Architecture ETL

เตรียมสัมภาษณ์ Databricks อย่างไร

ศึกษา Delta Lake Unity Catalog Spark SQL ฝึก Hands-on Community Edition ฟรี สอบ Certified Data Engineer Associate Medallion Architecture PySpark SQL Mock Interview

สรุป

Databricks Unity Catalog Centralized Governance 3-Level Namespace RBAC Lineage Delta Lake ACID Time Travel Spark Optimization Medallion Architecture Interview Preparation Certification Community Edition

📖 บทความที่เกี่ยวข้อง

Databricks Unity Catalog Disaster Recovery Plan — อ่านบทความ → | Databricks Unity Catalog Cloud Native Design — อ่านบทความ → | Databricks Unity Catalog Technical Debt Management — อ่านบทความ → | Databricks Unity Catalog Certification Path — อ่านบทความ → | Databricks Unity Catalog DevSecOps Integration — อ่านบทความ →

📚 ดูบทความทั้งหมด →