What Is Unity Catalog Multi-Tenant Design?
Databricks Unity Catalog is a unified governance solution for data and AI assets on the Databricks Lakehouse Platform. It centralizes data access control, auditing, lineage, and discovery. Multi-Tenant Design is the practice of letting multiple tenants (business units, teams, customers) share a Databricks workspace while keeping data isolation between them.
Three-Level Namespace of Unity Catalog: Catalog (the top level, typically one per tenant/business unit), Schema (a database inside a catalog), Table/View/Function (objects inside a schema). Example: catalog_tenant_a.schema_sales.table_orders
Multi-Tenancy patterns: Catalog-per-Tenant — one catalog per tenant, strong tenant isolation; Schema-per-Tenant — a single shared catalog with one schema per tenant, simpler to manage; Workspace-per-Tenant — one workspace per tenant, the strongest tenant isolation but at a higher cost.
Designing the Multi-Tenant Architecture
Architecture patterns for multi-tenant deployments
# === Multi-Tenant Architecture ===
# 1. Terraform: Catalog-per-Tenant Pattern
# Writes unity_catalog.tf. The heredoc delimiter is quoted ('EOF') so the shell
# leaves Terraform's ${...} interpolations untouched.
cat > unity_catalog.tf << 'EOF'
variable "tenants" {
  type = map(object({
    name        = string
    storage     = string
    admin_group = string
    environment = string
  }))
  default = {
    tenant_a = {
      name        = "tenant_a"
      storage     = "s3://lakehouse-tenant-a"
      admin_group = "tenant_a_admins"
      environment = "production"
    }
    tenant_b = {
      name        = "tenant_b"
      storage     = "s3://lakehouse-tenant-b"
      admin_group = "tenant_b_admins"
      environment = "production"
    }
  }
}

# Create external location per tenant.
# The name must be unique per tenant, so it is derived from the map key;
# a constant name would make every for_each instance collide.
# NOTE: databricks_storage_credential.main is not declared in this file and
# must be defined elsewhere in the configuration.
resource "databricks_external_location" "tenant" {
  for_each        = var.tenants
  name            = "ext_loc_${each.key}"
  url             = each.value.storage
  credential_name = databricks_storage_credential.main.name
  comment         = "Storage for ${each.value.name}"
}

# Create catalog per tenant
resource "databricks_catalog" "tenant" {
  for_each     = var.tenants
  name         = each.value.name
  comment      = "Data catalog for ${each.value.name}"
  storage_root = each.value.storage
  properties = {
    environment = each.value.environment
    tenant      = each.value.name
  }

  # storage_root is only a string, so Terraform cannot infer that the
  # external location has to exist first; make the ordering explicit.
  depends_on = [databricks_external_location.tenant]
}

# Create standard schemas per tenant
resource "databricks_schema" "bronze" {
  for_each     = var.tenants
  catalog_name = databricks_catalog.tenant[each.key].name
  name         = "bronze"
  comment      = "Raw data layer"
}

resource "databricks_schema" "silver" {
  for_each     = var.tenants
  catalog_name = databricks_catalog.tenant[each.key].name
  name         = "silver"
  comment      = "Cleaned data layer"
}

resource "databricks_schema" "gold" {
  for_each     = var.tenants
  catalog_name = databricks_catalog.tenant[each.key].name
  name         = "gold"
  comment      = "Business-ready data layer"
}

# Grant access to tenant admin groups
resource "databricks_grants" "catalog" {
  for_each = var.tenants
  catalog  = databricks_catalog.tenant[each.key].name
  grant {
    principal  = each.value.admin_group
    privileges = ["USE_CATALOG", "USE_SCHEMA", "SELECT", "MODIFY", "CREATE_TABLE", "CREATE_FUNCTION"]
  }
}
EOF
# 2. Shared catalog for cross-tenant data
# Writes shared_catalog.tf: a "shared_data" catalog with a "reference_data"
# schema, plus a grant giving the "all_tenants" principal USE_CATALOG,
# USE_SCHEMA and SELECT on that catalog. The quoted 'EOF' delimiter stops the
# shell from expanding anything inside the heredoc.
cat > shared_catalog.tf << 'EOF'
resource "databricks_catalog" "shared" {
name = "shared_data"
comment = "Cross-tenant shared datasets"
}
resource "databricks_schema" "reference" {
catalog_name = databricks_catalog.shared.name
name = "reference_data"
comment = "Shared reference tables (country codes, currencies)"
}
# All tenants can read shared data
resource "databricks_grants" "shared_read" {
catalog = databricks_catalog.shared.name
grant {
principal = "all_tenants"
privileges = ["USE_CATALOG", "USE_SCHEMA", "SELECT"]
}
}
EOF
echo "Multi-tenant architecture configured"
Access Control and Data Isolation
Managing access control for multi-tenant deployments
#!/usr/bin/env python3
# access_control.py — Unity Catalog Access Control Manager
import json
import logging
from typing import Dict, List
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("access")
class UnityAccessControl:
    """Plan access control for a multi-tenant Unity Catalog deployment.

    Nothing is executed against a workspace: the methods only build group
    definitions and SQL text for the caller to review or run.
    """

    # Schemas that engineers receive read/write grants on.
    _SCHEMAS = ("bronze", "silver", "gold")

    def __init__(self) -> None:
        # tenant name -> group definitions returned by create_tenant_groups()
        self.tenants: Dict[str, dict] = {}
        # Not written to by any method in this class.
        self.grants: list = []

    def create_tenant_groups(self, tenant_name: str) -> Dict[str, dict]:
        """Build and remember the standard groups for a tenant.

        Args:
            tenant_name: Prefix used for every group name.

        Returns:
            Mapping of group name to its role, permissions and description.
            The same mapping is stored in ``self.tenants[tenant_name]``.
        """
        groups = {
            f"{tenant_name}_admins": {
                "role": "Catalog Admin",
                "permissions": ["USE_CATALOG", "USE_SCHEMA", "SELECT", "MODIFY", "CREATE_TABLE", "CREATE_FUNCTION", "CREATE_SCHEMA"],
                "description": "Full access to tenant catalog",
            },
            f"{tenant_name}_engineers": {
                "role": "Data Engineer",
                "permissions": ["USE_CATALOG", "USE_SCHEMA", "SELECT", "MODIFY", "CREATE_TABLE"],
                "description": "Read/write access to all schemas",
            },
            f"{tenant_name}_analysts": {
                "role": "Data Analyst",
                "permissions": ["USE_CATALOG", "USE_SCHEMA", "SELECT"],
                "description": "Read-only access to gold schema",
            },
            f"{tenant_name}_viewers": {
                "role": "Viewer",
                "permissions": ["USE_CATALOG", "USE_SCHEMA", "SELECT"],
                "description": "Read-only access to specific tables",
            },
        }
        self.tenants[tenant_name] = groups
        return groups

    def generate_grant_sql(self, tenant_name: str) -> List[str]:
        """Generate the GRANT statements for one tenant's catalog.

        The catalog is assumed to be named exactly like the tenant.

        Args:
            tenant_name: Tenant (and catalog) name; interpolated into the SQL
                inside backticks without further escaping.

        Returns:
            SQL statements in order: admin grant, engineer grants, analyst
            grants (gold only), then a trailing SQL comment.
        """
        catalog = tenant_name
        engineers = f"{tenant_name}_engineers"
        analysts = f"{tenant_name}_analysts"

        # Admin grants
        sqls = [f"GRANT ALL PRIVILEGES ON CATALOG `{catalog}` TO `{tenant_name}_admins`;"]

        # Engineer grants
        sqls.append(f"GRANT USE CATALOG ON CATALOG `{catalog}` TO `{engineers}`;")
        for schema in self._SCHEMAS:
            sqls.append(f"GRANT USE SCHEMA ON SCHEMA `{catalog}`.`{schema}` TO `{engineers}`;")
            sqls.append(f"GRANT SELECT, MODIFY, CREATE TABLE ON SCHEMA `{catalog}`.`{schema}` TO `{engineers}`;")

        # Analyst grants (gold only)
        sqls.append(f"GRANT USE CATALOG ON CATALOG `{catalog}` TO `{analysts}`;")
        sqls.append(f"GRANT USE SCHEMA ON SCHEMA `{catalog}`.`gold` TO `{analysts}`;")
        sqls.append(f"GRANT SELECT ON SCHEMA `{catalog}`.`gold` TO `{analysts}`;")

        # Cross-tenant isolation comes from the absence of grants, so the
        # list ends with a reminder rather than a statement.
        sqls.append("-- IMPORTANT: No grants to other tenant groups on this catalog")
        return sqls

    def row_level_security(self, tenant_name: str) -> Dict[str, str]:
        """Generate row-level security SQL for the shared customers table.

        The filter function receives the row's ``tenant_id`` column and keeps
        a row only when it belongs to ``tenant_name`` and the caller is
        authorized for that tenant. The function uses a three-level name and
        the ``ON`` clause lists the column to pass in; ``ON`` cannot hold a
        predicate.

        Args:
            tenant_name: Tenant whose rows and groups the filter refers to.

        Returns:
            Dict with a ``description`` and the ``sql`` text.
        """
        # NOTE(review): a table holds a single row filter and the function name
        # is not tenant-specific, so running this for a second tenant replaces
        # the first tenant's filter — confirm this is intended.
        sql_lines = [
            "",
            "CREATE OR REPLACE FUNCTION shared_data.reference_data.tenant_filter(tenant_id STRING)",
            "RETURNS BOOLEAN",
            "RETURN (",
            f"  tenant_id = '{tenant_name}'",
            "  AND (",
            "    current_user() IN (",
            f"      SELECT email FROM {tenant_name}.admin.authorized_users",
            "    )",
            f"    OR IS_MEMBER('{tenant_name}_admins')",
            f"    OR IS_MEMBER('{tenant_name}_engineers')",
            "  )",
            ");",
            "-- Apply row filter to shared table",
            "ALTER TABLE shared_data.reference_data.global_customers",
            "SET ROW FILTER shared_data.reference_data.tenant_filter ON (tenant_id);",
            "",
        ]
        return {
            "description": "Row-level security ?????????????????? shared tables",
            "sql": "\n".join(sql_lines),
        }
# Demo: print the planned groups, a sample of the grant SQL, and the
# row-level-security description. Nothing is sent to a workspace.
acl = UnityAccessControl()

# Create groups for 2 tenants
for tenant in ["acme_corp", "beta_inc"]:
    groups = acl.create_tenant_groups(tenant)
    print(f"\n{tenant} Groups:")
    for group_name, info in groups.items():
        print(f" {group_name}: {info['role']} ??? {info['description']}")

# Generate SQL (only the first five statements are shown)
sqls = acl.generate_grant_sql("acme_corp")
print("\nGrant SQL for acme_corp:")
for sql in sqls[:5]:
    print(f" {sql}")

# Row-level security
rls = acl.row_level_security("acme_corp")
print(f"\nRow-Level Security: {rls['description']}")
Automation and Provisioning
Automate tenant provisioning
# === Tenant Provisioning Automation ===
# 1. Python provisioning script
cat > provision_tenant.py << 'PYTHON'
#!/usr/bin/env python3
"""Automated tenant provisioning for Unity Catalog"""
import json
import logging
import argparse
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("provision")
class TenantProvisioner:
    """Build provisioning and deprovisioning plans for a tenant.

    The methods only return step descriptions (API route plus request body);
    no HTTP request is made by this class.
    """

    def __init__(self, workspace_url, token):
        """Store the workspace URL and token and prepare the auth header."""
        self.workspace_url = workspace_url
        self.token = token
        self.headers = {"Authorization": f"Bearer {token}"}

    def provision_tenant(self, tenant_config: dict) -> list:
        """Return the ordered list of steps that provision one tenant.

        Args:
            tenant_config: Must contain ``"name"`` and ``"storage_path"``.

        Returns:
            A list of dicts with ``step``, ``api`` and ``body`` keys, in
            order: external location, catalog, schemas, groups, grants.

        Raises:
            KeyError: If ``"name"`` or ``"storage_path"`` is missing.
        """
        tenant = tenant_config["name"]
        steps = []

        # Step 1: Create storage location
        steps.append({
            "step": "Create external location",
            "api": "POST /api/2.1/unity-catalog/external-locations",
            "body": {
                "name": f"ext_{tenant}",
                "url": tenant_config["storage_path"],
                "credential_name": "main_credential",
            },
        })

        # Step 2: Create catalog
        steps.append({
            "step": "Create catalog",
            "api": "POST /api/2.1/unity-catalog/catalogs",
            "body": {
                "name": tenant,
                "comment": f"Catalog for {tenant}",
                "properties": {"tenant": tenant, "env": "production"},
            },
        })

        # Step 3: Create schemas
        for schema in ["bronze", "silver", "gold", "sandbox"]:
            steps.append({
                "step": f"Create schema: {schema}",
                "api": "POST /api/2.1/unity-catalog/schemas",
                "body": {
                    "name": schema,
                    "catalog_name": tenant,
                    "comment": f"{schema} layer for {tenant}",
                },
            })

        # Step 4: Create groups
        for role in ["admins", "engineers", "analysts"]:
            steps.append({
                "step": f"Create group: {tenant}_{role}",
                "api": "POST /api/2.0/preview/scim/v2/Groups",
                "body": {"displayName": f"{tenant}_{role}"},
            })

        # Step 5: Grant permissions
        steps.append({
            "step": "Grant catalog permissions",
            # Must be an f-string: the route ends in the catalog name, not a
            # literal "{tenant}".
            "api": f"PATCH /api/2.1/unity-catalog/permissions/catalog/{tenant}",
            "body": {
                "changes": [
                    {"principal": f"{tenant}_admins", "add": ["ALL_PRIVILEGES"]},
                    {"principal": f"{tenant}_engineers", "add": ["USE_CATALOG", "USE_SCHEMA", "SELECT", "MODIFY"]},
                    {"principal": f"{tenant}_analysts", "add": ["USE_CATALOG", "USE_SCHEMA", "SELECT"]},
                ],
            },
        })
        return steps

    def deprovision_tenant(self, tenant_name: str) -> list:
        """Return the soft-delete plan for a tenant.

        Each entry has a ``step`` label and a human-readable ``action``; the
        actions are descriptions, not executable statements.
        """
        return [
            {"step": "Revoke all grants", "action": f"REVOKE ALL ON CATALOG {tenant_name}"},
            {"step": "Disable groups", "action": f"Disable {tenant_name}_* groups"},
            {"step": "Archive catalog", "action": f"Rename catalog to {tenant_name}_archived"},
            {"step": "Retain data 90 days", "action": "Set deletion schedule"},
        ]
# Demo: print a provisioning plan and a deprovisioning plan.
# The URL and token are placeholders; no request is sent.
provisioner = TenantProvisioner("https://workspace.cloud.databricks.com", "token")
config = {"name": "new_tenant", "storage_path": "s3://lakehouse-new-tenant"}

steps = provisioner.provision_tenant(config)
print(f"Provisioning Plan ({len(steps)} steps):")
for i, step in enumerate(steps, 1):
    print(f" {i}. {step['step']}")

cleanup = provisioner.deprovision_tenant("old_tenant")
print("\nDeprovision Plan:")
for step in cleanup:
    print(f" - {step['step']}: {step['action']}")
PYTHON
echo "Provisioning automation ready"
Monitoring and Cost Allocation
Monitoring usage and allocating cost for each tenant
#!/usr/bin/env python3
# tenant_monitor.py — Multi-Tenant Monitoring
import json
import logging
from typing import Dict, List
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("monitor")
class TenantMonitor:
    """Provide a sample multi-tenant monitoring dashboard.

    The figures are hard-coded illustration data; nothing is queried.
    """

    def dashboard(self) -> dict:
        """Return the sample dashboard.

        Returns:
            Dict with four sections: ``tenants`` (per-tenant usage and cost),
            ``audit_events_24h``, ``governance`` and ``alerts``.
        """
        return {
            "tenants": {
                "acme_corp": {
                    "users": 25,
                    "tables": 150,
                    "storage_gb": 450,
                    "dbu_usage_30d": 12500,
                    "cost_30d": "$3,750",
                    "top_queries": 8500,
                    "data_freshness": "5 min",
                },
                "beta_inc": {
                    "users": 12,
                    "tables": 80,
                    "storage_gb": 200,
                    "dbu_usage_30d": 6000,
                    "cost_30d": "$1,800",
                    "top_queries": 4200,
                    "data_freshness": "15 min",
                },
                "gamma_ltd": {
                    "users": 8,
                    "tables": 45,
                    "storage_gb": 100,
                    "dbu_usage_30d": 3000,
                    "cost_30d": "$900",
                    "top_queries": 1800,
                    "data_freshness": "1 hour",
                },
            },
            "audit_events_24h": {
                "total": 15000,
                "by_type": {
                    "SELECT": 12000,
                    "INSERT": 1500,
                    "UPDATE": 800,
                    "CREATE": 200,
                    "GRANT": 50,
                    "DENY": 5,
                },
                "cross_tenant_attempts": 3,
                "failed_auth": 12,
            },
            "governance": {
                "tables_with_tags": "85%",
                "tables_with_lineage": "92%",
                "pii_columns_masked": "100%",
                "compliance_score": "A",
            },
            "alerts": [
                {"severity": "WARNING", "message": "acme_corp storage approaching 500GB limit"},
                {"severity": "INFO", "message": "3 cross-tenant access attempts blocked (audit logged)"},
            ],
        }
# Demo: render the sample dashboard as plain text.
monitor = TenantMonitor()
dash = monitor.dashboard()

print("Multi-Tenant Dashboard:")
for tenant, info in dash["tenants"].items():
    print(f"\n {tenant}:")
    print(f" Users: {info['users']}, Tables: {info['tables']}, Storage: {info['storage_gb']}GB")
    print(f" DBU (30d): {info['dbu_usage_30d']:,}, Cost: {info['cost_30d']}")

audit = dash["audit_events_24h"]
print(f"\nAudit (24h): {audit['total']:,} events, {audit['cross_tenant_attempts']} cross-tenant blocks")

gov = dash["governance"]
print(f"\nGovernance: Compliance {gov['compliance_score']}, PII masked {gov['pii_columns_masked']}")

for alert in dash["alerts"]:
    print(f"\n[{alert['severity']}] {alert['message']}")
Best Practices and Governance
Recommended practices for multi-tenant governance
# === Governance Best Practices ===
# Writes governance.yaml. Nesting is significant in YAML: the four sections
# must sit under multi_tenant_governance, and each pattern's fields under its
# own key, or the document does not load into the structure read back later.
cat > governance.yaml << 'EOF'
multi_tenant_governance:
  naming_convention:
    catalog: "{tenant_name}"
    schema: "{layer}_{optional_domain}"
    table: "{entity}_{optional_suffix}"
    examples:
      - "acme_corp.bronze.raw_orders"
      - "acme_corp.silver.cleaned_orders"
      - "acme_corp.gold.daily_revenue"
      - "shared_data.reference_data.country_codes"
  data_classification:
    tags:
      - name: "pii"
        description: "Personally Identifiable Information"
        action: "Mask or encrypt"
      - name: "confidential"
        description: "Business confidential data"
        action: "Restrict access"
      - name: "public"
        description: "Non-sensitive data"
        action: "No restrictions"
  isolation_patterns:
    catalog_per_tenant:
      isolation: "HIGH"
      complexity: "MEDIUM"
      cost: "MEDIUM"
      best_for: "Strong isolation requirements, compliance"
    schema_per_tenant:
      isolation: "MEDIUM"
      complexity: "LOW"
      cost: "LOW"
      best_for: "Small tenants, shared infrastructure"
    workspace_per_tenant:
      isolation: "HIGHEST"
      complexity: "HIGH"
      cost: "HIGH"
      best_for: "Regulated industries, maximum isolation"
  audit_requirements:
    - "Log all data access events"
    - "Track cross-tenant access attempts"
    - "Monitor privilege escalation"
    - "Retain audit logs 1+ year"
    - "Regular access reviews (quarterly)"
    - "Automated compliance reports"
EOF
# Load governance.yaml and print the naming examples and isolation patterns.
# Requires PyYAML. The program is passed in a double-quoted shell string, so
# inner double quotes are escaped and Python strings use single quotes.
python3 -c "
import yaml
with open('governance.yaml') as f:
    data = yaml.safe_load(f)
gov = data['multi_tenant_governance']
print('Governance Best Practices:')
print('\nNaming Convention:')
for ex in gov['naming_convention']['examples']:
    print(f' {ex}')
print('\nIsolation Patterns:')
for name, info in gov['isolation_patterns'].items():
    print(f' {name}: isolation={info[\"isolation\"]}, cost={info[\"cost\"]}')
    print(f' Best for: {info[\"best_for\"]}')
"
echo "Governance guide ready"
FAQ: Frequently Asked Questions
Q: Catalog-per-Tenant or Schema-per-Tenant — which one should you choose?
A: Catalog-per-Tenant — pros: the strongest isolation inside Unity Catalog, a separate storage location per tenant, simple permissions (grant at the catalog level), and a good fit for compliance/audit; cons: more catalogs to manage, and cross-tenant queries are more involved. Schema-per-Tenant — pros: simpler to manage (a single catalog), easy cross-tenant queries, suitable for a small number of tenants; cons: weaker isolation, more complex permission management, and shared storage. Recommendation: if you have compliance requirements (PDPA, GDPR) or more than 5 tenants, use Catalog-per-Tenant; if the tenants are few and are internal teams, use Schema-per-Tenant.
Q: How should cost allocation be handled across tenants?
A: Databricks supports cost allocation through: Tags — tag clusters, jobs, and SQL warehouses per tenant; Cluster policies — constrain cluster specs per tenant (max nodes, instance types); Usage logs — query DBU usage per tenant from system tables (system.billing.usage); Custom tagging — add custom tags in the cluster config, such as tenant=acme_corp, then aggregate them through the billing API. Chargeback models: Fixed allocation — split according to contract (for example tenant A pays 60%, B pays 40%); Usage-based — charge by actual DBU consumption (fair but unpredictable); Hybrid — base fee + usage overage (recommended). Use the Databricks Account Console, or export billing data to a BI tool, to build chargeback reports.
Q: How does data lineage work in a multi-tenant setup?
A: Unity Catalog has built-in lineage tracking that covers: Table-level lineage — which tables a table is derived from; Column-level lineage — which columns a column is derived from (Spark SQL); Job lineage — which job reads from or writes to which table. In a multi-tenant setup: lineage is scoped by catalog (a tenant sees only its own lineage); cross-catalog lineage is visible only with permission; admins can see lineage across all tenants. No extra configuration is needed — Unity Catalog tracks lineage automatically; view it in Catalog Explorer or through the REST API. Consider Data Contracts to define schema, quality, and SLA between tenants.
Q: How do you mask PII data with Unity Catalog?
A: Unity Catalog supports Column Masking and Row Filtering. Dynamic masking: write a column mask function that returns a masked value depending on the user/group — for example analysts see email as ***@***.com while privileged users see the real email. SQL: CREATE FUNCTION mask_email(email STRING) RETURNS STRING RETURN CASE WHEN IS_MEMBER('tenant_admins') THEN email ELSE CONCAT(LEFT(email,2), '***@***.com') END; then apply it with ALTER TABLE <table> ALTER COLUMN email SET MASK mask_email. Benefits: no separate views to maintain, the policy is centralized, and access is audit-tracked. Row Filtering: write a row filter function that restricts rows per tenant so that each tenant sees only its own data.
