mirror of https://github.com/databricks/cli.git
Library to validate and normalize cloud specific tags (#819)
## Changes Prompted by the proposed fix for a tagging-related problem in #810, I investigated how tag validation works. This turned out to be quite a bit more complex than anticipated. Tags at the job level (or cluster level) are passed through to the underlying compute infrastructure and as such are tested against cloud-specific validation rules. GCP appears to be the most restrictive. It would be disappointing to always restrict to `\w+`, so this package implements validation and normalization rules for each cloud. It can pick the right cloud to use based on a Go SDK configuration. ## Tests Exhaustive unit tests. The regular expressions were pulled from #814.
This commit is contained in:
parent
3685eb16f4
commit
4226c88e98
|
@ -0,0 +1,36 @@
|
|||
package tags
|
||||
|
||||
import (
|
||||
"regexp"
|
||||
"unicode"
|
||||
|
||||
"golang.org/x/text/unicode/rangetable"
|
||||
)
|
||||
|
||||
// The union of all characters allowed in AWS tags.
// This must be used only after filtering out non-Latin1 characters,
// because the [unicode] classes include non-Latin1 characters.
var awsChars = rangetable.Merge(
	unicode.Digit,
	unicode.Space,
	unicode.Letter,
	// Punctuation characters explicitly allowed by AWS in tag keys and values.
	rangetable.New('+', '-', '=', '.', ':', '/', '@'),
)
|
||||
|
||||
var awsTag = &tag{
|
||||
keyLength: 127,
|
||||
keyPattern: regexp.MustCompile(`^[\d \w\+\-=\.:\/@]*$`),
|
||||
keyNormalize: chain(
|
||||
normalizeMarks(),
|
||||
replaceNotIn(latin1, '_'),
|
||||
replaceNotIn(awsChars, '_'),
|
||||
),
|
||||
|
||||
valueLength: 255,
|
||||
valuePattern: regexp.MustCompile(`^[\d \w\+\-=\.:/@]*$`),
|
||||
valueNormalize: chain(
|
||||
normalizeMarks(),
|
||||
replaceNotIn(latin1, '_'),
|
||||
replaceNotIn(awsChars, '_'),
|
||||
),
|
||||
}
|
|
@ -0,0 +1,49 @@
|
|||
package tags
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
// TestAwsNormalizeKey verifies that allowed characters pass through
// unchanged and disallowed ones are replaced with underscores.
func TestAwsNormalizeKey(t *testing.T) {
	assert.Equal(t, "1 a b c", awsTag.NormalizeKey("1 a b c"))
	assert.Equal(t, "+-=.:/@__", awsTag.NormalizeKey("+-=.:/@?)"))
	assert.Equal(t, "test", awsTag.NormalizeKey("test"))

	// Remove marks; unicode becomes underscore.
	assert.Equal(t, "cafe _", awsTag.NormalizeKey("café 🍎"))

	// Replace forbidden characters with underscore.
	assert.Equal(t, "cafe __", awsTag.NormalizeKey("café 🍎?"))
}
|
||||
|
||||
// TestAwsNormalizeValue mirrors TestAwsNormalizeKey for tag values;
// AWS uses the same normalization chain for keys and values.
func TestAwsNormalizeValue(t *testing.T) {
	assert.Equal(t, "1 a b c", awsTag.NormalizeValue("1 a b c"))
	assert.Equal(t, "+-=.:/@__", awsTag.NormalizeValue("+-=.:/@?)"))
	assert.Equal(t, "test", awsTag.NormalizeValue("test"))

	// Remove marks; unicode becomes underscore.
	assert.Equal(t, "cafe _", awsTag.NormalizeValue("café 🍎"))

	// Replace forbidden characters with underscore.
	assert.Equal(t, "cafe __", awsTag.NormalizeValue("café 🍎?"))
}
|
||||
|
||||
// TestAwsValidateKey exercises each validation failure mode (empty,
// too long, non-Latin1, pattern mismatch) and confirms that a
// normalized key always validates.
func TestAwsValidateKey(t *testing.T) {
	assert.ErrorContains(t, awsTag.ValidateKey(""), "not be empty")
	assert.ErrorContains(t, awsTag.ValidateKey(strings.Repeat("a", 512)), "length")
	assert.ErrorContains(t, awsTag.ValidateKey("café 🍎"), "latin")
	assert.ErrorContains(t, awsTag.ValidateKey("????"), "pattern")
	assert.NoError(t, awsTag.ValidateKey(strings.Repeat("a", 127)))
	assert.NoError(t, awsTag.ValidateKey(awsTag.NormalizeKey("café 🍎")))
}
|
||||
|
||||
// TestAwsValidateValue exercises value validation; unlike keys,
// an empty value is allowed, so there is no empty-string case.
func TestAwsValidateValue(t *testing.T) {
	assert.ErrorContains(t, awsTag.ValidateValue(strings.Repeat("a", 512)), "length")
	assert.ErrorContains(t, awsTag.ValidateValue("café 🍎"), "latin1")
	assert.ErrorContains(t, awsTag.ValidateValue("????"), "pattern")
	assert.NoError(t, awsTag.ValidateValue(strings.Repeat("a", 127)))
	assert.NoError(t, awsTag.ValidateValue(awsTag.NormalizeValue("café 🍎")))
}
|
|
@ -0,0 +1,25 @@
|
|||
package tags
|
||||
|
||||
import (
|
||||
"regexp"
|
||||
|
||||
"golang.org/x/text/unicode/rangetable"
|
||||
)
|
||||
|
||||
// All characters that may not be used in Azure tag keys.
// Azure specifies forbidden characters rather than allowed ones,
// so normalization replaces members of this set (see azureTag).
var azureForbiddenChars = rangetable.New('<', '>', '*', '&', '%', ';', '\\', '/', '+', '?')
|
||||
|
||||
// azureTag implements the tag conventions for Azure: keys are up to
// 512 characters and must avoid azureForbiddenChars; values are up
// to 256 characters and otherwise unrestricted (beyond Latin1).
var azureTag = &tag{
	keyLength: 512,
	// The same set as azureForbiddenChars, expressed as a negated
	// character class.
	keyPattern: regexp.MustCompile(`^[^<>\*&%;\\\/\+\?]*$`),
	keyNormalize: chain(
		// Replace characters outside Latin1 with '_'.
		replaceNotIn(latin1, '_'),
		// Replace the forbidden characters themselves with '_'.
		replaceIn(azureForbiddenChars, '_'),
	),

	valueLength: 256,
	// NOTE(review): in Go regexp `.` does not match '\n', so a value
	// containing a newline fails validation and is not repaired by
	// valueNormalize — confirm whether newlines should be allowed.
	valuePattern: regexp.MustCompile(`^.*$`),
	valueNormalize: chain(
		replaceNotIn(latin1, '_'),
	),
}
|
|
@ -0,0 +1,34 @@
|
|||
package tags
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
// TestAzureNormalizeKey verifies that non-Latin1 and forbidden
// characters become underscores; note 'é' is Latin1 and is kept.
func TestAzureNormalizeKey(t *testing.T) {
	assert.Equal(t, "test", azureTag.NormalizeKey("test"))
	assert.Equal(t, "café __", azureTag.NormalizeKey("café 🍎?"))
}
|
||||
|
||||
// TestAzureNormalizeValue verifies that only non-Latin1 characters
// are replaced in values; '?' is allowed in values (unlike keys).
func TestAzureNormalizeValue(t *testing.T) {
	assert.Equal(t, "test", azureTag.NormalizeValue("test"))
	assert.Equal(t, "café _?", azureTag.NormalizeValue("café 🍎?"))
}
|
||||
|
||||
// TestAzureValidateKey exercises each key validation failure mode
// and confirms that a normalized key always validates.
func TestAzureValidateKey(t *testing.T) {
	assert.ErrorContains(t, azureTag.ValidateKey(""), "not be empty")
	assert.ErrorContains(t, azureTag.ValidateKey(strings.Repeat("a", 513)), "length")
	assert.ErrorContains(t, azureTag.ValidateKey("café 🍎"), "latin")
	assert.ErrorContains(t, azureTag.ValidateKey("????"), "pattern")
	assert.NoError(t, azureTag.ValidateKey(strings.Repeat("a", 127)))
	assert.NoError(t, azureTag.ValidateKey(azureTag.NormalizeKey("café 🍎")))
}
|
||||
|
||||
// TestAzureValidateValue exercises value validation; values have no
// pattern restriction, so only length and Latin1 failures apply.
func TestAzureValidateValue(t *testing.T) {
	assert.ErrorContains(t, azureTag.ValidateValue(strings.Repeat("a", 513)), "length")
	assert.ErrorContains(t, azureTag.ValidateValue("café 🍎"), "latin")
	assert.NoError(t, azureTag.ValidateValue(strings.Repeat("a", 127)))
	assert.NoError(t, azureTag.ValidateValue(azureTag.NormalizeValue("café 🍎")))
}
|
|
@ -0,0 +1,32 @@
|
|||
package tags
|
||||
|
||||
import "github.com/databricks/databricks-sdk-go/config"
|
||||
|
||||
// Cloud abstracts the tag validation and normalization rules of a
// cloud provider (AWS, Azure, or GCP).
type Cloud interface {
	// ValidateKey checks if a tag key can be used with the cloud provider.
	ValidateKey(key string) error

	// ValidateValue checks if a tag value can be used with the cloud provider.
	ValidateValue(value string) error

	// NormalizeKey normalizes a tag key for the cloud provider.
	NormalizeKey(key string) string

	// NormalizeValue normalizes a tag value for the cloud provider.
	NormalizeValue(value string) string
}
|
||||
|
||||
func ForCloud(cfg *config.Config) Cloud {
|
||||
var t *tag
|
||||
switch {
|
||||
case cfg.IsAws():
|
||||
t = awsTag
|
||||
case cfg.IsAzure():
|
||||
t = azureTag
|
||||
case cfg.IsGcp():
|
||||
t = gcpTag
|
||||
default:
|
||||
panic("unknown cloud provider")
|
||||
}
|
||||
return t
|
||||
}
|
|
@ -0,0 +1,32 @@
|
|||
package tags
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/databricks/databricks-sdk-go/config"
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
// TestForCloudAws verifies that an AWS workspace host resolves to
// the AWS tag conventions.
func TestForCloudAws(t *testing.T) {
	c := &config.Config{
		Host: "https://dbc-XXXXXXXX-YYYY.cloud.databricks.com/",
	}

	assert.Equal(t, awsTag, ForCloud(c))
}
|
||||
|
||||
// TestForCloudAzure verifies that an Azure workspace host resolves
// to the Azure tag conventions.
func TestForCloudAzure(t *testing.T) {
	c := &config.Config{
		Host: "https://adb-xxx.y.azuredatabricks.net/",
	}

	assert.Equal(t, azureTag, ForCloud(c))
}
|
||||
|
||||
// TestForCloudGcp verifies that a GCP workspace host resolves to
// the GCP tag conventions.
func TestForCloudGcp(t *testing.T) {
	c := &config.Config{
		Host: "https://123.4.gcp.databricks.com/",
	}

	assert.Equal(t, gcpTag, ForCloud(c))
}
|
|
@ -0,0 +1,63 @@
|
|||
package tags
|
||||
|
||||
import (
|
||||
"regexp"
|
||||
"unicode"
|
||||
)
|
||||
|
||||
// Tag keys and values on GCP are limited to 63 characters and must match the
// regular expression `^([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]$`.
// For normalization, we define one table for the outer characters and
// one table for the inner characters. The outer table is used to trim
// leading and trailing characters, and the inner table is used to
// replace invalid characters with underscores.

// gcpOuter holds the characters allowed at the first and last
// position of a GCP tag key or value: ASCII letters and digits.
var gcpOuter = &unicode.RangeTable{
	R16: []unicode.Range16{
		// 0-9
		{0x0030, 0x0039, 1},
		// A-Z
		{0x0041, 0x005A, 1},
		// a-z
		{0x0061, 0x007A, 1},
	},
	// All three ranges lie below U+0100, which lets [unicode.Is]
	// take the fast path for Latin1 input.
	LatinOffset: 3,
}
||||
|
||||
// gcpInner holds the characters allowed between the first and last
// position of a GCP tag key or value: the outer set plus '-', '.',
// and '_'.
var gcpInner = &unicode.RangeTable{
	R16: []unicode.Range16{
		// Hyphen-minus (dash)
		{0x002D, 0x002D, 1},
		// Full stop (period)
		{0x002E, 0x002E, 1},
		// 0-9
		{0x0030, 0x0039, 1},
		// A-Z
		{0x0041, 0x005A, 1},
		// Low line (underscore)
		{0x005F, 0x005F, 1},
		// a-z
		{0x0061, 0x007A, 1},
	},
	// All six ranges lie below U+0100.
	LatinOffset: 6,
}
|
||||
|
||||
// gcpTag implements the tag conventions for GCP: keys and values are
// limited to 63 characters, must start and end with an alphanumeric
// character, and may contain '-', '_', and '.' in between.
var gcpTag = &tag{
	keyLength: 63,
	keyPattern: regexp.MustCompile(`^([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]$`),
	keyNormalize: chain(
		// Strip combining marks (é -> e) before the Latin1 filter.
		normalizeMarks(),
		// Replace characters outside Latin1 with '_'.
		replaceNotIn(latin1, '_'),
		// Replace characters not valid inside a tag with '_'.
		replaceNotIn(gcpInner, '_'),
		// Trim leading/trailing characters not valid at the edges.
		trimIfNotIn(gcpOuter),
	),

	valueLength: 63,
	// Unlike the key pattern, the value pattern has an extra optional
	// group so the empty string is a valid value.
	valuePattern: regexp.MustCompile(`^(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])?$`),
	valueNormalize: chain(
		normalizeMarks(),
		replaceNotIn(latin1, '_'),
		replaceNotIn(gcpInner, '_'),
		trimIfNotIn(gcpOuter),
	),
}
|
|
@ -0,0 +1,65 @@
|
|||
package tags
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"testing"
|
||||
"unicode"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
// TestGcpOuter checks membership of the boundary-character table:
// alphanumerics are in; '-', '.', '_' (inner-only) and '!' are out.
func TestGcpOuter(t *testing.T) {
	assert.True(t, unicode.In('A', gcpOuter))
	assert.True(t, unicode.In('Z', gcpOuter))
	assert.True(t, unicode.In('a', gcpOuter))
	assert.True(t, unicode.In('z', gcpOuter))
	assert.True(t, unicode.In('0', gcpOuter))
	assert.True(t, unicode.In('9', gcpOuter))
	assert.False(t, unicode.In('-', gcpOuter))
	assert.False(t, unicode.In('.', gcpOuter))
	assert.False(t, unicode.In('_', gcpOuter))
	assert.False(t, unicode.In('!', gcpOuter))
}
|
||||
|
||||
// TestGcpInner checks membership of the interior-character table:
// alphanumerics plus '-', '.', '_' are in; '!' is out.
func TestGcpInner(t *testing.T) {
	assert.True(t, unicode.In('A', gcpInner))
	assert.True(t, unicode.In('Z', gcpInner))
	assert.True(t, unicode.In('a', gcpInner))
	assert.True(t, unicode.In('z', gcpInner))
	assert.True(t, unicode.In('0', gcpInner))
	assert.True(t, unicode.In('9', gcpInner))
	assert.True(t, unicode.In('-', gcpInner))
	assert.True(t, unicode.In('.', gcpInner))
	assert.True(t, unicode.In('_', gcpInner))
	assert.False(t, unicode.In('!', gcpInner))
}
|
||||
|
||||
// TestGcpNormalizeKey verifies mark removal, underscore substitution,
// and trimming of invalid leading/trailing characters.
func TestGcpNormalizeKey(t *testing.T) {
	assert.Equal(t, "test", gcpTag.NormalizeKey("test"))
	assert.Equal(t, "cafe", gcpTag.NormalizeKey("café 🍎?"))
	assert.Equal(t, "cafe_foo", gcpTag.NormalizeKey("__café_foo__"))
}
|
||||
|
||||
// TestGcpNormalizeValue mirrors TestGcpNormalizeKey; GCP uses the
// same normalization chain for keys and values.
func TestGcpNormalizeValue(t *testing.T) {
	assert.Equal(t, "test", gcpTag.NormalizeValue("test"))
	assert.Equal(t, "cafe", gcpTag.NormalizeValue("café 🍎?"))
	assert.Equal(t, "cafe_foo", gcpTag.NormalizeValue("__café_foo__"))
}
|
||||
|
||||
// TestGcpValidateKey exercises each key validation failure mode and
// confirms that a normalized key always validates.
func TestGcpValidateKey(t *testing.T) {
	assert.ErrorContains(t, gcpTag.ValidateKey(""), "not be empty")
	assert.ErrorContains(t, gcpTag.ValidateKey(strings.Repeat("a", 64)), "length")
	assert.ErrorContains(t, gcpTag.ValidateKey("café 🍎"), "latin")
	assert.ErrorContains(t, gcpTag.ValidateKey("????"), "pattern")
	assert.NoError(t, gcpTag.ValidateKey(strings.Repeat("a", 32)))
	assert.NoError(t, gcpTag.ValidateKey(gcpTag.NormalizeKey("café 🍎")))
}
|
||||
|
||||
// TestGcpValidateValue exercises value validation; the empty value
// is permitted, so there is no empty-string case.
func TestGcpValidateValue(t *testing.T) {
	assert.ErrorContains(t, gcpTag.ValidateValue(strings.Repeat("a", 64)), "length")
	assert.ErrorContains(t, gcpTag.ValidateValue("café 🍎"), "latin")
	assert.ErrorContains(t, gcpTag.ValidateValue("????"), "pattern")
	assert.NoError(t, gcpTag.ValidateValue(strings.Repeat("a", 32)))
	assert.NoError(t, gcpTag.ValidateValue(gcpTag.NormalizeValue("café 🍎")))
}
|
|
@ -0,0 +1,11 @@
|
|||
package tags
|
||||
|
||||
import "unicode"
|
||||
|
||||
// Range table for all characters in the Latin1 character set.
// It covers code points U+0000 through U+00FF inclusive.
var latin1 = &unicode.RangeTable{
	R16: []unicode.Range16{
		{0x0000, 0x00ff, 1},
	},
	// The single range lies entirely below U+0100.
	LatinOffset: 1,
}
|
|
@ -0,0 +1,16 @@
|
|||
package tags
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"unicode"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
// TestLatinTable checks both boundaries of the Latin1 table:
// U+0000 and U+00FF are in, U+0100 is out.
func TestLatinTable(t *testing.T) {
	assert.True(t, unicode.In('\u0000', latin1))
	assert.True(t, unicode.In('A', latin1))
	assert.True(t, unicode.In('Z', latin1))
	assert.True(t, unicode.In('\u00ff', latin1))
	assert.False(t, unicode.In('\u0100', latin1))
}
|
|
@ -0,0 +1,57 @@
|
|||
package tags
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"regexp"
|
||||
"strings"
|
||||
"unicode"
|
||||
)
|
||||
|
||||
// The tag type holds the validation and normalization rules for
// a cloud provider's resource tags as applied by Databricks.
type tag struct {
	// Maximum key length; compared against len(s), i.e. bytes.
	keyLength int
	// Pattern a valid key must match in full.
	keyPattern *regexp.Regexp
	// Transformation applied by NormalizeKey.
	keyNormalize transformer

	// Maximum value length; compared against len(s), i.e. bytes.
	valueLength int
	// Pattern a valid value must match in full.
	valuePattern *regexp.Regexp
	// Transformation applied by NormalizeValue.
	valueNormalize transformer
}
|
||||
|
||||
func (t *tag) ValidateKey(s string) error {
|
||||
if len(s) == 0 {
|
||||
return fmt.Errorf("key must not be empty")
|
||||
}
|
||||
if len(s) > t.keyLength {
|
||||
return fmt.Errorf("key length %d exceeds maximum of %d", len(s), t.keyLength)
|
||||
}
|
||||
if strings.ContainsFunc(s, func(r rune) bool { return !unicode.Is(latin1, r) }) {
|
||||
return fmt.Errorf("key contains non-latin1 characters")
|
||||
}
|
||||
if !t.keyPattern.MatchString(s) {
|
||||
return fmt.Errorf("key %q does not match pattern %q", s, t.keyPattern)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (t *tag) ValidateValue(s string) error {
|
||||
if len(s) > t.valueLength {
|
||||
return fmt.Errorf("value length %d exceeds maximum of %d", len(s), t.valueLength)
|
||||
}
|
||||
if strings.ContainsFunc(s, func(r rune) bool { return !unicode.Is(latin1, r) }) {
|
||||
return fmt.Errorf("value contains non-latin1 characters")
|
||||
}
|
||||
if !t.valuePattern.MatchString(s) {
|
||||
return fmt.Errorf("value %q does not match pattern %q", s, t.valuePattern)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// NormalizeKey rewrites s into a form acceptable as a tag key for
// this provider by applying the configured keyNormalize chain.
func (t *tag) NormalizeKey(s string) string {
	return t.keyNormalize.transform(s)
}
|
||||
|
||||
// NormalizeValue rewrites s into a form acceptable as a tag value for
// this provider by applying the configured valueNormalize chain.
func (t *tag) NormalizeValue(s string) string {
	return t.valueNormalize.transform(s)
}
|
|
@ -0,0 +1,87 @@
|
|||
package tags
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"unicode"
|
||||
|
||||
"golang.org/x/text/runes"
|
||||
"golang.org/x/text/transform"
|
||||
"golang.org/x/text/unicode/norm"
|
||||
)
|
||||
|
||||
// A transformer rewrites one string into another; it is the building
// block for the per-cloud normalization rules.
type transformer interface {
	transform(string) string
}

// chainTransformer applies a sequence of transformers in order.
type chainTransformer []transformer

func (c chainTransformer) transform(s string) string {
	out := s
	for _, step := range c {
		out = step.transform(out)
	}
	return out
}

// chain combines the given transformers into a single transformer
// that applies them left to right.
func chain(t ...transformer) transformer {
	return chainTransformer(t)
}
|
||||
|
||||
// Implement [transformer] interface with text/transform package.
type textTransformer struct {
	transform.Transformer
}

func (t textTransformer) transform(s string) string {
	// Errors from transform.String are deliberately dropped; on failure
	// the (possibly partially transformed) string is returned as-is.
	s, _, _ = transform.String(t, s)
	return s
}

// normalizeMarks returns a transformer that strips diacritics.
func normalizeMarks() transformer {
	// Decompose unicode characters, then remove all non-spacing marks, then recompose.
	// This turns 'é' into 'e' and 'ü' into 'u'.
	return textTransformer{
		transform.Chain(norm.NFD, runes.Remove(runes.In(unicode.Mn)), norm.NFC),
	}
}
|
||||
|
||||
// Replaces characters in the given set with replacement.
|
||||
type replaceTransformer struct {
|
||||
set runes.Set
|
||||
replacement rune
|
||||
}
|
||||
|
||||
func (t replaceTransformer) transform(s string) string {
|
||||
return strings.Map(func(r rune) rune {
|
||||
if t.set.Contains(r) {
|
||||
return t.replacement
|
||||
}
|
||||
return r
|
||||
}, s)
|
||||
}
|
||||
|
||||
func replaceIn(table *unicode.RangeTable, replacement rune) transformer {
|
||||
return replaceTransformer{runes.In(table), replacement}
|
||||
}
|
||||
|
||||
func replaceNotIn(table *unicode.RangeTable, replacement rune) transformer {
|
||||
return replaceTransformer{runes.NotIn(table), replacement}
|
||||
}
|
||||
|
||||
// Trims the given string of all characters in the given set.
|
||||
type trimTransformer struct {
|
||||
set runes.Set
|
||||
}
|
||||
|
||||
func (t trimTransformer) transform(s string) string {
|
||||
return strings.TrimFunc(s, func(r rune) bool {
|
||||
return t.set.Contains(r)
|
||||
})
|
||||
}
|
||||
|
||||
func trimIfIn(table *unicode.RangeTable) transformer {
|
||||
return trimTransformer{runes.In(table)}
|
||||
}
|
||||
|
||||
func trimIfNotIn(table *unicode.RangeTable) transformer {
|
||||
return trimTransformer{runes.NotIn(table)}
|
||||
}
|
|
@ -0,0 +1,25 @@
|
|||
package tags
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"unicode"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
// TestNormalizeMarks verifies diacritic removal: marks are stripped,
// while characters without marks (including emoji) pass through.
func TestNormalizeMarks(t *testing.T) {
	x := normalizeMarks()
	assert.Equal(t, "cafe", x.transform("café"))
	assert.Equal(t, "cafe 🍎", x.transform("café 🍎"))
	assert.Equal(t, "Foo Bar", x.transform("Foo Bar"))
}
|
||||
|
||||
// TestReplace covers both polarities: replaceIn substitutes members
// of the table, replaceNotIn substitutes everything else.
func TestReplace(t *testing.T) {
	assert.Equal(t, "___abc___", replaceIn(unicode.Digit, '_').transform("000abc999"))
	assert.Equal(t, "___000___", replaceNotIn(unicode.Digit, '_').transform("abc000abc"))
}
|
||||
|
||||
// TestTrim covers both polarities: trimIfIn strips edge runes that
// are in the table, trimIfNotIn strips edge runes that are not.
func TestTrim(t *testing.T) {
	assert.Equal(t, "abc", trimIfIn(unicode.Digit).transform("000abc999"))
	assert.Equal(t, "000", trimIfNotIn(unicode.Digit).transform("abc000abc"))
}
|
Loading…
Reference in New Issue