Library to validate and normalize cloud-specific tags (#819)

## Changes

Prompted by the proposed fix for a tagging-related problem in #810, I
investigated how tag validation works. This turned out to be quite a bit
more complex than anticipated. Tags at the job level (or cluster level)
are passed through to the underlying compute infrastructure and as such
are tested against cloud-specific validation rules. GCP appears to be
the most restrictive. It would be disappointing to always restrict to
`\w+`, so this package implements validation and normalization rules for
each cloud. It picks the right cloud to use based on a Go SDK
configuration.
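
For context, a minimal sketch of the intended call pattern (the import path is inferred from the file layout in this diff, and the host value is just an example):

```go
package main

import (
	"fmt"

	// Import path inferred from the file layout in this diff.
	"github.com/databricks/cli/libs/tags"

	"github.com/databricks/databricks-sdk-go/config"
)

func main() {
	// Select validation/normalization rules for the workspace's cloud.
	cloud := tags.ForCloud(&config.Config{
		Host: "https://dbc-XXXXXXXX-YYYY.cloud.databricks.com/", // AWS-style host
	})

	// Normalize a user-provided key, then check the result.
	key := cloud.NormalizeKey("café 🍎")
	fmt.Println(key)                    // "cafe _"
	fmt.Println(cloud.ValidateKey(key)) // nil
}
```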

## Tests

Exhaustive unit tests. The regular expressions were pulled from #814.
Pieter Noordhuis, 2023-09-29 10:49:08 +02:00

13 changed files with 532 additions and 0 deletions

## libs/tags/aws.go

```go
package tags

import (
	"regexp"
	"unicode"

	"golang.org/x/text/unicode/rangetable"
)

// The union of all characters allowed in AWS tags.
// This must be used only after filtering out non-Latin1 characters,
// because the [unicode] classes include non-Latin1 characters.
var awsChars = rangetable.Merge(
	unicode.Digit,
	unicode.Space,
	unicode.Letter,
	rangetable.New('+', '-', '=', '.', ':', '/', '@'),
)

var awsTag = &tag{
	keyLength:  127,
	keyPattern: regexp.MustCompile(`^[\d \w\+\-=\.:\/@]*$`),
	keyNormalize: chain(
		normalizeMarks(),
		replaceNotIn(latin1, '_'),
		replaceNotIn(awsChars, '_'),
	),
	valueLength:  255,
	valuePattern: regexp.MustCompile(`^[\d \w\+\-=\.:/@]*$`),
	valueNormalize: chain(
		normalizeMarks(),
		replaceNotIn(latin1, '_'),
		replaceNotIn(awsChars, '_'),
	),
}
```

## libs/tags/aws_test.go

```go
package tags

import (
	"strings"
	"testing"

	"github.com/stretchr/testify/assert"
)

func TestAwsNormalizeKey(t *testing.T) {
	assert.Equal(t, "1 a b c", awsTag.NormalizeKey("1 a b c"))
	assert.Equal(t, "+-=.:/@__", awsTag.NormalizeKey("+-=.:/@?)"))
	assert.Equal(t, "test", awsTag.NormalizeKey("test"))

	// Remove marks; unicode becomes underscore.
	assert.Equal(t, "cafe _", awsTag.NormalizeKey("café 🍎"))

	// Replace forbidden characters with underscore.
	assert.Equal(t, "cafe __", awsTag.NormalizeKey("café 🍎?"))
}

func TestAwsNormalizeValue(t *testing.T) {
	assert.Equal(t, "1 a b c", awsTag.NormalizeValue("1 a b c"))
	assert.Equal(t, "+-=.:/@__", awsTag.NormalizeValue("+-=.:/@?)"))
	assert.Equal(t, "test", awsTag.NormalizeValue("test"))

	// Remove marks; unicode becomes underscore.
	assert.Equal(t, "cafe _", awsTag.NormalizeValue("café 🍎"))

	// Replace forbidden characters with underscore.
	assert.Equal(t, "cafe __", awsTag.NormalizeValue("café 🍎?"))
}

func TestAwsValidateKey(t *testing.T) {
	assert.ErrorContains(t, awsTag.ValidateKey(""), "not be empty")
	assert.ErrorContains(t, awsTag.ValidateKey(strings.Repeat("a", 512)), "length")
	assert.ErrorContains(t, awsTag.ValidateKey("café 🍎"), "latin")
	assert.ErrorContains(t, awsTag.ValidateKey("????"), "pattern")
	assert.NoError(t, awsTag.ValidateKey(strings.Repeat("a", 127)))
	assert.NoError(t, awsTag.ValidateKey(awsTag.NormalizeKey("café 🍎")))
}

func TestAwsValidateValue(t *testing.T) {
	assert.ErrorContains(t, awsTag.ValidateValue(strings.Repeat("a", 512)), "length")
	assert.ErrorContains(t, awsTag.ValidateValue("café 🍎"), "latin1")
	assert.ErrorContains(t, awsTag.ValidateValue("????"), "pattern")
	assert.NoError(t, awsTag.ValidateValue(strings.Repeat("a", 127)))
	assert.NoError(t, awsTag.ValidateValue(awsTag.NormalizeValue("café 🍎")))
}
```

## libs/tags/azure.go

```go
package tags

import (
	"regexp"

	"golang.org/x/text/unicode/rangetable"
)

// All characters that may not be used in Azure tag keys.
var azureForbiddenChars = rangetable.New('<', '>', '*', '&', '%', ';', '\\', '/', '+', '?')

var azureTag = &tag{
	keyLength:  512,
	keyPattern: regexp.MustCompile(`^[^<>\*&%;\\\/\+\?]*$`),
	keyNormalize: chain(
		replaceNotIn(latin1, '_'),
		replaceIn(azureForbiddenChars, '_'),
	),
	valueLength:  256,
	valuePattern: regexp.MustCompile(`^.*$`),
	valueNormalize: chain(
		replaceNotIn(latin1, '_'),
	),
}
```

## libs/tags/azure_test.go

```go
package tags

import (
	"strings"
	"testing"

	"github.com/stretchr/testify/assert"
)

func TestAzureNormalizeKey(t *testing.T) {
	assert.Equal(t, "test", azureTag.NormalizeKey("test"))
	assert.Equal(t, "café __", azureTag.NormalizeKey("café 🍎?"))
}

func TestAzureNormalizeValue(t *testing.T) {
	assert.Equal(t, "test", azureTag.NormalizeValue("test"))
	assert.Equal(t, "café _?", azureTag.NormalizeValue("café 🍎?"))
}

func TestAzureValidateKey(t *testing.T) {
	assert.ErrorContains(t, azureTag.ValidateKey(""), "not be empty")
	assert.ErrorContains(t, azureTag.ValidateKey(strings.Repeat("a", 513)), "length")
	assert.ErrorContains(t, azureTag.ValidateKey("café 🍎"), "latin")
	assert.ErrorContains(t, azureTag.ValidateKey("????"), "pattern")
	assert.NoError(t, azureTag.ValidateKey(strings.Repeat("a", 127)))
	assert.NoError(t, azureTag.ValidateKey(azureTag.NormalizeKey("café 🍎")))
}

func TestAzureValidateValue(t *testing.T) {
	assert.ErrorContains(t, azureTag.ValidateValue(strings.Repeat("a", 513)), "length")
	assert.ErrorContains(t, azureTag.ValidateValue("café 🍎"), "latin")
	assert.NoError(t, azureTag.ValidateValue(strings.Repeat("a", 127)))
	assert.NoError(t, azureTag.ValidateValue(azureTag.NormalizeValue("café 🍎")))
}
```

## libs/tags/cloud.go

```go
package tags

import "github.com/databricks/databricks-sdk-go/config"

type Cloud interface {
	// ValidateKey checks if a tag key can be used with the cloud provider.
	ValidateKey(key string) error

	// ValidateValue checks if a tag value can be used with the cloud provider.
	ValidateValue(value string) error

	// NormalizeKey normalizes a tag key for the cloud provider.
	NormalizeKey(key string) string

	// NormalizeValue normalizes a tag value for the cloud provider.
	NormalizeValue(value string) string
}

func ForCloud(cfg *config.Config) Cloud {
	var t *tag
	switch {
	case cfg.IsAws():
		t = awsTag
	case cfg.IsAzure():
		t = azureTag
	case cfg.IsGcp():
		t = gcpTag
	default:
		panic("unknown cloud provider")
	}
	return t
}
```

## libs/tags/cloud_test.go

```go
package tags

import (
	"testing"

	"github.com/databricks/databricks-sdk-go/config"
	"github.com/stretchr/testify/assert"
)

func TestForCloudAws(t *testing.T) {
	c := &config.Config{
		Host: "https://dbc-XXXXXXXX-YYYY.cloud.databricks.com/",
	}
	assert.Equal(t, awsTag, ForCloud(c))
}

func TestForCloudAzure(t *testing.T) {
	c := &config.Config{
		Host: "https://adb-xxx.y.azuredatabricks.net/",
	}
	assert.Equal(t, azureTag, ForCloud(c))
}

func TestForCloudGcp(t *testing.T) {
	c := &config.Config{
		Host: "https://123.4.gcp.databricks.com/",
	}
	assert.Equal(t, gcpTag, ForCloud(c))
}
```

## libs/tags/gcp.go

```go
package tags

import (
	"regexp"
	"unicode"
)

// Tag keys and values on GCP are limited to 63 characters and must match the
// regular expression `^([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]$`.
// For normalization, we define one table for the outer characters and
// one table for the inner characters. The outer table is used to trim
// leading and trailing characters, and the inner table is used to
// replace invalid characters with underscores.
var gcpOuter = &unicode.RangeTable{
	R16: []unicode.Range16{
		// 0-9
		{0x0030, 0x0039, 1},
		// A-Z
		{0x0041, 0x005A, 1},
		// a-z
		{0x0061, 0x007A, 1},
	},
	LatinOffset: 3,
}

var gcpInner = &unicode.RangeTable{
	R16: []unicode.Range16{
		// Hyphen-minus (dash)
		{0x002D, 0x002D, 1},
		// Full stop (period)
		{0x002E, 0x002E, 1},
		// 0-9
		{0x0030, 0x0039, 1},
		// A-Z
		{0x0041, 0x005A, 1},
		// Low line (underscore)
		{0x005F, 0x005F, 1},
		// a-z
		{0x0061, 0x007A, 1},
	},
	LatinOffset: 6,
}

var gcpTag = &tag{
	keyLength:  63,
	keyPattern: regexp.MustCompile(`^([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]$`),
	keyNormalize: chain(
		normalizeMarks(),
		replaceNotIn(latin1, '_'),
		replaceNotIn(gcpInner, '_'),
		trimIfNotIn(gcpOuter),
	),
	valueLength:  63,
	valuePattern: regexp.MustCompile(`^(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])?$`),
	valueNormalize: chain(
		normalizeMarks(),
		replaceNotIn(latin1, '_'),
		replaceNotIn(gcpInner, '_'),
		trimIfNotIn(gcpOuter),
	),
}
```
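
To make the two-table scheme concrete, here is a worked trace of the key normalizer on one of the test inputs. This is a hypothetical test added for illustration only; it is not part of the diff:

```go
package tags

import "testing"

// TestGcpKeyNormalizeTrace documents each step of gcpTag.keyNormalize
// on "__café_foo__" (hypothetical test, for illustration):
//
//	normalizeMarks():            "__cafe_foo__"  // 'é' loses its accent mark
//	replaceNotIn(latin1, '_'):   "__cafe_foo__"  // all runes are already Latin1
//	replaceNotIn(gcpInner, '_'): "__cafe_foo__"  // all runes are in the inner table
//	trimIfNotIn(gcpOuter):       "cafe_foo"      // '_' is not in the outer table, so it is trimmed
func TestGcpKeyNormalizeTrace(t *testing.T) {
	if got := gcpTag.NormalizeKey("__café_foo__"); got != "cafe_foo" {
		t.Errorf("got %q, want %q", got, "cafe_foo")
	}
}
```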

## libs/tags/gcp_test.go

```go
package tags

import (
	"strings"
	"testing"
	"unicode"

	"github.com/stretchr/testify/assert"
)

func TestGcpOuter(t *testing.T) {
	assert.True(t, unicode.In('A', gcpOuter))
	assert.True(t, unicode.In('Z', gcpOuter))
	assert.True(t, unicode.In('a', gcpOuter))
	assert.True(t, unicode.In('z', gcpOuter))
	assert.True(t, unicode.In('0', gcpOuter))
	assert.True(t, unicode.In('9', gcpOuter))
	assert.False(t, unicode.In('-', gcpOuter))
	assert.False(t, unicode.In('.', gcpOuter))
	assert.False(t, unicode.In('_', gcpOuter))
	assert.False(t, unicode.In('!', gcpOuter))
}

func TestGcpInner(t *testing.T) {
	assert.True(t, unicode.In('A', gcpInner))
	assert.True(t, unicode.In('Z', gcpInner))
	assert.True(t, unicode.In('a', gcpInner))
	assert.True(t, unicode.In('z', gcpInner))
	assert.True(t, unicode.In('0', gcpInner))
	assert.True(t, unicode.In('9', gcpInner))
	assert.True(t, unicode.In('-', gcpInner))
	assert.True(t, unicode.In('.', gcpInner))
	assert.True(t, unicode.In('_', gcpInner))
	assert.False(t, unicode.In('!', gcpInner))
}

func TestGcpNormalizeKey(t *testing.T) {
	assert.Equal(t, "test", gcpTag.NormalizeKey("test"))
	assert.Equal(t, "cafe", gcpTag.NormalizeKey("café 🍎?"))
	assert.Equal(t, "cafe_foo", gcpTag.NormalizeKey("__café_foo__"))
}

func TestGcpNormalizeValue(t *testing.T) {
	assert.Equal(t, "test", gcpTag.NormalizeValue("test"))
	assert.Equal(t, "cafe", gcpTag.NormalizeValue("café 🍎?"))
	assert.Equal(t, "cafe_foo", gcpTag.NormalizeValue("__café_foo__"))
}

func TestGcpValidateKey(t *testing.T) {
	assert.ErrorContains(t, gcpTag.ValidateKey(""), "not be empty")
	assert.ErrorContains(t, gcpTag.ValidateKey(strings.Repeat("a", 64)), "length")
	assert.ErrorContains(t, gcpTag.ValidateKey("café 🍎"), "latin")
	assert.ErrorContains(t, gcpTag.ValidateKey("????"), "pattern")
	assert.NoError(t, gcpTag.ValidateKey(strings.Repeat("a", 32)))
	assert.NoError(t, gcpTag.ValidateKey(gcpTag.NormalizeKey("café 🍎")))
}

func TestGcpValidateValue(t *testing.T) {
	assert.ErrorContains(t, gcpTag.ValidateValue(strings.Repeat("a", 64)), "length")
	assert.ErrorContains(t, gcpTag.ValidateValue("café 🍎"), "latin")
	assert.ErrorContains(t, gcpTag.ValidateValue("????"), "pattern")
	assert.NoError(t, gcpTag.ValidateValue(strings.Repeat("a", 32)))
	assert.NoError(t, gcpTag.ValidateValue(gcpTag.NormalizeValue("café 🍎")))
}
```

## libs/tags/latin.go

```go
package tags

import "unicode"

// Range table for all characters in the Latin1 character set.
var latin1 = &unicode.RangeTable{
	R16: []unicode.Range16{
		{0x0000, 0x00ff, 1},
	},
	LatinOffset: 1,
}
```

## libs/tags/latin_test.go

```go
package tags

import (
	"testing"
	"unicode"

	"github.com/stretchr/testify/assert"
)

func TestLatinTable(t *testing.T) {
	assert.True(t, unicode.In('\u0000', latin1))
	assert.True(t, unicode.In('A', latin1))
	assert.True(t, unicode.In('Z', latin1))
	assert.True(t, unicode.In('\u00ff', latin1))
	assert.False(t, unicode.In('\u0100', latin1))
}
```

## libs/tags/tag.go

```go
package tags

import (
	"fmt"
	"regexp"
	"strings"
	"unicode"
)

// The tag type holds the validation and normalization rules for
// a cloud provider's resource tags as applied by Databricks.
type tag struct {
	keyLength      int
	keyPattern     *regexp.Regexp
	keyNormalize   transformer
	valueLength    int
	valuePattern   *regexp.Regexp
	valueNormalize transformer
}

func (t *tag) ValidateKey(s string) error {
	if len(s) == 0 {
		return fmt.Errorf("key must not be empty")
	}
	if len(s) > t.keyLength {
		return fmt.Errorf("key length %d exceeds maximum of %d", len(s), t.keyLength)
	}
	if strings.ContainsFunc(s, func(r rune) bool { return !unicode.Is(latin1, r) }) {
		return fmt.Errorf("key contains non-latin1 characters")
	}
	if !t.keyPattern.MatchString(s) {
		return fmt.Errorf("key %q does not match pattern %q", s, t.keyPattern)
	}
	return nil
}

func (t *tag) ValidateValue(s string) error {
	if len(s) > t.valueLength {
		return fmt.Errorf("value length %d exceeds maximum of %d", len(s), t.valueLength)
	}
	if strings.ContainsFunc(s, func(r rune) bool { return !unicode.Is(latin1, r) }) {
		return fmt.Errorf("value contains non-latin1 characters")
	}
	if !t.valuePattern.MatchString(s) {
		return fmt.Errorf("value %q does not match pattern %q", s, t.valuePattern)
	}
	return nil
}

func (t *tag) NormalizeKey(s string) string {
	return t.keyNormalize.transform(s)
}

func (t *tag) NormalizeValue(s string) string {
	return t.valueNormalize.transform(s)
}
```

## libs/tags/transform.go

```go
package tags

import (
	"strings"
	"unicode"

	"golang.org/x/text/runes"
	"golang.org/x/text/transform"
	"golang.org/x/text/unicode/norm"
)

type transformer interface {
	transform(string) string
}

type chainTransformer []transformer

func (c chainTransformer) transform(s string) string {
	for _, t := range c {
		s = t.transform(s)
	}
	return s
}

func chain(t ...transformer) transformer {
	return chainTransformer(t)
}

// Implement [transformer] interface with text/transform package.
type textTransformer struct {
	transform.Transformer
}

func (t textTransformer) transform(s string) string {
	s, _, _ = transform.String(t, s)
	return s
}

func normalizeMarks() transformer {
	// Decompose unicode characters, then remove all non-spacing marks, then recompose.
	// This turns 'é' into 'e' and 'ü' into 'u'.
	return textTransformer{
		transform.Chain(norm.NFD, runes.Remove(runes.In(unicode.Mn)), norm.NFC),
	}
}

// Replaces characters in the given set with replacement.
type replaceTransformer struct {
	set         runes.Set
	replacement rune
}

func (t replaceTransformer) transform(s string) string {
	return strings.Map(func(r rune) rune {
		if t.set.Contains(r) {
			return t.replacement
		}
		return r
	}, s)
}

func replaceIn(table *unicode.RangeTable, replacement rune) transformer {
	return replaceTransformer{runes.In(table), replacement}
}

func replaceNotIn(table *unicode.RangeTable, replacement rune) transformer {
	return replaceTransformer{runes.NotIn(table), replacement}
}

// Trims the given string of all characters in the given set.
type trimTransformer struct {
	set runes.Set
}

func (t trimTransformer) transform(s string) string {
	return strings.TrimFunc(s, func(r rune) bool {
		return t.set.Contains(r)
	})
}

func trimIfIn(table *unicode.RangeTable) transformer {
	return trimTransformer{runes.In(table)}
}

func trimIfNotIn(table *unicode.RangeTable) transformer {
	return trimTransformer{runes.NotIn(table)}
}
```
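
The tests below exercise the transformers one at a time; `chain` itself is only exercised through the tag definitions. As a composition example, here is a hypothetical normalizer (illustration only, inside the same package since `chain` and friends are unexported):

```go
package tags

import (
	"unicode"

	"golang.org/x/text/unicode/rangetable"
)

// A hypothetical normalizer, for illustration only: strip accents,
// then replace anything that is not a letter or digit with a dash.
// chain applies its transformers left to right.
var exampleNormalize = chain(
	normalizeMarks(),
	replaceNotIn(rangetable.Merge(unicode.Letter, unicode.Digit), '-'),
)

// exampleNormalize.transform("café 🍎") yields "cafe--": the accent is
// removed first, then the space and the emoji are replaced with '-'.
```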

## libs/tags/transform_test.go

```go
package tags

import (
	"testing"
	"unicode"

	"github.com/stretchr/testify/assert"
)

func TestNormalizeMarks(t *testing.T) {
	x := normalizeMarks()
	assert.Equal(t, "cafe", x.transform("café"))
	assert.Equal(t, "cafe 🍎", x.transform("café 🍎"))
	assert.Equal(t, "Foo Bar", x.transform("Foo Bar"))
}

func TestReplace(t *testing.T) {
	assert.Equal(t, "___abc___", replaceIn(unicode.Digit, '_').transform("000abc999"))
	assert.Equal(t, "___000___", replaceNotIn(unicode.Digit, '_').transform("abc000abc"))
}

func TestTrim(t *testing.T) {
	assert.Equal(t, "abc", trimIfIn(unicode.Digit).transform("000abc999"))
	assert.Equal(t, "000", trimIfNotIn(unicode.Digit).transform("abc000abc"))
}
```