diff --git a/go.mod b/go.mod index 080cdcfd8e..fade96d4cc 100644 --- a/go.mod +++ b/go.mod @@ -19,6 +19,7 @@ require ( github.com/spf13/viper v1.21.0 github.com/stretchr/testify v1.11.1 github.com/yosida95/uritemplate/v3 v3.0.2 + github.com/yuin/goldmark v1.8.2 ) require ( diff --git a/go.sum b/go.sum index fbf06018f7..379128d026 100644 --- a/go.sum +++ b/go.sum @@ -82,6 +82,8 @@ github.com/subosito/gotenv v1.6.0/go.mod h1:Dk4QP5c2W3ibzajGcXpNraDfq2IrhjMIvMSW github.com/yosida95/uritemplate/v3 v3.0.2 h1:Ed3Oyj9yrmi9087+NczuL5BwkIc4wvTb5zIM+UJPGz4= github.com/yosida95/uritemplate/v3 v3.0.2/go.mod h1:ILOh0sOhIJR3+L/8afwt/kE++YT040gmv5BQTMR2HP4= github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= +github.com/yuin/goldmark v1.8.2 h1:kEGpgqJXdgbkhcOgBxkC0X0PmoPG1ZyoZ117rDVp4zE= +github.com/yuin/goldmark v1.8.2/go.mod h1:ip/1k0VRfGynBgxOz0yCqHrbZXhcjxyuS66Brc7iBKg= go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc= go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= diff --git a/pkg/sanitize/sanitize.go b/pkg/sanitize/sanitize.go index e6401e4fb3..a41c184f8a 100644 --- a/pkg/sanitize/sanitize.go +++ b/pkg/sanitize/sanitize.go @@ -1,22 +1,52 @@ +// Package sanitize provides functions to sanitize untrusted user content from +// GitHub (issue titles, PR bodies, search results, etc.) before display. +// +// Threat model: +// - Input source: untrusted GitHub content that may contain malicious HTML/JS +// - Goal: strip HTML tags from prose without corrupting code samples +// - Defense: bluemonday HTML sanitizer + angle bracket protection in code blocks +// +// The sanitizer preserves angle brackets inside code (fenced blocks, inline spans, +// and indented blocks) because bluemonday treats <T>, <string>, etc. as unknown HTML +// tags and removes them. 
This would corrupt generic type syntax and other code. package sanitize import ( + "bytes" "strings" "sync" "unicode" "github.com/microcosm-cc/bluemonday" + "github.com/yuin/goldmark" + "github.com/yuin/goldmark/ast" + "github.com/yuin/goldmark/text" ) var policy *bluemonday.Policy var policyOnce sync.Once +// Sanitize removes HTML tags and invisible characters from untrusted input. +// +// Ordering invariant: FilterInvisibleCharacters must run before +// protectCodeAngleBrackets to prevent sentinel collision attacks. +// FilterInvisibleCharacters strips NUL bytes (0x00), so an attacker cannot +// inject literal "\x00LT\x00script\x00GT\x00" strings that would bypass +// FilterHTMLTags and get restored to in it.", + expected: "This has in it.", + }, + { + name: "fenced code block with generic types", + input: "Example:\n```go\nfunc Foo[T comparable](x T) {}\n```\nDone.", + expected: "Example:\n```go\nfunc Foo[T comparable](x T) {}\n```\nDone.", + }, + { + name: "multiple inline code spans with angle brackets", + input: "Compare `Map` and `Set`.", + expected: "Compare `Map` and `Set`.", + }, + { + name: "no code blocks passes through", + input: "No code here, just text.", + expected: "No code here, just text.", + }, + { + name: "sentinel collision does not bypass sanitizer", + input: "\x00LT\x00script\x00GT\x00alert(1)\x00LT\x00/script\x00GT\x00", + expected: "LTscriptGTalert(1)LT/scriptGT", // NUL bytes stripped; sentinels don't match; no