refactor(scrape): detect prose-style "must be X" discriminator values on variants

Sealed-interface union variants whose Type/Source field is declared as
bare prose (e.g. "Type of the result, must be article" or "Scope type,
must be all_private_chats") were skipped by extractEnumValues because
the existing patterns require curly-quoted values. The genapi emitter
already extracted these values via discBareRE for marshal-side
discriminator injection; lifting the same detection into the scraper
populates Field.EnumValues so planUnifiedUnionEnums folds them into
shared union-level enums automatically.

Unions newly unified (10): BotCommandScope, MenuButton, InputMedia,
InputPaidMedia, InputPollMedia, InputPollOptionMedia, InputProfilePhoto,
InputStoryContent, InlineQueryResult, PassportElementError.

InputMessageContent stays excluded — its variants dispatch
structurally on field presence and have no Type/Source field, so
planUnifiedUnionEnums correctly skips it.

Constants added: 60 typed enum constants across the 10 unions; the
corresponding variant struct fields are retyped from string to the
shared enum.

Internal call-site cleanups: 0 — no internal package referenced these
discriminator values via magic strings.

False positives the prose detector explicitly rejects: prose
continuations like "must be sent", "must be shown above", "must be
specified", "must be paid", "must be active", "must be one of 3, 6, or
12", "must be between 5 and 100000", "must be a Pay button", "must be
repainted". Clauses that continue past the candidate word are rejected
by the regex's end-of-description anchor; a prose word that does close
the description is rejected by the closed-list isProseWord filter.

Determinism verified across two consecutive make regen-from-fixture
runs. go test -race ./..., go vet ./..., staticcheck ./... all clean.
This commit is contained in:
2026-05-09 20:37:07 +01:00
parent 5523ed2b06
commit fecef22f48
7 changed files with 1082 additions and 199 deletions
+87 -1
View File
@@ -27,6 +27,33 @@ import (
// emits the canonical Markdown / MarkdownV2 / HTML triple.
//
// Returns nil when the description does not look like an enum.
// extractEnumValues inspects a field-description string and returns the
// list of wire-level string values when the description matches one of
// the enum-like patterns Telegram uses in its docs. Order follows doc
// order; duplicates are removed but order of first occurrence is kept.
//
// Handled patterns (curly quotes “…” are required for every pattern
// except the final bare-prose discriminator, to avoid false positives
// on free-text quoting):
//
// - "Type of the chat, can be either “private”, “group”, … or “channel”"
// - "Currently, can be “mention”, “hashtag”, …"
// - "Currently, one of “XTR” … or “TON” …"
// - "Currently, must be one of “XTR” …"
// - "Currently, it can be one of “pending”, “approved”, “declined”."
// - "Must be one of “danger” …, “success” …"
// - "Must be one of “image/jpeg”, “image/gif”, or “video/mp4”"
// - "Format … must be one of “static” …, “animated” …, “video” …"
// - "Currently, either “upgrade” …, “transfer” …, “resale” …"
// - "..., always “creator”"
// - parse_mode parameter special case ("Mode for parsing entities …")
// emits the canonical Markdown / MarkdownV2 / HTML triple.
// - bare prose discriminator at end of description, e.g.
// "Type of the result, must be article" or
// "Scope type, must be all_private_chats". Used by sealed-interface
// union variants whose Type/Source field carries a single literal
// value declared without curly quotes.
//
// Returns nil when the description does not look like an enum.
func extractEnumValues(jsonName, desc string) []string {
if values := parseModeEnumValues(jsonName, desc); values != nil {
return values
@@ -34,12 +61,15 @@ func extractEnumValues(jsonName, desc string) []string {
trigger, triggerEnd, isAlways := findEnumTrigger(desc)
if trigger < 0 {
return nil
return extractProseDiscriminator(desc)
}
tail := desc[trigger:]
values := collectQuotedValues(tail)
if len(values) == 0 {
if v := extractProseDiscriminator(desc); v != nil {
return v
}
return nil
}
// First quoted value must sit close to the trigger phrase (e.g.
@@ -203,3 +233,59 @@ func dedupeStrings(in []string) []string {
}
return out
}
// proseDiscRE matches a terminal "must be <ident>" clause: the
// discriminator value sits at the END of the description (optionally
// followed by a single '.' or ',' and whitespace) so multi-clause prose
// like "must be shown above the message" is not picked up.
//
// Only the "must be" trigger is matched case-insensitively; the flag is
// scoped with (?i:…) so it does not leak into the capture group. The
// captured identifier is therefore a strict snake_case wire literal:
// lowercase ASCII letters, digits, and underscores, starting with a
// letter. A capitalised tail such as "must be True" does not match.
// Lowercase prose words that still pass the pattern are filtered
// separately by isProseWord.
//
// Submatch 1 is the identifier.
var proseDiscRE = regexp.MustCompile(`(?i:\bmust be)\s+([a-z][a-z0-9_]*)\s*[.,]?\s*$`)
// extractProseDiscriminator recognises a single-value discriminator
// written as bare prose at the very end of a field description, in the
// form "..., must be <ident>" (for example "Type of the result, must be
// article" or "Scope type, must be all_private_chats").
//
// The description is whitespace-trimmed and matched against
// proseDiscRE; the first capture group is the candidate value. A
// candidate that isProseWord classifies as English filler is discarded.
//
// Returns a one-element slice holding the value, or nil when the
// description is blank, the pattern does not match, or the candidate is
// a prose word.
func extractProseDiscriminator(desc string) []string {
	trimmed := strings.TrimSpace(desc)
	if len(trimmed) == 0 {
		return nil
	}
	match := proseDiscRE.FindStringSubmatch(trimmed)
	// A nil match has length zero; checking it first keeps match[1] safe.
	if len(match) == 0 || isProseWord(match[1]) {
		return nil
	}
	return []string{match[1]}
}
// isProseWord reports whether s is an English filler word rather than a
// wire-level discriminator value. It is the second guard behind the
// "must be <ident>" pattern: tails such as "must be sent" or "must be
// available" are syntactically valid candidates but are ordinary prose.
//
// The list is a closed set of words that appear in "must be …" tails
// outside the variant-discriminator pattern. It is not a general
// English dictionary: real discriminator values can be English words
// too ("article", "photo", "video"), so only words known to occur as
// prose are listed, and a new prose tail requires a new entry here.
//
// The comparison is case-insensitive, so a capitalised candidate such
// as "Sent" or "One" is rejected the same way as its lowercase form.
func isProseWord(s string) bool {
	switch strings.ToLower(s) {
	case "a", "an", "the",
		"sent", "shown", "set", "used", "passed", "specified", "available",
		"applied", "supported", "assumed", "active", "paid", "between",
		"of", "on", "in", "at", "by", "to", "from", "for", "with",
		"and", "or", "no", "non",
		"positive", "negative",
		"administrator", "repainted",
		"one", "exactly":
		return true
	}
	return false
}
+53
View File
@@ -83,3 +83,56 @@ func TestExtractEnumValues_DedupeRepeatedValues(t *testing.T) {
got := extractEnumValues("currency", desc)
require.Equal(t, []string{"XTR"}, got)
}
func TestExtractEnumValues_ProseDiscriminator(t *testing.T) {
cases := []struct {
name string
desc string
want []string
}{
{"InlineQueryResultArticle", "Type of the result, must be article", []string{"article"}},
{"InlineQueryResultPhoto", "Type of the result, must be photo", []string{"photo"}},
{"InlineQueryResultMpeg4Gif", "Type of the result, must be mpeg4_gif", []string{"mpeg4_gif"}},
{"BotCommandScopeAllPrivateChats", "Scope type, must be all_private_chats", []string{"all_private_chats"}},
{"BotCommandScopeChat", "Scope type, must be chat", []string{"chat"}},
{"PassportElementErrorData", "Error source, must be data", []string{"data"}},
{"MenuButtonWebApp", "Type of the button, must be web_app", []string{"web_app"}},
{"InputProfilePhotoAnimated", "Type of the profile photo, must be animated", []string{"animated"}},
{"InputStoryContentVideo", "Type of the content, must be video", []string{"video"}},
{"InputPaidMediaPhoto", "Type of the media, must be photo", []string{"photo"}},
}
for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) {
require.Equal(t, tc.want, extractEnumValues("type", tc.desc))
})
}
}
func TestExtractEnumValues_ProseFalsePositives(t *testing.T) {
cases := []struct {
name string
desc string
}{
{"available_only_for", "Optional. Bot-specified invoice payload. Can be available only for “invoice_payment” transactions."},
{"must_be_sent", "If True, the message must be sent immediately."},
{"must_be_shown_above", "Optional. True, if the link preview must be shown above the message text"},
{"must_be_specified", "The identifiers must be specified in a strictly increasing order."},
{"must_be_paid", "The number of Telegram Stars that must be paid to send the sticker"},
{"must_be_one_of_numbers", "Number of months the Telegram Premium subscription will be active for the user; must be one of 3, 6, or 12"},
{"must_be_between", "Currently, price in Telegram Stars must be between 5 and 100000"},
{"must_be_a_pay_button", "If not empty, the first button must be a Pay button."},
{"must_be_repainted", "True, if the sticker must be repainted to a text color in messages"},
{"must_be_active", "the subscription must be active up to the end of the current subscription period"},
}
for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) {
require.Nil(t, extractEnumValues("type", tc.desc))
})
}
}
func TestExtractEnumValues_CanonicalMustBeOneOfStillWorks(t *testing.T) {
desc := "Currently, must be one of “Markdown”, “MarkdownV2”, “HTML”"
got := extractEnumValues("parse_mode_kind", desc)
require.Equal(t, []string{"Markdown", "MarkdownV2", "HTML"}, got)
}