mirror of
https://github.com/lukaszraczylo/go-telegram.git
synced 2026-06-10 23:09:04 +00:00
refactor(scrape): detect prose-style "must be X" discriminator values on variants
Sealed-interface union variants whose Type/Source field is declared as bare prose (e.g. "Type of the result, must be article" or "Scope type, must be all_private_chats") were skipped by extractEnumValues because the existing patterns require curly-quoted values. The genapi emitter already extracted these values via discBareRE for marshal-side discriminator injection; lifting the same detection into the scraper populates Field.EnumValues so planUnifiedUnionEnums folds them into shared union-level enums automatically. Unions newly unified (10): BotCommandScope, MenuButton, InputMedia, InputPaidMedia, InputPollMedia, InputPollOptionMedia, InputProfilePhoto, InputStoryContent, InlineQueryResult, PassportElementError. InputMessageContent stays excluded — its variants dispatch structurally on field presence and have no Type/Source field, so planUnifiedUnionEnums correctly skips it. Constants added: 60 typed enum constants across the 10 unions; the corresponding variant struct fields are retyped from string to the shared enum. Internal call-site cleanups: 0 — no internal package referenced these discriminator values via magic strings. False positives the prose detector explicitly rejects: terminal prose-word continuations like "must be sent", "must be shown above", "must be specified", "must be paid", "must be active", "must be one of 3, 6, or 12", "must be between 5 and 100000", "must be a Pay button", "must be repainted". Guarded via terminal-position regex anchor + closed-list isProseWord filter. Determinism verified across two consecutive make regen-from-fixture runs. go test -race ./..., go vet ./..., staticcheck ./... all clean.
This commit is contained in:
+87
-1
@@ -27,6 +27,33 @@ import (
|
||||
// emits the canonical Markdown / MarkdownV2 / HTML triple.
|
||||
//
|
||||
// Returns nil when the description does not look like an enum.
|
||||
// extractEnumValues inspects a field-description string and returns the
|
||||
// list of wire-level string values when the description matches one of
|
||||
// the enum-like patterns Telegram uses in its docs. Order follows doc
|
||||
// order; duplicates are removed but order of first occurrence is kept.
|
||||
//
|
||||
// Handled patterns (curly quotes “…” are required to avoid false
|
||||
// positives on free-text quoting):
|
||||
//
|
||||
// - "Type of the chat, can be either “private”, “group”, … or “channel”"
|
||||
// - "Currently, can be “mention”, “hashtag”, …"
|
||||
// - "Currently, one of “XTR” … or “TON” …"
|
||||
// - "Currently, must be one of “XTR” …"
|
||||
// - "Currently, it can be one of “pending”, “approved”, “declined”."
|
||||
// - "Must be one of “danger” …, “success” …"
|
||||
// - "Must be one of “image/jpeg”, “image/gif”, or “video/mp4”"
|
||||
// - "Format … must be one of “static” …, “animated” …, “video” …"
|
||||
// - "Currently, either “upgrade” …, “transfer” …, “resale” …"
|
||||
// - "..., always “creator”"
|
||||
// - parse_mode parameter special case ("Mode for parsing entities …")
|
||||
// emits the canonical Markdown / MarkdownV2 / HTML triple.
|
||||
// - bare prose discriminator at end of description, e.g.
|
||||
// "Type of the result, must be article" or
|
||||
// "Scope type, must be all_private_chats". Used by sealed-interface
|
||||
// union variants whose Type/Source field carries a single literal
|
||||
// value declared without curly quotes.
|
||||
//
|
||||
// Returns nil when the description does not look like an enum.
|
||||
func extractEnumValues(jsonName, desc string) []string {
|
||||
if values := parseModeEnumValues(jsonName, desc); values != nil {
|
||||
return values
|
||||
@@ -34,12 +61,15 @@ func extractEnumValues(jsonName, desc string) []string {
|
||||
|
||||
trigger, triggerEnd, isAlways := findEnumTrigger(desc)
|
||||
if trigger < 0 {
|
||||
return nil
|
||||
return extractProseDiscriminator(desc)
|
||||
}
|
||||
tail := desc[trigger:]
|
||||
|
||||
values := collectQuotedValues(tail)
|
||||
if len(values) == 0 {
|
||||
if v := extractProseDiscriminator(desc); v != nil {
|
||||
return v
|
||||
}
|
||||
return nil
|
||||
}
|
||||
// First quoted value must sit close to the trigger phrase (e.g.
|
||||
@@ -203,3 +233,59 @@ func dedupeStrings(in []string) []string {
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// proseDiscRE matches a terminal "must be <ident>" clause: the
// discriminator value sits at the END of the description (optionally
// followed by a single '.' or ',' and whitespace) so multi-clause prose
// like "must be shown above the message" is not picked up.
//
// Only the "must be" trigger is matched case-insensitively; the flag is
// scoped with (?i:…) so it does not leak into the capture group. The
// captured identifier is a snake_case wire literal: lowercase letters,
// digits, and underscores, starting with a lowercase letter. Capitalised
// tails such as "must be True" or "must be Sent" therefore do not match
// at all, instead of slipping past the case-sensitive isProseWord filter.
// Lowercase prose words are filtered separately by isProseWord.
var proseDiscRE = regexp.MustCompile(`(?i:\bmust be)\s+([a-z][a-z0-9_]*)\s*[.,]?\s*$`)
|
||||
|
||||
// extractProseDiscriminator detects unambiguous single-value
|
||||
// discriminator declarations of the form "..., must be <ident>" used by
|
||||
// sealed-interface union variants (e.g. "Type of the result, must be
|
||||
// article" or "Scope type, must be all_private_chats"). Returns the
|
||||
// extracted value as a one-element slice or nil when no match is found.
|
||||
//
|
||||
// The terminal-position anchor is what protects against prose like
|
||||
// "must be shown above" or "must be one of 3, 6, or 12" — the candidate
|
||||
// must close the description.
|
||||
func extractProseDiscriminator(desc string) []string {
|
||||
desc = strings.TrimSpace(desc)
|
||||
if desc == "" {
|
||||
return nil
|
||||
}
|
||||
m := proseDiscRE.FindStringSubmatch(desc)
|
||||
if m == nil {
|
||||
return nil
|
||||
}
|
||||
v := m[1]
|
||||
if isProseWord(v) {
|
||||
return nil
|
||||
}
|
||||
return []string{v}
|
||||
}
|
||||
|
||||
// isProseWord reports whether s is an English filler word rather than a
// wire identifier. It is the second line of defence for tokens that pass
// the "must be <ident>" regex but are plain prose continuations ("must be
// sent", "must be available").
//
// The word list is closed: per the original note, it holds the words that
// empirically show up in the IR's "must be …" tails outside the
// variant-discriminator pattern. Comparison is exact and case-sensitive.
func isProseWord(s string) bool {
	switch s {
	case "a", "an", "the":
		// Articles.
		return true
	case "sent", "shown", "set", "used", "passed", "specified",
		"available", "applied", "supported", "assumed", "active",
		"paid", "repainted":
		// Participles and adjectives that complete "must be …".
		return true
	case "between", "of", "on", "in", "at", "by", "to", "from", "for", "with":
		// Prepositions.
		return true
	case "and", "or", "no", "non":
		// Conjunctions and negations.
		return true
	case "positive", "negative", "administrator", "one", "exactly":
		// Remaining qualifiers seen in free-text tails.
		return true
	default:
		return false
	}
}
|
||||
|
||||
@@ -83,3 +83,56 @@ func TestExtractEnumValues_DedupeRepeatedValues(t *testing.T) {
|
||||
got := extractEnumValues("currency", desc)
|
||||
require.Equal(t, []string{"XTR"}, got)
|
||||
}
|
||||
|
||||
func TestExtractEnumValues_ProseDiscriminator(t *testing.T) {
|
||||
cases := []struct {
|
||||
name string
|
||||
desc string
|
||||
want []string
|
||||
}{
|
||||
{"InlineQueryResultArticle", "Type of the result, must be article", []string{"article"}},
|
||||
{"InlineQueryResultPhoto", "Type of the result, must be photo", []string{"photo"}},
|
||||
{"InlineQueryResultMpeg4Gif", "Type of the result, must be mpeg4_gif", []string{"mpeg4_gif"}},
|
||||
{"BotCommandScopeAllPrivateChats", "Scope type, must be all_private_chats", []string{"all_private_chats"}},
|
||||
{"BotCommandScopeChat", "Scope type, must be chat", []string{"chat"}},
|
||||
{"PassportElementErrorData", "Error source, must be data", []string{"data"}},
|
||||
{"MenuButtonWebApp", "Type of the button, must be web_app", []string{"web_app"}},
|
||||
{"InputProfilePhotoAnimated", "Type of the profile photo, must be animated", []string{"animated"}},
|
||||
{"InputStoryContentVideo", "Type of the content, must be video", []string{"video"}},
|
||||
{"InputPaidMediaPhoto", "Type of the media, must be photo", []string{"photo"}},
|
||||
}
|
||||
for _, tc := range cases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
require.Equal(t, tc.want, extractEnumValues("type", tc.desc))
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestExtractEnumValues_ProseFalsePositives(t *testing.T) {
|
||||
cases := []struct {
|
||||
name string
|
||||
desc string
|
||||
}{
|
||||
{"available_only_for", "Optional. Bot-specified invoice payload. Can be available only for “invoice_payment” transactions."},
|
||||
{"must_be_sent", "If True, the message must be sent immediately."},
|
||||
{"must_be_shown_above", "Optional. True, if the link preview must be shown above the message text"},
|
||||
{"must_be_specified", "The identifiers must be specified in a strictly increasing order."},
|
||||
{"must_be_paid", "The number of Telegram Stars that must be paid to send the sticker"},
|
||||
{"must_be_one_of_numbers", "Number of months the Telegram Premium subscription will be active for the user; must be one of 3, 6, or 12"},
|
||||
{"must_be_between", "Currently, price in Telegram Stars must be between 5 and 100000"},
|
||||
{"must_be_a_pay_button", "If not empty, the first button must be a Pay button."},
|
||||
{"must_be_repainted", "True, if the sticker must be repainted to a text color in messages"},
|
||||
{"must_be_active", "the subscription must be active up to the end of the current subscription period"},
|
||||
}
|
||||
for _, tc := range cases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
require.Nil(t, extractEnumValues("type", tc.desc))
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestExtractEnumValues_CanonicalMustBeOneOfStillWorks(t *testing.T) {
|
||||
desc := "Currently, must be one of “Markdown”, “MarkdownV2”, “HTML”"
|
||||
got := extractEnumValues("parse_mode_kind", desc)
|
||||
require.Equal(t, []string{"Markdown", "MarkdownV2", "HTML"}, got)
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user