mirror of
https://github.com/lukaszraczylo/go-telegram.git
synced 2026-06-29 03:13:07 +00:00
refactor(scrape): detect prose-style "must be X" discriminator values on variants
Sealed-interface union variants whose Type/Source field is declared as bare prose (e.g. "Type of the result, must be article" or "Scope type, must be all_private_chats") were skipped by extractEnumValues because the existing patterns require curly-quoted values. The genapi emitter already extracted these values via discBareRE for marshal-side discriminator injection; lifting the same detection into the scraper populates Field.EnumValues so planUnifiedUnionEnums folds them into shared union-level enums automatically. Unions newly unified (10): BotCommandScope, MenuButton, InputMedia, InputPaidMedia, InputPollMedia, InputPollOptionMedia, InputProfilePhoto, InputStoryContent, InlineQueryResult, PassportElementError. InputMessageContent stays excluded — its variants dispatch structurally on field presence and have no Type/Source field, so planUnifiedUnionEnums correctly skips it. Constants added: 60 typed enum constants across the 10 unions; the corresponding variant struct fields are retyped from string to the shared enum. Internal call-site cleanups: 0 — no internal package referenced these discriminator values via magic strings. False positives the prose detector explicitly rejects: terminal prose-word continuations like "must be sent", "must be shown above", "must be specified", "must be paid", "must be active", "must be one of 3, 6, or 12", "must be between 5 and 100000", "must be a Pay button", "must be repainted". Guarded via terminal-position regex anchor + closed-list isProseWord filter. Determinism verified across two consecutive make regen-from-fixture runs. go test -race ./..., go vet ./..., staticcheck ./... all clean.
This commit is contained in:
+87
-1
@@ -27,6 +27,33 @@ import (
|
||||
// emits the canonical Markdown / MarkdownV2 / HTML triple.
|
||||
//
|
||||
// Returns nil when the description does not look like an enum.
|
||||
// extractEnumValues inspects a field-description string and returns the
|
||||
// list of wire-level string values when the description matches one of
|
||||
// the enum-like patterns Telegram uses in its docs. Order follows doc
|
||||
// order; duplicates are removed but order of first occurrence is kept.
|
||||
//
|
||||
// Handled patterns (curly quotes “…” are required to avoid false
|
||||
// positives on free-text quoting):
|
||||
//
|
||||
// - "Type of the chat, can be either “private”, “group”, … or “channel”"
|
||||
// - "Currently, can be “mention”, “hashtag”, …"
|
||||
// - "Currently, one of “XTR” … or “TON” …"
|
||||
// - "Currently, must be one of “XTR” …"
|
||||
// - "Currently, it can be one of “pending”, “approved”, “declined”."
|
||||
// - "Must be one of “danger” …, “success” …"
|
||||
// - "Must be one of “image/jpeg”, “image/gif”, or “video/mp4”"
|
||||
// - "Format … must be one of “static” …, “animated” …, “video” …"
|
||||
// - "Currently, either “upgrade” …, “transfer” …, “resale” …"
|
||||
// - "..., always “creator”"
|
||||
// - parse_mode parameter special case ("Mode for parsing entities …")
|
||||
// emits the canonical Markdown / MarkdownV2 / HTML triple.
|
||||
// - bare prose discriminator at end of description, e.g.
|
||||
// "Type of the result, must be article" or
|
||||
// "Scope type, must be all_private_chats". Used by sealed-interface
|
||||
// union variants whose Type/Source field carries a single literal
|
||||
// value declared without curly quotes.
|
||||
//
|
||||
// Returns nil when the description does not look like an enum.
|
||||
func extractEnumValues(jsonName, desc string) []string {
|
||||
if values := parseModeEnumValues(jsonName, desc); values != nil {
|
||||
return values
|
||||
@@ -34,12 +61,15 @@ func extractEnumValues(jsonName, desc string) []string {
|
||||
|
||||
trigger, triggerEnd, isAlways := findEnumTrigger(desc)
|
||||
if trigger < 0 {
|
||||
return nil
|
||||
return extractProseDiscriminator(desc)
|
||||
}
|
||||
tail := desc[trigger:]
|
||||
|
||||
values := collectQuotedValues(tail)
|
||||
if len(values) == 0 {
|
||||
if v := extractProseDiscriminator(desc); v != nil {
|
||||
return v
|
||||
}
|
||||
return nil
|
||||
}
|
||||
// First quoted value must sit close to the trigger phrase (e.g.
|
||||
@@ -203,3 +233,59 @@ func dedupeStrings(in []string) []string {
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// proseDiscRE matches a terminal "must be <ident>" clause: the
// discriminator value sits at the END of the description (optionally
// followed by a single '.' or ',' and whitespace) so multi-clause prose
// like "must be shown above the message" is not picked up.
//
// Only the "must be" trigger is matched case-insensitively (so both
// "must be" and "Must be" work). The captured identifier is deliberately
// case-SENSITIVE: it must be a snake_case wire literal made of lowercase
// letters, digits, and underscores, starting with a letter. A blanket
// (?i) flag would also let capitalised prose such as "must be HTML" or
// "must be Article" through, and those would then slip past the
// lowercase-only isProseWord filter.
//
// Numeric-only tails cannot match (the identifier must start with a
// letter); prose words are filtered separately by isProseWord.
var proseDiscRE = regexp.MustCompile(`\b(?i:must be)\s+([a-z][a-z0-9_]*)\s*[.,]?\s*$`)
|
||||
|
||||
// extractProseDiscriminator detects unambiguous single-value
|
||||
// discriminator declarations of the form "..., must be <ident>" used by
|
||||
// sealed-interface union variants (e.g. "Type of the result, must be
|
||||
// article" or "Scope type, must be all_private_chats"). Returns the
|
||||
// extracted value as a one-element slice or nil when no match is found.
|
||||
//
|
||||
// The terminal-position anchor is what protects against prose like
|
||||
// "must be shown above" or "must be one of 3, 6, or 12" — the candidate
|
||||
// must close the description.
|
||||
func extractProseDiscriminator(desc string) []string {
|
||||
desc = strings.TrimSpace(desc)
|
||||
if desc == "" {
|
||||
return nil
|
||||
}
|
||||
m := proseDiscRE.FindStringSubmatch(desc)
|
||||
if m == nil {
|
||||
return nil
|
||||
}
|
||||
v := m[1]
|
||||
if isProseWord(v) {
|
||||
return nil
|
||||
}
|
||||
return []string{v}
|
||||
}
|
||||
|
||||
// isProseWord reports whether s is one of the English filler words that
// can follow "must be" at the end of a description without being a wire
// value (for example "must be sent" or "must be available").
//
// The word list is a closed set: the words observed in the IR's
// "must be …" tails outside the variant-discriminator pattern. The
// comparison is exact and case-sensitive; anything not listed is
// reported as not prose.
func isProseWord(s string) bool {
	switch s {
	case "a", "an", "the": // articles
		return true
	case "sent", "shown", "set", "used", "passed", "specified",
		"available", "applied", "supported", "assumed", "active",
		"paid", "repainted": // participles and adjectives
		return true
	case "between", "of", "on", "in", "at", "by", "to", "from",
		"for", "with": // prepositions
		return true
	case "and", "or", "no", "non": // conjunctions and negations
		return true
	case "positive", "negative", "one", "exactly": // quantity qualifiers
		return true
	case "administrator": // role noun seen in prose tails
		return true
	default:
		return false
	}
}
|
||||
|
||||
Reference in New Issue
Block a user