use host matching instead of regex (desc)

why? running a regex over every single message the bot receives, even with simple patterns, is needlessly hard on your CPU lol
stefanodvx 2025-04-16 14:11:55 +02:00
parent 84d005ade2
commit 3e351e7e43
7 changed files with 154 additions and 51 deletions
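In other words: dispatch used to mean running every extractor's URLPattern against every incoming message; now a single url.Parse plus a map lookup narrows the work to the extractors registered for that host. A minimal, self-contained sketch of the two approaches (the patterns and hosts below are illustrative stand-ins, not the repo's actual tables):

package main

import (
    "fmt"
    "net/url"
    "regexp"
    "strings"
)

// Illustrative stand-ins for the bot's extractor patterns.
var patterns = map[string]*regexp.Regexp{
    "tiktok.com": regexp.MustCompile(`https?://(www\.)?tiktok\.com/@[\w.-]+/video/(?P<id>\d+)`),
    "x.com":      regexp.MustCompile(`https?://(www\.)?x\.com/[^/]+/status/(?P<id>\d+)`),
}

// Before: every message is tried against every regex.
func matchByScan(msg string) string {
    for host, re := range patterns {
        if re.MatchString(msg) {
            return host
        }
    }
    return ""
}

// After: parse the host once, then run only that host's pattern.
func matchByHost(msg string) string {
    u, err := url.Parse(msg)
    if err != nil {
        return ""
    }
    host := strings.TrimPrefix(u.Hostname(), "www.")
    if re, ok := patterns[host]; ok && re.MatchString(msg) {
        return host
    }
    return ""
}

func main() {
    msg := "https://www.tiktok.com/@someone/video/123456"
    fmt.Println(matchByScan(msg)) // tiktok.com
    fmt.Println(matchByHost(msg)) // tiktok.com
}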


@@ -44,12 +44,17 @@ var igHeaders = map[string]string{
     "User-Agent": util.ChromeUA,
 }
 
+var instagramHost = []string{
+    "instagram.com",
+}
+
 var Extractor = &models.Extractor{
     Name:       "Instagram",
     CodeName:   "instagram",
     Type:       enums.ExtractorTypeSingle,
     Category:   enums.ExtractorCategorySocial,
     URLPattern: regexp.MustCompile(`https:\/\/www\.instagram\.com\/(reel|p|tv)\/(?P<id>[a-zA-Z0-9_-]+)`),
+    Host:       instagramHost,
     IsRedirect: false,
 
     Run: func(ctx *models.DownloadContext) (*models.ExtractorResponse, error) {
@@ -66,6 +71,7 @@ var StoriesExtractor = &models.Extractor{
     Type:       enums.ExtractorTypeSingle,
     Category:   enums.ExtractorCategorySocial,
     URLPattern: regexp.MustCompile(`https:\/\/www\.instagram\.com\/stories\/[a-zA-Z0-9._]+\/(?P<id>\d+)`),
+    Host:       instagramHost,
     IsRedirect: false,
 
     Run: func(ctx *models.DownloadContext) (*models.ExtractorResponse, error) {
@@ -82,6 +88,7 @@ var ShareURLExtractor = &models.Extractor{
     Type:       enums.ExtractorTypeSingle,
     Category:   enums.ExtractorCategorySocial,
     URLPattern: regexp.MustCompile(`https?:\/\/(www\.)?instagram\.com\/share\/((reels?|video|s|p)\/)?(?P<id>[^\/\?]+)`),
+    Host:       instagramHost,
     IsRedirect: true,
 
     Run: func(ctx *models.DownloadContext) (*models.ExtractorResponse, error) {


@@ -6,6 +6,7 @@ import (
     "io"
     "net/http"
     "regexp"
+    "strings"
 
     "govd/enums"
     "govd/models"
@@ -17,14 +18,32 @@ const (
     shortenerAPIFormat = "https://api.pinterest.com/url_shortener/%s/redirect/"
 )
 
-var httpSession = util.GetHTTPSession()
+var (
+    httpSession = util.GetHTTPSession()
+    validHost   = []string{
+        "com", "fr", "de", "ch", "jp", "cl", "ca", "it", "co\\.uk", "nz", "ru", "com\\.au",
+        "at", "pt", "co\\.kr", "es", "com\\.mx", "dk", "ph", "th", "com\\.uy", "co", "nl",
+        "info", "kr", "ie", "vn", "com\\.vn", "ec", "mx", "in", "pe", "co\\.at", "hu",
+        "co\\.in", "co\\.nz", "id", "com\\.ec", "com\\.py", "tw", "be", "uk", "com\\.bo", "com\\.pe",
+    }
+    validHostRegex     = strings.Join(validHost, "|")
+    validUrlPattern    = `https?://(?:[^/]+\.)?pinterest\.(` + validHostRegex + `)/pin/(?:[\w-]+--)?(?P<id>\d+)`
+    pinValidUrlPattern = `https?://(www\.)?pin\.(` + validHostRegex + `)/(?P<id>\w+)`
+)
 
 var ShortExtractor = &models.Extractor{
     Name:       "Pinterest (Short)",
     CodeName:   "pinterest:short",
     Type:       enums.ExtractorTypeSingle,
     Category:   enums.ExtractorCategorySocial,
-    URLPattern: regexp.MustCompile(`https?://(\w+\.)?pin\.\w+/(?P<id>\w+)`),
+    URLPattern: regexp.MustCompile(pinValidUrlPattern),
+    Host: func() []string {
+        var domains []string
+        for _, domain := range validHost {
+            domains = append(domains, "pin."+domain)
+        }
+        return domains
+    }(),
     IsRedirect: true,
 
     Run: func(ctx *models.DownloadContext) (*models.ExtractorResponse, error) {
@@ -44,7 +63,15 @@ var Extractor = &models.Extractor{
     CodeName:   "pinterest",
     Type:       enums.ExtractorTypeSingle,
     Category:   enums.ExtractorCategorySocial,
-    URLPattern: regexp.MustCompile(`https?://(\w+\.)?pinterest[\.\w]+/pin/(?P<id>\d+)`),
+    URLPattern: regexp.MustCompile(validUrlPattern),
+    Host: func() []string {
+        var domains []string
+        for _, domain := range validHost {
+            domains = append(domains, "pinterest."+domain)
+            domains = append(domains, domain+".pinterest.com")
+        }
+        return domains
+    }(),
 
     Run: func(ctx *models.DownloadContext) (*models.ExtractorResponse, error) {
         media, err := ExtractPinMedia(ctx)

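One subtlety in the Pinterest change: the same validHost slice is reused both as a regex alternation (via strings.Join) and, concatenated with "pin." or "pinterest.", as literal Host entries. The entries are regex-escaped, which is what the pattern wants but not what an exact host comparison wants. A small sketch of the difference (the ReplaceAll normalization is my addition, not part of the diff):

package main

import (
    "fmt"
    "strings"
)

func main() {
    validHost := []string{"com", "co\\.uk", "com\\.au"}

    // For the URL pattern, the escaped dots are correct:
    // pinterest\.(com|co\.uk|com\.au)
    fmt.Println(`pinterest\.(` + strings.Join(validHost, "|") + `)`)

    // As literal host strings, the backslash survives: "pin." + "co\\.uk"
    // is `pin.co\.uk`, which a parsed URL host will never equal.
    for _, d := range validHost {
        escaped := "pin." + d
        literal := "pin." + strings.ReplaceAll(d, `\`, "")
        fmt.Printf("%-14s -> %s\n", escaped, literal)
    }
}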

@@ -12,7 +12,15 @@ import (
     "govd/util"
 )
 
-var httpSession = util.GetHTTPSession()
+var (
+    httpSession = util.GetHTTPSession()
+    baseHost    = []string{
+        "reddit.com",
+        "redditmedia.com",
+        "old.reddit.com",
+        "old.redditmedia.com",
+    }
+)
 
 var ShortExtractor = &models.Extractor{
     Name: "Reddit (Short)",
@@ -20,6 +28,7 @@ var ShortExtractor = &models.Extractor{
     Type:       enums.ExtractorTypeSingle,
     Category:   enums.ExtractorCategorySocial,
     URLPattern: regexp.MustCompile(`https?://(?P<host>(?:\w+\.)?reddit(?:media)?\.com)/(?P<slug>(?:(?:r|user)/[^/]+/)?s/(?P<id>[^/?#&]+))`),
+    Host:       baseHost,
     IsRedirect: true,
 
     Run: func(ctx *models.DownloadContext) (*models.ExtractorResponse, error) {
@@ -57,6 +66,7 @@ var Extractor = &models.Extractor{
     Type:       enums.ExtractorTypeSingle,
     Category:   enums.ExtractorCategorySocial,
     URLPattern: regexp.MustCompile(`https?://(?P<host>(?:\w+\.)?reddit(?:media)?\.com)/(?P<slug>(?:(?:r|user)/[^/]+/)?comments/(?P<id>[^/?#&]+))`),
+    Host:       baseHost,
 
     Run: func(ctx *models.DownloadContext) (*models.ExtractorResponse, error) {
         mediaList, err := MediaListFromAPI(ctx)


@@ -23,7 +23,19 @@ const (
     appUserAgent = packageID + " (Linux; U; Android 13; en_US; Pixel 7; Build/TD1A.220804.031; Cronet/58.0.2991.0)"
 )
 
-var httpSession = util.GetHTTPSession()
+var (
+    httpSession = util.GetHTTPSession()
+    baseHost    = []string{
+        "tiktok.com",
+        "vxtiktok.com",
+        "vm.tiktok.com",
+        "vt.tiktok.com",
+        "vt.vxtiktok.com",
+        "vm.vxtiktok.com",
+        "m.tiktok.com",
+        "m.vxtiktok.com",
+    }
+)
 
 var VMExtractor = &models.Extractor{
     Name: "TikTok VM",
@@ -31,6 +43,7 @@ var VMExtractor = &models.Extractor{
     Type:       enums.ExtractorTypeSingle,
     Category:   enums.ExtractorCategorySocial,
     URLPattern: regexp.MustCompile(`https:\/\/((?:vm|vt|www)\.)?(vx)?tiktok\.com\/(?:t\/)?(?P<id>[a-zA-Z0-9]+)`),
+    Host:       baseHost,
     IsRedirect: true,
 
     Run: func(ctx *models.DownloadContext) (*models.ExtractorResponse, error) {
@@ -50,6 +63,7 @@ var Extractor = &models.Extractor{
     Type:       enums.ExtractorTypeSingle,
     Category:   enums.ExtractorCategorySocial,
     URLPattern: regexp.MustCompile(`https?:\/\/((www|m)\.)?(vx)?tiktok\.com\/((?:embed|@[\w\.-]+)\/)?(v(ideo)?|p(hoto)?)\/(?P<id>[0-9]+)`),
+    Host:       baseHost,
 
     Run: func(ctx *models.DownloadContext) (*models.ExtractorResponse, error) {
         mediaList, err := MediaListFromAPI(ctx)

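The long list of vm./vt./m. hosts is needed because the resolver (see the extractors diff further down) only strips a leading "www." before the map lookup, so every other subdomain has to appear verbatim in the Host list. A standalone illustration of that normalization:

package main

import (
    "fmt"
    "net/url"
    "strings"
)

// lookupKey mimics the host normalization used for the map lookup:
// parse the URL, then drop only a leading "www.".
func lookupKey(raw string) (string, error) {
    u, err := url.Parse(raw)
    if err != nil {
        return "", err
    }
    return strings.TrimPrefix(u.Host, "www."), nil
}

func main() {
    for _, raw := range []string{
        "https://www.tiktok.com/@user/video/1",
        "https://vm.tiktok.com/ZMabc123/",
    } {
        key, _ := lookupKey(raw)
        fmt.Println(key) // tiktok.com, then vm.tiktok.com (needs its own entry)
    }
}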

@@ -25,6 +25,7 @@ var ShortExtractor = &models.Extractor{
     Type:       enums.ExtractorTypeSingle,
     Category:   enums.ExtractorCategorySocial,
     URLPattern: regexp.MustCompile(`https?://t\.co/(?P<id>\w+)`),
+    Host:       []string{"t.co"},
     IsRedirect: true,
 
     Run: func(ctx *models.DownloadContext) (*models.ExtractorResponse, error) {
@@ -58,6 +59,12 @@ var Extractor = &models.Extractor{
     Type:       enums.ExtractorTypeSingle,
     Category:   enums.ExtractorCategorySocial,
     URLPattern: regexp.MustCompile(`https?:\/\/(vx)?(twitter|x)\.com\/([^\/]+)\/status\/(?P<id>\d+)`),
+    Host: []string{
+        "twitter.com",
+        "x.com",
+        "vxx.com",
+        "vxtwitter.com",
+    },
 
     Run: func(ctx *models.DownloadContext) (*models.ExtractorResponse, error) {
         mediaList, err := MediaListFromAPI(ctx)


@@ -3,65 +3,102 @@ package ext
 import (
     "fmt"
     "govd/models"
+    "net/url"
+    "strings"
+    "sync"
 )
 
-var maxRedirects = 5
-
-func CtxByURL(url string) (*models.DownloadContext, error) {
-    var redirectCount int
-
-    currentURL := url
-
-    for redirectCount <= maxRedirects {
-        for _, extractor := range List {
-            matches := extractor.URLPattern.FindStringSubmatch(currentURL)
-            if matches == nil {
-                continue
-            }
-            groupNames := extractor.URLPattern.SubexpNames()
-            if len(matches) == 0 {
-                continue
-            }
-            groups := make(map[string]string)
-            for i, name := range groupNames {
-                if name != "" {
-                    groups[name] = matches[i]
-                }
-            }
-            groups["match"] = matches[0]
-            ctx := &models.DownloadContext{
-                MatchedContentID:  groups["id"],
-                MatchedContentURL: groups["match"],
-                MatchedGroups:     groups,
-                Extractor:         extractor,
-            }
-            if !extractor.IsRedirect {
-                return ctx, nil
-            }
-            response, err := extractor.Run(ctx)
-            if err != nil {
-                return nil, err
-            }
-            if response.URL == "" {
-                return nil, fmt.Errorf("no URL found in response")
-            }
-            currentURL = response.URL
-            redirectCount++
-            break
-        }
+var (
+    maxRedirects     = 5
+    extractorsByHost map[string][]*models.Extractor
+    extractorMapOnce sync.Once
+)
+
+func initExtractorMap() {
+    extractorMapOnce.Do(func() {
+        extractorsByHost = make(map[string][]*models.Extractor)
+        for _, extractor := range List {
+            if len(extractor.Host) > 0 {
+                for _, domain := range extractor.Host {
+                    extractorsByHost[domain] = append(extractorsByHost[domain], extractor)
+                }
+            }
+        }
+    })
+}
+
+func CtxByURL(urlStr string) (*models.DownloadContext, error) {
+    initExtractorMap()
+
+    var redirectCount int
+    currentURL := urlStr
+
+    for redirectCount <= maxRedirects {
+        parsedURL, err := url.Parse(currentURL)
+        if err != nil {
+            return nil, fmt.Errorf("invalid URL: %v", err)
+        }
+        host := strings.TrimPrefix(parsedURL.Host, "www.")
+        extractors := extractorsByHost[host]
+        if len(extractors) == 0 {
+            return nil, nil
+        }
+        var extractor *models.Extractor
+        var matches []string
+        var groups map[string]string
+        for _, ext := range extractors {
+            matches = ext.URLPattern.FindStringSubmatch(currentURL)
+            if matches != nil {
+                extractor = ext
+                groupNames := ext.URLPattern.SubexpNames()
+                groups = make(map[string]string)
+                for i, name := range groupNames {
+                    if name != "" && i < len(matches) {
+                        groups[name] = matches[i]
+                    }
+                }
+                groups["match"] = matches[0]
+                break
+            }
+        }
+        if extractor == nil || matches == nil {
+            return nil, nil
+        }
+        ctx := &models.DownloadContext{
+            MatchedContentID:  groups["id"],
+            MatchedContentURL: groups["match"],
+            MatchedGroups:     groups,
+            Extractor:         extractor,
+        }
+        if !extractor.IsRedirect {
+            return ctx, nil
+        }
+        response, err := extractor.Run(ctx)
+        if err != nil {
+            return nil, err
+        }
+        if response.URL == "" {
+            return nil, fmt.Errorf("no URL found in response")
+        }
+        currentURL = response.URL
+        redirectCount++
         if redirectCount > maxRedirects {
             return nil, fmt.Errorf("exceeded maximum number of redirects (%d)", maxRedirects)
         }
     }
-    return nil, nil
+
+    return nil, fmt.Errorf("failed to extract from URL: %s", urlStr)
 }
 
 func ByCodeName(codeName string) *models.Extractor {

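The host map itself is built lazily and exactly once through sync.Once, so concurrent callers can't race on initialization. A stripped-down, self-contained version of that pattern (simplified types; list and host are stand-ins for the real List and Host):

package main

import (
    "fmt"
    "sync"
)

type extractor struct {
    name string
    host []string
}

var (
    list = []*extractor{
        {name: "instagram", host: []string{"instagram.com"}},
        {name: "twitter", host: []string{"twitter.com", "x.com"}},
    }
    byHost map[string][]*extractor
    once   sync.Once
)

// initMap mirrors initExtractorMap: safe to call from any goroutine,
// and the map is only ever built once.
func initMap() {
    once.Do(func() {
        byHost = make(map[string][]*extractor)
        for _, e := range list {
            for _, h := range e.host {
                byHost[h] = append(byHost[h], e)
            }
        }
    })
}

func main() {
    initMap()
    for _, e := range byHost["x.com"] {
        fmt.Println(e.name) // twitter
    }
}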

@@ -11,6 +11,7 @@ type Extractor struct {
     Type       enums.ExtractorType
     Category   enums.ExtractorCategory
     URLPattern *regexp.Regexp
+    Host       []string
     IsDRM      bool
     IsRedirect bool
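With the struct extended, a new extractor declares its hosts right next to its pattern. A hypothetical entry (field names from the struct above; the domain, pattern, and Run body are illustrative only):

var ExampleExtractor = &models.Extractor{
    Name:       "Example",
    CodeName:   "example",
    Type:       enums.ExtractorTypeSingle,
    Category:   enums.ExtractorCategorySocial,
    URLPattern: regexp.MustCompile(`https?://(www\.)?example\.com/v/(?P<id>\d+)`),
    // Host drives the map lookup in CtxByURL: list every hostname the
    // pattern should cover, minus a leading "www.".
    Host: []string{"example.com"},
    Run: func(ctx *models.DownloadContext) (*models.ExtractorResponse, error) {
        return &models.ExtractorResponse{}, nil
    },
}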