From 3e351e7e438bbf71e26ec6c81a5a1fec71dd62f2 Mon Sep 17 00:00:00 2001 From: stefanodvx <69367859+stefanodvx@users.noreply.github.com> Date: Wed, 16 Apr 2025 14:11:55 +0200 Subject: [PATCH] use host matching instead of regex (desc) why? using regex on every single message the bot receives, even simple patterns, can be very harmful for your cpu lol --- ext/instagram/main.go | 7 +++ ext/pinterest/main.go | 33 ++++++++++- ext/reddit/main.go | 12 +++- ext/tiktok/main.go | 16 +++++- ext/twitter/main.go | 7 +++ ext/util.go | 129 +++++++++++++++++++++++++++--------------- models/ext.go | 1 + 7 files changed, 154 insertions(+), 51 deletions(-) diff --git a/ext/instagram/main.go b/ext/instagram/main.go index 5ef9bb4..e4c72be 100644 --- a/ext/instagram/main.go +++ b/ext/instagram/main.go @@ -44,12 +44,17 @@ var igHeaders = map[string]string{ "User-Agent": util.ChromeUA, } +var instagramHost = []string{ + "instagram.com", +} + var Extractor = &models.Extractor{ Name: "Instagram", CodeName: "instagram", Type: enums.ExtractorTypeSingle, Category: enums.ExtractorCategorySocial, URLPattern: regexp.MustCompile(`https:\/\/www\.instagram\.com\/(reel|p|tv)\/(?P[a-zA-Z0-9_-]+)`), + Host: instagramHost, IsRedirect: false, Run: func(ctx *models.DownloadContext) (*models.ExtractorResponse, error) { @@ -66,6 +71,7 @@ var StoriesExtractor = &models.Extractor{ Type: enums.ExtractorTypeSingle, Category: enums.ExtractorCategorySocial, URLPattern: regexp.MustCompile(`https:\/\/www\.instagram\.com\/stories\/[a-zA-Z0-9._]+\/(?P\d+)`), + Host: instagramHost, IsRedirect: false, Run: func(ctx *models.DownloadContext) (*models.ExtractorResponse, error) { @@ -82,6 +88,7 @@ var ShareURLExtractor = &models.Extractor{ Type: enums.ExtractorTypeSingle, Category: enums.ExtractorCategorySocial, URLPattern: regexp.MustCompile(`https?:\/\/(www\.)?instagram\.com\/share\/((reels?|video|s|p)\/)?(?P[^\/\?]+)`), + Host: instagramHost, IsRedirect: true, Run: func(ctx *models.DownloadContext) (*models.ExtractorResponse, error) { diff --git a/ext/pinterest/main.go b/ext/pinterest/main.go index b0f6974..073c31d 100644 --- a/ext/pinterest/main.go +++ b/ext/pinterest/main.go @@ -6,6 +6,7 @@ import ( "io" "net/http" "regexp" + "strings" "govd/enums" "govd/models" @@ -17,14 +18,32 @@ const ( shortenerAPIFormat = "https://api.pinterest.com/url_shortener/%s/redirect/" ) -var httpSession = util.GetHTTPSession() +var ( + httpSession = util.GetHTTPSession() + validHost = []string{ + "com", "fr", "de", "ch", "jp", "cl", "ca", "it", "co\\.uk", "nz", "ru", "com\\.au", + "at", "pt", "co\\.kr", "es", "com\\.mx", "dk", "ph", "th", "com\\.uy", "co", "nl", + "info", "kr", "ie", "vn", "com\\.vn", "ec", "mx", "in", "pe", "co\\.at", "hu", + "co\\.in", "co\\.nz", "id", "com\\.ec", "com\\.py", "tw", "be", "uk", "com\\.bo", "com\\.pe", + } + validHostRegex = strings.Join(validHost, "|") + validUrlPattern = `https?://(?:[^/]+\.)?pinterest\.(` + validHostRegex + `)/pin/(?:[\w-]+--)?(?P\d+)` + pinValidUrlPattern = `https?://(www\.)?pin\.(` + validHostRegex + `)/(?P\w+)` +) var ShortExtractor = &models.Extractor{ Name: "Pinterest (Short)", CodeName: "pinterest:short", Type: enums.ExtractorTypeSingle, Category: enums.ExtractorCategorySocial, - URLPattern: regexp.MustCompile(`https?://(\w+\.)?pin\.\w+/(?P\w+)`), + URLPattern: regexp.MustCompile(pinValidUrlPattern), + Host: func() []string { + var domains []string + for _, domain := range validHost { + domains = append(domains, "pin."+domain) + } + return domains + }(), IsRedirect: true, Run: func(ctx *models.DownloadContext) (*models.ExtractorResponse, error) { @@ -44,7 +63,15 @@ var Extractor = &models.Extractor{ CodeName: "pinterest", Type: enums.ExtractorTypeSingle, Category: enums.ExtractorCategorySocial, - URLPattern: regexp.MustCompile(`https?://(\w+\.)?pinterest[\.\w]+/pin/(?P\d+)`), + URLPattern: regexp.MustCompile(validUrlPattern), + Host: func() []string { + var domains []string + for _, domain := range validHost { + domains = append(domains, "pinterest."+domain) + domains = append(domains, domain+".pinterest.com") + } + return domains + }(), Run: func(ctx *models.DownloadContext) (*models.ExtractorResponse, error) { media, err := ExtractPinMedia(ctx) diff --git a/ext/reddit/main.go b/ext/reddit/main.go index 1bf7f4e..1960f24 100644 --- a/ext/reddit/main.go +++ b/ext/reddit/main.go @@ -12,7 +12,15 @@ import ( "govd/util" ) -var httpSession = util.GetHTTPSession() +var ( + httpSession = util.GetHTTPSession() + baseHost = []string{ + "reddit.com", + "redditmedia.com", + "old.reddit.com", + "old.redditmedia.com", + } +) var ShortExtractor = &models.Extractor{ Name: "Reddit (Short)", @@ -20,6 +28,7 @@ var ShortExtractor = &models.Extractor{ Type: enums.ExtractorTypeSingle, Category: enums.ExtractorCategorySocial, URLPattern: regexp.MustCompile(`https?://(?P(?:\w+\.)?reddit(?:media)?\.com)/(?P(?:(?:r|user)/[^/]+/)?s/(?P[^/?#&]+))`), + Host: baseHost, IsRedirect: true, Run: func(ctx *models.DownloadContext) (*models.ExtractorResponse, error) { @@ -57,6 +66,7 @@ var Extractor = &models.Extractor{ Type: enums.ExtractorTypeSingle, Category: enums.ExtractorCategorySocial, URLPattern: regexp.MustCompile(`https?://(?P(?:\w+\.)?reddit(?:media)?\.com)/(?P(?:(?:r|user)/[^/]+/)?comments/(?P[^/?#&]+))`), + Host: baseHost, Run: func(ctx *models.DownloadContext) (*models.ExtractorResponse, error) { mediaList, err := MediaListFromAPI(ctx) diff --git a/ext/tiktok/main.go b/ext/tiktok/main.go index caa3752..4126ab2 100644 --- a/ext/tiktok/main.go +++ b/ext/tiktok/main.go @@ -23,7 +23,19 @@ const ( appUserAgent = packageID + " (Linux; U; Android 13; en_US; Pixel 7; Build/TD1A.220804.031; Cronet/58.0.2991.0)" ) -var httpSession = util.GetHTTPSession() +var ( + httpSession = util.GetHTTPSession() + baseHost = []string{ + "tiktok.com", + "vxtiktok.com", + "vm.tiktok.com", + "vt.tiktok.com", + "vt.vxtiktok.com", + "vm.vxtiktok.com", + "m.tiktok.com", + "m.vxtiktok.com", + } +) var VMExtractor = &models.Extractor{ Name: "TikTok VM", @@ -31,6 +43,7 @@ var VMExtractor = &models.Extractor{ Type: enums.ExtractorTypeSingle, Category: enums.ExtractorCategorySocial, URLPattern: regexp.MustCompile(`https:\/\/((?:vm|vt|www)\.)?(vx)?tiktok\.com\/(?:t\/)?(?P[a-zA-Z0-9]+)`), + Host: baseHost, IsRedirect: true, Run: func(ctx *models.DownloadContext) (*models.ExtractorResponse, error) { @@ -50,6 +63,7 @@ var Extractor = &models.Extractor{ Type: enums.ExtractorTypeSingle, Category: enums.ExtractorCategorySocial, URLPattern: regexp.MustCompile(`https?:\/\/((www|m)\.)?(vx)?tiktok\.com\/((?:embed|@[\w\.-]+)\/)?(v(ideo)?|p(hoto)?)\/(?P[0-9]+)`), + Host: baseHost, Run: func(ctx *models.DownloadContext) (*models.ExtractorResponse, error) { mediaList, err := MediaListFromAPI(ctx) diff --git a/ext/twitter/main.go b/ext/twitter/main.go index 9aefb4b..6b2746c 100644 --- a/ext/twitter/main.go +++ b/ext/twitter/main.go @@ -25,6 +25,7 @@ var ShortExtractor = &models.Extractor{ Type: enums.ExtractorTypeSingle, Category: enums.ExtractorCategorySocial, URLPattern: regexp.MustCompile(`https?://t\.co/(?P\w+)`), + Host: []string{"t.co"}, IsRedirect: true, Run: func(ctx *models.DownloadContext) (*models.ExtractorResponse, error) { @@ -58,6 +59,12 @@ var Extractor = &models.Extractor{ Type: enums.ExtractorTypeSingle, Category: enums.ExtractorCategorySocial, URLPattern: regexp.MustCompile(`https?:\/\/(vx)?(twitter|x)\.com\/([^\/]+)\/status\/(?P\d+)`), + Host: []string{ + "twitter.com", + "x.com", + "vxx.com", + "vxtwitter.com", + }, Run: func(ctx *models.DownloadContext) (*models.ExtractorResponse, error) { mediaList, err := MediaListFromAPI(ctx) diff --git a/ext/util.go b/ext/util.go index 3234b3a..ed7b9ea 100644 --- a/ext/util.go +++ b/ext/util.go @@ -3,65 +3,102 @@ package ext import ( "fmt" "govd/models" + "net/url" + "strings" + "sync" ) -var maxRedirects = 5 +var ( + maxRedirects = 5 -func CtxByURL(url string) (*models.DownloadContext, error) { - var redirectCount int + extractorsByHost map[string][]*models.Extractor + extractorMapOnce sync.Once +) - currentURL := url - - for redirectCount <= maxRedirects { +func initExtractorMap() { + extractorMapOnce.Do(func() { + extractorsByHost = make(map[string][]*models.Extractor) for _, extractor := range List { - matches := extractor.URLPattern.FindStringSubmatch(currentURL) - if matches == nil { - continue - } - - groupNames := extractor.URLPattern.SubexpNames() - if len(matches) == 0 { - continue - } - - groups := make(map[string]string) - for i, name := range groupNames { - if name != "" { - groups[name] = matches[i] + if len(extractor.Host) > 0 { + for _, domain := range extractor.Host { + extractorsByHost[domain] = append(extractorsByHost[domain], extractor) } } - groups["match"] = matches[0] - - ctx := &models.DownloadContext{ - MatchedContentID: groups["id"], - MatchedContentURL: groups["match"], - MatchedGroups: groups, - Extractor: extractor, - } - - if !extractor.IsRedirect { - return ctx, nil - } - - response, err := extractor.Run(ctx) - if err != nil { - return nil, err - } - if response.URL == "" { - return nil, fmt.Errorf("no URL found in response") - } - - currentURL = response.URL - redirectCount++ - - break } + }) +} + +func CtxByURL(urlStr string) (*models.DownloadContext, error) { + initExtractorMap() + + var redirectCount int + currentURL := urlStr + + for redirectCount <= maxRedirects { + parsedURL, err := url.Parse(currentURL) + if err != nil { + return nil, fmt.Errorf("invalid URL: %v", err) + } + + host := strings.TrimPrefix(parsedURL.Host, "www.") + + extractors := extractorsByHost[host] + if len(extractors) == 0 { + return nil, nil + } + + var extractor *models.Extractor + var matches []string + var groups map[string]string + + for _, ext := range extractors { + matches = ext.URLPattern.FindStringSubmatch(currentURL) + if matches != nil { + extractor = ext + groupNames := ext.URLPattern.SubexpNames() + groups = make(map[string]string) + for i, name := range groupNames { + if name != "" && i < len(matches) { + groups[name] = matches[i] + } + } + groups["match"] = matches[0] + break + } + } + + if extractor == nil || matches == nil { + return nil, nil + } + + ctx := &models.DownloadContext{ + MatchedContentID: groups["id"], + MatchedContentURL: groups["match"], + MatchedGroups: groups, + Extractor: extractor, + } + + if !extractor.IsRedirect { + return ctx, nil + } + + response, err := extractor.Run(ctx) + if err != nil { + return nil, err + } + if response.URL == "" { + return nil, fmt.Errorf("no URL found in response") + } + + currentURL = response.URL + redirectCount++ if redirectCount > maxRedirects { return nil, fmt.Errorf("exceeded maximum number of redirects (%d)", maxRedirects) } } - return nil, nil + + return nil, fmt.Errorf("failed to extract from URL: %s", urlStr) } func ByCodeName(codeName string) *models.Extractor { diff --git a/models/ext.go b/models/ext.go index 7ec2326..3e905c7 100644 --- a/models/ext.go +++ b/models/ext.go @@ -11,6 +11,7 @@ type Extractor struct { Type enums.ExtractorType Category enums.ExtractorCategory URLPattern *regexp.Regexp + Host []string IsDRM bool IsRedirect bool