use host matching instead of regex (desc)

why? running a regex over every single message the bot receives, even with simple patterns, is needlessly hard on your CPU lol
stefanodvx 2025-04-16 14:11:55 +02:00
parent 84d005ade2
commit 3e351e7e43
7 changed files with 154 additions and 51 deletions
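In other words: dispatch used to mean running every extractor's URLPattern against every incoming message; now a single url.Parse plus a map lookup narrows the work to the extractors registered for that host. A minimal, self-contained sketch of the two approaches (the patterns and hosts below are illustrative stand-ins, not the repo's actual tables):

package main

import (
    "fmt"
    "net/url"
    "regexp"
    "strings"
)

// Illustrative stand-ins for the bot's extractor patterns.
var patterns = map[string]*regexp.Regexp{
    "tiktok.com": regexp.MustCompile(`https?://(www\.)?tiktok\.com/@[\w.-]+/video/(?P<id>\d+)`),
    "x.com":      regexp.MustCompile(`https?://(www\.)?x\.com/[^/]+/status/(?P<id>\d+)`),
}

// Before: every message is tried against every regex.
func matchByScan(msg string) string {
    for host, re := range patterns {
        if re.MatchString(msg) {
            return host
        }
    }
    return ""
}

// After: parse the host once, then run only that host's pattern.
func matchByHost(msg string) string {
    u, err := url.Parse(msg)
    if err != nil {
        return ""
    }
    host := strings.TrimPrefix(u.Hostname(), "www.")
    if re, ok := patterns[host]; ok && re.MatchString(msg) {
        return host
    }
    return ""
}

func main() {
    msg := "https://www.tiktok.com/@someone/video/123456"
    fmt.Println(matchByScan(msg)) // tiktok.com
    fmt.Println(matchByHost(msg)) // tiktok.com
}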


@@ -44,12 +44,17 @@ var igHeaders = map[string]string{
     "User-Agent": util.ChromeUA,
 }
 
+var instagramHost = []string{
+    "instagram.com",
+}
+
 var Extractor = &models.Extractor{
     Name:       "Instagram",
     CodeName:   "instagram",
     Type:       enums.ExtractorTypeSingle,
     Category:   enums.ExtractorCategorySocial,
     URLPattern: regexp.MustCompile(`https:\/\/www\.instagram\.com\/(reel|p|tv)\/(?P<id>[a-zA-Z0-9_-]+)`),
+    Host:       instagramHost,
     IsRedirect: false,
 
     Run: func(ctx *models.DownloadContext) (*models.ExtractorResponse, error) {
@@ -66,6 +71,7 @@ var StoriesExtractor = &models.Extractor{
     Type:       enums.ExtractorTypeSingle,
     Category:   enums.ExtractorCategorySocial,
     URLPattern: regexp.MustCompile(`https:\/\/www\.instagram\.com\/stories\/[a-zA-Z0-9._]+\/(?P<id>\d+)`),
+    Host:       instagramHost,
     IsRedirect: false,
 
     Run: func(ctx *models.DownloadContext) (*models.ExtractorResponse, error) {
@@ -82,6 +88,7 @@ var ShareURLExtractor = &models.Extractor{
     Type:       enums.ExtractorTypeSingle,
     Category:   enums.ExtractorCategorySocial,
     URLPattern: regexp.MustCompile(`https?:\/\/(www\.)?instagram\.com\/share\/((reels?|video|s|p)\/)?(?P<id>[^\/\?]+)`),
+    Host:       instagramHost,
     IsRedirect: true,
 
     Run: func(ctx *models.DownloadContext) (*models.ExtractorResponse, error) {


@@ -6,6 +6,7 @@ import (
     "io"
     "net/http"
     "regexp"
+    "strings"
 
     "govd/enums"
     "govd/models"
@@ -17,14 +18,32 @@ const (
     shortenerAPIFormat = "https://api.pinterest.com/url_shortener/%s/redirect/"
 )
 
-var httpSession = util.GetHTTPSession()
+var (
+    httpSession = util.GetHTTPSession()
+    validHost   = []string{
+        "com", "fr", "de", "ch", "jp", "cl", "ca", "it", "co\\.uk", "nz", "ru", "com\\.au",
+        "at", "pt", "co\\.kr", "es", "com\\.mx", "dk", "ph", "th", "com\\.uy", "co", "nl",
+        "info", "kr", "ie", "vn", "com\\.vn", "ec", "mx", "in", "pe", "co\\.at", "hu",
+        "co\\.in", "co\\.nz", "id", "com\\.ec", "com\\.py", "tw", "be", "uk", "com\\.bo", "com\\.pe",
+    }
+    validHostRegex     = strings.Join(validHost, "|")
+    validUrlPattern    = `https?://(?:[^/]+\.)?pinterest\.(` + validHostRegex + `)/pin/(?:[\w-]+--)?(?P<id>\d+)`
+    pinValidUrlPattern = `https?://(www\.)?pin\.(` + validHostRegex + `)/(?P<id>\w+)`
+)
 
 var ShortExtractor = &models.Extractor{
     Name:       "Pinterest (Short)",
     CodeName:   "pinterest:short",
     Type:       enums.ExtractorTypeSingle,
     Category:   enums.ExtractorCategorySocial,
-    URLPattern: regexp.MustCompile(`https?://(\w+\.)?pin\.\w+/(?P<id>\w+)`),
+    URLPattern: regexp.MustCompile(pinValidUrlPattern),
+    Host: func() []string {
+        var domains []string
+        for _, domain := range validHost {
+            domains = append(domains, "pin."+domain)
+        }
+        return domains
+    }(),
     IsRedirect: true,
 
     Run: func(ctx *models.DownloadContext) (*models.ExtractorResponse, error) {
@@ -44,7 +63,15 @@ var Extractor = &models.Extractor{
     CodeName:   "pinterest",
     Type:       enums.ExtractorTypeSingle,
     Category:   enums.ExtractorCategorySocial,
-    URLPattern: regexp.MustCompile(`https?://(\w+\.)?pinterest[\.\w]+/pin/(?P<id>\d+)`),
+    URLPattern: regexp.MustCompile(validUrlPattern),
+    Host: func() []string {
+        var domains []string
+        for _, domain := range validHost {
+            domains = append(domains, "pinterest."+domain)
+            domains = append(domains, domain+".pinterest.com")
+        }
+        return domains
+    }(),
 
     Run: func(ctx *models.DownloadContext) (*models.ExtractorResponse, error) {
         media, err := ExtractPinMedia(ctx)

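One subtlety in the Pinterest change: the same validHost slice is reused both as a regex alternation (via strings.Join) and, concatenated with "pin." or "pinterest.", as literal Host entries. The entries are regex-escaped, which is what the pattern wants but not what an exact host comparison wants. A small sketch of the difference (the ReplaceAll normalization is my addition, not part of the diff):

package main

import (
    "fmt"
    "strings"
)

func main() {
    validHost := []string{"com", "co\\.uk", "com\\.au"}

    // For the URL pattern, the escaped dots are correct:
    // pinterest\.(com|co\.uk|com\.au)
    fmt.Println(`pinterest\.(` + strings.Join(validHost, "|") + `)`)

    // As literal host strings, the backslash survives: "pin." + "co\\.uk"
    // is `pin.co\.uk`, which a parsed URL host will never equal.
    for _, d := range validHost {
        escaped := "pin." + d
        literal := "pin." + strings.ReplaceAll(d, `\`, "")
        fmt.Printf("%-14s -> %s\n", escaped, literal)
    }
}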

@@ -12,7 +12,15 @@ import (
     "govd/util"
 )
 
-var httpSession = util.GetHTTPSession()
+var (
+    httpSession = util.GetHTTPSession()
+    baseHost    = []string{
+        "reddit.com",
+        "redditmedia.com",
+        "old.reddit.com",
+        "old.redditmedia.com",
+    }
+)
 
 var ShortExtractor = &models.Extractor{
     Name: "Reddit (Short)",
@@ -20,6 +28,7 @@ var ShortExtractor = &models.Extractor{
     Type:       enums.ExtractorTypeSingle,
     Category:   enums.ExtractorCategorySocial,
     URLPattern: regexp.MustCompile(`https?://(?P<host>(?:\w+\.)?reddit(?:media)?\.com)/(?P<slug>(?:(?:r|user)/[^/]+/)?s/(?P<id>[^/?#&]+))`),
+    Host:       baseHost,
     IsRedirect: true,
 
     Run: func(ctx *models.DownloadContext) (*models.ExtractorResponse, error) {
@@ -57,6 +66,7 @@ var Extractor = &models.Extractor{
     Type:       enums.ExtractorTypeSingle,
     Category:   enums.ExtractorCategorySocial,
     URLPattern: regexp.MustCompile(`https?://(?P<host>(?:\w+\.)?reddit(?:media)?\.com)/(?P<slug>(?:(?:r|user)/[^/]+/)?comments/(?P<id>[^/?#&]+))`),
+    Host:       baseHost,
 
     Run: func(ctx *models.DownloadContext) (*models.ExtractorResponse, error) {
         mediaList, err := MediaListFromAPI(ctx)


@@ -23,7 +23,19 @@ const (
     appUserAgent = packageID + " (Linux; U; Android 13; en_US; Pixel 7; Build/TD1A.220804.031; Cronet/58.0.2991.0)"
 )
 
-var httpSession = util.GetHTTPSession()
+var (
+    httpSession = util.GetHTTPSession()
+    baseHost    = []string{
+        "tiktok.com",
+        "vxtiktok.com",
+        "vm.tiktok.com",
+        "vt.tiktok.com",
+        "vt.vxtiktok.com",
+        "vm.vxtiktok.com",
+        "m.tiktok.com",
+        "m.vxtiktok.com",
+    }
+)
 
 var VMExtractor = &models.Extractor{
     Name: "TikTok VM",
@@ -31,6 +43,7 @@ var VMExtractor = &models.Extractor{
     Type:       enums.ExtractorTypeSingle,
     Category:   enums.ExtractorCategorySocial,
     URLPattern: regexp.MustCompile(`https:\/\/((?:vm|vt|www)\.)?(vx)?tiktok\.com\/(?:t\/)?(?P<id>[a-zA-Z0-9]+)`),
+    Host:       baseHost,
     IsRedirect: true,
 
     Run: func(ctx *models.DownloadContext) (*models.ExtractorResponse, error) {
@@ -50,6 +63,7 @@ var Extractor = &models.Extractor{
     Type:       enums.ExtractorTypeSingle,
     Category:   enums.ExtractorCategorySocial,
     URLPattern: regexp.MustCompile(`https?:\/\/((www|m)\.)?(vx)?tiktok\.com\/((?:embed|@[\w\.-]+)\/)?(v(ideo)?|p(hoto)?)\/(?P<id>[0-9]+)`),
+    Host:       baseHost,
 
     Run: func(ctx *models.DownloadContext) (*models.ExtractorResponse, error) {
         mediaList, err := MediaListFromAPI(ctx)

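The long list of vm./vt./m. hosts is needed because the resolver (see the extractors diff further down) only strips a leading "www." before the map lookup, so every other subdomain has to appear verbatim in the Host list. A standalone illustration of that normalization:

package main

import (
    "fmt"
    "net/url"
    "strings"
)

// lookupKey mimics the host normalization used for the map lookup:
// parse the URL, then drop only a leading "www.".
func lookupKey(raw string) (string, error) {
    u, err := url.Parse(raw)
    if err != nil {
        return "", err
    }
    return strings.TrimPrefix(u.Host, "www."), nil
}

func main() {
    for _, raw := range []string{
        "https://www.tiktok.com/@user/video/1",
        "https://vm.tiktok.com/ZMabc123/",
    } {
        key, _ := lookupKey(raw)
        fmt.Println(key) // tiktok.com, then vm.tiktok.com (needs its own entry)
    }
}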

@@ -25,6 +25,7 @@ var ShortExtractor = &models.Extractor{
     Type:       enums.ExtractorTypeSingle,
     Category:   enums.ExtractorCategorySocial,
     URLPattern: regexp.MustCompile(`https?://t\.co/(?P<id>\w+)`),
+    Host:       []string{"t.co"},
     IsRedirect: true,
 
     Run: func(ctx *models.DownloadContext) (*models.ExtractorResponse, error) {
@@ -58,6 +59,12 @@ var Extractor = &models.Extractor{
     Type:       enums.ExtractorTypeSingle,
     Category:   enums.ExtractorCategorySocial,
     URLPattern: regexp.MustCompile(`https?:\/\/(vx)?(twitter|x)\.com\/([^\/]+)\/status\/(?P<id>\d+)`),
+    Host: []string{
+        "twitter.com",
+        "x.com",
+        "vxx.com",
+        "vxtwitter.com",
+    },
 
     Run: func(ctx *models.DownloadContext) (*models.ExtractorResponse, error) {
         mediaList, err := MediaListFromAPI(ctx)


@@ -3,65 +3,102 @@ package ext
 import (
     "fmt"
     "govd/models"
+    "net/url"
+    "strings"
+    "sync"
 )
 
-var maxRedirects = 5
-
-func CtxByURL(url string) (*models.DownloadContext, error) {
-    var redirectCount int
-
-    currentURL := url
-
-    for redirectCount <= maxRedirects {
-        for _, extractor := range List {
-            matches := extractor.URLPattern.FindStringSubmatch(currentURL)
-            if matches == nil {
-                continue
-            }
-            groupNames := extractor.URLPattern.SubexpNames()
-            if len(matches) == 0 {
-                continue
-            }
-            groups := make(map[string]string)
-            for i, name := range groupNames {
-                if name != "" {
-                    groups[name] = matches[i]
-                }
-            }
-            groups["match"] = matches[0]
-            ctx := &models.DownloadContext{
-                MatchedContentID:  groups["id"],
-                MatchedContentURL: groups["match"],
-                MatchedGroups:     groups,
-                Extractor:         extractor,
-            }
-            if !extractor.IsRedirect {
-                return ctx, nil
-            }
-            response, err := extractor.Run(ctx)
-            if err != nil {
-                return nil, err
-            }
-            if response.URL == "" {
-                return nil, fmt.Errorf("no URL found in response")
-            }
-            currentURL = response.URL
-            redirectCount++
-            break
-        }
+var (
+    maxRedirects     = 5
+    extractorsByHost map[string][]*models.Extractor
+    extractorMapOnce sync.Once
+)
+
+func initExtractorMap() {
+    extractorMapOnce.Do(func() {
+        extractorsByHost = make(map[string][]*models.Extractor)
+        for _, extractor := range List {
+            if len(extractor.Host) > 0 {
+                for _, domain := range extractor.Host {
+                    extractorsByHost[domain] = append(extractorsByHost[domain], extractor)
+                }
+            }
+        }
+    })
+}
+
+func CtxByURL(urlStr string) (*models.DownloadContext, error) {
+    initExtractorMap()
+
+    var redirectCount int
+    currentURL := urlStr
+
+    for redirectCount <= maxRedirects {
+        parsedURL, err := url.Parse(currentURL)
+        if err != nil {
+            return nil, fmt.Errorf("invalid URL: %v", err)
+        }
+        host := strings.TrimPrefix(parsedURL.Host, "www.")
+        extractors := extractorsByHost[host]
+        if len(extractors) == 0 {
+            return nil, nil
+        }
+        var extractor *models.Extractor
+        var matches []string
+        var groups map[string]string
+        for _, ext := range extractors {
+            matches = ext.URLPattern.FindStringSubmatch(currentURL)
+            if matches != nil {
+                extractor = ext
+                groupNames := ext.URLPattern.SubexpNames()
+                groups = make(map[string]string)
+                for i, name := range groupNames {
+                    if name != "" && i < len(matches) {
+                        groups[name] = matches[i]
+                    }
+                }
+                groups["match"] = matches[0]
+                break
+            }
+        }
+        if extractor == nil || matches == nil {
+            return nil, nil
+        }
+        ctx := &models.DownloadContext{
+            MatchedContentID:  groups["id"],
+            MatchedContentURL: groups["match"],
+            MatchedGroups:     groups,
+            Extractor:         extractor,
+        }
+        if !extractor.IsRedirect {
+            return ctx, nil
+        }
+        response, err := extractor.Run(ctx)
+        if err != nil {
+            return nil, err
+        }
+        if response.URL == "" {
+            return nil, fmt.Errorf("no URL found in response")
+        }
+        currentURL = response.URL
+        redirectCount++
         if redirectCount > maxRedirects {
             return nil, fmt.Errorf("exceeded maximum number of redirects (%d)", maxRedirects)
         }
     }
-    return nil, nil
+
+    return nil, fmt.Errorf("failed to extract from URL: %s", urlStr)
 }
 
 func ByCodeName(codeName string) *models.Extractor {

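The host map itself is built lazily and exactly once through sync.Once, so concurrent callers can't race on initialization. A stripped-down, self-contained version of that pattern (simplified types; list and host are stand-ins for the real List and Host):

package main

import (
    "fmt"
    "sync"
)

type extractor struct {
    name string
    host []string
}

var (
    list = []*extractor{
        {name: "instagram", host: []string{"instagram.com"}},
        {name: "twitter", host: []string{"twitter.com", "x.com"}},
    }
    byHost map[string][]*extractor
    once   sync.Once
)

// initMap mirrors initExtractorMap: safe to call from any goroutine,
// and the map is only ever built once.
func initMap() {
    once.Do(func() {
        byHost = make(map[string][]*extractor)
        for _, e := range list {
            for _, h := range e.host {
                byHost[h] = append(byHost[h], e)
            }
        }
    })
}

func main() {
    initMap()
    for _, e := range byHost["x.com"] {
        fmt.Println(e.name) // twitter
    }
}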

@@ -11,6 +11,7 @@ type Extractor struct {
     Type       enums.ExtractorType
     Category   enums.ExtractorCategory
     URLPattern *regexp.Regexp
+    Host       []string
     IsDRM      bool
     IsRedirect bool
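With the struct extended, a new extractor declares its hosts right next to its pattern. A hypothetical entry (field names from the struct above; the domain, pattern, and Run body are illustrative only):

var ExampleExtractor = &models.Extractor{
    Name:       "Example",
    CodeName:   "example",
    Type:       enums.ExtractorTypeSingle,
    Category:   enums.ExtractorCategorySocial,
    URLPattern: regexp.MustCompile(`https?://(www\.)?example\.com/v/(?P<id>\d+)`),
    // Host drives the map lookup in CtxByURL: list every hostname the
    // pattern should cover, minus a leading "www.".
    Host: []string{"example.com"},
    Run: func(ctx *models.DownloadContext) (*models.ExtractorResponse, error) {
        return &models.ExtractorResponse{}, nil
    },
}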