use host matching instead of regex (desc)

Why? Running regex matching on every message the bot receives — even with simple patterns — wastes CPU. An exact host lookup is far cheaper, so dispatch by host first and only run the regex for extractors registered for that host.
This commit is contained in:
stefanodvx 2025-04-16 14:11:55 +02:00
parent 84d005ade2
commit 3e351e7e43
7 changed files with 154 additions and 51 deletions

View file

@ -44,12 +44,17 @@ var igHeaders = map[string]string{
"User-Agent": util.ChromeUA,
}
// instagramHost lists the hostnames handled by the Instagram extractors.
var instagramHost = []string{"instagram.com"}
var Extractor = &models.Extractor{
Name: "Instagram",
CodeName: "instagram",
Type: enums.ExtractorTypeSingle,
Category: enums.ExtractorCategorySocial,
URLPattern: regexp.MustCompile(`https:\/\/www\.instagram\.com\/(reel|p|tv)\/(?P<id>[a-zA-Z0-9_-]+)`),
Host: instagramHost,
IsRedirect: false,
Run: func(ctx *models.DownloadContext) (*models.ExtractorResponse, error) {
@ -66,6 +71,7 @@ var StoriesExtractor = &models.Extractor{
Type: enums.ExtractorTypeSingle,
Category: enums.ExtractorCategorySocial,
URLPattern: regexp.MustCompile(`https:\/\/www\.instagram\.com\/stories\/[a-zA-Z0-9._]+\/(?P<id>\d+)`),
Host: instagramHost,
IsRedirect: false,
Run: func(ctx *models.DownloadContext) (*models.ExtractorResponse, error) {
@ -82,6 +88,7 @@ var ShareURLExtractor = &models.Extractor{
Type: enums.ExtractorTypeSingle,
Category: enums.ExtractorCategorySocial,
URLPattern: regexp.MustCompile(`https?:\/\/(www\.)?instagram\.com\/share\/((reels?|video|s|p)\/)?(?P<id>[^\/\?]+)`),
Host: instagramHost,
IsRedirect: true,
Run: func(ctx *models.DownloadContext) (*models.ExtractorResponse, error) {

View file

@ -6,6 +6,7 @@ import (
"io"
"net/http"
"regexp"
"strings"
"govd/enums"
"govd/models"
@ -17,14 +18,32 @@ const (
shortenerAPIFormat = "https://api.pinterest.com/url_shortener/%s/redirect/"
)
var httpSession = util.GetHTTPSession()
var (
	httpSession = util.GetHTTPSession()
	// validHost lists the Pinterest country-code TLD suffixes, stored
	// UNESCAPED so they can be used directly for exact host matching
	// (e.g. "pin." + "co.uk" => "pin.co.uk"). The previous version kept
	// them pre-escaped for regex ("co\\.uk"), which made the derived
	// Host lists contain literal backslashes ("pin.co\.uk") that could
	// never match a real hostname. Dots are escaped on demand below when
	// building the regex alternation instead.
	validHost = []string{
		"com", "fr", "de", "ch", "jp", "cl", "ca", "it", "co.uk", "nz", "ru", "com.au",
		"at", "pt", "co.kr", "es", "com.mx", "dk", "ph", "th", "com.uy", "co", "nl",
		"info", "kr", "ie", "vn", "com.vn", "ec", "mx", "in", "pe", "co.at", "hu",
		"co.in", "co.nz", "id", "com.ec", "com.py", "tw", "be", "uk", "com.bo", "com.pe",
	}
	// validHostRegex is the "|"-joined alternation of all TLD suffixes,
	// with regex metacharacters (the dots) escaped via regexp.QuoteMeta.
	validHostRegex = func() string {
		escaped := make([]string, 0, len(validHost))
		for _, domain := range validHost {
			escaped = append(escaped, regexp.QuoteMeta(domain))
		}
		return strings.Join(escaped, "|")
	}()
	// validUrlPattern matches full pinterest.<tld> pin URLs.
	validUrlPattern = `https?://(?:[^/]+\.)?pinterest\.(` + validHostRegex + `)/pin/(?:[\w-]+--)?(?P<id>\d+)`
	// pinValidUrlPattern matches shortened pin.<tld> URLs.
	pinValidUrlPattern = `https?://(www\.)?pin\.(` + validHostRegex + `)/(?P<id>\w+)`
)
var ShortExtractor = &models.Extractor{
Name: "Pinterest (Short)",
CodeName: "pinterest:short",
Type: enums.ExtractorTypeSingle,
Category: enums.ExtractorCategorySocial,
URLPattern: regexp.MustCompile(`https?://(\w+\.)?pin\.\w+/(?P<id>\w+)`),
URLPattern: regexp.MustCompile(pinValidUrlPattern),
Host: func() []string {
var domains []string
for _, domain := range validHost {
domains = append(domains, "pin."+domain)
}
return domains
}(),
IsRedirect: true,
Run: func(ctx *models.DownloadContext) (*models.ExtractorResponse, error) {
@ -44,7 +63,15 @@ var Extractor = &models.Extractor{
CodeName: "pinterest",
Type: enums.ExtractorTypeSingle,
Category: enums.ExtractorCategorySocial,
URLPattern: regexp.MustCompile(`https?://(\w+\.)?pinterest[\.\w]+/pin/(?P<id>\d+)`),
URLPattern: regexp.MustCompile(validUrlPattern),
Host: func() []string {
var domains []string
for _, domain := range validHost {
domains = append(domains, "pinterest."+domain)
domains = append(domains, domain+".pinterest.com")
}
return domains
}(),
Run: func(ctx *models.DownloadContext) (*models.ExtractorResponse, error) {
media, err := ExtractPinMedia(ctx)

View file

@ -12,7 +12,15 @@ import (
"govd/util"
)
var httpSession = util.GetHTTPSession()
var (
	// httpSession is the shared HTTP client used for Reddit requests.
	httpSession = util.GetHTTPSession()
	// baseHost lists the hostnames served by the Reddit extractors.
	// NOTE(review): the dispatcher appears to strip only a leading "www."
	// before the host lookup, so the "old." variants must be listed
	// explicitly — confirm against the CtxByURL host-normalization logic.
	baseHost = []string{
		"reddit.com",
		"redditmedia.com",
		"old.reddit.com",
		"old.redditmedia.com",
	}
)
var ShortExtractor = &models.Extractor{
Name: "Reddit (Short)",
@ -20,6 +28,7 @@ var ShortExtractor = &models.Extractor{
Type: enums.ExtractorTypeSingle,
Category: enums.ExtractorCategorySocial,
URLPattern: regexp.MustCompile(`https?://(?P<host>(?:\w+\.)?reddit(?:media)?\.com)/(?P<slug>(?:(?:r|user)/[^/]+/)?s/(?P<id>[^/?#&]+))`),
Host: baseHost,
IsRedirect: true,
Run: func(ctx *models.DownloadContext) (*models.ExtractorResponse, error) {
@ -57,6 +66,7 @@ var Extractor = &models.Extractor{
Type: enums.ExtractorTypeSingle,
Category: enums.ExtractorCategorySocial,
URLPattern: regexp.MustCompile(`https?://(?P<host>(?:\w+\.)?reddit(?:media)?\.com)/(?P<slug>(?:(?:r|user)/[^/]+/)?comments/(?P<id>[^/?#&]+))`),
Host: baseHost,
Run: func(ctx *models.DownloadContext) (*models.ExtractorResponse, error) {
mediaList, err := MediaListFromAPI(ctx)

View file

@ -23,7 +23,19 @@ const (
appUserAgent = packageID + " (Linux; U; Android 13; en_US; Pixel 7; Build/TD1A.220804.031; Cronet/58.0.2991.0)"
)
var httpSession = util.GetHTTPSession()
var (
	// httpSession is the shared HTTP client used for TikTok requests.
	httpSession = util.GetHTTPSession()
	// baseHost lists the hostnames served by the TikTok extractors.
	// NOTE(review): the dispatcher appears to strip only a leading "www."
	// before the host lookup, so the vm./vt./m. short-link subdomains
	// (and their vxtiktok mirrors) must each be listed explicitly.
	baseHost = []string{
		"tiktok.com",
		"vxtiktok.com",
		"vm.tiktok.com",
		"vt.tiktok.com",
		"vt.vxtiktok.com",
		"vm.vxtiktok.com",
		"m.tiktok.com",
		"m.vxtiktok.com",
	}
)
var VMExtractor = &models.Extractor{
Name: "TikTok VM",
@ -31,6 +43,7 @@ var VMExtractor = &models.Extractor{
Type: enums.ExtractorTypeSingle,
Category: enums.ExtractorCategorySocial,
URLPattern: regexp.MustCompile(`https:\/\/((?:vm|vt|www)\.)?(vx)?tiktok\.com\/(?:t\/)?(?P<id>[a-zA-Z0-9]+)`),
Host: baseHost,
IsRedirect: true,
Run: func(ctx *models.DownloadContext) (*models.ExtractorResponse, error) {
@ -50,6 +63,7 @@ var Extractor = &models.Extractor{
Type: enums.ExtractorTypeSingle,
Category: enums.ExtractorCategorySocial,
URLPattern: regexp.MustCompile(`https?:\/\/((www|m)\.)?(vx)?tiktok\.com\/((?:embed|@[\w\.-]+)\/)?(v(ideo)?|p(hoto)?)\/(?P<id>[0-9]+)`),
Host: baseHost,
Run: func(ctx *models.DownloadContext) (*models.ExtractorResponse, error) {
mediaList, err := MediaListFromAPI(ctx)

View file

@ -25,6 +25,7 @@ var ShortExtractor = &models.Extractor{
Type: enums.ExtractorTypeSingle,
Category: enums.ExtractorCategorySocial,
URLPattern: regexp.MustCompile(`https?://t\.co/(?P<id>\w+)`),
Host: []string{"t.co"},
IsRedirect: true,
Run: func(ctx *models.DownloadContext) (*models.ExtractorResponse, error) {
@ -58,6 +59,12 @@ var Extractor = &models.Extractor{
Type: enums.ExtractorTypeSingle,
Category: enums.ExtractorCategorySocial,
URLPattern: regexp.MustCompile(`https?:\/\/(vx)?(twitter|x)\.com\/([^\/]+)\/status\/(?P<id>\d+)`),
Host: []string{
"twitter.com",
"x.com",
"vxx.com",
"vxtwitter.com",
},
Run: func(ctx *models.DownloadContext) (*models.ExtractorResponse, error) {
mediaList, err := MediaListFromAPI(ctx)

View file

@ -3,34 +3,73 @@ package ext
import (
"fmt"
"govd/models"
"net/url"
"strings"
"sync"
)
var maxRedirects = 5
var (
	// maxRedirects caps how many extractor-driven redirects CtxByURL follows.
	maxRedirects = 5
	// extractorsByHost indexes registered extractors by hostname so that
	// dispatch is a single map lookup instead of running every extractor's
	// regex against every incoming URL.
	extractorsByHost map[string][]*models.Extractor
	// extractorMapOnce guards the lazy one-time construction of extractorsByHost.
	extractorMapOnce sync.Once
)
// initExtractorMap builds the host -> extractors index exactly once.
// Subsequent calls are cheap no-ops thanks to sync.Once, so callers may
// invoke it unconditionally before each lookup.
func initExtractorMap() {
	extractorMapOnce.Do(func() {
		extractorsByHost = make(map[string][]*models.Extractor)
		for _, extractor := range List {
			// Ranging over a nil or empty Host slice is a no-op, so no
			// explicit length guard is needed.
			for _, domain := range extractor.Host {
				extractorsByHost[domain] = append(extractorsByHost[domain], extractor)
			}
		}
	})
}
func CtxByURL(urlStr string) (*models.DownloadContext, error) {
initExtractorMap()
func CtxByURL(url string) (*models.DownloadContext, error) {
var redirectCount int
currentURL := url
currentURL := urlStr
for redirectCount <= maxRedirects {
for _, extractor := range List {
matches := extractor.URLPattern.FindStringSubmatch(currentURL)
if matches == nil {
continue
parsedURL, err := url.Parse(currentURL)
if err != nil {
return nil, fmt.Errorf("invalid URL: %v", err)
}
groupNames := extractor.URLPattern.SubexpNames()
if len(matches) == 0 {
continue
host := strings.TrimPrefix(parsedURL.Host, "www.")
extractors := extractorsByHost[host]
if len(extractors) == 0 {
return nil, nil
}
groups := make(map[string]string)
var extractor *models.Extractor
var matches []string
var groups map[string]string
for _, ext := range extractors {
matches = ext.URLPattern.FindStringSubmatch(currentURL)
if matches != nil {
extractor = ext
groupNames := ext.URLPattern.SubexpNames()
groups = make(map[string]string)
for i, name := range groupNames {
if name != "" {
if name != "" && i < len(matches) {
groups[name] = matches[i]
}
}
groups["match"] = matches[0]
break
}
}
if extractor == nil || matches == nil {
return nil, nil
}
ctx := &models.DownloadContext{
MatchedContentID: groups["id"],
@ -54,14 +93,12 @@ func CtxByURL(url string) (*models.DownloadContext, error) {
currentURL = response.URL
redirectCount++
break
}
if redirectCount > maxRedirects {
return nil, fmt.Errorf("exceeded maximum number of redirects (%d)", maxRedirects)
}
}
return nil, nil
return nil, fmt.Errorf("failed to extract from URL: %s", urlStr)
}
func ByCodeName(codeName string) *models.Extractor {

View file

@ -11,6 +11,7 @@ type Extractor struct {
Type enums.ExtractorType
Category enums.ExtractorCategory
URLPattern *regexp.Regexp
Host []string
IsDRM bool
IsRedirect bool