use host matching instead of regex (desc)
Why? Running a regex over every single message the bot receives, even with simple patterns, gets needlessly expensive for the CPU.
parent 84d005ade2
commit 3e351e7e43

7 changed files with 154 additions and 51 deletions
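
To illustrate the idea behind the change (this is not code from the commit): instead of running every extractor's URLPattern regex against each incoming URL, extractors are indexed by host once, so matching starts with a cheap map lookup and only the extractors registered for that host run their regex. A minimal, self-contained sketch with illustrative names follows; the real types live in govd/models.

package main

import (
	"fmt"
	"net/url"
	"strings"
)

// extractor is an illustrative stand-in for the real models.Extractor.
type extractor struct {
	name  string
	hosts []string
}

// buildIndex maps each host to the extractors that claim it, so each URL
// costs one map lookup instead of a regex pass over every extractor.
func buildIndex(list []*extractor) map[string][]*extractor {
	byHost := make(map[string][]*extractor)
	for _, ext := range list {
		for _, h := range ext.hosts {
			byHost[h] = append(byHost[h], ext)
		}
	}
	return byHost
}

func main() {
	index := buildIndex([]*extractor{
		{name: "instagram", hosts: []string{"instagram.com"}},
		{name: "tiktok", hosts: []string{"tiktok.com", "vm.tiktok.com"}},
	})

	u, err := url.Parse("https://www.instagram.com/reel/abc123/")
	if err != nil {
		panic(err)
	}
	host := strings.TrimPrefix(u.Host, "www.")

	// Only the extractors registered for this host would go on to run
	// their URLPattern regex.
	for _, ext := range index[host] {
		fmt.Println("candidate:", ext.name)
	}
}
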
@@ -44,12 +44,17 @@ var igHeaders = map[string]string{
 	"User-Agent": util.ChromeUA,
 }
 
+var instagramHost = []string{
+	"instagram.com",
+}
+
 var Extractor = &models.Extractor{
 	Name:       "Instagram",
 	CodeName:   "instagram",
 	Type:       enums.ExtractorTypeSingle,
 	Category:   enums.ExtractorCategorySocial,
 	URLPattern: regexp.MustCompile(`https:\/\/www\.instagram\.com\/(reel|p|tv)\/(?P<id>[a-zA-Z0-9_-]+)`),
+	Host:       instagramHost,
 	IsRedirect: false,
 
 	Run: func(ctx *models.DownloadContext) (*models.ExtractorResponse, error) {

@@ -66,6 +71,7 @@ var StoriesExtractor = &models.Extractor{
 	Type:       enums.ExtractorTypeSingle,
 	Category:   enums.ExtractorCategorySocial,
 	URLPattern: regexp.MustCompile(`https:\/\/www\.instagram\.com\/stories\/[a-zA-Z0-9._]+\/(?P<id>\d+)`),
+	Host:       instagramHost,
 	IsRedirect: false,
 
 	Run: func(ctx *models.DownloadContext) (*models.ExtractorResponse, error) {

@@ -82,6 +88,7 @@ var ShareURLExtractor = &models.Extractor{
 	Type:       enums.ExtractorTypeSingle,
 	Category:   enums.ExtractorCategorySocial,
 	URLPattern: regexp.MustCompile(`https?:\/\/(www\.)?instagram\.com\/share\/((reels?|video|s|p)\/)?(?P<id>[^\/\?]+)`),
+	Host:       instagramHost,
 	IsRedirect: true,
 
 	Run: func(ctx *models.DownloadContext) (*models.ExtractorResponse, error) {

@@ -6,6 +6,7 @@ import (
 	"io"
 	"net/http"
 	"regexp"
+	"strings"
 
 	"govd/enums"
 	"govd/models"

@@ -17,14 +18,32 @@ const (
 	shortenerAPIFormat = "https://api.pinterest.com/url_shortener/%s/redirect/"
 )
 
-var httpSession = util.GetHTTPSession()
+var (
+	httpSession = util.GetHTTPSession()
+	validHost   = []string{
+		"com", "fr", "de", "ch", "jp", "cl", "ca", "it", "co\\.uk", "nz", "ru", "com\\.au",
+		"at", "pt", "co\\.kr", "es", "com\\.mx", "dk", "ph", "th", "com\\.uy", "co", "nl",
+		"info", "kr", "ie", "vn", "com\\.vn", "ec", "mx", "in", "pe", "co\\.at", "hu",
+		"co\\.in", "co\\.nz", "id", "com\\.ec", "com\\.py", "tw", "be", "uk", "com\\.bo", "com\\.pe",
+	}
+	validHostRegex     = strings.Join(validHost, "|")
+	validUrlPattern    = `https?://(?:[^/]+\.)?pinterest\.(` + validHostRegex + `)/pin/(?:[\w-]+--)?(?P<id>\d+)`
+	pinValidUrlPattern = `https?://(www\.)?pin\.(` + validHostRegex + `)/(?P<id>\w+)`
+)
 
 var ShortExtractor = &models.Extractor{
 	Name:       "Pinterest (Short)",
 	CodeName:   "pinterest:short",
 	Type:       enums.ExtractorTypeSingle,
 	Category:   enums.ExtractorCategorySocial,
-	URLPattern: regexp.MustCompile(`https?://(\w+\.)?pin\.\w+/(?P<id>\w+)`),
+	URLPattern: regexp.MustCompile(pinValidUrlPattern),
+	Host: func() []string {
+		var domains []string
+		for _, domain := range validHost {
+			domains = append(domains, "pin."+domain)
+		}
+		return domains
+	}(),
 	IsRedirect: true,
 
 	Run: func(ctx *models.DownloadContext) (*models.ExtractorResponse, error) {

@@ -44,7 +63,15 @@ var Extractor = &models.Extractor{
 	CodeName:   "pinterest",
 	Type:       enums.ExtractorTypeSingle,
 	Category:   enums.ExtractorCategorySocial,
-	URLPattern: regexp.MustCompile(`https?://(\w+\.)?pinterest[\.\w]+/pin/(?P<id>\d+)`),
+	URLPattern: regexp.MustCompile(validUrlPattern),
+	Host: func() []string {
+		var domains []string
+		for _, domain := range validHost {
+			domains = append(domains, "pinterest."+domain)
+			domains = append(domains, domain+".pinterest.com")
+		}
+		return domains
+	}(),
 
 	Run: func(ctx *models.DownloadContext) (*models.ExtractorResponse, error) {
 		media, err := ExtractPinMedia(ctx)

@@ -12,7 +12,15 @@ import (
 	"govd/util"
 )
 
-var httpSession = util.GetHTTPSession()
+var (
+	httpSession = util.GetHTTPSession()
+	baseHost    = []string{
+		"reddit.com",
+		"redditmedia.com",
+		"old.reddit.com",
+		"old.redditmedia.com",
+	}
+)
 
 var ShortExtractor = &models.Extractor{
 	Name: "Reddit (Short)",

@@ -20,6 +28,7 @@ var ShortExtractor = &models.Extractor{
 	Type:       enums.ExtractorTypeSingle,
 	Category:   enums.ExtractorCategorySocial,
 	URLPattern: regexp.MustCompile(`https?://(?P<host>(?:\w+\.)?reddit(?:media)?\.com)/(?P<slug>(?:(?:r|user)/[^/]+/)?s/(?P<id>[^/?#&]+))`),
+	Host:       baseHost,
 	IsRedirect: true,
 
 	Run: func(ctx *models.DownloadContext) (*models.ExtractorResponse, error) {

@@ -57,6 +66,7 @@ var Extractor = &models.Extractor{
 	Type:       enums.ExtractorTypeSingle,
 	Category:   enums.ExtractorCategorySocial,
 	URLPattern: regexp.MustCompile(`https?://(?P<host>(?:\w+\.)?reddit(?:media)?\.com)/(?P<slug>(?:(?:r|user)/[^/]+/)?comments/(?P<id>[^/?#&]+))`),
+	Host:       baseHost,
 
 	Run: func(ctx *models.DownloadContext) (*models.ExtractorResponse, error) {
 		mediaList, err := MediaListFromAPI(ctx)

@@ -23,7 +23,19 @@ const (
 	appUserAgent = packageID + " (Linux; U; Android 13; en_US; Pixel 7; Build/TD1A.220804.031; Cronet/58.0.2991.0)"
 )
 
-var httpSession = util.GetHTTPSession()
+var (
+	httpSession = util.GetHTTPSession()
+	baseHost    = []string{
+		"tiktok.com",
+		"vxtiktok.com",
+		"vm.tiktok.com",
+		"vt.tiktok.com",
+		"vt.vxtiktok.com",
+		"vm.vxtiktok.com",
+		"m.tiktok.com",
+		"m.vxtiktok.com",
+	}
+)
 
 var VMExtractor = &models.Extractor{
 	Name: "TikTok VM",

@@ -31,6 +43,7 @@ var VMExtractor = &models.Extractor{
 	Type:       enums.ExtractorTypeSingle,
 	Category:   enums.ExtractorCategorySocial,
 	URLPattern: regexp.MustCompile(`https:\/\/((?:vm|vt|www)\.)?(vx)?tiktok\.com\/(?:t\/)?(?P<id>[a-zA-Z0-9]+)`),
+	Host:       baseHost,
 	IsRedirect: true,
 
 	Run: func(ctx *models.DownloadContext) (*models.ExtractorResponse, error) {

@@ -50,6 +63,7 @@ var Extractor = &models.Extractor{
 	Type:       enums.ExtractorTypeSingle,
 	Category:   enums.ExtractorCategorySocial,
 	URLPattern: regexp.MustCompile(`https?:\/\/((www|m)\.)?(vx)?tiktok\.com\/((?:embed|@[\w\.-]+)\/)?(v(ideo)?|p(hoto)?)\/(?P<id>[0-9]+)`),
+	Host:       baseHost,
 
 	Run: func(ctx *models.DownloadContext) (*models.ExtractorResponse, error) {
 		mediaList, err := MediaListFromAPI(ctx)

@@ -25,6 +25,7 @@ var ShortExtractor = &models.Extractor{
 	Type:       enums.ExtractorTypeSingle,
 	Category:   enums.ExtractorCategorySocial,
 	URLPattern: regexp.MustCompile(`https?://t\.co/(?P<id>\w+)`),
+	Host:       []string{"t.co"},
 	IsRedirect: true,
 
 	Run: func(ctx *models.DownloadContext) (*models.ExtractorResponse, error) {

@@ -58,6 +59,12 @@ var Extractor = &models.Extractor{
 	Type:       enums.ExtractorTypeSingle,
 	Category:   enums.ExtractorCategorySocial,
 	URLPattern: regexp.MustCompile(`https?:\/\/(vx)?(twitter|x)\.com\/([^\/]+)\/status\/(?P<id>\d+)`),
+	Host: []string{
+		"twitter.com",
+		"x.com",
+		"vxx.com",
+		"vxtwitter.com",
+	},
 
 	Run: func(ctx *models.DownloadContext) (*models.ExtractorResponse, error) {
 		mediaList, err := MediaListFromAPI(ctx)

ext/util.go (71 changes)

@@ -3,34 +3,73 @@ package ext
 import (
 	"fmt"
 	"govd/models"
+	"net/url"
+	"strings"
+	"sync"
 )
 
-var maxRedirects = 5
+var (
+	maxRedirects = 5
+
+	extractorsByHost map[string][]*models.Extractor
+	extractorMapOnce sync.Once
+)
 
+func initExtractorMap() {
+	extractorMapOnce.Do(func() {
+		extractorsByHost = make(map[string][]*models.Extractor)
+		for _, extractor := range List {
+			if len(extractor.Host) > 0 {
+				for _, domain := range extractor.Host {
+					extractorsByHost[domain] = append(extractorsByHost[domain], extractor)
+				}
+			}
+		}
+	})
+}
+
+func CtxByURL(urlStr string) (*models.DownloadContext, error) {
+	initExtractorMap()
-func CtxByURL(url string) (*models.DownloadContext, error) {
 	var redirectCount int
 
-	currentURL := url
+	currentURL := urlStr
 
 	for redirectCount <= maxRedirects {
-		for _, extractor := range List {
-			matches := extractor.URLPattern.FindStringSubmatch(currentURL)
-			if matches == nil {
-				continue
+		parsedURL, err := url.Parse(currentURL)
+		if err != nil {
+			return nil, fmt.Errorf("invalid URL: %v", err)
 		}
 
-			groupNames := extractor.URLPattern.SubexpNames()
-			if len(matches) == 0 {
-				continue
+		host := strings.TrimPrefix(parsedURL.Host, "www.")
+
+		extractors := extractorsByHost[host]
+		if len(extractors) == 0 {
+			return nil, nil
 		}
 
-			groups := make(map[string]string)
+		var extractor *models.Extractor
+		var matches []string
+		var groups map[string]string
+
+		for _, ext := range extractors {
+			matches = ext.URLPattern.FindStringSubmatch(currentURL)
+			if matches != nil {
+				extractor = ext
+				groupNames := ext.URLPattern.SubexpNames()
+				groups = make(map[string]string)
 				for i, name := range groupNames {
-					if name != "" {
+					if name != "" && i < len(matches) {
 						groups[name] = matches[i]
 					}
 				}
 				groups["match"] = matches[0]
+				break
+			}
+		}
+
+		if extractor == nil || matches == nil {
+			return nil, nil
+		}
 
 		ctx := &models.DownloadContext{
 			MatchedContentID: groups["id"],

@@ -54,14 +93,12 @@ func CtxByURL(url string) (*models.DownloadContext, error) {
 			currentURL = response.URL
 			redirectCount++
 
 			break
 		}
 
 		if redirectCount > maxRedirects {
 			return nil, fmt.Errorf("exceeded maximum number of redirects (%d)", maxRedirects)
 		}
 	}
-	return nil, nil
 
+	return nil, fmt.Errorf("failed to extract from URL: %s", urlStr)
 }
 
 func ByCodeName(codeName string) *models.Extractor {

@@ -11,6 +11,7 @@ type Extractor struct {
 	Type       enums.ExtractorType
 	Category   enums.ExtractorCategory
 	URLPattern *regexp.Regexp
+	Host       []string
 	IsDRM      bool
 	IsRedirect bool
 
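
For context, a hypothetical call site for the reworked CtxByURL might look like the sketch below. Only CtxByURL, the nil/nil "no match" return, and the MatchedContentID field appear in the diff above; the govd/ext import path and the surrounding main function are assumptions for illustration.

package main

import (
	"log"

	"govd/ext" // assumed import path, by analogy with "govd/models" in the diff
)

func main() {
	// The host is resolved first; only extractors registered for that
	// host run their URL pattern.
	dctx, err := ext.CtxByURL("https://www.instagram.com/reel/abc123/")
	if err != nil {
		log.Fatalf("could not build download context: %v", err)
	}
	if dctx == nil {
		log.Println("no extractor registered for this host, or the pattern did not match")
		return
	}
	log.Println("matched content id:", dctx.MatchedContentID)
}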