use host matching instead of regex
Why? Matching a regex against every single message the bot receives, even with simple patterns, is needlessly hard on the CPU.
This commit is contained in:
parent
84d005ade2
commit
3e351e7e43
7 changed files with 154 additions and 51 deletions
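
The gist of the change, as a minimal standalone sketch (the type and variable names below are illustrative, not the project's actual ones; the real implementation is the ext/util.go diff further down): parse the incoming URL once, normalize its host, and look that host up in a map, so only the few URL patterns registered for that host are ever evaluated, and messages for unknown hosts never touch a regex at all.

package main

import (
    "fmt"
    "net/url"
    "regexp"
    "strings"
)

// handler stands in for an extractor: a name plus the URL pattern it owns.
type handler struct {
    name    string
    pattern *regexp.Regexp
}

// hostIndex maps a bare host to the handlers that serve it (built once).
var hostIndex = map[string][]handler{
    "instagram.com": {
        {name: "instagram", pattern: regexp.MustCompile(`/(reel|p|tv)/(?P<id>[\w-]+)`)},
    },
}

// match parses the URL, strips "www.", and only then runs the regexes
// registered for that host.
func match(rawURL string) (name, id string, ok bool) {
    u, err := url.Parse(rawURL)
    if err != nil {
        return "", "", false
    }
    host := strings.TrimPrefix(u.Host, "www.")
    for _, h := range hostIndex[host] { // zero or a few candidates, not every extractor
        if m := h.pattern.FindStringSubmatch(rawURL); m != nil {
            return h.name, m[len(m)-1], true
        }
    }
    return "", "", false
}

func main() {
    fmt.Println(match("https://www.instagram.com/reel/abc123"))
    fmt.Println(match("https://example.com/whatever")) // unknown host: no regex is run
}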
Instagram extractor:

@@ -44,12 +44,17 @@ var igHeaders = map[string]string{
     "User-Agent": util.ChromeUA,
 }

+var instagramHost = []string{
+    "instagram.com",
+}
+
 var Extractor = &models.Extractor{
     Name: "Instagram",
     CodeName: "instagram",
     Type: enums.ExtractorTypeSingle,
     Category: enums.ExtractorCategorySocial,
     URLPattern: regexp.MustCompile(`https:\/\/www\.instagram\.com\/(reel|p|tv)\/(?P<id>[a-zA-Z0-9_-]+)`),
+    Host: instagramHost,
     IsRedirect: false,

     Run: func(ctx *models.DownloadContext) (*models.ExtractorResponse, error) {
@@ -66,6 +71,7 @@ var StoriesExtractor = &models.Extractor{
     Type: enums.ExtractorTypeSingle,
     Category: enums.ExtractorCategorySocial,
     URLPattern: regexp.MustCompile(`https:\/\/www\.instagram\.com\/stories\/[a-zA-Z0-9._]+\/(?P<id>\d+)`),
+    Host: instagramHost,
     IsRedirect: false,

     Run: func(ctx *models.DownloadContext) (*models.ExtractorResponse, error) {
@@ -82,6 +88,7 @@ var ShareURLExtractor = &models.Extractor{
     Type: enums.ExtractorTypeSingle,
     Category: enums.ExtractorCategorySocial,
     URLPattern: regexp.MustCompile(`https?:\/\/(www\.)?instagram\.com\/share\/((reels?|video|s|p)\/)?(?P<id>[^\/\?]+)`),
+    Host: instagramHost,
     IsRedirect: true,

     Run: func(ctx *models.DownloadContext) (*models.ExtractorResponse, error) {
Pinterest extractor:

@@ -6,6 +6,7 @@ import (
     "io"
     "net/http"
     "regexp"
+    "strings"

     "govd/enums"
     "govd/models"
@@ -17,14 +18,32 @@ const (
     shortenerAPIFormat = "https://api.pinterest.com/url_shortener/%s/redirect/"
 )

-var httpSession = util.GetHTTPSession()
+var (
+    httpSession = util.GetHTTPSession()
+    validHost = []string{
+        "com", "fr", "de", "ch", "jp", "cl", "ca", "it", "co\\.uk", "nz", "ru", "com\\.au",
+        "at", "pt", "co\\.kr", "es", "com\\.mx", "dk", "ph", "th", "com\\.uy", "co", "nl",
+        "info", "kr", "ie", "vn", "com\\.vn", "ec", "mx", "in", "pe", "co\\.at", "hu",
+        "co\\.in", "co\\.nz", "id", "com\\.ec", "com\\.py", "tw", "be", "uk", "com\\.bo", "com\\.pe",
+    }
+    validHostRegex = strings.Join(validHost, "|")
+    validUrlPattern = `https?://(?:[^/]+\.)?pinterest\.(` + validHostRegex + `)/pin/(?:[\w-]+--)?(?P<id>\d+)`
+    pinValidUrlPattern = `https?://(www\.)?pin\.(` + validHostRegex + `)/(?P<id>\w+)`
+)

 var ShortExtractor = &models.Extractor{
     Name: "Pinterest (Short)",
     CodeName: "pinterest:short",
     Type: enums.ExtractorTypeSingle,
     Category: enums.ExtractorCategorySocial,
-    URLPattern: regexp.MustCompile(`https?://(\w+\.)?pin\.\w+/(?P<id>\w+)`),
+    URLPattern: regexp.MustCompile(pinValidUrlPattern),
+    Host: func() []string {
+        var domains []string
+        for _, domain := range validHost {
+            domains = append(domains, "pin."+domain)
+        }
+        return domains
+    }(),
     IsRedirect: true,

     Run: func(ctx *models.DownloadContext) (*models.ExtractorResponse, error) {
@@ -44,7 +63,15 @@ var Extractor = &models.Extractor{
     CodeName: "pinterest",
     Type: enums.ExtractorTypeSingle,
     Category: enums.ExtractorCategorySocial,
-    URLPattern: regexp.MustCompile(`https?://(\w+\.)?pinterest[\.\w]+/pin/(?P<id>\d+)`),
+    URLPattern: regexp.MustCompile(validUrlPattern),
+    Host: func() []string {
+        var domains []string
+        for _, domain := range validHost {
+            domains = append(domains, "pinterest."+domain)
+            domains = append(domains, domain+".pinterest.com")
+        }
+        return domains
+    }(),

     Run: func(ctx *models.DownloadContext) (*models.ExtractorResponse, error) {
         media, err := ExtractPinMedia(ctx)
Reddit extractor:

@@ -12,7 +12,15 @@ import (
     "govd/util"
 )

-var httpSession = util.GetHTTPSession()
+var (
+    httpSession = util.GetHTTPSession()
+    baseHost = []string{
+        "reddit.com",
+        "redditmedia.com",
+        "old.reddit.com",
+        "old.redditmedia.com",
+    }
+)

 var ShortExtractor = &models.Extractor{
     Name: "Reddit (Short)",
@@ -20,6 +28,7 @@ var ShortExtractor = &models.Extractor{
     Type: enums.ExtractorTypeSingle,
     Category: enums.ExtractorCategorySocial,
     URLPattern: regexp.MustCompile(`https?://(?P<host>(?:\w+\.)?reddit(?:media)?\.com)/(?P<slug>(?:(?:r|user)/[^/]+/)?s/(?P<id>[^/?#&]+))`),
+    Host: baseHost,
     IsRedirect: true,

     Run: func(ctx *models.DownloadContext) (*models.ExtractorResponse, error) {
@@ -57,6 +66,7 @@ var Extractor = &models.Extractor{
     Type: enums.ExtractorTypeSingle,
     Category: enums.ExtractorCategorySocial,
     URLPattern: regexp.MustCompile(`https?://(?P<host>(?:\w+\.)?reddit(?:media)?\.com)/(?P<slug>(?:(?:r|user)/[^/]+/)?comments/(?P<id>[^/?#&]+))`),
+    Host: baseHost,

     Run: func(ctx *models.DownloadContext) (*models.ExtractorResponse, error) {
         mediaList, err := MediaListFromAPI(ctx)
TikTok extractor:

@@ -23,7 +23,19 @@ const (
     appUserAgent = packageID + " (Linux; U; Android 13; en_US; Pixel 7; Build/TD1A.220804.031; Cronet/58.0.2991.0)"
 )

-var httpSession = util.GetHTTPSession()
+var (
+    httpSession = util.GetHTTPSession()
+    baseHost = []string{
+        "tiktok.com",
+        "vxtiktok.com",
+        "vm.tiktok.com",
+        "vt.tiktok.com",
+        "vt.vxtiktok.com",
+        "vm.vxtiktok.com",
+        "m.tiktok.com",
+        "m.vxtiktok.com",
+    }
+)

 var VMExtractor = &models.Extractor{
     Name: "TikTok VM",
@@ -31,6 +43,7 @@ var VMExtractor = &models.Extractor{
     Type: enums.ExtractorTypeSingle,
     Category: enums.ExtractorCategorySocial,
     URLPattern: regexp.MustCompile(`https:\/\/((?:vm|vt|www)\.)?(vx)?tiktok\.com\/(?:t\/)?(?P<id>[a-zA-Z0-9]+)`),
+    Host: baseHost,
     IsRedirect: true,

     Run: func(ctx *models.DownloadContext) (*models.ExtractorResponse, error) {
@@ -50,6 +63,7 @@ var Extractor = &models.Extractor{
     Type: enums.ExtractorTypeSingle,
     Category: enums.ExtractorCategorySocial,
     URLPattern: regexp.MustCompile(`https?:\/\/((www|m)\.)?(vx)?tiktok\.com\/((?:embed|@[\w\.-]+)\/)?(v(ideo)?|p(hoto)?)\/(?P<id>[0-9]+)`),
+    Host: baseHost,

     Run: func(ctx *models.DownloadContext) (*models.ExtractorResponse, error) {
         mediaList, err := MediaListFromAPI(ctx)
Twitter/X extractor:

@@ -25,6 +25,7 @@ var ShortExtractor = &models.Extractor{
     Type: enums.ExtractorTypeSingle,
     Category: enums.ExtractorCategorySocial,
     URLPattern: regexp.MustCompile(`https?://t\.co/(?P<id>\w+)`),
+    Host: []string{"t.co"},
     IsRedirect: true,

     Run: func(ctx *models.DownloadContext) (*models.ExtractorResponse, error) {
@@ -58,6 +59,12 @@ var Extractor = &models.Extractor{
     Type: enums.ExtractorTypeSingle,
     Category: enums.ExtractorCategorySocial,
     URLPattern: regexp.MustCompile(`https?:\/\/(vx)?(twitter|x)\.com\/([^\/]+)\/status\/(?P<id>\d+)`),
+    Host: []string{
+        "twitter.com",
+        "x.com",
+        "vxx.com",
+        "vxtwitter.com",
+    },

     Run: func(ctx *models.DownloadContext) (*models.ExtractorResponse, error) {
         mediaList, err := MediaListFromAPI(ctx)
ext/util.go (71 lines changed):

@@ -3,34 +3,73 @@ package ext
 import (
     "fmt"
     "govd/models"
+    "net/url"
+    "strings"
+    "sync"
 )

-var maxRedirects = 5
+var (
+    maxRedirects = 5
+
+    extractorsByHost map[string][]*models.Extractor
+    extractorMapOnce sync.Once
+)
+
+func initExtractorMap() {
+    extractorMapOnce.Do(func() {
+        extractorsByHost = make(map[string][]*models.Extractor)
+        for _, extractor := range List {
+            if len(extractor.Host) > 0 {
+                for _, domain := range extractor.Host {
+                    extractorsByHost[domain] = append(extractorsByHost[domain], extractor)
+                }
+            }
+        }
+    })
+}
+
+func CtxByURL(urlStr string) (*models.DownloadContext, error) {
+    initExtractorMap()

-func CtxByURL(url string) (*models.DownloadContext, error) {
     var redirectCount int
-    currentURL := url
+    currentURL := urlStr

     for redirectCount <= maxRedirects {
-        for _, extractor := range List {
-            matches := extractor.URLPattern.FindStringSubmatch(currentURL)
-            if matches == nil {
-                continue
+        parsedURL, err := url.Parse(currentURL)
+        if err != nil {
+            return nil, fmt.Errorf("invalid URL: %v", err)
         }

-            groupNames := extractor.URLPattern.SubexpNames()
-            if len(matches) == 0 {
-                continue
+        host := strings.TrimPrefix(parsedURL.Host, "www.")
+
+        extractors := extractorsByHost[host]
+        if len(extractors) == 0 {
+            return nil, nil
         }

-        groups := make(map[string]string)
+        var extractor *models.Extractor
+        var matches []string
+        var groups map[string]string
+
+        for _, ext := range extractors {
+            matches = ext.URLPattern.FindStringSubmatch(currentURL)
+            if matches != nil {
+                extractor = ext
+                groupNames := ext.URLPattern.SubexpNames()
+                groups = make(map[string]string)
                 for i, name := range groupNames {
-                    if name != "" {
+                    if name != "" && i < len(matches) {
                         groups[name] = matches[i]
                     }
                 }
                 groups["match"] = matches[0]
+                break
+            }
+        }
+
+        if extractor == nil || matches == nil {
+            return nil, nil
+        }

         ctx := &models.DownloadContext{
             MatchedContentID: groups["id"],
@@ -54,14 +93,12 @@ func CtxByURL(url string) (*models.DownloadContext, error) {
         currentURL = response.URL
         redirectCount++

-            break
-        }

         if redirectCount > maxRedirects {
             return nil, fmt.Errorf("exceeded maximum number of redirects (%d)", maxRedirects)
         }
     }
-    return nil, nil
+    return nil, fmt.Errorf("failed to extract from URL: %s", urlStr)
 }

 func ByCodeName(codeName string) *models.Extractor {
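
From the caller's side nothing changes except the behaviour on failure; a hedged usage sketch (the import path govd/ext is assumed from the module's other imports, the URL is made up, and MatchedContentID is the only field referenced because it is the one visible in this diff):

package main

import (
    "log"

    "govd/ext" // assumed import path; the diff only shows the package name "ext"
)

func main() {
    ctx, err := ext.CtxByURL("https://www.instagram.com/reel/abc123")
    if err != nil {
        // invalid URL, too many redirects, or the final "failed to extract" error
        log.Fatalf("extraction failed: %v", err)
    }
    if ctx == nil {
        // host not registered by any extractor: no URL pattern was evaluated
        log.Println("unsupported link")
        return
    }
    log.Println("matched content id:", ctx.MatchedContentID)
}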
models.Extractor struct:

@@ -11,6 +11,7 @@ type Extractor struct {
     Type enums.ExtractorType
     Category enums.ExtractorCategory
     URLPattern *regexp.Regexp
+    Host []string
     IsDRM bool
     IsRedirect bool

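With the new field in place, adding support for a site means declaring its hosts up front alongside the URL pattern; a hypothetical declaration (the site, pattern, and behaviour are made up, only the field names come from this diff):

package example

import (
    "regexp"

    "govd/enums"
    "govd/models"
)

// ExampleExtractor is a made-up extractor showing where Host sits among the
// existing fields; CtxByURL will only consider it for links on example.com.
var ExampleExtractor = &models.Extractor{
    Name:       "Example",
    CodeName:   "example",
    Type:       enums.ExtractorTypeSingle,
    Category:   enums.ExtractorCategorySocial,
    URLPattern: regexp.MustCompile(`https?://example\.com/video/(?P<id>\d+)`),
    Host:       []string{"example.com"},
    IsRedirect: false,
    Run: func(ctx *models.DownloadContext) (*models.ExtractorResponse, error) {
        // fetch media for ctx.MatchedContentID here
        return nil, nil
    },
}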