use host matching instead of regex (desc)
why? using regex on every single message the bot receives, even simple patterns, can be very harmful for your cpu lol
This commit is contained in:
parent
84d005ade2
commit
3e351e7e43
7 changed files with 154 additions and 51 deletions
129
ext/util.go
129
ext/util.go
|
@ -3,65 +3,102 @@ package ext
|
|||
import (
|
||||
"fmt"
|
||||
"govd/models"
|
||||
"net/url"
|
||||
"strings"
|
||||
"sync"
|
||||
)
|
||||
|
||||
var maxRedirects = 5
|
||||
var (
|
||||
maxRedirects = 5
|
||||
|
||||
func CtxByURL(url string) (*models.DownloadContext, error) {
|
||||
var redirectCount int
|
||||
extractorsByHost map[string][]*models.Extractor
|
||||
extractorMapOnce sync.Once
|
||||
)
|
||||
|
||||
currentURL := url
|
||||
|
||||
for redirectCount <= maxRedirects {
|
||||
func initExtractorMap() {
|
||||
extractorMapOnce.Do(func() {
|
||||
extractorsByHost = make(map[string][]*models.Extractor)
|
||||
for _, extractor := range List {
|
||||
matches := extractor.URLPattern.FindStringSubmatch(currentURL)
|
||||
if matches == nil {
|
||||
continue
|
||||
}
|
||||
|
||||
groupNames := extractor.URLPattern.SubexpNames()
|
||||
if len(matches) == 0 {
|
||||
continue
|
||||
}
|
||||
|
||||
groups := make(map[string]string)
|
||||
for i, name := range groupNames {
|
||||
if name != "" {
|
||||
groups[name] = matches[i]
|
||||
if len(extractor.Host) > 0 {
|
||||
for _, domain := range extractor.Host {
|
||||
extractorsByHost[domain] = append(extractorsByHost[domain], extractor)
|
||||
}
|
||||
}
|
||||
groups["match"] = matches[0]
|
||||
|
||||
ctx := &models.DownloadContext{
|
||||
MatchedContentID: groups["id"],
|
||||
MatchedContentURL: groups["match"],
|
||||
MatchedGroups: groups,
|
||||
Extractor: extractor,
|
||||
}
|
||||
|
||||
if !extractor.IsRedirect {
|
||||
return ctx, nil
|
||||
}
|
||||
|
||||
response, err := extractor.Run(ctx)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if response.URL == "" {
|
||||
return nil, fmt.Errorf("no URL found in response")
|
||||
}
|
||||
|
||||
currentURL = response.URL
|
||||
redirectCount++
|
||||
|
||||
break
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func CtxByURL(urlStr string) (*models.DownloadContext, error) {
|
||||
initExtractorMap()
|
||||
|
||||
var redirectCount int
|
||||
currentURL := urlStr
|
||||
|
||||
for redirectCount <= maxRedirects {
|
||||
parsedURL, err := url.Parse(currentURL)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("invalid URL: %v", err)
|
||||
}
|
||||
|
||||
host := strings.TrimPrefix(parsedURL.Host, "www.")
|
||||
|
||||
extractors := extractorsByHost[host]
|
||||
if len(extractors) == 0 {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
var extractor *models.Extractor
|
||||
var matches []string
|
||||
var groups map[string]string
|
||||
|
||||
for _, ext := range extractors {
|
||||
matches = ext.URLPattern.FindStringSubmatch(currentURL)
|
||||
if matches != nil {
|
||||
extractor = ext
|
||||
groupNames := ext.URLPattern.SubexpNames()
|
||||
groups = make(map[string]string)
|
||||
for i, name := range groupNames {
|
||||
if name != "" && i < len(matches) {
|
||||
groups[name] = matches[i]
|
||||
}
|
||||
}
|
||||
groups["match"] = matches[0]
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if extractor == nil || matches == nil {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
ctx := &models.DownloadContext{
|
||||
MatchedContentID: groups["id"],
|
||||
MatchedContentURL: groups["match"],
|
||||
MatchedGroups: groups,
|
||||
Extractor: extractor,
|
||||
}
|
||||
|
||||
if !extractor.IsRedirect {
|
||||
return ctx, nil
|
||||
}
|
||||
|
||||
response, err := extractor.Run(ctx)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if response.URL == "" {
|
||||
return nil, fmt.Errorf("no URL found in response")
|
||||
}
|
||||
|
||||
currentURL = response.URL
|
||||
redirectCount++
|
||||
|
||||
if redirectCount > maxRedirects {
|
||||
return nil, fmt.Errorf("exceeded maximum number of redirects (%d)", maxRedirects)
|
||||
}
|
||||
}
|
||||
return nil, nil
|
||||
|
||||
return nil, fmt.Errorf("failed to extract from URL: %s", urlStr)
|
||||
}
|
||||
|
||||
func ByCodeName(codeName string) *models.Extractor {
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue