threads: new extractor
This commit is contained in:
parent
5336968e05
commit
effd834a47
3 changed files with 155 additions and 1 deletions
65
ext/threads/main.go
Normal file
65
ext/threads/main.go
Normal file
|
@ -0,0 +1,65 @@
|
|||
package threads
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"govd/enums"
|
||||
"govd/models"
|
||||
"govd/util"
|
||||
"io"
|
||||
"net/http"
|
||||
"regexp"
|
||||
)
|
||||
|
||||
// threadsHost lists the hostnames assigned to the Host field of the Threads
// extractor.
var threadsHost = []string{"threads.net"}
|
||||
|
||||
var Extractor = &models.Extractor{
|
||||
Name: "Threads",
|
||||
CodeName: "threads",
|
||||
Type: enums.ExtractorTypeSingle,
|
||||
Category: enums.ExtractorCategorySocial,
|
||||
URLPattern: regexp.MustCompile(`https:\/\/(www\.)?threads\.net\/(?:@[^\/]+)\/p(?:ost)?\/(?P<id>[a-zA-Z0-9_-]+)`),
|
||||
Host: threadsHost,
|
||||
IsRedirect: false,
|
||||
|
||||
Run: func(ctx *models.DownloadContext) (*models.ExtractorResponse, error) {
|
||||
mediaList, err := GetEmbedMediaList(ctx)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to get media: %w", err)
|
||||
}
|
||||
return &models.ExtractorResponse{
|
||||
MediaList: mediaList,
|
||||
}, nil
|
||||
},
|
||||
}
|
||||
|
||||
func GetEmbedMediaList(ctx *models.DownloadContext) ([]*models.Media, error) {
|
||||
session := util.GetHTTPClient(ctx.Extractor.CodeName)
|
||||
embedURL := fmt.Sprintf(
|
||||
"https://www.threads.net/@_/post/%s/embed",
|
||||
ctx.MatchedContentID,
|
||||
)
|
||||
req, err := http.NewRequest(
|
||||
http.MethodGet,
|
||||
embedURL,
|
||||
nil,
|
||||
)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create request: %w", err)
|
||||
}
|
||||
for k, v := range headers {
|
||||
req.Header.Set(k, v)
|
||||
}
|
||||
res, err := session.Do(req)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to send request: %w", err)
|
||||
}
|
||||
defer res.Body.Close()
|
||||
if res.StatusCode != http.StatusOK {
|
||||
return nil, fmt.Errorf("failed to get embed media: %s", res.Status)
|
||||
}
|
||||
body, err := io.ReadAll(res.Body)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to read response body: %w", err)
|
||||
}
|
||||
return ParseEmbedMedia(ctx, body)
|
||||
}
|
88
ext/threads/util.go
Normal file
88
ext/threads/util.go
Normal file
|
@ -0,0 +1,88 @@
|
|||
package threads
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"govd/enums"
|
||||
"govd/models"
|
||||
"govd/util"
|
||||
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
)
|
||||
|
||||
var headers = map[string]string{
|
||||
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
|
||||
"Accept-Language": "en-GB,en;q=0.9",
|
||||
"Cache-Control": "max-age=0",
|
||||
"Dnt": "1",
|
||||
"Priority": "u=0, i",
|
||||
"Sec-Ch-Ua": `Chromium";v="124", "Google Chrome";v="124", "Not-A.Brand";v="99`,
|
||||
"Sec-Ch-Ua-Mobile": "?0",
|
||||
"Sec-Ch-Ua-Platform": "macOS",
|
||||
"Sec-Fetch-Dest": "document",
|
||||
"Sec-Fetch-Mode": "navigate",
|
||||
"Sec-Fetch-Site": "none",
|
||||
"Sec-Fetch-User": "?1",
|
||||
"Upgrade-Insecure-Requests": "1",
|
||||
"User-Agent": util.ChromeUA,
|
||||
}
|
||||
|
||||
func ParseEmbedMedia(
|
||||
ctx *models.DownloadContext,
|
||||
body []byte,
|
||||
) ([]*models.Media, error) {
|
||||
var mediaList []*models.Media
|
||||
|
||||
doc, err := goquery.NewDocumentFromReader(bytes.NewReader(body))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed parsing HTML: %w", err)
|
||||
}
|
||||
|
||||
contentID := ctx.MatchedContentID
|
||||
contentURL := ctx.MatchedContentURL
|
||||
|
||||
var caption string
|
||||
doc.Find(".BodyContainerNoThreadLine .BodyTextContainer span").Each(func(i int, c *goquery.Selection) {
|
||||
caption = c.Text()
|
||||
})
|
||||
|
||||
doc.Find(".MediaContainer, .SoloMediaContainer").Each(func(i int, container *goquery.Selection) {
|
||||
container.Find("video").Each(func(j int, vid *goquery.Selection) {
|
||||
sourceEl := vid.Find("source")
|
||||
src, exists := sourceEl.Attr("src")
|
||||
if exists {
|
||||
media := ctx.Extractor.NewMedia(
|
||||
contentID,
|
||||
contentURL,
|
||||
)
|
||||
media.SetCaption(caption)
|
||||
media.AddFormat(&models.MediaFormat{
|
||||
Type: enums.MediaTypeVideo,
|
||||
FormatID: "video",
|
||||
URL: []string{src},
|
||||
VideoCodec: enums.MediaCodecAVC,
|
||||
AudioCodec: enums.MediaCodecAAC,
|
||||
})
|
||||
mediaList = append(mediaList, media)
|
||||
}
|
||||
})
|
||||
container.Find("img").Each(func(j int, img *goquery.Selection) {
|
||||
src, exists := img.Attr("src")
|
||||
if exists {
|
||||
media := ctx.Extractor.NewMedia(
|
||||
contentID,
|
||||
contentURL,
|
||||
)
|
||||
media.SetCaption(caption)
|
||||
media.AddFormat(&models.MediaFormat{
|
||||
Type: enums.MediaTypePhoto,
|
||||
FormatID: "image",
|
||||
URL: []string{src},
|
||||
})
|
||||
mediaList = append(mediaList, media)
|
||||
}
|
||||
})
|
||||
})
|
||||
|
||||
return mediaList, nil
|
||||
}
|
Loading…
Add table
Add a link
Reference in a new issue