threads: new extractor

This commit is contained in:
stefanodvx 2025-04-24 12:21:12 +02:00
parent 5336968e05
commit effd834a47
3 changed files with 155 additions and 1 deletions

View file

@ -6,6 +6,7 @@ import (
"govd/ext/pinterest"
"govd/ext/reddit"
"govd/ext/redgifs"
"govd/ext/threads"
"govd/ext/tiktok"
"govd/ext/twitter"
"govd/models"
@ -17,6 +18,7 @@ var List = []*models.Extractor{
instagram.Extractor,
instagram.StoriesExtractor,
instagram.ShareURLExtractor,
threads.Extractor,
twitter.Extractor,
twitter.ShortExtractor,
pinterest.Extractor,
@ -25,5 +27,4 @@ var List = []*models.Extractor{
reddit.ShortExtractor,
ninegag.Extractor,
redgifs.Extractor,
// todo: add every ext lol
}

65
ext/threads/main.go Normal file
View file

@ -0,0 +1,65 @@
package threads
import (
"fmt"
"govd/enums"
"govd/models"
"govd/util"
"io"
"net/http"
"regexp"
)
// threadsHost lists the host names assigned to the Host field of the Threads
// extractor declared in this package.
var threadsHost = []string{"threads.net"}
// Extractor is the extractor definition for Threads posts.
//
// URLPattern matches https post URLs on threads.net of the form
// /@<user>/p/<id> or /@<user>/post/<id> and exposes the post identifier
// through the named capture group "id". Run resolves the post's media by
// delegating to GetEmbedMediaList and wraps the result in an
// ExtractorResponse.
var Extractor = &models.Extractor{
	Name:       "Threads",
	CodeName:   "threads",
	Type:       enums.ExtractorTypeSingle,
	Category:   enums.ExtractorCategorySocial,
	URLPattern: regexp.MustCompile(`https:\/\/(www\.)?threads\.net\/(?:@[^\/]+)\/p(?:ost)?\/(?P<id>[a-zA-Z0-9_-]+)`),
	Host:       threadsHost,
	IsRedirect: false,

	Run: func(ctx *models.DownloadContext) (*models.ExtractorResponse, error) {
		medias, err := GetEmbedMediaList(ctx)
		if err != nil {
			return nil, fmt.Errorf("failed to get media: %w", err)
		}
		response := &models.ExtractorResponse{MediaList: medias}
		return response, nil
	},
}
// GetEmbedMediaList fetches the embed page of the matched Threads post and
// returns the media parsed from it.
//
// The request goes to https://www.threads.net/@_/post/<id>/embed, where <id>
// is ctx.MatchedContentID, using the HTTP client obtained from
// util.GetHTTPClient for the extractor's code name and the package-level
// headers map. Any status other than 200 OK is reported as an error; on
// success the full response body is read and handed to ParseEmbedMedia.
func GetEmbedMediaList(ctx *models.DownloadContext) ([]*models.Media, error) {
	client := util.GetHTTPClient(ctx.Extractor.CodeName)

	target := fmt.Sprintf("https://www.threads.net/@_/post/%s/embed", ctx.MatchedContentID)
	request, err := http.NewRequest(http.MethodGet, target, nil)
	if err != nil {
		return nil, fmt.Errorf("failed to create request: %w", err)
	}
	for name, value := range headers {
		request.Header.Set(name, value)
	}

	response, err := client.Do(request)
	if err != nil {
		return nil, fmt.Errorf("failed to send request: %w", err)
	}
	defer response.Body.Close()

	if response.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("failed to get embed media: %s", response.Status)
	}

	payload, err := io.ReadAll(response.Body)
	if err != nil {
		return nil, fmt.Errorf("failed to read response body: %w", err)
	}

	return ParseEmbedMedia(ctx, payload)
}

88
ext/threads/util.go Normal file
View file

@ -0,0 +1,88 @@
package threads
import (
"bytes"
"fmt"
"govd/enums"
"govd/models"
"govd/util"
"github.com/PuerkitoBio/goquery"
)
// headers holds the request headers that GetEmbedMediaList copies onto every
// embed-page request. The values imitate a desktop Chrome 124 navigation.
//
// The Sec-Ch-Ua* client hints are structured header fields: brand names and
// the platform name are quoted strings, so the double quotes are part of the
// header value and must be balanced.
var headers = map[string]string{
	"Accept":          "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
	"Accept-Language": "en-GB,en;q=0.9",
	"Cache-Control":   "max-age=0",
	"Dnt":             "1",
	"Priority":        "u=0, i",
	// Brand list: every brand is a quoted string followed by its ;v= version.
	"Sec-Ch-Ua":        `"Chromium";v="124", "Google Chrome";v="124", "Not-A.Brand";v="99"`,
	"Sec-Ch-Ua-Mobile": "?0",
	// The platform is a quoted string as well, not a bare token.
	"Sec-Ch-Ua-Platform":        `"macOS"`,
	"Sec-Fetch-Dest":            "document",
	"Sec-Fetch-Mode":            "navigate",
	"Sec-Fetch-Site":            "none",
	"Sec-Fetch-User":            "?1",
	"Upgrade-Insecure-Requests": "1",
	"User-Agent":                util.ChromeUA,
}
// ParseEmbedMedia builds the media list for a Threads post from the HTML of
// its embed page.
//
// body is parsed with goquery; a parse failure is returned as an error.
// The caption is taken from the elements matching
// ".BodyContainerNoThreadLine .BodyTextContainer span"; each match overwrites
// the previous one, so the text of the last match is used (empty if none).
//
// For every ".MediaContainer" or ".SoloMediaContainer" element, one media
// entry is appended per <video> whose <source> carries a src attribute,
// followed by one entry per <img> that has a src attribute. Every entry is
// created through ctx.Extractor.NewMedia with the matched content ID and URL
// and carries the caption plus a single format. If nothing matches, the
// returned slice is nil and the error is nil.
func ParseEmbedMedia(
	ctx *models.DownloadContext,
	body []byte,
) ([]*models.Media, error) {
	var mediaList []*models.Media

	doc, err := goquery.NewDocumentFromReader(bytes.NewReader(body))
	if err != nil {
		return nil, fmt.Errorf("failed parsing HTML: %w", err)
	}

	contentID, contentURL := ctx.MatchedContentID, ctx.MatchedContentURL

	var caption string
	doc.Find(".BodyContainerNoThreadLine .BodyTextContainer span").Each(func(_ int, span *goquery.Selection) {
		caption = span.Text()
	})

	// appendMedia wraps a single format in a new media entry and records it.
	appendMedia := func(format *models.MediaFormat) {
		entry := ctx.Extractor.NewMedia(contentID, contentURL)
		entry.SetCaption(caption)
		entry.AddFormat(format)
		mediaList = append(mediaList, entry)
	}

	doc.Find(".MediaContainer, .SoloMediaContainer").Each(func(_ int, box *goquery.Selection) {
		// Videos of a container are collected before its images.
		box.Find("video").Each(func(_ int, video *goquery.Selection) {
			link, ok := video.Find("source").Attr("src")
			if !ok {
				return
			}
			appendMedia(&models.MediaFormat{
				Type:       enums.MediaTypeVideo,
				FormatID:   "video",
				URL:        []string{link},
				VideoCodec: enums.MediaCodecAVC,
				AudioCodec: enums.MediaCodecAAC,
			})
		})

		box.Find("img").Each(func(_ int, image *goquery.Selection) {
			link, ok := image.Attr("src")
			if !ok {
				return
			}
			appendMedia(&models.MediaFormat{
				Type:     enums.MediaTypePhoto,
				FormatID: "image",
				URL:      []string{link},
			})
		})
	})

	return mediaList, nil
}