From effd834a47796c333c6de9d120e06d5aea6b4fe6 Mon Sep 17 00:00:00 2001
From: stefanodvx <69367859+stefanodvx@users.noreply.github.com>
Date: Thu, 24 Apr 2025 12:21:12 +0200
Subject: [PATCH] threads: new extractor

---
Review note: the hunk bodies below were edited after export (regex group
name, client-hint header quoting, doc comments), so the blob hashes on the
"index" lines are stale.

 ext/main.go         |  3 +-
 ext/threads/main.go | 77 ++++++++++++++++++++++++++++++++++++++++++++
 ext/threads/util.go | 97 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 176 insertions(+), 1 deletion(-)
 create mode 100644 ext/threads/main.go
 create mode 100644 ext/threads/util.go

diff --git a/ext/main.go b/ext/main.go
index 8fd5231..c3c686f 100644
--- a/ext/main.go
+++ b/ext/main.go
@@ -6,6 +6,7 @@ import (
 	"govd/ext/pinterest"
 	"govd/ext/reddit"
 	"govd/ext/redgifs"
+	"govd/ext/threads"
 	"govd/ext/tiktok"
 	"govd/ext/twitter"
 	"govd/models"
@@ -17,6 +18,7 @@ var List = []*models.Extractor{
 	instagram.Extractor,
 	instagram.StoriesExtractor,
 	instagram.ShareURLExtractor,
+	threads.Extractor,
 	twitter.Extractor,
 	twitter.ShortExtractor,
 	pinterest.Extractor,
@@ -25,5 +27,4 @@ var List = []*models.Extractor{
 	reddit.ShortExtractor,
 	ninegag.Extractor,
 	redgifs.Extractor,
-	// todo: add every ext lol
 }
diff --git a/ext/threads/main.go b/ext/threads/main.go
new file mode 100644
index 0000000..204d3cb
--- /dev/null
+++ b/ext/threads/main.go
@@ -0,0 +1,77 @@
+package threads
+
+import (
+	"fmt"
+	"govd/enums"
+	"govd/models"
+	"govd/util"
+	"io"
+	"net/http"
+	"regexp"
+)
+
+// threadsHost is the host list assigned to Extractor.Host.
+var threadsHost = []string{"threads.net"}
+
+// Extractor matches Threads post URLs and resolves their media through
+// GetEmbedMediaList.
+//
+// URLPattern captures the post shortcode in the named group "id".
+// NOTE(review): "id" is assumed to be the group the framework reads to
+// populate ctx.MatchedContentID; confirm against the sibling extractors.
+var Extractor = &models.Extractor{
+	Name:       "Threads",
+	CodeName:   "threads",
+	Type:       enums.ExtractorTypeSingle,
+	Category:   enums.ExtractorCategorySocial,
+	URLPattern: regexp.MustCompile(`https:\/\/(www\.)?threads\.net\/(?:@[^\/]+)\/p(?:ost)?\/(?P<id>[a-zA-Z0-9_-]+)`),
+	Host:       threadsHost,
+	IsRedirect: false,
+
+	Run: func(ctx *models.DownloadContext) (*models.ExtractorResponse, error) {
+		mediaList, err := GetEmbedMediaList(ctx)
+		if err != nil {
+			return nil, fmt.Errorf("failed to get media: %w", err)
+		}
+		return &models.ExtractorResponse{
+			MediaList: mediaList,
+		}, nil
+	},
+}
+
+// GetEmbedMediaList fetches the embed page for ctx.MatchedContentID and
+// returns the media that ParseEmbedMedia extracts from it. It returns an
+// error if the request cannot be created or sent, if the response status is
+// not 200 OK, or if the body cannot be read or parsed.
+func GetEmbedMediaList(ctx *models.DownloadContext) ([]*models.Media, error) {
+	session := util.GetHTTPClient(ctx.Extractor.CodeName)
+	// "@_" takes the place of the author handle; only the post ID is interpolated.
+	embedURL := fmt.Sprintf(
+		"https://www.threads.net/@_/post/%s/embed",
+		ctx.MatchedContentID,
+	)
+	req, err := http.NewRequest(
+		http.MethodGet,
+		embedURL,
+		nil,
+	)
+	if err != nil {
+		return nil, fmt.Errorf("failed to create request: %w", err)
+	}
+	for k, v := range headers {
+		req.Header.Set(k, v)
+	}
+	res, err := session.Do(req)
+	if err != nil {
+		return nil, fmt.Errorf("failed to send request: %w", err)
+	}
+	defer res.Body.Close()
+	if res.StatusCode != http.StatusOK {
+		return nil, fmt.Errorf("failed to get embed media: %s", res.Status)
+	}
+	body, err := io.ReadAll(res.Body)
+	if err != nil {
+		return nil, fmt.Errorf("failed to read response body: %w", err)
+	}
+	return ParseEmbedMedia(ctx, body)
+}
diff --git a/ext/threads/util.go b/ext/threads/util.go
new file mode 100644
index 0000000..b3d5713
--- /dev/null
+++ b/ext/threads/util.go
@@ -0,0 +1,97 @@
+package threads
+
+import (
+	"bytes"
+	"fmt"
+	"govd/enums"
+	"govd/models"
+	"govd/util"
+
+	"github.com/PuerkitoBio/goquery"
+)
+
+// headers are set on every embed page request and imitate a desktop Chrome
+// navigation. The Sec-Ch-Ua and Sec-Ch-Ua-Platform values are structured-header
+// strings, so each string item carries its own pair of double quotes.
+var headers = map[string]string{
+	"Accept":                    "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
+	"Accept-Language":           "en-GB,en;q=0.9",
+	"Cache-Control":             "max-age=0",
+	"Dnt":                       "1",
+	"Priority":                  "u=0, i",
+	"Sec-Ch-Ua":                 `"Chromium";v="124", "Google Chrome";v="124", "Not-A.Brand";v="99"`,
+	"Sec-Ch-Ua-Mobile":          "?0",
+	"Sec-Ch-Ua-Platform":        `"macOS"`,
+	"Sec-Fetch-Dest":            "document",
+	"Sec-Fetch-Mode":            "navigate",
+	"Sec-Fetch-Site":            "none",
+	"Sec-Fetch-User":            "?1",
+	"Upgrade-Insecure-Requests": "1",
+	"User-Agent":                util.ChromeUA,
+}
+
+// ParseEmbedMedia extracts the media of a Threads post from the HTML of its
+// embed page. Every <video> with a <source src> and every <img src> found
+// inside a .MediaContainer or .SoloMediaContainer element becomes one
+// models.Media carrying the post caption. It returns an error only when the
+// HTML cannot be parsed; a page without media yields a nil slice and no error.
+func ParseEmbedMedia(
+	ctx *models.DownloadContext,
+	body []byte,
+) ([]*models.Media, error) {
+	var mediaList []*models.Media
+
+	doc, err := goquery.NewDocumentFromReader(bytes.NewReader(body))
+	if err != nil {
+		return nil, fmt.Errorf("failed parsing HTML: %w", err)
+	}
+
+	contentID := ctx.MatchedContentID
+	contentURL := ctx.MatchedContentURL
+
+	var caption string
+	// Each iteration overwrites caption, so the last matching span wins.
+	doc.Find(".BodyContainerNoThreadLine .BodyTextContainer span").Each(func(i int, c *goquery.Selection) {
+		caption = c.Text()
+	})
+
+	doc.Find(".MediaContainer, .SoloMediaContainer").Each(func(i int, container *goquery.Selection) {
+		container.Find("video").Each(func(j int, vid *goquery.Selection) {
+			sourceEl := vid.Find("source")
+			src, exists := sourceEl.Attr("src")
+			if exists {
+				media := ctx.Extractor.NewMedia(
+					contentID,
+					contentURL,
+				)
+				media.SetCaption(caption)
+				media.AddFormat(&models.MediaFormat{
+					Type:       enums.MediaTypeVideo,
+					FormatID:   "video",
+					URL:        []string{src},
+					VideoCodec: enums.MediaCodecAVC,
+					AudioCodec: enums.MediaCodecAAC,
+				})
+				mediaList = append(mediaList, media)
+			}
+		})
+		container.Find("img").Each(func(j int, img *goquery.Selection) {
+			src, exists := img.Attr("src")
+			if exists {
+				media := ctx.Extractor.NewMedia(
+					contentID,
+					contentURL,
+				)
+				media.SetCaption(caption)
+				media.AddFormat(&models.MediaFormat{
+					Type:     enums.MediaTypePhoto,
+					FormatID: "image",
+					URL:      []string{src},
+				})
+				mediaList = append(mediaList, media)
+			}
+		})
+	})
+
+	return mediaList, nil
+}