govd/ext/twitter/main.go
stefanodvx 0d986d4573 fix: resolve memory leaks in download utility
1. writing chunks directly to disk instead of buffering in memory
2. using fixed-size buffers (32KB) for all I/O operations
3. optimizing buffer allocation strategy in downloadInMemory
4. implementing proper file synchronization with mutex locks
5. calculating chunk boundaries on-the-fly instead of pre-allocating

the memory profiling showed excessive allocations in bytes.growSlice
which has been addressed by minimizing intermediate buffers and
eliminating unnecessary memory copies

these changes should fix the observed OOM issues when downloading
large files while maintaining the same functionality
2025-04-28 14:35:22 +02:00

201 lines
4.9 KiB
Go

package twitter
import (
"fmt"
"io"
"net/http"
"regexp"
"govd/enums"
"govd/models"
"govd/util"
"github.com/bytedance/sonic"
"github.com/pkg/errors"
)
const (
	// apiHostname is the host serving the Twitter/X GraphQL API.
	apiHostname = "x.com"
	// apiBase is the root URL of the GraphQL API.
	apiBase = "https://" + apiHostname + "/i/api/graphql/"
	// apiEndpoint is the TweetResultByRestId query URL; the hash segment
	// is a persisted-query ID and may change when X updates its frontend.
	apiEndpoint = apiBase + "2ICDjqPd81tulZcYrtpTuQ/TweetResultByRestId"
)
// ShortExtractor resolves t.co short links by fetching the page and
// scanning the response body for a canonical tweet URL (matched with
// the main Extractor's URL pattern).
var ShortExtractor = &models.Extractor{
	Name:       "Twitter (Short)",
	CodeName:   "twitter_short",
	Type:       enums.ExtractorTypeSingle,
	Category:   enums.ExtractorCategorySocial,
	URLPattern: regexp.MustCompile(`https?://t\.co/(?P<id>\w+)`),
	Host:       []string{"t.co"},
	IsRedirect: true,
	Run: func(ctx *models.DownloadContext) (*models.ExtractorResponse, error) {
		client := util.GetHTTPClient(ctx.Extractor.CodeName)
		req, err := http.NewRequest(http.MethodGet, ctx.MatchedContentURL, nil)
		if err != nil {
			return nil, fmt.Errorf("failed to create req: %w", err)
		}
		// a browser UA is set so t.co serves the page containing the
		// expanded tweet URL.
		req.Header.Set("User-Agent", util.ChromeUA)
		res, err := client.Do(req)
		if err != nil {
			return nil, fmt.Errorf("failed to send request: %w", err)
		}
		defer res.Body.Close()
		// fail fast on non-200 responses instead of scanning an error
		// page for a tweet URL (consistent with GetTweetAPI).
		if res.StatusCode != http.StatusOK {
			return nil, fmt.Errorf("invalid response code: %s", res.Status)
		}
		body, err := io.ReadAll(res.Body)
		if err != nil {
			return nil, fmt.Errorf("failed to read body: %w", err)
		}
		matchedURL := Extractor.URLPattern.FindStringSubmatch(string(body))
		if matchedURL == nil {
			return nil, errors.New("failed to find url in body")
		}
		// matchedURL[0] is the full canonical tweet URL.
		return &models.ExtractorResponse{
			URL: matchedURL[0],
		}, nil
	},
}
// Extractor handles full tweet status URLs on twitter.com, x.com and
// vxtwitter.com, delegating media resolution to MediaListFromAPI.
var Extractor = &models.Extractor{
	Name:       "Twitter",
	CodeName:   "twitter",
	Type:       enums.ExtractorTypeSingle,
	Category:   enums.ExtractorCategorySocial,
	URLPattern: regexp.MustCompile(`https?:\/\/(vx)?(twitter|x)\.com\/([^\/]+)\/status\/(?P<id>\d+)`),
	Host: []string{
		"twitter.com",
		"x.com",
		"vxtwitter.com",
	},
	Run: func(ctx *models.DownloadContext) (*models.ExtractorResponse, error) {
		media, err := MediaListFromAPI(ctx)
		if err != nil {
			return nil, fmt.Errorf("failed to get media: %w", err)
		}
		response := &models.ExtractorResponse{MediaList: media}
		return response, nil
	},
}
// MediaListFromAPI fetches a tweet through the GraphQL API and converts
// its attached media entities (videos, GIFs, photos) into Media items.
// Returns (nil, nil) when the tweet carries no media at all.
func MediaListFromAPI(ctx *models.DownloadContext) ([]*models.Media, error) {
	httpClient := util.GetHTTPClient(ctx.Extractor.CodeName)
	tweet, err := GetTweetAPI(
		httpClient, ctx.MatchedContentID)
	if err != nil {
		return nil, fmt.Errorf("failed to get tweet data: %w", err)
	}
	caption := CleanCaption(tweet.FullText)

	// extended entities carry the full video variants; fall back to the
	// plain entities list, and bail out quietly when there is no media.
	var entities []MediaEntity
	switch {
	case tweet.ExtendedEntities != nil && len(tweet.ExtendedEntities.Media) > 0:
		entities = tweet.ExtendedEntities.Media
	case tweet.Entities != nil && len(tweet.Entities.Media) > 0:
		entities = tweet.Entities.Media
	default:
		return nil, nil
	}

	var collected []*models.Media
	for i := range entities {
		// work on a copy so callees receive a pointer to a local value,
		// matching the original per-iteration copy semantics
		entity := entities[i]
		item := ctx.Extractor.NewMedia(
			ctx.MatchedContentID,
			ctx.MatchedContentURL,
		)
		item.SetCaption(caption)
		switch entity.Type {
		case "video", "animated_gif":
			formats, err := ExtractVideoFormats(&entity)
			if err != nil {
				return nil, err
			}
			for _, f := range formats {
				item.AddFormat(f)
			}
		case "photo":
			item.AddFormat(&models.MediaFormat{
				Type:     enums.MediaTypePhoto,
				FormatID: "photo",
				URL:      []string{entity.MediaURLHTTPS},
			})
		}
		// skip entities that produced no usable formats
		if len(item.Formats) > 0 {
			collected = append(collected, item)
		}
	}
	return collected, nil
}
// GetTweetAPI requests a tweet by rest ID from the x.com GraphQL
// endpoint, authenticating with cookies loaded from twitter.txt, and
// returns the decoded tweet payload.
func GetTweetAPI(
	client models.HTTPClient,
	tweetID string,
) (*Tweet, error) {
	cookies, err := util.ParseCookieFile("twitter.txt")
	if err != nil {
		return nil, fmt.Errorf("failed to get cookies: %w", err)
	}
	headers := BuildAPIHeaders(cookies)
	if headers == nil {
		return nil, errors.New("failed to build headers. check cookies")
	}
	req, err := http.NewRequest(http.MethodGet, apiEndpoint, nil)
	if err != nil {
		return nil, fmt.Errorf("failed to create req: %w", err)
	}
	for name, value := range headers {
		req.Header.Set(name, value)
	}
	for _, c := range cookies {
		req.AddCookie(c)
	}
	// encode the GraphQL variables/features into the query string
	params := req.URL.Query()
	for name, value := range BuildAPIQuery(tweetID) {
		params.Add(name, value)
	}
	req.URL.RawQuery = params.Encode()

	resp, err := client.Do(req)
	if err != nil {
		return nil, fmt.Errorf("failed to send request: %w", err)
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("invalid response code: %s", resp.Status)
	}
	raw, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("failed to read body: %w", err)
	}
	var parsed APIResponse
	if err := sonic.ConfigFastest.Unmarshal(raw, &parsed); err != nil {
		return nil, fmt.Errorf("failed to parse response: %w", err)
	}
	result := parsed.Data.TweetResult.Result
	if result == nil {
		return nil, errors.New("failed to get tweet result")
	}
	// the API wraps the tweet either directly or in a legacy envelope
	switch {
	case result.Tweet != nil:
		return result.Tweet, nil
	case result.Legacy != nil:
		return result.Legacy, nil
	default:
		return nil, errors.New("failed to get tweet data")
	}
}