Changes: (1) write chunks directly to disk instead of buffering them in memory; (2) use fixed-size 32 KB buffers for all I/O operations; (3) optimize the buffer allocation strategy in downloadInMemory; (4) implement proper file synchronization with mutex locks; (5) calculate chunk boundaries on the fly instead of pre-allocating them. Memory profiling showed excessive allocations in bytes.growSlice, which has been addressed by minimizing intermediate buffers and eliminating unnecessary memory copies. These changes should fix the observed OOM issues when downloading large files while maintaining the same functionality.
201 lines
4.9 KiB
Go
package twitter
|
|
|
|
import (
|
|
"fmt"
|
|
"io"
|
|
"net/http"
|
|
"regexp"
|
|
|
|
"govd/enums"
|
|
"govd/models"
|
|
"govd/util"
|
|
|
|
"github.com/bytedance/sonic"
|
|
"github.com/pkg/errors"
|
|
)
|
|
|
|
// GraphQL endpoint constants for fetching a tweet by its rest ID.
const (
	apiHostname = "x.com"
	apiBase     = "https://" + apiHostname + "/i/api/graphql/"
	// the leading path segment is the persisted-query hash for
	// the TweetResultByRestId GraphQL operation
	apiEndpoint = apiBase + "2ICDjqPd81tulZcYrtpTuQ/TweetResultByRestId"
)
|
|
|
|
var ShortExtractor = &models.Extractor{
|
|
Name: "Twitter (Short)",
|
|
CodeName: "twitter_short",
|
|
Type: enums.ExtractorTypeSingle,
|
|
Category: enums.ExtractorCategorySocial,
|
|
URLPattern: regexp.MustCompile(`https?://t\.co/(?P<id>\w+)`),
|
|
Host: []string{"t.co"},
|
|
IsRedirect: true,
|
|
|
|
Run: func(ctx *models.DownloadContext) (*models.ExtractorResponse, error) {
|
|
client := util.GetHTTPClient(ctx.Extractor.CodeName)
|
|
req, err := http.NewRequest(http.MethodGet, ctx.MatchedContentURL, nil)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to create req: %w", err)
|
|
}
|
|
req.Header.Set("User-Agent", util.ChromeUA)
|
|
res, err := client.Do(req)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to send request: %w", err)
|
|
}
|
|
defer res.Body.Close()
|
|
body, err := io.ReadAll(res.Body)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to read body: %w", err)
|
|
}
|
|
matchedURL := Extractor.URLPattern.FindStringSubmatch(string(body))
|
|
if matchedURL == nil {
|
|
return nil, errors.New("failed to find url in body")
|
|
}
|
|
return &models.ExtractorResponse{
|
|
URL: matchedURL[0],
|
|
}, nil
|
|
},
|
|
}
|
|
|
|
var Extractor = &models.Extractor{
|
|
Name: "Twitter",
|
|
CodeName: "twitter",
|
|
Type: enums.ExtractorTypeSingle,
|
|
Category: enums.ExtractorCategorySocial,
|
|
URLPattern: regexp.MustCompile(`https?:\/\/(vx)?(twitter|x)\.com\/([^\/]+)\/status\/(?P<id>\d+)`),
|
|
Host: []string{
|
|
"twitter.com",
|
|
"x.com",
|
|
"vxtwitter.com",
|
|
},
|
|
|
|
Run: func(ctx *models.DownloadContext) (*models.ExtractorResponse, error) {
|
|
mediaList, err := MediaListFromAPI(ctx)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to get media: %w", err)
|
|
}
|
|
return &models.ExtractorResponse{
|
|
MediaList: mediaList,
|
|
}, nil
|
|
},
|
|
}
|
|
|
|
func MediaListFromAPI(ctx *models.DownloadContext) ([]*models.Media, error) {
|
|
var mediaList []*models.Media
|
|
client := util.GetHTTPClient(ctx.Extractor.CodeName)
|
|
|
|
tweetData, err := GetTweetAPI(
|
|
client, ctx.MatchedContentID)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to get tweet data: %w", err)
|
|
}
|
|
|
|
caption := CleanCaption(tweetData.FullText)
|
|
|
|
var mediaEntities []MediaEntity
|
|
if tweetData.ExtendedEntities != nil && len(tweetData.ExtendedEntities.Media) > 0 {
|
|
mediaEntities = tweetData.ExtendedEntities.Media
|
|
} else if tweetData.Entities != nil && len(tweetData.Entities.Media) > 0 {
|
|
mediaEntities = tweetData.Entities.Media
|
|
} else {
|
|
return nil, nil
|
|
}
|
|
|
|
for _, mediaEntity := range mediaEntities {
|
|
media := ctx.Extractor.NewMedia(
|
|
ctx.MatchedContentID,
|
|
ctx.MatchedContentURL,
|
|
)
|
|
media.SetCaption(caption)
|
|
|
|
switch mediaEntity.Type {
|
|
case "video", "animated_gif":
|
|
formats, err := ExtractVideoFormats(&mediaEntity)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
for _, format := range formats {
|
|
media.AddFormat(format)
|
|
}
|
|
case "photo":
|
|
media.AddFormat(&models.MediaFormat{
|
|
Type: enums.MediaTypePhoto,
|
|
FormatID: "photo",
|
|
URL: []string{mediaEntity.MediaURLHTTPS},
|
|
})
|
|
}
|
|
|
|
if len(media.Formats) > 0 {
|
|
mediaList = append(mediaList, media)
|
|
}
|
|
}
|
|
|
|
return mediaList, nil
|
|
}
|
|
|
|
func GetTweetAPI(
|
|
client models.HTTPClient,
|
|
tweetID string,
|
|
) (*Tweet, error) {
|
|
cookies, err := util.ParseCookieFile("twitter.txt")
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to get cookies: %w", err)
|
|
}
|
|
headers := BuildAPIHeaders(cookies)
|
|
if headers == nil {
|
|
return nil, errors.New("failed to build headers. check cookies")
|
|
}
|
|
query := BuildAPIQuery(tweetID)
|
|
|
|
req, err := http.NewRequest(http.MethodGet, apiEndpoint, nil)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to create req: %w", err)
|
|
}
|
|
|
|
for key, value := range headers {
|
|
req.Header.Set(key, value)
|
|
}
|
|
|
|
for _, cookie := range cookies {
|
|
req.AddCookie(cookie)
|
|
}
|
|
|
|
q := req.URL.Query()
|
|
for key, value := range query {
|
|
q.Add(key, value)
|
|
}
|
|
req.URL.RawQuery = q.Encode()
|
|
|
|
resp, err := client.Do(req)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to send request: %w", err)
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
if resp.StatusCode != http.StatusOK {
|
|
return nil, fmt.Errorf("invalid response code: %s", resp.Status)
|
|
}
|
|
body, err := io.ReadAll(resp.Body)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to read body: %w", err)
|
|
}
|
|
|
|
var apiResponse APIResponse
|
|
err = sonic.ConfigFastest.Unmarshal(body, &apiResponse)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to parse response: %w", err)
|
|
}
|
|
|
|
result := apiResponse.Data.TweetResult.Result
|
|
if result == nil {
|
|
return nil, errors.New("failed to get tweet result")
|
|
}
|
|
|
|
var tweet *Tweet
|
|
if result.Tweet != nil {
|
|
tweet = result.Tweet
|
|
} else if result.Legacy != nil {
|
|
tweet = result.Legacy
|
|
} else {
|
|
return nil, errors.New("failed to get tweet data")
|
|
}
|
|
return tweet, nil
|
|
}
|