govd/ext/instagram/main.go
stefanodvx 93e964a28b instagram: new extraction method
The new extraction method first tries to fetch content directly from the Instagram GraphQL API, then falls back to the HTML embed page. If every method fails, it relies on a third-party service.
2025-04-23 18:52:05 +02:00

223 lines
6 KiB
Go

package instagram
import (
"fmt"
"govd/enums"
"govd/models"
"govd/util"
"io"
"net/http"
"regexp"
)
// instagramHost lists the hostnames assigned to the Host field of every
// Instagram extractor declared in this file.
var instagramHost = []string{
	"instagram.com",
}
// Extractor handles Instagram reel, post ("p") and tv links. Its Run function
// tries three extraction methods in order and returns the media list of the
// first one that succeeds with a non-empty result.
var Extractor = &models.Extractor{
	Name:       "Instagram",
	CodeName:   "instagram",
	Type:       enums.ExtractorTypeSingle,
	Category:   enums.ExtractorCategorySocial,
	URLPattern: regexp.MustCompile(`https:\/\/(www\.)?instagram\.com\/(reel|p|tv)\/(?P<id>[a-zA-Z0-9_-]+)`),
	Host:       instagramHost,
	IsRedirect: false,

	Run: func(ctx *models.DownloadContext) (*models.ExtractorResponse, error) {
		// Methods are listed in order of preference; a later one is only
		// attempted when every earlier one errored or produced no media.
		methods := []struct {
			name  string
			fetch func(*models.DownloadContext) ([]*models.Media, error)
		}{
			{"gql", GetGQLMediaList},     // method 1: GQL web API
			{"embed", GetEmbedMediaList}, // method 2: embed page
			{"igram", GetIGramMediaList}, // method 3: 3rd party service (unlikely)
		}

		// Accumulate the reason each method was rejected so the final error
		// explains why extraction failed instead of hiding every cause.
		failure := fmt.Errorf("failed to extract media: all methods failed")
		for _, method := range methods {
			mediaList, err := method.fetch(ctx)
			switch {
			case err != nil:
				failure = fmt.Errorf("%v; %s: %v", failure, method.name, err)
			case len(mediaList) == 0:
				// A nil error with an empty list still counts as a failure.
				failure = fmt.Errorf("%v; %s: no media found", failure, method.name)
			default:
				return &models.ExtractorResponse{MediaList: mediaList}, nil
			}
		}
		return nil, failure
	},
}
// StoriesExtractor handles Instagram story links of the form
// /stories/<name>/<numeric id>. Its Run function resolves the media
// exclusively through GetIGramMediaList.
var StoriesExtractor = &models.Extractor{
	Name:       "Instagram Stories",
	CodeName:   "instagram_stories",
	Type:       enums.ExtractorTypeSingle,
	Category:   enums.ExtractorCategorySocial,
	URLPattern: regexp.MustCompile(`https:\/\/(www\.)?instagram\.com\/stories\/[a-zA-Z0-9._]+\/(?P<id>\d+)`),
	Host:       instagramHost,
	IsRedirect: false,

	Run: func(ctx *models.DownloadContext) (*models.ExtractorResponse, error) {
		mediaList, err := GetIGramMediaList(ctx)
		if err != nil {
			// Return a nil response on failure, like the other extractors in
			// this file, instead of a response wrapping a nil media list.
			return nil, err
		}
		return &models.ExtractorResponse{
			MediaList: mediaList,
		}, nil
	},
}
// ShareURLExtractor handles instagram.com/share/... links. It is flagged as a
// redirect extractor: Run produces no media list, only the URL obtained from
// util.GetLocationURL for the matched share link.
var ShareURLExtractor = &models.Extractor{
	Name:       "Instagram Share URL",
	CodeName:   "instagram_share",
	Type:       enums.ExtractorTypeSingle,
	Category:   enums.ExtractorCategorySocial,
	URLPattern: regexp.MustCompile(`https?:\/\/(www\.)?instagram\.com\/share\/((reels?|video|s|p)\/)?(?P<id>[^\/\?]+)`),
	Host:       instagramHost,
	IsRedirect: true,

	Run: func(ctx *models.DownloadContext) (*models.ExtractorResponse, error) {
		// The HTTP client is looked up by the extractor's code name and the
		// request is sent with the shared igHeaders.
		location, err := util.GetLocationURL(
			util.GetHTTPClient(ctx.Extractor.CodeName),
			ctx.MatchedContentURL,
			igHeaders,
		)
		if err != nil {
			return nil, fmt.Errorf("failed to get url location: %w", err)
		}

		response := &models.ExtractorResponse{URL: location}
		return response, nil
	},
}
// GetGQLMediaList fetches the graph data for the matched content ID through
// GetGQLData and converts its ShortcodeMedia into a media list with
// ParseGQLMedia.
//
// It returns a wrapped error when the graph data cannot be fetched; errors
// from ParseGQLMedia are returned unchanged.
func GetGQLMediaList(ctx *models.DownloadContext) ([]*models.Media, error) {
	data, err := GetGQLData(ctx, ctx.MatchedContentID)
	if err != nil {
		return nil, fmt.Errorf("failed to get graph data: %w", err)
	}

	mediaList, err := ParseGQLMedia(ctx, data.ShortcodeMedia)
	return mediaList, err
}
// GetEmbedMediaList requests the captioned embed page of the matched content
// ID, parses the page body with ParseEmbedGQL and converts the result into a
// media list with ParseGQLMedia.
//
// The request carries every header from igHeaders. Any status other than
// 200 OK is reported as an error.
func GetEmbedMediaList(ctx *models.DownloadContext) ([]*models.Media, error) {
	client := util.GetHTTPClient(ctx.Extractor.CodeName)

	pageURL := fmt.Sprintf("https://www.instagram.com/p/%s/embed/captioned", ctx.MatchedContentID)
	request, err := http.NewRequest(http.MethodGet, pageURL, nil)
	if err != nil {
		return nil, fmt.Errorf("failed to create request: %w", err)
	}
	for name, value := range igHeaders {
		request.Header.Set(name, value)
	}

	response, err := client.Do(request)
	if err != nil {
		return nil, fmt.Errorf("failed to send request: %w", err)
	}
	defer response.Body.Close()

	if response.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("failed to get embed page: %s", response.Status)
	}

	page, err := io.ReadAll(response.Body)
	if err != nil {
		return nil, fmt.Errorf("failed to read response body: %w", err)
	}

	embedData, err := ParseEmbedGQL(page)
	if err != nil {
		return nil, fmt.Errorf("failed to parse embed page: %w", err)
	}
	return ParseGQLMedia(ctx, embedData)
}
// GetIGramMediaList resolves the matched content URL through GetFromIGram and
// builds one media entry per returned item.
//
// For each item only the first entry of item.URL is used. Its extension
// decides the format: "mp4" becomes a video format (AVC/AAC codecs, with the
// item's thumbnail), "jpg", "webp", "heic" and "jpeg" become a photo format.
//
// An error is returned when the lookup fails, when an item carries no URL
// entry, when a CDN URL cannot be resolved, or when the extension is unknown.
func GetIGramMediaList(ctx *models.DownloadContext) ([]*models.Media, error) {
	var mediaList []*models.Media
	postURL := ctx.MatchedContentURL
	details, err := GetFromIGram(ctx, postURL)
	if err != nil {
		return nil, fmt.Errorf("failed to get post: %w", err)
	}
	for _, item := range details.Items {
		// The response comes from a third-party service; guard the index
		// below so an item without URLs yields an error instead of a panic.
		if len(item.URL) == 0 {
			return nil, fmt.Errorf("no url found for item")
		}
		media := ctx.Extractor.NewMedia(
			ctx.MatchedContentID,
			ctx.MatchedContentURL,
		)
		urlObj := item.URL[0]
		contentURL, err := GetCDNURL(urlObj.URL)
		if err != nil {
			return nil, err
		}
		thumbnailURL, err := GetCDNURL(item.Thumb)
		if err != nil {
			return nil, err
		}
		fileExt := urlObj.Ext
		formatID := urlObj.Type
		switch fileExt {
		case "mp4":
			media.AddFormat(&models.MediaFormat{
				Type:       enums.MediaTypeVideo,
				FormatID:   formatID,
				URL:        []string{contentURL},
				VideoCodec: enums.MediaCodecAVC,
				AudioCodec: enums.MediaCodecAAC,
				Thumbnail:  []string{thumbnailURL},
			})
		case "jpg", "webp", "heic", "jpeg":
			media.AddFormat(&models.MediaFormat{
				Type:     enums.MediaTypePhoto,
				FormatID: formatID,
				URL:      []string{contentURL},
			})
		default:
			return nil, fmt.Errorf("unknown format: %s", fileExt)
		}
		mediaList = append(mediaList, media)
	}
	return mediaList, nil
}
// GetFromIGram posts a payload built by BuildIGramPayload for contentURL to
// the /api/convert endpoint on igramHostname and returns the response body
// parsed by ParseIGramResponse.
//
// The request is sent as JSON with the Chrome user agent from util. Any
// status other than 200 OK is reported as an error.
func GetFromIGram(
	ctx *models.DownloadContext,
	contentURL string,
) (*IGramResponse, error) {
	client := util.GetHTTPClient(ctx.Extractor.CodeName)
	endpoint := fmt.Sprintf("https://%s/api/convert", igramHostname)

	body, err := BuildIGramPayload(contentURL)
	if err != nil {
		return nil, fmt.Errorf("failed to build signed payload: %w", err)
	}

	request, err := http.NewRequest(http.MethodPost, endpoint, body)
	if err != nil {
		return nil, fmt.Errorf("failed to create request: %w", err)
	}
	request.Header.Set("Content-Type", "application/json")
	request.Header.Set("User-Agent", util.ChromeUA)

	result, err := client.Do(request)
	if err != nil {
		return nil, fmt.Errorf("failed to send request: %w", err)
	}
	defer result.Body.Close()

	if result.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("failed to get response: %s", result.Status)
	}

	raw, err := io.ReadAll(result.Body)
	if err != nil {
		return nil, fmt.Errorf("failed to read response body: %w", err)
	}

	parsed, err := ParseIGramResponse(raw)
	if err != nil {
		return nil, fmt.Errorf("failed to parse response: %w", err)
	}
	return parsed, nil
}