instagram: new extraction method
new extraction method first tries to fetch content directly from instagram graphql API, fallback to html embed page. in case every method fail, rely on 3rd party
This commit is contained in:
parent
1b3c426808
commit
93e964a28b
10 changed files with 494 additions and 110 deletions
|
@ -1,13 +1,15 @@
|
|||
package instagram
|
||||
|
||||
import (
|
||||
"crypto/rand"
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"fmt"
|
||||
"govd/enums"
|
||||
"govd/models"
|
||||
"govd/util"
|
||||
"html"
|
||||
"io"
|
||||
"math/big"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"regexp"
|
||||
|
@ -16,11 +18,21 @@ import (
|
|||
"time"
|
||||
|
||||
"github.com/bytedance/sonic"
|
||||
"github.com/titanous/json5"
|
||||
)
|
||||
|
||||
const (
|
||||
graphQLEndpoint = "https://www.instagram.com/graphql/query/"
|
||||
polarisAction = "PolarisPostActionLoadPostQueryQuery"
|
||||
|
||||
igramHostname = "api.igram.world"
|
||||
igramKey = "aaeaf2805cea6abef3f9d2b6a666fce62fd9d612a43ab772bb50ce81455112e0"
|
||||
igramTimestamp = "1742201548873"
|
||||
)
|
||||
|
||||
var (
|
||||
captionPattern = regexp.MustCompile(
|
||||
`(?s)<meta property="og:title" content=".*?: "(.*?)""`)
|
||||
embedPattern = regexp.MustCompile(
|
||||
`new ServerJS\(\)\);s\.handle\(({.*})\);requireLazy`)
|
||||
|
||||
igHeaders = map[string]string{
|
||||
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
|
||||
|
@ -40,12 +52,129 @@ var (
|
|||
}
|
||||
)
|
||||
|
||||
func BuildSignedPayload(contentURL string) (io.Reader, error) {
|
||||
func ParseGQLMedia(
|
||||
ctx *models.DownloadContext,
|
||||
data *Media,
|
||||
) ([]*models.Media, error) {
|
||||
var mediaList []*models.Media
|
||||
|
||||
var caption string
|
||||
if data.EdgeMediaToCaption != nil && len(data.EdgeMediaToCaption.Edges) > 0 {
|
||||
caption = data.EdgeMediaToCaption.Edges[0].Node.Text
|
||||
}
|
||||
|
||||
mediaType := data.Typename
|
||||
contentID := ctx.MatchedContentID
|
||||
contentURL := ctx.MatchedContentURL
|
||||
|
||||
switch mediaType {
|
||||
case "GraphVideo", "XDTGraphVideo":
|
||||
media := ctx.Extractor.NewMedia(contentID, contentURL)
|
||||
media.SetCaption(caption)
|
||||
|
||||
media.AddFormat(&models.MediaFormat{
|
||||
FormatID: "video",
|
||||
Type: enums.MediaTypeVideo,
|
||||
VideoCodec: enums.MediaCodecAVC,
|
||||
AudioCodec: enums.MediaCodecAAC,
|
||||
URL: []string{data.VideoURL},
|
||||
Thumbnail: []string{data.DisplayURL},
|
||||
Width: int64(data.Dimensions.Width),
|
||||
Height: int64(data.Dimensions.Height),
|
||||
})
|
||||
|
||||
mediaList = append(mediaList, media)
|
||||
|
||||
case "GraphImage", "XDTGraphImage":
|
||||
media := ctx.Extractor.NewMedia(contentID, contentURL)
|
||||
media.SetCaption(caption)
|
||||
|
||||
media.AddFormat(&models.MediaFormat{
|
||||
FormatID: "image",
|
||||
Type: enums.MediaTypePhoto,
|
||||
URL: []string{data.DisplayURL},
|
||||
})
|
||||
|
||||
mediaList = append(mediaList, media)
|
||||
|
||||
case "GraphSidecar", "XDTGraphSidecar":
|
||||
if data.EdgeSidecarToChildren != nil && len(data.EdgeSidecarToChildren.Edges) > 0 {
|
||||
for _, edge := range data.EdgeSidecarToChildren.Edges {
|
||||
node := edge.Node
|
||||
media := ctx.Extractor.NewMedia(contentID, contentURL)
|
||||
media.SetCaption(caption)
|
||||
|
||||
switch node.Typename {
|
||||
case "GraphVideo", "XDTGraphVideo":
|
||||
media.AddFormat(&models.MediaFormat{
|
||||
FormatID: "video",
|
||||
Type: enums.MediaTypeVideo,
|
||||
VideoCodec: enums.MediaCodecAVC,
|
||||
AudioCodec: enums.MediaCodecAAC,
|
||||
URL: []string{node.VideoURL},
|
||||
Thumbnail: []string{node.DisplayURL},
|
||||
Width: int64(node.Dimensions.Width),
|
||||
Height: int64(node.Dimensions.Height),
|
||||
})
|
||||
|
||||
case "GraphImage", "XDTGraphImage":
|
||||
|
||||
media.AddFormat(&models.MediaFormat{
|
||||
FormatID: "image",
|
||||
Type: enums.MediaTypePhoto,
|
||||
URL: []string{node.DisplayURL},
|
||||
})
|
||||
}
|
||||
|
||||
mediaList = append(mediaList, media)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return mediaList, nil
|
||||
}
|
||||
|
||||
func ParseEmbedGQL(
|
||||
body []byte,
|
||||
) (*Media, error) {
|
||||
match := embedPattern.FindStringSubmatch(string(body))
|
||||
if len(match) < 2 {
|
||||
return nil, fmt.Errorf("failed to find JSON in response")
|
||||
}
|
||||
jsonData := match[1]
|
||||
|
||||
var data map[string]interface{}
|
||||
if err := json5.Unmarshal([]byte(jsonData), &data); err != nil {
|
||||
return nil, fmt.Errorf("failed to unmarshal JSON: %w", err)
|
||||
}
|
||||
igCtx := util.TraverseJSON(data, "contextJSON")
|
||||
if igCtx == nil {
|
||||
return nil, fmt.Errorf("contextJSON not found in data")
|
||||
}
|
||||
var ctxJSON ContextJSON
|
||||
switch v := igCtx.(type) {
|
||||
case string:
|
||||
if err := json5.Unmarshal([]byte(v), &ctxJSON); err != nil {
|
||||
return nil, fmt.Errorf("failed to unmarshal contextJSON: %w", err)
|
||||
}
|
||||
default:
|
||||
return nil, fmt.Errorf("contextJSON is not a string")
|
||||
}
|
||||
if ctxJSON.GqlData == nil {
|
||||
return nil, fmt.Errorf("gql_data is nil")
|
||||
}
|
||||
if ctxJSON.GqlData.ShortcodeMedia == nil {
|
||||
return nil, fmt.Errorf("media is nil")
|
||||
}
|
||||
return ctxJSON.GqlData.ShortcodeMedia, nil
|
||||
}
|
||||
|
||||
func BuildIGramPayload(contentURL string) (io.Reader, error) {
|
||||
timestamp := strconv.FormatInt(time.Now().UnixMilli(), 10)
|
||||
hash := sha256.New()
|
||||
_, err := io.WriteString(
|
||||
hash,
|
||||
contentURL+timestamp+apiKey,
|
||||
contentURL+timestamp+igramKey,
|
||||
)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error writing to SHA256 hash: %w", err)
|
||||
|
@ -56,7 +185,7 @@ func BuildSignedPayload(contentURL string) (io.Reader, error) {
|
|||
payload := map[string]string{
|
||||
"url": contentURL,
|
||||
"ts": timestamp,
|
||||
"_ts": apiTimestamp,
|
||||
"_ts": igramTimestamp,
|
||||
"_tsc": "0", // ?
|
||||
"_s": secretString,
|
||||
}
|
||||
|
@ -69,15 +198,14 @@ func BuildSignedPayload(contentURL string) (io.Reader, error) {
|
|||
}
|
||||
|
||||
func ParseIGramResponse(body []byte) (*IGramResponse, error) {
|
||||
var rawResponse interface{}
|
||||
//move to the start of the body
|
||||
// Use sonic's decoder to unmarshal the raw response
|
||||
var rawResponse any
|
||||
|
||||
if err := sonic.ConfigFastest.Unmarshal(body, &rawResponse); err != nil {
|
||||
return nil, fmt.Errorf("failed to decode response1: %w", err)
|
||||
}
|
||||
|
||||
switch rawResponse.(type) {
|
||||
case []interface{}:
|
||||
case []any:
|
||||
// array of IGramMedia
|
||||
var media []*IGramMedia
|
||||
if err := sonic.ConfigFastest.Unmarshal(body, &media); err != nil {
|
||||
|
@ -86,7 +214,7 @@ func ParseIGramResponse(body []byte) (*IGramResponse, error) {
|
|||
return &IGramResponse{
|
||||
Items: media,
|
||||
}, nil
|
||||
case map[string]interface{}:
|
||||
case map[string]any:
|
||||
// single IGramMedia
|
||||
var media IGramMedia
|
||||
if err := sonic.ConfigFastest.Unmarshal(body, &media); err != nil {
|
||||
|
@ -113,53 +241,142 @@ func GetCDNURL(contentURL string) (string, error) {
|
|||
return cdnURL, nil
|
||||
}
|
||||
|
||||
func GetPostCaption(
|
||||
client models.HTTPClient,
|
||||
postURL string,
|
||||
) (string, error) {
|
||||
func GetGQLData(
|
||||
ctx *models.DownloadContext,
|
||||
shortcode string,
|
||||
) (*GraphQLData, error) {
|
||||
session := util.GetHTTPClient(ctx.Extractor.CodeName)
|
||||
graphHeaders, body, err := BuildGQLData()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to build GQL data: %w", err)
|
||||
}
|
||||
formData := url.Values{}
|
||||
for key, value := range body {
|
||||
formData.Set(key, value)
|
||||
}
|
||||
formData.Set("fb_api_caller_class", "RelayModern")
|
||||
formData.Set("fb_api_req_friendly_name", polarisAction)
|
||||
variables := map[string]any{
|
||||
"shortcode": shortcode,
|
||||
"fetch_tagged_user_count": nil,
|
||||
"hoisted_comment_id": nil,
|
||||
"hoisted_reply_id": nil,
|
||||
}
|
||||
variablesJSON, err := sonic.ConfigFastest.Marshal(variables)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to marshal variables: %w", err)
|
||||
}
|
||||
formData.Set("variables", string(variablesJSON))
|
||||
formData.Set("server_timestamps", "true")
|
||||
formData.Set("doc_id", "8845758582119845") // idk what this is
|
||||
req, err := http.NewRequest(
|
||||
http.MethodGet,
|
||||
postURL,
|
||||
nil,
|
||||
http.MethodPost,
|
||||
graphQLEndpoint,
|
||||
strings.NewReader(formData.Encode()),
|
||||
)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("failed to create request: %w", err)
|
||||
return nil, fmt.Errorf("failed to create request: %w", err)
|
||||
}
|
||||
req.Header.Set("User-Agent", util.ChromeUA)
|
||||
req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
|
||||
req.Header.Set("Accept-Language", "it-IT,it;q=0.8,en-US;q=0.5,en;q=0.3")
|
||||
req.Header.Set("Referer", "https://www.instagram.com/accounts/onetap/?next=%2F")
|
||||
req.Header.Set("Alt-Used", "www.instagram.com")
|
||||
req.Header.Set("Connection", "keep-alive")
|
||||
req.Header.Set("Upgrade-Insecure-Requests", "1")
|
||||
req.Header.Set("Sec-Fetch-Dest", "document")
|
||||
req.Header.Set("Sec-Fetch-Mode", "navigate")
|
||||
req.Header.Set("Sec-Fetch-Site", "same-origin")
|
||||
req.Header.Set("Priority", "u=0, i")
|
||||
req.Header.Set("Pragma", "no-cache")
|
||||
req.Header.Set("Cache-Control", "no-cache")
|
||||
req.Header.Set("TE", "trailers")
|
||||
|
||||
resp, err := client.Do(req)
|
||||
for key, value := range igHeaders {
|
||||
req.Header.Set(key, value)
|
||||
}
|
||||
for key, value := range graphHeaders {
|
||||
req.Header.Set(key, value)
|
||||
}
|
||||
resp, err := session.Do(req)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("failed to send request: %w", err)
|
||||
return nil, fmt.Errorf("failed to send request: %w", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
// return an empty caption
|
||||
// probably 429 error
|
||||
return "", nil
|
||||
return nil, fmt.Errorf("invalid response code: %s", resp.Status)
|
||||
}
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("failed to read response body: %w", err)
|
||||
var response GraphQLResponse
|
||||
decoder := sonic.ConfigFastest.NewDecoder(resp.Body)
|
||||
if err := decoder.Decode(&response); err != nil {
|
||||
return nil, fmt.Errorf("failed to parse response: %w", err)
|
||||
}
|
||||
|
||||
matches := captionPattern.FindStringSubmatch(string(body))
|
||||
if len(matches) < 2 {
|
||||
// post has no caption most likely
|
||||
return "", nil
|
||||
if response.Data == nil {
|
||||
return nil, fmt.Errorf("data is nil")
|
||||
}
|
||||
return html.UnescapeString(matches[1]), nil
|
||||
if response.Status != "ok" {
|
||||
return nil, fmt.Errorf("status is not ok: %s", response.Status)
|
||||
}
|
||||
if response.Data.ShortcodeMedia == nil {
|
||||
return nil, fmt.Errorf("media is nil")
|
||||
}
|
||||
return response.Data, nil
|
||||
}
|
||||
|
||||
func BuildGQLData() (map[string]string, map[string]string, error) {
|
||||
const (
|
||||
domain = "www"
|
||||
requestID = "b"
|
||||
clientCapabilityGrade = "EXCELLENT"
|
||||
sessionInternalID = "7436540909012459023"
|
||||
apiVersion = "1"
|
||||
rolloutHash = "1019933358"
|
||||
appID = "936619743392459"
|
||||
bloksVersionID = "6309c8d03d8a3f47a1658ba38b304a3f837142ef5f637ebf1f8f52d4b802951e"
|
||||
asbdID = "129477"
|
||||
hiddenState = "20126.HYP:instagram_web_pkg.2.1...0"
|
||||
loggedIn = "0"
|
||||
cometRequestID = "7"
|
||||
appVersion = "0"
|
||||
pixelRatio = "2"
|
||||
buildType = "trunk"
|
||||
)
|
||||
session := "::" + util.RandomAlphaString(6)
|
||||
sessionData := util.RandomBase64(8)
|
||||
csrfToken := util.RandomBase64(32)
|
||||
deviceID := util.RandomBase64(24)
|
||||
machineID := util.RandomBase64(24)
|
||||
dynamicFlags := util.RandomBase64(154)
|
||||
clientSessionRnd := util.RandomBase64(154)
|
||||
jazoestBig, err := rand.Int(rand.Reader, big.NewInt(10000))
|
||||
if err != nil {
|
||||
return nil, nil, fmt.Errorf("failed to generate jazoest: %w", err)
|
||||
}
|
||||
jazoest := strconv.FormatInt(jazoestBig.Int64()+1, 10)
|
||||
timestamp := strconv.FormatInt(time.Now().Unix(), 10)
|
||||
cookies := []string{
|
||||
"csrftoken=" + csrfToken,
|
||||
"ig_did=" + deviceID,
|
||||
"wd=1280x720",
|
||||
"dpr=2",
|
||||
"mid=" + machineID,
|
||||
"ig_nrcb=1",
|
||||
}
|
||||
headers := map[string]string{
|
||||
"x-ig-app-id": appID,
|
||||
"X-FB-LSD": sessionData,
|
||||
"X-CSRFToken": csrfToken,
|
||||
"X-Bloks-Version-Id": bloksVersionID,
|
||||
"x-asbd-id": asbdID,
|
||||
"cookie": strings.Join(cookies, "; "),
|
||||
"Content-Type": "application/x-www-form-urlencoded",
|
||||
"X-FB-Friendly-Name": polarisAction,
|
||||
}
|
||||
body := map[string]string{
|
||||
"__d": domain,
|
||||
"__a": apiVersion,
|
||||
"__s": session,
|
||||
"__hs": hiddenState,
|
||||
"__req": requestID,
|
||||
"__ccg": clientCapabilityGrade,
|
||||
"__rev": rolloutHash,
|
||||
"__hsi": sessionInternalID,
|
||||
"__dyn": dynamicFlags,
|
||||
"__csr": clientSessionRnd,
|
||||
"__user": loggedIn,
|
||||
"__comet_req": cometRequestID,
|
||||
"av": appVersion,
|
||||
"dpr": pixelRatio,
|
||||
"lsd": sessionData,
|
||||
"jazoest": jazoest,
|
||||
"__spin_r": rolloutHash,
|
||||
"__spin_b": buildType,
|
||||
"__spin_t": timestamp,
|
||||
}
|
||||
return headers, body, nil
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue