[instagram] Add base extractor for playlists and tag extractor

This commit is contained in:
jhwgh1968 2019-01-20 09:10:46 +00:00 committed by Sergey M
parent 15870747f0
commit 31fbedc06a
2 changed files with 105 additions and 37 deletions

View File

@ -494,7 +494,11 @@
from .inc import IncIE from .inc import IncIE
from .indavideo import IndavideoEmbedIE from .indavideo import IndavideoEmbedIE
from .infoq import InfoQIE from .infoq import InfoQIE
from .instagram import InstagramIE, InstagramUserIE from .instagram import (
InstagramIE,
InstagramUserIE,
InstagramTagIE,
)
from .internazionale import InternazionaleIE from .internazionale import InternazionaleIE
from .internetvideoarchive import InternetVideoArchiveIE from .internetvideoarchive import InternetVideoArchiveIE
from .iprima import IPrimaIE from .iprima import IPrimaIE

View File

@ -227,44 +227,37 @@ def get_count(key, kind):
} }
class InstagramUserIE(InfoExtractor): class InstagramPlaylistIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?instagram\.com/(?P<id>[^/]{2,})/?(?:$|[?#])' # A superclass for handling any kind of query based on GraphQL which
IE_DESC = 'Instagram user profile' # results in a playlist.
IE_NAME = 'instagram:user'
_TEST = {
'url': 'https://instagram.com/porsche',
'info_dict': {
'id': 'porsche',
'title': 'porsche',
},
'playlist_count': 5,
'params': {
'extract_flat': True,
'skip_download': True,
'playlistend': 5,
}
}
_gis_tmpl = None _gis_tmpl = None # used to cache GIS request type
def _entries(self, data): def _parse_graphql(self, webpage, item_id):
# Reads a webpage and returns its GraphQL data.
return self._parse_json(
self._search_regex(
r'sharedData\s*=\s*({.+?})\s*;\s*[<\n]', webpage, 'data'),
item_id)
def _extract_graphql(self, data, url):
# Parses GraphQL queries containing videos and generates a playlist.
def get_count(suffix): def get_count(suffix):
return int_or_none(try_get( return int_or_none(try_get(
node, lambda x: x['edge_media_' + suffix]['count'])) node, lambda x: x['edge_media_' + suffix]['count']))
uploader_id = data['entry_data']['ProfilePage'][0]['graphql']['user']['id'] uploader_id = self._match_id(url)
csrf_token = data['config']['csrf_token'] csrf_token = data['config']['csrf_token']
rhx_gis = data.get('rhx_gis') or '3c7ca9dcefcf966d11dacf1f151335e8' rhx_gis = data.get('rhx_gis') or '3c7ca9dcefcf966d11dacf1f151335e8'
self._set_cookie('instagram.com', 'ig_pr', '1')
cursor = '' cursor = ''
for page_num in itertools.count(1): for page_num in itertools.count(1):
variables = json.dumps({ variables = {
'id': uploader_id,
'first': 12, 'first': 12,
'after': cursor, 'after': cursor,
}) }
variables.update(self._query_vars_for(data))
variables = json.dumps(variables)
if self._gis_tmpl: if self._gis_tmpl:
gis_tmpls = [self._gis_tmpl] gis_tmpls = [self._gis_tmpl]
@ -276,21 +269,26 @@ def get_count(suffix):
'%s:%s:%s' % (rhx_gis, csrf_token, std_headers['User-Agent']), '%s:%s:%s' % (rhx_gis, csrf_token, std_headers['User-Agent']),
] ]
# try all of the ways to generate a GIS query, and not only use the
# first one that works, but cache it for future requests
for gis_tmpl in gis_tmpls: for gis_tmpl in gis_tmpls:
try: try:
media = self._download_json( json_data = self._download_json(
'https://www.instagram.com/graphql/query/', uploader_id, 'https://www.instagram.com/graphql/query/', uploader_id,
'Downloading JSON page %d' % page_num, headers={ 'Downloading JSON page %d' % page_num, headers={
'X-Requested-With': 'XMLHttpRequest', 'X-Requested-With': 'XMLHttpRequest',
'X-Instagram-GIS': hashlib.md5( 'X-Instagram-GIS': hashlib.md5(
('%s:%s' % (gis_tmpl, variables)).encode('utf-8')).hexdigest(), ('%s:%s' % (gis_tmpl, variables)).encode('utf-8')).hexdigest(),
}, query={ }, query={
'query_hash': '42323d64886122307be10013ad2dcc44', 'query_hash': self._QUERY_HASH,
'variables': variables, 'variables': variables,
})['data']['user']['edge_owner_to_timeline_media'] })
media = self._parse_timeline_from(json_data)
self._gis_tmpl = gis_tmpl self._gis_tmpl = gis_tmpl
break break
except ExtractorError as e: except ExtractorError as e:
# if it's an error caused by a bad query, and there are
# more GIS templates to try, ignore it and keep trying
if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
if gis_tmpl != gis_tmpls[-1]: if gis_tmpl != gis_tmpls[-1]:
continue continue
@ -348,14 +346,80 @@ def get_count(suffix):
break break
def _real_extract(self, url): def _real_extract(self, url):
username = self._match_id(url) user_or_tag = self._match_id(url)
webpage = self._download_webpage(url, user_or_tag)
data = self._parse_graphql(webpage, user_or_tag)
webpage = self._download_webpage(url, username) self._set_cookie('instagram.com', 'ig_pr', '1')
data = self._parse_json(
self._search_regex(
r'sharedData\s*=\s*({.+?})\s*;\s*[<\n]', webpage, 'data'),
username)
return self.playlist_result( return self.playlist_result(
self._entries(data), username, username) self._extract_graphql(data, url), user_or_tag, user_or_tag)
class InstagramUserIE(InstagramPlaylistIE):
    # Playlist extractor for an Instagram user profile page. Supplies the
    # per-kind pieces (query hash, query variables, timeline location) that
    # the InstagramPlaylistIE base class uses for its GraphQL requests.
    _VALID_URL = r'https?://(?:www\.)?instagram\.com/(?P<id>[^/]{2,})/?(?:$|[?#])'
    IE_DESC = 'Instagram user profile'
    IE_NAME = 'instagram:user'
    _TEST = {
        'url': 'https://instagram.com/porsche',
        'info_dict': {
            'id': 'porsche',
            'title': 'porsche',
        },
        'playlist_count': 5,
        'params': {
            'extract_flat': True,
            'skip_download': True,
            'playlistend': 5,
        }
    }

    # Sent as the 'query_hash' parameter of the GraphQL request.
    # NOTE: this must be a plain string; a trailing comma after the literal
    # would silently turn it into a one-element tuple.
    _QUERY_HASH = '42323d64886122307be10013ad2dcc44'

    @staticmethod
    def _parse_timeline_from(data):
        # extracts the media timeline data from a GraphQL result
        # (raises KeyError if the expected nesting is absent)
        return data['data']['user']['edge_owner_to_timeline_media']

    @staticmethod
    def _query_vars_for(data):
        # returns a dictionary of variables to add to the timeline query based
        # on the GraphQL of the original page; for a profile this is the
        # user's id taken from the first ProfilePage entry
        return {
            'id': data['entry_data']['ProfilePage'][0]['graphql']['user']['id']
        }
class InstagramTagIE(InstagramPlaylistIE):
    # Playlist extractor for an Instagram hashtag page. Supplies the
    # per-kind pieces (query hash, query variables, timeline location) that
    # the InstagramPlaylistIE base class uses for its GraphQL requests.
    _VALID_URL = r'https?://(?:www\.)?instagram\.com/explore/tags/(?P<id>[^/]+)'
    IE_DESC = 'Instagram hashtag search'
    IE_NAME = 'instagram:tag'
    _TEST = {
        'url': 'https://instagram.com/explore/tags/lolcats',
        'info_dict': {
            'id': 'lolcats',
            'title': 'lolcats',
        },
        'playlist_count': 50,
        'params': {
            'extract_flat': True,
            'skip_download': True,
            'playlistend': 50,
        }
    }

    # Sent as the 'query_hash' parameter of the GraphQL request.
    # NOTE: this must be a plain string; a trailing comma after the literal
    # would silently turn it into a one-element tuple.
    _QUERY_HASH = 'f92f56d47dc7a55b606908374b43a314'

    @staticmethod
    def _parse_timeline_from(data):
        # extracts the media timeline data from a GraphQL result
        # (raises KeyError if the expected nesting is absent)
        return data['data']['hashtag']['edge_hashtag_to_media']

    @staticmethod
    def _query_vars_for(data):
        # returns a dictionary of variables to add to the timeline query based
        # on the GraphQL of the original page; for a tag page this is the
        # hashtag name taken from the first TagPage entry
        return {
            'tag_name':
                data['entry_data']['TagPage'][0]['graphql']['hashtag']['name']
        }