1
0
mirror of https://source.netsyms.com/Mirrors/youtube-dl synced 2026-04-23 20:36:55 +00:00

Compare commits

...

16 Commits

Author SHA1 Message Date
Philipp Hagemeister
1f99511210 release 2013.05.04 2013-05-04 07:12:33 +02:00
Philipp Hagemeister
0d94f2474c Work around a Python bug on Windows with UTF-8 configuration (#820) 2013-05-04 07:09:50 +02:00
Philipp Hagemeister
480b6c1e8b Fix comedycentral: newest 2013-05-04 02:53:26 +02:00
Philipp Hagemeister
95464f14d1 Credit @yasoob for IE 2013-05-03 20:08:16 +02:00
Philipp Hagemeister
c34407d16c Simplify RedTube 2013-05-03 20:07:35 +02:00
M.Yasoob Ullah Khalid
5e34d2ebbf Moved redtube info extractor to the end 2013-05-03 23:57:16 +06:00
M.Yasoob Ullah Khalid
815dd2ffa8 Redtube test now works
I just did a little makeover by changing redtube tests. Now they are passed.
2013-05-03 23:51:27 +06:00
M.Yasoob Ullah Khalid
ecd5fb49c5 added redtube.com in InfoExtractors (2nd pull request with the required amindments)
added redtube.com in InfoExtractors (2nd pull request with the required amindments). Now this script can also download redtube.com videos
2013-05-03 22:44:34 +06:00
M.Yasoob Ullah Khalid
b86174e7a3 added test for redtube.com
I just added the test for redtube.com
2013-05-03 22:40:56 +06:00
Jaime Marquínez Ferrándiz
2e2038dc35 TEDIE: report the correct talk title when a link with the language code is given 2013-05-02 18:28:07 +02:00
Jaime Marquínez Ferrándiz
46bfb42258 InfoExtractors: use _download_webpage in more IEs
IEs without tests are intact.
2013-05-02 18:18:27 +02:00
Jaime Marquínez Ferrándiz
feecf22511 InfoExtractors: fix some regular expressions where dots weren't escaped 2013-05-02 13:39:56 +02:00
Jaime Marquínez Ferrándiz
4c4f15eb78 Merge pull request #815 from JohnyMoSwag/master
Update for new source links on worldstarhiphop.com
2013-05-02 13:23:32 +02:00
Jaime Marquínez Ferrándiz
104ccdb8b4 TumblrIE: fix title matching 2013-05-02 13:12:41 +02:00
Johny Mo Swag
6ccff79594 Small update for additon of new video source links 2013-05-01 20:30:14 -07:00
Jaime Marquínez Ferrándiz
aed523ecc1 Add BandcampIE (closes #568) 2013-05-01 15:55:46 +02:00
4 changed files with 179 additions and 213 deletions

View File

@@ -112,7 +112,7 @@
{
"name": "Escapist",
"url": "http://www.escapistmagazine.com/videos/view/the-escapist-presents/6618-Breaking-Down-Baldurs-Gate",
"file": "6618-Breaking-Down-Baldurs-Gate.flv",
"file": "6618-Breaking-Down-Baldurs-Gate.mp4",
"md5": "c6793dbda81388f4264c1ba18684a74d",
"skip": "Fails with timeout on Travis"
},
@@ -344,7 +344,7 @@
"file": "17258355236.mp4",
"md5": "7c6a514d691b034ccf8567999e9e88a3",
"info_dict": {
"title": "A sample video from LeeAnn. (If you need an idea..."
"title": "Calling all Pris! - A sample video from LeeAnn. (If you need an idea..."
}
},
{
@@ -394,6 +394,23 @@
}
}
]
},
{
"name":"Bandcamp",
"url":"http://youtube-dl.bandcamp.com/track/youtube-dl-test-song",
"file":"1812978515.mp3",
"md5":"cdeb30cdae1921719a3cbcab696ef53c",
"info_dict": {
"title":"youtube-dl test song \"'/\\ä↭"
}
},
{
"name": "RedTube",
"url": "http://www.redtube.com/66418",
"file": "66418.mp4",
"md5": "7b8c22b5e7098a3e1c09709df1126d2d",
"info_dict":{
"title":"Sucked on a toilet"
}
}
]

View File

@@ -124,8 +124,8 @@ class InfoExtractor(object):
errnote = u'Unable to download webpage'
raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
""" Returns the data of the page as a string """
def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
""" Returns a tuple (page content as string, URL handle) """
urlh = self._request_webpage(url_or_request, video_id, note, errnote)
content_type = urlh.headers.get('Content-Type', '')
m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
@@ -142,7 +142,12 @@ class InfoExtractor(object):
self.to_screen(u'Dumping request to ' + url)
dump = base64.b64encode(webpage_bytes).decode('ascii')
self._downloader.to_screen(dump)
return webpage_bytes.decode(encoding, 'replace')
content = webpage_bytes.decode(encoding, 'replace')
return (content, urlh)
def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
""" Returns the data of the page as a string """
return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]
def to_screen(self, msg):
"""Print msg to screen, prefixing it with '[ie_name]'"""
@@ -779,7 +784,6 @@ class MetacafeIE(InfoExtractor):
'ext': video_extension.decode('utf-8'),
}]
class DailymotionIE(InfoExtractor):
"""Information Extractor for Dailymotion"""
@@ -1069,13 +1073,7 @@ class VimeoIE(InfoExtractor):
# Retrieve video webpage to extract further information
request = compat_urllib_request.Request(url, None, std_headers)
try:
self.report_download_webpage(video_id)
webpage_bytes = compat_urllib_request.urlopen(request).read()
webpage = webpage_bytes.decode('utf-8')
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
return
webpage = self._download_webpage(request, video_id)
# Now we begin extracting as much information as we can from what we
# retrieved. First we extract the information common to all extractors,
@@ -1682,10 +1680,6 @@ class YoutubePlaylistIE(InfoExtractor):
"""Receives a URL and returns True if suitable for this IE."""
return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
def report_download_page(self, playlist_id, pagenum):
"""Report attempt to download playlist page with given number."""
self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
def _real_extract(self, url):
# Extract playlist id
mobj = re.match(self._VALID_URL, url, re.VERBOSE)
@@ -1699,14 +1693,8 @@ class YoutubePlaylistIE(InfoExtractor):
videos = []
while True:
self.report_download_page(playlist_id, page_num)
url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
try:
page = compat_urllib_request.urlopen(url).read().decode('utf8')
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
return
page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
try:
response = json.loads(page)
@@ -1745,10 +1733,6 @@ class YoutubeChannelIE(InfoExtractor):
_MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
IE_NAME = u'youtube:channel'
def report_download_page(self, channel_id, pagenum):
"""Report attempt to download channel page with given number."""
self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
def extract_videos_from_page(self, page):
ids_in_page = []
for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
@@ -1768,14 +1752,9 @@ class YoutubeChannelIE(InfoExtractor):
video_ids = []
pagenum = 1
self.report_download_page(channel_id, pagenum)
url = self._TEMPLATE_URL % (channel_id, pagenum)
request = compat_urllib_request.Request(url)
try:
page = compat_urllib_request.urlopen(request).read().decode('utf8')
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
return
page = self._download_webpage(url, channel_id,
u'Downloading page #%s' % pagenum)
# Extract video identifiers
ids_in_page = self.extract_videos_from_page(page)
@@ -1786,14 +1765,9 @@ class YoutubeChannelIE(InfoExtractor):
while True:
pagenum = pagenum + 1
self.report_download_page(channel_id, pagenum)
url = self._MORE_PAGES_URL % (pagenum, channel_id)
request = compat_urllib_request.Request(url)
try:
page = compat_urllib_request.urlopen(request).read().decode('utf8')
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
return
page = self._download_webpage(url, channel_id,
u'Downloading page #%s' % pagenum)
page = json.loads(page)
@@ -1820,11 +1794,6 @@ class YoutubeUserIE(InfoExtractor):
_VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
IE_NAME = u'youtube:user'
def report_download_page(self, username, start_index):
"""Report attempt to download user page."""
self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
(username, start_index, start_index + self._GDATA_PAGE_SIZE))
def _real_extract(self, url):
# Extract username
mobj = re.match(self._VALID_URL, url)
@@ -1844,15 +1813,10 @@ class YoutubeUserIE(InfoExtractor):
while True:
start_index = pagenum * self._GDATA_PAGE_SIZE + 1
self.report_download_page(username, start_index)
request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
try:
page = compat_urllib_request.urlopen(request).read().decode('utf-8')
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
return
gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
page = self._download_webpage(gdata_url, username,
u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))
# Extract video identifiers
ids_in_page = []
@@ -1886,11 +1850,6 @@ class BlipTVUserIE(InfoExtractor):
_PAGE_SIZE = 12
IE_NAME = u'blip.tv:user'
def report_download_page(self, username, pagenum):
"""Report attempt to download user page."""
self.to_screen(u'user %s: Downloading video ids from page %d' %
(username, pagenum))
def _real_extract(self, url):
# Extract username
mobj = re.match(self._VALID_URL, url)
@@ -1902,15 +1861,9 @@ class BlipTVUserIE(InfoExtractor):
page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
request = compat_urllib_request.Request(url)
try:
page = compat_urllib_request.urlopen(request).read().decode('utf-8')
mobj = re.search(r'data-users-id="([^"]+)"', page)
page_base = page_base % mobj.group(1)
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
return
page = self._download_webpage(url, username, u'Downloading user page')
mobj = re.search(r'data-users-id="([^"]+)"', page)
page_base = page_base % mobj.group(1)
# Download video ids using BlipTV Ajax calls. Result size per
@@ -1922,14 +1875,9 @@ class BlipTVUserIE(InfoExtractor):
pagenum = 1
while True:
self.report_download_page(username, pagenum)
url = page_base + "&page=" + str(pagenum)
request = compat_urllib_request.Request( url )
try:
page = compat_urllib_request.urlopen(request).read().decode('utf-8')
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
self._downloader.report_error(u'unable to download webpage: %s' % str(err))
return
page = self._download_webpage(url, username,
u'Downloading video ids from page %d' % pagenum)
# Extract video identifiers
ids_in_page = []
@@ -2288,12 +2236,6 @@ class ComedyCentralIE(InfoExtractor):
"""Receives a URL and returns True if suitable for this IE."""
return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
def report_config_download(self, episode_id, media_id):
self.to_screen(u'%s: Downloading configuration for %s' % (episode_id, media_id))
def report_index_download(self, episode_id):
self.to_screen(u'%s: Downloading show index' % episode_id)
def _print_formats(self, formats):
print('Available formats:')
for x in formats:
@@ -2327,24 +2269,15 @@ class ComedyCentralIE(InfoExtractor):
else:
epTitle = mobj.group('episode')
req = compat_urllib_request.Request(url)
self.report_extraction(epTitle)
try:
htmlHandle = compat_urllib_request.urlopen(req)
html = htmlHandle.read()
webpage = html.decode('utf-8')
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
return
webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
if dlNewest:
url = htmlHandle.geturl()
mobj = re.match(self._VALID_URL, url, re.VERBOSE)
if mobj is None:
self._downloader.report_error(u'Invalid redirected URL: ' + url)
return
raise ExtractorError(u'Invalid redirected URL: ' + url)
if mobj.group('episode') == '':
self._downloader.report_error(u'Redirected URL is still not specific: ' + url)
return
raise ExtractorError(u'Redirected URL is still not specific: ' + url)
epTitle = mobj.group('episode')
mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
@@ -2356,19 +2289,15 @@ class ComedyCentralIE(InfoExtractor):
altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
if len(altMovieParams) == 0:
self._downloader.report_error(u'unable to find Flash URL in webpage ' + url)
return
raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
else:
mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
uri = mMovieParams[0][1]
indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
self.report_index_download(epTitle)
try:
indexXml = compat_urllib_request.urlopen(indexUrl).read()
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
self._downloader.report_error(u'unable to download episode index: ' + compat_str(err))
return
indexXml = self._download_webpage(indexUrl, epTitle,
u'Downloading show index',
u'unable to download episode index')
results = []
@@ -2383,13 +2312,8 @@ class ComedyCentralIE(InfoExtractor):
configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
compat_urllib_parse.urlencode({'uri': mediaId}))
configReq = compat_urllib_request.Request(configUrl)
self.report_config_download(epTitle, shortMediaId)
try:
configXml = compat_urllib_request.urlopen(configReq).read()
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
return
configXml = self._download_webpage(configUrl, epTitle,
u'Downloading configuration for %s' % shortMediaId)
cdoc = xml.etree.ElementTree.fromstring(configXml)
turls = []
@@ -2446,9 +2370,6 @@ class EscapistIE(InfoExtractor):
_VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
IE_NAME = u'escapist'
def report_config_download(self, showName):
self.to_screen(u'%s: Downloading configuration' % showName)
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
if mobj is None:
@@ -2458,14 +2379,7 @@ class EscapistIE(InfoExtractor):
videoId = mobj.group('episode')
self.report_extraction(showName)
try:
webPage = compat_urllib_request.urlopen(url)
webPageBytes = webPage.read()
m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
self._downloader.report_error(u'unable to download webpage: ' + compat_str(err))
return
webPage = self._download_webpage(url, showName)
descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
description = unescapeHTML(descMatch.group(1))
@@ -2476,14 +2390,9 @@ class EscapistIE(InfoExtractor):
configUrlMatch = re.search('config=(.*)$', playerUrl)
configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
self.report_config_download(showName)
try:
configJSON = compat_urllib_request.urlopen(configUrl)
m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
self._downloader.report_error(u'unable to download configuration: ' + compat_str(err))
return
configJSON = self._download_webpage(configUrl, showName,
u'Downloading configuration',
u'unable to download configuration')
# Technically, it's JavaScript, not JSON
configJSON = configJSON.replace("'", '"')
@@ -2661,31 +2570,22 @@ class SoundcloudIE(InfoExtractor):
# extract simple title (uploader + slug of song title)
slug_title = mobj.group(2)
simple_title = uploader + u'-' + slug_title
full_title = '%s/%s' % (uploader, slug_title)
self.report_resolve('%s/%s' % (uploader, slug_title))
self.report_resolve(full_title)
url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
request = compat_urllib_request.Request(resolv_url)
try:
info_json_bytes = compat_urllib_request.urlopen(request).read()
info_json = info_json_bytes.decode('utf-8')
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
return
info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')
info = json.loads(info_json)
video_id = info['id']
self.report_extraction('%s/%s' % (uploader, slug_title))
self.report_extraction(full_title)
streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
request = compat_urllib_request.Request(streams_url)
try:
stream_json_bytes = compat_urllib_request.urlopen(request).read()
stream_json = stream_json_bytes.decode('utf-8')
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
return
stream_json = self._download_webpage(streams_url, full_title,
u'Downloading stream definitions',
u'unable to download stream definitions')
streams = json.loads(stream_json)
mediaURL = streams['http_mp3_128_url']
@@ -2728,18 +2628,13 @@ class SoundcloudSetIE(InfoExtractor):
# extract simple title (uploader + slug of song title)
slug_title = mobj.group(2)
simple_title = uploader + u'-' + slug_title
full_title = '%s/sets/%s' % (uploader, slug_title)
self.report_resolve('%s/sets/%s' % (uploader, slug_title))
self.report_resolve(full_title)
url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
request = compat_urllib_request.Request(resolv_url)
try:
info_json_bytes = compat_urllib_request.urlopen(request).read()
info_json = info_json_bytes.decode('utf-8')
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
return
info_json = self._download_webpage(resolv_url, full_title)
videos = []
info = json.loads(info_json)
@@ -2748,19 +2643,14 @@ class SoundcloudSetIE(InfoExtractor):
self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
return
self.report_extraction(full_title)
for track in info['tracks']:
video_id = track['id']
self.report_extraction('%s/sets/%s' % (uploader, slug_title))
streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
request = compat_urllib_request.Request(streams_url)
try:
stream_json_bytes = compat_urllib_request.urlopen(request).read()
stream_json = stream_json_bytes.decode('utf-8')
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
return
stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')
self.report_extraction(video_id)
streams = json.loads(stream_json)
mediaURL = streams['http_mp3_128_url']
@@ -3155,18 +3045,11 @@ class YoukuIE(InfoExtractor):
info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
request = compat_urllib_request.Request(info_url, None, std_headers)
try:
self.report_download_webpage(video_id)
jsondata = compat_urllib_request.urlopen(request).read()
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
return
jsondata = self._download_webpage(info_url, video_id)
self.report_extraction(video_id)
try:
jsonstr = jsondata.decode('utf-8')
config = json.loads(jsonstr)
config = json.loads(jsondata)
video_title = config['data'][0]['title']
seed = config['data'][0]['seed']
@@ -3234,15 +3117,8 @@ class XNXXIE(InfoExtractor):
return
video_id = mobj.group(1)
self.report_download_webpage(video_id)
# Get webpage content
try:
webpage_bytes = compat_urllib_request.urlopen(url).read()
webpage = webpage_bytes.decode('utf-8')
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
self._downloader.report_error(u'unable to download video webpage: %s' % err)
return
webpage = self._download_webpage(url, video_id)
result = re.search(self.VIDEO_URL_RE, webpage)
if result is None:
@@ -3314,12 +3190,7 @@ class GooglePlusIE(InfoExtractor):
# Step 1, Retrieve post webpage to extract further information
self.report_extract_entry(post_url)
request = compat_urllib_request.Request(post_url)
try:
webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
self._downloader.report_error(u'Unable to retrieve entry webpage: %s' % compat_str(err))
return
webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')
# Extract update date
upload_date = None
@@ -3356,12 +3227,7 @@ class GooglePlusIE(InfoExtractor):
self._downloader.report_error(u'unable to extract video page URL')
video_page = mobj.group(1)
request = compat_urllib_request.Request(video_page)
try:
webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
return
webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
self.report_extract_vid_page(video_page)
@@ -3448,14 +3314,10 @@ class JustinTVIE(InfoExtractor):
(channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
# Return count of items, list of *valid* items
def _parse_page(self, url):
try:
urlh = compat_urllib_request.urlopen(url)
webpage_bytes = urlh.read()
webpage = webpage_bytes.decode('utf-8', 'ignore')
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
self._downloader.report_error(u'unable to download video info JSON: %s' % compat_str(err))
return
def _parse_page(self, url, video_id):
webpage = self._download_webpage(url, video_id,
u'Downloading video info JSON',
u'unable to download video info JSON')
response = json.loads(webpage)
if type(response) != list:
@@ -3507,7 +3369,7 @@ class JustinTVIE(InfoExtractor):
if paged:
self.report_download_page(video_id, offset)
page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
page_count, page_info = self._parse_page(page_url)
page_count, page_info = self._parse_page(page_url, video_id)
info.extend(page_info)
if not paged or page_count != limit:
break
@@ -3554,7 +3416,7 @@ class FunnyOrDieIE(InfoExtractor):
return [info]
class SteamIE(InfoExtractor):
_VALID_URL = r"""http://store.steampowered.com/
_VALID_URL = r"""http://store\.steampowered\.com/
(agecheck/)?
(?P<urltype>video|app)/ #If the page is only for videos or for a game
(?P<gameID>\d+)/?
@@ -3625,16 +3487,15 @@ class WorldStarHipHopIE(InfoExtractor):
IE_NAME = u'WorldStarHipHop'
def _real_extract(self, url):
_src_url = r"""(http://hw-videos.*(?:mp4|flv))"""
webpage_src = compat_urllib_request.urlopen(url).read()
webpage_src = webpage_src.decode('utf-8')
mobj = re.search(_src_url, webpage_src)
_src_url = r"""(http://(hw-videos|hw-post1).*(?:mp4|flv))"""
m = re.match(self._VALID_URL, url)
video_id = m.group('id')
webpage_src = self._download_webpage(url, video_id)
mobj = re.search(_src_url, webpage_src)
if mobj is not None:
video_url = mobj.group()
if 'mp4' in video_url:
@@ -3988,12 +3849,13 @@ class KeekIE(InfoExtractor):
return [info]
class TEDIE(InfoExtractor):
_VALID_URL=r'''http://www.ted.com/
_VALID_URL=r'''http://www\.ted\.com/
(
((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
|
((?P<type_talk>talks)) # We have a simple talk
)
(/lang/(.*?))? # The url may contain the language
/(?P<name>\w+) # Here goes the name and then ".html"
'''
@@ -4244,7 +4106,7 @@ class ARDIE(InfoExtractor):
return [info]
class TumblrIE(InfoExtractor):
_VALID_URL = r'http://(?P<blog_name>.*?).tumblr.com/((post)|(video))/(?P<id>\d*)/(.*?)'
_VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
def _real_extract(self, url):
m_url = re.match(self._VALID_URL, url)
@@ -4254,7 +4116,7 @@ class TumblrIE(InfoExtractor):
url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
webpage = self._download_webpage(url, video_id)
re_video = r'src=\\x22(?P<video_url>http://%s.tumblr.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
video = re.search(re_video, webpage)
if video is None:
self.to_screen("No video founded")
@@ -4267,8 +4129,8 @@ class TumblrIE(InfoExtractor):
# The only place where you can get a title, it's not complete,
# but searching in other places doesn't work for all videos
re_title = r'<title>(.*?) - (?P<title>.*?)</title>'
title = unescapeHTML(re.search(re_title, webpage).group('title'))
re_title = r'<title>(?P<title>.*?)</title>'
title = unescapeHTML(re.search(re_title, webpage, re.DOTALL).group('title'))
return [{'id': video_id,
'url': video_url,
@@ -4277,6 +4139,83 @@ class TumblrIE(InfoExtractor):
'ext': ext
}]
class BandcampIE(InfoExtractor):
    """Information extractor for free tracks hosted on *.bandcamp.com.

    Only tracks that expose a free download page are supported; the
    mp3-320 variant is always selected.
    """
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    @staticmethod
    def _require(match, what):
        """Return *match*, or raise ExtractorError if the regex found nothing.

        `what` names the missing piece and is embedded in the error message.
        """
        if match is None:
            raise ExtractorError(u'Unable to extract %s' % what)
        return match

    def _real_extract(self, url):
        """Resolve a Bandcamp track URL to a single-entry list of info dicts.

        Raises ExtractorError when the URL is invalid, the track has no free
        download, or any of the scraped pages lacks an expected field.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)

        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')
        download_link = m_download.group(1)

        m_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                         webpage, re.MULTILINE | re.DOTALL)
        video_id = self._require(m_id, u'track id').group('id')

        download_webpage = self._download_webpage(download_link, video_id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascript code
        m_items = re.search(r'items: (.*?),$', download_webpage, re.MULTILINE)
        info = json.loads(self._require(m_items, u'track items').group(1))[0]

        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'].get(u'mp3-320')
        if mp3_info is None:
            raise ExtractorError(u'No mp3-320 download available')

        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = self._require(re.match(re_url, initial_url), u'download URL parameters')

        # We build the url we will use to get the final track url
        # This url is built in Bandcamp in the script download_bundle_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (
            m_url.group('server'), m_url.group('fsig'), video_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, video_id,
                                                   'Requesting download url')

        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        m_final = re.search(r'"retry_url":"(.*?)"', final_url_webpage)
        final_url = self._require(m_final, u'final download URL').group(1)

        track_info = {
            'id': video_id,
            'title': info[u'title'],
            'ext': 'mp3',
            'url': final_url,
            'thumbnail': info[u'thumb_url'],
            'uploader': info[u'artist'],
        }
        return [track_info]
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        """Resolve a video page URL to a single-entry list of info dicts.

        Raises ExtractorError if the URL does not match _VALID_URL or if the
        media URL or the title cannot be found in the downloaded page.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        video_extension = 'mp4'
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # Non-greedy capture: stop at the first closing marker so that a
        # second <source> tag on the same line is not swallowed.
        mobj = re.search(r'<source src="(.+?)" type="video/mp4">', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        video_url = mobj.group(1)

        mobj = re.search(r'<h1 class="videoTitle slidePanelMovable">(.+?)</h1>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        # Decode HTML entities, as the other extractors in this file do
        # for text scraped out of markup.
        video_title = unescapeHTML(mobj.group(1))

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_extension,
            'title': video_title,
        }]
def gen_extractors():
""" Return a list of an instance of every supported extractor.
@@ -4332,6 +4271,8 @@ def gen_extractors():
LiveLeakIE(),
ARDIE(),
TumblrIE(),
BandcampIE(),
RedTubeIE(),
GenericIE()
]

View File

@@ -25,10 +25,12 @@ __authors__ = (
'Jeff Crouse',
'Osama Khalid',
'Michael Walter',
'M. Yasoob Ullah Khalid',
)
__license__ = 'Public Domain'
import codecs
import getpass
import optparse
import os
@@ -334,6 +336,11 @@ def parseOpts(overrideArguments=None):
return parser, opts, args
def _real_main(argv=None):
# Compatibility fixes for Windows
if sys.platform == 'win32':
# https://github.com/rg3/youtube-dl/issues/820
codecs.register(lambda name: codecs.lookup('utf-8') if name == 'cp65001' else None)
parser, opts, args = parseOpts(argv)
# Open appropriate CookieJar
@@ -388,7 +395,8 @@ def _real_main(argv=None):
if 'http' in proxies and 'https' not in proxies:
proxies['https'] = proxies['http']
proxy_handler = compat_urllib_request.ProxyHandler(proxies)
opener = compat_urllib_request.build_opener(proxy_handler, cookie_processor, YoutubeDLHandler())
https_handler = compat_urllib_request.HTTPSHandler()
opener = compat_urllib_request.build_opener(https_handler, proxy_handler, cookie_processor, YoutubeDLHandler())
compat_urllib_request.install_opener(opener)
socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

View File

@@ -1,2 +1,2 @@
__version__ = '2013.05.01'
__version__ = '2013.05.04'